diff --git a/internal/collector/generated/gte_pg16_metrics.json b/internal/collector/generated/gte_pg16_metrics.json new file mode 100644 index 0000000000..3b27be7bc0 --- /dev/null +++ b/internal/collector/generated/gte_pg16_metrics.json @@ -0,0 +1 @@ +[{"metrics":[{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of times this table has been manually analyzed","metric_name":"ccp_stat_user_tables_analyze_count","static_attributes":{"server":"localhost:5432"},"value_column":"analyze_count"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of times this table has been analyzed by the autovacuum daemon","metric_name":"ccp_stat_user_tables_autoanalyze_count","static_attributes":{"server":"localhost:5432"},"value_column":"autoanalyze_count"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of times this table has been vacuumed by the autovacuum daemon","metric_name":"ccp_stat_user_tables_autovacuum_count","static_attributes":{"server":"localhost:5432"},"value_column":"autovacuum_count"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of index scans initiated on this table","metric_name":"ccp_stat_user_tables_idx_scan","static_attributes":{"server":"localhost:5432"},"value_column":"idx_scan"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of live rows fetched by index scans","metric_name":"ccp_stat_user_tables_idx_tup_fetch","static_attributes":{"server":"localhost:5432"},"value_column":"idx_tup_fetch"},{"attribute_columns":["dbname","relname","schemaname"],"description":"Estimated number of dead rows","metric_name":"ccp_stat_user_tables_n_dead_tup","static_attributes":{"server":"localhost:5432"},"value_column":"n_dead_tup"},{"attribute_columns":["dbname","relname","schemaname"],"description":"Estimated number of live rows","metric_name":"ccp_stat_user_tables_n_live_tup","static_attributes":{"server":"localhost:5432"},"value_column":"n_live_tup"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of rows deleted","metric_name":"ccp_stat_user_tables_n_tup_del","static_attributes":{"server":"localhost:5432"},"value_column":"n_tup_del"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of rows HOT updated (i.e., with no separate index update required)","metric_name":"ccp_stat_user_tables_n_tup_hot_upd","static_attributes":{"server":"localhost:5432"},"value_column":"n_tup_hot_upd"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of rows inserted","metric_name":"ccp_stat_user_tables_n_tup_ins","static_attributes":{"server":"localhost:5432"},"value_column":"n_tup_ins"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of rows updated","metric_name":"ccp_stat_user_tables_n_tup_upd","static_attributes":{"server":"localhost:5432"},"value_column":"n_tup_upd"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of sequential scans initiated on this table","metric_name":"ccp_stat_user_tables_seq_scan","static_attributes":{"server":"localhost:5432"},"value_column":"seq_scan"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of live rows fetched by sequential 
scans","metric_name":"ccp_stat_user_tables_seq_tup_read","static_attributes":{"server":"localhost:5432"},"value_column":"seq_tup_read"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of times this table has been manually vacuumed (not counting VACUUM FULL)","metric_name":"ccp_stat_user_tables_vacuum_count","static_attributes":{"server":"localhost:5432"},"value_column":"vacuum_count"}],"sql":"SELECT\n current_database() as dbname\n , p.schemaname\n , p.relname\n , p.seq_scan\n , p.seq_tup_read\n , COALESCE(p.idx_scan, 0) AS idx_scan\n , COALESCE(p.idx_tup_fetch, 0) as idx_tup_fetch\n , p.n_tup_ins\n , p.n_tup_upd\n , p.n_tup_del\n , p.n_tup_hot_upd\n , p.n_tup_newpage_upd\n , p.n_live_tup\n , p.n_dead_tup\n , p.vacuum_count\n , p.autovacuum_count\n , p.analyze_count\n , p.autoanalyze_count\n FROM pg_catalog.pg_stat_user_tables p;\n"}] diff --git a/internal/collector/generated/gte_pg17_metrics.json b/internal/collector/generated/gte_pg17_metrics.json new file mode 100644 index 0000000000..563abf01b3 --- /dev/null +++ b/internal/collector/generated/gte_pg17_metrics.json @@ -0,0 +1 @@ +[{"metrics":[{"data_type":"sum","description":"Number of buffers written during checkpoints and restartpoints","metric_name":"ccp_stat_bgwriter_buffers_checkpoint","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_written"}],"sql":"SELECT c.buffers_written FROM pg_catalog.pg_stat_checkpointer c;\n"},{"metrics":[{"data_type":"sum","description":"Number of write operations, each of the size specified in op_bytes.","metric_name":"ccp_stat_bgwriter_buffers_backend","static_attributes":{"server":"localhost:5432"},"value_column":"writes"},{"data_type":"sum","description":"Number of fsync calls. These are only tracked in context normal.","metric_name":"ccp_stat_bgwriter_buffers_backend_fsync","static_attributes":{"server":"localhost:5432"},"value_column":"fsyncs"}],"sql":"SELECT\n s.writes\n , s.fsyncs\nFROM pg_catalog.pg_stat_io s WHERE backend_type = 'background writer';\n"},{"metrics":[{"description":"Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in milliseconds","metric_name":"ccp_stat_bgwriter_checkpoint_sync_time","static_attributes":{"server":"localhost:5432"},"value_column":"sync_time"},{"description":"Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in milliseconds","metric_name":"ccp_stat_bgwriter_checkpoint_write_time","static_attributes":{"server":"localhost:5432"},"value_column":"write_time","value_type":"double"},{"description":"Number of requested checkpoints that have been performed","metric_name":"ccp_stat_bgwriter_checkpoints_req","static_attributes":{"server":"localhost:5432"},"value_column":"num_requested"},{"description":"Number of scheduled checkpoints that have been performed","metric_name":"ccp_stat_bgwriter_checkpoints_timed","static_attributes":{"server":"localhost:5432"},"value_column":"num_timed"},{"description":"Number of buffers written during checkpoints and restartpoints","metric_name":"ccp_stat_checkpointer_buffers_written","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_written"}],"sql":"SELECT\n c.num_timed\n , c.num_requested\n , c.write_time\n , c.sync_time\n , c.buffers_written\nFROM pg_catalog.pg_stat_checkpointer c;\n"}] diff --git a/internal/collector/generated/lt_pg16_metrics.json b/internal/collector/generated/lt_pg16_metrics.json new file mode 
100644 index 0000000000..98bb0cc213 --- /dev/null +++ b/internal/collector/generated/lt_pg16_metrics.json @@ -0,0 +1 @@ +[{"metrics":[{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of times this table has been manually analyzed","metric_name":"ccp_stat_user_tables_analyze_count","static_attributes":{"server":"localhost:5432"},"value_column":"analyze_count"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of times this table has been analyzed by the autovacuum daemon","metric_name":"ccp_stat_user_tables_autoanalyze_count","static_attributes":{"server":"localhost:5432"},"value_column":"autoanalyze_count"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of times this table has been vacuumed by the autovacuum daemon","metric_name":"ccp_stat_user_tables_autovacuum_count","static_attributes":{"server":"localhost:5432"},"value_column":"autovacuum_count"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of index scans initiated on this table","metric_name":"ccp_stat_user_tables_idx_scan","static_attributes":{"server":"localhost:5432"},"value_column":"idx_scan"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of live rows fetched by index scans","metric_name":"ccp_stat_user_tables_idx_tup_fetch","static_attributes":{"server":"localhost:5432"},"value_column":"idx_tup_fetch"},{"attribute_columns":["dbname","relname","schemaname"],"description":"Estimated number of dead rows","metric_name":"ccp_stat_user_tables_n_dead_tup","static_attributes":{"server":"localhost:5432"},"value_column":"n_dead_tup"},{"attribute_columns":["dbname","relname","schemaname"],"description":"Estimated number of live rows","metric_name":"ccp_stat_user_tables_n_live_tup","static_attributes":{"server":"localhost:5432"},"value_column":"n_live_tup"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of rows deleted","metric_name":"ccp_stat_user_tables_n_tup_del","static_attributes":{"server":"localhost:5432"},"value_column":"n_tup_del"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of rows HOT updated (i.e., with no separate index update required)","metric_name":"ccp_stat_user_tables_n_tup_hot_upd","static_attributes":{"server":"localhost:5432"},"value_column":"n_tup_hot_upd"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of rows inserted","metric_name":"ccp_stat_user_tables_n_tup_ins","static_attributes":{"server":"localhost:5432"},"value_column":"n_tup_ins"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of rows updated","metric_name":"ccp_stat_user_tables_n_tup_upd","static_attributes":{"server":"localhost:5432"},"value_column":"n_tup_upd"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of sequential scans initiated on this table","metric_name":"ccp_stat_user_tables_seq_scan","static_attributes":{"server":"localhost:5432"},"value_column":"seq_scan"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of live rows fetched by sequential 
scans","metric_name":"ccp_stat_user_tables_seq_tup_read","static_attributes":{"server":"localhost:5432"},"value_column":"seq_tup_read"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of times this table has been manually vacuumed (not counting VACUUM FULL)","metric_name":"ccp_stat_user_tables_vacuum_count","static_attributes":{"server":"localhost:5432"},"value_column":"vacuum_count"}],"sql":"SELECT\n current_database() as dbname\n , p.schemaname\n , p.relname\n , p.seq_scan\n , p.seq_tup_read\n , COALESCE(p.idx_scan, 0) AS idx_scan\n , COALESCE(p.idx_tup_fetch, 0) as idx_tup_fetch\n , p.n_tup_ins\n , p.n_tup_upd\n , p.n_tup_del\n , p.n_tup_hot_upd\n , 0::bigint AS n_tup_newpage_upd\n , p.n_live_tup\n , p.n_dead_tup\n , p.vacuum_count\n , p.autovacuum_count\n , p.analyze_count\n , p.autoanalyze_count\nFROM pg_catalog.pg_stat_user_tables p;\n"}] diff --git a/internal/collector/generated/lt_pg17_metrics.json b/internal/collector/generated/lt_pg17_metrics.json new file mode 100644 index 0000000000..d6266ffacb --- /dev/null +++ b/internal/collector/generated/lt_pg17_metrics.json @@ -0,0 +1 @@ +[{"metrics":[{"data_type":"sum","description":"Number of buffers written during checkpoints and restartpoints","metric_name":"ccp_stat_bgwriter_buffers_checkpoint","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_written"}],"sql":"SELECT c.buffers_checkpoint AS buffers_written FROM pg_catalog.pg_stat_bgwriter c;\n"},{"metrics":[{"data_type":"sum","description":"Number of write operations, each of the size specified in op_bytes.","metric_name":"ccp_stat_bgwriter_buffers_backend","static_attributes":{"server":"localhost:5432"},"value_column":"writes"},{"data_type":"sum","description":"Number of fsync calls. 
These are only tracked in context normal.","metric_name":"ccp_stat_bgwriter_buffers_backend_fsync","static_attributes":{"server":"localhost:5432"},"value_column":"fsyncs"}],"sql":"SELECT\n s.buffers_backend AS writes\n , s.buffers_backend_fsync AS fsyncs\nFROM pg_catalog.pg_stat_bgwriter s;\n"},{"metrics":[{"description":"Number of scheduled checkpoints that have been performed","metric_name":"ccp_stat_bgwriter_checkpoints_timed","static_attributes":{"server":"localhost:5432"},"value_column":"num_timed"},{"description":"Number of requested checkpoints that have been performed","metric_name":"ccp_stat_bgwriter_checkpoints_req","static_attributes":{"server":"localhost:5432"},"value_column":"num_requested"},{"description":"Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in milliseconds","metric_name":"ccp_stat_bgwriter_checkpoint_write_time","static_attributes":{"server":"localhost:5432"},"value_column":"write_time","value_type":"double"},{"description":"Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in milliseconds","metric_name":"ccp_stat_bgwriter_checkpoint_sync_time","static_attributes":{"server":"localhost:5432"},"value_column":"sync_time"},{"description":"Number of buffers written during checkpoints and restartpoints","metric_name":"ccp_stat_checkpointer_buffers_written","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_written"}],"sql":"SELECT\n c.checkpoints_timed AS num_timed\n , c.checkpoints_req AS num_requested\n , c.checkpoint_write_time AS write_time\n , c.checkpoint_sync_time AS sync_time\n , c.buffers_checkpoint AS buffers_written\nFROM pg_catalog.pg_stat_bgwriter c;\n"}] diff --git a/internal/collector/generated/pgbackrest_metrics.json b/internal/collector/generated/pgbackrest_metrics.json new file mode 100644 index 0000000000..63114afc03 --- /dev/null +++ b/internal/collector/generated/pgbackrest_metrics.json @@ -0,0 +1 @@ +[{"metrics":[{"attribute_columns":["repo"],"description":"Seconds since the last completed full or differential backup. Differential is always based off last full.","metric_name":"ccp_backrest_last_diff_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_diff_backup"},{"attribute_columns":["repo"],"description":"Seconds since the last completed full backup","metric_name":"ccp_backrest_last_full_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_full_backup"},{"attribute_columns":["repo"],"description":"Seconds since the last completed full, differential or incremental backup.\nIncremental is always based off last full or differential.\n","metric_name":"ccp_backrest_last_incr_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_incr_backup"},{"attribute_columns":["backup_type","repo"],"description":"pgBackRest version number when this backup was performed","metric_name":"ccp_backrest_last_info_backrest_repo_version","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_info_backrest_repo_version"},{"attribute_columns":["backup_type","repo"],"description":"An error has been encountered in the backup. 
Check logs for more information.","metric_name":"ccp_backrest_last_info_backup_error","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_info_backup_error"},{"attribute_columns":["backup_type","repo"],"description":"Total runtime in seconds of this backup","metric_name":"ccp_backrest_last_info_backup_runtime_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"backup_runtime_seconds"},{"attribute_columns":["backup_type","repo"],"description":"Actual size of only this individual backup in the pgbackrest repository","metric_name":"ccp_backrest_last_info_repo_backup_size_bytes","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"repo_backup_size_bytes"},{"attribute_columns":["backup_type","repo"],"description":"Total size of this backup in the pgbackrest repository, including all required previous backups and WAL","metric_name":"ccp_backrest_last_info_repo_total_size_bytes","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"repo_total_size_bytes"},{"attribute_columns":["repo"],"description":"Seconds since the oldest completed full backup","metric_name":"ccp_backrest_oldest_full_backup_time_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"oldest_full_backup"}],"sql":"SELECT * FROM get_pgbackrest_info();\n"}] diff --git a/internal/collector/generated/pgbouncer_metrics_queries.json b/internal/collector/generated/pgbouncer_metrics_queries.json index 5b0ed8abc5..0248051d94 100644 --- a/internal/collector/generated/pgbouncer_metrics_queries.json +++ b/internal/collector/generated/pgbouncer_metrics_queries.json @@ -1 +1 @@ -[{"metrics":[{"attribute_columns":["database","user","state","application_name","link"],"description":"Current waiting time in seconds","metric_name":"ccp_pgbouncer_clients_wait_seconds","value_column":"wait"}],"sql":"SHOW CLIENTS"},{"metrics":[{"attribute_columns":["name","host","port","database","force_user","pool_mode"],"description":"Maximum number of server connections","metric_name":"ccp_pgbouncer_databases_pool_size","value_column":"pool_size"},{"attribute_columns":["name","host","port","database","force_user","pool_mode"],"description":"Minimum number of server connections","metric_name":"ccp_pgbouncer_databases_min_pool_size","value_column":"min_pool_size"},{"attribute_columns":["name","host","port","database","force_user","pool_mode"],"description":"Maximum number of additional connections for this database","metric_name":"ccp_pgbouncer_databases_reserve_pool","value_column":"reserve_pool"},{"attribute_columns":["name","host","port","database","force_user","pool_mode"],"description":"Maximum number of allowed connections for this database, as set by max_db_connections, either globally or per database","metric_name":"ccp_pgbouncer_databases_max_connections","value_column":"max_connections"},{"attribute_columns":["name","host","port","database","force_user","pool_mode"],"description":"Current number of connections for this database","metric_name":"ccp_pgbouncer_databases_current_connections","value_column":"current_connections"},{"attribute_columns":["name","host","port","database","force_user","pool_mode"],"description":"1 if this database is currently paused, else 0","metric_name":"ccp_pgbouncer_databases_paused","value_column":"paused"},{"attribute_columns":["name","host","port","database","force_user","pool_mode"],"description":"1 if this database is currently disabled, else 
0","metric_name":"ccp_pgbouncer_databases_disabled","value_column":"disabled"}],"sql":"SHOW DATABASES"},{"metrics":[{"attribute_columns":["list"],"description":"Count of items registered with pgBouncer","metric_name":"ccp_pgbouncer_lists_item_count","value_column":"items"}],"sql":"SHOW LISTS"},{"metrics":[{"attribute_columns":["database","user"],"description":"Client connections that are either linked to server connections or are idle with no queries waiting to be processed","metric_name":"ccp_pgbouncer_pools_client_active","value_column":"cl_active"},{"attribute_columns":["database","user"],"description":"Client connections that have sent queries but have not yet got a server connection","metric_name":"ccp_pgbouncer_pools_client_waiting","value_column":"cl_waiting"},{"attribute_columns":["database","user"],"description":"Server connections that are linked to a client","metric_name":"ccp_pgbouncer_pools_server_active","value_column":"sv_active"},{"attribute_columns":["database","user"],"description":"Server connections that are unused and immediately usable for client queries","metric_name":"ccp_pgbouncer_pools_server_idle","value_column":"sv_idle"},{"attribute_columns":["database","user"],"description":"Server connections that have been idle for more than server_check_delay, so they need server_check_query to run on them before they can be used again","metric_name":"ccp_pgbouncer_pools_server_used","value_column":"sv_used"}],"sql":"SHOW POOLS"},{"metrics":[{"attribute_columns":["database","user","state","application_name","link"],"description":"1 if the connection will be closed as soon as possible, because a configuration file reload or DNS update changed the connection information or RECONNECT was issued","metric_name":"ccp_pgbouncer_servers_close_needed","value_column":"close_needed"}],"sql":"SHOW SERVERS"}] +[{"metrics":[{"attribute_columns":["database","user","state","application_name","link"],"description":"Current waiting time in seconds","metric_name":"ccp_pgbouncer_clients_wait_seconds","value_column":"wait"}],"sql":"SHOW CLIENTS"},{"metrics":[{"attribute_columns":["name","port","database","force_user","pool_mode"],"description":"Maximum number of server connections","metric_name":"ccp_pgbouncer_databases_pool_size","value_column":"pool_size"},{"attribute_columns":["name","port","database","force_user","pool_mode"],"description":"Minimum number of server connections","metric_name":"ccp_pgbouncer_databases_min_pool_size","value_column":"min_pool_size"},{"attribute_columns":["name","port","database","force_user","pool_mode"],"description":"Maximum number of additional connections for this database","metric_name":"ccp_pgbouncer_databases_reserve_pool","value_column":"reserve_pool"},{"attribute_columns":["name","port","database","force_user","pool_mode"],"description":"Maximum number of allowed connections for this database, as set by max_db_connections, either globally or per database","metric_name":"ccp_pgbouncer_databases_max_connections","value_column":"max_connections"},{"attribute_columns":["name","port","database","force_user","pool_mode"],"description":"Current number of connections for this database","metric_name":"ccp_pgbouncer_databases_current_connections","value_column":"current_connections"},{"attribute_columns":["name","port","database","force_user","pool_mode"],"description":"1 if this database is currently paused, else 
0","metric_name":"ccp_pgbouncer_databases_paused","value_column":"paused"},{"attribute_columns":["name","port","database","force_user","pool_mode"],"description":"1 if this database is currently disabled, else 0","metric_name":"ccp_pgbouncer_databases_disabled","value_column":"disabled"}],"sql":"SHOW DATABASES"},{"metrics":[{"attribute_columns":["list"],"description":"Count of items registered with pgBouncer","metric_name":"ccp_pgbouncer_lists_item_count","value_column":"items"}],"sql":"SHOW LISTS"},{"metrics":[{"attribute_columns":["database","user"],"description":"Client connections that are either linked to server connections or are idle with no queries waiting to be processed","metric_name":"ccp_pgbouncer_pools_client_active","value_column":"cl_active"},{"attribute_columns":["database","user"],"description":"Client connections that have sent queries but have not yet got a server connection","metric_name":"ccp_pgbouncer_pools_client_waiting","value_column":"cl_waiting"},{"attribute_columns":["database","user"],"description":"Server connections that are linked to a client","metric_name":"ccp_pgbouncer_pools_server_active","value_column":"sv_active"},{"attribute_columns":["database","user"],"description":"Server connections that are unused and immediately usable for client queries","metric_name":"ccp_pgbouncer_pools_server_idle","value_column":"sv_idle"},{"attribute_columns":["database","user"],"description":"Server connections that have been idle for more than server_check_delay, so they need server_check_query to run on them before they can be used again","metric_name":"ccp_pgbouncer_pools_server_used","value_column":"sv_used"}],"sql":"SHOW POOLS"},{"metrics":[{"attribute_columns":["database","user","state","application_name","link"],"description":"1 if the connection will be closed as soon as possible, because a configuration file reload or DNS update changed the connection information or RECONNECT was issued","metric_name":"ccp_pgbouncer_servers_close_needed","value_column":"close_needed"}],"sql":"SHOW SERVERS"}] diff --git a/internal/collector/generated/postgres_5m_metrics.json b/internal/collector/generated/postgres_5m_metrics.json new file mode 100644 index 0000000000..a9a3500a02 --- /dev/null +++ b/internal/collector/generated/postgres_5m_metrics.json @@ -0,0 +1 @@ +[{"metrics":[{"attribute_columns":["dbname"],"description":"Database size in bytes","metric_name":"ccp_database_size_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"bytes"}],"sql":"SELECT datname as dbname , pg_database_size(datname) as bytes FROM pg_catalog.pg_database WHERE datistemplate = false;\n"},{"metrics":[{"description":"Count of sequences that have reached greater than or equal to 75% of their max available numbers.\nFunction monitor.sequence_status() can provide more details if run directly on system.\n","metric_name":"ccp_sequence_exhaustion_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT count(*) AS count FROM (\n SELECT CEIL((s.max_value-min_value::NUMERIC+1)/s.increment_by::NUMERIC) AS slots\n , CEIL((COALESCE(s.last_value,s.min_value)-s.min_value::NUMERIC+1)/s.increment_by::NUMERIC) AS used\n FROM pg_catalog.pg_sequences s\n) x WHERE (ROUND(used/slots*100)::int) \u003e 75;\n"},{"metrics":[{"attribute_columns":["dbname"],"description":"Number of times disk blocks were found already in the buffer cache, so that a read was not 
necessary","metric_name":"ccp_stat_database_blks_hit","static_attributes":{"server":"localhost:5432"},"value_column":"blks_hit"},{"attribute_columns":["dbname"],"description":"Number of disk blocks read in this database","metric_name":"ccp_stat_database_blks_read","static_attributes":{"server":"localhost:5432"},"value_column":"blks_read"},{"attribute_columns":["dbname"],"description":"Number of queries canceled due to conflicts with recovery in this database","metric_name":"ccp_stat_database_conflicts","static_attributes":{"server":"localhost:5432"},"value_column":"conflicts"},{"attribute_columns":["dbname"],"description":"Number of deadlocks detected in this database","metric_name":"ccp_stat_database_deadlocks","static_attributes":{"server":"localhost:5432"},"value_column":"deadlocks"},{"attribute_columns":["dbname"],"description":"Total amount of data written to temporary files by queries in this database","metric_name":"ccp_stat_database_temp_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"temp_bytes"},{"attribute_columns":["dbname"],"description":"Number of rows deleted by queries in this database","metric_name":"ccp_stat_database_temp_files","static_attributes":{"server":"localhost:5432"},"value_column":"temp_files"},{"attribute_columns":["dbname"],"description":"Number of rows deleted by queries in this database","metric_name":"ccp_stat_database_tup_deleted","static_attributes":{"server":"localhost:5432"},"value_column":"tup_deleted"},{"attribute_columns":["dbname"],"description":"Number of rows fetched by queries in this database","metric_name":"ccp_stat_database_tup_fetched","static_attributes":{"server":"localhost:5432"},"value_column":"tup_fetched"},{"attribute_columns":["dbname"],"description":"Number of rows inserted by queries in this database","metric_name":"ccp_stat_database_tup_inserted","static_attributes":{"server":"localhost:5432"},"value_column":"tup_inserted"},{"attribute_columns":["dbname"],"description":"Number of rows returned by queries in this database","metric_name":"ccp_stat_database_tup_returned","static_attributes":{"server":"localhost:5432"},"value_column":"tup_returned"},{"attribute_columns":["dbname"],"description":"Number of rows updated by queries in this database","metric_name":"ccp_stat_database_tup_updated","static_attributes":{"server":"localhost:5432"},"value_column":"tup_updated"},{"attribute_columns":["dbname"],"description":"Number of transactions in this database that have been committed","metric_name":"ccp_stat_database_xact_commit","static_attributes":{"server":"localhost:5432"},"value_column":"xact_commit"},{"attribute_columns":["dbname"],"description":"Number of transactions in this database that have been rolled back","metric_name":"ccp_stat_database_xact_rollback","static_attributes":{"server":"localhost:5432"},"value_column":"xact_rollback"}],"sql":"SELECT s.datname AS dbname , s.xact_commit , s.xact_rollback , s.blks_read , s.blks_hit , s.tup_returned , s.tup_fetched , s.tup_inserted , s.tup_updated , s.tup_deleted , s.conflicts , s.temp_files , s.temp_bytes , s.deadlocks FROM pg_catalog.pg_stat_database s JOIN pg_catalog.pg_database d ON d.datname = s.datname WHERE d.datistemplate = false;\n"}] diff --git a/internal/collector/generated/postgres_5s_metrics.json b/internal/collector/generated/postgres_5s_metrics.json new file mode 100644 index 0000000000..09ea77846b --- /dev/null +++ b/internal/collector/generated/postgres_5s_metrics.json @@ -0,0 +1 @@ 
+[{"metrics":[{"attribute_columns":["application_name","datname","state","usename"],"description":"number of connections in this state","metric_name":"ccp_pg_stat_activity_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT\n pg_database.datname,\n tmp.state,\n COALESCE(tmp2.usename, '') as usename,\n COALESCE(tmp2.application_name, '') as application_name,\n COALESCE(count,0) as count,\n COALESCE(max_tx_duration,0) as max_tx_duration\nFROM\n (\n VALUES ('active'),\n ('idle'),\n ('idle in transaction'),\n ('idle in transaction (aborted)'),\n ('fastpath function call'),\n ('disabled')\n ) AS tmp(state) CROSS JOIN pg_database\nLEFT JOIN (\n SELECT\n datname,\n state,\n usename,\n application_name,\n count(*) AS count,\n MAX(EXTRACT(EPOCH FROM now() - xact_start))::float AS max_tx_duration\n FROM pg_stat_activity GROUP BY datname,state,usename,application_name) AS tmp2\n ON tmp.state = tmp2.state AND pg_database.datname = tmp2.datname;\n"},{"metrics":[{"description":"Seconds since the last successful archive operation","metric_name":"ccp_archive_command_status_seconds_since_last_archive","static_attributes":{"server":"localhost:5432"},"value_column":"seconds_since_last_archive","value_type":"double"}],"sql":"SELECT COALESCE(EXTRACT(epoch from (CURRENT_TIMESTAMP - last_archived_time)), 0) AS seconds_since_last_archive FROM pg_catalog.pg_stat_archiver;\n"},{"metrics":[{"description":"Number of WAL files that have been successfully archived","metric_name":"ccp_archive_command_status_archived_count","static_attributes":{"server":"localhost:5432"},"value_column":"archived_count"}],"sql":"SELECT archived_count FROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Number of failed attempts for archiving WAL files","metric_name":"ccp_archive_command_status_failed_count","static_attributes":{"server":"localhost:5432"},"value_column":"failed_count"}],"sql":"SELECT failed_count FROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Seconds since the last recorded failure of the archive_command","metric_name":"ccp_archive_command_status_seconds_since_last_fail","static_attributes":{"server":"localhost:5432"},"value_column":"seconds_since_last_fail"}],"sql":"SELECT CASE\n WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) IS NULL THEN 0\n WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) \u003c 0 THEN 0\n ELSE EXTRACT(epoch from (last_failed_time - last_archived_time))\n END AS seconds_since_last_fail\nFROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Total non-idle connections","metric_name":"ccp_connection_stats_active","static_attributes":{"server":"localhost:5432"},"value_column":"active"},{"description":"Total idle connections","metric_name":"ccp_connection_stats_idle","static_attributes":{"server":"localhost:5432"},"value_column":"idle"},{"description":"Total idle in transaction connections","metric_name":"ccp_connection_stats_idle_in_txn","static_attributes":{"server":"localhost:5432"},"value_column":"idle_in_txn"},{"description":"Value of max_connections for the monitored database","metric_name":"ccp_connection_stats_max_blocked_query_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_blocked_query_time","value_type":"double"},{"description":"Value of max_connections for the monitored database","metric_name":"ccp_connection_stats_max_connections","static_attributes":{"server":"localhost:5432"},"value_column":"max_connections"},{"description":"Length of time in seconds 
of the longest idle in transaction session","metric_name":"ccp_connection_stats_max_idle_in_txn_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_idle_in_txn_time","value_type":"double"},{"description":"Length of time in seconds of the longest running query","metric_name":"ccp_connection_stats_max_query_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_query_time","value_type":"double"},{"description":"Total idle and non-idle connections","metric_name":"ccp_connection_stats_total","static_attributes":{"server":"localhost:5432"},"value_column":"total"}],"sql":"SELECT ((total - idle) - idle_in_txn) as active\n , total\n , idle\n , idle_in_txn\n , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - state_change))),0) FROM pg_catalog.pg_stat_activity WHERE state = 'idle in transaction') AS max_idle_in_txn_time\n , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - query_start))),0) FROM pg_catalog.pg_stat_activity WHERE backend_type = 'client backend' AND state \u003c\u003e 'idle' ) AS max_query_time\n , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - query_start))),0) FROM pg_catalog.pg_stat_activity WHERE backend_type = 'client backend' AND wait_event_type = 'Lock' ) AS max_blocked_query_time\n , max_connections\n FROM (\n SELECT COUNT(*) as total\n , COALESCE(SUM(CASE WHEN state = 'idle' THEN 1 ELSE 0 END),0) AS idle\n , COALESCE(SUM(CASE WHEN state = 'idle in transaction' THEN 1 ELSE 0 END),0) AS idle_in_txn FROM pg_catalog.pg_stat_activity) x\n JOIN (SELECT setting::float AS max_connections FROM pg_settings WHERE name = 'max_connections') xx ON (true);\n"},{"metrics":[{"attribute_columns":["dbname"],"description":"Total number of checksum failures on this database","metric_name":"ccp_data_checksum_failure_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"},{"attribute_columns":["dbname"],"description":"Time interval in seconds since the last checksum failure was encountered","metric_name":"ccp_data_checksum_failure_time_since_last_failure_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"time_since_last_failure_seconds","value_type":"double"}],"sql":"SELECT datname AS dbname , checksum_failures AS count , coalesce(extract(epoch from (clock_timestamp() - checksum_last_failure)), 0) AS time_since_last_failure_seconds FROM pg_catalog.pg_stat_database WHERE pg_stat_database.datname IS NOT NULL;\n"},{"metrics":[{"attribute_columns":["dbname","mode"],"description":"Number of locks held per database and lock mode","metric_name":"ccp_locks_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT pg_database.datname as dbname , tmp.mode , COALESCE(count,0) as count FROM (\n VALUES ('accesssharelock'),\n ('rowsharelock'),\n ('rowexclusivelock'),\n ('shareupdateexclusivelock'),\n ('sharelock'),\n ('sharerowexclusivelock'),\n ('exclusivelock'),\n ('accessexclusivelock')\n) AS tmp(mode) CROSS JOIN pg_catalog.pg_database LEFT JOIN\n (SELECT database, lower(mode) AS mode,count(*) AS count\n FROM pg_catalog.pg_locks WHERE database IS NOT NULL\n GROUP BY database, lower(mode)\n) AS tmp2 ON tmp.mode=tmp2.mode and pg_database.oid = tmp2.database;\n"},{"metrics":[{"description":"CPU limit value in milli cores","metric_name":"ccp_nodemx_cpu_limit","static_attributes":{"server":"localhost:5432"},"value_column":"limit"},{"description":"CPU request value in milli cores","metric_name":"ccp_nodemx_cpu_request","static_attributes":{"server":"localhost:5432"},"value_column":"request"}],"sql":"SELECT monitor.kdapi_scalar_bigint('cpu_request') AS request , monitor.kdapi_scalar_bigint('cpu_limit') AS limit\n"},{"metrics":[{"description":"CPU usage in nanoseconds","metric_name":"ccp_nodemx_cpuacct_usage","static_attributes":{"server":"localhost:5432"},"value_column":"usage","value_type":"double"},{"description":"CPU usage snapshot timestamp","metric_name":"ccp_nodemx_cpuacct_usage_ts","static_attributes":{"server":"localhost:5432"},"value_column":"usage_ts","value_type":"double"}],"sql":"SELECT CASE WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('cpuacct.usage')\n ELSE (SELECT val FROM monitor.cgroup_setof_kv('cpu.stat') where key = 'usage_usec') * 1000\n END AS usage,\n extract(epoch from clock_timestamp()) AS usage_ts;\n"},{"metrics":[{"description":"The length of a period (in microseconds)","metric_name":"ccp_nodemx_cpucfs_period_us","static_attributes":{"server":"localhost:5432"},"value_column":"period_us"},{"description":"The total available run-time within a period (in microseconds)","metric_name":"ccp_nodemx_cpucfs_quota_us","static_attributes":{"server":"localhost:5432"},"value_column":"quota_us","value_type":"double"}],"sql":"SELECT\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n monitor.cgroup_scalar_bigint('cpu.cfs_period_us')\n ELSE\n (monitor.cgroup_array_bigint('cpu.max'))[2]\n END AS period_us,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n GREATEST(monitor.cgroup_scalar_bigint('cpu.cfs_quota_us'), 0)\n ELSE\n GREATEST((monitor.cgroup_array_bigint('cpu.max'))[1], 0)\n END AS quota_us;\n"},{"metrics":[{"description":"Number of periods that any thread was runnable","metric_name":"ccp_nodemx_cpustat_nr_periods","static_attributes":{"server":"localhost:5432"},"value_column":"nr_periods","value_type":"double"},{"description":"Number of runnable periods in which the application used its entire quota and was throttled","metric_name":"ccp_nodemx_cpustat_nr_throttled","static_attributes":{"server":"localhost:5432"},"value_column":"nr_throttled"},{"description":"CPU stat snapshot timestamp","metric_name":"ccp_nodemx_cpustat_snap_ts","static_attributes":{"server":"localhost:5432"},"value_column":"snap_ts","value_type":"double"},{"description":"Sum total amount of time individual threads within the monitor.cgroup were throttled","metric_name":"ccp_nodemx_cpustat_throttled_time","static_attributes":{"server":"localhost:5432"},"value_column":"throttled_time","value_type":"double"}],"sql":"WITH d(key, val) AS (select 
key, val from monitor.cgroup_setof_kv('cpu.stat')) SELECT\n (SELECT val FROM d WHERE key='nr_periods') AS nr_periods,\n (SELECT val FROM d WHERE key='nr_throttled') AS nr_throttled,\n (SELECT val FROM d WHERE key='throttled_usec') AS throttled_time,\n extract(epoch from clock_timestamp()) as snap_ts;\n"},{"metrics":[{"attribute_columns":["fs_type","mount_point"],"description":"Available size in bytes","metric_name":"ccp_nodemx_data_disk_available_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"available_bytes","value_type":"double"},{"attribute_columns":["fs_type","mount_point"],"description":"Available file nodes","metric_name":"ccp_nodemx_data_disk_free_file_nodes","static_attributes":{"server":"localhost:5432"},"value_column":"free_file_nodes"},{"attribute_columns":["fs_type","mount_point"],"description":"Size in bytes","metric_name":"ccp_nodemx_data_disk_total_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"total_bytes"},{"attribute_columns":["fs_type","mount_point"],"description":"Total file nodes","metric_name":"ccp_nodemx_data_disk_total_file_nodes","static_attributes":{"server":"localhost:5432"},"value_column":"total_file_nodes"}],"sql":"SELECT mount_point,fs_type,total_bytes,available_bytes,total_file_nodes,free_file_nodes\n FROM monitor.proc_mountinfo() m\n JOIN monitor.fsinfo(m.mount_point) f USING (major_number, minor_number)\n WHERE m.mount_point IN ('/pgdata', '/pgwal') OR\n m.mount_point like '/tablespaces/%'\n"},{"metrics":[{"attribute_columns":["mount_point"],"description":"Total sectors read","metric_name":"ccp_nodemx_disk_activity_sectors_read","static_attributes":{"server":"localhost:5432"},"value_column":"sectors_read"},{"attribute_columns":["mount_point"],"description":"Total sectors written","metric_name":"ccp_nodemx_disk_activity_sectors_written","static_attributes":{"server":"localhost:5432"},"value_column":"sectors_written"}],"sql":"SELECT mount_point,sectors_read,sectors_written\n FROM monitor.proc_mountinfo() m\n JOIN monitor.proc_diskstats() d USING (major_number, minor_number)\n WHERE m.mount_point IN ('/pgdata', '/pgwal') OR\n m.mount_point like '/tablespaces/%';\n"},{"metrics":[{"description":"Total bytes of anonymous and swap cache memory on active LRU list","metric_name":"ccp_nodemx_mem_active_anon","static_attributes":{"server":"localhost:5432"},"value_column":"active_anon","value_type":"double"},{"description":"Total bytes of file-backed memory on active LRU list","metric_name":"ccp_nodemx_mem_active_file","static_attributes":{"server":"localhost:5432"},"value_column":"active_file","value_type":"double"},{"description":"Total bytes of page cache memory","metric_name":"ccp_nodemx_mem_cache","static_attributes":{"server":"localhost:5432"},"value_column":"cache","value_type":"double"},{"description":"Total bytes that are waiting to get written back to the disk","metric_name":"ccp_nodemx_mem_dirty","static_attributes":{"server":"localhost:5432"},"value_column":"dirty"},{"description":"Total bytes of anonymous and swap cache memory on inactive LRU list","metric_name":"ccp_nodemx_mem_inactive_anon","static_attributes":{"server":"localhost:5432"},"value_column":"inactive_anon","value_type":"double"},{"description":"Total bytes of file-backed memory on inactive LRU list","metric_name":"ccp_nodemx_mem_inactive_file","static_attributes":{"server":"localhost:5432"},"value_column":"inactive_file","value_type":"double"},{"description":"Unknown metric from 
ccp_nodemx_mem","metric_name":"ccp_nodemx_mem_kmem_usage_in_byte","static_attributes":{"server":"localhost:5432"},"value_column":"kmem_usage_in_byte"},{"description":"Memory limit value in bytes","metric_name":"ccp_nodemx_mem_limit","static_attributes":{"server":"localhost:5432"},"value_column":"limit"},{"description":"Total bytes of mapped file (includes tmpfs/shmem)","metric_name":"ccp_nodemx_mem_mapped_file","static_attributes":{"server":"localhost:5432"},"value_column":"mapped_file"},{"description":"Memory request value in bytes","metric_name":"ccp_nodemx_mem_request","static_attributes":{"server":"localhost:5432"},"value_column":"request"},{"description":"Total bytes of anonymous and swap cache memory","metric_name":"ccp_nodemx_mem_rss","static_attributes":{"server":"localhost:5432"},"value_column":"rss","value_type":"double"},{"description":"Total bytes of shared memory","metric_name":"ccp_nodemx_mem_shmem","static_attributes":{"server":"localhost:5432"},"value_column":"shmem","value_type":"double"},{"description":"Total usage in bytes","metric_name":"ccp_nodemx_mem_usage_in_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"usage_in_bytes"}],"sql":"WITH d(key, val) as (SELECT key, val FROM monitor.cgroup_setof_kv('memory.stat')) SELECT\n monitor.kdapi_scalar_bigint('mem_request') AS request,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n (CASE WHEN monitor.cgroup_scalar_bigint('memory.limit_in_bytes') = 9223372036854771712 THEN 0 ELSE monitor.cgroup_scalar_bigint('memory.limit_in_bytes') END)\n ELSE\n (CASE WHEN monitor.cgroup_scalar_bigint('memory.max') = 9223372036854775807 THEN 0 ELSE monitor.cgroup_scalar_bigint('memory.max') END)\n END AS limit,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='cache')\n ELSE 0\n END as cache,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='rss')\n ELSE 0\n END as RSS,\n (SELECT val FROM d WHERE key='shmem') as shmem,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='mapped_file')\n ELSE 0\n END as mapped_file,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='dirty')\n ELSE (SELECT val FROM d WHERE key='file_dirty')\n END as dirty,\n (SELECT val FROM d WHERE key='active_anon') as active_anon,\n (SELECT val FROM d WHERE key='inactive_anon') as inactive_anon,\n (SELECT val FROM d WHERE key='active_file') as active_file,\n (SELECT val FROM d WHERE key='inactive_file') as inactive_file,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('memory.usage_in_bytes')\n ELSE monitor.cgroup_scalar_bigint('memory.current')\n END as usage_in_bytes,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('memory.kmem.usage_in_bytes')\n ELSE 0\n END as kmem_usage_in_byte;\n"},{"metrics":[{"attribute_columns":["interface"],"description":"Number of bytes received","metric_name":"ccp_nodemx_network_rx_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"rx_bytes"},{"attribute_columns":["interface"],"description":"Number of packets received","metric_name":"ccp_nodemx_network_rx_packets","static_attributes":{"server":"localhost:5432"},"value_column":"rx_packets"},{"attribute_columns":["interface"],"description":"Number of bytes transmitted","metric_name":"ccp_nodemx_network_tx_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"tx_bytes"},{"attribute_columns":["interface"],"description":"Number of packets 
transmitted","metric_name":"ccp_nodemx_network_tx_packets","static_attributes":{"server":"localhost:5432"},"value_column":"tx_packets"}],"sql":"SELECT interface\n ,tx_bytes\n ,tx_packets\n ,rx_bytes\n ,rx_packets from monitor.proc_network_stats()\n"},{"metrics":[{"description":"Total number of database processes","metric_name":"ccp_nodemx_process_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT monitor.cgroup_process_count() as count;\n"},{"metrics":[{"description":"Epoch time when stats were reset","metric_name":"ccp_pg_stat_statements_reset_time","static_attributes":{"server":"localhost:5432"},"value_column":"time"}],"sql":"SELECT monitor.pg_stat_statements_reset_info(-1) as time;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Average query runtime in milliseconds","metric_name":"ccp_pg_stat_statements_top_mean_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"top_mean_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time\n , s.mean_exec_time AS mean_exec_time\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , max(monitor.mean_exec_time) AS top_mean_exec_time_ms\nFROM monitor GROUP BY 1,2,3,4 ORDER BY 5 DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["dbname","role"],"description":"Total number of queries run per user/database","metric_name":"ccp_pg_stat_statements_total_calls_count","static_attributes":{"server":"localhost:5432"},"value_column":"calls_count","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total runtime of all queries per user/database","metric_name":"ccp_pg_stat_statements_total_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"exec_time_ms","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total runtime of all queries per user/database","metric_name":"ccp_pg_stat_statements_total_mean_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"mean_exec_time_ms","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total rows returned from all queries per user/database","metric_name":"ccp_pg_stat_statements_total_row_count","static_attributes":{"server":"localhost:5432"},"value_column":"row_count","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.calls\n , s.total_exec_time\n , s.mean_exec_time\n , s.rows\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , sum(calls) AS calls_count\n , sum(total_exec_time) AS exec_time_ms\n , avg(mean_exec_time) AS mean_exec_time_ms\n , sum(rows) AS row_count\nFROM monitor GROUP BY 1,2;\n"},{"metrics":[{"description":"The current version of PostgreSQL that this exporter is running on as a 6 digit integer (######).","metric_name":"ccp_postgresql_version_current","static_attributes":{"server":"localhost:5432"},"value_column":"current"}],"sql":"SELECT current_setting('server_version_num')::int AS current;\n"},{"metrics":[{"description":"Time interval 
in seconds since PostgreSQL database was last restarted.","metric_name":"ccp_postmaster_uptime_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"seconds","value_type":"double"}],"sql":"SELECT extract(epoch from (clock_timestamp() - pg_postmaster_start_time() )) AS seconds;\n"},{"metrics":[{"description":"Replication lag in bytes between the primary and this replica.","metric_name":"ccp_replication_lag_size_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"bytes","value_type":"double"}],"sql":"SELECT * FROM get_replication_lag();\n"},{"metrics":[{"attribute_columns":["role"],"description":"Length of time since the last WAL file was received and replayed on replica.\nAlways increases, possibly causing false positives if the primary stops writing.\nMonitors for replicas that stop receiving WAL all together.\n","metric_name":"ccp_replication_lag_received_time","static_attributes":{"server":"localhost:5432"},"value_column":"received_time","value_type":"double"},{"attribute_columns":["role"],"description":"Length of time since the last transaction was replayed on replica.\nReturns zero if last WAL received equals last WAL replayed. Avoids\nfalse positives when primary stops writing. Monitors for replicas that\ncannot keep up with primary WAL generation.\n","metric_name":"ccp_replication_lag_replay_time","static_attributes":{"server":"localhost:5432"},"value_column":"replay_time","value_type":"double"}],"sql":"SELECT\n COALESCE(\n CASE\n WHEN (pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn()) OR (pg_is_in_recovery() = false) THEN 0\n ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER\n END,\n 0\n ) AS replay_time,\n COALESCE(\n CASE\n WHEN pg_is_in_recovery() = false THEN 0\n ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER\n END,\n 0\n ) AS received_time,\n CASE\n WHEN pg_is_in_recovery() = true THEN 'replica'\n ELSE 'primary'\n END AS role;\n"},{"metrics":[{"description":"Number of settings from pg_settings catalog in a pending_restart state","metric_name":"ccp_settings_pending_restart_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT count(*) AS count FROM pg_catalog.pg_settings WHERE pending_restart = true;\n"},{"metrics":[{"description":"Number of buffers allocated","metric_name":"ccp_stat_bgwriter_buffers_alloc","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_alloc"},{"data_type":"sum","description":"Number of buffers written by the background writer","metric_name":"ccp_stat_bgwriter_buffers_clean","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_clean"},{"description":"Number of times the background writer stopped a cleaning scan because it had written too many buffers","metric_name":"ccp_stat_bgwriter_maxwritten_clean","static_attributes":{"server":"localhost:5432"},"value_column":"maxwritten_clean"}],"sql":"SELECT\n buffers_clean\n , maxwritten_clean\n , buffers_alloc\nFROM pg_catalog.pg_stat_bgwriter;\n"},{"metrics":[{"description":"Oldest current transaction ID in cluster","metric_name":"ccp_transaction_wraparound_oldest_current_xid","static_attributes":{"server":"localhost:5432"},"value_column":"oldest_current_xid"},{"description":"Percentage towards emergency autovacuum process 
starting","metric_name":"ccp_transaction_wraparound_percent_towards_emergency_autovac","static_attributes":{"server":"localhost:5432"},"value_column":"percent_towards_emergency_autovac"},{"description":"Percentage towards transaction ID wraparound","metric_name":"ccp_transaction_wraparound_percent_towards_wraparound","static_attributes":{"server":"localhost:5432"},"value_column":"percent_towards_wraparound"}],"sql":"WITH max_age AS (\n SELECT 2000000000 as max_old_xid\n , setting AS autovacuum_freeze_max_age\n FROM pg_catalog.pg_settings\n WHERE name = 'autovacuum_freeze_max_age')\n, per_database_stats AS (\n SELECT datname\n , m.max_old_xid::int\n , m.autovacuum_freeze_max_age::int\n , age(d.datfrozenxid) AS oldest_current_xid\n FROM pg_catalog.pg_database d\n JOIN max_age m ON (true)\n WHERE d.datallowconn)\nSELECT max(oldest_current_xid) AS oldest_current_xid , max(ROUND(100*(oldest_current_xid/max_old_xid::float))) AS percent_towards_wraparound , max(ROUND(100*(oldest_current_xid/autovacuum_freeze_max_age::float))) AS percent_towards_emergency_autovac FROM per_database_stats;\n"},{"metrics":[{"description":"Current size in bytes of the WAL directory","metric_name":"ccp_wal_activity_total_size_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"total_size_bytes"}],"sql":"SELECT last_5_min_size_bytes,\n (SELECT COALESCE(sum(size),0) FROM pg_catalog.pg_ls_waldir()) AS total_size_bytes\n FROM (SELECT COALESCE(sum(size),0) AS last_5_min_size_bytes FROM pg_catalog.pg_ls_waldir() WHERE modification \u003e CURRENT_TIMESTAMP - '5 minutes'::interval) x;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Epoch time when stats were reset","metric_name":"ccp_pg_stat_statements_top_max_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"max_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time_ms\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , max_exec_time_ms\n , records\nFROM monitor ORDER BY 5 DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Total time spent in the statement in milliseconds","metric_name":"ccp_pg_stat_statements_top_total_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"total_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time_ms\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , total_exec_time_ms\n , records\nFROM monitor ORDER BY 5 DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Total amount of WAL generated by the statement in 
bytes","metric_name":"ccp_pg_stat_statements_top_wal_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"bytes","value_type":"double"},{"attribute_columns":["dbname","query","queryid","role"],"description":"Total number of WAL full page images generated by the statement","metric_name":"ccp_pg_stat_statements_top_wal_fpi","static_attributes":{"server":"localhost:5432"},"value_column":"fpi","value_type":"double"},{"attribute_columns":["dbname","query","queryid","role"],"description":"Total number of WAL records generated by the statement","metric_name":"ccp_pg_stat_statements_top_wal_records","static_attributes":{"server":"localhost:5432"},"value_column":"records","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time\n , s.mean_exec_time AS mean_exec_time\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , query\n , queryid\n , records\n , fpi\n , bytes\nFROM monitor ORDER BY bytes DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["repo"],"description":"Seconds since the last completed full or differential backup. Differential is always based off last full.","metric_name":"ccp_backrest_last_diff_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_diff_backup"},{"attribute_columns":["repo"],"description":"Seconds since the last completed full backup","metric_name":"ccp_backrest_last_full_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_full_backup"},{"attribute_columns":["repo"],"description":"Seconds since the last completed full, differential or incremental backup.\nIncremental is always based off last full or differential.\n","metric_name":"ccp_backrest_last_incr_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_incr_backup"},{"attribute_columns":["backup_type","repo"],"description":"pgBackRest version number when this backup was performed","metric_name":"ccp_backrest_last_info_backrest_repo_version","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_info_backrest_repo_version"},{"attribute_columns":["backup_type","repo"],"description":"An error has been encountered in the backup. 
Check logs for more information.","metric_name":"ccp_backrest_last_info_backup_error","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_info_backup_error"},{"attribute_columns":["backup_type","repo"],"description":"Total runtime in seconds of this backup","metric_name":"ccp_backrest_last_info_backup_runtime_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"backup_runtime_seconds"},{"attribute_columns":["backup_type","repo"],"description":"Actual size of only this individual backup in the pgbackrest repository","metric_name":"ccp_backrest_last_info_repo_backup_size_bytes","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"repo_backup_size_bytes"},{"attribute_columns":["backup_type","repo"],"description":"Total size of this backup in the pgbackrest repository, including all required previous backups and WAL","metric_name":"ccp_backrest_last_info_repo_total_size_bytes","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"repo_total_size_bytes"},{"attribute_columns":["repo"],"description":"Seconds since the oldest completed full backup","metric_name":"ccp_backrest_oldest_full_backup_time_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"oldest_full_backup"}],"sql":"SELECT * FROM get_pgbackrest_info();\n"}] diff --git a/internal/collector/gte_pg16_metrics.yaml b/internal/collector/gte_pg16_metrics.yaml new file mode 100644 index 0000000000..319aad62dc --- /dev/null +++ b/internal/collector/gte_pg16_metrics.yaml @@ -0,0 +1,127 @@ +# This list of queries configures an OTel SQL Query Receiver to read pgMonitor +# metrics from Postgres. +# +# https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/-/receiver/sqlqueryreceiver#metrics-queries +# https://github.com/CrunchyData/pgmonitor/blob/development/sql_exporter/common/crunchy_global_collector.yml + +# NOTE: Some of the columns below can return NULL values, for which sqlqueryreceiver will warn. +# https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/sqlqueryreceiver#null-values +# Those columns are idx_scan and idx_tup_fetch and we avoid NULL by using COALESCE. 
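As a rough sketch of the NULL case called out above (assuming a hypothetical table with no indexes, scratch_no_index: with zero index rows to aggregate, the view reports NULL for idx_scan and idx_tup_fetch), COALESCE is what keeps the receiver from warning:

-- Hypothetical table with no indexes; its idx_scan/idx_tup_fetch read back as NULL.
CREATE TABLE scratch_no_index (id int);

-- Without COALESCE the sqlqueryreceiver warns on the NULL columns; with it they become 0.
SELECT relname
     , idx_scan                      -- NULL here
     , COALESCE(idx_scan, 0)      AS idx_scan_safe
     , COALESCE(idx_tup_fetch, 0) AS idx_tup_fetch_safe
  FROM pg_catalog.pg_stat_user_tables
 WHERE relname = 'scratch_no_index';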
+ - sql: > + SELECT + current_database() as dbname + , p.schemaname + , p.relname + , p.seq_scan + , p.seq_tup_read + , COALESCE(p.idx_scan, 0) AS idx_scan + , COALESCE(p.idx_tup_fetch, 0) as idx_tup_fetch + , p.n_tup_ins + , p.n_tup_upd + , p.n_tup_del + , p.n_tup_hot_upd + , p.n_tup_newpage_upd + , p.n_live_tup + , p.n_dead_tup + , p.vacuum_count + , p.autovacuum_count + , p.analyze_count + , p.autoanalyze_count + FROM pg_catalog.pg_stat_user_tables p; + metrics: + - metric_name: ccp_stat_user_tables_analyze_count + data_type: sum + value_column: analyze_count + description: Number of times this table has been manually analyzed + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_autoanalyze_count + data_type: sum + value_column: autoanalyze_count + description: Number of times this table has been analyzed by the autovacuum daemon + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_autovacuum_count + data_type: sum + value_column: autovacuum_count + description: Number of times this table has been vacuumed by the autovacuum daemon + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_idx_scan + data_type: sum + value_column: idx_scan + description: Number of index scans initiated on this table + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_idx_tup_fetch + data_type: sum + value_column: idx_tup_fetch + description: Number of live rows fetched by index scans + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_n_dead_tup + value_column: n_dead_tup + description: Estimated number of dead rows + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_n_live_tup + value_column: n_live_tup + description: Estimated number of live rows + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_n_tup_del + data_type: sum + value_column: n_tup_del + description: Number of rows deleted + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_n_tup_hot_upd + data_type: sum + value_column: n_tup_hot_upd + description: Number of rows HOT updated (i.e., with no separate index update required) + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_n_tup_ins + data_type: sum + value_column: n_tup_ins + description: Number of rows inserted + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_n_tup_upd + data_type: sum + value_column: n_tup_upd + description: Number of rows updated + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_seq_scan + data_type: sum + value_column: seq_scan + description: Number of sequential scans initiated on this table + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: 
"localhost:5432" + - metric_name: ccp_stat_user_tables_seq_tup_read + data_type: sum + value_column: seq_tup_read + description: Number of live rows fetched by sequential scans + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_vacuum_count + data_type: sum + value_column: vacuum_count + description: Number of times this table has been manually vacuumed (not counting VACUUM FULL) + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" diff --git a/internal/collector/gte_pg17_metrics.yaml b/internal/collector/gte_pg17_metrics.yaml new file mode 100644 index 0000000000..de8f6786f5 --- /dev/null +++ b/internal/collector/gte_pg17_metrics.yaml @@ -0,0 +1,72 @@ +# This list of queries configures an OTel SQL Query Receiver to read pgMonitor +# metrics from Postgres. +# +# https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/-/receiver/sqlqueryreceiver#metrics-queries +# https://github.com/CrunchyData/pgmonitor/blob/development/sql_exporter/common/crunchy_global_collector.yml + + - sql: > + SELECT c.buffers_written + FROM pg_catalog.pg_stat_checkpointer c; + metrics: + - metric_name: ccp_stat_bgwriter_buffers_checkpoint + value_column: buffers_written + data_type: sum + description: Number of buffers written during checkpoints and restartpoints + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT + s.writes + , s.fsyncs + FROM pg_catalog.pg_stat_io s + WHERE backend_type = 'background writer'; + metrics: + - metric_name: ccp_stat_bgwriter_buffers_backend + value_column: writes + data_type: sum + description: Number of write operations, each of the size specified in op_bytes. + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_bgwriter_buffers_backend_fsync + value_column: fsyncs + data_type: sum + description: Number of fsync calls. These are only tracked in context normal. 
+ static_attributes: + server: "localhost:5432" + + - sql: > + SELECT + c.num_timed + , c.num_requested + , c.write_time + , c.sync_time + , c.buffers_written + FROM pg_catalog.pg_stat_checkpointer c; + metrics: + - metric_name: ccp_stat_bgwriter_checkpoint_sync_time + value_column: sync_time + description: Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in milliseconds + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_bgwriter_checkpoint_write_time + value_column: write_time + value_type: double + description: Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in milliseconds + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_bgwriter_checkpoints_req + value_column: num_requested + description: Number of requested checkpoints that have been performed + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_bgwriter_checkpoints_timed + value_column: num_timed + description: Number of scheduled checkpoints that have been performed + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_checkpointer_buffers_written + description: Number of buffers written during checkpoints and restartpoints + value_column: buffers_written + static_attributes: + server: "localhost:5432" diff --git a/internal/collector/lt_pg16_metrics.yaml b/internal/collector/lt_pg16_metrics.yaml new file mode 100644 index 0000000000..ca9fe8a0c8 --- /dev/null +++ b/internal/collector/lt_pg16_metrics.yaml @@ -0,0 +1,135 @@ +# This list of queries configures an OTel SQL Query Receiver to read pgMonitor +# metrics from Postgres. +# +# https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/-/receiver/sqlqueryreceiver#metrics-queries +# https://github.com/CrunchyData/pgmonitor/blob/development/sql_exporter/common/crunchy_global_collector.yml + +# NOTE: Some of the columns below can return NULL values, for which sqlqueryreceiver will warn. +# https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/sqlqueryreceiver#null-values +# Those columns are idx_scan and idx_tup_fetch and we avoid NULL by using COALESCE. 
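One hedged aside before the query that follows: n_tup_newpage_upd only exists in pg_stat_user_tables starting with Postgres 16, so on the pre-16 servers this file targets the column is emitted as a typed constant rather than selected directly.

-- On Postgres 15 and earlier, selecting p.n_tup_newpage_upd fails with a
-- column-does-not-exist error, so the query below substitutes a constant of
-- the same type to keep the column list identical across versions:
SELECT 0::bigint AS n_tup_newpage_upd;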
+ - sql: > + SELECT + current_database() as dbname + , p.schemaname + , p.relname + , p.seq_scan + , p.seq_tup_read + , COALESCE(p.idx_scan, 0) AS idx_scan + , COALESCE(p.idx_tup_fetch, 0) as idx_tup_fetch + , p.n_tup_ins + , p.n_tup_upd + , p.n_tup_del + , p.n_tup_hot_upd + , 0::bigint AS n_tup_newpage_upd + , p.n_live_tup + , p.n_dead_tup + , p.vacuum_count + , p.autovacuum_count + , p.analyze_count + , p.autoanalyze_count + FROM pg_catalog.pg_stat_user_tables p; + metrics: + - metric_name: ccp_stat_user_tables_analyze_count + data_type: sum + value_column: analyze_count + description: Number of times this table has been manually analyzed + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_autoanalyze_count + data_type: sum + value_column: autoanalyze_count + description: Number of times this table has been analyzed by the autovacuum daemon + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_autovacuum_count + data_type: sum + value_column: autovacuum_count + description: Number of times this table has been vacuumed by the autovacuum daemon + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_idx_scan + data_type: sum + value_column: idx_scan + description: Number of index scans initiated on this table + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_idx_tup_fetch + data_type: sum + value_column: idx_tup_fetch + description: Number of live rows fetched by index scans + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_n_dead_tup + value_column: n_dead_tup + description: Estimated number of dead rows + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + # FIXME: This metric returns 0, when the query returns 1 for relname="pgbackrest_info",schemaname="pg_temp_33". + # The issue doesn't occur with gte_pg16. + - metric_name: ccp_stat_user_tables_n_live_tup + value_column: n_live_tup + description: Estimated number of live rows + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_n_tup_del + data_type: sum + value_column: n_tup_del + description: Number of rows deleted + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_n_tup_hot_upd + data_type: sum + value_column: n_tup_hot_upd + description: Number of rows HOT updated (i.e., with no separate index update required) + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + # FIXME: This metric returns 0, when the query returns 1 for relname="pgbackrest_info",schemaname="pg_temp_33". + # The issue doesn't occur with gte_pg16. 
+ - metric_name: ccp_stat_user_tables_n_tup_ins + data_type: sum + value_column: n_tup_ins + description: Number of rows inserted + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_n_tup_upd + data_type: sum + value_column: n_tup_upd + description: Number of rows updated + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + # FIXME: This metric returns 0, when the query returns 1 for relname="pgbackrest_info",schemaname="pg_temp_33". + # The issue doesn't occur with gte_pg16. + - metric_name: ccp_stat_user_tables_seq_scan + data_type: sum + value_column: seq_scan + description: Number of sequential scans initiated on this table + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + # FIXME: This metric returns 0, when the query returns 1 for relname="pgbackrest_info",schemaname="pg_temp_33". + # The issue doesn't occur with gte_pg16. + - metric_name: ccp_stat_user_tables_seq_tup_read + data_type: sum + value_column: seq_tup_read + description: Number of live rows fetched by sequential scans + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_vacuum_count + data_type: sum + value_column: vacuum_count + description: Number of times this table has been manually vacuumed (not counting VACUUM FULL) + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" diff --git a/internal/collector/lt_pg17_metrics.yaml b/internal/collector/lt_pg17_metrics.yaml new file mode 100644 index 0000000000..330ff7d798 --- /dev/null +++ b/internal/collector/lt_pg17_metrics.yaml @@ -0,0 +1,71 @@ +# This list of queries configures an OTel SQL Query Receiver to read pgMonitor +# metrics from Postgres. +# +# https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/-/receiver/sqlqueryreceiver#metrics-queries +# https://github.com/CrunchyData/pgmonitor/blob/development/sql_exporter/common/crunchy_global_collector.yml + + - sql: > + SELECT c.buffers_checkpoint AS buffers_written + FROM pg_catalog.pg_stat_bgwriter c; + metrics: + - metric_name: ccp_stat_bgwriter_buffers_checkpoint + value_column: buffers_written + data_type: sum + description: Number of buffers written during checkpoints and restartpoints + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT + s.buffers_backend AS writes + , s.buffers_backend_fsync AS fsyncs + FROM pg_catalog.pg_stat_bgwriter s; + metrics: + - metric_name: ccp_stat_bgwriter_buffers_backend + value_column: writes + data_type: sum + description: Number of write operations, each of the size specified in op_bytes. + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_bgwriter_buffers_backend_fsync + value_column: fsyncs + data_type: sum + description: Number of fsync calls. These are only tracked in context normal. 
+ static_attributes: + server: "localhost:5432" + + - sql: > + SELECT + c.checkpoints_timed AS num_timed + , c.checkpoints_req AS num_requested + , c.checkpoint_write_time AS write_time + , c.checkpoint_sync_time AS sync_time + , c.buffers_checkpoint AS buffers_written + FROM pg_catalog.pg_stat_bgwriter c; + metrics: + - metric_name: ccp_stat_bgwriter_checkpoints_timed + value_column: num_timed + description: Number of scheduled checkpoints that have been performed + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_bgwriter_checkpoints_req + value_column: num_requested + description: Number of requested checkpoints that have been performed + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_bgwriter_checkpoint_write_time + value_column: write_time + value_type: double + description: Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in milliseconds + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_bgwriter_checkpoint_sync_time + value_column: sync_time + description: Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in milliseconds + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_checkpointer_buffers_written + description: Number of buffers written during checkpoints and restartpoints + value_column: buffers_written + static_attributes: + server: "localhost:5432" diff --git a/internal/collector/naming.go b/internal/collector/naming.go index 3dad4205fa..4a414a9bad 100644 --- a/internal/collector/naming.go +++ b/internal/collector/naming.go @@ -9,5 +9,16 @@ const DebugExporter = "debug" const OneSecondBatchProcessor = "batch/1s" const SubSecondBatchProcessor = "batch/200ms" const Prometheus = "prometheus" -const Metrics = "metrics" +const PGBouncerMetrics = "metrics/pgbouncer" +const PostgresMetrics = "metrics/postgres" +const PatroniMetrics = "metrics/patroni" + const SqlQuery = "sqlquery" + +// For slow queries, we'll use pgMonitor's default 5 minute interval. +// https://github.com/CrunchyData/pgmonitor-extension/blob/main/sql/matviews/matviews.sql +const FiveMinuteSqlQuery = "sqlquery/300s" + +// We'll use pgMonitor's Prometheus collection interval for most queries. 
+// https://github.com/CrunchyData/pgmonitor/blob/development/prometheus/linux/crunchy-prometheus.yml +const FiveSecondSqlQuery = "sqlquery/5s" diff --git a/internal/collector/patroni.go b/internal/collector/patroni.go index 3199d9c0ea..1f0846eedb 100644 --- a/internal/collector/patroni.go +++ b/internal/collector/patroni.go @@ -133,7 +133,7 @@ func EnablePatroniMetrics(ctx context.Context, if feature.Enabled(ctx, feature.OpenTelemetryMetrics) { // Add Prometheus exporter outConfig.Exporters[Prometheus] = map[string]any{ - "endpoint": "0.0.0.0:8889", + "endpoint": "0.0.0.0:9187", } // Add Prometheus Receiver @@ -160,8 +160,12 @@ func EnablePatroniMetrics(ctx context.Context, } // Add Metrics Pipeline - outConfig.Pipelines[Metrics] = Pipeline{ + outConfig.Pipelines[PatroniMetrics] = Pipeline{ Receivers: []ComponentID{Prometheus}, + Processors: []ComponentID{ + SubSecondBatchProcessor, + CompactingProcessor, + }, Exporters: []ComponentID{Prometheus}, } } diff --git a/internal/collector/pgbouncer.go b/internal/collector/pgbouncer.go index 610843212b..59ba0b7495 100644 --- a/internal/collector/pgbouncer.go +++ b/internal/collector/pgbouncer.go @@ -172,7 +172,7 @@ func EnablePgBouncerMetrics(ctx context.Context, config *Config, sqlQueryUsernam if feature.Enabled(ctx, feature.OpenTelemetryMetrics) { // Add Prometheus exporter config.Exporters[Prometheus] = map[string]any{ - "endpoint": "0.0.0.0:8889", + "endpoint": "0.0.0.0:9187", } // Add SqlQuery Receiver @@ -184,8 +184,12 @@ func EnablePgBouncerMetrics(ctx context.Context, config *Config, sqlQueryUsernam } // Add Metrics Pipeline - config.Pipelines[Metrics] = Pipeline{ + config.Pipelines[PGBouncerMetrics] = Pipeline{ Receivers: []ComponentID{SqlQuery}, + Processors: []ComponentID{ + SubSecondBatchProcessor, + CompactingProcessor, + }, Exporters: []ComponentID{Prometheus}, } } diff --git a/internal/collector/pgbouncer_metrics_queries.yaml b/internal/collector/pgbouncer_metrics_queries.yaml index d1ab237d63..228fef1cc0 100644 --- a/internal/collector/pgbouncer_metrics_queries.yaml +++ b/internal/collector/pgbouncer_metrics_queries.yaml @@ -11,43 +11,45 @@ attribute_columns: ["database", "user", "state", "application_name", "link"] description: "Current waiting time in seconds" + # NOTE: Avoid collecting "host" column because it can be null; the collector will warn against null. + # The host column should always point either to pgBouncer's virtual database (the null case) or to the primary. 
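A minimal way to see the null case from the note, assuming access to pgBouncer's admin console (commonly psql -p 6432 pgbouncer): the virtual pgbouncer database row carries no host value, which is why host is dropped from attribute_columns below.

-- Run against the pgBouncer admin console, not Postgres.
SHOW DATABASES;
-- The row named "pgbouncer" (the admin/virtual database) has a NULL host;
-- ordinary entries report the backend they forward to, i.e. the primary.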
- sql: "SHOW DATABASES" metrics: - metric_name: ccp_pgbouncer_databases_pool_size value_column: pool_size - attribute_columns: ["name", "host", "port", "database", "force_user", "pool_mode"] + attribute_columns: ["name", "port", "database", "force_user", "pool_mode"] description: "Maximum number of server connections" - metric_name: ccp_pgbouncer_databases_min_pool_size value_column: min_pool_size - attribute_columns: ["name", "host", "port", "database", "force_user", "pool_mode"] + attribute_columns: ["name", "port", "database", "force_user", "pool_mode"] description: "Minimum number of server connections" - metric_name: ccp_pgbouncer_databases_reserve_pool value_column: reserve_pool - attribute_columns: ["name", "host", "port", "database", "force_user", "pool_mode"] + attribute_columns: ["name", "port", "database", "force_user", "pool_mode"] description: "Maximum number of additional connections for this database" - metric_name: ccp_pgbouncer_databases_max_connections value_column: max_connections - attribute_columns: ["name", "host", "port", "database", "force_user", "pool_mode"] + attribute_columns: ["name", "port", "database", "force_user", "pool_mode"] description: >- Maximum number of allowed connections for this database, as set by max_db_connections, either globally or per database - metric_name: ccp_pgbouncer_databases_current_connections value_column: current_connections - attribute_columns: ["name", "host", "port", "database", "force_user", "pool_mode"] + attribute_columns: ["name", "port", "database", "force_user", "pool_mode"] description: "Current number of connections for this database" - metric_name: ccp_pgbouncer_databases_paused value_column: paused - attribute_columns: ["name", "host", "port", "database", "force_user", "pool_mode"] + attribute_columns: ["name", "port", "database", "force_user", "pool_mode"] description: "1 if this database is currently paused, else 0" - metric_name: ccp_pgbouncer_databases_disabled value_column: disabled - attribute_columns: ["name", "host", "port", "database", "force_user", "pool_mode"] + attribute_columns: ["name", "port", "database", "force_user", "pool_mode"] description: "1 if this database is currently disabled, else 0" - sql: "SHOW LISTS" diff --git a/internal/collector/postgres.go b/internal/collector/postgres.go index 544f0e9feb..416c27ecda 100644 --- a/internal/collector/postgres.go +++ b/internal/collector/postgres.go @@ -23,8 +23,9 @@ func NewConfigForPostgresPod(ctx context.Context, ) *Config { config := NewConfig(inCluster.Spec.Instrumentation) - EnablePatroniLogging(ctx, inCluster, config) + EnablePostgresMetrics(ctx, inCluster, config) EnablePatroniMetrics(ctx, inCluster, config) + EnablePatroniLogging(ctx, inCluster, config) EnablePostgresLogging(ctx, inCluster, config, outParameters) return config diff --git a/internal/collector/postgres_5m_metrics.yaml b/internal/collector/postgres_5m_metrics.yaml new file mode 100644 index 0000000000..9f5c3212dc --- /dev/null +++ b/internal/collector/postgres_5m_metrics.yaml @@ -0,0 +1,143 @@ +# This list of queries configures an OTel SQL Query Receiver to read pgMonitor +# metrics from Postgres. 
+# +# https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/-/receiver/sqlqueryreceiver#metrics-queries +# https://github.com/CrunchyData/pgmonitor/blob/development/sql_exporter/common/crunchy_global_collector.yml + - sql: > + SELECT datname as dbname + , pg_database_size(datname) as bytes + FROM pg_catalog.pg_database + WHERE datistemplate = false; + metrics: + - metric_name: ccp_database_size_bytes + value_column: bytes + description: Database size in bytes + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + + # Returns count of sequences that have used up 75% of what's available. + # https://github.com/CrunchyData/pgmonitor-extension/blob/main/sql/functions/functions.sql#L67 + # NOTE: Postgres 13 requires an alias, x below, where PG 17 doesn't. + - sql: > + SELECT count(*) AS count + FROM ( + SELECT CEIL((s.max_value-min_value::NUMERIC+1)/s.increment_by::NUMERIC) AS slots + , CEIL((COALESCE(s.last_value,s.min_value)-s.min_value::NUMERIC+1)/s.increment_by::NUMERIC) AS used + FROM pg_catalog.pg_sequences s + ) x + WHERE (ROUND(used/slots*100)::int) > 75; + metrics: + - metric_name: ccp_sequence_exhaustion_count + value_column: count + description: | + Count of sequences that have reached greater than or equal to 75% of their max available numbers. + Function monitor.sequence_status() can provide more details if run directly on system. + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT s.datname AS dbname + , s.xact_commit + , s.xact_rollback + , s.blks_read + , s.blks_hit + , s.tup_returned + , s.tup_fetched + , s.tup_inserted + , s.tup_updated + , s.tup_deleted + , s.conflicts + , s.temp_files + , s.temp_bytes + , s.deadlocks + FROM pg_catalog.pg_stat_database s + JOIN pg_catalog.pg_database d ON d.datname = s.datname + WHERE d.datistemplate = false; + metrics: + - metric_name: ccp_stat_database_blks_hit + value_column: blks_hit + description: Number of times disk blocks were found already in the buffer cache, so that a read was not necessary + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_database_blks_read + value_column: blks_read + description: Number of disk blocks read in this database + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_database_conflicts + value_column: conflicts + description: Number of queries canceled due to conflicts with recovery in this database + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_database_deadlocks + value_column: deadlocks + description: Number of deadlocks detected in this database + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_database_temp_bytes + value_column: temp_bytes + description: Total amount of data written to temporary files by queries in this database + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_database_temp_files + value_column: temp_files + description: Number of rows deleted by queries in this database + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + + - metric_name: ccp_stat_database_tup_deleted + value_column: tup_deleted + description: Number of rows deleted by queries in this database + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + + - metric_name: ccp_stat_database_tup_fetched + value_column: tup_fetched + 
description: Number of rows fetched by queries in this database + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + + - metric_name: ccp_stat_database_tup_inserted + value_column: tup_inserted + description: Number of rows inserted by queries in this database + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + + - metric_name: ccp_stat_database_tup_returned + value_column: tup_returned + description: Number of rows returned by queries in this database + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + + - metric_name: ccp_stat_database_tup_updated + value_column: tup_updated + description: Number of rows updated by queries in this database + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + + - metric_name: ccp_stat_database_xact_commit + value_column: xact_commit + description: Number of transactions in this database that have been committed + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + + - metric_name: ccp_stat_database_xact_rollback + value_column: xact_rollback + description: Number of transactions in this database that have been rolled back + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + diff --git a/internal/collector/postgres_5s_metrics.yaml b/internal/collector/postgres_5s_metrics.yaml new file mode 100644 index 0000000000..4f1a142782 --- /dev/null +++ b/internal/collector/postgres_5s_metrics.yaml @@ -0,0 +1,949 @@ +# This list of queries configures an OTel SQL Query Receiver to read pgMonitor +# metrics from Postgres. +# +# https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/-/receiver/sqlqueryreceiver#metrics-queries +# https://github.com/CrunchyData/pgmonitor/blob/development/sql_exporter/common/crunchy_global_collector.yml +# + # TODO ccp_pg_stat_activity can be removed after metrics are fully aligned with the latest pgMonitor + - sql: > + SELECT + pg_database.datname, + tmp.state, + COALESCE(tmp2.usename, '') as usename, + COALESCE(tmp2.application_name, '') as application_name, + COALESCE(count,0) as count, + COALESCE(max_tx_duration,0) as max_tx_duration + FROM + ( + VALUES ('active'), + ('idle'), + ('idle in transaction'), + ('idle in transaction (aborted)'), + ('fastpath function call'), + ('disabled') + ) AS tmp(state) CROSS JOIN pg_database + LEFT JOIN + ( + SELECT + datname, + state, + usename, + application_name, + count(*) AS count, + MAX(EXTRACT(EPOCH FROM now() - xact_start))::float AS max_tx_duration + FROM pg_stat_activity GROUP BY datname,state,usename,application_name) AS tmp2 + ON tmp.state = tmp2.state AND pg_database.datname = tmp2.datname; + metrics: + - metric_name: ccp_pg_stat_activity_count + value_column: count + description: number of connections in this state + attribute_columns: ["application_name", "datname", "state", "usename"] + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT + COALESCE(EXTRACT(epoch from (CURRENT_TIMESTAMP - last_archived_time)), 0) AS seconds_since_last_archive + FROM pg_catalog.pg_stat_archiver; + + metrics: + - metric_name: ccp_archive_command_status_seconds_since_last_archive + value_column: seconds_since_last_archive + value_type: double + description: Seconds since the last successful archive operation + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT archived_count + FROM pg_catalog.pg_stat_archiver + metrics: + - metric_name: ccp_archive_command_status_archived_count + 
value_column: archived_count + description: Number of WAL files that have been successfully archived + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT failed_count + FROM pg_catalog.pg_stat_archiver + metrics: + - metric_name: ccp_archive_command_status_failed_count + value_column: failed_count + description: Number of failed attempts for archiving WAL files + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT CASE + WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) IS NULL THEN 0 + WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) < 0 THEN 0 + ELSE EXTRACT(epoch from (last_failed_time - last_archived_time)) + END AS seconds_since_last_fail + FROM pg_catalog.pg_stat_archiver + + metrics: + - metric_name: ccp_archive_command_status_seconds_since_last_fail + value_column: seconds_since_last_fail + description: Seconds since the last recorded failure of the archive_command + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT ((total - idle) - idle_in_txn) as active + , total + , idle + , idle_in_txn + , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - state_change))),0) FROM pg_catalog.pg_stat_activity WHERE state = 'idle in transaction') AS max_idle_in_txn_time + , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - query_start))),0) FROM pg_catalog.pg_stat_activity WHERE backend_type = 'client backend' AND state <> 'idle' ) AS max_query_time + , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - query_start))),0) FROM pg_catalog.pg_stat_activity WHERE backend_type = 'client backend' AND wait_event_type = 'Lock' ) AS max_blocked_query_time + , max_connections + FROM ( + SELECT COUNT(*) as total + , COALESCE(SUM(CASE WHEN state = 'idle' THEN 1 ELSE 0 END),0) AS idle + , COALESCE(SUM(CASE WHEN state = 'idle in transaction' THEN 1 ELSE 0 END),0) AS idle_in_txn FROM pg_catalog.pg_stat_activity) x + JOIN (SELECT setting::float AS max_connections FROM pg_settings WHERE name = 'max_connections') xx ON (true); + + metrics: + - metric_name: ccp_connection_stats_active + value_column: active + description: Total non-idle connections + static_attributes: + server: "localhost:5432" + - metric_name: ccp_connection_stats_idle + value_column: idle + description: Total idle connections + static_attributes: + server: "localhost:5432" + - metric_name: ccp_connection_stats_idle_in_txn + value_column: idle_in_txn + description: Total idle in transaction connections + static_attributes: + server: "localhost:5432" + - metric_name: ccp_connection_stats_max_blocked_query_time + value_column: max_blocked_query_time + value_type: double + description: Value of max_connections for the monitored database + static_attributes: + server: "localhost:5432" + - metric_name: ccp_connection_stats_max_connections + value_column: max_connections + description: Value of max_connections for the monitored database + static_attributes: + server: "localhost:5432" + - metric_name: ccp_connection_stats_max_idle_in_txn_time + value_column: max_idle_in_txn_time + value_type: double + description: Length of time in seconds of the longest idle in transaction session + static_attributes: + server: "localhost:5432" + - metric_name: ccp_connection_stats_max_query_time + value_column: max_query_time + value_type: double + description: Length of time in seconds of the longest running query + static_attributes: + server: "localhost:5432" + - metric_name: ccp_connection_stats_total + value_column: total + description: Total idle and 
non-idle connections + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT datname AS dbname + , checksum_failures AS count + , coalesce(extract(epoch from (clock_timestamp() - checksum_last_failure)), 0) AS time_since_last_failure_seconds + FROM pg_catalog.pg_stat_database + WHERE pg_stat_database.datname IS NOT NULL; + metrics: + - metric_name: ccp_data_checksum_failure_count + value_column: count + attribute_columns: ["dbname"] + description: Total number of checksum failures on this database + static_attributes: + server: "localhost:5432" + - metric_name: ccp_data_checksum_failure_time_since_last_failure_seconds + value_column: time_since_last_failure_seconds + value_type: double + attribute_columns: ["dbname"] + description: Time interval in seconds since the last checksum failure was encountered + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT pg_database.datname as dbname + , tmp.mode + , COALESCE(count,0) as count + FROM + ( + VALUES ('accesssharelock'), + ('rowsharelock'), + ('rowexclusivelock'), + ('shareupdateexclusivelock'), + ('sharelock'), + ('sharerowexclusivelock'), + ('exclusivelock'), + ('accessexclusivelock') + ) AS tmp(mode) CROSS JOIN pg_catalog.pg_database + LEFT JOIN + (SELECT database, lower(mode) AS mode,count(*) AS count + FROM pg_catalog.pg_locks WHERE database IS NOT NULL + GROUP BY database, lower(mode) + ) AS tmp2 + ON tmp.mode=tmp2.mode and pg_database.oid = tmp2.database; + metrics: + - metric_name: ccp_locks_count + value_column: count + attribute_columns: ["dbname", "mode"] + description: Return value of 1 means database is in recovery. Otherwise 2 it is a primary. + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT monitor.kdapi_scalar_bigint('cpu_request') AS request + , monitor.kdapi_scalar_bigint('cpu_limit') AS limit + metrics: + - metric_name: ccp_nodemx_cpu_limit + value_column: limit + description: CPU limit value in milli cores + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_cpu_request + value_column: request + description: CPU request value in milli cores + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT CASE WHEN monitor.cgroup_mode() = 'legacy' + THEN monitor.cgroup_scalar_bigint('cpuacct.usage') + ELSE (SELECT val FROM monitor.cgroup_setof_kv('cpu.stat') where key = 'usage_usec') * 1000 + END AS usage, + extract(epoch from clock_timestamp()) AS usage_ts; + metrics: + - metric_name: ccp_nodemx_cpuacct_usage + value_column: usage + value_type: double + description: CPU usage in nanoseconds + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_cpuacct_usage_ts + value_column: usage_ts + value_type: double + description: CPU usage snapshot timestamp + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT + CASE + WHEN monitor.cgroup_mode() = 'legacy' THEN + monitor.cgroup_scalar_bigint('cpu.cfs_period_us') + ELSE + (monitor.cgroup_array_bigint('cpu.max'))[2] + END AS period_us, + CASE + WHEN monitor.cgroup_mode() = 'legacy' THEN + GREATEST(monitor.cgroup_scalar_bigint('cpu.cfs_quota_us'), 0) + ELSE + GREATEST((monitor.cgroup_array_bigint('cpu.max'))[1], 0) + END AS quota_us; + metrics: + - metric_name: ccp_nodemx_cpucfs_period_us + value_column: period_us + description: The total available run-time within a period (in microseconds) + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_cpucfs_quota_us + value_column: quota_us + value_type: double + description: The length of a period (in 
microseconds) + static_attributes: + server: "localhost:5432" + + # NOTE: cgroup v2 has throttled_usec, vs. throttled_time. + - sql: > + WITH d(key, val) AS + (select key, val from monitor.cgroup_setof_kv('cpu.stat')) + SELECT + (SELECT val FROM d WHERE key='nr_periods') AS nr_periods, + (SELECT val FROM d WHERE key='nr_throttled') AS nr_throttled, + (SELECT val FROM d WHERE key='throttled_usec') AS throttled_time, + extract(epoch from clock_timestamp()) as snap_ts; + metrics: + - metric_name: ccp_nodemx_cpustat_nr_periods + value_column: nr_periods + value_type: double + description: Number of periods that any thread was runnable + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_cpustat_nr_throttled + value_column: nr_throttled + description: Number of runnable periods in which the application used its entire quota and was throttled + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_cpustat_snap_ts + value_column: snap_ts + value_type: double + description: CPU stat snapshot timestamp + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_cpustat_throttled_time + value_column: throttled_time + value_type: double # TODO: Is this right? + description: Sum total amount of time individual threads within the monitor.cgroup were throttled + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT mount_point,fs_type,total_bytes,available_bytes,total_file_nodes,free_file_nodes + FROM monitor.proc_mountinfo() m + JOIN monitor.fsinfo(m.mount_point) f USING (major_number, minor_number) + WHERE m.mount_point IN ('/pgdata', '/pgwal') OR + m.mount_point like '/tablespaces/%' + metrics: + - metric_name: ccp_nodemx_data_disk_available_bytes + value_column: available_bytes + value_type: double + description: Available size in bytes + attribute_columns: ["fs_type", "mount_point"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_data_disk_free_file_nodes + value_column: free_file_nodes + description: Available file nodes + attribute_columns: ["fs_type", "mount_point"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_data_disk_total_bytes + value_column: total_bytes + description: Size in bytes + attribute_columns: ["fs_type", "mount_point"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_data_disk_total_file_nodes + value_column: total_file_nodes + description: Total file nodes + attribute_columns: ["fs_type", "mount_point"] + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT mount_point,sectors_read,sectors_written + FROM monitor.proc_mountinfo() m + JOIN monitor.proc_diskstats() d USING (major_number, minor_number) + WHERE m.mount_point IN ('/pgdata', '/pgwal') OR + m.mount_point like '/tablespaces/%'; + metrics: + - metric_name: ccp_nodemx_disk_activity_sectors_read + value_column: sectors_read + description: Total sectors read + attribute_columns: ["mount_point"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_disk_activity_sectors_written + value_column: sectors_written + description: Total sectors written + attribute_columns: ["mount_point"] + static_attributes: + server: "localhost:5432" + + - sql: > + WITH d(key, val) as (SELECT key, val FROM monitor.cgroup_setof_kv('memory.stat')) + SELECT + monitor.kdapi_scalar_bigint('mem_request') AS request, + CASE + WHEN monitor.cgroup_mode() = 'legacy' THEN + (CASE WHEN monitor.cgroup_scalar_bigint('memory.limit_in_bytes') = 9223372036854771712 THEN 0 
ELSE monitor.cgroup_scalar_bigint('memory.limit_in_bytes') END) + ELSE + (CASE WHEN monitor.cgroup_scalar_bigint('memory.max') = 9223372036854775807 THEN 0 ELSE monitor.cgroup_scalar_bigint('memory.max') END) + END AS limit, + CASE + WHEN monitor.cgroup_mode() = 'legacy' + THEN (SELECT val FROM d WHERE key='cache') + ELSE 0 + END as cache, + CASE + WHEN monitor.cgroup_mode() = 'legacy' + THEN (SELECT val FROM d WHERE key='rss') + ELSE 0 + END as RSS, + (SELECT val FROM d WHERE key='shmem') as shmem, + CASE + WHEN monitor.cgroup_mode() = 'legacy' + THEN (SELECT val FROM d WHERE key='mapped_file') + ELSE 0 + END as mapped_file, + CASE + WHEN monitor.cgroup_mode() = 'legacy' + THEN (SELECT val FROM d WHERE key='dirty') + ELSE (SELECT val FROM d WHERE key='file_dirty') + END as dirty, + (SELECT val FROM d WHERE key='active_anon') as active_anon, + (SELECT val FROM d WHERE key='inactive_anon') as inactive_anon, + (SELECT val FROM d WHERE key='active_file') as active_file, + (SELECT val FROM d WHERE key='inactive_file') as inactive_file, + CASE + WHEN monitor.cgroup_mode() = 'legacy' + THEN monitor.cgroup_scalar_bigint('memory.usage_in_bytes') + ELSE monitor.cgroup_scalar_bigint('memory.current') + END as usage_in_bytes, + CASE + WHEN monitor.cgroup_mode() = 'legacy' + THEN monitor.cgroup_scalar_bigint('memory.kmem.usage_in_bytes') + ELSE 0 + END as kmem_usage_in_byte; + metrics: + - metric_name: ccp_nodemx_mem_active_anon + value_column: active_anon + value_type: double + description: Total bytes of anonymous and swap cache memory on active LRU list + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_active_file + value_column: active_file + value_type: double + description: Total bytes of file-backed memory on active LRU list + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_cache + value_column: cache + value_type: double + description: Total bytes of page cache memory + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_dirty + value_column: dirty + description: Total bytes that are waiting to get written back to the disk + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_inactive_anon + value_column: inactive_anon + value_type: double + description: Total bytes of anonymous and swap cache memory on inactive LRU list + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_inactive_file + value_column: inactive_file + value_type: double + description: Total bytes of file-backed memory on inactive LRU list + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_kmem_usage_in_byte + value_column: kmem_usage_in_byte + description: Unknown metric from ccp_nodemx_mem + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_limit + value_column: limit + description: Memory limit value in bytes + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_mapped_file + value_column: mapped_file + description: Total bytes of mapped file (includes tmpfs/shmem) + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_request + value_column: request + description: Memory request value in bytes + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_rss + value_column: rss + value_type: double + description: Total bytes of anonymous and swap cache memory + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_shmem + value_column: 
shmem + value_type: double + description: Total bytes of shared memory + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_usage_in_bytes + value_column: usage_in_bytes + description: Total usage in bytes + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT interface + ,tx_bytes + ,tx_packets + ,rx_bytes + ,rx_packets from monitor.proc_network_stats() + metrics: + - metric_name: ccp_nodemx_network_rx_bytes + value_column: rx_bytes + description: Number of bytes received + attribute_columns: ["interface"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_network_rx_packets + value_column: rx_packets + description: Number of packets received + attribute_columns: ["interface"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_network_tx_bytes + value_column: tx_bytes + description: Number of bytes transmitted + attribute_columns: ["interface"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_network_tx_packets + value_column: tx_packets + description: Number of packets transmitted + attribute_columns: ["interface"] + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT monitor.cgroup_process_count() as count; + metrics: + - metric_name: ccp_nodemx_process_count + value_column: count + description: Total number of database processes + static_attributes: + server: "localhost:5432" + + # Setting pg_stat_statements_reset_info to -1 means update as often as possible. + - sql: > + SELECT monitor.pg_stat_statements_reset_info(-1) as time; + metrics: + - metric_name: ccp_pg_stat_statements_reset_time + value_column: time + description: Epoch time when stats were reset + static_attributes: + server: "localhost:5432" + + + # This query against pg_stat_statements is compatible with PG 13 and later. + # https://github.com/CrunchyData/pgmonitor-extension/blob/main/sql/functions/functions.sql + # TODO: Double-check the sorting and the attribute values on the below. 
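A quick compatibility check for the comment above, assuming pg_stat_statements is installed in the public schema as these queries expect: the *_exec_time columns only appear in pg_stat_statements 1.8 and later (the version shipped with Postgres 13); older releases name them total_time, min_time, max_time, and mean_time instead.

-- Lists the execution-time columns exposed by the installed pg_stat_statements view;
-- an empty result means the extension predates 1.8 and the queries below will fail.
SELECT a.attname
  FROM pg_catalog.pg_attribute a
 WHERE a.attrelid = 'public.pg_stat_statements'::regclass
   AND a.attname LIKE '%exec_time%'
   AND NOT a.attisdropped;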
+ - sql: > + WITH monitor AS ( + SELECT + pg_get_userbyid(s.userid) AS role + , d.datname AS dbname + , s.queryid AS queryid + , btrim(replace(left(s.query, 40), '\n', '')) AS query + , s.calls + , s.total_exec_time AS total_exec_time + , s.max_exec_time AS max_exec_time + , s.mean_exec_time AS mean_exec_time + , s.rows + , s.wal_records AS records + , s.wal_fpi AS fpi + , s.wal_bytes AS bytes + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d ON d.oid = s.dbid + ) + SELECT role + , dbname + , queryid + , query + , max(monitor.mean_exec_time) AS top_mean_exec_time_ms + FROM monitor + GROUP BY 1,2,3,4 + ORDER BY 5 DESC + LIMIT 20; + metrics: + - metric_name: ccp_pg_stat_statements_top_mean_exec_time_ms + value_column: top_mean_exec_time_ms + value_type: double + description: Average query runtime in milliseconds + attribute_columns: ["dbname", "query", "queryid", "role"] + static_attributes: + server: "localhost:5432" + + - sql: > + WITH monitor AS ( + SELECT + pg_get_userbyid(s.userid) AS role + , d.datname AS dbname + , s.calls + , s.total_exec_time + , s.mean_exec_time + , s.rows + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d ON d.oid = s.dbid + ) + SELECT role + , dbname + , sum(calls) AS calls_count + , sum(total_exec_time) AS exec_time_ms + , avg(mean_exec_time) AS mean_exec_time_ms + , sum(rows) AS row_count + FROM monitor + GROUP BY 1,2; + metrics: + - metric_name: ccp_pg_stat_statements_total_calls_count + value_column: calls_count + value_type: double + description: Total number of queries run per user/database + attribute_columns: ["dbname", "role"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_pg_stat_statements_total_exec_time_ms + value_column: exec_time_ms + value_type: double + description: Total runtime of all queries per user/database + attribute_columns: ["dbname", "role"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_pg_stat_statements_total_mean_exec_time_ms + value_column: mean_exec_time_ms + value_type: double + description: Total runtime of all queries per user/database + attribute_columns: ["dbname", "role"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_pg_stat_statements_total_row_count + value_column: row_count + value_type: double + description: Total rows returned from all queries per user/database + attribute_columns: ["dbname", "role"] + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT current_setting('server_version_num')::int AS current; + metrics: + - metric_name: ccp_postgresql_version_current + value_column: current + description: The current version of PostgreSQL that this exporter is running on as a 6 digit integer (######). + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT extract(epoch from (clock_timestamp() - pg_postmaster_start_time() )) AS seconds; + metrics: + - metric_name: ccp_postmaster_uptime_seconds + value_column: seconds + value_type: double + description: Time interval in seconds since PostgreSQL database was last restarted. + static_attributes: + server: "localhost:5432" + + # get_replication_lag is created in metrics_setup.sql + - sql: > + SELECT * FROM get_replication_lag(); + metrics: + - metric_name: ccp_replication_lag_size_bytes + value_column: bytes + value_type: double + description: Time interval in seconds since PostgreSQL database was last restarted. 
+ static_attributes: + server: "localhost:5432" + + - sql: > + SELECT + COALESCE( + CASE + WHEN (pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn()) OR (pg_is_in_recovery() = false) THEN 0 + ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER + END, + 0 + ) AS replay_time, + COALESCE( + CASE + WHEN pg_is_in_recovery() = false THEN 0 + ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER + END, + 0 + ) AS received_time, + CASE + WHEN pg_is_in_recovery() = true THEN 'replica' + ELSE 'primary' + END AS role; + metrics: + - metric_name: ccp_replication_lag_received_time + value_column: received_time + value_type: double + description: | + Length of time since the last WAL file was received and replayed on replica. + Always increases, possibly causing false positives if the primary stops writing. + Monitors for replicas that stop receiving WAL all together. + attribute_columns: ["role"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_replication_lag_replay_time + value_column: replay_time + value_type: double + description: | + Length of time since the last transaction was replayed on replica. + Returns zero if last WAL received equals last WAL replayed. Avoids + false positives when primary stops writing. Monitors for replicas that + cannot keep up with primary WAL generation. + attribute_columns: ["role"] + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT count(*) AS count FROM pg_catalog.pg_settings WHERE pending_restart = true; + metrics: + - metric_name: ccp_settings_pending_restart_count + value_column: count + description: Number of settings from pg_settings catalog in a pending_restart state + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT + buffers_clean + , maxwritten_clean + , buffers_alloc + FROM pg_catalog.pg_stat_bgwriter; + metrics: + - metric_name: ccp_stat_bgwriter_buffers_alloc + value_column: buffers_alloc + description: Number of buffers allocated + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_bgwriter_buffers_clean + value_column: buffers_clean + data_type: sum + description: Number of buffers written by the background writer + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_bgwriter_maxwritten_clean + value_column: maxwritten_clean + description: Number of times the background writer stopped a cleaning scan because it had written too many buffers + static_attributes: + server: "localhost:5432" + + - sql: > + WITH max_age AS ( + SELECT 2000000000 as max_old_xid + , setting AS autovacuum_freeze_max_age + FROM pg_catalog.pg_settings + WHERE name = 'autovacuum_freeze_max_age') + , per_database_stats AS ( + SELECT datname + , m.max_old_xid::int + , m.autovacuum_freeze_max_age::int + , age(d.datfrozenxid) AS oldest_current_xid + FROM pg_catalog.pg_database d + JOIN max_age m ON (true) + WHERE d.datallowconn) + SELECT max(oldest_current_xid) AS oldest_current_xid + , max(ROUND(100*(oldest_current_xid/max_old_xid::float))) AS percent_towards_wraparound + , max(ROUND(100*(oldest_current_xid/autovacuum_freeze_max_age::float))) AS percent_towards_emergency_autovac + FROM per_database_stats; + metrics: + - metric_name: ccp_transaction_wraparound_oldest_current_xid + value_column: oldest_current_xid + description: Oldest current transaction ID in cluster + static_attributes: + server: "localhost:5432" + - metric_name: ccp_transaction_wraparound_percent_towards_emergency_autovac + value_column: 
percent_towards_emergency_autovac + description: Percentage towards emergency autovacuum process starting + static_attributes: + server: "localhost:5432" + - metric_name: ccp_transaction_wraparound_percent_towards_wraparound + value_column: percent_towards_wraparound + description: Percentage towards transaction ID wraparound + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT last_5_min_size_bytes, + (SELECT COALESCE(sum(size),0) FROM pg_catalog.pg_ls_waldir()) AS total_size_bytes + FROM (SELECT COALESCE(sum(size),0) AS last_5_min_size_bytes FROM pg_catalog.pg_ls_waldir() WHERE modification > CURRENT_TIMESTAMP - '5 minutes'::interval) x; + metrics: + - metric_name: ccp_wal_activity_total_size_bytes + value_column: total_size_bytes + description: Current size in bytes of the WAL directory + static_attributes: + server: "localhost:5432" + + - sql: > + WITH monitor AS ( + SELECT + pg_get_userbyid(s.userid) AS role + , d.datname AS dbname + , s.queryid AS queryid + , btrim(replace(left(s.query, 40), '\n', '')) AS query + , s.calls + , s.total_exec_time AS total_exec_time + , s.max_exec_time AS max_exec_time_ms + , s.rows + , s.wal_records AS records + , s.wal_fpi AS fpi + , s.wal_bytes AS bytes + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d ON d.oid = s.dbid + ) + SELECT role + , dbname + , queryid + , query + , max_exec_time_ms + , records + FROM monitor + ORDER BY 5 DESC + LIMIT 20; + metrics: + - metric_name: ccp_pg_stat_statements_top_max_exec_time_ms + value_column: max_exec_time_ms + value_type: double + description: Epoch time when stats were reset + attribute_columns: ["dbname", "query", "queryid", "role"] + static_attributes: + server: "localhost:5432" + + - sql: > + WITH monitor AS ( + SELECT + pg_get_userbyid(s.userid) AS role + , d.datname AS dbname + , s.queryid AS queryid + , btrim(replace(left(s.query, 40), '\n', '')) AS query + , s.calls + , s.total_exec_time AS total_exec_time_ms + , s.rows + , s.wal_records AS records + , s.wal_fpi AS fpi + , s.wal_bytes AS bytes + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d ON d.oid = s.dbid + ) + SELECT role + , dbname + , queryid + , query + , total_exec_time_ms + , records + FROM monitor + ORDER BY 5 DESC + LIMIT 20; + metrics: + - metric_name: ccp_pg_stat_statements_top_total_exec_time_ms + value_column: total_exec_time_ms + value_type: double + description: Total time spent in the statement in milliseconds + attribute_columns: ["dbname", "query", "queryid", "role"] + static_attributes: + server: "localhost:5432" + + - sql: > + WITH monitor AS ( + SELECT + pg_get_userbyid(s.userid) AS role + , d.datname AS dbname + , s.queryid AS queryid + , btrim(replace(left(s.query, 40), '\n', '')) AS query + , s.calls + , s.total_exec_time AS total_exec_time + , s.max_exec_time AS max_exec_time + , s.mean_exec_time AS mean_exec_time + , s.rows + , s.wal_records AS records + , s.wal_fpi AS fpi + , s.wal_bytes AS bytes + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d ON d.oid = s.dbid + ) + SELECT role + , dbname + , query + , queryid + , records + , fpi + , bytes + FROM monitor + ORDER BY bytes DESC + LIMIT 20; + metrics: + - metric_name: ccp_pg_stat_statements_top_wal_bytes + value_column: bytes + value_type: double + description: Total amount of WAL generated by the statement in bytes + attribute_columns: ["dbname", "query", "queryid", "role"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_pg_stat_statements_top_wal_fpi + value_column: fpi + 
value_type: double + description: Total number of WAL full page images generated by the statement + attribute_columns: ["dbname", "query", "queryid", "role"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_pg_stat_statements_top_wal_records + value_column: records + value_type: double + description: Total number of WAL records generated by the statement + attribute_columns: ["dbname", "query", "queryid", "role"] + static_attributes: + server: "localhost:5432" + + - sql: | + SELECT * FROM get_pgbackrest_info(); + metrics: + - metric_name: ccp_backrest_last_diff_backup_time_since_completion_seconds + description: Seconds since the last completed full or differential backup. Differential is always based off last full. + value_column: last_diff_backup + attribute_columns: ["repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_last_full_backup_time_since_completion_seconds + description: Seconds since the last completed full backup + value_column: last_full_backup + attribute_columns: ["repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_last_incr_backup_time_since_completion_seconds + description: | + Seconds since the last completed full, differential or incremental backup. + Incremental is always based off last full or differential. + value_column: last_incr_backup + attribute_columns: ["repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_last_info_backrest_repo_version + description: pgBackRest version number when this backup was performed + value_column: last_info_backrest_repo_version + attribute_columns: ["backup_type", "repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_last_info_backup_error + description: An error has been encountered in the backup. Check logs for more information. + value_column: last_info_backup_error + attribute_columns: ["backup_type", "repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_last_info_backup_runtime_seconds + description: Total runtime in seconds of this backup + value_column: backup_runtime_seconds + attribute_columns: ["backup_type", "repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_last_info_repo_backup_size_bytes + description: Actual size of only this individual backup in the pgbackrest repository + value_column: repo_backup_size_bytes + attribute_columns: ["backup_type", "repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_last_info_repo_total_size_bytes + description: Total size of this backup in the pgbackrest repository, including all required previous backups and WAL + value_column: repo_total_size_bytes + attribute_columns: ["backup_type", "repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_oldest_full_backup_time_seconds + description: Seconds since the oldest completed full backup + value_column: oldest_full_backup + attribute_columns: ["repo"] + static_attributes: + server: "localhost:5432" diff --git a/internal/collector/postgres_metrics.go b/internal/collector/postgres_metrics.go new file mode 100644 index 0000000000..8377676813 --- /dev/null +++ b/internal/collector/postgres_metrics.go @@ -0,0 +1,107 @@ +// Copyright 2024 - 2025 Crunchy Data Solutions, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +package collector + +import ( + "context" + _ "embed" + "encoding/json" + "fmt" + "slices" + + "github.com/crunchydata/postgres-operator/internal/feature" + "github.com/crunchydata/postgres-operator/internal/pgmonitor" + "github.com/crunchydata/postgres-operator/pkg/apis/postgres-operator.crunchydata.com/v1beta1" +) + +// https://pkg.go.dev/embed +// +//go:embed "generated/postgres_5s_metrics.json" +var fiveSecondMetrics json.RawMessage + +//go:embed "generated/postgres_5m_metrics.json" +var fiveMinuteMetrics json.RawMessage + +//go:embed "generated/gte_pg17_metrics.json" +var gtePG17 json.RawMessage + +//go:embed "generated/lt_pg17_metrics.json" +var ltPG17 json.RawMessage + +//go:embed "generated/gte_pg16_metrics.json" +var gtePG16 json.RawMessage + +//go:embed "generated/lt_pg16_metrics.json" +var ltPG16 json.RawMessage + +func EnablePostgresMetrics(ctx context.Context, inCluster *v1beta1.PostgresCluster, config *Config) { + if feature.Enabled(ctx, feature.OpenTelemetryMetrics) { + if inCluster.Spec.PostgresVersion >= 17 { + fiveSecondMetrics, _ = appendToJSONArray(fiveSecondMetrics, gtePG17) + } else { + fiveSecondMetrics, _ = appendToJSONArray(fiveSecondMetrics, ltPG17) + } + + if inCluster.Spec.PostgresVersion >= 16 { + fiveSecondMetrics, _ = appendToJSONArray(fiveSecondMetrics, gtePG16) + } else { + fiveSecondMetrics, _ = appendToJSONArray(fiveSecondMetrics, ltPG16) + } + // Add Prometheus exporter + config.Exporters[Prometheus] = map[string]any{ + "endpoint": "0.0.0.0:9187", + } + + config.Receivers[FiveSecondSqlQuery] = map[string]any{ + "driver": "postgres", + "datasource": fmt.Sprintf(`host=localhost dbname=postgres port=5432 user=%s password=${env:PGPASSWORD}`, pgmonitor.MonitoringUser), + "collection_interval": "5s", + // Give Postgres time to finish setup. + "initial_delay": "10s", + "queries": slices.Clone(fiveSecondMetrics), + } + + config.Receivers[FiveMinuteSqlQuery] = map[string]any{ + "driver": "postgres", + "datasource": fmt.Sprintf(`host=localhost dbname=postgres port=5432 user=%s password=${env:PGPASSWORD}`, pgmonitor.MonitoringUser), + "collection_interval": "300s", + // Give Postgres time to finish setup. + "initial_delay": "10s", + "queries": slices.Clone(fiveMinuteMetrics), + } + // Add Metrics Pipeline + config.Pipelines[PostgresMetrics] = Pipeline{ + Receivers: []ComponentID{FiveSecondSqlQuery, FiveMinuteSqlQuery}, + Processors: []ComponentID{ + SubSecondBatchProcessor, + CompactingProcessor, + }, + Exporters: []ComponentID{Prometheus}, + } + } +} + +// appendToJSONArray appends elements of a json.RawMessage containing an array +// to another json.RawMessage containing an array. +func appendToJSONArray(a1, a2 json.RawMessage) (json.RawMessage, error) { + var slc1 []json.RawMessage + if err := json.Unmarshal(a1, &slc1); err != nil { + return nil, err + } + + var slc2 []json.RawMessage + if err := json.Unmarshal(a2, &slc2); err != nil { + return nil, err + } + + mergedSlice := append(slc1, slc2...) 
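+	// Marshaling the combined slice below returns a single JSON array holding
+	// the elements of a1 followed by the elements of a2.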
+ + merged, err := json.Marshal(mergedSlice) + if err != nil { + return nil, err + } + + return merged, nil +} diff --git a/internal/controller/postgrescluster/controller.go b/internal/controller/postgrescluster/controller.go index 5af8ba89ee..c200fa0e27 100644 --- a/internal/controller/postgrescluster/controller.go +++ b/internal/controller/postgrescluster/controller.go @@ -234,13 +234,13 @@ func (r *Reconciler) Reconcile( } pgHBAs := postgres.NewHBAs() - pgmonitor.PostgreSQLHBAs(cluster, &pgHBAs) + pgmonitor.PostgreSQLHBAs(ctx, cluster, &pgHBAs) pgbouncer.PostgreSQL(cluster, &pgHBAs) pgParameters := postgres.NewParameters() pgaudit.PostgreSQLParameters(&pgParameters) pgbackrest.PostgreSQL(cluster, &pgParameters, backupsSpecFound) - pgmonitor.PostgreSQLParameters(cluster, &pgParameters) + pgmonitor.PostgreSQLParameters(ctx, cluster, &pgParameters) otelConfig := collector.NewConfigForPostgresPod(ctx, cluster, &pgParameters) @@ -383,7 +383,7 @@ func (r *Reconciler) Reconcile( err = r.reconcilePGBouncer(ctx, cluster, instances, primaryCertificate, rootCA) } if err == nil { - err = r.reconcilePGMonitor(ctx, cluster, instances, monitoringSecret) + err = r.reconcilePGMonitorExporter(ctx, cluster, instances, monitoringSecret) } if err == nil { err = r.reconcileDatabaseInitSQL(ctx, cluster, instances) diff --git a/internal/controller/postgrescluster/instance.go b/internal/controller/postgrescluster/instance.go index 42e86e62cb..3bbd10b0c3 100644 --- a/internal/controller/postgrescluster/instance.go +++ b/internal/controller/postgrescluster/instance.go @@ -1201,16 +1201,30 @@ func (r *Reconciler) reconcileInstance( } if err == nil && - (feature.Enabled(ctx, feature.OpenTelemetryLogs) || feature.Enabled(ctx, feature.OpenTelemetryMetrics)) { + (feature.Enabled(ctx, feature.OpenTelemetryLogs) && !feature.Enabled(ctx, feature.OpenTelemetryMetrics)) { + // TODO: Setting the includeLogrotate argument to false for now. This // should be changed when we implement log rotation for postgres collector.AddToPod(ctx, cluster.Spec.Instrumentation, cluster.Spec.ImagePullPolicy, instanceConfigMap, &instance.Spec.Template.Spec, []corev1.VolumeMount{postgres.DataVolumeMount()}, "", false) } - // Add pgMonitor resources to the instance Pod spec + if err == nil && + feature.Enabled(ctx, feature.OpenTelemetryMetrics) { + + monitoringUserSecret := &corev1.Secret{ObjectMeta: naming.MonitoringUserSecret(cluster)} + err = errors.WithStack( + r.Client.Get(ctx, client.ObjectKeyFromObject(monitoringUserSecret), monitoringUserSecret)) + + if err == nil { + collector.AddToPod(ctx, cluster.Spec.Instrumentation, cluster.Spec.ImagePullPolicy, instanceConfigMap, &instance.Spec.Template.Spec, + []corev1.VolumeMount{postgres.DataVolumeMount()}, string(monitoringUserSecret.Data["password"]), false) + } + } + + // Add postgres-exporter to the instance Pod spec if err == nil { - err = addPGMonitorToInstancePodSpec(ctx, cluster, &instance.Spec.Template, exporterQueriesConfig, exporterWebConfig) + addPGMonitorExporterToInstancePodSpec(ctx, cluster, &instance.Spec.Template, exporterQueriesConfig, exporterWebConfig) } // add nss_wrapper init container and add nss_wrapper env vars to the database and pgbackrest diff --git a/internal/controller/postgrescluster/metrics_setup.sql b/internal/controller/postgrescluster/metrics_setup.sql new file mode 100644 index 0000000000..728de80c3e --- /dev/null +++ b/internal/controller/postgrescluster/metrics_setup.sql @@ -0,0 +1,222 @@ +-- +-- Copyright © 2017-2025 Crunchy Data Solutions, Inc. 
All Rights Reserved.
+--
+
+DO $$
+BEGIN
+    IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'ccp_monitoring') THEN
+        CREATE ROLE ccp_monitoring WITH LOGIN;
+    END IF;
+
+    -- The pgmonitor role is required by the pgnodemx extension in PostgreSQL versions 9.5 and 9.6
+    -- and should be removed when upgrading to PostgreSQL 10 and above.
+    IF EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'pgmonitor') THEN
+        DROP ROLE pgmonitor;
+    END IF;
+END
+$$;
+
+GRANT pg_monitor to ccp_monitoring;
+GRANT pg_execute_server_program TO ccp_monitoring;
+
+ALTER ROLE ccp_monitoring SET lock_timeout TO '2min';
+ALTER ROLE ccp_monitoring SET jit TO 'off';
+
+CREATE SCHEMA IF NOT EXISTS monitor AUTHORIZATION ccp_monitoring;
+
+DROP TABLE IF EXISTS monitor.pg_stat_statements_reset_info;
+-- Table to store last reset time for pg_stat_statements
+CREATE TABLE monitor.pg_stat_statements_reset_info(
+    reset_time timestamptz
+);
+
+DROP FUNCTION IF EXISTS monitor.pg_stat_statements_reset_info(int);
+-- Function to reset pg_stat_statements periodically
+CREATE FUNCTION monitor.pg_stat_statements_reset_info(p_throttle_minutes integer DEFAULT 1440)
+    RETURNS bigint
+    LANGUAGE plpgsql
+    SECURITY DEFINER
+    SET search_path TO pg_catalog, pg_temp
+AS $function$
+DECLARE
+
+    v_reset_timestamp timestamptz;
+    v_throttle interval;
+
+BEGIN
+
+    IF p_throttle_minutes < 0 THEN
+        RETURN 0;
+    END IF;
+
+    v_throttle := make_interval(mins := p_throttle_minutes);
+
+    SELECT COALESCE(max(reset_time), '1970-01-01'::timestamptz) INTO v_reset_timestamp FROM monitor.pg_stat_statements_reset_info;
+
+    IF ((CURRENT_TIMESTAMP - v_reset_timestamp) > v_throttle) THEN
+        -- Ensure table is empty
+        DELETE FROM monitor.pg_stat_statements_reset_info;
+        PERFORM pg_stat_statements_reset();
+        INSERT INTO monitor.pg_stat_statements_reset_info(reset_time) values (now());
+    END IF;
+
+    RETURN (SELECT extract(epoch from reset_time) FROM monitor.pg_stat_statements_reset_info);
+
+EXCEPTION
+    WHEN others then
+        RETURN 0;
+END
+$function$;
+
+GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA monitor TO ccp_monitoring;
+GRANT ALL ON ALL TABLES IN SCHEMA monitor TO ccp_monitoring;
+
+--- get_replication_lag is used by the OTel collector.
+--- get_replication_lag is created as a function, so that we can query without warning on a replica.
+CREATE OR REPLACE FUNCTION get_replication_lag() RETURNS TABLE(bytes NUMERIC) AS $$
+BEGIN
+    IF pg_is_in_recovery() THEN
+        RETURN QUERY SELECT 0::NUMERIC AS bytes;
+    ELSE
+        RETURN QUERY SELECT pg_wal_lsn_diff(sent_lsn, replay_lsn) AS bytes
+        FROM pg_catalog.pg_stat_replication;
+    END IF;
+END;
+$$ LANGUAGE plpgsql;
+
+--- get_pgbackrest_info is used by the OTel collector.
+--- get_pgbackrest_info is created as a function so that no ddl runs on a replica.
+--- In the query, the --stanza argument matches DefaultStanzaName, defined in internal/pgbackrest/config.go.
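+--- The collector calls it with a plain query, for example: SELECT * FROM get_pgbackrest_info();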
+CREATE OR REPLACE FUNCTION get_pgbackrest_info() +RETURNS TABLE ( + last_diff_backup BIGINT, + last_full_backup BIGINT, + last_incr_backup BIGINT, + last_info_backrest_repo_version TEXT, + last_info_backup_error INT, + backup_type TEXT, + backup_runtime_seconds BIGINT, + repo_backup_size_bytes TEXT, + repo_total_size_bytes TEXT, + oldest_full_backup BIGINT, + repo TEXT +) AS $$ +BEGIN + IF pg_is_in_recovery() THEN + RETURN QUERY + SELECT + 0::bigint AS last_diff_backup, + 0::bigint AS last_full_backup, + 0::bigint AS last_incr_backup, + '0' AS last_info_backrest_repo_version, + 0::int AS last_info_backup_error, + 'n/a'::text AS backup_type, + 0::bigint AS backup_runtime_seconds, + '0'::text AS repo_backup_size_bytes, + '0'::text AS repo_total_size_bytes, + 0::bigint AS oldest_full_backup, + 'n/a' AS repo; + ELSE + DROP TABLE IF EXISTS pgbackrest_info; + CREATE TEMPORARY TABLE pgbackrest_info (data json); + COPY pgbackrest_info (data) + FROM PROGRAM 'export LC_ALL=C && printf "\f" && pgbackrest info --log-level-console=info --log-level-stderr=warn --output=json --stanza=db && printf "\f"' + WITH (FORMAT csv, HEADER false, QUOTE E'\f'); + + RETURN QUERY + WITH + all_backups (data) AS ( + SELECT jsonb_array_elements(to_jsonb(data)) FROM pgbackrest_info + ), + stanza_backups (stanza, backup) AS ( + SELECT data->>'name', jsonb_array_elements(data->'backup') FROM all_backups + ), + ordered_backups (stanza, backup, seq_oldest, seq_newest) AS ( + SELECT stanza, backup, + ROW_NUMBER() OVER ( + PARTITION BY stanza, backup->'database'->>'repo-key', backup->>'type' + ORDER BY backup->'timestamp'->>'start' ASC, backup->'timestamp'->>'stop' ASC + ), + ROW_NUMBER() OVER ( + PARTITION BY stanza, backup->'database'->>'repo-key', backup->>'type' + ORDER BY backup->'timestamp'->>'start' DESC, backup->'timestamp'->>'stop' DESC + ) + FROM stanza_backups + ), + + ccp_backrest_last_info AS ( + SELECT + stanza, + split_part(backup->'backrest'->>'version', '.', 1) || lpad(split_part(backup->'backrest'->>'version', '.', 2), 2, '0') || lpad(coalesce(nullif(split_part(backup->'backrest'->>'version', '.', 3), ''), '00'), 2, '0') AS backrest_repo_version, + backup->'database'->>'repo-key' AS repo, + backup->>'type' AS backup_type, + backup->'info'->'repository'->>'delta' AS repo_backup_size_bytes, + backup->'info'->'repository'->>'size' AS repo_total_size_bytes, + (backup->'timestamp'->>'stop')::bigint - (backup->'timestamp'->>'start')::bigint AS backup_runtime_seconds, + CASE WHEN backup->>'error' = 'true' THEN 1 ELSE 0 END AS backup_error + FROM ordered_backups + WHERE seq_newest = 1 + ), + + ccp_backrest_oldest_full_backup AS ( + SELECT + stanza, + backup->'database'->>'repo-key' AS repo, + min((backup->'timestamp'->>'stop')::bigint) AS time_seconds + FROM ordered_backups + WHERE seq_oldest = 1 AND backup->>'type' IN ('full') + GROUP BY 1,2 + ), + + ccp_backrest_last_full_backup AS ( + SELECT + stanza, + backup->'database'->>'repo-key' AS repo, + EXTRACT(EPOCH FROM CURRENT_TIMESTAMP)::bigint - max((backup->'timestamp'->>'stop')::bigint) AS time_since_completion_seconds + FROM ordered_backups + WHERE seq_newest = 1 AND backup->>'type' IN ('full') + GROUP BY 1,2 + ), + + ccp_backrest_last_diff_backup AS ( + SELECT + stanza, + backup->'database'->>'repo-key' AS repo, + EXTRACT(EPOCH FROM CURRENT_TIMESTAMP)::bigint - max((backup->'timestamp'->>'stop')::bigint) AS time_since_completion_seconds + FROM ordered_backups + WHERE seq_newest = 1 AND backup->>'type' IN ('full','diff') + GROUP BY 1,2 + ), + + 
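+        -- The incremental CTE below considers full, diff, and incr backups, since an
+        -- incremental backup is always based on a prior full or differential backup.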
ccp_backrest_last_incr_backup AS ( + SELECT + stanza, + backup->'database'->>'repo-key' AS repo, + EXTRACT(EPOCH FROM CURRENT_TIMESTAMP)::bigint - max((backup->'timestamp'->>'stop')::bigint) AS time_since_completion_seconds + FROM ordered_backups + WHERE seq_newest = 1 AND backup->>'type' IN ('full','diff','incr') + GROUP BY 1,2 + ) + + SELECT + ccp_backrest_last_diff_backup.time_since_completion_seconds, + ccp_backrest_last_full_backup.time_since_completion_seconds, + ccp_backrest_last_incr_backup.time_since_completion_seconds, + ccp_backrest_last_info.backrest_repo_version, + ccp_backrest_last_info.backup_error, + ccp_backrest_last_info.backup_type, + ccp_backrest_last_info.backup_runtime_seconds, + ccp_backrest_last_info.repo_backup_size_bytes, + ccp_backrest_last_info.repo_total_size_bytes, + ccp_backrest_oldest_full_backup.time_seconds, + ccp_backrest_last_incr_backup.repo + FROM + ccp_backrest_last_diff_backup + JOIN ccp_backrest_last_full_backup ON ccp_backrest_last_diff_backup.stanza = ccp_backrest_last_full_backup.stanza AND ccp_backrest_last_diff_backup.repo = ccp_backrest_last_full_backup.repo + JOIN ccp_backrest_last_incr_backup ON ccp_backrest_last_diff_backup.stanza = ccp_backrest_last_incr_backup.stanza AND ccp_backrest_last_diff_backup.repo = ccp_backrest_last_incr_backup.repo + JOIN ccp_backrest_last_info ON ccp_backrest_last_diff_backup.stanza = ccp_backrest_last_info.stanza AND ccp_backrest_last_diff_backup.repo = ccp_backrest_last_info.repo + JOIN ccp_backrest_oldest_full_backup ON ccp_backrest_last_diff_backup.stanza = ccp_backrest_oldest_full_backup.stanza AND ccp_backrest_last_diff_backup.repo = ccp_backrest_oldest_full_backup.repo; + END IF; +END; +$$ LANGUAGE plpgsql; + diff --git a/internal/controller/postgrescluster/pgmonitor.go b/internal/controller/postgrescluster/pgmonitor.go index 956a99bffd..84b955559a 100644 --- a/internal/controller/postgrescluster/pgmonitor.go +++ b/internal/controller/postgrescluster/pgmonitor.go @@ -6,6 +6,7 @@ package postgrescluster import ( "context" + _ "embed" "fmt" "io" "os" @@ -27,17 +28,8 @@ import ( "github.com/crunchydata/postgres-operator/pkg/apis/postgres-operator.crunchydata.com/v1beta1" ) -// If pgMonitor is enabled the pgMonitor sidecar(s) have been added to the -// instance pod. reconcilePGMonitor will update the database to -// create the necessary objects for the tool to run -func (r *Reconciler) reconcilePGMonitor(ctx context.Context, - cluster *v1beta1.PostgresCluster, instances *observedInstances, - monitoringSecret *corev1.Secret) error { - - err := r.reconcilePGMonitorExporter(ctx, cluster, instances, monitoringSecret) - - return err -} +//go:embed "metrics_setup.sql" +var metricsSetupForOTelCollector string // reconcilePGMonitorExporter performs setup the postgres_exporter sidecar // - PodExec to run the sql in the primary database @@ -69,19 +61,24 @@ func (r *Reconciler) reconcilePGMonitorExporter(ctx context.Context, // We use this ImageID and the setup.sql file in the hash we make to see if the operator needs to rerun // the `EnableExporterInPostgreSQL` funcs; that way we are always running // that function against an updated and running pod. - if pgmonitor.ExporterEnabled(cluster) { + + if pgmonitor.ExporterEnabled(ctx, cluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) { sql, err := os.ReadFile(fmt.Sprintf("%s/pg%d/setup.sql", pgmonitor.GetQueriesConfigDir(ctx), cluster.Spec.PostgresVersion)) if err != nil { return err } - // TODO: Revisit how pgbackrest_info.sh is used with pgMonitor. 
- // pgMonitor queries expect a path to a script that runs pgBackRest - // info and provides json output. In the queries yaml for pgBackRest - // the default path is `/usr/bin/pgbackrest-info.sh`. We update - // the path to point to the script in our database image. - setup = strings.ReplaceAll(string(sql), "/usr/bin/pgbackrest-info.sh", - "/opt/crunchy/bin/postgres/pgbackrest_info.sh") + if feature.Enabled(ctx, feature.OpenTelemetryMetrics) { + setup = metricsSetupForOTelCollector + } else { + // TODO: Revisit how pgbackrest_info.sh is used with pgMonitor. + // pgMonitor queries expect a path to a script that runs pgBackRest + // info and provides json output. In the queries yaml for pgBackRest + // the default path is `/usr/bin/pgbackrest-info.sh`. We update + // the path to point to the script in our database image. + setup = strings.ReplaceAll(string(sql), "/usr/bin/pgbackrest-info.sh", + "/opt/crunchy/bin/postgres/pgbackrest_info.sh") + } for _, containerStatus := range writablePod.Status.ContainerStatuses { if containerStatus.Name == naming.ContainerDatabase { @@ -102,9 +99,9 @@ func (r *Reconciler) reconcilePGMonitorExporter(ctx context.Context, return pgmonitor.EnableExporterInPostgreSQL(ctx, exec, monitoringSecret, pgmonitor.ExporterDB, setup) } - if !pgmonitor.ExporterEnabled(cluster) { + if !pgmonitor.ExporterEnabled(ctx, cluster) && !feature.Enabled(ctx, feature.OpenTelemetryMetrics) { action = func(ctx context.Context, exec postgres.Executor) error { - return pgmonitor.DisableExporterInPostgreSQL(ctx, exec) + return pgmonitor.DisableMonitoringUserInPostgres(ctx, exec) } } @@ -160,12 +157,11 @@ func (r *Reconciler) reconcileMonitoringSecret( return nil, err } - if !pgmonitor.ExporterEnabled(cluster) { - // TODO: Checking if the exporter is enabled to determine when monitoring - // secret should be created. If more tools are added to the monitoring - // suite, they could need the secret when the exporter is not enabled. - // This check may need to be updated. - // Exporter is disabled; delete monitoring secret if it exists. + // Checking if the exporter is enabled or OpenTelemetryMetrics feature + // is enabled to determine when monitoring secret should be created, + // since our implementation of the SqlQuery receiver in the OTel Collector + // uses the monitoring user as well. 
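+	// Neither the exporter nor OpenTelemetry metrics are enabled, so delete
+	// the monitoring secret if it exists.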
+ if !pgmonitor.ExporterEnabled(ctx, cluster) && !feature.Enabled(ctx, feature.OpenTelemetryMetrics) { if err == nil { err = errors.WithStack(r.deleteControlled(ctx, cluster, existing)) } @@ -227,19 +223,6 @@ func (r *Reconciler) reconcileMonitoringSecret( return nil, err } -// addPGMonitorToInstancePodSpec performs the necessary setup to add -// pgMonitor resources on a PodTemplateSpec -func addPGMonitorToInstancePodSpec( - ctx context.Context, - cluster *v1beta1.PostgresCluster, - template *corev1.PodTemplateSpec, - exporterQueriesConfig, exporterWebConfig *corev1.ConfigMap) error { - - err := addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, exporterQueriesConfig, exporterWebConfig) - - return err -} - // addPGMonitorExporterToInstancePodSpec performs the necessary setup to // add pgMonitor exporter resources to a PodTemplateSpec // TODO (jmckulk): refactor to pass around monitoring secret; Without the secret @@ -249,10 +232,10 @@ func addPGMonitorExporterToInstancePodSpec( ctx context.Context, cluster *v1beta1.PostgresCluster, template *corev1.PodTemplateSpec, - exporterQueriesConfig, exporterWebConfig *corev1.ConfigMap) error { + exporterQueriesConfig, exporterWebConfig *corev1.ConfigMap) { - if !pgmonitor.ExporterEnabled(cluster) { - return nil + if !pgmonitor.ExporterEnabled(ctx, cluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) { + return } certSecret := cluster.Spec.Monitoring.PGMonitor.Exporter.CustomTLSSecret @@ -385,21 +368,23 @@ func addPGMonitorExporterToInstancePodSpec( // add the proper label to support Pod discovery by Prometheus per pgMonitor configuration initialize.Labels(template) template.Labels[naming.LabelPGMonitorDiscovery] = "true" - - return nil } // reconcileExporterWebConfig reconciles the configmap containing the webconfig for exporter tls func (r *Reconciler) reconcileExporterWebConfig(ctx context.Context, cluster *v1beta1.PostgresCluster) (*corev1.ConfigMap, error) { + if feature.Enabled(ctx, feature.OpenTelemetryMetrics) { + return nil, nil + } + existing := &corev1.ConfigMap{ObjectMeta: naming.ExporterWebConfigMap(cluster)} err := errors.WithStack(r.Client.Get(ctx, client.ObjectKeyFromObject(existing), existing)) if client.IgnoreNotFound(err) != nil { return nil, err } - if !pgmonitor.ExporterEnabled(cluster) || cluster.Spec.Monitoring.PGMonitor.Exporter.CustomTLSSecret == nil { + if !pgmonitor.ExporterEnabled(ctx, cluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) || cluster.Spec.Monitoring.PGMonitor.Exporter.CustomTLSSecret == nil { // We could still have a NotFound error here so check the err. // If no error that means the configmap is found and needs to be deleted if err == nil { @@ -456,7 +441,7 @@ func (r *Reconciler) reconcileExporterQueriesConfig(ctx context.Context, return nil, err } - if !pgmonitor.ExporterEnabled(cluster) { + if !pgmonitor.ExporterEnabled(ctx, cluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) { // We could still have a NotFound error here so check the err. 
// If no error that means the configmap is found and needs to be deleted if err == nil { diff --git a/internal/controller/postgrescluster/pgmonitor_test.go b/internal/controller/postgrescluster/pgmonitor_test.go index 36a5027aaa..bf46dd204b 100644 --- a/internal/controller/postgrescluster/pgmonitor_test.go +++ b/internal/controller/postgrescluster/pgmonitor_test.go @@ -39,7 +39,7 @@ func testExporterCollectorsAnnotation(t *testing.T, ctx context.Context, cluster naming.PostgresExporterCollectorsAnnotation: "wrong-value", }) - assert.NilError(t, addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, queriesConfig, webConfig)) + addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, queriesConfig, webConfig) assert.Equal(t, len(template.Spec.Containers), 1) container := template.Spec.Containers[0] @@ -56,7 +56,7 @@ func testExporterCollectorsAnnotation(t *testing.T, ctx context.Context, cluster naming.PostgresExporterCollectorsAnnotation: "None", }) - assert.NilError(t, addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, queriesConfig, webConfig)) + addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, queriesConfig, webConfig) assert.Equal(t, len(template.Spec.Containers), 1) container := template.Spec.Containers[0] @@ -71,7 +71,7 @@ func testExporterCollectorsAnnotation(t *testing.T, ctx context.Context, cluster naming.PostgresExporterCollectorsAnnotation: "none", }) - assert.NilError(t, addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, queriesConfig, webConfig)) + addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, queriesConfig, webConfig) assert.Assert(t, cmp.Contains(strings.Join(template.Spec.Containers[0].Command, "\n"), "--[no-]collector")) }) }) @@ -100,7 +100,7 @@ func TestAddPGMonitorExporterToInstancePodSpec(t *testing.T) { t.Run("ExporterDisabled", func(t *testing.T) { template := &corev1.PodTemplateSpec{} - assert.NilError(t, addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, nil, nil)) + addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, nil, nil) assert.DeepEqual(t, template, &corev1.PodTemplateSpec{}) }) @@ -121,8 +121,7 @@ func TestAddPGMonitorExporterToInstancePodSpec(t *testing.T) { }, } - assert.NilError(t, addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, exporterQueriesConfig, nil)) - + addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, exporterQueriesConfig, nil) assert.Equal(t, len(template.Spec.Containers), 2) container := template.Spec.Containers[1] @@ -205,8 +204,7 @@ volumeMounts: }, } - assert.NilError(t, addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, exporterQueriesConfig, nil)) - + addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, exporterQueriesConfig, nil) assert.Equal(t, len(template.Spec.Containers), 2) container := template.Spec.Containers[1] @@ -255,8 +253,7 @@ name: exporter-config }, } - assert.NilError(t, addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, exporterQueriesConfig, nil)) - + addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, exporterQueriesConfig, nil) assert.Equal(t, len(template.Spec.Containers), 2) container := template.Spec.Containers[1] @@ -301,8 +298,7 @@ name: exporter-config testConfigMap := new(corev1.ConfigMap) testConfigMap.Name = "test-web-conf" - assert.NilError(t, addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, exporterQueriesConfig, testConfigMap)) - + addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, exporterQueriesConfig, testConfigMap) 
assert.Equal(t, len(template.Spec.Containers), 2) container := template.Spec.Containers[1] diff --git a/internal/pgmonitor/postgres.go b/internal/pgmonitor/postgres.go index 292d116e30..08a428d465 100644 --- a/internal/pgmonitor/postgres.go +++ b/internal/pgmonitor/postgres.go @@ -10,6 +10,7 @@ import ( corev1 "k8s.io/api/core/v1" + "github.com/crunchydata/postgres-operator/internal/feature" "github.com/crunchydata/postgres-operator/internal/logging" "github.com/crunchydata/postgres-operator/internal/postgres" "github.com/crunchydata/postgres-operator/pkg/apis/postgres-operator.crunchydata.com/v1beta1" @@ -22,8 +23,8 @@ const ( // PostgreSQLHBAs provides the Postgres HBA rules for allowing the monitoring // exporter to be accessible -func PostgreSQLHBAs(inCluster *v1beta1.PostgresCluster, outHBAs *postgres.HBAs) { - if ExporterEnabled(inCluster) { +func PostgreSQLHBAs(ctx context.Context, inCluster *v1beta1.PostgresCluster, outHBAs *postgres.HBAs) { + if ExporterEnabled(ctx, inCluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) { // Limit the monitoring user to local connections using SCRAM. outHBAs.Mandatory = append(outHBAs.Mandatory, postgres.NewHBA().TCP().User(MonitoringUser).Method("scram-sha-256").Network("127.0.0.0/8"), @@ -34,8 +35,8 @@ func PostgreSQLHBAs(inCluster *v1beta1.PostgresCluster, outHBAs *postgres.HBAs) // PostgreSQLParameters provides additional required configuration parameters // that Postgres needs to support monitoring -func PostgreSQLParameters(inCluster *v1beta1.PostgresCluster, outParameters *postgres.Parameters) { - if ExporterEnabled(inCluster) { +func PostgreSQLParameters(ctx context.Context, inCluster *v1beta1.PostgresCluster, outParameters *postgres.Parameters) { + if ExporterEnabled(ctx, inCluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) { // Exporter expects that shared_preload_libraries are installed // pg_stat_statements: https://access.crunchydata.com/documentation/pgmonitor/latest/exporter/ // pgnodemx: https://github.com/CrunchyData/pgnodemx @@ -45,11 +46,11 @@ func PostgreSQLParameters(inCluster *v1beta1.PostgresCluster, outParameters *pos } } -// DisableExporterInPostgreSQL disables the exporter configuration in PostgreSQL. +// DisableMonitoringUserInPostgres disables the exporter configuration in PostgreSQL. // Currently the exporter is disabled by removing login permissions for the // monitoring user. 
// TODO: evaluate other uninstall/removal options -func DisableExporterInPostgreSQL(ctx context.Context, exec postgres.Executor) error { +func DisableMonitoringUserInPostgres(ctx context.Context, exec postgres.Executor) error { log := logging.FromContext(ctx) stdout, stderr, err := exec.Exec(ctx, strings.NewReader(` diff --git a/internal/pgmonitor/postgres_test.go b/internal/pgmonitor/postgres_test.go index b91e9ba125..3b6bff58de 100644 --- a/internal/pgmonitor/postgres_test.go +++ b/internal/pgmonitor/postgres_test.go @@ -5,6 +5,7 @@ package pgmonitor import ( + "context" "strings" "testing" @@ -15,10 +16,12 @@ import ( ) func TestPostgreSQLHBA(t *testing.T) { + ctx := context.Background() + t.Run("ExporterDisabled", func(t *testing.T) { inCluster := &v1beta1.PostgresCluster{} outHBAs := postgres.HBAs{} - PostgreSQLHBAs(inCluster, &outHBAs) + PostgreSQLHBAs(ctx, inCluster, &outHBAs) assert.Equal(t, len(outHBAs.Mandatory), 0) }) @@ -33,7 +36,7 @@ func TestPostgreSQLHBA(t *testing.T) { } outHBAs := postgres.HBAs{} - PostgreSQLHBAs(inCluster, &outHBAs) + PostgreSQLHBAs(ctx, inCluster, &outHBAs) assert.Equal(t, len(outHBAs.Mandatory), 3) assert.Equal(t, outHBAs.Mandatory[0].String(), `host all "ccp_monitoring" "127.0.0.0/8" scram-sha-256`) @@ -43,10 +46,12 @@ func TestPostgreSQLHBA(t *testing.T) { } func TestPostgreSQLParameters(t *testing.T) { + ctx := context.Background() + t.Run("ExporterDisabled", func(t *testing.T) { inCluster := &v1beta1.PostgresCluster{} outParameters := postgres.NewParameters() - PostgreSQLParameters(inCluster, &outParameters) + PostgreSQLParameters(ctx, inCluster, &outParameters) assert.Assert(t, !outParameters.Mandatory.Has("shared_preload_libraries")) }) @@ -61,7 +66,7 @@ func TestPostgreSQLParameters(t *testing.T) { } outParameters := postgres.NewParameters() - PostgreSQLParameters(inCluster, &outParameters) + PostgreSQLParameters(ctx, inCluster, &outParameters) libs, found := outParameters.Mandatory.Get("shared_preload_libraries") assert.Assert(t, found) assert.Assert(t, strings.Contains(libs, "pg_stat_statements")) @@ -80,7 +85,7 @@ func TestPostgreSQLParameters(t *testing.T) { outParameters := postgres.NewParameters() outParameters.Mandatory.Add("shared_preload_libraries", "daisy") - PostgreSQLParameters(inCluster, &outParameters) + PostgreSQLParameters(ctx, inCluster, &outParameters) libs, found := outParameters.Mandatory.Get("shared_preload_libraries") assert.Assert(t, found) assert.Assert(t, strings.Contains(libs, "pg_stat_statements")) diff --git a/internal/pgmonitor/util.go b/internal/pgmonitor/util.go index 8c89815829..32cf222448 100644 --- a/internal/pgmonitor/util.go +++ b/internal/pgmonitor/util.go @@ -8,6 +8,7 @@ import ( "context" "os" + "github.com/crunchydata/postgres-operator/internal/feature" "github.com/crunchydata/postgres-operator/internal/logging" "github.com/crunchydata/postgres-operator/pkg/apis/postgres-operator.crunchydata.com/v1beta1" ) @@ -26,7 +27,7 @@ func GetQueriesConfigDir(ctx context.Context) string { } // ExporterEnabled returns true if the monitoring exporter is enabled -func ExporterEnabled(cluster *v1beta1.PostgresCluster) bool { +func ExporterEnabled(ctx context.Context, cluster *v1beta1.PostgresCluster) bool { if cluster.Spec.Monitoring == nil { return false } @@ -36,5 +37,8 @@ func ExporterEnabled(cluster *v1beta1.PostgresCluster) bool { if cluster.Spec.Monitoring.PGMonitor.Exporter == nil { return false } + if feature.Enabled(ctx, feature.OpenTelemetryMetrics) { + return false + } return true } diff --git 
a/internal/pgmonitor/util_test.go b/internal/pgmonitor/util_test.go index 30d28b45d7..e83bbb3730 100644 --- a/internal/pgmonitor/util_test.go +++ b/internal/pgmonitor/util_test.go @@ -5,24 +5,34 @@ package pgmonitor import ( + "context" "testing" "gotest.tools/v3/assert" + "github.com/crunchydata/postgres-operator/internal/feature" "github.com/crunchydata/postgres-operator/pkg/apis/postgres-operator.crunchydata.com/v1beta1" ) func TestExporterEnabled(t *testing.T) { cluster := &v1beta1.PostgresCluster{} - assert.Assert(t, !ExporterEnabled(cluster)) + ctx := context.Background() + assert.Assert(t, !ExporterEnabled(ctx, cluster)) cluster.Spec.Monitoring = &v1beta1.MonitoringSpec{} - assert.Assert(t, !ExporterEnabled(cluster)) + assert.Assert(t, !ExporterEnabled(ctx, cluster)) cluster.Spec.Monitoring.PGMonitor = &v1beta1.PGMonitorSpec{} - assert.Assert(t, !ExporterEnabled(cluster)) + assert.Assert(t, !ExporterEnabled(ctx, cluster)) cluster.Spec.Monitoring.PGMonitor.Exporter = &v1beta1.ExporterSpec{} - assert.Assert(t, ExporterEnabled(cluster)) + assert.Assert(t, ExporterEnabled(ctx, cluster)) + gate := feature.NewGate() + assert.NilError(t, gate.SetFromMap(map[string]bool{ + feature.OpenTelemetryMetrics: true, + })) + ctx = feature.NewContext(ctx, gate) + cluster.Spec.Monitoring.PGMonitor.Exporter = &v1beta1.ExporterSpec{} + assert.Assert(t, !ExporterEnabled(ctx, cluster)) }