Skip to content

Commit 3f10a01

Browse files
committed
Add metrics/logic for per-db metrics
1 parent 6957050 commit 3f10a01

File tree

3 files changed

+232
-23
lines changed

3 files changed

+232
-23
lines changed

internal/collector/generated/postgres_5m_per_db_metrics.json

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
# This list of queries configures an OTel SQL Query Receiver to read pgMonitor
2+
# metrics from Postgres.
3+
#
4+
# https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/-/receiver/sqlqueryreceiver#metrics-queries
5+
# https://github.com/CrunchyData/pgmonitor/blob/v5.2.1/sql_exporter/common/crunchy_per_db_collector.yml
6+
#
7+
# Note: Several metrics in the `crunchy_per_db_collector` track the materialized views and
8+
# pgMonitor-extension version -- metrics that aren't meaningful in the CPK environment.
9+
# The list of metrics that fall into this category includes
10+
# * ccp_metric_matview_refresh_last_run_fail_count
11+
# * ccp_metric_matview_refresh_longest_runtime_seconds
12+
# * ccp_metric_matview_refresh_longest_runtime
13+
# * ccp_metric_table_refresh_longest_runtime
14+
# * ccp_pgmonitor_extension_per_db
15+
16+
- sql: >
17+
SELECT current_database() as dbname
18+
, n.nspname as schemaname
19+
, c.relname
20+
, pg_catalog.pg_total_relation_size(c.oid) as bytes
21+
FROM pg_catalog.pg_class c
22+
JOIN pg_catalog.pg_namespace n ON c.relnamespace = n.oid
23+
WHERE NOT pg_is_other_temp_schema(n.oid)
24+
AND relkind IN ('r', 'm', 'f');
25+
metrics:
26+
- metric_name: ccp_table_size_bytes
27+
value_type: double
28+
value_column: bytes
29+
description: "Table size in bytes including indexes"
30+
attribute_columns: ["dbname", "schemaname", "relname"]
31+
static_attributes:
32+
server: "localhost:5432"
33+
34+
- sql: >
35+
SELECT current_database() as dbname
36+
, p.schemaname
37+
, p.relname
38+
, p.seq_scan
39+
, p.seq_tup_read
40+
, COALESCE(p.idx_scan, 0) AS idx_scan
41+
, COALESCE(p.idx_tup_fetch, 0) as idx_tup_fetch
42+
, p.n_tup_ins
43+
, p.n_tup_upd
44+
, p.n_tup_del
45+
, p.n_tup_hot_upd
46+
, CASE
47+
WHEN current_setting('server_version_num')::int >= 160000
48+
THEN p.n_tup_newpage_upd
49+
ELSE 0::bigint
50+
END AS n_tup_newpage_upd
51+
, p.n_live_tup
52+
, p.n_dead_tup
53+
, p.vacuum_count
54+
, p.autovacuum_count
55+
, p.analyze_count
56+
, p.autoanalyze_count
57+
FROM pg_catalog.pg_stat_user_tables p;
58+
metrics:
59+
- metric_name: ccp_stat_user_tables_seq_scan
60+
data_type: sum
61+
value_column: seq_scan
62+
description: "Number of sequential scans initiated on this table"
63+
attribute_columns: ["dbname", "schemaname", "relname"]
64+
static_attributes:
65+
server: "localhost:5432"
66+
- metric_name: ccp_stat_user_tables_seq_tup_read
67+
data_type: sum
68+
value_column: seq_tup_read
69+
description: "Number of live rows fetched by sequential scans"
70+
attribute_columns: ["dbname", "schemaname", "relname"]
71+
static_attributes:
72+
server: "localhost:5432"
73+
- metric_name: ccp_stat_user_tables_idx_scan
74+
data_type: sum
75+
description: "Number of index scans initiated on this table"
76+
value_column: idx_scan
77+
static_attributes:
78+
server: "localhost:5432"
79+
attribute_columns: ["dbname", "schemaname", "relname"]
80+
- metric_name: ccp_stat_user_tables_idx_tup_fetch
81+
data_type: sum
82+
description: "Number of live rows fetched by index scans"
83+
value_column: idx_tup_fetch
84+
static_attributes:
85+
server: "localhost:5432"
86+
attribute_columns: ["dbname", "schemaname", "relname"]
87+
- metric_name: ccp_stat_user_tables_n_tup_ins
88+
data_type: sum
89+
description: "Number of rows inserted"
90+
value_column: n_tup_ins
91+
static_attributes:
92+
server: "localhost:5432"
93+
attribute_columns: ["dbname", "schemaname", "relname"]
94+
- metric_name: ccp_stat_user_tables_n_tup_upd
95+
data_type: sum
96+
description: "Number of rows updated"
97+
value_column: n_tup_upd
98+
static_attributes:
99+
server: "localhost:5432"
100+
attribute_columns: ["dbname", "schemaname", "relname"]
101+
- metric_name: ccp_stat_user_tables_n_tup_del
102+
data_type: sum
103+
description: "Number of rows deleted"
104+
value_column: n_tup_del
105+
static_attributes:
106+
server: "localhost:5432"
107+
attribute_columns: ["dbname", "schemaname", "relname"]
108+
- metric_name: ccp_stat_user_tables_n_tup_hot_upd
109+
data_type: sum
110+
description: "Number of rows HOT updated (i.e., with no separate index update required)"
111+
value_column: n_tup_hot_upd
112+
static_attributes:
113+
server: "localhost:5432"
114+
attribute_columns: ["dbname", "schemaname", "relname"]
115+
- metric_name: ccp_stat_user_tables_n_tup_newpage_upd
116+
data_type: sum
117+
description: "Number of rows updated where the successor version goes onto a new heap page, leaving behind an original version with a t_ctid field that points to a different heap page. These are always non-HOT updates."
118+
value_column: n_tup_newpage_upd
119+
static_attributes:
120+
server: "localhost:5432"
121+
attribute_columns: ["dbname", "schemaname", "relname"]
122+
- metric_name: ccp_stat_user_tables_n_live_tup
123+
description: "Estimated number of live rows"
124+
value_column: n_live_tup
125+
static_attributes:
126+
server: "localhost:5432"
127+
attribute_columns: ["dbname", "schemaname", "relname"]
128+
- metric_name: ccp_stat_user_tables_n_dead_tup
129+
description: "Estimated number of dead rows"
130+
value_column: n_dead_tup
131+
static_attributes:
132+
server: "localhost:5432"
133+
attribute_columns: ["dbname", "schemaname", "relname"]
134+
- metric_name: ccp_stat_user_tables_vacuum_count
135+
data_type: sum
136+
description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)"
137+
value_column: vacuum_count
138+
static_attributes:
139+
server: "localhost:5432"
140+
attribute_columns: ["dbname", "schemaname", "relname"]
141+
- metric_name: ccp_stat_user_tables_autovacuum_count
142+
data_type: sum
143+
description: "Number of times this table has been vacuumed by the autovacuum daemon"
144+
value_column: autovacuum_count
145+
static_attributes:
146+
server: "localhost:5432"
147+
attribute_columns: ["dbname", "schemaname", "relname"]
148+
- metric_name: ccp_stat_user_tables_analyze_count
149+
data_type: sum
150+
description: "Number of times this table has been manually analyzed"
151+
value_column: analyze_count
152+
static_attributes:
153+
server: "localhost:5432"
154+
attribute_columns: ["dbname", "schemaname", "relname"]
155+
- metric_name: ccp_stat_user_tables_autoanalyze_count
156+
data_type: sum
157+
description: "Number of times this table has been analyzed by the autovacuum daemon"
158+
value_column: autoanalyze_count
159+
static_attributes:
160+
server: "localhost:5432"
161+
attribute_columns: ["dbname", "schemaname", "relname"]

internal/collector/postgres_metrics.go

Lines changed: 70 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ import (
2121
//go:embed "generated/postgres_5s_metrics.json"
2222
var fiveSecondMetrics json.RawMessage
2323

24+
//go:embed "generated/postgres_5m_per_db_metrics.json"
25+
var fiveMinutePerDBMetrics json.RawMessage
26+
2427
//go:embed "generated/postgres_5m_metrics.json"
2528
var fiveMinuteMetrics json.RawMessage
2629

@@ -71,6 +74,7 @@ func EnablePostgresMetrics(ctx context.Context, inCluster *v1beta1.PostgresClust
7174
// will continually append to it and blow up our ConfigMap
7275
fiveSecondMetricsClone := slices.Clone(fiveSecondMetrics)
7376
fiveMinuteMetricsClone := slices.Clone(fiveMinuteMetrics)
77+
fiveMinutePerDBMetricsClone := slices.Clone(fiveMinutePerDBMetrics)
7478

7579
if inCluster.Spec.PostgresVersion >= 17 {
7680
fiveSecondMetricsClone, err = appendToJSONArray(fiveSecondMetricsClone, gtePG17Fast)
@@ -117,7 +121,7 @@ func EnablePostgresMetrics(ctx context.Context, inCluster *v1beta1.PostgresClust
117121
var fiveSecondMetricsArr []queryMetrics
118122
err := json.Unmarshal(fiveSecondMetricsClone, &fiveSecondMetricsArr)
119123
if err != nil {
120-
log.Error(err, "error compiling postgres metrics")
124+
log.Error(err, "error compiling five second postgres metrics")
121125
}
122126

123127
// Remove any specified metrics from the five second metrics
@@ -128,19 +132,31 @@ func EnablePostgresMetrics(ctx context.Context, inCluster *v1beta1.PostgresClust
128132
var fiveMinuteMetricsArr []queryMetrics
129133
err = json.Unmarshal(fiveMinuteMetricsClone, &fiveMinuteMetricsArr)
130134
if err != nil {
131-
log.Error(err, "error compiling postgres metrics")
135+
log.Error(err, "error compiling five minute postgres metrics")
132136
}
133137

134138
// Remove any specified metrics from the five minute metrics
135139
fiveMinuteMetricsArr = removeMetricsFromQueries(
136140
inCluster.Spec.Instrumentation.Metrics.CustomQueries.Remove, fiveMinuteMetricsArr)
137141

142+
// Convert json to array of queryMetrics objects
143+
var fiveMinutePerDBMetricsArr []queryMetrics
144+
err = json.Unmarshal(fiveMinutePerDBMetricsClone, &fiveMinutePerDBMetricsArr)
145+
if err != nil {
146+
log.Error(err, "error compiling per-db postgres metrics")
147+
}
148+
149+
// Remove any specified metrics from the five minute per-db metrics
150+
fiveMinutePerDBMetricsArr = removeMetricsFromQueries(
151+
inCluster.Spec.Instrumentation.Metrics.CustomQueries.Remove, fiveMinutePerDBMetricsArr)
152+
138153
// Convert back to json data
139154
// The error return value can be ignored as the errchkjson linter
140155
// deems the []queryMetrics to be a safe argument:
141156
// https://github.com/breml/errchkjson
142157
fiveSecondMetricsClone, _ = json.Marshal(fiveSecondMetricsArr)
143158
fiveMinuteMetricsClone, _ = json.Marshal(fiveMinuteMetricsArr)
159+
fiveMinutePerDBMetricsClone, _ = json.Marshal(fiveMinutePerDBMetricsArr)
144160
}
145161

146162
// Add Prometheus exporter
@@ -182,29 +198,60 @@ func EnablePostgresMetrics(ctx context.Context, inCluster *v1beta1.PostgresClust
182198

183199
// Add custom queries if they are defined in the spec
184200
if inCluster.Spec.Instrumentation != nil &&
185-
inCluster.Spec.Instrumentation.Metrics != nil &&
186-
inCluster.Spec.Instrumentation.Metrics.CustomQueries != nil &&
187-
inCluster.Spec.Instrumentation.Metrics.CustomQueries.Add != nil {
188-
189-
for _, querySet := range inCluster.Spec.Instrumentation.Metrics.CustomQueries.Add {
190-
// Create a receiver for the query set
191-
receiverName := "sqlquery/" + querySet.Name
192-
config.Receivers[receiverName] = map[string]any{
193-
"driver": "postgres",
194-
"datasource": fmt.Sprintf(
195-
`host=localhost dbname=postgres port=5432 user=%s password=${env:PGPASSWORD}`,
196-
MonitoringUser),
197-
"collection_interval": querySet.CollectionInterval,
198-
// Give Postgres time to finish setup.
199-
"initial_delay": "15s",
200-
"queries": "${file:/etc/otel-collector/" +
201-
querySet.Name + "/" + querySet.Queries.Key + "}",
201+
inCluster.Spec.Instrumentation.Metrics != nil {
202+
203+
if inCluster.Spec.Instrumentation.Metrics.CustomQueries != nil &&
204+
inCluster.Spec.Instrumentation.Metrics.CustomQueries.Add != nil {
205+
206+
for _, querySet := range inCluster.Spec.Instrumentation.Metrics.CustomQueries.Add {
207+
// Create a receiver for the query set
208+
209+
db := "postgres"
210+
if querySet.Database != "" {
211+
db = querySet.Database
212+
}
213+
receiverName := "sqlquery/" + querySet.Name
214+
config.Receivers[receiverName] = map[string]any{
215+
"driver": "postgres",
216+
"datasource": fmt.Sprintf(
217+
`host=localhost dbname=%s port=5432 user=%s password=${env:PGPASSWORD}`,
218+
db,
219+
MonitoringUser),
220+
"collection_interval": querySet.CollectionInterval,
221+
// Give Postgres time to finish setup.
222+
"initial_delay": "15s",
223+
"queries": "${file:/etc/otel-collector/" +
224+
querySet.Name + "/" + querySet.Queries.Key + "}",
225+
}
226+
227+
// Add the receiver to the pipeline
228+
pipeline := config.Pipelines[PostgresMetrics]
229+
pipeline.Receivers = append(pipeline.Receivers, receiverName)
230+
config.Pipelines[PostgresMetrics] = pipeline
202231
}
232+
}
233+
if inCluster.Spec.Instrumentation.Metrics.PerDBMetricTargets != nil {
234+
235+
for _, db := range inCluster.Spec.Instrumentation.Metrics.PerDBMetricTargets {
236+
// Create a receiver for the query set for the db
237+
receiverName := "sqlquery/" + db
238+
config.Receivers[receiverName] = map[string]any{
239+
"driver": "postgres",
240+
"datasource": fmt.Sprintf(
241+
`host=localhost dbname=%s port=5432 user=%s password=${env:PGPASSWORD}`,
242+
db,
243+
MonitoringUser),
244+
"collection_interval": "5m",
245+
// Give Postgres time to finish setup.
246+
"initial_delay": "15s",
247+
"queries": slices.Clone(fiveMinutePerDBMetricsClone),
248+
}
203249

204-
// Add the receiver to the pipeline
205-
pipeline := config.Pipelines[PostgresMetrics]
206-
pipeline.Receivers = append(pipeline.Receivers, receiverName)
207-
config.Pipelines[PostgresMetrics] = pipeline
250+
// Add the receiver to the pipeline
251+
pipeline := config.Pipelines[PostgresMetrics]
252+
pipeline.Receivers = append(pipeline.Receivers, receiverName)
253+
config.Pipelines[PostgresMetrics] = pipeline
254+
}
208255
}
209256
}
210257
}

0 commit comments

Comments
 (0)