Skip to content

Commit f1cf4ab

Browse files
authored
Merge pull request #5 from rhythmictech/ENGB360-22
add env to queries, improve titles, fix queries
2 parents aec6215 + c4b466f commit f1cf4ab

File tree

13 files changed

+116
-116
lines changed

13 files changed

+116
-116
lines changed

aws/alb/main.tf

Lines changed: 14 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -4,14 +4,14 @@ locals {
44
monitor_warn_default_priority = null
55
monitor_nodata_default_priority = null
66

7-
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
7+
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}"
88
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
99
}
1010

1111
resource "datadog_monitor" "http_5xx_responses" {
1212
count = var.http_5xx_responses_enabled ? 1 : 0
1313

14-
name = join("", [local.title_prefix, "ALB 5xx Responses - {{host.name}}", local.title_suffix])
14+
name = join("", [local.title_prefix, "ALB 5xx Responses - {{loadbalancer.name}}", local.title_suffix])
1515
include_tags = true
1616
message = local.query_alert_base_message
1717
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -27,8 +27,8 @@ resource "datadog_monitor" "http_5xx_responses" {
2727

2828
query = <<END
2929
min(${var.http_5xx_responses_evaluation_window}):
30-
default(avg:aws.applicationelb.httpcode_elb_5xx${local.query_filter} by {loadbalancer,region,aws_account}.as_rate(), 0) / (
31-
default(avg:aws.applicationelb.request_count${local.query_filter} by {loadbalancer,region,aws_account}.as_rate(), 1)
30+
default(avg:aws.applicationelb.httpcode_elb_5xx${local.query_filter} by {aws_account,env,loadbalancer,region}.as_rate(), 0) / (
31+
default(avg:aws.applicationelb.request_count${local.query_filter} by {aws_account,env,loadbalancer,region}.as_rate(), 1)
3232
) * 100 > ${var.http_5xx_responses_threshold_critical}
3333
END
3434

@@ -41,7 +41,7 @@ END
4141
resource "datadog_monitor" "http_5xx_tg_responses" {
4242
count = var.http_5xx_tg_responses_enabled ? 1 : 0
4343

44-
name = join("", [local.title_prefix, "ALB Target Group 5xx Responses - {{host.name}}", local.title_suffix])
44+
name = join("", [local.title_prefix, "ALB Target Group 5xx Responses - {{loadbalancer.name}}", local.title_suffix])
4545
include_tags = true
4646
message = local.query_alert_base_message
4747
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -57,8 +57,8 @@ resource "datadog_monitor" "http_5xx_tg_responses" {
5757

5858
query = <<END
5959
min(${var.http_5xx_tg_responses_evaluation_window}):
60-
default(avg:aws.applicationelb.httpcode_elb_5xx${local.query_filter} by {loadbalancer,region,aws_account,targetgroup}.as_rate(), 0) / (
61-
default(avg:aws.applicationelb.request_count${local.query_filter} by {loadbalancer,region,aws_account,targetgroup}.as_rate(), 1)
60+
default(avg:aws.applicationelb.httpcode_elb_5xx${local.query_filter} by {loadbalancer,region,aws_account,targetgroup,env}.as_rate(), 0) / (
61+
default(avg:aws.applicationelb.request_count${local.query_filter} by {loadbalancer,region,aws_account,targetgroup,env}.as_rate(), 1)
6262
) * 100 > ${var.http_5xx_tg_responses_threshold_critical}
6363
END
6464

@@ -72,7 +72,7 @@ END
7272
resource "datadog_monitor" "latency" {
7373
count = var.latency_enabled ? 1 : 0
7474

75-
name = join("", [local.title_prefix, "ALB latency - {{host.name}}", local.title_suffix])
75+
name = join("", [local.title_prefix, "{{loadbalancer.name}} ALB latency - {{value}}s ", local.title_suffix])
7676
include_tags = true
7777
message = local.query_alert_base_message
7878
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -88,7 +88,7 @@ resource "datadog_monitor" "latency" {
8888

8989
query = <<END
9090
avg(${var.latency_evaluation_window}):
91-
default(avg:aws.applicationelb.target_response_time.average${local.query_filter} by {loadbalancer,region,aws_account}, 0
91+
default(avg:aws.applicationelb.target_response_time.average${local.query_filter} by {aws_account,env,loadbalancer,region}, 0
9292
) > ${var.latency_threshold_critical}
9393
END
9494

@@ -101,7 +101,7 @@ END
101101
resource "datadog_monitor" "no_healthy_instances" {
102102
count = var.no_healthy_instances_enabled ? 1 : 0
103103

104-
name = join("", [local.title_prefix, "ALB healthy instances - {{host.name}}", local.title_suffix])
104+
name = join("", [local.title_prefix, "{{loadbalancer.name}} ALB healthy instances is at {{value}}%", local.title_suffix])
105105
include_tags = true
106106
message = local.query_alert_base_message
107107
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -117,10 +117,10 @@ resource "datadog_monitor" "no_healthy_instances" {
117117

118118
query = <<END
119119
min(${var.no_healthy_instances_evaluation_window}): (
120-
sum:aws.applicationelb.healthy_host_count.minimum${local.query_filter} by {loadbalancer,region,aws_account} / (
121-
sum:aws.applicationelb.healthy_host_count.minimum${local.query_filter} by {loadbalancer,region,aws_account} +
122-
sum:aws.applicationelb.un_healthy_host_count.maximum${local.query_filter} by {loadbalancer,region,aws_account} )
123-
) <= ${var.no_healthy_instances_threshold_critical}
120+
sum:aws.applicationelb.healthy_host_count.minimum${local.query_filter} by {aws_account,env,region,loadbalancer} / (
121+
sum:aws.applicationelb.healthy_host_count.minimum${local.query_filter} by {aws_account,env,region,loadbalancer} +
122+
sum:aws.applicationelb.un_healthy_host_count.maximum${local.query_filter} by {aws_account,env,region,loadbalancer} )
123+
) * 100 <= ${var.no_healthy_instances_threshold_critical}
124124
END
125125

126126
monitor_thresholds {

aws/apigateway/main.tf

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@ locals {
44
monitor_warn_default_priority = null
55
monitor_nodata_default_priority = null
66

7-
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
7+
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}"
88
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
99
}
1010

aws/beanstalk/main.tf

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ locals {
1717

1818
latency_metric = local.latency_metric_map[var.latency_measurement]
1919

20-
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
20+
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}"
2121
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
2222
}
2323

aws/ec2/main.tf

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -4,14 +4,14 @@ locals {
44
monitor_warn_default_priority = null
55
monitor_nodata_default_priority = null
66

7-
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
7+
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}"
88
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
99
}
1010

1111
resource "datadog_monitor" "status_failed_check" {
1212
count = var.status_failed_check_enabled ? 1 : 0
1313

14-
name = join("", [local.title_prefix, "EC2 instance status - status check failure - {{host.name}}", local.title_suffix])
14+
name = join("", [local.title_prefix, "EC2 instance status - status check failure - {{name.name}}({{instance_id.name}})", local.title_suffix])
1515
include_tags = true
1616
message = local.query_alert_base_message
1717
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -26,7 +26,7 @@ resource "datadog_monitor" "status_failed_check" {
2626

2727
query = <<END
2828
max(${var.status_failed_check_evaluation_window}):
29-
max:aws.ec2.status_check_failed${local.query_filter} by {instance_id,region,aws_account}
29+
max:aws.ec2.status_check_failed${local.query_filter} by {aws_account,env,instance_id,name,region}
3030
>= 1
3131
END
3232

@@ -38,7 +38,7 @@ END
3838
resource "datadog_monitor" "status_failed_instance" {
3939
count = var.status_failed_instance_enabled ? 1 : 0
4040

41-
name = join("", [local.title_prefix, "EC2 instance status - instance failure - {{host.name}}", local.title_suffix])
41+
name = join("", [local.title_prefix, "EC2 instance status - instance failure - {{name.name}}({{instance_id.name}})", local.title_suffix])
4242
include_tags = true
4343
message = local.query_alert_base_message
4444
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -53,7 +53,7 @@ resource "datadog_monitor" "status_failed_instance" {
5353

5454
query = <<END
5555
max(${var.status_failed_instance_evaluation_window}):
56-
max:aws.ec2.status_check_failed_instance${local.query_filter} by {instance_id,region,aws_account}
56+
max:aws.ec2.status_check_failed_instance${local.query_filter} by {aws_account,env,instance_id,name,region}
5757
>= 1
5858
END
5959

@@ -65,7 +65,7 @@ END
6565
resource "datadog_monitor" "status_failed_system" {
6666
count = var.status_failed_system_enabled ? 1 : 0
6767

68-
name = join("", [local.title_prefix, "EC2 instance status - host failure - {{host.name}}", local.title_suffix])
68+
name = join("", [local.title_prefix, "EC2 instance status - host failure - {{name.name}}({{instance_id.name}})", local.title_suffix])
6969
include_tags = true
7070
message = local.query_alert_base_message
7171
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -80,7 +80,7 @@ resource "datadog_monitor" "status_failed_system" {
8080

8181
query = <<END
8282
max(${var.status_failed_system_evaluation_window}):
83-
max:aws.ec2.status_check_failed_system${local.query_filter} by {instance_id,region,aws_account}
83+
max:aws.ec2.status_check_failed_system${local.query_filter} by {aws_account,env,instance_id,name,region}
8484
>= 1
8585
END
8686

@@ -92,7 +92,7 @@ END
9292
resource "datadog_monitor" "status_failed_volume" {
9393
count = var.status_failed_volume_enabled ? 1 : 0
9494

95-
name = join("", [local.title_prefix, "EC2 instance status - volume failure - {{host.name}}", local.title_suffix])
95+
name = join("", [local.title_prefix, "EC2 instance status - volume failure - {{name.name}}({{instance_id.name}})", local.title_suffix])
9696
include_tags = true
9797
message = local.query_alert_base_message
9898
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -107,7 +107,7 @@ resource "datadog_monitor" "status_failed_volume" {
107107

108108
query = <<END
109109
max(${var.status_failed_volume_evaluation_window}):
110-
max:aws.ec2.status_check_failed_attached_ebs${local.query_filter} by {instance_id,region,aws_account}
110+
max:aws.ec2.status_check_failed_attached_ebs${local.query_filter} by {aws_account,env,instance_id,name,region}
111111
>= 1
112112
END
113113

aws/ecs-cluster/main.tf

Lines changed: 12 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -5,18 +5,18 @@ locals {
55
monitor_warn_default_priority = null
66
monitor_nodata_default_priority = null
77

8-
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
8+
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}"
99
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
1010
}
1111

1212
resource "datadog_monitor" "agent_status" {
1313
count = var.agent_status_enabled ? 1 : 0
1414

15-
name = join("", [local.title_prefix, "ECS Cluster Agent Status - {{host.name}}", local.title_suffix])
15+
name = join("", [local.title_prefix, "ECS Agent disconnected - {{clustername.name}}", local.title_suffix])
1616
include_tags = true
1717
message = local.query_alert_base_message
1818
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
19-
type = "query alert"
19+
type = "service check"
2020

2121
evaluation_delay = var.evaluation_delay
2222
new_group_delay = var.new_group_delay
@@ -26,11 +26,9 @@ resource "datadog_monitor" "agent_status" {
2626
require_full_window = true
2727
timeout_h = var.timeout_h
2828

29-
query = <<END
30-
min(${var.agent_status_evaluation_window}):
31-
aws.ecs.agent_connected${local.service_filter}.by("cluster", "instance_id").last(6).count_by_status()
32-
>= ${var.agent_status_threshold_critical}
33-
END
29+
query = <<EOQ
30+
"aws.ecs.agent_connected"${local.service_filter}.by("clustername","instance_id").last(6).count_by_status()
31+
EOQ
3432

3533
monitor_thresholds {
3634
critical = var.agent_status_threshold_critical
@@ -41,7 +39,7 @@ END
4139
resource "datadog_monitor" "cpu_utilization" {
4240
count = var.cpu_utilization_enabled ? 1 : 0
4341

44-
name = join("", [local.title_prefix, "ECS Cluster CPU Utilization - {{host.name}}", local.title_suffix])
42+
name = join("", [local.title_prefix, "ECS Cluster CPU Utilization - {{clustername.name}} - {{value}}%", local.title_suffix])
4543
include_tags = true
4644
message = local.query_alert_base_message
4745
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -57,7 +55,7 @@ resource "datadog_monitor" "cpu_utilization" {
5755

5856
query = <<END
5957
min(${var.cpu_utilization_evaluation_window}):
60-
avg:aws.ecs.cluster.cpuutilization${local.query_filter} by {clustername,region,aws_account}
58+
avg:aws.ecs.cluster.cpuutilization${local.query_filter} by {clustername,region,aws_account,env}
6159
> ${var.cpu_utilization_threshold_critical}
6260
END
6361

@@ -70,7 +68,7 @@ END
7068
resource "datadog_monitor" "cpu_utilization_anomaly" {
7169
count = var.cpu_utilization_anomaly_enabled ? 1 : 0
7270

73-
name = join("", [local.title_prefix, "ECS cluster CPU utilization anomalous activity - {{host.name}}", local.title_suffix])
71+
name = join("", [local.title_prefix, "ECS cluster CPU utilization anomalous activity - {{clustername.name}}", local.title_suffix])
7472
include_tags = true
7573
message = local.query_alert_base_message
7674
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -86,7 +84,7 @@ resource "datadog_monitor" "cpu_utilization_anomaly" {
8684

8785
query = <<END
8886
avg(${var.cpu_utilization_anomaly_evaluation_window}):anomalies(
89-
avg:aws.ecs.cluster.cpuutilization${local.query_filter} by {clustername,region,aws_account}, 'agile', ${var.cpu_utilization_anomaly_deviations},
87+
avg:aws.ecs.cluster.cpuutilization${local.query_filter} by {clustername,region,aws_account,env}, 'agile', ${var.cpu_utilization_anomaly_deviations},
9088
direction='above', count_default_zero='true', interval=${var.cpu_utilization_anomaly_rollup},
9189
seasonality='${var.cpu_utilization_anomaly_seasonality}'
9290
) >= ${var.cpu_utilization_anomaly_threshold_critical}
@@ -106,7 +104,7 @@ END
106104
resource "datadog_monitor" "memory_reservation" {
107105
count = var.memory_reservation_enabled ? 1 : 0
108106

109-
name = join("", [local.title_prefix, "ECS Cluster CPU Reservation - {{host.name}}", local.title_suffix])
107+
name = join("", [local.title_prefix, "ECS Cluster Memory Reservation High - {{clustername.name}} - {{value}}%", local.title_suffix])
110108
include_tags = true
111109
message = local.query_alert_base_message
112110
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -122,7 +120,7 @@ resource "datadog_monitor" "memory_reservation" {
122120

123121
query = <<END
124122
min(${var.memory_reservation_evaluation_window}):
125-
avg:aws.ecs.cluster.memory_reservation${local.query_filter} by {clustername,region,aws_account}
123+
avg:aws.ecs.cluster.memory_reservation${local.query_filter} by {clustername,region,aws_account,env}
126124
> ${var.memory_reservation_threshold_critical}
127125
END
128126

aws/ecs-fargate/main.tf

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -5,14 +5,14 @@ locals {
55
monitor_warn_default_priority = null
66
monitor_nodata_default_priority = null
77

8-
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
8+
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}"
99
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
1010
}
1111

1212
resource "datadog_monitor" "fargate_check" {
1313
count = var.fargate_check_enabled ? 1 : 0
1414

15-
name = join("", [local.title_prefix, "ECS Fargate task status check - {{host.name}}", local.title_suffix])
15+
name = join("", [local.title_prefix, "Fargate service not responding", local.title_suffix])
1616
include_tags = true
1717
message = local.query_alert_base_message
1818
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -40,7 +40,7 @@ END
4040
resource "datadog_monitor" "cpu_utilization" {
4141
count = var.cpu_utilization_enabled ? 1 : 0
4242

43-
name = join("", [local.title_prefix, "ECS Fargate task CPU utilization - {{host.name}}", local.title_suffix])
43+
name = join("", [local.title_prefix, "ECS Fargate task CPU utilization", local.title_suffix])
4444
include_tags = true
4545
message = local.query_alert_base_message
4646
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -56,7 +56,7 @@ resource "datadog_monitor" "cpu_utilization" {
5656

5757
query = <<END
5858
avg(${var.cpu_utilization_evaluation_window}):
59-
avg:ecs.fargate.cpu.percent${local.query_filter} by {ecs_container_name,task_family,region,aws_account}
59+
avg:ecs.fargate.cpu.percent${local.query_filter} by {ecs_container_name,task_family,region,aws_account,env}
6060
> ${var.cpu_utilization_threshold_critical}
6161
END
6262

@@ -69,7 +69,7 @@ END
6969
resource "datadog_monitor" "cpu_utilization_anomaly" {
7070
count = var.cpu_utilization_anomaly_enabled ? 1 : 0
7171

72-
name = join("", [local.title_prefix, "ECS service CPU utilization anomalous activity - {{host.name}}", local.title_suffix])
72+
name = join("", [local.title_prefix, "ECS service CPU utilization anomalous activity", local.title_suffix])
7373
include_tags = true
7474
message = local.query_alert_base_message
7575
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -85,7 +85,7 @@ resource "datadog_monitor" "cpu_utilization_anomaly" {
8585

8686
query = <<END
8787
avg(${var.cpu_utilization_anomaly_evaluation_window}):anomalies(
88-
avg:ecs.fargate.cpu.percent${local.query_filter} by {servicename,region,aws_account}, 'agile', ${var.cpu_utilization_anomaly_deviations},
88+
avg:ecs.fargate.cpu.percent${local.query_filter} by {servicename,region,aws_account,env}, 'agile', ${var.cpu_utilization_anomaly_deviations},
8989
direction='above', count_default_zero='true', interval=${var.cpu_utilization_anomaly_rollup},
9090
seasonality='${var.cpu_utilization_anomaly_seasonality}'
9191
) >= ${var.cpu_utilization_anomaly_threshold_critical}
@@ -105,7 +105,7 @@ END
105105
resource "datadog_monitor" "memory_utilization" {
106106
count = var.memory_utilization_enabled ? 1 : 0
107107

108-
name = join("", [local.title_prefix, "ECS Fargate task memory utilization - {{host.name}}", local.title_suffix])
108+
name = join("", [local.title_prefix, "ECS Fargate task memory utilization", local.title_suffix])
109109
include_tags = true
110110
message = local.query_alert_base_message
111111
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -121,8 +121,8 @@ resource "datadog_monitor" "memory_utilization" {
121121

122122
query = <<END
123123
avg(${var.memory_utilization_evaluation_window}):(
124-
avg:ecs.fargate.mem.usage${local.query_filter} by {ecs_container_name,task_family,region,aws_account} /
125-
avg:ecs.fargate.mem.limit${local.query_filter} by {ecs_container_name,task_family,region,aws_account}
124+
avg:ecs.fargate.mem.usage${local.query_filter} by {ecs_container_name,task_family,region,aws_account,env} /
125+
avg:ecs.fargate.mem.limit${local.query_filter} by {ecs_container_name,task_family,region,aws_account,env}
126126
) >= ${var.memory_utilization_threshold_critical}
127127
END
128128

0 commit comments

Comments
 (0)