rhythmictech
diff --git a/aws/alb/main.tf b/aws/alb/main.tf
Lines changed: 14 additions & 14 deletions
diff --git a/aws/apigateway/main.tf b/aws/apigateway/main.tf
Lines changed: 1 addition & 1 deletion
diff --git a/aws/beanstalk/main.tf b/aws/beanstalk/main.tf
Lines changed: 1 addition & 1 deletion
diff --git a/aws/ec2/main.tf b/aws/ec2/main.tf
Lines changed: 9 additions & 9 deletions
diff --git a/aws/ecs-cluster/main.tf b/aws/ecs-cluster/main.tf
Lines changed: 12 additions & 14 deletions
diff --git a/aws/ecs-fargate/main.tf b/aws/ecs-fargate/main.tf
Lines changed: 9 additions & 9 deletions
@@ -4,14 +4,14 @@ locals {
   monitor_warn_default_priority   = null
   monitor_nodata_default_priority = null
 
-  title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
+  title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}"
   title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
 }
 
 resource "datadog_monitor" "http_5xx_responses" {
   count = var.http_5xx_responses_enabled ? 1 : 0
 
-  name         = join("", [local.title_prefix, "ALB 5xx Responses - {{host.name}}", local.title_suffix])
+  name         = join("", [local.title_prefix, "ALB 5xx Responses - {{loadbalancer.name}}", local.title_suffix])
   include_tags = true
   message      = local.query_alert_base_message
   tags         = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -27,8 +27,8 @@ resource "datadog_monitor" "http_5xx_responses" {
 
   query = <<END
     min(${var.http_5xx_responses_evaluation_window}):
-      default(avg:aws.applicationelb.httpcode_elb_5xx${local.query_filter} by {loadbalancer,region,aws_account}.as_rate(), 0) / (
-      default(avg:aws.applicationelb.request_count${local.query_filter} by {loadbalancer,region,aws_account}.as_rate(), 1)
+      default(avg:aws.applicationelb.httpcode_elb_5xx${local.query_filter} by {aws_account,env,loadbalancer,region}.as_rate(), 0) / (
+      default(avg:aws.applicationelb.request_count${local.query_filter} by {aws_account,env,loadbalancer,region}.as_rate(), 1)
     ) * 100 > ${var.http_5xx_responses_threshold_critical}
 END
 
@@ -41,7 +41,7 @@ END
 resource "datadog_monitor" "http_5xx_tg_responses" {
   count = var.http_5xx_tg_responses_enabled ? 1 : 0
 
-  name         = join("", [local.title_prefix, "ALB Target Group 5xx Responses - {{host.name}}", local.title_suffix])
+  name         = join("", [local.title_prefix, "ALB Target Group 5xx Responses - {{loadbalancer.name}}", local.title_suffix])
   include_tags = true
   message      = local.query_alert_base_message
   tags         = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -57,8 +57,8 @@ resource "datadog_monitor" "http_5xx_tg_responses" {
 
   query = <<END
     min(${var.http_5xx_tg_responses_evaluation_window}):
-      default(avg:aws.applicationelb.httpcode_elb_5xx${local.query_filter} by {loadbalancer,region,aws_account,targetgroup}.as_rate(), 0) / (
-      default(avg:aws.applicationelb.request_count${local.query_filter} by {loadbalancer,region,aws_account,targetgroup}.as_rate(), 1)
+      default(avg:aws.applicationelb.httpcode_elb_5xx${local.query_filter} by {loadbalancer,region,aws_account,targetgroup,env}.as_rate(), 0) / (
+      default(avg:aws.applicationelb.request_count${local.query_filter} by {loadbalancer,region,aws_account,targetgroup,env}.as_rate(), 1)
     ) * 100 > ${var.http_5xx_tg_responses_threshold_critical}
 END
 
@@ -72,7 +72,7 @@ END
 resource "datadog_monitor" "latency" {
   count = var.latency_enabled ? 1 : 0
 
-  name         = join("", [local.title_prefix, "ALB latency - {{host.name}}", local.title_suffix])
+  name         = join("", [local.title_prefix, "{{loadbalancer.name}} ALB latency - {{value}}s ", local.title_suffix])
   include_tags = true
   message      = local.query_alert_base_message
   tags         = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -88,7 +88,7 @@ resource "datadog_monitor" "latency" {
 
   query = <<END
     avg(${var.latency_evaluation_window}):
-      default(avg:aws.applicationelb.target_response_time.average${local.query_filter} by {loadbalancer,region,aws_account}, 0
+      default(avg:aws.applicationelb.target_response_time.average${local.query_filter} by {aws_account,env,loadbalancer,region}, 0
     ) > ${var.latency_threshold_critical}
 END
 
@@ -101,7 +101,7 @@ END
 resource "datadog_monitor" "no_healthy_instances" {
   count = var.no_healthy_instances_enabled ? 1 : 0
 
-  name         = join("", [local.title_prefix, "ALB healthy instances - {{host.name}}", local.title_suffix])
+  name         = join("", [local.title_prefix, "{{loadbalancer.name}} ALB healthy instances is at {{value}}%", local.title_suffix])
   include_tags = true
   message      = local.query_alert_base_message
   tags         = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -117,10 +117,10 @@ resource "datadog_monitor" "no_healthy_instances" {
 
   query = <<END
     min(${var.no_healthy_instances_evaluation_window}): (
-      sum:aws.applicationelb.healthy_host_count.minimum${local.query_filter} by {loadbalancer,region,aws_account} / (
-      sum:aws.applicationelb.healthy_host_count.minimum${local.query_filter} by {loadbalancer,region,aws_account} +
-      sum:aws.applicationelb.un_healthy_host_count.maximum${local.query_filter} by {loadbalancer,region,aws_account} )
-    ) <= ${var.no_healthy_instances_threshold_critical}
+      sum:aws.applicationelb.healthy_host_count.minimum${local.query_filter} by {aws_account,env,region,loadbalancer} / (
+      sum:aws.applicationelb.healthy_host_count.minimum${local.query_filter} by {aws_account,env,region,loadbalancer} +
+      sum:aws.applicationelb.un_healthy_host_count.maximum${local.query_filter} by {aws_account,env,region,loadbalancer} )
+    ) * 100 <= ${var.no_healthy_instances_threshold_critical}
 END
 
   monitor_thresholds {
 
@@ -4,7 +4,7 @@ locals {
   monitor_warn_default_priority   = null
   monitor_nodata_default_priority = null
 
-  title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
+  title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}"
   title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
 }
 
 
@@ -17,7 +17,7 @@ locals {
 
   latency_metric = local.latency_metric_map[var.latency_measurement]
 
-  title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
+  title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}"
   title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
 }
 
 
@@ -4,14 +4,14 @@ locals {
   monitor_warn_default_priority   = null
   monitor_nodata_default_priority = null
 
-  title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
+  title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}"
   title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
 }
 
 resource "datadog_monitor" "status_failed_check" {
   count = var.status_failed_check_enabled ? 1 : 0
 
-  name         = join("", [local.title_prefix, "EC2 instance status - status check failure - {{host.name}}", local.title_suffix])
+  name         = join("", [local.title_prefix, "EC2 instance status - status check failure - {{name.name}}({{instance_id.name}})", local.title_suffix])
   include_tags = true
   message      = local.query_alert_base_message
   tags         = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -26,7 +26,7 @@ resource "datadog_monitor" "status_failed_check" {
 
   query = <<END
     max(${var.status_failed_check_evaluation_window}):
-      max:aws.ec2.status_check_failed${local.query_filter} by {instance_id,region,aws_account}
+      max:aws.ec2.status_check_failed${local.query_filter} by {aws_account,env,instance_id,name,region}
     >= 1
 END
 
@@ -38,7 +38,7 @@ END
 resource "datadog_monitor" "status_failed_instance" {
   count = var.status_failed_instance_enabled ? 1 : 0
 
-  name         = join("", [local.title_prefix, "EC2 instance status - instance failure - {{host.name}}", local.title_suffix])
+  name         = join("", [local.title_prefix, "EC2 instance status - instance failure - {{name.name}}({{instance_id.name}})", local.title_suffix])
   include_tags = true
   message      = local.query_alert_base_message
   tags         = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -53,7 +53,7 @@ resource "datadog_monitor" "status_failed_instance" {
 
   query = <<END
     max(${var.status_failed_instance_evaluation_window}):
-      max:aws.ec2.status_check_failed_instance${local.query_filter} by {instance_id,region,aws_account}
+      max:aws.ec2.status_check_failed_instance${local.query_filter} by {aws_account,env,instance_id,name,region}
     >= 1
 END
 
@@ -65,7 +65,7 @@ END
 resource "datadog_monitor" "status_failed_system" {
   count = var.status_failed_system_enabled ? 1 : 0
 
-  name         = join("", [local.title_prefix, "EC2 instance status - host failure - {{host.name}}", local.title_suffix])
+  name         = join("", [local.title_prefix, "EC2 instance status - host failure - {{name.name}}({{instance_id.name}})", local.title_suffix])
   include_tags = true
   message      = local.query_alert_base_message
   tags         = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -80,7 +80,7 @@ resource "datadog_monitor" "status_failed_system" {
 
   query = <<END
     max(${var.status_failed_system_evaluation_window}):
-      max:aws.ec2.status_check_failed_system${local.query_filter} by {instance_id,region,aws_account}
+      max:aws.ec2.status_check_failed_system${local.query_filter} by {aws_account,env,instance_id,name,region}
     >= 1
 END
 
@@ -92,7 +92,7 @@ END
 resource "datadog_monitor" "status_failed_volume" {
   count = var.status_failed_volume_enabled ? 1 : 0
 
-  name         = join("", [local.title_prefix, "EC2 instance status - volume failure - {{host.name}}", local.title_suffix])
+  name         = join("", [local.title_prefix, "EC2 instance status - volume failure - {{name.name}}({{instance_id.name}})", local.title_suffix])
   include_tags = true
   message      = local.query_alert_base_message
   tags         = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -107,7 +107,7 @@ resource "datadog_monitor" "status_failed_volume" {
 
   query = <<END
     max(${var.status_failed_volume_evaluation_window}):
-      max:aws.ec2.status_check_failed_attached_ebs${local.query_filter} by {instance_id,region,aws_account}
+      max:aws.ec2.status_check_failed_attached_ebs${local.query_filter} by {aws_account,env,instance_id,name,region}
     >= 1
 END
 
 
@@ -5,18 +5,18 @@ locals {
   monitor_warn_default_priority   = null
   monitor_nodata_default_priority = null
 
-  title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
+  title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}"
   title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
 }
 
 resource "datadog_monitor" "agent_status" {
   count = var.agent_status_enabled ? 1 : 0
 
-  name         = join("", [local.title_prefix, "ECS Cluster Agent Status - {{host.name}}", local.title_suffix])
+  name         = join("", [local.title_prefix, "ECS Agent disconnected - {{clustername.name}}", local.title_suffix])
   include_tags = true
   message      = local.query_alert_base_message
   tags         = concat(local.common_tags, var.base_tags, var.additional_tags)
-  type         = "query alert"
+  type    = "service check"
 
   evaluation_delay    = var.evaluation_delay
   new_group_delay     = var.new_group_delay
@@ -26,11 +26,9 @@ resource "datadog_monitor" "agent_status" {
   require_full_window = true
   timeout_h           = var.timeout_h
 
-  query = <<END
-    min(${var.agent_status_evaluation_window}):
-      aws.ecs.agent_connected${local.service_filter}.by("cluster", "instance_id").last(6).count_by_status()
-    >= ${var.agent_status_threshold_critical}
-END
+  query = <<EOQ
+    "aws.ecs.agent_connected"${local.service_filter}.by("clustername","instance_id").last(6).count_by_status()
+EOQ
 
   monitor_thresholds {
     critical = var.agent_status_threshold_critical
@@ -41,7 +39,7 @@ END
 resource "datadog_monitor" "cpu_utilization" {
   count = var.cpu_utilization_enabled ? 1 : 0
 
-  name         = join("", [local.title_prefix, "ECS Cluster CPU Utilization - {{host.name}}", local.title_suffix])
+  name         = join("", [local.title_prefix, "ECS Cluster CPU Utilization - {{clustername.name}} - {{value}}%", local.title_suffix])
   include_tags = true
   message      = local.query_alert_base_message
   tags         = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -57,7 +55,7 @@ resource "datadog_monitor" "cpu_utilization" {
 
   query = <<END
     min(${var.cpu_utilization_evaluation_window}):
-      avg:aws.ecs.cluster.cpuutilization${local.query_filter} by {clustername,region,aws_account}
+      avg:aws.ecs.cluster.cpuutilization${local.query_filter} by {clustername,region,aws_account,env}
     > ${var.cpu_utilization_threshold_critical}
 END
 
@@ -70,7 +68,7 @@ END
 resource "datadog_monitor" "cpu_utilization_anomaly" {
   count = var.cpu_utilization_anomaly_enabled ? 1 : 0
 
-  name         = join("", [local.title_prefix, "ECS cluster CPU utilization anomalous activity - {{host.name}}", local.title_suffix])
+  name         = join("", [local.title_prefix, "ECS cluster CPU utilization anomalous activity - {{clustername.name}}", local.title_suffix])
   include_tags = true
   message      = local.query_alert_base_message
   tags         = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -86,7 +84,7 @@ resource "datadog_monitor" "cpu_utilization_anomaly" {
 
   query = <<END
     avg(${var.cpu_utilization_anomaly_evaluation_window}):anomalies(
-      avg:aws.ecs.cluster.cpuutilization${local.query_filter} by {clustername,region,aws_account}, 'agile', ${var.cpu_utilization_anomaly_deviations},
+      avg:aws.ecs.cluster.cpuutilization${local.query_filter} by {clustername,region,aws_account,env}, 'agile', ${var.cpu_utilization_anomaly_deviations},
       direction='above', count_default_zero='true', interval=${var.cpu_utilization_anomaly_rollup},
       seasonality='${var.cpu_utilization_anomaly_seasonality}'
     ) >= ${var.cpu_utilization_anomaly_threshold_critical}
@@ -106,7 +104,7 @@ END
 resource "datadog_monitor" "memory_reservation" {
   count = var.memory_reservation_enabled ? 1 : 0
 
-  name         = join("", [local.title_prefix, "ECS Cluster CPU Reservation - {{host.name}}", local.title_suffix])
+  name         = join("", [local.title_prefix, "ECS Cluster Memory Reservation High - {{clustername.name}} - {{value}}%", local.title_suffix])
   include_tags = true
   message      = local.query_alert_base_message
   tags         = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -122,7 +120,7 @@ resource "datadog_monitor" "memory_reservation" {
 
   query = <<END
     min(${var.memory_reservation_evaluation_window}):
-      avg:aws.ecs.cluster.memory_reservation${local.query_filter} by {clustername,region,aws_account}
+      avg:aws.ecs.cluster.memory_reservation${local.query_filter} by {clustername,region,aws_account,env}
     > ${var.memory_reservation_threshold_critical}
 END
 
 
@@ -5,14 +5,14 @@ locals {
   monitor_warn_default_priority   = null
   monitor_nodata_default_priority = null
 
-  title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
+  title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}"
   title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
 }
 
 resource "datadog_monitor" "fargate_check" {
   count = var.fargate_check_enabled ? 1 : 0
 
-  name         = join("", [local.title_prefix, "ECS Fargate task status check - {{host.name}}", local.title_suffix])
+  name         = join("", [local.title_prefix, "Fargate service not responding", local.title_suffix])
   include_tags = true
   message      = local.query_alert_base_message
   tags         = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -40,7 +40,7 @@ END
 resource "datadog_monitor" "cpu_utilization" {
   count = var.cpu_utilization_enabled ? 1 : 0
 
-  name         = join("", [local.title_prefix, "ECS Fargate task CPU utilization - {{host.name}}", local.title_suffix])
+  name         = join("", [local.title_prefix, "ECS Fargate task CPU utilization", local.title_suffix])
   include_tags = true
   message      = local.query_alert_base_message
   tags         = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -56,7 +56,7 @@ resource "datadog_monitor" "cpu_utilization" {
 
   query = <<END
     avg(${var.cpu_utilization_evaluation_window}):
-      avg:ecs.fargate.cpu.percent${local.query_filter} by {ecs_container_name,task_family,region,aws_account}
+      avg:ecs.fargate.cpu.percent${local.query_filter} by {ecs_container_name,task_family,region,aws_account,env}
     > ${var.cpu_utilization_threshold_critical}
 END
 
@@ -69,7 +69,7 @@ END
 resource "datadog_monitor" "cpu_utilization_anomaly" {
   count = var.cpu_utilization_anomaly_enabled ? 1 : 0
 
-  name         = join("", [local.title_prefix, "ECS service CPU utilization anomalous activity - {{host.name}}", local.title_suffix])
+  name         = join("", [local.title_prefix, "ECS service CPU utilization anomalous activity", local.title_suffix])
   include_tags = true
   message      = local.query_alert_base_message
   tags         = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -85,7 +85,7 @@ resource "datadog_monitor" "cpu_utilization_anomaly" {
 
   query = <<END
     avg(${var.cpu_utilization_anomaly_evaluation_window}):anomalies(
-      avg:ecs.fargate.cpu.percent${local.query_filter} by {servicename,region,aws_account}, 'agile', ${var.cpu_utilization_anomaly_deviations},
+      avg:ecs.fargate.cpu.percent${local.query_filter} by {servicename,region,aws_account,env}, 'agile', ${var.cpu_utilization_anomaly_deviations},
       direction='above', count_default_zero='true', interval=${var.cpu_utilization_anomaly_rollup},
       seasonality='${var.cpu_utilization_anomaly_seasonality}'
     ) >= ${var.cpu_utilization_anomaly_threshold_critical}
@@ -105,7 +105,7 @@ END
 resource "datadog_monitor" "memory_utilization" {
   count = var.memory_utilization_enabled ? 1 : 0
 
-  name         = join("", [local.title_prefix, "ECS Fargate task memory utilization - {{host.name}}", local.title_suffix])
+  name         = join("", [local.title_prefix, "ECS Fargate task memory utilization", local.title_suffix])
   include_tags = true
   message      = local.query_alert_base_message
   tags         = concat(local.common_tags, var.base_tags, var.additional_tags)
@@ -121,8 +121,8 @@ resource "datadog_monitor" "memory_utilization" {
 
   query = <<END
     avg(${var.memory_utilization_evaluation_window}):(
-      avg:ecs.fargate.mem.usage${local.query_filter} by {ecs_container_name,task_family,region,aws_account} /
-      avg:ecs.fargate.mem.limit${local.query_filter} by {ecs_container_name,task_family,region,aws_account}
+      avg:ecs.fargate.mem.usage${local.query_filter} by {ecs_container_name,task_family,region,aws_account,env} /
+      avg:ecs.fargate.mem.limit${local.query_filter} by {ecs_container_name,task_family,region,aws_account,env}
     )  >= ${var.memory_utilization_threshold_critical}
 END
Original file line number	Diff line number	Diff line change
`@@ -4,7 +4,7 @@ locals {`
`4`	`4`	`monitor_warn_default_priority = null`
`5`	`5`	`monitor_nodata_default_priority = null`
`6`	`6`
`7`		`- title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "`
	`7`	`+ title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}"`
`8`	`8`	`title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"`
`9`	`9`	`}`
`10`	`10`
Original file line number	Diff line number	Diff line change
`@@ -17,7 +17,7 @@ locals {`
`17`	`17`
`18`	`18`	`latency_metric = local.latency_metric_map[var.latency_measurement]`
`19`	`19`
`20`		`- title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "`
	`20`	`+ title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}"`
`21`	`21`	`title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"`
`22`	`22`	`}`
`23`	`23`