Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions aws/elasticsearch/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,13 @@ variable "cpu_utilization_evaluation_window" {
}

variable "cpu_utilization_threshold_critical" {
default = 0.90
default = 90
description = "Critical threshold (percent)"
type = number
}

variable "cpu_utilization_threshold_warning" {
default = 0.80
default = 80
description = "Warning threshold (percent)"
type = number
}
Expand Down
2 changes: 1 addition & 1 deletion aws/rds/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ variable "connection_count_anomaly_enabled" {
}

variable "connection_count_anomaly_evaluation_window" {
default = "last_1h"
default = "last_4h"
description = "Evaluation window for monitor (`last_?m` (1, 5, 10, 15, or 30), `last_?h` (1, 2, or 4), or `last_1d`]"
type = string
}
Expand Down
14 changes: 8 additions & 6 deletions host/agent/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,32 +4,34 @@ locals {
monitor_warn_default_priority = null
monitor_nodata_default_priority = null

title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
title_prefix = var.title_prefix == null ? "" : "[${var.title_prefix}]"
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
}

resource "datadog_monitor" "host_unreachable" {
count = var.host_unreachable_enabled ? 1 : 0

name = join("", [local.title_prefix, "Host Unreachable - {{host.name}}", local.title_suffix])
message = local.query_alert_base_message
name = join("", [local.title_prefix, "Datadog Agent Status - {{name.name}}", local.title_suffix])
include_tags = false
message = var.host_unreachable_use_message ? local.query_alert_base_message : ""
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
type = "service check"

evaluation_delay = var.evaluation_delay
new_group_delay = var.new_group_delay
notify_no_data = var.notify_no_data
no_data_timeframe = "5"
notify_no_data = true
renotify_interval = var.renotify_interval
require_full_window = true
timeout_h = var.timeout_h

query = <<EOQ
"datadog.agent.up"${local.service_filter}.by("host").last(6).count_by_status()
"datadog.agent.up"${local.service_filter}.by("name","aws_account","env","datadog_managed").last(2).count_by_status()
EOQ

monitor_thresholds {
ok = 1
warning = 1
critical = 5
critical = 1
}
}
6 changes: 6 additions & 0 deletions host/agent/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,9 @@ variable "host_unreachable_enabled" {
description = "Flag to enable Host unreachable monitor"
type = bool
}

# Controls whether the host_unreachable monitor carries the module's alert
# message. When false, the monitor's message is set to an empty string.
variable "host_unreachable_use_message" {
  description = "Flag to enable Host unreachable alerting"
  type        = bool
  default     = true
}
11 changes: 6 additions & 5 deletions host/clock/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,28 @@ locals {
monitor_warn_default_priority = null
monitor_nodata_default_priority = null

title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
title_prefix = var.title_prefix == null ? "" : "[${var.title_prefix}]"
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
}

resource "datadog_monitor" "system_clock" {
count = var.system_clock_enabled ? 1 : 0

name = join("", [local.title_prefix, "System Clock - {{host.name}}", local.title_suffix])
message = local.query_alert_base_message
name = join("", [local.title_prefix, "System Clock - {{name.name}}", local.title_suffix])
include_tags = false
message = var.system_clock_use_message ? local.query_alert_base_message : ""
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
type = "service check"

evaluation_delay = var.evaluation_delay
new_group_delay = var.new_group_delay
notify_no_data = var.notify_no_data
notify_no_data = false
renotify_interval = var.renotify_interval
require_full_window = true
timeout_h = var.timeout_h

query = <<EOQ
"ntp.in_sync"${local.service_filter}.by("host").last(6).count_by_status()
"ntp.in_sync"${local.service_filter}.by("name","aws_account","env","datadog_managed").last(6).count_by_status()
EOQ

monitor_thresholds {
Expand Down
6 changes: 6 additions & 0 deletions host/clock/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,9 @@ variable "system_clock_enabled" {
description = "Flag to enable Host unreachable monitor"
type = bool
}

# Controls whether the system_clock monitor carries the module's alert
# message. When false (the default), the monitor's message is set to an
# empty string.
variable "system_clock_use_message" {
  default     = false
  description = "Flag to enable System Clock alerting"
  type        = bool
}
12 changes: 7 additions & 5 deletions host/cpu/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,29 +4,31 @@ locals {
monitor_warn_default_priority = null
monitor_nodata_default_priority = null

title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
title_prefix = var.title_prefix == null ? "" : "[${var.title_prefix}]"
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
}

resource "datadog_monitor" "cpu_utilization" {
count = var.cpu_utilization_enabled ? 1 : 0

name = join("", [local.title_prefix, "CPU Utilization - {{host.name}}", local.title_suffix])
message = local.query_alert_base_message
name = join("", [local.title_prefix, "CPU Utilization - {{name.name}}", local.title_suffix])
message = var.cpu_utilization_use_message ? local.query_alert_base_message : ""
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
type = "query alert"

evaluation_delay = var.evaluation_delay
new_group_delay = var.new_group_delay
notify_no_data = var.notify_no_data
notify_no_data = false
no_data_timeframe = var.cpu_utilization_no_data_window
renotify_interval = var.renotify_interval
require_full_window = true
timeout_h = var.timeout_h
include_tags = false


query = <<EOQ
${var.cpu_utilization_time_aggregator}(${var.cpu_utilization_timeframe}): (
100 - avg:system.cpu.idle${local.query_filter} by {host}
100 - avg:system.cpu.idle${local.query_filter} by {name,aws_account,env,datadog_managed}
) > ${var.cpu_utilization_threshold_critical}
EOQ

Expand Down
6 changes: 6 additions & 0 deletions host/cpu/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,9 @@ variable "cpu_utilization_threshold_warning" {
description = "Warning threshold (percent)"
type = number
}

# Controls whether the cpu_utilization monitor carries the module's alert
# message. When false (the default), the monitor's message is set to an
# empty string.
variable "cpu_utilization_use_message" {
  default     = false
  description = "Flag to enable CPU Utilization alerting"
  type        = bool
}
24 changes: 12 additions & 12 deletions host/disk/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@ locals {
monitor_warn_default_priority = null
monitor_nodata_default_priority = null

title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
title_prefix = var.title_prefix == null ? "" : "[${var.title_prefix}]"
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
}

resource "datadog_monitor" "disk_space" {
count = var.disk_space_enabled ? 1 : 0

name = join("", [local.title_prefix, "Disk Space - {{host.name}}", local.title_suffix])
message = local.query_alert_base_message
name = join("", [local.title_prefix, "Disk Space - {{name.name}}", local.title_suffix])
message = var.disk_space_use_message ? local.query_alert_base_message : ""
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
type = "query alert"

Expand All @@ -26,7 +26,7 @@ resource "datadog_monitor" "disk_space" {

query = <<EOQ
${var.disk_space_time_aggregator}(${var.disk_space_timeframe}):
avg:system.disk.in_use${local.query_filter} by {host,device}
avg:system.disk.in_use${local.query_filter} by {name,aws_account,device,env,datadog_managed}
* 100 > ${var.disk_space_threshold_critical}
EOQ

Expand All @@ -39,23 +39,23 @@ resource "datadog_monitor" "disk_space" {
resource "datadog_monitor" "disk_space_forecast" {
count = var.disk_space_forecast_enabled ? 1 : 0

name = join("", [local.title_prefix, "Disk Space Forecast - {{host.name}}", local.title_suffix])
message = local.query_alert_base_message
name = join("", [local.title_prefix, "Disk Space Forecast - {{name.name}}", local.title_suffix])
include_tags = false
message = var.disk_space_forecast_use_message ? local.query_alert_base_message : ""
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
type = "query alert"

evaluation_delay = var.evaluation_delay
new_group_delay = var.new_group_delay
notify_audit = false
timeout_h = var.timeout_h
include_tags = true
require_full_window = true
notify_no_data = false
renotify_interval = 0

query = <<EOQ
${var.disk_space_forecast_time_aggregator}(${var.disk_space_forecast_timeframe}):
forecast(avg:system.disk.in_use${local.query_filter} by {host,device} * 100,
forecast(avg:system.disk.in_use${local.query_filter} by {name,aws_account,device,env,datadog_managed} * 100,
'${var.disk_space_forecast_algorithm}',
${var.disk_space_forecast_deviations},
interval='${var.disk_space_forecast_interval}',
Expand All @@ -74,14 +74,15 @@ resource "datadog_monitor" "disk_space_forecast" {
resource "datadog_monitor" "disk_inodes" {
count = var.disk_inodes_enabled ? 1 : 0

name = join("", [local.title_prefix, "Disk Inodes Usage - {{host.name}}", local.title_suffix])
message = local.query_alert_base_message
name = join("", [local.title_prefix, "Disk Inodes Usage - {{name.name}}", local.title_suffix])
include_tags = false
message = var.disk_inodes_use_message ? local.query_alert_base_message : ""
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
type = "query alert"

query = <<EOQ
${var.disk_inodes_time_aggregator}(${var.disk_inodes_timeframe}):
avg:system.fs.inodes.in_use${local.query_filter} by {host,device}
avg:system.fs.inodes.in_use${local.query_filter} by {name,aws_account,device,env,datadog_managed}
* 100 > ${var.disk_inodes_threshold_critical}
EOQ

Expand All @@ -90,7 +91,6 @@ resource "datadog_monitor" "disk_inodes" {
notify_no_data = false
notify_audit = false
timeout_h = var.timeout_h
include_tags = true
require_full_window = true

monitor_thresholds {
Expand Down
18 changes: 18 additions & 0 deletions host/disk/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ variable "disk_space_threshold_critical" {
default = 90
}

# Controls whether the disk_space monitor carries the module's alert message.
# Declared as bool because the value is used directly as the condition of a
# conditional expression; callers that still pass the strings "true"/"false"
# keep working through Terraform's automatic string-to-bool conversion.
variable "disk_space_use_message" {
  description = "Flag to enable Free diskspace alerting"
  type        = bool
  default     = true
}

########################################
# Disk Space Forecast
########################################
Expand Down Expand Up @@ -115,6 +121,12 @@ variable "disk_space_forecast_threshold_critical" {
default = 80
}

# Controls whether the disk_space_forecast monitor carries the module's alert
# message. Declared as bool because the value is used directly as the
# condition of a conditional expression; callers that still pass the strings
# "true"/"false" keep working through Terraform's automatic conversion.
variable "disk_space_forecast_use_message" {
  description = "Flag to enable Free diskspace forecast alerting"
  type        = bool
  default     = false
}

########################################
# Disk Inodes
########################################
Expand Down Expand Up @@ -147,3 +159,9 @@ variable "disk_inodes_threshold_critical" {
type = number
default = 95
}

# Controls whether the disk_inodes monitor carries the module's alert message.
# Declared as bool because the value is used directly as the condition of a
# conditional expression; callers that still pass the strings "true"/"false"
# keep working through Terraform's automatic string-to-bool conversion.
variable "disk_inodes_use_message" {
  description = "Flag to enable Free disk inodes alerting"
  type        = bool
  default     = true
}
12 changes: 6 additions & 6 deletions host/memory/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,23 @@ locals {
monitor_warn_default_priority = null
monitor_nodata_default_priority = null

title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
title_prefix = var.title_prefix == null ? "" : "[${var.title_prefix}]"
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
}

resource "datadog_monitor" "memory" {
count = var.memory_enabled ? 1 : 0

name = join("", [local.title_prefix, "Usable Memory - {{host.name}}", local.title_suffix])
message = local.query_alert_base_message
name = join("", [local.title_prefix, "Usable Memory - {{name.name}}", local.title_suffix])
include_tags = false
message = var.memory_use_message ? local.query_alert_base_message : ""
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
type = "query alert"

query = <<EOQ
${var.memory_time_aggregator}(${var.memory_timeframe}):
avg:system.mem.usable${local.query_filter} by {host} /
avg:system.mem.total${local.query_filter} by {host} * 100
avg:system.mem.usable${local.query_filter} by {name,aws_account,env,datadog_managed} /
avg:system.mem.total${local.query_filter} by {name,aws_account,env,datadog_managed} * 100
< ${var.memory_threshold_critical}
EOQ

Expand All @@ -29,7 +30,6 @@ resource "datadog_monitor" "memory" {
renotify_interval = 0
notify_audit = false
timeout_h = var.timeout_h
include_tags = true
require_full_window = true

monitor_thresholds {
Expand Down
6 changes: 6 additions & 0 deletions host/memory/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,9 @@ variable "memory_threshold_critical" {
type = number
default = 5
}

# Controls whether the memory monitor carries the module's alert message.
# Declared as bool because the value is used directly as the condition of a
# conditional expression; callers that still pass the strings "true"/"false"
# keep working through Terraform's automatic string-to-bool conversion.
variable "memory_use_message" {
  description = "Flag to enable Free memory alerting"
  type        = bool
  default     = true
}
6 changes: 3 additions & 3 deletions host/process/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@ locals {
monitor_warn_default_priority = null
monitor_nodata_default_priority = null

title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
title_prefix = var.title_prefix == null ? "" : "[${var.title_prefix}]"
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
}

resource "datadog_monitor" "process_alert" {
count = var.process_alert_enabled ? 1 : 0

name = join("", [local.title_prefix, "Process Alert - {{host.name}}", local.title_suffix])
message = local.query_alert_base_message
message = var.process_alert_use_message ? local.query_alert_base_message : ""
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
type = "process alert"

Expand All @@ -21,7 +21,7 @@ resource "datadog_monitor" "process_alert" {
renotify_interval = 0
notify_audit = false
timeout_h = var.timeout_h
include_tags = true
include_tags = false
require_full_window = true

query = <<EOQ
Expand Down
6 changes: 6 additions & 0 deletions host/process/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,9 @@ variable "process_alert_operator" {
type = string
default = "<"
}

# Controls whether the process_alert monitor carries the module's alert
# message. Declared as bool because the value is used directly as the
# condition of a conditional expression; callers that still pass the strings
# "true"/"false" keep working through Terraform's automatic conversion.
variable "process_alert_use_message" {
  description = "Flag to enable Process Check alerting"
  type        = bool
  default     = true
}
Loading
Loading