From 7ef5d62278107d19375f0db6e0f92c7d1204ddd5 Mon Sep 17 00:00:00 2001 From: Lalo Galvan <106835113+lalo-galvan@users.noreply.github.com> Date: Wed, 22 Oct 2025 14:07:31 -0600 Subject: [PATCH 1/2] created acm certificate renewal failure monitor --- aws/acm/.terraform.lock.hcl | 44 +++++++++++++++++++++++++ aws/acm/README.md | 65 +++++++++++++++++++++++++++++++++++++ aws/acm/common.tf | 1 + aws/acm/main.tf | 30 +++++++++++++++++ aws/acm/variables.tf | 23 +++++++++++++ aws/acm/versions.tf | 1 + common/common.tf | 51 +++++++++++++++++++++++++++++ 7 files changed, 215 insertions(+) create mode 100644 aws/acm/.terraform.lock.hcl create mode 100644 aws/acm/README.md create mode 120000 aws/acm/common.tf create mode 100644 aws/acm/main.tf create mode 100644 aws/acm/variables.tf create mode 120000 aws/acm/versions.tf diff --git a/aws/acm/.terraform.lock.hcl b/aws/acm/.terraform.lock.hcl new file mode 100644 index 0000000..bda5b8d --- /dev/null +++ b/aws/acm/.terraform.lock.hcl @@ -0,0 +1,44 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/datadog/datadog" { + version = "3.76.0" + constraints = ">= 3.37.0" + hashes = [ + "h1:WDN2Ar3dt3s35zy1jng7O28gDRaGZgLqqDzvb9pAtFo=", + "zh:055bfca2ab3d987fb1a6827c27673d08d50d7c6026be020db0176722fe73715a", + "zh:0568ce6217ca42c06d0d3d7788194d806c415f3ed831285751c245aa19377d3a", + "zh:48a82913ff629d3e20db815f98c916493abda8fef5c8a0258401e5122a5823d9", + "zh:893f70d972afbaf92f4d7c2d8e0cc2d542ac4fa46931ff7ef176dc53948fe985", + "zh:9213a9ad4f0a0806cf048c0a73fcf4abaf4b5a0459b5a82f5ce15e567f24fddf", + "zh:a22d66b33f372b703cc6e680880689ad7f46473ea402e2d82c1167793e953191", + "zh:a9606223ad215174a871cfc49e97aee04fbad4b1d444f6f00c49f76682596ddf", + "zh:adede6b1f1536b90339288b2741b0cdde3f5e26107bedef1ac04e15d164a3d9a", + "zh:c4597d7bb5fb9c17c3a11d58d1aa052f004cac080176ee995ee8a04ac8f9a2d3", + "zh:c4a64ca1ba1b58b90466c51591327d09214022dd47885fada20655b7e9830074", + "zh:d3f28181e605090b96d0059f201209649bde557feec2bd5955538dae88cc1a20", + "zh:d40402ba3289816f933895f009a04f525894417d6fc839c19cb12e612a9fea0b", + "zh:e7308e1315e9328f0c3ae9c631231bf71541b8220b5a9b3be32e00990b3b344d", + "zh:eb8189220c79020cff0033aa433332999b35b8c0a85a0a6afbd6a663b7211e0b", + ] +} + +provider "registry.terraform.io/hashicorp/null" { + version = "3.2.4" + constraints = ">= 3.1.0" + hashes = [ + "h1:L5V05xwp/Gto1leRryuesxjMfgZwjb7oool4WS1UEFQ=", + "zh:59f6b52ab4ff35739647f9509ee6d93d7c032985d9f8c6237d1f8a59471bbbe2", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:795c897119ff082133150121d39ff26cb5f89a730a2c8c26f3a9c1abf81a9c43", + "zh:7b9c7b16f118fbc2b05a983817b8ce2f86df125857966ad356353baf4bff5c0a", + "zh:85e33ab43e0e1726e5f97a874b8e24820b6565ff8076523cc2922ba671492991", + "zh:9d32ac3619cfc93eb3c4f423492a8e0f79db05fec58e449dee9b2d5873d5f69f", + "zh:9e15c3c9dd8e0d1e3731841d44c34571b6c97f5b95e8296a45318b94e5287a6e", + "zh:b4c2ab35d1b7696c30b64bf2c0f3a62329107bd1a9121ce70683dec58af19615", + 
"zh:c43723e8cc65bcdf5e0c92581dcbbdcbdcf18b8d2037406a5f2033b1e22de442", + "zh:ceb5495d9c31bfb299d246ab333f08c7fb0d67a4f82681fbf47f2a21c3e11ab5", + "zh:e171026b3659305c558d9804062762d168f50ba02b88b231d20ec99578a6233f", + "zh:ed0fe2acdb61330b01841fa790be00ec6beaac91d41f311fb8254f74eb6a711f", + ] +} diff --git a/aws/acm/README.md b/aws/acm/README.md new file mode 100644 index 0000000..17bb89c --- /dev/null +++ b/aws/acm/README.md @@ -0,0 +1,65 @@ + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | ~> 1.5 | +| [datadog](#requirement\_datadog) | >= 3.37 | +| [null](#requirement\_null) | >= 3.1.0 | + +## Providers + +| Name | Version | +|------|---------| +| [datadog](#provider\_datadog) | 3.76.0 | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [datadog_monitor.certificate_renewal_failure_check](https://registry.terraform.io/providers/datadog/datadog/latest/docs/resources/monitor) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [additional\_tags](#input\_additional\_tags) | Additional tags (key:value format) to add to this type of check (combined with `local.tags` and `var.base_tags`) | `list(string)` | `[]` | no | +| [alert\_critical\_priority](#input\_alert\_critical\_priority) | Priority for alerts within critical threshold (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | +| [alert\_message](#input\_alert\_message) | Message to prepend to alert notifications | `string` | `"Alert"` | no | +| [alert\_nodata\_priority](#input\_alert\_nodata\_priority) | Priority for alerts within warning threshold (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | +| [base\_tags](#input\_base\_tags) | Base tags (key:value format) to add to this type of check (combined with `local.tags` and `var.additional_tags`, generally you should not change this) | `list(string)` |
[| no | +| [certificate\_renewal\_failure\_check\_enabled](#input\_certificate\_renewal\_failure\_check\_enabled) | Whether to enable the certificate renewal failure check | `bool` | `true` | no | +| [cost\_center](#input\_cost\_center) | Cost Center of the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [dashboard\_link](#input\_dashboard\_link) | Dashboard link to include in message | `string` | `null` | no | +| [env](#input\_env) | Environment the monitored resource is in (leave blank to omit tag) | `string` | `null` | no | +| [evaluation\_delay](#input\_evaluation\_delay) | Monitor evaluation delay (see [Datadog Docs](https://docs.datadoghq.com/monitors/configuration/?tab=thresholdalert#set-alert-conditions)) | `number` | `900` | no | +| [group\_by](#input\_group\_by) | List of tags to group by | `list(string)` |
"resource:ec2"
]
[| no | +| [monitor\_exclude\_tags](#input\_monitor\_exclude\_tags) | Tags to be excluded in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | +| [monitor\_include\_tags](#input\_monitor\_include\_tags) | Tags to be included in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | +| [new\_group\_delay](#input\_new\_group\_delay) | Delay in seconds before generating alerts for a new resource | `number` | `300` | no | +| [notify\_alert\_override](#input\_notify\_alert\_override) | List of notifications for alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_crit\_override](#input\_notify\_crit\_override) | List of notifications for 24x7 alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_default](#input\_notify\_default) | List of alert notifications (can be overridden based on alert type) | `list(string)` | n/a | yes | +| [notify\_no\_data](#input\_notify\_no\_data) | Alert if no matching data is found | `bool` | `false` | no | +| [notify\_nodata\_override](#input\_notify\_nodata\_override) | List of notifications for no data (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_nonprod\_override](#input\_notify\_nonprod\_override) | List of notifications for non-prod alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_prod\_override](#input\_notify\_prod\_override) | List of notifications for 12x5 prod alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_recovery\_override](#input\_notify\_recovery\_override) | List of notifications for alert recovery (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_warn\_override](#input\_notify\_warn\_override) | List of notifications for alerts in warning threshold (uses `notify_default` otherwise) | 
`list(string)` | `[]` | no | +| [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `60` | no | +| [runbook\_link](#input\_runbook\_link) | Runbook link to include in message | `string` | `null` | no | +| [service](#input\_service) | Service associated with the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [team](#input\_team) | Team supporting the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [timeout\_h](#input\_timeout\_h) | Auto-resolve alert in specified hours if condition no longer matches | `number` | `0` | no | +| [title\_prefix](#input\_title\_prefix) | Prefix all alerts with specified value in brackets | `string` | `null` | no | +| [title\_suffix](#input\_title\_suffix) | Suffix all alerts with specified value in parentheses | `string` | `null` | no | +| [warn\_priority](#input\_warn\_priority) | Priority for alerts within warning threshold (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | + +## Outputs + +No outputs. + \ No newline at end of file diff --git a/aws/acm/common.tf b/aws/acm/common.tf new file mode 120000 index 0000000..47c0063 --- /dev/null +++ b/aws/acm/common.tf @@ -0,0 +1 @@ +../../common/common.tf \ No newline at end of file diff --git a/aws/acm/main.tf b/aws/acm/main.tf new file mode 100644 index 0000000..a1c84be --- /dev/null +++ b/aws/acm/main.tf @@ -0,0 +1,30 @@ +locals { + # these must be defined but do not need to be overridden + monitor_alert_default_priority = null + monitor_warn_default_priority = null + monitor_nodata_default_priority = null + + title_prefix = var.title_prefix == null ? "" : "[${var.title_prefix}]" + title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})" +} + +resource "datadog_monitor" "certificate_renewal_failure_check" { + count = var.certificate_renewal_failure_check_enabled ?
1 : 0 + + name = join("", [local.title_prefix, "ACM - Certificate Renewal Failure", local.title_suffix]) + type = "event-v2 alert" + message = local.event_alert_base_message + tags = concat(local.common_tags, var.base_tags, var.additional_tags) + include_tags = false + + evaluation_delay = var.evaluation_delay + new_group_delay = var.new_group_delay + + query = <<-EOQ + events("source:amazon_acm").rollup("count").by("@aggregation_key,env").last("5m") > 0 + EOQ + + monitor_thresholds { + critical = 0 + } +} diff --git a/aws/acm/variables.tf b/aws/acm/variables.tf new file mode 100644 index 0000000..fe1b364 --- /dev/null +++ b/aws/acm/variables.tf @@ -0,0 +1,23 @@ +######################################## +# Global variables +######################################## +variable "additional_tags" { + default = [] + description = "Additional tags (key:value format) to add to this type of check (combined with `local.tags` and `var.base_tags`)" + type = list(string) +} + +variable "base_tags" { + default = ["resource:ec2"] + description = "Base tags (key:value format) to add to this type of check (combined with `local.tags` and `var.additional_tags`, generally you should not change this)" + type = list(string) +} + +######################################## +# Certificate Renewal Failure Check +######################################## +variable "certificate_renewal_failure_check_enabled" { + default = true + description = "Whether to enable the certificate renewal failure check" + type = bool +} diff --git a/aws/acm/versions.tf b/aws/acm/versions.tf new file mode 120000 index 0000000..cbeda73 --- /dev/null +++ b/aws/acm/versions.tf @@ -0,0 +1 @@ +../../common/versions.tf \ No newline at end of file diff --git a/common/common.tf b/common/common.tf index 016c3f1..403aed3 100644 --- a/common/common.tf +++ b/common/common.tf @@ -341,6 +341,57 @@ ${local.alert_context} **Alert Information** {{#is_alert}} ${local.notify_on_alert} {{/is_alert}} {{#is_recovery}} 
${local.notify_on_recovery} {{/is_recovery}} +END + + event_alert_base_message = <
"name",
"aws_account",
"env",
"datadog_managed"
]
[| no | +| [base\_tags](#input\_base\_tags) | Base tags (key:value format) to add to this type of check (combined with `local.tags` and `var.additional_tags`, generally you should not change this) | `list(string)` |
"resource:ec2"
]
[| no | | [certificate\_renewal\_failure\_check\_enabled](#input\_certificate\_renewal\_failure\_check\_enabled) | Whether to enable the certificate renewal failure check | `bool` | `true` | no | | [cost\_center](#input\_cost\_center) | Cost Center of the monitored resource (leave blank to omit tag) | `string` | `null` | no | | [dashboard\_link](#input\_dashboard\_link) | Dashboard link to include in message | `string` | `null` | no | diff --git a/aws/acm/variables.tf b/aws/acm/variables.tf index fe1b364..b406c50 100644 --- a/aws/acm/variables.tf +++ b/aws/acm/variables.tf @@ -8,7 +8,7 @@ variable "additional_tags" { } variable "base_tags" { - default = ["resource:ec2"] + default = ["resource:acm"] description = "Base tags (key:value format) to add to this type of check (combined with `local.tags` and `var.additional_tags`, generally you should not change this)" type = list(string) } diff --git a/common/common.tf b/common/common.tf index 403aed3..e4fa7af 100644 --- a/common/common.tf +++ b/common/common.tf @@ -340,7 +340,6 @@ END ${local.alert_context} **Alert Information** {{#is_alert}} ${local.notify_on_alert} {{/is_alert}} -{{#is_recovery}} ${local.notify_on_recovery} {{/is_recovery}} END event_alert_base_message = <
"resource:acm"
]