From b67838eafb052081218804debe713f7dc88704c8 Mon Sep 17 00:00:00 2001 From: Kevin Date: Fri, 18 Apr 2025 08:42:33 -0400 Subject: [PATCH] Added systemd unit monitor --- host/systemd/README.md | 68 +++++++++++++++++++++++++++++++++++++++ host/systemd/common.tf | 1 + host/systemd/main.tf | 34 ++++++++++++++++++++ host/systemd/variables.tf | 41 +++++++++++++++++++++++ host/systemd/versions.tf | 1 + host/windows/main.tf | 2 +- 6 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 host/systemd/README.md create mode 120000 host/systemd/common.tf create mode 100644 host/systemd/main.tf create mode 100644 host/systemd/variables.tf create mode 120000 host/systemd/versions.tf diff --git a/host/systemd/README.md b/host/systemd/README.md new file mode 100644 index 0000000..d19e5ac --- /dev/null +++ b/host/systemd/README.md @@ -0,0 +1,68 @@ + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | ~> 1.5 | +| [datadog](#requirement\_datadog) | >= 3.37 | +| [null](#requirement\_null) | >= 3.1.0 | + +## Providers + +| Name | Version | +|------|---------| +| [datadog](#provider\_datadog) | >= 3.37 | + +## Modules + +No modules. 
+ +## Resources + +| Name | Type | +|------|------| +| [datadog_monitor.systemd_unit](https://registry.terraform.io/providers/datadog/datadog/latest/docs/resources/monitor) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [additional\_tags](#input\_additional\_tags) | Additional tags to apply to all monitors | `list(string)` | `[]` | no | +| [alert\_critical\_priority](#input\_alert\_critical\_priority) | Priority for alerts within critical threshold (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | +| [alert\_message](#input\_alert\_message) | Message to prepend to alert notifications | `string` | `"Alert"` | no | +| [alert\_nodata\_priority](#input\_alert\_nodata\_priority) | Priority for alerts with no data (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | +| [base\_tags](#input\_base\_tags) | Base tags to apply to all monitors | `list(string)` | `[]` | no | +| [cost\_center](#input\_cost\_center) | Cost Center of the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [dashboard\_link](#input\_dashboard\_link) | Dashboard link to include in message | `string` | `null` | no | +| [env](#input\_env) | Environment the monitored resource is in (leave blank to omit tag) | `string` | `null` | no | +| [evaluation\_delay](#input\_evaluation\_delay) | Monitor evaluation delay (see [Datadog Docs](https://docs.datadoghq.com/monitors/configuration/?tab=thresholdalert#set-alert-conditions)) | `number` | `900` | no | +| [monitor\_exclude\_tags](#input\_monitor\_exclude\_tags) | Tags to be excluded in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | +| [monitor\_include\_tags](#input\_monitor\_include\_tags) | Tags to be included in the monitoring query.
Specify in key:value format | `list(string)` | `[]` | no | +| [new\_group\_delay](#input\_new\_group\_delay) | Delay in seconds before generating alerts for a new resource | `number` | `300` | no | +| [notify\_alert\_override](#input\_notify\_alert\_override) | List of notifications for alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_crit\_override](#input\_notify\_crit\_override) | List of notifications for 24x7 alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_default](#input\_notify\_default) | List of alert notifications (can be overridden based on alert type) | `list(string)` | n/a | yes | +| [notify\_no\_data](#input\_notify\_no\_data) | Alert if no matching data is found | `bool` | `false` | no | +| [notify\_nodata\_override](#input\_notify\_nodata\_override) | List of notifications for no data (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_nonprod\_override](#input\_notify\_nonprod\_override) | List of notifications for non-prod alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_prod\_override](#input\_notify\_prod\_override) | List of notifications for 12x5 prod alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_recovery\_override](#input\_notify\_recovery\_override) | List of notifications for alert recovery (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_warn\_override](#input\_notify\_warn\_override) | List of notifications for alerts in warning threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `60` | no | +| [runbook\_link](#input\_runbook\_link) | Runbook link to include in message | `string` | `null` | no | +| 
[service](#input\_service) | Service associated with the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [systemd\_unit\_alert\_enabled](#input\_systemd\_unit\_alert\_enabled) | Enable or disable the Systemd service alert monitor | `bool` | `true` | no | +| [systemd\_unit\_alert\_threshold\_critical](#input\_systemd\_unit\_alert\_threshold\_critical) | Critical threshold for the Systemd service alert (count of services not running/failed) | `number` | `2` | no | +| [systemd\_unit\_alert\_threshold\_warning](#input\_systemd\_unit\_alert\_threshold\_warning) | Warning threshold for the Systemd service alert (count of services not running/failed) | `number` | `1` | no | +| [systemd\_unit\_alert\_use\_message](#input\_systemd\_unit\_alert\_use\_message) | Whether to use the base message for the Systemd service alert | `bool` | `true` | no | +| [systemd\_units\_filter](#input\_systemd\_units\_filter) | List of specific systemd units (services) to monitor. If empty, monitors all. | `list(string)` | `[]` | no | +| [team](#input\_team) | Team supporting the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [timeout\_h](#input\_timeout\_h) | Auto-resolve alert in specified hours if condition no longer matches | `number` | `0` | no | +| [title\_prefix](#input\_title\_prefix) | Prefix all alerts with specified value in brackets | `string` | `null` | no | +| [title\_suffix](#input\_title\_suffix) | Suffix all alerts with specified value in parentheses | `string` | `null` | no | +| [warn\_priority](#input\_warn\_priority) | Priority for alerts within warning threshold (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | + +## Outputs + +No outputs.
+ \ No newline at end of file diff --git a/host/systemd/common.tf b/host/systemd/common.tf new file mode 120000 index 0000000..47c0063 --- /dev/null +++ b/host/systemd/common.tf @@ -0,0 +1 @@ +../../common/common.tf \ No newline at end of file diff --git a/host/systemd/main.tf b/host/systemd/main.tf new file mode 100644 index 0000000..8e826e2 --- /dev/null +++ b/host/systemd/main.tf @@ -0,0 +1,34 @@ +locals { + monitor_alert_default_priority = null + monitor_warn_default_priority = null + monitor_nodata_default_priority = null + + title_prefix = var.title_prefix == null ? "" : "[${var.title_prefix}]" + title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})" +} + +resource "datadog_monitor" "systemd_unit" { + count = var.systemd_unit_alert_enabled ? 1 : 0 + + name = join("", [local.title_prefix, "Systemd Unit Status - {{host.name}}", local.title_suffix]) + type = "service check" + message = var.systemd_unit_alert_use_message ? local.query_alert_base_message : "" + tags = concat(local.common_tags, var.base_tags, var.additional_tags) + + evaluation_delay = var.evaluation_delay + notify_no_data = false + notify_audit = false + renotify_interval = 60 + timeout_h = var.timeout_h + include_tags = false + require_full_window = false + + query = <