From 25317f087975b31eb71cc223c0a92198cb149359 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20MARMOL?= Date: Mon, 16 Jan 2023 10:59:54 +0100 Subject: [PATCH 1/9] integration_azure-automation-account --- docs/severity.md | 8 ++ .../README.md | 105 ++++++++++++++++++ .../common-filters.tf | 1 + .../common-locals.tf | 1 + .../common-modules.tf | 1 + .../common-variables.tf | 1 + .../common-versions.tf | 1 + .../conf/01-jobs.yaml | 20 ++++ .../conf/readme.yaml | 3 + .../detectors-gen.tf | 29 +++++ .../outputs.tf | 5 + .../tags.tf | 4 + .../variables-gen.tf | 61 ++++++++++ 13 files changed, 240 insertions(+) create mode 100644 modules/integration_azure-automation-account/README.md create mode 120000 modules/integration_azure-automation-account/common-filters.tf create mode 120000 modules/integration_azure-automation-account/common-locals.tf create mode 120000 modules/integration_azure-automation-account/common-modules.tf create mode 120000 modules/integration_azure-automation-account/common-variables.tf create mode 120000 modules/integration_azure-automation-account/common-versions.tf create mode 100644 modules/integration_azure-automation-account/conf/01-jobs.yaml create mode 100644 modules/integration_azure-automation-account/conf/readme.yaml create mode 100644 modules/integration_azure-automation-account/detectors-gen.tf create mode 100644 modules/integration_azure-automation-account/outputs.tf create mode 100644 modules/integration_azure-automation-account/tags.tf create mode 100644 modules/integration_azure-automation-account/variables-gen.tf diff --git a/docs/severity.md b/docs/severity.md index fab655be5..f24280f01 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -31,6 +31,7 @@ - [integration_azure-app-service-plan](#integration_azure-app-service-plan) - [integration_azure-app-service](#integration_azure-app-service) - [integration_azure-application-gateway](#integration_azure-application-gateway) +- 
[integration_azure-automation-account](#integration_azure-automation-account) - [integration_azure-azure-search](#integration_azure-azure-search) - [integration_azure-container-instance](#integration_azure-container-instance) - [integration_azure-cosmos-db](#integration_azure-cosmos-db) @@ -380,6 +381,13 @@ |Azure Application Gateway capacity units|-|X|-|-|-| +## integration_azure-automation-account + +|Detector|Critical|Major|Minor|Warning|Info| +|---|---|---|---|---|---| +|Azure Automation Account jobs|X|-|-|-|-| + + ## integration_azure-azure-search |Detector|Critical|Major|Minor|Warning|Info| diff --git a/modules/integration_azure-automation-account/README.md b/modules/integration_azure-automation-account/README.md new file mode 100644 index 000000000..7ccec8fac --- /dev/null +++ b/modules/integration_azure-automation-account/README.md @@ -0,0 +1,105 @@ +# AZURE-AUTOMATION-ACCOUNT SignalFx detectors + + + +:link: **Contents** + +- [How to use this module?](#how-to-use-this-module) +- [What are the available detectors in this module?](#what-are-the-available-detectors-in-this-module) +- [How to collect required metrics?](#how-to-collect-required-metrics) + - [Metrics](#metrics) +- [Related documentation](#related-documentation) + + + +## How to use this module? 
+ +This directory defines a [Terraform](https://www.terraform.io/) +[module](https://www.terraform.io/language/modules/syntax) you can use in your +existing [stack](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#stack) by adding a +`module` configuration and setting its `source` parameter to the URL of this folder: + +```hcl +module "signalfx-detectors-integration-azure-automation-account" { + source = "github.com/claranet/terraform-signalfx-detectors.git//modules/integration_azure-automation-account?ref={revision}" + + environment = var.environment + notifications = local.notifications +} +``` + +Note the following parameters: + +* `source`: Use this parameter to specify the URL of the module. The double slash (`//`) is intentional and required. + Terraform uses it to specify subfolders within a Git repo (see [module + sources](https://www.terraform.io/language/modules/sources)). The `ref` parameter specifies a specific Git tag in + this repository. It is recommended to use the latest "pinned" version in place of `{revision}`. Avoid using a branch + like `master` except for testing purposes. Note that all modules in this repository are available on the Terraform + [registry](https://registry.terraform.io/modules/claranet/detectors/signalfx) and we recommend using it as source + instead of `git` which is more flexible but less future-proof. + +* `environment`: Use this parameter to specify the + [environment](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#environment) used by this + instance of the module. + Its value will be added to the `prefixes` list at the start of the [detector + name](https://github.com/claranet/terraform-signalfx-detectors/wiki/Templating#example). 
+ In general, it will also be used in the `filtering` internal sub-module to [apply + filters](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance#filtering) based on our default + [tagging convention](https://github.com/claranet/terraform-signalfx-detectors/wiki/Tagging-convention) by default. + +* `notifications`: Use this parameter to define where alerts should be sent depending on their severity. It consists + of a Terraform [object](https://www.terraform.io/language/expressions/type-constraints#object) where each key represents an available + [detector rule severity](https://docs.splunk.com/observability/alerts-detectors-notifications/create-detectors-for-alerts.html#severity) + and its value is a list of recipients. Every recipient must respect the [detector notification + format](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector#notification-format). + Check the [notification binding](https://github.com/claranet/terraform-signalfx-detectors/wiki/Notifications-binding) + documentation to understand the recommended role of each severity. + +These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all +[modules](../) in this repository. Other variables, specific to this module, are available in +[variables-gen.tf](variables-gen.tf). +In general, the default configuration "works" but all of these Terraform +[variables](https://www.terraform.io/language/values/variables) make it possible to +customize the detectors behavior to better fit your needs. + +Most of them represent usual tips and rules detailed in the +[guidance](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance) documentation and listed in the +common [variables](https://github.com/claranet/terraform-signalfx-detectors/wiki/Variables) dedicated documentation. 
+ +Feel free to explore the [wiki](https://github.com/claranet/terraform-signalfx-detectors/wiki) for more information about +general usage of this repository. + +## What are the available detectors in this module? + +This module creates the following SignalFx detectors which could contain one or multiple alerting rules: + +|Detector|Critical|Major|Minor|Warning|Info| +|---|---|---|---|---|---| +|Azure Automation Account jobs|X|-|-|-|-| + +## How to collect required metrics? + +This module deploys detectors using metrics reported by the +[Azure integration](https://docs.splunk.com/Observability/gdi/get-data-in/connect/azure/azure.html) configurable +with [this Terraform module](https://github.com/claranet/terraform-signalfx-integrations/tree/master/cloud/azure). + + +Check the [Related documentation](#related-documentation) section for more detailed and specific information about this module dependencies. + + + +### Metrics + + +Here is the list of required metrics for detectors in this module. 
+ +* `TotalJob` + + + + +## Related documentation + +* [Terraform SignalFx provider](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs) +* [Terraform SignalFx detector](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector) +* [Splunk Observability integrations](https://docs.splunk.com/Observability/gdi/get-data-in/integrations.html) diff --git a/modules/integration_azure-automation-account/common-filters.tf b/modules/integration_azure-automation-account/common-filters.tf new file mode 120000 index 000000000..9d7fa21ea --- /dev/null +++ b/modules/integration_azure-automation-account/common-filters.tf @@ -0,0 +1 @@ +../../common/module/filters-integration-azure.tf \ No newline at end of file diff --git a/modules/integration_azure-automation-account/common-locals.tf b/modules/integration_azure-automation-account/common-locals.tf new file mode 120000 index 000000000..5672d21ab --- /dev/null +++ b/modules/integration_azure-automation-account/common-locals.tf @@ -0,0 +1 @@ +../../common/module/locals.tf \ No newline at end of file diff --git a/modules/integration_azure-automation-account/common-modules.tf b/modules/integration_azure-automation-account/common-modules.tf new file mode 120000 index 000000000..8c81ef377 --- /dev/null +++ b/modules/integration_azure-automation-account/common-modules.tf @@ -0,0 +1 @@ +../../common/module/modules.tf \ No newline at end of file diff --git a/modules/integration_azure-automation-account/common-variables.tf b/modules/integration_azure-automation-account/common-variables.tf new file mode 120000 index 000000000..f3037a584 --- /dev/null +++ b/modules/integration_azure-automation-account/common-variables.tf @@ -0,0 +1 @@ +../../common/module/variables.tf \ No newline at end of file diff --git a/modules/integration_azure-automation-account/common-versions.tf b/modules/integration_azure-automation-account/common-versions.tf new file mode 120000 index 
000000000..fa7f5509f --- /dev/null +++ b/modules/integration_azure-automation-account/common-versions.tf @@ -0,0 +1 @@ +../../common/module/versions.tf \ No newline at end of file diff --git a/modules/integration_azure-automation-account/conf/01-jobs.yaml b/modules/integration_azure-automation-account/conf/01-jobs.yaml new file mode 100644 index 000000000..053719265 --- /dev/null +++ b/modules/integration_azure-automation-account/conf/01-jobs.yaml @@ -0,0 +1,20 @@ +## Example +module: Azure Automation Account +name: jobs + +transformation: true +aggregation: true + +filtering: "filter('resource_type', 'Microsoft.Automation/automationAccounts') and filter('primary_aggregation_type', 'true') and filter('status', 'Failed')" + +signals: + totaljob: + metric: "TotalJob" + signal: + formula: totaljob.count(by=['runbook']) +rules: + critical: + threshold: 1 + comparator: ">=" + + diff --git a/modules/integration_azure-automation-account/conf/readme.yaml b/modules/integration_azure-automation-account/conf/readme.yaml new file mode 100644 index 000000000..9015fc41a --- /dev/null +++ b/modules/integration_azure-automation-account/conf/readme.yaml @@ -0,0 +1,3 @@ +documentations: + +source_doc: diff --git a/modules/integration_azure-automation-account/detectors-gen.tf b/modules/integration_azure-automation-account/detectors-gen.tf new file mode 100644 index 000000000..2fbc3b7b6 --- /dev/null +++ b/modules/integration_azure-automation-account/detectors-gen.tf @@ -0,0 +1,29 @@ +resource "signalfx_detector" "jobs" { + name = format("%s %s", local.detector_name_prefix, "Azure Automation Account jobs") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + base_filtering = filter('resource_type', 'Microsoft.Automation/automationAccounts') and filter('primary_aggregation_type', 'true') and filter('status', 
'Failed') + totaljob = data('TotalJob', filter=base_filtering and ${module.filtering.signalflow})${var.jobs_aggregation_function}${var.jobs_transformation_function} + signal = totaljob.count(by=['runbook']).publish('signal') + detect(when(signal >= ${var.jobs_threshold_critical}, lasting=%{if var.jobs_lasting_duration_critical == null}None%{else}'${var.jobs_lasting_duration_critical}'%{endif}, at_least=${var.jobs_at_least_percentage_critical})).publish('CRIT') +EOF + + rule { + description = "is too high >= ${var.jobs_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.jobs_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.jobs_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.jobs_runbook_url, var.runbook_url), "") + tip = var.jobs_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.jobs_max_delay +} + diff --git a/modules/integration_azure-automation-account/outputs.tf b/modules/integration_azure-automation-account/outputs.tf new file mode 100644 index 000000000..64f143f92 --- /dev/null +++ b/modules/integration_azure-automation-account/outputs.tf @@ -0,0 +1,5 @@ +output "jobs" { + description = "Detector resource for jobs" + value = signalfx_detector.jobs +} + diff --git a/modules/integration_azure-automation-account/tags.tf b/modules/integration_azure-automation-account/tags.tf new file mode 100644 index 000000000..54c6e26af --- /dev/null +++ b/modules/integration_azure-automation-account/tags.tf @@ -0,0 +1,4 @@ +locals { + tags = ["integration", "azure-automation-account"] +} + diff --git a/modules/integration_azure-automation-account/variables-gen.tf b/modules/integration_azure-automation-account/variables-gen.tf new file mode 100644 index 000000000..8295fc597 --- /dev/null +++ b/modules/integration_azure-automation-account/variables-gen.tf @@ -0,0 +1,61 @@ +# jobs detector + +variable "jobs_notifications" { + description = "Notification recipients list per severity overridden for jobs detector" + type = map(list(string)) + default = {} +} + +variable "jobs_aggregation_function" { + description = "Aggregation function and group by for jobs detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "jobs_transformation_function" { + description = "Transformation function for jobs detector (i.e. 
\".mean(over='5m')\")" + type = string + default = "" +} + +variable "jobs_max_delay" { + description = "Enforce max delay for jobs detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "jobs_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "jobs_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "jobs_disabled" { + description = "Disable all alerting rules for jobs detector" + type = bool + default = null +} + +variable "jobs_threshold_critical" { + description = "Critical threshold for jobs detector" + type = number + default = 1 +} + +variable "jobs_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "jobs_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} From 55f331267391ad58d5c370d301f19269326a1eb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20MARMOL?= Date: Mon, 16 Jan 2023 15:15:22 +0100 Subject: [PATCH 2/9] Fill data with last value + add lasting --- modules/integration_azure-automation-account/conf/01-jobs.yaml | 3 ++- modules/integration_azure-automation-account/detectors-gen.tf | 2 +- modules/integration_azure-automation-account/variables-gen.tf | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/modules/integration_azure-automation-account/conf/01-jobs.yaml b/modules/integration_azure-automation-account/conf/01-jobs.yaml index 053719265..f060db0c4 100644 --- a/modules/integration_azure-automation-account/conf/01-jobs.yaml +++ b/modules/integration_azure-automation-account/conf/01-jobs.yaml @@ -11,10 +11,11 @@ signals: totaljob: metric: 
"TotalJob" signal: - formula: totaljob.count(by=['runbook']) + formula: totaljob.fill().count(by=['runbook']) rules: critical: threshold: 1 comparator: ">=" + lasting_duration: "1h" diff --git a/modules/integration_azure-automation-account/detectors-gen.tf b/modules/integration_azure-automation-account/detectors-gen.tf index 2fbc3b7b6..44bab541b 100644 --- a/modules/integration_azure-automation-account/detectors-gen.tf +++ b/modules/integration_azure-automation-account/detectors-gen.tf @@ -8,7 +8,7 @@ resource "signalfx_detector" "jobs" { program_text = <<-EOF base_filtering = filter('resource_type', 'Microsoft.Automation/automationAccounts') and filter('primary_aggregation_type', 'true') and filter('status', 'Failed') totaljob = data('TotalJob', filter=base_filtering and ${module.filtering.signalflow})${var.jobs_aggregation_function}${var.jobs_transformation_function} - signal = totaljob.count(by=['runbook']).publish('signal') + signal = totaljob.fill().count(by=['runbook']).publish('signal') detect(when(signal >= ${var.jobs_threshold_critical}, lasting=%{if var.jobs_lasting_duration_critical == null}None%{else}'${var.jobs_lasting_duration_critical}'%{endif}, at_least=${var.jobs_at_least_percentage_critical})).publish('CRIT') EOF diff --git a/modules/integration_azure-automation-account/variables-gen.tf b/modules/integration_azure-automation-account/variables-gen.tf index 8295fc597..4a96f8a1d 100644 --- a/modules/integration_azure-automation-account/variables-gen.tf +++ b/modules/integration_azure-automation-account/variables-gen.tf @@ -51,7 +51,7 @@ variable "jobs_threshold_critical" { variable "jobs_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "1h" } variable "jobs_at_least_percentage_critical" { From 791d6c03b142efd0bf7eb2c96479b6bbb8de4ab3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20MARMOL?= Date: Mon, 16 Jan 2023 16:23:02 +0100 
Subject: [PATCH 3/9] Add comparison with succeded jobs to avoid false positive --- .../conf/01-jobs.yaml | 15 +++++++++------ .../detectors-gen.tf | 7 ++++--- .../variables-gen.tf | 6 +++--- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/modules/integration_azure-automation-account/conf/01-jobs.yaml b/modules/integration_azure-automation-account/conf/01-jobs.yaml index f060db0c4..5b0db3da9 100644 --- a/modules/integration_azure-automation-account/conf/01-jobs.yaml +++ b/modules/integration_azure-automation-account/conf/01-jobs.yaml @@ -2,20 +2,23 @@ module: Azure Automation Account name: jobs -transformation: true -aggregation: true +transformation: ".count(over='1h')" +aggregation: ".count(by=['runbook'])" -filtering: "filter('resource_type', 'Microsoft.Automation/automationAccounts') and filter('primary_aggregation_type', 'true') and filter('status', 'Failed')" +filtering: "filter('resource_type', 'Microsoft.Automation/automationAccounts') and filter('primary_aggregation_type', 'true')" signals: - totaljob: + success: metric: "TotalJob" + filtering: "filter('status', 'Completed')" + failed: + metric: "TotalJob" + filtering: "filter('status', 'Failed')" signal: - formula: totaljob.fill().count(by=['runbook']) + formula: (failed - success).min(over='1h') rules: critical: threshold: 1 comparator: ">=" - lasting_duration: "1h" diff --git a/modules/integration_azure-automation-account/detectors-gen.tf b/modules/integration_azure-automation-account/detectors-gen.tf index 44bab541b..4a2fbff24 100644 --- a/modules/integration_azure-automation-account/detectors-gen.tf +++ b/modules/integration_azure-automation-account/detectors-gen.tf @@ -6,9 +6,10 @@ resource "signalfx_detector" "jobs" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - base_filtering = filter('resource_type', 'Microsoft.Automation/automationAccounts') and filter('primary_aggregation_type', 'true') and filter('status', 'Failed') - totaljob = 
data('TotalJob', filter=base_filtering and ${module.filtering.signalflow})${var.jobs_aggregation_function}${var.jobs_transformation_function} - signal = totaljob.fill().count(by=['runbook']).publish('signal') + base_filtering = filter('resource_type', 'Microsoft.Automation/automationAccounts') and filter('primary_aggregation_type', 'true') + success = data('TotalJob', filter=base_filtering and ${module.filtering.signalflow})${var.jobs_aggregation_function}${var.jobs_transformation_function} + failed = data('TotalJob', filter=base_filtering and ${module.filtering.signalflow})${var.jobs_aggregation_function}${var.jobs_transformation_function} + signal = (failed - success).min(over='1h').publish('signal') detect(when(signal >= ${var.jobs_threshold_critical}, lasting=%{if var.jobs_lasting_duration_critical == null}None%{else}'${var.jobs_lasting_duration_critical}'%{endif}, at_least=${var.jobs_at_least_percentage_critical})).publish('CRIT') EOF diff --git a/modules/integration_azure-automation-account/variables-gen.tf b/modules/integration_azure-automation-account/variables-gen.tf index 4a96f8a1d..06f9821b7 100644 --- a/modules/integration_azure-automation-account/variables-gen.tf +++ b/modules/integration_azure-automation-account/variables-gen.tf @@ -9,13 +9,13 @@ variable "jobs_notifications" { variable "jobs_aggregation_function" { description = "Aggregation function and group by for jobs detector (i.e. \".mean(by=['host'])\")" type = string - default = "" + default = ".count(by=['runbook'])" } variable "jobs_transformation_function" { description = "Transformation function for jobs detector (i.e. 
\".mean(over='5m')\")" type = string - default = "" + default = ".count(over='1h')" } variable "jobs_max_delay" { @@ -51,7 +51,7 @@ variable "jobs_threshold_critical" { variable "jobs_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = "1h" + default = null } variable "jobs_at_least_percentage_critical" { From 547e84e6df3a5ae3cacb3753828b010ff4d77ef4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20Marmol?= Date: Mon, 16 Jan 2023 16:24:06 +0100 Subject: [PATCH 4/9] Update modules/integration_azure-automation-account/conf/readme.yaml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérôme Respaut --- modules/integration_azure-automation-account/conf/readme.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/integration_azure-automation-account/conf/readme.yaml b/modules/integration_azure-automation-account/conf/readme.yaml index 9015fc41a..f436c4be9 100644 --- a/modules/integration_azure-automation-account/conf/readme.yaml +++ b/modules/integration_azure-automation-account/conf/readme.yaml @@ -1,3 +1,4 @@ documentations: - + - name: Azure Monitor + url: https://learn.microsoft.com/en-us/azure/azure-monitor/essentials/metrics-supported#microsoftautomationautomationaccounts source_doc: From 76f35fa07751ab84680af15f7e019ea3fd865c4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20Marmol?= Date: Mon, 16 Jan 2023 16:24:15 +0100 Subject: [PATCH 5/9] Update modules/integration_azure-automation-account/conf/01-jobs.yaml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérôme Respaut --- modules/integration_azure-automation-account/conf/01-jobs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/integration_azure-automation-account/conf/01-jobs.yaml 
b/modules/integration_azure-automation-account/conf/01-jobs.yaml index 5b0db3da9..c9ae93c5d 100644 --- a/modules/integration_azure-automation-account/conf/01-jobs.yaml +++ b/modules/integration_azure-automation-account/conf/01-jobs.yaml @@ -1,6 +1,6 @@ ## Example module: Azure Automation Account -name: jobs +name: "failed jobs" transformation: ".count(over='1h')" aggregation: ".count(by=['runbook'])" From 1ed15b9df6fd49460f1660690455a7b849137098 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20MARMOL?= Date: Mon, 16 Jan 2023 16:36:24 +0100 Subject: [PATCH 6/9] Fix filter on signal --- docs/severity.md | 2 +- .../README.md | 3 +- .../conf/01-jobs.yaml | 4 +-- .../conf/readme.yaml | 4 +-- .../detectors-gen.tf | 22 ++++++------ .../outputs.tf | 6 ++-- .../variables-gen.tf | 34 +++++++++---------- 7 files changed, 38 insertions(+), 37 deletions(-) diff --git a/docs/severity.md b/docs/severity.md index f24280f01..21cda29f7 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -385,7 +385,7 @@ |Detector|Critical|Major|Minor|Warning|Info| |---|---|---|---|---|---| -|Azure Automation Account jobs|X|-|-|-|-| +|Azure Automation Account failed jobs|X|-|-|-|-| ## integration_azure-azure-search diff --git a/modules/integration_azure-automation-account/README.md b/modules/integration_azure-automation-account/README.md index 7ccec8fac..6de3f3e45 100644 --- a/modules/integration_azure-automation-account/README.md +++ b/modules/integration_azure-automation-account/README.md @@ -75,7 +75,7 @@ This module creates the following SignalFx detectors which could contain one or |Detector|Critical|Major|Minor|Warning|Info| |---|---|---|---|---|---| -|Azure Automation Account jobs|X|-|-|-|-| +|Azure Automation Account failed jobs|X|-|-|-|-| ## How to collect required metrics? @@ -103,3 +103,4 @@ Here is the list of required metrics for detectors in this module. 
* [Terraform SignalFx provider](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs) * [Terraform SignalFx detector](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector) * [Splunk Observability integrations](https://docs.splunk.com/Observability/gdi/get-data-in/integrations.html) +* [Azure Monitor](https://learn.microsoft.com/en-us/azure/azure-monitor/essentials/metrics-supported#microsoftautomationautomationaccounts) diff --git a/modules/integration_azure-automation-account/conf/01-jobs.yaml b/modules/integration_azure-automation-account/conf/01-jobs.yaml index c9ae93c5d..b4f6acdec 100644 --- a/modules/integration_azure-automation-account/conf/01-jobs.yaml +++ b/modules/integration_azure-automation-account/conf/01-jobs.yaml @@ -10,10 +10,10 @@ filtering: "filter('resource_type', 'Microsoft.Automation/automationAccounts') a signals: success: metric: "TotalJob" - filtering: "filter('status', 'Completed')" + filter: "filter('status', 'Completed')" failed: metric: "TotalJob" - filtering: "filter('status', 'Failed')" + filter: "filter('status', 'Failed')" signal: formula: (failed - success).min(over='1h') rules: diff --git a/modules/integration_azure-automation-account/conf/readme.yaml b/modules/integration_azure-automation-account/conf/readme.yaml index f436c4be9..edd36c5f1 100644 --- a/modules/integration_azure-automation-account/conf/readme.yaml +++ b/modules/integration_azure-automation-account/conf/readme.yaml @@ -1,4 +1,4 @@ documentations: - - name: Azure Monitor - url: https://learn.microsoft.com/en-us/azure/azure-monitor/essentials/metrics-supported#microsoftautomationautomationaccounts + - name: 'Azure Monitor' + url: 'https://learn.microsoft.com/en-us/azure/azure-monitor/essentials/metrics-supported#microsoftautomationautomationaccounts' source_doc: diff --git a/modules/integration_azure-automation-account/detectors-gen.tf b/modules/integration_azure-automation-account/detectors-gen.tf 
index 4a2fbff24..a8b6c687f 100644 --- a/modules/integration_azure-automation-account/detectors-gen.tf +++ b/modules/integration_azure-automation-account/detectors-gen.tf @@ -1,5 +1,5 @@ -resource "signalfx_detector" "jobs" { - name = format("%s %s", local.detector_name_prefix, "Azure Automation Account jobs") +resource "signalfx_detector" "failed_jobs" { + name = format("%s %s", local.detector_name_prefix, "Azure Automation Account failed jobs") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) @@ -7,24 +7,24 @@ resource "signalfx_detector" "jobs" { program_text = <<-EOF base_filtering = filter('resource_type', 'Microsoft.Automation/automationAccounts') and filter('primary_aggregation_type', 'true') - success = data('TotalJob', filter=base_filtering and ${module.filtering.signalflow})${var.jobs_aggregation_function}${var.jobs_transformation_function} - failed = data('TotalJob', filter=base_filtering and ${module.filtering.signalflow})${var.jobs_aggregation_function}${var.jobs_transformation_function} + success = data('TotalJob', filter=base_filtering and filter('status', 'Completed') and ${module.filtering.signalflow})${var.failed_jobs_aggregation_function}${var.failed_jobs_transformation_function} + failed = data('TotalJob', filter=base_filtering and filter('status', 'Failed') and ${module.filtering.signalflow})${var.failed_jobs_aggregation_function}${var.failed_jobs_transformation_function} signal = (failed - success).min(over='1h').publish('signal') - detect(when(signal >= ${var.jobs_threshold_critical}, lasting=%{if var.jobs_lasting_duration_critical == null}None%{else}'${var.jobs_lasting_duration_critical}'%{endif}, at_least=${var.jobs_at_least_percentage_critical})).publish('CRIT') + detect(when(signal >= ${var.failed_jobs_threshold_critical}, lasting=%{if var.failed_jobs_lasting_duration_critical == null}None%{else}'${var.failed_jobs_lasting_duration_critical}'%{endif}, 
at_least=${var.failed_jobs_at_least_percentage_critical})).publish('CRIT') EOF rule { - description = "is too high >= ${var.jobs_threshold_critical}" + description = "is too high >= ${var.failed_jobs_threshold_critical}" severity = "Critical" detect_label = "CRIT" - disabled = coalesce(var.jobs_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.jobs_notifications, "critical", []), var.notifications.critical), null) - runbook_url = try(coalesce(var.jobs_runbook_url, var.runbook_url), "") - tip = var.jobs_tip + disabled = coalesce(var.failed_jobs_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.failed_jobs_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.failed_jobs_runbook_url, var.runbook_url), "") + tip = var.failed_jobs_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? local.rule_body : var.message_body } - max_delay = var.jobs_max_delay + max_delay = var.failed_jobs_max_delay } diff --git a/modules/integration_azure-automation-account/outputs.tf b/modules/integration_azure-automation-account/outputs.tf index 64f143f92..41cfc2f18 100644 --- a/modules/integration_azure-automation-account/outputs.tf +++ b/modules/integration_azure-automation-account/outputs.tf @@ -1,5 +1,5 @@ -output "jobs" { - description = "Detector resource for jobs" - value = signalfx_detector.jobs +output "failed_jobs" { + description = "Detector resource for failed_jobs" + value = signalfx_detector.failed_jobs } diff --git a/modules/integration_azure-automation-account/variables-gen.tf b/modules/integration_azure-automation-account/variables-gen.tf index 06f9821b7..08f5d6e67 100644 --- a/modules/integration_azure-automation-account/variables-gen.tf +++ b/modules/integration_azure-automation-account/variables-gen.tf @@ -1,60 +1,60 @@ -# jobs detector +# failed_jobs detector -variable 
"jobs_notifications" { - description = "Notification recipients list per severity overridden for jobs detector" +variable "failed_jobs_notifications" { + description = "Notification recipients list per severity overridden for failed_jobs detector" type = map(list(string)) default = {} } -variable "jobs_aggregation_function" { - description = "Aggregation function and group by for jobs detector (i.e. \".mean(by=['host'])\")" +variable "failed_jobs_aggregation_function" { + description = "Aggregation function and group by for failed_jobs detector (i.e. \".mean(by=['host'])\")" type = string default = ".count(by=['runbook'])" } -variable "jobs_transformation_function" { - description = "Transformation function for jobs detector (i.e. \".mean(over='5m')\")" +variable "failed_jobs_transformation_function" { + description = "Transformation function for failed_jobs detector (i.e. \".mean(over='5m')\")" type = string default = ".count(over='1h')" } -variable "jobs_max_delay" { - description = "Enforce max delay for jobs detector (use \"0\" or \"null\" for \"Auto\")" +variable "failed_jobs_max_delay" { + description = "Enforce max delay for failed_jobs detector (use \"0\" or \"null\" for \"Auto\")" type = number default = null } -variable "jobs_tip" { +variable "failed_jobs_tip" { description = "Suggested first course of action or any note useful for incident handling" type = string default = "" } -variable "jobs_runbook_url" { +variable "failed_jobs_runbook_url" { description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" type = string default = "" } -variable "jobs_disabled" { - description = "Disable all alerting rules for jobs detector" +variable "failed_jobs_disabled" { + description = "Disable all alerting rules for failed_jobs detector" type = bool default = null } -variable "jobs_threshold_critical" { - description = "Critical threshold for jobs detector" +variable "failed_jobs_threshold_critical" { + description = 
"Critical threshold for failed_jobs detector" type = number default = 1 } -variable "jobs_lasting_duration_critical" { +variable "failed_jobs_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string default = null } -variable "jobs_at_least_percentage_critical" { +variable "failed_jobs_at_least_percentage_critical" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 From 39562c7964c67b538c2e012af12a397dadad11df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20MARMOL?= Date: Tue, 17 Jan 2023 11:55:35 +0100 Subject: [PATCH 7/9] Change aggregation and add scale factor to success metrics to avoid false positive on only one failure --- .../conf/01-jobs.yaml | 12 +++++------- .../detectors-gen.tf | 6 +++--- .../variables-gen.tf | 6 +++--- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/modules/integration_azure-automation-account/conf/01-jobs.yaml b/modules/integration_azure-automation-account/conf/01-jobs.yaml index b4f6acdec..457396e66 100644 --- a/modules/integration_azure-automation-account/conf/01-jobs.yaml +++ b/modules/integration_azure-automation-account/conf/01-jobs.yaml @@ -2,8 +2,8 @@ module: Azure Automation Account name: "failed jobs" -transformation: ".count(over='1h')" -aggregation: ".count(by=['runbook'])" +transformation: ".max(over='1h')" +aggregation: ".mean(by=['runbook'])" filtering: "filter('resource_type', 'Microsoft.Automation/automationAccounts') and filter('primary_aggregation_type', 'true')" @@ -15,10 +15,8 @@ signals: metric: "TotalJob" filter: "filter('status', 'Failed')" signal: - formula: (failed - success).min(over='1h') + formula: (failed - success.scale(10)).min(over='1h') rules: critical: - threshold: 1 - comparator: ">=" - - + threshold: 0 + comparator: ">" diff --git a/modules/integration_azure-automation-account/detectors-gen.tf 
b/modules/integration_azure-automation-account/detectors-gen.tf index a8b6c687f..7f87b8e78 100644 --- a/modules/integration_azure-automation-account/detectors-gen.tf +++ b/modules/integration_azure-automation-account/detectors-gen.tf @@ -9,12 +9,12 @@ resource "signalfx_detector" "failed_jobs" { base_filtering = filter('resource_type', 'Microsoft.Automation/automationAccounts') and filter('primary_aggregation_type', 'true') success = data('TotalJob', filter=base_filtering and filter('status', 'Completed') and ${module.filtering.signalflow})${var.failed_jobs_aggregation_function}${var.failed_jobs_transformation_function} failed = data('TotalJob', filter=base_filtering and filter('status', 'Failed') and ${module.filtering.signalflow})${var.failed_jobs_aggregation_function}${var.failed_jobs_transformation_function} - signal = (failed - success).min(over='1h').publish('signal') - detect(when(signal >= ${var.failed_jobs_threshold_critical}, lasting=%{if var.failed_jobs_lasting_duration_critical == null}None%{else}'${var.failed_jobs_lasting_duration_critical}'%{endif}, at_least=${var.failed_jobs_at_least_percentage_critical})).publish('CRIT') + signal = (failed - success.scale(10)).min(over='1h').publish('signal') + detect(when(signal > ${var.failed_jobs_threshold_critical}, lasting=%{if var.failed_jobs_lasting_duration_critical == null}None%{else}'${var.failed_jobs_lasting_duration_critical}'%{endif}, at_least=${var.failed_jobs_at_least_percentage_critical})).publish('CRIT') EOF rule { - description = "is too high >= ${var.failed_jobs_threshold_critical}" + description = "is too high > ${var.failed_jobs_threshold_critical}" severity = "Critical" detect_label = "CRIT" disabled = coalesce(var.failed_jobs_disabled, var.detectors_disabled) diff --git a/modules/integration_azure-automation-account/variables-gen.tf b/modules/integration_azure-automation-account/variables-gen.tf index 08f5d6e67..8bab6ea4d 100644 --- 
a/modules/integration_azure-automation-account/variables-gen.tf +++ b/modules/integration_azure-automation-account/variables-gen.tf @@ -9,13 +9,13 @@ variable "failed_jobs_notifications" { variable "failed_jobs_aggregation_function" { description = "Aggregation function and group by for failed_jobs detector (i.e. \".mean(by=['host'])\")" type = string - default = ".count(by=['runbook'])" + default = ".mean(by=['runbook'])" } variable "failed_jobs_transformation_function" { description = "Transformation function for failed_jobs detector (i.e. \".mean(over='5m')\")" type = string - default = ".count(over='1h')" + default = ".max(over='1h')" } variable "failed_jobs_max_delay" { @@ -45,7 +45,7 @@ variable "failed_jobs_disabled" { variable "failed_jobs_threshold_critical" { description = "Critical threshold for failed_jobs detector" type = number - default = 1 + default = 0 } variable "failed_jobs_lasting_duration_critical" { From 888dbd4d7d177e67c001532978b6b3fd3c24889e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20MARMOL?= Date: Tue, 24 Jan 2023 10:45:39 +0100 Subject: [PATCH 8/9] Add only a scale factor of 2 on success and fill blank metrics with 0 --- modules/integration_azure-automation-account/conf/01-jobs.yaml | 2 +- modules/integration_azure-automation-account/detectors-gen.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/integration_azure-automation-account/conf/01-jobs.yaml b/modules/integration_azure-automation-account/conf/01-jobs.yaml index 457396e66..e3b02bfa7 100644 --- a/modules/integration_azure-automation-account/conf/01-jobs.yaml +++ b/modules/integration_azure-automation-account/conf/01-jobs.yaml @@ -15,7 +15,7 @@ signals: metric: "TotalJob" filter: "filter('status', 'Failed')" signal: - formula: (failed - success.scale(10)).min(over='1h') + formula: (failed.fill(0) - success.fill(0).scale(2)) rules: critical: threshold: 0 diff --git a/modules/integration_azure-automation-account/detectors-gen.tf 
b/modules/integration_azure-automation-account/detectors-gen.tf index 7f87b8e78..6defc03c5 100644 --- a/modules/integration_azure-automation-account/detectors-gen.tf +++ b/modules/integration_azure-automation-account/detectors-gen.tf @@ -9,7 +9,7 @@ resource "signalfx_detector" "failed_jobs" { base_filtering = filter('resource_type', 'Microsoft.Automation/automationAccounts') and filter('primary_aggregation_type', 'true') success = data('TotalJob', filter=base_filtering and filter('status', 'Completed') and ${module.filtering.signalflow})${var.failed_jobs_aggregation_function}${var.failed_jobs_transformation_function} failed = data('TotalJob', filter=base_filtering and filter('status', 'Failed') and ${module.filtering.signalflow})${var.failed_jobs_aggregation_function}${var.failed_jobs_transformation_function} - signal = (failed - success.scale(10)).min(over='1h').publish('signal') + signal = (failed.fill(0) - success.fill(0).scale(2)).publish('signal') detect(when(signal > ${var.failed_jobs_threshold_critical}, lasting=%{if var.failed_jobs_lasting_duration_critical == null}None%{else}'${var.failed_jobs_lasting_duration_critical}'%{endif}, at_least=${var.failed_jobs_at_least_percentage_critical})).publish('CRIT') EOF From 1e38cfdf1d1764c0eaaeeea6cce05898ee3473be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20MARMOL?= Date: Thu, 2 Feb 2023 18:24:15 +0100 Subject: [PATCH 9/9] Revert scale factor --- modules/integration_azure-automation-account/conf/01-jobs.yaml | 2 +- modules/integration_azure-automation-account/detectors-gen.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/integration_azure-automation-account/conf/01-jobs.yaml b/modules/integration_azure-automation-account/conf/01-jobs.yaml index e3b02bfa7..c46ec4375 100644 --- a/modules/integration_azure-automation-account/conf/01-jobs.yaml +++ b/modules/integration_azure-automation-account/conf/01-jobs.yaml @@ -15,7 +15,7 @@ signals: metric: "TotalJob" filter: 
"filter('status', 'Failed')" signal: - formula: (failed.fill(0) - success.fill(0).scale(2)) + formula: (failed.fill(0) - success.fill(0)) rules: critical: threshold: 0 diff --git a/modules/integration_azure-automation-account/detectors-gen.tf b/modules/integration_azure-automation-account/detectors-gen.tf index 6defc03c5..230c78f69 100644 --- a/modules/integration_azure-automation-account/detectors-gen.tf +++ b/modules/integration_azure-automation-account/detectors-gen.tf @@ -9,7 +9,7 @@ resource "signalfx_detector" "failed_jobs" { base_filtering = filter('resource_type', 'Microsoft.Automation/automationAccounts') and filter('primary_aggregation_type', 'true') success = data('TotalJob', filter=base_filtering and filter('status', 'Completed') and ${module.filtering.signalflow})${var.failed_jobs_aggregation_function}${var.failed_jobs_transformation_function} failed = data('TotalJob', filter=base_filtering and filter('status', 'Failed') and ${module.filtering.signalflow})${var.failed_jobs_aggregation_function}${var.failed_jobs_transformation_function} - signal = (failed.fill(0) - success.fill(0).scale(2)).publish('signal') + signal = (failed.fill(0) - success.fill(0)).publish('signal') detect(when(signal > ${var.failed_jobs_threshold_critical}, lasting=%{if var.failed_jobs_lasting_duration_critical == null}None%{else}'${var.failed_jobs_lasting_duration_critical}'%{endif}, at_least=${var.failed_jobs_at_least_percentage_critical})).publish('CRIT') EOF