diff --git a/docs/severity.md b/docs/severity.md index 15dea78a5..6cefcc654 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -429,6 +429,7 @@ |Azure API Management Service capacity|X|X|-|-|-| |Azure API Management Service duration of gateway request|X|X|-|-|-| |Azure API Management Service duration of backend request|X|X|-|-|-| +|Azure API Management Service cpu percentage of gateway|X|X|-|-|-| ## integration_azure-app-service-plan diff --git a/modules/integration_azure-api-management-service/README.md b/modules/integration_azure-api-management-service/README.md index 46ebc12c6..cccf05afd 100644 --- a/modules/integration_azure-api-management-service/README.md +++ b/modules/integration_azure-api-management-service/README.md @@ -79,6 +79,7 @@ This module creates the following SignalFx detectors which could contain one or |Azure API Management Service capacity|X|X|-|-|-| |Azure API Management Service duration of gateway request|X|X|-|-|-| |Azure API Management Service duration of backend request|X|X|-|-|-| +|Azure API Management Service cpu percentage of gateway|X|X|-|-|-| ## How to collect required metrics? @@ -98,6 +99,7 @@ Here is the list of required metrics for detectors in this module. * `BackendDuration` * `Capacity` +* `CpuPercent_Gateway` * `Duration` * `NetworkConnectivity` diff --git a/modules/integration_azure-api-management-service/conf/04-cpu-percent-gateway.yaml b/modules/integration_azure-api-management-service/conf/04-cpu-percent-gateway.yaml new file mode 100644 index 000000000..d652bde4c --- /dev/null +++ b/modules/integration_azure-api-management-service/conf/04-cpu-percent-gateway.yaml @@ -0,0 +1,21 @@ +--- +module: "Azure API Management Service" +name: "CPU Percentage of Gateway" +filtering: "filter('resource_type', 'Microsoft.ApiManagement/service') and filter('primary_aggregation_type', 'true')" +aggregation: ".mean(by=['azure_resource_name', 'azure_resource_group_name', 'azure_region'])" +value_unit: "%" +transformation: true +signals: + signal: + metric: "CpuPercent_Gateway" +rules: + critical: + threshold: 95 + comparator: ">" + lasting_duration: '5m' + major: + threshold: 90 + comparator: ">" + lasting_duration: '5m' + dependency: critical +... diff --git a/modules/integration_azure-api-management-service/detectors-gen.tf b/modules/integration_azure-api-management-service/detectors-gen.tf index 0680a2619..b54a7cf70 100644 --- a/modules/integration_azure-api-management-service/detectors-gen.tf +++ b/modules/integration_azure-api-management-service/detectors-gen.tf @@ -165,3 +165,49 @@ EOF max_delay = var.duration_of_backend_request_max_delay } +resource "signalfx_detector" "cpu_percentage_of_gateway" { + name = format("%s %s", local.detector_name_prefix, "Azure API Management Service cpu percentage of gateway") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + viz_options { + label = "signal" + value_suffix = "%" + } + + program_text = <<-EOF + base_filtering = filter('resource_type', 'Microsoft.ApiManagement/service') and filter('primary_aggregation_type', 'true') + signal = data('CpuPercent_Gateway', filter=base_filtering and ${module.filtering.signalflow})${var.cpu_percentage_of_gateway_aggregation_function}${var.cpu_percentage_of_gateway_transformation_function}.publish('signal') + detect(when(signal > ${var.cpu_percentage_of_gateway_threshold_critical}%{if var.cpu_percentage_of_gateway_lasting_duration_critical != null}, lasting='${var.cpu_percentage_of_gateway_lasting_duration_critical}', at_least=${var.cpu_percentage_of_gateway_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.cpu_percentage_of_gateway_threshold_major}%{if var.cpu_percentage_of_gateway_lasting_duration_major != null}, lasting='${var.cpu_percentage_of_gateway_lasting_duration_major}', at_least=${var.cpu_percentage_of_gateway_at_least_percentage_major}%{endif}) and (not when(signal > ${var.cpu_percentage_of_gateway_threshold_critical}%{if var.cpu_percentage_of_gateway_lasting_duration_critical != null}, lasting='${var.cpu_percentage_of_gateway_lasting_duration_critical}', at_least=${var.cpu_percentage_of_gateway_at_least_percentage_critical}%{endif}))).publish('MAJOR') +EOF + + rule { + description = "is too high > ${var.cpu_percentage_of_gateway_threshold_critical}%" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.cpu_percentage_of_gateway_disabled_critical, var.cpu_percentage_of_gateway_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.cpu_percentage_of_gateway_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.cpu_percentage_of_gateway_runbook_url, var.runbook_url), "") + tip = var.cpu_percentage_of_gateway_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.cpu_percentage_of_gateway_threshold_major}%" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.cpu_percentage_of_gateway_disabled_major, var.cpu_percentage_of_gateway_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.cpu_percentage_of_gateway_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.cpu_percentage_of_gateway_runbook_url, var.runbook_url), "") + tip = var.cpu_percentage_of_gateway_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + max_delay = var.cpu_percentage_of_gateway_max_delay +} + diff --git a/modules/integration_azure-api-management-service/outputs.tf b/modules/integration_azure-api-management-service/outputs.tf index 8aba0cad6..84785d281 100644 --- a/modules/integration_azure-api-management-service/outputs.tf +++ b/modules/integration_azure-api-management-service/outputs.tf @@ -3,6 +3,11 @@ output "capacity" { value = signalfx_detector.capacity } +output "cpu_percentage_of_gateway" { + description = "Detector resource for cpu_percentage_of_gateway" + value = signalfx_detector.cpu_percentage_of_gateway +} + output "duration_of_backend_request" { description = "Detector resource for duration_of_backend_request" value = signalfx_detector.duration_of_backend_request diff --git a/modules/integration_azure-api-management-service/variables-gen.tf b/modules/integration_azure-api-management-service/variables-gen.tf index 4206fceaa..cbabb589d 100644 --- a/modules/integration_azure-api-management-service/variables-gen.tf +++ b/modules/integration_azure-api-management-service/variables-gen.tf @@ -318,3 +318,93 @@ variable "duration_of_backend_request_at_least_percentage_major" { type = number default = 1 } +# cpu_percentage_of_gateway detector + +variable "cpu_percentage_of_gateway_notifications" { + description = "Notification recipients list per severity overridden for cpu_percentage_of_gateway detector" + type = map(list(string)) + default = {} +} + +variable "cpu_percentage_of_gateway_aggregation_function" { + description = "Aggregation function and group by for cpu_percentage_of_gateway detector (i.e. \".mean(by=['host'])\")" + type = string + default = ".mean(by=['azure_resource_name', 'azure_resource_group_name', 'azure_region'])" +} + +variable "cpu_percentage_of_gateway_transformation_function" { + description = "Transformation function for cpu_percentage_of_gateway detector (i.e. \".mean(over='5m')\")" + type = string + default = "" +} + +variable "cpu_percentage_of_gateway_max_delay" { + description = "Enforce max delay for cpu_percentage_of_gateway detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "cpu_percentage_of_gateway_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "cpu_percentage_of_gateway_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "cpu_percentage_of_gateway_disabled" { + description = "Disable all alerting rules for cpu_percentage_of_gateway detector" + type = bool + default = null +} + +variable "cpu_percentage_of_gateway_disabled_critical" { + description = "Disable critical alerting rule for cpu_percentage_of_gateway detector" + type = bool + default = null +} + +variable "cpu_percentage_of_gateway_disabled_major" { + description = "Disable major alerting rule for cpu_percentage_of_gateway detector" + type = bool + default = null +} + +variable "cpu_percentage_of_gateway_threshold_critical" { + description = "Critical threshold for cpu_percentage_of_gateway detector in %" + type = number + default = 95 +} + +variable "cpu_percentage_of_gateway_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "5m" +} + +variable "cpu_percentage_of_gateway_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "cpu_percentage_of_gateway_threshold_major" { + description = "Major threshold for cpu_percentage_of_gateway detector in %" + type = number + default = 90 +} + +variable "cpu_percentage_of_gateway_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "5m" +} + +variable "cpu_percentage_of_gateway_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +}