Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/severity.md
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,7 @@
|Azure API Management Service capacity|X|X|-|-|-|
|Azure API Management Service duration of gateway request|X|X|-|-|-|
|Azure API Management Service duration of backend request|X|X|-|-|-|
|Azure API Management Service cpu percentage of gateway|X|X|-|-|-|


## integration_azure-app-service-plan
Expand Down
2 changes: 2 additions & 0 deletions modules/integration_azure-api-management-service/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ This module creates the following SignalFx detectors which could contain one or
|Azure API Management Service capacity|X|X|-|-|-|
|Azure API Management Service duration of gateway request|X|X|-|-|-|
|Azure API Management Service duration of backend request|X|X|-|-|-|
|Azure API Management Service cpu percentage of gateway|X|X|-|-|-|

## How to collect required metrics?

Expand All @@ -98,6 +99,7 @@ Here is the list of required metrics for detectors in this module.

* `BackendDuration`
* `Capacity`
* `CpuPercent_Gateway`
* `Duration`
* `NetworkConnectivity`

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
---
# Definition of the "cpu percentage of gateway" detector for the
# Azure API Management Service module.
module: "Azure API Management Service"
name: "CPU Percentage of Gateway"

# Base SignalFlow filter and default aggregation applied to the signal.
filtering: "filter('resource_type', 'Microsoft.ApiManagement/service') and filter('primary_aggregation_type', 'true')"
aggregation: ".mean(by=['azure_resource_name', 'azure_resource_group_name', 'azure_region'])"

# Unit shown next to thresholds and chart values.
value_unit: "%"
# NOTE(review): presumably exposes an (empty by default) transformation
# function variable in the generated module - confirm against the generator.
transformation: true

signals:
  signal:
    metric: "CpuPercent_Gateway"

rules:
  critical:
    threshold: 95
    comparator: ">"
    lasting_duration: '5m'
  major:
    threshold: 90
    comparator: ">"
    lasting_duration: '5m'
    # NOTE(review): presumably keeps the major rule silent while the critical
    # condition holds - confirm against the generator.
    dependency: critical
...
Original file line number Diff line number Diff line change
Expand Up @@ -165,3 +165,49 @@ EOF
max_delay = var.duration_of_backend_request_max_delay
}

# Detector on the `CpuPercent_Gateway` metric of Azure API Management services.
# Publishes a CRIT alert above the critical threshold and a MAJOR alert above
# the major threshold, the latter only while the critical condition is not met.
resource "signalfx_detector" "cpu_percentage_of_gateway" {
  name      = format("%s %s", local.detector_name_prefix, "Azure API Management Service cpu percentage of gateway")
  max_delay = var.cpu_percentage_of_gateway_max_delay

  # `teams` falls back to the authorized writer teams when `var.teams` is empty;
  # `try` turns the "all lists empty" error of coalescelist into null.
  authorized_writer_teams = var.authorized_writer_teams
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))

  viz_options {
    label        = "signal"
    value_suffix = "%"
  }

  # SignalFlow program. The `lasting=`/`at_least=` arguments are only rendered
  # when the matching lasting duration variable is not null. The heredoc lines
  # are deliberately left unindented so the rendered text is unchanged.
  program_text = <<-EOF
base_filtering = filter('resource_type', 'Microsoft.ApiManagement/service') and filter('primary_aggregation_type', 'true')
signal = data('CpuPercent_Gateway', filter=base_filtering and ${module.filtering.signalflow})${var.cpu_percentage_of_gateway_aggregation_function}${var.cpu_percentage_of_gateway_transformation_function}.publish('signal')
detect(when(signal > ${var.cpu_percentage_of_gateway_threshold_critical}%{if var.cpu_percentage_of_gateway_lasting_duration_critical != null}, lasting='${var.cpu_percentage_of_gateway_lasting_duration_critical}', at_least=${var.cpu_percentage_of_gateway_at_least_percentage_critical}%{endif})).publish('CRIT')
detect(when(signal > ${var.cpu_percentage_of_gateway_threshold_major}%{if var.cpu_percentage_of_gateway_lasting_duration_major != null}, lasting='${var.cpu_percentage_of_gateway_lasting_duration_major}', at_least=${var.cpu_percentage_of_gateway_at_least_percentage_major}%{endif}) and (not when(signal > ${var.cpu_percentage_of_gateway_threshold_critical}%{if var.cpu_percentage_of_gateway_lasting_duration_critical != null}, lasting='${var.cpu_percentage_of_gateway_lasting_duration_critical}', at_least=${var.cpu_percentage_of_gateway_at_least_percentage_critical}%{endif}))).publish('MAJOR')
EOF

  # Critical rule, bound to the 'CRIT' publish label of the program above.
  rule {
    detect_label = "CRIT"
    severity     = "Critical"
    description  = "is too high > ${var.cpu_percentage_of_gateway_threshold_critical}%"

    # Most specific non-null switch wins: rule, then detector, then module.
    disabled = coalesce(var.cpu_percentage_of_gateway_disabled_critical, var.cpu_percentage_of_gateway_disabled, var.detectors_disabled)

    # Detector-specific recipients take precedence over the module-wide ones.
    notifications = try(coalescelist(lookup(var.cpu_percentage_of_gateway_notifications, "critical", []), var.notifications.critical), null)

    runbook_url = try(coalesce(var.cpu_percentage_of_gateway_runbook_url, var.runbook_url), "")
    tip         = var.cpu_percentage_of_gateway_tip

    # Custom subject/body are used only when explicitly provided.
    parameterized_subject = var.message_subject != "" ? var.message_subject : local.rule_subject
    parameterized_body    = var.message_body != "" ? var.message_body : local.rule_body
  }

  # Major rule, bound to the 'MAJOR' publish label of the program above.
  rule {
    detect_label = "MAJOR"
    severity     = "Major"
    description  = "is too high > ${var.cpu_percentage_of_gateway_threshold_major}%"

    # Most specific non-null switch wins: rule, then detector, then module.
    disabled = coalesce(var.cpu_percentage_of_gateway_disabled_major, var.cpu_percentage_of_gateway_disabled, var.detectors_disabled)

    # Detector-specific recipients take precedence over the module-wide ones.
    notifications = try(coalescelist(lookup(var.cpu_percentage_of_gateway_notifications, "major", []), var.notifications.major), null)

    runbook_url = try(coalesce(var.cpu_percentage_of_gateway_runbook_url, var.runbook_url), "")
    tip         = var.cpu_percentage_of_gateway_tip

    # Custom subject/body are used only when explicitly provided.
    parameterized_subject = var.message_subject != "" ? var.message_subject : local.rule_subject
    parameterized_body    = var.message_body != "" ? var.message_body : local.rule_body
  }
}

5 changes: 5 additions & 0 deletions modules/integration_azure-api-management-service/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@ output "capacity" {
value = signalfx_detector.capacity
}

# Exposes the complete cpu_percentage_of_gateway detector resource to module callers.
output "cpu_percentage_of_gateway" {
  description = "Detector resource for cpu_percentage_of_gateway"
  value       = signalfx_detector.cpu_percentage_of_gateway
}

output "duration_of_backend_request" {
description = "Detector resource for duration_of_backend_request"
value = signalfx_detector.duration_of_backend_request
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -318,3 +318,93 @@ variable "duration_of_backend_request_at_least_percentage_major" {
type = number
default = 1
}
# cpu_percentage_of_gateway detector

# Per-severity recipients for this detector. The detector looks up the
# "critical" and "major" keys; a missing or empty entry falls back to the
# module-wide `var.notifications` for that severity.
variable "cpu_percentage_of_gateway_notifications" {
  description = "Notification recipients list per severity overridden for cpu_percentage_of_gateway detector"
  type        = map(list(string))
  default     = {}
}

# SignalFlow snippet appended verbatim right after the `data(...)` call of the
# detector program, so it must start with a dot.
variable "cpu_percentage_of_gateway_aggregation_function" {
  description = "Aggregation function and group by for cpu_percentage_of_gateway detector (i.e. \".mean(by=['host'])\")"
  type        = string
  default     = ".mean(by=['azure_resource_name', 'azure_resource_group_name', 'azure_region'])"
}

# SignalFlow snippet appended verbatim after the aggregation function in the
# detector program; the empty default applies no transformation.
variable "cpu_percentage_of_gateway_transformation_function" {
  description = "Transformation function for cpu_percentage_of_gateway detector (i.e. \".mean(over='5m')\")"
  type        = string
  default     = ""
}

# Passed as-is to the `max_delay` argument of the detector resource.
variable "cpu_percentage_of_gateway_max_delay" {
  description = "Enforce max delay for cpu_percentage_of_gateway detector (use \"0\" or \"null\" for \"Auto\")"
  type        = number
  default     = null
}

# Used as the `tip` of both the critical and the major rule of the detector.
variable "cpu_percentage_of_gateway_tip" {
  description = "Suggested first course of action or any note useful for incident handling"
  type        = string
  default     = ""
}

# Runbook link for both rules. When left empty, the detector falls back to the
# module-wide `var.runbook_url`, and finally to an empty string.
variable "cpu_percentage_of_gateway_runbook_url" {
  description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause"
  type        = string
  default     = ""
}

# Detector-level switch. A null value defers to `var.detectors_disabled`;
# a non-null per-severity switch takes precedence over this one.
variable "cpu_percentage_of_gateway_disabled" {
  description = "Disable all alerting rules for cpu_percentage_of_gateway detector"
  type        = bool
  default     = null
}

# Switch for the critical rule only. A null value defers to the detector-level
# switch, then to `var.detectors_disabled`.
variable "cpu_percentage_of_gateway_disabled_critical" {
  description = "Disable critical alerting rule for cpu_percentage_of_gateway detector"
  type        = bool
  default     = null
}

# Switch for the major rule only. A null value defers to the detector-level
# switch, then to `var.detectors_disabled`.
variable "cpu_percentage_of_gateway_disabled_major" {
  description = "Disable major alerting rule for cpu_percentage_of_gateway detector"
  type        = bool
  default     = null
}

# The critical alert fires when the signal is strictly greater than this value.
# The same value also bounds the major alert, which is suppressed while the
# critical condition is met.
variable "cpu_percentage_of_gateway_threshold_critical" {
  description = "Critical threshold for cpu_percentage_of_gateway detector in %"
  type        = number
  default     = 95
}

# Rendered as the `lasting=` argument of the critical condition. Setting it to
# null drops both `lasting=` and `at_least=` from that condition.
variable "cpu_percentage_of_gateway_lasting_duration_critical" {
  description = "Minimum duration that conditions must be true before raising alert"
  type        = string
  default     = "5m"
}

# Rendered as the `at_least=` argument of the critical condition; only emitted
# when the critical lasting duration is not null.
variable "cpu_percentage_of_gateway_at_least_percentage_critical" {
  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
  type        = number
  default     = 1
}
# The major alert fires when the signal is strictly greater than this value
# while the critical condition is not met.
variable "cpu_percentage_of_gateway_threshold_major" {
  description = "Major threshold for cpu_percentage_of_gateway detector in %"
  type        = number
  default     = 90
}

# Rendered as the `lasting=` argument of the major condition. Setting it to
# null drops both `lasting=` and `at_least=` from that condition.
variable "cpu_percentage_of_gateway_lasting_duration_major" {
  description = "Minimum duration that conditions must be true before raising alert"
  type        = string
  default     = "5m"
}

# Rendered as the `at_least=` argument of the major condition; only emitted
# when the major lasting duration is not null.
variable "cpu_percentage_of_gateway_at_least_percentage_major" {
  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
  type        = number
  default     = 1
}
Loading