Skip to content
6 changes: 4 additions & 2 deletions docs/severity.md
Original file line number Diff line number Diff line change
Expand Up @@ -1311,9 +1311,11 @@

|Detector|Critical|Major|Minor|Warning|Info|
|---|---|---|---|---|---|
|Zookeeper server-health|-|X|-|-|-|
|Zookeeper cluster-health|X|-|-|-|-|
|Zookeeper server-latency|-|X|-|-|-|
|Zookeeper cluster-latency|X|-|-|-|-|
|Zookeeper heartbeat|X|-|-|-|-|
|Zookeeper service health|X|-|-|-|-|
|Zookeeper latency|X|X|-|-|-|
|Zookeeper file descriptors usage|X|X|-|-|-|


8 changes: 5 additions & 3 deletions modules/smart-agent_zookeeper/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ Note the following parameters:

These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all
[modules](../) in this repository. Other variables, specific to this module, are available in
[variables.tf](variables.tf).
[variables.tf](variables.tf) and [variables-gen.tf](variables-gen.tf).
In general, the default configuration "works" but all of these Terraform
[variables](https://www.terraform.io/language/values/variables) make it possible to
customize the detectors' behavior to better fit your needs.
Expand All @@ -77,9 +77,11 @@ This module creates the following SignalFx detectors which could contain one or

|Detector|Critical|Major|Minor|Warning|Info|
|---|---|---|---|---|---|
|Zookeeper server-health|-|X|-|-|-|
|Zookeeper cluster-health|X|-|-|-|-|
|Zookeeper server-latency|-|X|-|-|-|
|Zookeeper cluster-latency|X|-|-|-|-|
|Zookeeper heartbeat|X|-|-|-|-|
|Zookeeper service health|X|-|-|-|-|
|Zookeeper latency|X|X|-|-|-|
|Zookeeper file descriptors usage|X|X|-|-|-|

## How to collect required metrics?
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Detector definition "server-health" for the zookeeper module.
# NOTE(review): appears to be the generator input behind the
# signalfx_detector "server-health" resource in detectors-gen.tf - confirm.
# One "major" rule: gauge.zk_service_health != 1, lasting 5m.
module: zookeeper
name: server-health
# No aggregation is requested for this signal.
aggregation: false
signals:
signal:
metric: "gauge.zk_service_health"
rules:
major:
threshold: 1
comparator: "!="
description: "Zookeeper server is not running"
lasting_duration: "5m"
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Detector definition "cluster-health" for the zookeeper module.
# NOTE(review): appears to be the generator input behind the
# signalfx_detector "cluster-health" resource in detectors-gen.tf - confirm.
# One "critical" rule: the mean of gauge.zk_service_health, grouped by
# kubernetes_cluster, == 0, lasting 5m.
module: zookeeper
name: cluster-health
# The signal is averaged per kubernetes_cluster dimension value.
aggregation: ".mean(by=['kubernetes_cluster'])"
signals:
signal:
metric: "gauge.zk_service_health"
rules:
critical:
threshold: 0
comparator: "=="
description: "Zookeeper cluster is not running"
lasting_duration: "5m"
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Detector definition "server-latency" for the zookeeper module.
# NOTE(review): appears to be the generator input behind the
# signalfx_detector "server-latency" resource in detectors-gen.tf - confirm.
# One "major" rule: gauge.zk_avg_latency > 250000, lasting 5m.
# NOTE(review): the unit of the threshold is not stated here - confirm against the metric's documentation.
module: zookeeper
name: server-latency
# No aggregation is requested for this signal.
aggregation: false
signals:
signal:
metric: "gauge.zk_avg_latency"
rules:
major:
threshold: 250000
comparator: ">"
description: "Zookeeper server latency is too high"
lasting_duration: "5m"
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Detector definition "cluster-latency" for the zookeeper module.
# NOTE(review): appears to be the generator input behind the
# signalfx_detector "cluster-latency" resource in detectors-gen.tf - confirm.
# One "critical" rule: the mean of gauge.zk_avg_latency, grouped by
# kubernetes_cluster, > 300000, lasting 5m.
# NOTE(review): the unit of the threshold is not stated here - confirm against the metric's documentation.
module: zookeeper
name: cluster-latency
# The signal is averaged per kubernetes_cluster dimension value.
aggregation: ".mean(by=['kubernetes_cluster'])"
signals:
signal:
metric: "gauge.zk_avg_latency"
rules:
critical:
threshold: 300000
comparator: ">"
description: "Zookeeper cluster latency is too high"
lasting_duration: "5m"
108 changes: 108 additions & 0 deletions modules/smart-agent_zookeeper/detectors-gen.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Detector "Zookeeper server-health": publishes a MAJOR alert when the
# gauge.zk_service_health signal differs from var.server-health_threshold_major.
# Only the transformation function is appended to the signal; no aggregation
# function is applied.
resource "signalfx_detector" "server-health" {
name = format("%s %s", local.detector_name_prefix, "Zookeeper server-health")

authorized_writer_teams = var.authorized_writer_teams
# Prefer var.teams when it is a non-empty list, otherwise var.authorized_writer_teams;
# null when neither yields a non-empty list.
teams = try(coalescelist(var.teams, var.authorized_writer_teams), null)
tags = compact(concat(local.common_tags, local.tags, var.extra_tags))

# SignalFlow program. `lasting` is rendered as None when the lasting-duration
# variable is null, otherwise as the quoted duration string.
program_text = <<-EOF
signal = data('gauge.zk_service_health', filter=${module.filtering.signalflow})${var.server-health_transformation_function}.publish('signal')
detect(when(signal != ${var.server-health_threshold_major}, lasting=%{if var.server-health_lasting_duration_major == null}None%{else}'${var.server-health_lasting_duration_major}'%{endif}, at_least=${var.server-health_at_least_percentage_major})).publish('MAJOR')
EOF

rule {
description = "Zookeeper server is not running != ${var.server-health_threshold_major}"
severity = "Major"
# Must match the label passed to .publish() by the detect() call in program_text.
detect_label = "MAJOR"
# The detector-specific switch takes precedence over the module-wide var.detectors_disabled.
disabled = coalesce(var.server-health_disabled, var.detectors_disabled)
# Detector-specific "major" recipients win; otherwise var.notifications.major; null if neither is usable.
notifications = try(coalescelist(lookup(var.server-health_notifications, "major", []), var.notifications.major), null)
# Detector-specific runbook URL, then the module-wide one, else an empty string.
runbook_url = try(coalesce(var.server-health_runbook_url, var.runbook_url), "")
tip = var.server-health_tip
# The module's default subject/body are used when the corresponding variable is an empty string.
parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
parameterized_body = var.message_body == "" ? local.rule_body : var.message_body
}

max_delay = var.server-health_max_delay
}

# Detector "Zookeeper cluster-health": publishes a CRIT alert when the
# gauge.zk_service_health signal, after the configured aggregation and
# transformation functions, equals var.cluster-health_threshold_critical.
resource "signalfx_detector" "cluster-health" {
name = format("%s %s", local.detector_name_prefix, "Zookeeper cluster-health")

authorized_writer_teams = var.authorized_writer_teams
# Prefer var.teams when it is a non-empty list, otherwise var.authorized_writer_teams;
# null when neither yields a non-empty list.
teams = try(coalescelist(var.teams, var.authorized_writer_teams), null)
tags = compact(concat(local.common_tags, local.tags, var.extra_tags))

# SignalFlow program. The aggregation function is appended before the
# transformation function. `lasting` is rendered as None when the
# lasting-duration variable is null, otherwise as the quoted duration string.
program_text = <<-EOF
signal = data('gauge.zk_service_health', filter=${module.filtering.signalflow})${var.cluster-health_aggregation_function}${var.cluster-health_transformation_function}.publish('signal')
detect(when(signal == ${var.cluster-health_threshold_critical}, lasting=%{if var.cluster-health_lasting_duration_critical == null}None%{else}'${var.cluster-health_lasting_duration_critical}'%{endif}, at_least=${var.cluster-health_at_least_percentage_critical})).publish('CRIT')
EOF

rule {
description = "Zookeeper cluster is not running == ${var.cluster-health_threshold_critical}"
severity = "Critical"
# Must match the label passed to .publish() by the detect() call in program_text.
detect_label = "CRIT"
# The detector-specific switch takes precedence over the module-wide var.detectors_disabled.
disabled = coalesce(var.cluster-health_disabled, var.detectors_disabled)
# Detector-specific "critical" recipients win; otherwise var.notifications.critical; null if neither is usable.
notifications = try(coalescelist(lookup(var.cluster-health_notifications, "critical", []), var.notifications.critical), null)
# Detector-specific runbook URL, then the module-wide one, else an empty string.
runbook_url = try(coalesce(var.cluster-health_runbook_url, var.runbook_url), "")
tip = var.cluster-health_tip
# The module's default subject/body are used when the corresponding variable is an empty string.
parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
parameterized_body = var.message_body == "" ? local.rule_body : var.message_body
}

max_delay = var.cluster-health_max_delay
}

# Detector "Zookeeper server-latency": publishes a MAJOR alert when the
# gauge.zk_avg_latency signal exceeds var.server-latency_threshold_major.
# Only the transformation function is appended to the signal; no aggregation
# function is applied.
resource "signalfx_detector" "server-latency" {
name = format("%s %s", local.detector_name_prefix, "Zookeeper server-latency")

authorized_writer_teams = var.authorized_writer_teams
# Prefer var.teams when it is a non-empty list, otherwise var.authorized_writer_teams;
# null when neither yields a non-empty list.
teams = try(coalescelist(var.teams, var.authorized_writer_teams), null)
tags = compact(concat(local.common_tags, local.tags, var.extra_tags))

# SignalFlow program. `lasting` is rendered as None when the lasting-duration
# variable is null, otherwise as the quoted duration string.
program_text = <<-EOF
signal = data('gauge.zk_avg_latency', filter=${module.filtering.signalflow})${var.server-latency_transformation_function}.publish('signal')
detect(when(signal > ${var.server-latency_threshold_major}, lasting=%{if var.server-latency_lasting_duration_major == null}None%{else}'${var.server-latency_lasting_duration_major}'%{endif}, at_least=${var.server-latency_at_least_percentage_major})).publish('MAJOR')
EOF

rule {
description = "Zookeeper server latency is too high > ${var.server-latency_threshold_major}"
severity = "Major"
# Must match the label passed to .publish() by the detect() call in program_text.
detect_label = "MAJOR"
# The detector-specific switch takes precedence over the module-wide var.detectors_disabled.
disabled = coalesce(var.server-latency_disabled, var.detectors_disabled)
# Detector-specific "major" recipients win; otherwise var.notifications.major; null if neither is usable.
notifications = try(coalescelist(lookup(var.server-latency_notifications, "major", []), var.notifications.major), null)
# Detector-specific runbook URL, then the module-wide one, else an empty string.
runbook_url = try(coalesce(var.server-latency_runbook_url, var.runbook_url), "")
tip = var.server-latency_tip
# The module's default subject/body are used when the corresponding variable is an empty string.
parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
parameterized_body = var.message_body == "" ? local.rule_body : var.message_body
}

max_delay = var.server-latency_max_delay
}

# Detector "Zookeeper cluster-latency": publishes a CRIT alert when the
# gauge.zk_avg_latency signal, after the configured aggregation and
# transformation functions, exceeds var.cluster-latency_threshold_critical.
resource "signalfx_detector" "cluster-latency" {
name = format("%s %s", local.detector_name_prefix, "Zookeeper cluster-latency")

authorized_writer_teams = var.authorized_writer_teams
# Prefer var.teams when it is a non-empty list, otherwise var.authorized_writer_teams;
# null when neither yields a non-empty list.
teams = try(coalescelist(var.teams, var.authorized_writer_teams), null)
tags = compact(concat(local.common_tags, local.tags, var.extra_tags))

# SignalFlow program. The aggregation function is appended before the
# transformation function. `lasting` is rendered as None when the
# lasting-duration variable is null, otherwise as the quoted duration string.
program_text = <<-EOF
signal = data('gauge.zk_avg_latency', filter=${module.filtering.signalflow})${var.cluster-latency_aggregation_function}${var.cluster-latency_transformation_function}.publish('signal')
detect(when(signal > ${var.cluster-latency_threshold_critical}, lasting=%{if var.cluster-latency_lasting_duration_critical == null}None%{else}'${var.cluster-latency_lasting_duration_critical}'%{endif}, at_least=${var.cluster-latency_at_least_percentage_critical})).publish('CRIT')
EOF

rule {
description = "Zookeeper cluster latency is too high > ${var.cluster-latency_threshold_critical}"
severity = "Critical"
# Must match the label passed to .publish() by the detect() call in program_text.
detect_label = "CRIT"
# The detector-specific switch takes precedence over the module-wide var.detectors_disabled.
disabled = coalesce(var.cluster-latency_disabled, var.detectors_disabled)
# Detector-specific "critical" recipients win; otherwise var.notifications.critical; null if neither is usable.
notifications = try(coalescelist(lookup(var.cluster-latency_notifications, "critical", []), var.notifications.critical), null)
# Detector-specific runbook URL, then the module-wide one, else an empty string.
runbook_url = try(coalesce(var.cluster-latency_runbook_url, var.runbook_url), "")
tip = var.cluster-latency_tip
# The module's default subject/body are used when the corresponding variable is an empty string.
parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
parameterized_body = var.message_body == "" ? local.rule_body : var.message_body
}

max_delay = var.cluster-latency_max_delay
}

67 changes: 0 additions & 67 deletions modules/smart-agent_zookeeper/detectors-zookeeper.tf
Original file line number Diff line number Diff line change
Expand Up @@ -26,73 +26,6 @@ EOF
max_delay = var.heartbeat_max_delay
}

# Detector "Zookeeper service health": publishes a CRIT alert when the
# gauge.zk_service_health signal is not equal to 1. The comparison value is
# hard-coded and the detect() call has no `lasting` argument.
resource "signalfx_detector" "zookeeper_health" {
name = format("%s %s", local.detector_name_prefix, "Zookeeper service health")

authorized_writer_teams = var.authorized_writer_teams
# Prefer var.teams when it is a non-empty list, otherwise var.authorized_writer_teams;
# null when neither yields a non-empty list.
teams = try(coalescelist(var.teams, var.authorized_writer_teams), null)
tags = compact(concat(local.common_tags, local.tags, var.extra_tags))

# SignalFlow program. The data is restricted to plugin == 'zookeeper' in
# addition to the module-wide filter.
program_text = <<-EOF
signal = data('gauge.zk_service_health', filter=filter('plugin', 'zookeeper') and ${module.filtering.signalflow})${var.zookeeper_health_aggregation_function}${var.zookeeper_health_transformation_function}.publish('signal')
detect(when(signal != 1)).publish('CRIT')
EOF

rule {
description = "is not running"
severity = "Critical"
# Must match the label passed to .publish() by the detect() call in program_text.
detect_label = "CRIT"
# Precedence: rule-level switch, then detector-level switch, then the module-wide var.detectors_disabled.
disabled = coalesce(var.zookeeper_health_disabled_critical, var.zookeeper_health_disabled, var.detectors_disabled)
# Detector-specific "critical" recipients win; otherwise var.notifications.critical; null if neither is usable.
notifications = try(coalescelist(lookup(var.zookeeper_health_notifications, "critical", []), var.notifications.critical), null)
# Detector-specific runbook URL, then the module-wide one, else an empty string.
runbook_url = try(coalesce(var.zookeeper_health_runbook_url, var.runbook_url), "")
tip = var.zookeeper_health_tip
# The module's default subject/body are used when the corresponding variable is an empty string.
parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
parameterized_body = var.message_body == "" ? local.rule_body : var.message_body
}

max_delay = var.zookeeper_health_max_delay
}

# Detector "Zookeeper latency": two-level alert on gauge.zk_avg_latency.
# CRIT fires above the critical threshold; MAJOR fires above the major
# threshold only while the critical condition does not hold.
resource "signalfx_detector" "zookeeper_latency" {
name = format("%s %s", local.detector_name_prefix, "Zookeeper latency")

authorized_writer_teams = var.authorized_writer_teams
# Prefer var.teams when it is a non-empty list, otherwise var.authorized_writer_teams;
# null when neither yields a non-empty list.
teams = try(coalescelist(var.teams, var.authorized_writer_teams), null)
tags = compact(concat(local.common_tags, local.tags, var.extra_tags))

# SignalFlow program. The data is restricted to plugin == 'zookeeper' in
# addition to the module-wide filter. The MAJOR detect() explicitly excludes
# the CRIT condition.
program_text = <<-EOF
signal = data('gauge.zk_avg_latency', filter=filter('plugin', 'zookeeper') and ${module.filtering.signalflow})${var.zookeeper_latency_aggregation_function}${var.zookeeper_latency_transformation_function}.publish('signal')
detect(when(signal > ${var.zookeeper_latency_threshold_critical})).publish('CRIT')
detect(when(signal > ${var.zookeeper_latency_threshold_major}) and (not when(signal > ${var.zookeeper_latency_threshold_critical}))).publish('MAJOR')
EOF

# Critical rule, bound to the 'CRIT' label published in program_text.
rule {
description = "is too high > ${var.zookeeper_latency_threshold_critical}"
severity = "Critical"
detect_label = "CRIT"
# Precedence: rule-level switch, then detector-level switch, then the module-wide var.detectors_disabled.
disabled = coalesce(var.zookeeper_latency_disabled_critical, var.zookeeper_latency_disabled, var.detectors_disabled)
# Detector-specific "critical" recipients win; otherwise var.notifications.critical; null if neither is usable.
notifications = try(coalescelist(lookup(var.zookeeper_latency_notifications, "critical", []), var.notifications.critical), null)
# Detector-specific runbook URL, then the module-wide one, else an empty string.
runbook_url = try(coalesce(var.zookeeper_latency_runbook_url, var.runbook_url), "")
tip = var.zookeeper_latency_tip
# The module's default subject/body are used when the corresponding variable is an empty string.
parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
parameterized_body = var.message_body == "" ? local.rule_body : var.message_body
}

# Major rule, bound to the 'MAJOR' label published in program_text.
rule {
description = "is too high > ${var.zookeeper_latency_threshold_major}"
severity = "Major"
detect_label = "MAJOR"
# Precedence: rule-level switch, then detector-level switch, then the module-wide var.detectors_disabled.
disabled = coalesce(var.zookeeper_latency_disabled_major, var.zookeeper_latency_disabled, var.detectors_disabled)
# Detector-specific "major" recipients win; otherwise var.notifications.major; null if neither is usable.
notifications = try(coalescelist(lookup(var.zookeeper_latency_notifications, "major", []), var.notifications.major), null)
# Detector-specific runbook URL, then the module-wide one, else an empty string.
runbook_url = try(coalesce(var.zookeeper_latency_runbook_url, var.runbook_url), "")
tip = var.zookeeper_latency_tip
# The module's default subject/body are used when the corresponding variable is an empty string.
parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
parameterized_body = var.message_body == "" ? local.rule_body : var.message_body
}

max_delay = var.zookeeper_latency_max_delay
}

resource "signalfx_detector" "file_descriptors" {
name = format("%s %s", local.detector_name_prefix, "Zookeeper file descriptors usage")

Expand Down
22 changes: 16 additions & 6 deletions modules/smart-agent_zookeeper/outputs.tf
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
# Exposes the whole signalfx_detector.cluster-health resource object as a module output.
output "cluster-health" {
description = "Detector resource for cluster-health"
value = signalfx_detector.cluster-health
}

# Exposes the whole signalfx_detector.cluster-latency resource object as a module output.
output "cluster-latency" {
description = "Detector resource for cluster-latency"
value = signalfx_detector.cluster-latency
}

output "file_descriptors" {
description = "Detector resource for file_descriptors"
value = signalfx_detector.file_descriptors
Expand All @@ -8,13 +18,13 @@ output "heartbeat" {
value = signalfx_detector.heartbeat
}

output "zookeeper_health" {
description = "Detector resource for zookeeper_health"
value = signalfx_detector.zookeeper_health
# Exposes the whole signalfx_detector.server-health resource object as a module output.
output "server-health" {
description = "Detector resource for server-health"
value = signalfx_detector.server-health
}

output "zookeeper_latency" {
description = "Detector resource for zookeeper_latency"
value = signalfx_detector.zookeeper_latency
# Exposes the whole signalfx_detector.server-latency resource object as a module output.
output "server-latency" {
description = "Detector resource for server-latency"
value = signalfx_detector.server-latency
}

Loading