From 75102fbe47d9be7a8846769f437b0e935b0fb856 Mon Sep 17 00:00:00 2001 From: Gauthier AMPE Date: Tue, 9 Jan 2024 18:10:05 +0100 Subject: [PATCH 1/6] Move detector to generated --- docs/severity.md | 32 +- modules/smart-agent_elasticsearch/README.md | 34 +- .../detectors-elasticsearch.tf | 766 ---------- modules/smart-agent_elasticsearch/outputs.tf | 70 +- .../smart-agent_elasticsearch/variables.tf | 1258 ----------------- 5 files changed, 68 insertions(+), 2092 deletions(-) delete mode 100644 modules/smart-agent_elasticsearch/detectors-elasticsearch.tf delete mode 100644 modules/smart-agent_elasticsearch/variables.tf diff --git a/docs/severity.md b/docs/severity.md index 30be3a0cd..70a1de7bc 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -975,25 +975,25 @@ |Detector|Critical|Major|Minor|Warning|Info| |---|---|---|---|---|---| -|ElasticSearch heartbeat|X|-|-|-|-| +|Elasticsearch heartbeat|X|-|-|-|-| |ElasticSearch cluster status|X|X|-|-|-| |ElasticSearch cluster initializing shards|X|X|-|-|-| |ElasticSearch cluster relocating shards|X|X|-|-|-| -|ElasticSearch Cluster unassigned shards|X|X|-|-|-| -|ElasticSearch Pending tasks|X|X|-|-|-| -|Elasticsearch CPU usage|X|X|-|-|-| -|Elasticsearch file descriptors usage|X|X|-|-|-| -|Elasticsearch JVM heap memory usage|X|X|-|-|-| -|Elasticsearch JVM memory young usage|-|X|X|-|-| -|Elasticsearch JVM memory old usage|-|X|X|-|-| -|Elasticsearch old-generation garbage collections latency|-|X|X|-|-| -|Elasticsearch young-generation garbage collections latency|-|X|X|-|-| -|Elasticsearch indexing latency|-|X|X|-|-| -|Elasticsearch index flushing to disk latency|-|X|X|-|-| -|Elasticsearch search query latency|-|X|X|-|-| -|Elasticsearch search fetch latency|-|X|X|-|-| -|Elasticsearch fielddata cache evictions rate of change|-|X|X|-|-| -|Elasticsearch max time spent by task in queue rate of change|-|X|X|-|-| +|ElasticSearch cluster unassigned shards|X|X|-|-|-| +|ElasticSearch cluster pending tasks|X|X|-|-|-| +|ElasticSearch cpu 
usage|X|X|-|-|-| +|ElasticSearch file descriptors usage|X|X|-|-|-| +|ElasticSearch jvm heap memory usage|X|X|-|-|-| +|ElasticSearch jvm memory young usage|-|X|X|-|-| +|ElasticSearch jvm memory old usage|-|X|X|-|-| +|ElasticSearch old-generation garbage collections latency|-|X|X|-|-| +|ElasticSearch young-generation garbage collections latency|-|X|X|-|-| +|ElasticSearch indexing latency|-|X|X|-|-| +|ElasticSearch index flushing to disk latency|-|X|X|-|-| +|ElasticSearch search query latency|-|X|X|-|-| +|ElasticSearch search fetch latency|-|X|X|-|-| +|ElasticSearch fielddata cache evictions rate of change|-|X|X|-|-| +|ElasticSearch max time spent by task in queue rate of change|-|X|X|-|-| ## smart-agent_genericjmx diff --git a/modules/smart-agent_elasticsearch/README.md b/modules/smart-agent_elasticsearch/README.md index 89fe7ef26..598dc631d 100644 --- a/modules/smart-agent_elasticsearch/README.md +++ b/modules/smart-agent_elasticsearch/README.md @@ -59,7 +59,7 @@ Note the following parameters: These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all [modules](../) in this repository. Other variables, specific to this module, are available in -[variables.tf](variables.tf). +[variables-gen.tf](variables-gen.tf). In general, the default configuration "works" but all of these Terraform [variables](https://www.terraform.io/language/values/variables) make it possible to customize the detectors behavior to better fit your needs. 
@@ -77,25 +77,25 @@ This module creates the following SignalFx detectors which could contain one or |Detector|Critical|Major|Minor|Warning|Info| |---|---|---|---|---|---| -|ElasticSearch heartbeat|X|-|-|-|-| +|Elasticsearch heartbeat|X|-|-|-|-| |ElasticSearch cluster status|X|X|-|-|-| |ElasticSearch cluster initializing shards|X|X|-|-|-| |ElasticSearch cluster relocating shards|X|X|-|-|-| -|ElasticSearch Cluster unassigned shards|X|X|-|-|-| -|ElasticSearch Pending tasks|X|X|-|-|-| -|Elasticsearch CPU usage|X|X|-|-|-| -|Elasticsearch file descriptors usage|X|X|-|-|-| -|Elasticsearch JVM heap memory usage|X|X|-|-|-| -|Elasticsearch JVM memory young usage|-|X|X|-|-| -|Elasticsearch JVM memory old usage|-|X|X|-|-| -|Elasticsearch old-generation garbage collections latency|-|X|X|-|-| -|Elasticsearch young-generation garbage collections latency|-|X|X|-|-| -|Elasticsearch indexing latency|-|X|X|-|-| -|Elasticsearch index flushing to disk latency|-|X|X|-|-| -|Elasticsearch search query latency|-|X|X|-|-| -|Elasticsearch search fetch latency|-|X|X|-|-| -|Elasticsearch fielddata cache evictions rate of change|-|X|X|-|-| -|Elasticsearch max time spent by task in queue rate of change|-|X|X|-|-| +|ElasticSearch cluster unassigned shards|X|X|-|-|-| +|ElasticSearch cluster pending tasks|X|X|-|-|-| +|ElasticSearch cpu usage|X|X|-|-|-| +|ElasticSearch file descriptors usage|X|X|-|-|-| +|ElasticSearch jvm heap memory usage|X|X|-|-|-| +|ElasticSearch jvm memory young usage|-|X|X|-|-| +|ElasticSearch jvm memory old usage|-|X|X|-|-| +|ElasticSearch old-generation garbage collections latency|-|X|X|-|-| +|ElasticSearch young-generation garbage collections latency|-|X|X|-|-| +|ElasticSearch indexing latency|-|X|X|-|-| +|ElasticSearch index flushing to disk latency|-|X|X|-|-| +|ElasticSearch search query latency|-|X|X|-|-| +|ElasticSearch search fetch latency|-|X|X|-|-| +|ElasticSearch fielddata cache evictions rate of change|-|X|X|-|-| +|ElasticSearch max time spent by task in queue rate 
of change|-|X|X|-|-| ## How to collect required metrics? diff --git a/modules/smart-agent_elasticsearch/detectors-elasticsearch.tf b/modules/smart-agent_elasticsearch/detectors-elasticsearch.tf deleted file mode 100644 index b62d490a3..000000000 --- a/modules/smart-agent_elasticsearch/detectors-elasticsearch.tf +++ /dev/null @@ -1,766 +0,0 @@ -resource "signalfx_detector" "heartbeat" { - name = format("%s %s", local.detector_name_prefix, "ElasticSearch heartbeat") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - from signalfx.detectors.not_reporting import not_reporting - signal = data('elasticsearch.cluster.number-of-nodes', filter=filter('plugin', 'elasticsearch') and ${local.not_running_vm_filters} and ${module.filtering.signalflow})${var.heartbeat_aggregation_function}.publish('signal') - not_reporting.detector(stream=signal, resource_identifier=None, duration='${var.heartbeat_timeframe}', auto_resolve_after='${local.heartbeat_auto_resolve_after}').publish('CRIT') -EOF - - rule { - description = "has not reported in ${var.heartbeat_timeframe}" - severity = "Critical" - detect_label = "CRIT" - disabled = coalesce(var.heartbeat_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.heartbeat_notifications, "critical", []), var.notifications.critical), null) - runbook_url = try(coalesce(var.heartbeat_runbook_url, var.runbook_url), "") - tip = var.heartbeat_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject_novalue : var.message_subject - parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body - } - - max_delay = var.heartbeat_max_delay -} - -resource "signalfx_detector" "cluster_status" { - name = format("%s %s", local.detector_name_prefix, "ElasticSearch cluster status") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - signal = data('elasticsearch.cluster.status', filter=filter('plugin', 'elasticsearch') and ${module.filtering.signalflow})${var.cluster_status_aggregation_function}${var.cluster_status_transformation_function}.publish('signal') - detect(when(signal == 1)).publish('MAJOR') - detect(when(signal == 2)).publish('CRIT') -EOF - - rule { - description = "is red" - severity = "Critical" - detect_label = "CRIT" - disabled = coalesce(var.cluster_status_disabled_critical, var.cluster_status_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.cluster_status_notifications, "critical", []), var.notifications.critical), null) - runbook_url = try(coalesce(var.cluster_status_runbook_url, var.runbook_url), "") - tip = var.cluster_status_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - rule { - description = "is yellow" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.cluster_status_disabled_major, var.cluster_status_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.cluster_status_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.cluster_status_runbook_url, var.runbook_url), "") - tip = var.cluster_status_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body - } - - max_delay = var.cluster_status_max_delay -} - -resource "signalfx_detector" "cluster_initializing_shards" { - name = format("%s %s", local.detector_name_prefix, "ElasticSearch cluster initializing shards") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - signal = data('elasticsearch.cluster.initializing-shards', filter=filter('plugin', 'elasticsearch') and ${module.filtering.signalflow}, rollup='average')${var.cluster_initializing_shards_aggregation_function}${var.cluster_initializing_shards_transformation_function}.publish('signal') - detect(when(signal > ${var.cluster_initializing_shards_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.cluster_initializing_shards_threshold_major}) and (not when(signal > ${var.cluster_initializing_shards_threshold_critical}))).publish('MAJOR') -EOF - - rule { - description = "is too high > ${var.cluster_initializing_shards_threshold_critical}" - severity = "Critical" - detect_label = "CRIT" - disabled = coalesce(var.cluster_initializing_shards_disabled_critical, var.cluster_initializing_shards_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.cluster_initializing_shards_notifications, "critical", []), var.notifications.critical), null) - runbook_url = try(coalesce(var.cluster_initializing_shards_runbook_url, var.runbook_url), "") - tip = var.cluster_initializing_shards_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body - } - - rule { - description = "is too high > ${var.cluster_initializing_shards_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.cluster_initializing_shards_disabled_major, var.cluster_initializing_shards_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.cluster_initializing_shards_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.cluster_initializing_shards_runbook_url, var.runbook_url), "") - tip = var.cluster_initializing_shards_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - max_delay = var.cluster_initializing_shards_max_delay -} - -resource "signalfx_detector" "cluster_relocating_shards" { - name = format("%s %s", local.detector_name_prefix, "ElasticSearch cluster relocating shards") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - signal = data('elasticsearch.cluster.relocating-shards', filter=filter('plugin', 'elasticsearch') and ${module.filtering.signalflow}, rollup='average')${var.cluster_relocating_shards_aggregation_function}${var.cluster_relocating_shards_transformation_function}.publish('signal') - detect(when(signal > ${var.cluster_relocating_shards_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.cluster_relocating_shards_threshold_major}) and (not when(signal > ${var.cluster_relocating_shards_threshold_critical}))).publish('MAJOR') -EOF - - rule { - description = "is too high > ${var.cluster_relocating_shards_threshold_critical}" - severity = "Critical" - detect_label = "CRIT" - disabled = coalesce(var.cluster_relocating_shards_disabled_critical, 
var.cluster_relocating_shards_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.cluster_relocating_shards_notifications, "critical", []), var.notifications.critical), null) - runbook_url = try(coalesce(var.cluster_relocating_shards_runbook_url, var.runbook_url), "") - tip = var.cluster_relocating_shards_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - rule { - description = "is too high > ${var.cluster_relocating_shards_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.cluster_relocating_shards_disabled_major, var.cluster_relocating_shards_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.cluster_relocating_shards_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.cluster_relocating_shards_runbook_url, var.runbook_url), "") - tip = var.cluster_relocating_shards_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body - } - - max_delay = var.cluster_relocating_shards_max_delay -} - -resource "signalfx_detector" "cluster_unassigned_shards" { - name = format("%s %s", local.detector_name_prefix, "ElasticSearch Cluster unassigned shards") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - signal = data('elasticsearch.cluster.unassigned-shards', filter=filter('plugin', 'elasticsearch') and ${module.filtering.signalflow}, rollup='average')${var.cluster_unassigned_shards_aggregation_function}${var.cluster_unassigned_shards_transformation_function}.publish('signal') - detect(when(signal > ${var.cluster_unassigned_shards_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.cluster_unassigned_shards_threshold_major}) and (not when(signal > ${var.cluster_unassigned_shards_threshold_critical}))).publish('MAJOR') -EOF - - rule { - description = "are too high > ${var.cluster_unassigned_shards_threshold_critical}" - severity = "Critical" - detect_label = "CRIT" - disabled = coalesce(var.cluster_unassigned_shards_disabled_critical, var.cluster_unassigned_shards_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.cluster_unassigned_shards_notifications, "critical", []), var.notifications.critical), null) - runbook_url = try(coalesce(var.cluster_unassigned_shards_runbook_url, var.runbook_url), "") - tip = var.cluster_unassigned_shards_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body - } - - rule { - description = "are too high > ${var.cluster_unassigned_shards_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.cluster_unassigned_shards_disabled_major, var.cluster_unassigned_shards_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.cluster_unassigned_shards_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.cluster_unassigned_shards_runbook_url, var.runbook_url), "") - tip = var.cluster_unassigned_shards_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - max_delay = var.cluster_unassigned_shards_max_delay -} - -resource "signalfx_detector" "pending_tasks" { - name = format("%s %s", local.detector_name_prefix, "ElasticSearch Pending tasks") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - signal = data('elasticsearch.cluster.pending-tasks', filter=filter('plugin', 'elasticsearch') and ${module.filtering.signalflow}, rollup='average')${var.pending_tasks_aggregation_function}${var.pending_tasks_transformation_function}.publish('signal') - detect(when(signal > ${var.pending_tasks_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.pending_tasks_threshold_major}) and (not when(signal > ${var.pending_tasks_threshold_critical}))).publish('MAJOR') -EOF - - rule { - description = "are too high > ${var.pending_tasks_threshold_critical}" - severity = "Critical" - detect_label = "CRIT" - disabled = coalesce(var.pending_tasks_disabled_critical, var.pending_tasks_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.pending_tasks_notifications, "critical", 
[]), var.notifications.critical), null) - runbook_url = try(coalesce(var.pending_tasks_runbook_url, var.runbook_url), "") - tip = var.pending_tasks_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - rule { - description = "are too high > ${var.pending_tasks_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.pending_tasks_disabled_major, var.pending_tasks_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.pending_tasks_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.pending_tasks_runbook_url, var.runbook_url), "") - tip = var.pending_tasks_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - max_delay = var.pending_tasks_max_delay -} - -resource "signalfx_detector" "cpu_usage" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch CPU usage") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - signal = data('elasticsearch.process.cpu.percent', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, rollup='average')${var.cpu_usage_aggregation_function}${var.cpu_usage_transformation_function}.publish('signal') - detect(when(signal > ${var.cpu_usage_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.cpu_usage_threshold_major}) and (not when(signal > ${var.cpu_usage_threshold_critical}))).publish('MAJOR') -EOF - - rule { - description = "is too high > ${var.cpu_usage_threshold_critical}%" - severity = "Critical" - detect_label = 
"CRIT" - disabled = coalesce(var.cpu_usage_disabled_critical, var.cpu_usage_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.cpu_usage_notifications, "critical", []), var.notifications.critical), null) - runbook_url = try(coalesce(var.cpu_usage_runbook_url, var.runbook_url), "") - tip = var.cpu_usage_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - rule { - description = "is too high > ${var.cpu_usage_threshold_major}%" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.cpu_usage_disabled_major, var.cpu_usage_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.cpu_usage_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.cpu_usage_runbook_url, var.runbook_url), "") - tip = var.cpu_usage_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body - } - - max_delay = var.cpu_usage_max_delay -} - -resource "signalfx_detector" "file_descriptors" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch file descriptors usage") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - A = data('elasticsearch.process.open_file_descriptors', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, rollup='average')${var.file_descriptors_aggregation_function}${var.file_descriptors_transformation_function} - B = data('elasticsearch.process.max_file_descriptors', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, rollup='average')${var.file_descriptors_aggregation_function}${var.file_descriptors_transformation_function} - signal = (A/B).scale(100).publish('signal') - detect(when(signal > ${var.file_descriptors_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.file_descriptors_threshold_major}) and (not when(signal > ${var.file_descriptors_threshold_critical}))).publish('MAJOR') -EOF - - rule { - description = "is too high > ${var.file_descriptors_threshold_critical}%" - severity = "Critical" - detect_label = "CRIT" - disabled = coalesce(var.file_descriptors_disabled_critical, var.file_descriptors_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.file_descriptors_notifications, "critical", []), var.notifications.critical), null) - runbook_url = try(coalesce(var.file_descriptors_runbook_url, var.runbook_url), "") - tip = var.file_descriptors_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body - } - - rule { - description = "is too high > ${var.file_descriptors_threshold_major}%" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.file_descriptors_disabled_major, var.file_descriptors_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.file_descriptors_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.file_descriptors_runbook_url, var.runbook_url), "") - tip = var.file_descriptors_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - max_delay = var.file_descriptors_max_delay -} - -resource "signalfx_detector" "jvm_heap_memory_usage" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch JVM heap memory usage") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - signal = data('elasticsearch.jvm.mem.heap-used-percent', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, rollup='average')${var.jvm_heap_memory_usage_aggregation_function}${var.jvm_heap_memory_usage_transformation_function}.publish('signal') - detect(when(signal > ${var.jvm_heap_memory_usage_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.jvm_heap_memory_usage_threshold_major}) and (not when(signal > ${var.jvm_heap_memory_usage_threshold_critical}))).publish('MAJOR') -EOF - - rule { - description = "is too high > ${var.jvm_heap_memory_usage_threshold_critical}%" - severity = "Critical" - detect_label = "CRIT" - disabled = coalesce(var.jvm_heap_memory_usage_disabled_critical, var.jvm_heap_memory_usage_disabled, var.detectors_disabled) - notifications = 
try(coalescelist(lookup(var.jvm_heap_memory_usage_notifications, "critical", []), var.notifications.critical), null) - runbook_url = try(coalesce(var.jvm_heap_memory_usage_runbook_url, var.runbook_url), "") - tip = var.jvm_heap_memory_usage_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - rule { - description = "is too high > ${var.jvm_heap_memory_usage_threshold_major}%" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.jvm_heap_memory_usage_disabled_major, var.jvm_heap_memory_usage_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.jvm_heap_memory_usage_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.jvm_heap_memory_usage_runbook_url, var.runbook_url), "") - tip = var.jvm_heap_memory_usage_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body - } - - max_delay = var.jvm_heap_memory_usage_max_delay -} - -resource "signalfx_detector" "jvm_memory_young_usage" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch JVM memory young usage") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - A = data('elasticsearch.jvm.mem.pools.young.used_in_bytes', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_young_usage_aggregation_function}${var.jvm_memory_young_usage_transformation_function} - B = data('elasticsearch.jvm.mem.pools.young.max_in_bytes', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_young_usage_aggregation_function}${var.jvm_memory_young_usage_transformation_function} - signal = (A/B).fill(0).scale(100).publish('signal') - detect(when(signal > ${var.jvm_memory_young_usage_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.jvm_memory_young_usage_threshold_minor}) and (not when(signal > ${var.jvm_memory_young_usage_threshold_major}))).publish('MINOR') -EOF - - rule { - description = "is too high > ${var.jvm_memory_young_usage_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.jvm_memory_young_usage_disabled_major, var.jvm_memory_young_usage_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.jvm_memory_young_usage_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.jvm_memory_young_usage_runbook_url, var.runbook_url), "") - tip = var.jvm_memory_young_usage_tip - parameterized_subject = var.message_subject == "" ? 
local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - rule { - description = "is too high > ${var.jvm_memory_young_usage_threshold_minor}" - severity = "Minor" - detect_label = "MINOR" - disabled = coalesce(var.jvm_memory_young_usage_disabled_minor, var.jvm_memory_young_usage_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.jvm_memory_young_usage_notifications, "minor", []), var.notifications.minor), null) - runbook_url = try(coalesce(var.jvm_memory_young_usage_runbook_url, var.runbook_url), "") - tip = var.jvm_memory_young_usage_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - max_delay = var.jvm_memory_young_usage_max_delay -} - -resource "signalfx_detector" "jvm_memory_old_usage" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch JVM memory old usage") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - A = data('elasticsearch.jvm.mem.pools.old.used_in_bytes', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_old_usage_aggregation_function}${var.jvm_memory_old_usage_transformation_function} - B = data('elasticsearch.jvm.mem.pools.old.max_in_bytes', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_old_usage_aggregation_function}${var.jvm_memory_old_usage_transformation_function} - signal = (A/B).fill(0).scale(100).publish('signal') - detect(when(signal > ${var.jvm_memory_old_usage_threshold_major})).publish('MAJOR') - detect(when(signal > 
${var.jvm_memory_old_usage_threshold_minor}) and (not when(signal > ${var.jvm_memory_old_usage_threshold_major}))).publish('MINOR') -EOF - - rule { - description = "is too high > ${var.jvm_memory_old_usage_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.jvm_memory_old_usage_disabled_major, var.jvm_memory_old_usage_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.jvm_memory_old_usage_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.jvm_memory_old_usage_runbook_url, var.runbook_url), "") - tip = var.jvm_memory_old_usage_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - rule { - description = "is too high > ${var.jvm_memory_old_usage_threshold_minor}" - severity = "Minor" - detect_label = "MINOR" - disabled = coalesce(var.jvm_memory_old_usage_disabled_minor, var.jvm_memory_old_usage_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.jvm_memory_old_usage_notifications, "minor", []), var.notifications.minor), null) - runbook_url = try(coalesce(var.jvm_memory_old_usage_runbook_url, var.runbook_url), "") - tip = var.jvm_memory_old_usage_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body - } - - max_delay = var.jvm_memory_old_usage_max_delay -} - -resource "signalfx_detector" "jvm_gc_old_collection_latency" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch old-generation garbage collections latency") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - A = data('elasticsearch.jvm.gc.old-time', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.jvm_gc_old_collection_latency_aggregation_function}${var.jvm_gc_old_collection_latency_transformation_function} - B = data('elasticsearch.jvm.gc.old-count', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.jvm_gc_old_collection_latency_aggregation_function}${var.jvm_gc_old_collection_latency_transformation_function} - signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.jvm_gc_old_collection_latency_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.jvm_gc_old_collection_latency_threshold_minor}) and (not when(signal > ${var.jvm_gc_old_collection_latency_threshold_major}))).publish('MINOR') -EOF - - rule { - description = "is too high > ${var.jvm_gc_old_collection_latency_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.jvm_gc_old_collection_latency_disabled_major, var.jvm_gc_old_collection_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.jvm_gc_old_collection_latency_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.jvm_gc_old_collection_latency_runbook_url, var.runbook_url), "") - tip = var.jvm_gc_old_collection_latency_tip - 
parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - rule { - description = "is too high > ${var.jvm_gc_old_collection_latency_threshold_minor}" - severity = "Minor" - detect_label = "MINOR" - disabled = coalesce(var.jvm_gc_old_collection_latency_disabled_minor, var.jvm_gc_old_collection_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.jvm_gc_old_collection_latency_notifications, "minor", []), var.notifications.minor), null) - runbook_url = try(coalesce(var.jvm_gc_old_collection_latency_runbook_url, var.runbook_url), "") - tip = var.jvm_gc_old_collection_latency_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - max_delay = var.jvm_gc_old_collection_latency_max_delay -} - -resource "signalfx_detector" "jvm_gc_young_collection_latency" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch young-generation garbage collections latency") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - A = data('elasticsearch.jvm.gc.time', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.jvm_gc_young_collection_latency_aggregation_function}${var.jvm_gc_young_collection_latency_transformation_function} - B = data('elasticsearch.jvm.gc.count', filter=filter('plugin', 'elasticsearch') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.jvm_gc_young_collection_latency_aggregation_function}${var.jvm_gc_young_collection_latency_transformation_function} - signal = 
(A/B).fill(0).publish('signal') - detect(when(signal > ${var.jvm_gc_young_collection_latency_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.jvm_gc_young_collection_latency_threshold_minor}) and (not when(signal > ${var.jvm_gc_young_collection_latency_threshold_major}))).publish('MINOR') -EOF - - rule { - description = "is too high > ${var.jvm_gc_young_collection_latency_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.jvm_gc_young_collection_latency_disabled_major, var.jvm_gc_young_collection_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.jvm_gc_young_collection_latency_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.jvm_gc_young_collection_latency_runbook_url, var.runbook_url), "") - tip = var.jvm_gc_young_collection_latency_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - rule { - description = "is too high > ${var.jvm_gc_young_collection_latency_threshold_minor}" - severity = "Minor" - detect_label = "MINOR" - disabled = coalesce(var.jvm_gc_young_collection_latency_disabled_minor, var.jvm_gc_young_collection_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.jvm_gc_young_collection_latency_notifications, "minor", []), var.notifications.minor), null) - runbook_url = try(coalesce(var.jvm_gc_young_collection_latency_runbook_url, var.runbook_url), "") - tip = var.jvm_gc_young_collection_latency_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body - } - - max_delay = var.jvm_gc_young_collection_latency_max_delay -} - -resource "signalfx_detector" "indexing_latency" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch indexing latency") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - A = data('elasticsearch.indices.indexing.index-time', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.indexing_latency_aggregation_function}${var.indexing_latency_transformation_function} - B = data('elasticsearch.indices.indexing.index-total', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.indexing_latency_aggregation_function}${var.indexing_latency_transformation_function} - signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.indexing_latency_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.indexing_latency_threshold_minor}) and (not when(signal > ${var.indexing_latency_threshold_major}))).publish('MINOR') -EOF - - rule { - description = "is too high > ${var.indexing_latency_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.indexing_latency_disabled_major, var.indexing_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.indexing_latency_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.indexing_latency_runbook_url, var.runbook_url), "") - tip = var.indexing_latency_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body - } - - rule { - description = "is too high > ${var.indexing_latency_threshold_minor}" - severity = "Minor" - detect_label = "MINOR" - disabled = coalesce(var.indexing_latency_disabled_minor, var.indexing_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.indexing_latency_notifications, "minor", []), var.notifications.minor), null) - runbook_url = try(coalesce(var.indexing_latency_runbook_url, var.runbook_url), "") - tip = var.indexing_latency_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - max_delay = var.indexing_latency_max_delay -} - -resource "signalfx_detector" "flush_latency" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch index flushing to disk latency") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - A = data('elasticsearch.indices.flush.total-time', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.flush_latency_aggregation_function}${var.flush_latency_transformation_function} - B = data('elasticsearch.indices.flush.total', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.flush_latency_aggregation_function}${var.flush_latency_transformation_function} - signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.flush_latency_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.flush_latency_threshold_minor}) and (not when(signal > ${var.flush_latency_threshold_major}))).publish('MINOR') -EOF - - rule { - description = "is too 
high > ${var.flush_latency_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.flush_latency_disabled_major, var.flush_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.flush_latency_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.flush_latency_runbook_url, var.runbook_url), "") - tip = var.flush_latency_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - rule { - description = "is too high > ${var.flush_latency_threshold_minor}" - severity = "Minor" - detect_label = "MINOR" - disabled = coalesce(var.flush_latency_disabled_minor, var.flush_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.flush_latency_notifications, "minor", []), var.notifications.minor), null) - runbook_url = try(coalesce(var.flush_latency_runbook_url, var.runbook_url), "") - tip = var.flush_latency_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body - } - - max_delay = var.flush_latency_max_delay -} - -resource "signalfx_detector" "search_latency" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch search query latency") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - A = data('elasticsearch.indices.search.query-time', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.search_latency_aggregation_function}${var.search_latency_transformation_function} - B = data('elasticsearch.indices.search.query-total', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.search_latency_aggregation_function}${var.search_latency_transformation_function} - signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.search_latency_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.search_latency_threshold_minor}) and (not when(signal > ${var.search_latency_threshold_major}))).publish('MINOR') -EOF - - rule { - description = "is too high > ${var.search_latency_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.search_latency_disabled_major, var.search_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.search_latency_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.search_latency_runbook_url, var.runbook_url), "") - tip = var.search_latency_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body - } - - rule { - description = "is too high > ${var.search_latency_threshold_minor}" - severity = "Minor" - detect_label = "MINOR" - disabled = coalesce(var.search_latency_disabled_minor, var.search_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.search_latency_notifications, "minor", []), var.notifications.minor), null) - runbook_url = try(coalesce(var.search_latency_runbook_url, var.runbook_url), "") - tip = var.search_latency_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - max_delay = var.search_latency_max_delay -} - -resource "signalfx_detector" "fetch_latency" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch search fetch latency") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - A = data('elasticsearch.indices.search.fetch-time', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.fetch_latency_aggregation_function}${var.fetch_latency_transformation_function} - B = data('elasticsearch.indices.search.fetch-total', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.fetch_latency_aggregation_function}${var.fetch_latency_transformation_function} - signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.fetch_latency_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.fetch_latency_threshold_minor}) and (not when(signal > ${var.fetch_latency_threshold_major}))).publish('MINOR') -EOF - - rule { - description = "is too high > 
${var.fetch_latency_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.fetch_latency_disabled_major, var.fetch_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.fetch_latency_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.fetch_latency_runbook_url, var.runbook_url), "") - tip = var.fetch_latency_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - rule { - description = "is too high > ${var.fetch_latency_threshold_minor}" - severity = "Minor" - detect_label = "MINOR" - disabled = coalesce(var.fetch_latency_disabled_minor, var.fetch_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.fetch_latency_notifications, "minor", []), var.notifications.minor), null) - runbook_url = try(coalesce(var.fetch_latency_runbook_url, var.runbook_url), "") - tip = var.fetch_latency_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body - } - - max_delay = var.fetch_latency_max_delay -} - -resource "signalfx_detector" "field_data_evictions_change" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch fielddata cache evictions rate of change") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - signal = data('elasticsearch.indices.fielddata.evictions', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta').rateofchange()${var.field_data_evictions_change_aggregation_function}${var.field_data_evictions_change_transformation_function}.publish('signal') - detect(when(signal > ${var.field_data_evictions_change_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.field_data_evictions_change_threshold_minor}) and (not when(signal > ${var.field_data_evictions_change_threshold_major}))).publish('MINOR') -EOF - - rule { - description = "is too high > ${var.field_data_evictions_change_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.field_data_evictions_change_disabled_major, var.field_data_evictions_change_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.field_data_evictions_change_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.field_data_evictions_change_runbook_url, var.runbook_url), "") - tip = var.field_data_evictions_change_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body - } - - rule { - description = "is too high > ${var.field_data_evictions_change_threshold_minor}" - severity = "Minor" - detect_label = "MINOR" - disabled = coalesce(var.field_data_evictions_change_disabled_minor, var.field_data_evictions_change_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.field_data_evictions_change_notifications, "minor", []), var.notifications.minor), null) - runbook_url = try(coalesce(var.field_data_evictions_change_runbook_url, var.runbook_url), "") - tip = var.field_data_evictions_change_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - max_delay = var.field_data_evictions_change_max_delay -} - -resource "signalfx_detector" "task_time_in_queue_change" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch max time spent by task in queue rate of change") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - signal = data('elasticsearch.cluster.task-max-wait-time', filter=filter('plugin', 'elasticsearch') and ${module.filtering.signalflow}, rollup='average').rateofchange()${var.task_time_in_queue_change_aggregation_function}${var.task_time_in_queue_change_transformation_function}.publish('signal') - detect(when(signal > ${var.task_time_in_queue_change_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.task_time_in_queue_change_threshold_minor}) and (not when(signal > ${var.task_time_in_queue_change_threshold_major}))).publish('MINOR') -EOF - - rule { - description = "is too high > ${var.task_time_in_queue_change_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = 
coalesce(var.task_time_in_queue_change_disabled_major, var.task_time_in_queue_change_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.task_time_in_queue_change_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.task_time_in_queue_change_runbook_url, var.runbook_url), "") - tip = var.task_time_in_queue_change_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - rule { - description = "is too high > ${var.task_time_in_queue_change_threshold_minor}" - severity = "Minor" - detect_label = "MINOR" - disabled = coalesce(var.task_time_in_queue_change_disabled_minor, var.task_time_in_queue_change_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.task_time_in_queue_change_notifications, "minor", []), var.notifications.minor), null) - runbook_url = try(coalesce(var.task_time_in_queue_change_runbook_url, var.runbook_url), "") - tip = var.task_time_in_queue_change_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body - } - - max_delay = var.task_time_in_queue_change_max_delay -} - diff --git a/modules/smart-agent_elasticsearch/outputs.tf b/modules/smart-agent_elasticsearch/outputs.tf index 3ccfbe78c..877821ae0 100644 --- a/modules/smart-agent_elasticsearch/outputs.tf +++ b/modules/smart-agent_elasticsearch/outputs.tf @@ -3,6 +3,11 @@ output "cluster_initializing_shards" { value = signalfx_detector.cluster_initializing_shards } +output "cluster_pending_tasks" { + description = "Detector resource for cluster_pending_tasks" + value = signalfx_detector.cluster_pending_tasks +} + output "cluster_relocating_shards" { description = "Detector resource for cluster_relocating_shards" value = signalfx_detector.cluster_relocating_shards @@ -23,24 +28,14 @@ output "cpu_usage" { value = signalfx_detector.cpu_usage } -output "fetch_latency" { - description = "Detector resource for fetch_latency" - value = signalfx_detector.fetch_latency -} - -output "field_data_evictions_change" { - description = "Detector resource for field_data_evictions_change" - value = signalfx_detector.field_data_evictions_change +output "fielddata_cache_evictions_rate_of_change" { + description = "Detector resource for fielddata_cache_evictions_rate_of_change" + value = signalfx_detector.fielddata_cache_evictions_rate_of_change } -output "file_descriptors" { - description = "Detector resource for file_descriptors" - value = signalfx_detector.file_descriptors -} - -output "flush_latency" { - description = "Detector resource for flush_latency" - value = signalfx_detector.flush_latency +output "file_descriptors_usage" { + description = "Detector resource for file_descriptors_usage" + value = signalfx_detector.file_descriptors_usage } output "heartbeat" { @@ -48,21 +43,16 @@ output "heartbeat" { value = signalfx_detector.heartbeat } +output "index_flushing_to_disk_latency" { + description = "Detector resource for index_flushing_to_disk_latency" + value = 
signalfx_detector.index_flushing_to_disk_latency +} + output "indexing_latency" { description = "Detector resource for indexing_latency" value = signalfx_detector.indexing_latency } -output "jvm_gc_old_collection_latency" { - description = "Detector resource for jvm_gc_old_collection_latency" - value = signalfx_detector.jvm_gc_old_collection_latency -} - -output "jvm_gc_young_collection_latency" { - description = "Detector resource for jvm_gc_young_collection_latency" - value = signalfx_detector.jvm_gc_young_collection_latency -} - output "jvm_heap_memory_usage" { description = "Detector resource for jvm_heap_memory_usage" value = signalfx_detector.jvm_heap_memory_usage @@ -78,18 +68,28 @@ output "jvm_memory_young_usage" { value = signalfx_detector.jvm_memory_young_usage } -output "pending_tasks" { - description = "Detector resource for pending_tasks" - value = signalfx_detector.pending_tasks +output "max_time_spent_by_task_in_queue_rate_of_change" { + description = "Detector resource for max_time_spent_by_task_in_queue_rate_of_change" + value = signalfx_detector.max_time_spent_by_task_in_queue_rate_of_change +} + +output "old-generation_garbage_collections_latency" { + description = "Detector resource for old-generation_garbage_collections_latency" + value = signalfx_detector.old-generation_garbage_collections_latency +} + +output "search_fetch_latency" { + description = "Detector resource for search_fetch_latency" + value = signalfx_detector.search_fetch_latency } -output "search_latency" { - description = "Detector resource for search_latency" - value = signalfx_detector.search_latency +output "search_query_latency" { + description = "Detector resource for search_query_latency" + value = signalfx_detector.search_query_latency } -output "task_time_in_queue_change" { - description = "Detector resource for task_time_in_queue_change" - value = signalfx_detector.task_time_in_queue_change +output "young-generation_garbage_collections_latency" { + description = 
"Detector resource for young-generation_garbage_collections_latency" + value = signalfx_detector.young-generation_garbage_collections_latency } diff --git a/modules/smart-agent_elasticsearch/variables.tf b/modules/smart-agent_elasticsearch/variables.tf deleted file mode 100644 index 3cf6ee9bb..000000000 --- a/modules/smart-agent_elasticsearch/variables.tf +++ /dev/null @@ -1,1258 +0,0 @@ -# Module specific - -# Heartbeat detector - -variable "heartbeat_max_delay" { - description = "Enforce max delay for heartbeat detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = 900 -} - -variable "heartbeat_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "heartbeat_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "heartbeat_disabled" { - description = "Disable all alerting rules for heartbeat detector" - type = bool - default = null -} - -variable "heartbeat_notifications" { - description = "Notification recipients list per severity overridden for heartbeat detector" - type = map(list(string)) - default = {} -} - -variable "heartbeat_timeframe" { - description = "Timeframe for heartbeat detector (i.e. \"10m\")" - type = string - default = "10m" -} - -variable "heartbeat_aggregation_function" { - description = "Aggregation function and group by for heartbeat detector (i.e. 
\".mean(by=['host'])\")" - type = string - default = "" -} - -# Cluster_status detector - -variable "cluster_status_max_delay" { - description = "Enforce max delay for cluster_status detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "cluster_status_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "cluster_status_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "cluster_status_disabled" { - description = "Disable all alerting rules for cluster_status_not_green detector" - type = bool - default = null -} - -variable "cluster_status_disabled_critical" { - description = "Disable critical alerting rule for cluster_status_not_green detector" - type = bool - default = null -} - -variable "cluster_status_disabled_major" { - description = "Disable major alerting rule for cluster_status_not_green detector" - type = bool - default = null -} - -variable "cluster_status_notifications" { - description = "Notification recipients list per severity overridden for cluster_status_not_green detector" - type = map(list(string)) - default = {} -} - -variable "cluster_status_aggregation_function" { - description = "Aggregation function and group by for cluster_status_not_green detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - -variable "cluster_status_transformation_function" { - description = "Transformation function for cluster_status_not_green detector (i.e. 
\".mean(over='5m')\")" - type = string - default = ".mean(over='5m')" -} - -# Cluster_initializing_shards detector - -variable "cluster_initializing_shards_max_delay" { - description = "Enforce max delay for cluster_initializing_shards detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "cluster_initializing_shards_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "cluster_initializing_shards_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "cluster_initializing_shards_disabled" { - description = "Disable all alerting rules for cluster_initializing_shards detector" - type = bool - default = null -} - -variable "cluster_initializing_shards_disabled_critical" { - description = "Disable critical alerting rule for cluster_initializing_shards detector" - type = bool - default = null -} - -variable "cluster_initializing_shards_disabled_major" { - description = "Disable major alerting rule for cluster_initializing_shards detector" - type = bool - default = null -} - -variable "cluster_initializing_shards_notifications" { - description = "Notification recipients list per severity overridden for cluster_initializing_shards detector" - type = map(list(string)) - default = {} -} - -variable "cluster_initializing_shards_aggregation_function" { - description = "Aggregation function and group by for cluster_initializing_shards detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - -variable "cluster_initializing_shards_transformation_function" { - description = "Transformation function for cluster_initializing_shards detector (i.e. 
\".mean(over='5m')\")" - type = string - default = ".min(over='15m')" -} - -variable "cluster_initializing_shards_threshold_critical" { - description = "Critical threshold for cluster_initializing_shards detector" - type = number - default = 1 -} - -variable "cluster_initializing_shards_threshold_major" { - description = "Major threshold for cluster_initializing_shards detector" - type = number - default = 0 -} - -# Cluster_relocating_shards detector - -variable "cluster_relocating_shards_max_delay" { - description = "Enforce max delay for cluster_relocating_shards detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "cluster_relocating_shards_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "cluster_relocating_shards_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "cluster_relocating_shards_disabled" { - description = "Disable all alerting rules for cluster_relocating_shards detector" - type = bool - default = null -} - -variable "cluster_relocating_shards_disabled_critical" { - description = "Disable critical alerting rule for cluster_relocating_shards detector" - type = bool - default = null -} - -variable "cluster_relocating_shards_disabled_major" { - description = "Disable major alerting rule for cluster_relocating_shards detector" - type = bool - default = true -} - -variable "cluster_relocating_shards_notifications" { - description = "Notification recipients list per severity overridden for cluster_relocating_shards detector" - type = map(list(string)) - default = {} -} - -variable "cluster_relocating_shards_aggregation_function" { - description = "Aggregation function and group by for cluster_relocating_shards detector (i.e. 
\".mean(by=['host'])\")" - type = string - default = "" -} - -variable "cluster_relocating_shards_transformation_function" { - description = "Transformation function for cluster_relocating_shards detector (i.e. \".mean(over='5m')\")" - type = string - default = ".min(over='15m')" -} - -variable "cluster_relocating_shards_threshold_critical" { - description = "Critical threshold for cluster_relocating_shards detector" - type = number - default = 0 -} - -variable "cluster_relocating_shards_threshold_major" { - description = "Major threshold for cluster_relocating_shards detector" - type = number - default = -1 -} - -# Cluster_unassigned_shards detector - -variable "cluster_unassigned_shards_max_delay" { - description = "Enforce max delay for cluster_unassigned_shards detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "cluster_unassigned_shards_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "cluster_unassigned_shards_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "cluster_unassigned_shards_disabled" { - description = "Disable all alerting rules for cluster_unassigned_shards detector" - type = bool - default = null -} - -variable "cluster_unassigned_shards_disabled_critical" { - description = "Disable critical alerting rule for cluster_unassigned_shards detector" - type = bool - default = null -} - -variable "cluster_unassigned_shards_disabled_major" { - description = "Disable major alerting rule for cluster_unassigned_shards detector" - type = bool - default = true -} - -variable "cluster_unassigned_shards_notifications" { - description = "Notification recipients list per severity overridden for cluster_unassigned_shards detector" - type = map(list(string)) - default = {} -} - -variable 
"cluster_unassigned_shards_aggregation_function" { - description = "Aggregation function and group by for cluster_unassigned_shards detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - -variable "cluster_unassigned_shards_transformation_function" { - description = "Transformation function for cluster_unassigned_shards detector (i.e. \".mean(over='5m')\")" - type = string - default = ".min(over='10m')" -} - -variable "cluster_unassigned_shards_threshold_critical" { - description = "Critical threshold for cluster_unassigned_shards detector" - type = number - default = 0 -} - -variable "cluster_unassigned_shards_threshold_major" { - description = "Major threshold for cluster_unassigned_shards detector" - type = number - default = -1 -} - -# pending_tasks detector - -variable "pending_tasks_max_delay" { - description = "Enforce max delay for pending_tasks detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "pending_tasks_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "pending_tasks_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "pending_tasks_disabled" { - description = "Disable all alerting rules for pending_tasks detector" - type = bool - default = null -} - -variable "pending_tasks_disabled_critical" { - description = "Disable critical alerting rule for pending_tasks detector" - type = bool - default = null -} - -variable "pending_tasks_disabled_major" { - description = "Disable major alerting rule for pending_tasks detector" - type = bool - default = null -} - -variable "pending_tasks_notifications" { - description = "Notification recipients list per severity overridden for pending_tasks detector" - type = map(list(string)) - default = {} -} - -variable 
"pending_tasks_aggregation_function" { - description = "Aggregation function and group by for pending_tasks detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - -variable "pending_tasks_transformation_function" { - description = "Transformation function for pending_tasks detector (i.e. \".mean(over='5m')\")" - type = string - default = ".min(over='15m')" -} - -variable "pending_tasks_threshold_critical" { - description = "Critical threshold for pending_tasks detector" - type = number - default = 5 -} - -variable "pending_tasks_threshold_major" { - description = "Major threshold for pending_tasks detector" - type = number - default = 0 -} - -# Jvm_heap_memory_usage detector - -variable "jvm_heap_memory_usage_max_delay" { - description = "Enforce max delay for jvm_heap_memory_usage detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "jvm_heap_memory_usage_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "jvm_heap_memory_usage_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "jvm_heap_memory_usage_disabled" { - description = "Disable all alerting rules for jvm_heap_memory_usage detector" - type = bool - default = null -} - -variable "jvm_heap_memory_usage_disabled_critical" { - description = "Disable critical alerting rule for jvm_heap_memory_usage detector" - type = bool - default = null -} - -variable "jvm_heap_memory_usage_disabled_major" { - description = "Disable major alerting rule for jvm_heap_memory_usage detector" - type = bool - default = null -} - -variable "jvm_heap_memory_usage_notifications" { - description = "Notification recipients list per severity overridden for jvm_heap_memory_usage detector" - type = map(list(string)) - default = {} -} - -variable 
"jvm_heap_memory_usage_aggregation_function" { - description = "Aggregation function and group by for jvm_heap_memory_usage detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - -variable "jvm_heap_memory_usage_transformation_function" { - description = "Transformation function for jvm_heap_memory_usage detector (i.e. \".mean(over='5m')\")" - type = string - default = ".mean(over='5m')" -} - -variable "jvm_heap_memory_usage_threshold_critical" { - description = "Critical threshold for jvm_heap_memory_usage detector" - type = number - default = 90 -} - -variable "jvm_heap_memory_usage_threshold_major" { - description = "Major threshold for jvm_heap_memory_usage detector" - type = number - default = 80 -} - -# cpu_usage detector - -variable "cpu_usage_max_delay" { - description = "Enforce max delay for cpu_usage detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "cpu_usage_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "cpu_usage_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "cpu_usage_disabled" { - description = "Disable all alerting rules for cpu_usage detector" - type = bool - default = null -} - -variable "cpu_usage_disabled_critical" { - description = "Disable critical alerting rule for cpu_usage detector" - type = bool - default = null -} - -variable "cpu_usage_disabled_major" { - description = "Disable major alerting rule for cpu_usage detector" - type = bool - default = null -} - -variable "cpu_usage_notifications" { - description = "Notification recipients list per severity overridden for cpu_usage detector" - type = map(list(string)) - default = {} -} - -variable "cpu_usage_aggregation_function" { - description = "Aggregation function and group by for cpu_usage detector 
(i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - -variable "cpu_usage_transformation_function" { - description = "Transformation function for cpu_usage detector (i.e. \".mean(over='5m')\")" - type = string - default = ".min(over='30m')" -} - -variable "cpu_usage_threshold_critical" { - description = "Critical threshold for cpu_usage detector" - type = number - default = 95 -} - -variable "cpu_usage_threshold_major" { - description = "Major threshold for cpu_usage detector" - type = number - default = 85 -} - -# file_descriptors detector - -variable "file_descriptors_max_delay" { - description = "Enforce max delay for file_descriptors detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "file_descriptors_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "file_descriptors_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "file_descriptors_disabled" { - description = "Disable all alerting rules for file_descriptors detector" - type = bool - default = null -} - -variable "file_descriptors_disabled_critical" { - description = "Disable critical alerting rule for file_descriptors detector" - type = bool - default = null -} - -variable "file_descriptors_disabled_major" { - description = "Disable major alerting rule for file_descriptors detector" - type = bool - default = null -} - -variable "file_descriptors_notifications" { - description = "Notification recipients list per severity overridden for file_descriptors detector" - type = map(list(string)) - default = {} -} - -variable "file_descriptors_aggregation_function" { - description = "Aggregation function and group by for file_descriptors detector (i.e. 
\".mean(by=['host'])\")" - type = string - default = "" -} - -variable "file_descriptors_transformation_function" { - description = "Transformation function for file_descriptors detector (i.e. \".mean(over='5m')\")" - type = string - default = ".max(over='15m')" -} - -variable "file_descriptors_threshold_critical" { - description = "Critical threshold for file_descriptors detector" - type = number - default = 95 -} - -variable "file_descriptors_threshold_major" { - description = "Major threshold for file_descriptors detector" - type = number - default = 90 -} - -# Jvm_memory_young_usage detector - -variable "jvm_memory_young_usage_max_delay" { - description = "Enforce max delay for jvm_memory_young_usage detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "jvm_memory_young_usage_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "jvm_memory_young_usage_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "jvm_memory_young_usage_disabled" { - description = "Disable all alerting rules for jvm_memory_young_usage detector" - type = bool - default = null -} - -variable "jvm_memory_young_usage_disabled_major" { - description = "Disable major alerting rule for jvm_memory_young_usage detector" - type = bool - default = null -} - -variable "jvm_memory_young_usage_disabled_minor" { - description = "Disable minor alerting rule for jvm_memory_young_usage detector" - type = bool - default = null -} - -variable "jvm_memory_young_usage_notifications" { - description = "Notification recipients list per severity overridden for jvm_memory_young_usage detector" - type = map(list(string)) - default = {} -} - -variable "jvm_memory_young_usage_aggregation_function" { - description = "Aggregation function and group by for 
jvm_memory_young_usage detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - -variable "jvm_memory_young_usage_transformation_function" { - description = "Transformation function for jvm_memory_young_usage detector (i.e. \".mean(over='5m')\")" - type = string - default = ".mean(over='10m')" -} - -variable "jvm_memory_young_usage_threshold_major" { - description = "major threshold for jvm_memory_young_usage detector" - type = number - default = 90 -} - -variable "jvm_memory_young_usage_threshold_minor" { - description = "minor threshold for jvm_memory_young_usage detector" - type = number - default = 80 -} - -# Jvm_memory_old_usage detector - -variable "jvm_memory_old_usage_max_delay" { - description = "Enforce max delay for jvm_memory_old_usage detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "jvm_memory_old_usage_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "jvm_memory_old_usage_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "jvm_memory_old_usage_disabled" { - description = "Disable all alerting rules for jvm_memory_old_usage detector" - type = bool - default = null -} - -variable "jvm_memory_old_usage_disabled_major" { - description = "Disable major alerting rule for jvm_memory_old_usage detector" - type = bool - default = null -} - -variable "jvm_memory_old_usage_disabled_minor" { - description = "Disable minor alerting rule for jvm_memory_old_usage detector" - type = bool - default = null -} - -variable "jvm_memory_old_usage_notifications" { - description = "Notification recipients list per severity overridden for jvm_memory_old_usage detector" - type = map(list(string)) - default = {} -} - -variable "jvm_memory_old_usage_aggregation_function" { - description = "Aggregation 
function and group by for jvm_memory_old_usage detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - -variable "jvm_memory_old_usage_transformation_function" { - description = "Transformation function for jvm_memory_old_usage detector (i.e. \".mean(over='5m')\")" - type = string - default = ".mean(over='10m')" -} - -variable "jvm_memory_old_usage_threshold_major" { - description = "major threshold for jvm_memory_old_usage detector" - type = number - default = 90 -} - -variable "jvm_memory_old_usage_threshold_minor" { - description = "minor threshold for jvm_memory_old_usage detector" - type = number - default = 80 -} - -# Jvm_gc_old_collection_latency detector - -variable "jvm_gc_old_collection_latency_max_delay" { - description = "Enforce max delay for jvm_gc_old_collection_latency detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "jvm_gc_old_collection_latency_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "jvm_gc_old_collection_latency_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "jvm_gc_old_collection_latency_disabled" { - description = "Disable all alerting rules for jvm_gc_old_collection_latency detector" - type = bool - default = null -} - -variable "jvm_gc_old_collection_latency_disabled_major" { - description = "Disable major alerting rule for jvm_gc_old_collection_latency detector" - type = bool - default = null -} - -variable "jvm_gc_old_collection_latency_disabled_minor" { - description = "Disable minor alerting rule for jvm_gc_old_collection_latency detector" - type = bool - default = null -} - -variable "jvm_gc_old_collection_latency_notifications" { - description = "Notification recipients list per severity overridden for jvm_gc_old_collection_latency detector" - 
type = map(list(string)) - default = {} -} - -variable "jvm_gc_old_collection_latency_aggregation_function" { - description = "Aggregation function and group by for jvm_gc_old_collection_latency detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - -variable "jvm_gc_old_collection_latency_transformation_function" { - description = "Transformation function for jvm_gc_old_collection_latency detector (i.e. \".mean(over='5m')\")" - type = string - default = ".mean(over='15m')" -} - -variable "jvm_gc_old_collection_latency_threshold_major" { - description = "major threshold for jvm_gc_old_collection_latency detector" - type = number - default = 300 -} - -variable "jvm_gc_old_collection_latency_threshold_minor" { - description = "minor threshold for jvm_gc_old_collection_latency detector" - type = number - default = 200 -} - -# Jvm_gc_young_collection_latency detector - -variable "jvm_gc_young_collection_latency_max_delay" { - description = "Enforce max delay for jvm_gc_young_collection_latency detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "jvm_gc_young_collection_latency_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "jvm_gc_young_collection_latency_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "jvm_gc_young_collection_latency_disabled" { - description = "Disable all alerting rules for jvm_gc_young_collection_latency detector" - type = bool - default = null -} - -variable "jvm_gc_young_collection_latency_disabled_major" { - description = "Disable major alerting rule for jvm_gc_young_collection_latency detector" - type = bool - default = null -} - -variable "jvm_gc_young_collection_latency_disabled_minor" { - description = "Disable minor alerting rule for 
jvm_gc_young_collection_latency detector" - type = bool - default = null -} - -variable "jvm_gc_young_collection_latency_notifications" { - description = "Notification recipients list per severity overridden for jvm_gc_young_collection_latency detector" - type = map(list(string)) - default = {} -} - -variable "jvm_gc_young_collection_latency_aggregation_function" { - description = "Aggregation function and group by for jvm_gc_young_collection_latency detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - -variable "jvm_gc_young_collection_latency_transformation_function" { - description = "Transformation function for jvm_gc_young_collection_latency detector (i.e. \".mean(over='5m')\")" - type = string - default = ".mean(over='15m')" -} - -variable "jvm_gc_young_collection_latency_threshold_major" { - description = "major threshold for jvm_gc_young_collection_latency detector" - type = number - default = 40 -} - -variable "jvm_gc_young_collection_latency_threshold_minor" { - description = "minor threshold for jvm_gc_young_collection_latency detector" - type = number - default = 20 -} - -# Indexing_latency detector - -variable "indexing_latency_max_delay" { - description = "Enforce max delay for indexing_latency detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "indexing_latency_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "indexing_latency_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "indexing_latency_disabled" { - description = "Disable all alerting rules for indexing_latency detector" - type = bool - default = null -} - -variable "indexing_latency_disabled_major" { - description = "Disable major alerting rule for indexing_latency detector" - type = bool - default = null -} - 
-variable "indexing_latency_disabled_minor" { - description = "Disable minor alerting rule for indexing_latency detector" - type = bool - default = null -} - -variable "indexing_latency_notifications" { - description = "Notification recipients list per severity overridden for indexing_latency detector" - type = map(list(string)) - default = {} -} - -variable "indexing_latency_aggregation_function" { - description = "Aggregation function and group by for indexing_latency detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - -variable "indexing_latency_transformation_function" { - description = "Transformation function for indexing_latency detector (i.e. \".mean(over='5m')\")" - type = string - default = ".mean(over='15m')" -} - -variable "indexing_latency_threshold_major" { - description = "major threshold for indexing_latency detector" - type = number - default = 30 -} - -variable "indexing_latency_threshold_minor" { - description = "minor threshold for indexing_latency detector" - type = number - default = 15 -} - -# Flush_latency detector - -variable "flush_latency_max_delay" { - description = "Enforce max delay for flush_latency detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "flush_latency_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "flush_latency_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "flush_latency_disabled" { - description = "Disable all alerting rules for flush_latency detector" - type = bool - default = null -} - -variable "flush_latency_disabled_major" { - description = "Disable major alerting rule for flush_latency detector" - type = bool - default = null -} - -variable "flush_latency_disabled_minor" { - description = "Disable minor alerting rule for 
flush_latency detector" - type = bool - default = null -} - -variable "flush_latency_notifications" { - description = "Notification recipients list per severity overridden for flush_latency detector" - type = map(list(string)) - default = {} -} - -variable "flush_latency_aggregation_function" { - description = "Aggregation function and group by for flush_latency detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - -variable "flush_latency_transformation_function" { - description = "Transformation function for flush_latency detector (i.e. \".mean(over='5m')\")" - type = string - default = ".mean(over='15m')" -} - -variable "flush_latency_threshold_major" { - description = "major threshold for flush_latency detector" - type = number - default = 150 -} - -variable "flush_latency_threshold_minor" { - description = "minor threshold for flush_latency detector" - type = number - default = 100 -} - -# Search_latency detector - -variable "search_latency_max_delay" { - description = "Enforce max delay for search_latency detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "search_latency_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "search_latency_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "search_latency_disabled" { - description = "Disable all alerting rules for search_latency detector" - type = bool - default = null -} - -variable "search_latency_disabled_major" { - description = "Disable major alerting rule for search_latency detector" - type = bool - default = null -} - -variable "search_latency_disabled_minor" { - description = "Disable minor alerting rule for search_latency detector" - type = bool - default = null -} - -variable "search_latency_notifications" { - description = 
"Notification recipients list per severity overridden for search_latency detector" - type = map(list(string)) - default = {} -} - -variable "search_latency_aggregation_function" { - description = "Aggregation function and group by for search_latency detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - -variable "search_latency_transformation_function" { - description = "Transformation function for search_latency detector (i.e. \".mean(over='5m')\")" - type = string - default = ".min(over='30m')" -} - -variable "search_latency_threshold_major" { - description = "major threshold for search_latency detector" - type = number - default = 20 -} - -variable "search_latency_threshold_minor" { - description = "minor threshold for search_latency detector" - type = number - default = 10 -} - -# Fetch_latency detector - -variable "fetch_latency_max_delay" { - description = "Enforce max delay for fetch_latency detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "fetch_latency_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "fetch_latency_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "fetch_latency_disabled" { - description = "Disable all alerting rules for fetch_latency detector" - type = bool - default = null -} - -variable "fetch_latency_disabled_major" { - description = "Disable major alerting rule for fetch_latency detector" - type = bool - default = null -} - -variable "fetch_latency_disabled_minor" { - description = "Disable minor alerting rule for fetch_latency detector" - type = bool - default = null -} - -variable "fetch_latency_notifications" { - description = "Notification recipients list per severity overridden for fetch_latency detector" - type = map(list(string)) - default = {} -} - 
-variable "fetch_latency_aggregation_function" { - description = "Aggregation function and group by for fetch_latency detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - -variable "fetch_latency_transformation_function" { - description = "Transformation function for fetch_latency detector (i.e. \".mean(over='5m')\")" - type = string - default = ".min(over='15m')" -} - -variable "fetch_latency_threshold_major" { - description = "major threshold for fetch_latency detector" - type = number - default = 20 -} - -variable "fetch_latency_threshold_minor" { - description = "minor threshold for fetch_latency detector" - type = number - default = 10 -} - -# Field_data_evictions_change detector - -variable "field_data_evictions_change_max_delay" { - description = "Enforce max delay for field_data_evictions_change detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "field_data_evictions_change_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "field_data_evictions_change_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "field_data_evictions_change_disabled" { - description = "Disable all alerting rules for field_data_evictions_change detector" - type = bool - default = null -} - -variable "field_data_evictions_change_disabled_major" { - description = "Disable major alerting rule for field_data_evictions_change detector" - type = bool - default = null -} - -variable "field_data_evictions_change_disabled_minor" { - description = "Disable minor alerting rule for field_data_evictions_change detector" - type = bool - default = null -} - -variable "field_data_evictions_change_notifications" { - description = "Notification recipients list per severity overridden for field_data_evictions_change detector" - 
type = map(list(string)) - default = {} -} - -variable "field_data_evictions_change_aggregation_function" { - description = "Aggregation function and group by for field_data_evictions_change detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - -variable "field_data_evictions_change_transformation_function" { - description = "Transformation function for field_data_evictions_change detector (i.e. \".mean(over='5m')\")" - type = string - default = ".mean(over='15m')" -} - -variable "field_data_evictions_change_threshold_major" { - description = "major threshold for field_data_evictions_change detector" - type = number - default = 120 -} - -variable "field_data_evictions_change_threshold_minor" { - description = "minor threshold for field_data_evictions_change detector" - type = number - default = 60 -} - -# Task_time_in_queue_change detector - -variable "task_time_in_queue_change_max_delay" { - description = "Enforce max delay for task_time_in_queue_change detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "task_time_in_queue_change_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "task_time_in_queue_change_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "task_time_in_queue_change_disabled" { - description = "Disable all alerting rules for task_time_in_queue_change detector" - type = bool - default = null -} - -variable "task_time_in_queue_change_disabled_major" { - description = "Disable major alerting rule for task_time_in_queue_change detector" - type = bool - default = null -} - -variable "task_time_in_queue_change_disabled_minor" { - description = "Disable minor alerting rule for task_time_in_queue_change detector" - type = bool - default = null -} - -variable 
"task_time_in_queue_change_notifications" { - description = "Notification recipients list per severity overridden for task_time_in_queue_change detector" - type = map(list(string)) - default = {} -} - -variable "task_time_in_queue_change_aggregation_function" { - description = "Aggregation function and group by for task_time_in_queue_change detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - -variable "task_time_in_queue_change_transformation_function" { - description = "Transformation function for task_time_in_queue_change detector (i.e. \".mean(over='5m')\")" - type = string - default = ".mean(over='15m')" -} - -variable "task_time_in_queue_change_threshold_major" { - description = "major threshold for task_time_in_queue_change detector" - type = number - default = 200 -} - -variable "task_time_in_queue_change_threshold_minor" { - description = "minor threshold for task_time_in_queue_change detector" - type = number - default = 100 -} - From f876839169a23786a27a77c3231f88c77d84dca3 Mon Sep 17 00:00:00 2001 From: Gauthier AMPE Date: Tue, 9 Jan 2024 18:10:17 +0100 Subject: [PATCH 2/6] Move detector to generated --- .../conf/00-heartbeat.yaml | 12 + .../conf/01-cluster-status.yaml | 17 + .../conf/02-cluster_initializing_shards.yaml | 18 + .../conf/03-cluster_relocating_shards.yaml | 18 + .../conf/04-cluster_unassigned_shards.yaml | 18 + .../conf/05-cluster_pending_tasks.yaml | 18 + .../conf/06-cluster_cpu_usage.yaml | 19 + .../conf/07-cluster_file_descriptor.yaml | 24 + .../conf/08-cluster_JVM_heap_memory.yaml | 19 + .../09-cluster_JVM_memory_young_usage.yaml | 24 + .../conf/10-cluster_JVM_memory_old_usage.yaml | 24 + ...eneration_garbage_collections_latency.yaml | 26 + ...eneration_garbage_collections_latency.yaml | 26 + .../conf/13-cluster_indexing_latency.yaml | 26 + .../conf/14-cluster_flush_latency.yaml | 26 + .../conf/15-cluster_search_latency.yaml | 26 + .../conf/16-cluster_fetch_latency.yaml | 26 + 
...luster_fielddata_cache_evictions_rate.yaml | 22 + .../conf/18-cluster_time_in_queue_change.yaml | 20 + .../detectors-gen.tf | 780 ++++++++ .../variables-gen.tf | 1664 +++++++++++++++++ 21 files changed, 2853 insertions(+) create mode 100644 modules/smart-agent_elasticsearch/conf/00-heartbeat.yaml create mode 100644 modules/smart-agent_elasticsearch/conf/01-cluster-status.yaml create mode 100644 modules/smart-agent_elasticsearch/conf/02-cluster_initializing_shards.yaml create mode 100644 modules/smart-agent_elasticsearch/conf/03-cluster_relocating_shards.yaml create mode 100644 modules/smart-agent_elasticsearch/conf/04-cluster_unassigned_shards.yaml create mode 100644 modules/smart-agent_elasticsearch/conf/05-cluster_pending_tasks.yaml create mode 100644 modules/smart-agent_elasticsearch/conf/06-cluster_cpu_usage.yaml create mode 100644 modules/smart-agent_elasticsearch/conf/07-cluster_file_descriptor.yaml create mode 100644 modules/smart-agent_elasticsearch/conf/08-cluster_JVM_heap_memory.yaml create mode 100644 modules/smart-agent_elasticsearch/conf/09-cluster_JVM_memory_young_usage.yaml create mode 100644 modules/smart-agent_elasticsearch/conf/10-cluster_JVM_memory_old_usage.yaml create mode 100644 modules/smart-agent_elasticsearch/conf/11-cluster_old-generation_garbage_collections_latency.yaml create mode 100644 modules/smart-agent_elasticsearch/conf/12-cluster_young-generation_garbage_collections_latency.yaml create mode 100644 modules/smart-agent_elasticsearch/conf/13-cluster_indexing_latency.yaml create mode 100644 modules/smart-agent_elasticsearch/conf/14-cluster_flush_latency.yaml create mode 100644 modules/smart-agent_elasticsearch/conf/15-cluster_search_latency.yaml create mode 100644 modules/smart-agent_elasticsearch/conf/16-cluster_fetch_latency.yaml create mode 100644 modules/smart-agent_elasticsearch/conf/17-cluster_fielddata_cache_evictions_rate.yaml create mode 100644 modules/smart-agent_elasticsearch/conf/18-cluster_time_in_queue_change.yaml 
create mode 100644 modules/smart-agent_elasticsearch/detectors-gen.tf create mode 100644 modules/smart-agent_elasticsearch/variables-gen.tf diff --git a/modules/smart-agent_elasticsearch/conf/00-heartbeat.yaml b/modules/smart-agent_elasticsearch/conf/00-heartbeat.yaml new file mode 100644 index 000000000..e4e662a69 --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/00-heartbeat.yaml @@ -0,0 +1,12 @@ +module: Elasticsearch +name: heartbeat + +transformation: false +aggregation: true +exclude_not_running_vm: true + +signals: + signal: + metric: elasticsearch.cluster.number-of-nodes +rules: + critical: \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/01-cluster-status.yaml b/modules/smart-agent_elasticsearch/conf/01-cluster-status.yaml new file mode 100644 index 000000000..eb7d9cd94 --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/01-cluster-status.yaml @@ -0,0 +1,17 @@ +module: ElasticSearch +name: "cluster status" +aggregation: true +transformation: ".mean(over='5m')" +signals: + signal: + metric: "elasticsearch.cluster.status" +rules: + critical: + threshold: 1 + comparator: "==" + description: "is red" + major: + threshold: 2 + comparator: "==" + dependency: critical + description: "is yellow" diff --git a/modules/smart-agent_elasticsearch/conf/02-cluster_initializing_shards.yaml b/modules/smart-agent_elasticsearch/conf/02-cluster_initializing_shards.yaml new file mode 100644 index 000000000..4e0d26f11 --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/02-cluster_initializing_shards.yaml @@ -0,0 +1,18 @@ +module: ElasticSearch +name: "cluster initializing shards" +aggregation: true +transformation: ".min(over='15m')" +signals: + signal: + metric: "elasticsearch.cluster.initializing-shards" + rollup: average +rules: + critical: + threshold: 0 + comparator: ">" + description: "is too high" + major: + threshold: -1 + comparator: ">" + dependency: critical + description: "is too high" diff --git 
a/modules/smart-agent_elasticsearch/conf/03-cluster_relocating_shards.yaml b/modules/smart-agent_elasticsearch/conf/03-cluster_relocating_shards.yaml new file mode 100644 index 000000000..ba88e652d --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/03-cluster_relocating_shards.yaml @@ -0,0 +1,18 @@ +module: ElasticSearch +name: "cluster relocating shards" +aggregation: true +transformation: ".min(over='15m')" +signals: + signal: + metric: "elasticsearch.cluster.relocating-shards" + rollup: average +rules: + critical: + threshold: 1 + comparator: ">" + description: "is too high" + major: + threshold: 0 + comparator: ">" + dependency: critical + description: "is too high" diff --git a/modules/smart-agent_elasticsearch/conf/04-cluster_unassigned_shards.yaml b/modules/smart-agent_elasticsearch/conf/04-cluster_unassigned_shards.yaml new file mode 100644 index 000000000..2bb6203ae --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/04-cluster_unassigned_shards.yaml @@ -0,0 +1,18 @@ +module: ElasticSearch +name: "cluster unassigned shards" +aggregation: true +transformation: ".min(over='10m')" +signals: + signal: + metric: "elasticsearch.cluster.unassigned-shards" + rollup: average +rules: + critical: + threshold: 0 + comparator: ">" + description: "is too high" + major: + threshold: -1 + comparator: ">" + dependency: critical + description: "is too high" diff --git a/modules/smart-agent_elasticsearch/conf/05-cluster_pending_tasks.yaml b/modules/smart-agent_elasticsearch/conf/05-cluster_pending_tasks.yaml new file mode 100644 index 000000000..128fd855a --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/05-cluster_pending_tasks.yaml @@ -0,0 +1,18 @@ +module: ElasticSearch +name: "cluster pending tasks" +aggregation: true +transformation: ".min(over='15m')" +signals: + signal: + metric: "elasticsearch.cluster.pending-tasks" + rollup: average +rules: + critical: + threshold: 5 + comparator: ">" + description: "are too high" + major: + threshold: 0 + 
comparator: ">" + dependency: critical + description: "are too high" diff --git a/modules/smart-agent_elasticsearch/conf/06-cluster_cpu_usage.yaml b/modules/smart-agent_elasticsearch/conf/06-cluster_cpu_usage.yaml new file mode 100644 index 000000000..4869ffb98 --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/06-cluster_cpu_usage.yaml @@ -0,0 +1,19 @@ +module: ElasticSearch +name: "cpu usage" +aggregation: true +transformation: ".min(over='30m')" +filtering: "filter('node_name', '*')" +signals: + signal: + metric: "elasticsearch.process.cpu.percent" + rollup: average +rules: + critical: + threshold: 95 + comparator: ">" + description: "is too high" + major: + threshold: 85 + comparator: ">" + dependency: critical + description: "is too high" diff --git a/modules/smart-agent_elasticsearch/conf/07-cluster_file_descriptor.yaml b/modules/smart-agent_elasticsearch/conf/07-cluster_file_descriptor.yaml new file mode 100644 index 000000000..bcd222bb6 --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/07-cluster_file_descriptor.yaml @@ -0,0 +1,24 @@ +module: ElasticSearch +name: "file descriptors usage" +aggregation: true +transformation: ".max(over='15m')" +filtering: "filter('node_name', '*')" +signals: + A: + metric: "elasticsearch.process.open_file_descriptors" + rollup: average + B: + metric: "elasticsearch.process.max_file_descriptors" + rollup: average + signal: + formula: "(A/B).scale(100)" +rules: + critical: + threshold: 95 + comparator: ">" + description: "is too high" + major: + threshold: 90 + comparator: ">" + dependency: critical + description: "is too high" diff --git a/modules/smart-agent_elasticsearch/conf/08-cluster_JVM_heap_memory.yaml b/modules/smart-agent_elasticsearch/conf/08-cluster_JVM_heap_memory.yaml new file mode 100644 index 000000000..946276c3f --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/08-cluster_JVM_heap_memory.yaml @@ -0,0 +1,19 @@ +module: ElasticSearch +name: "JVM heap memory usage" +aggregation: true 
+transformation: ".mean(over='5m')" +filtering: "filter('node_name', '*')" +signals: + signal: + metric: "elasticsearch.jvm.mem.heap-used-percent" + rollup: average +rules: + critical: + threshold: 90 + comparator: ">" + description: "is too high" + major: + threshold: 80 + comparator: ">" + dependency: critical + description: "is too high" diff --git a/modules/smart-agent_elasticsearch/conf/09-cluster_JVM_memory_young_usage.yaml b/modules/smart-agent_elasticsearch/conf/09-cluster_JVM_memory_young_usage.yaml new file mode 100644 index 000000000..f8940b635 --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/09-cluster_JVM_memory_young_usage.yaml @@ -0,0 +1,24 @@ +module: ElasticSearch +name: "JVM memory young usage" +aggregation: true +transformation: ".mean(over='10m')" +filtering: "filter('node_name', '*')" +signals: + A: + metric: "elasticsearch.jvm.mem.pools.young.used_in_bytes" + rollup: average + B: + metric: "elasticsearch.jvm.mem.pools.young.max_in_bytes" + rollup: average + signal: + formula: "(A/B).fill(0).scale(100)" +rules: + major: + threshold: 90 + comparator: ">" + description: "is too high" + minor: + threshold: 80 + comparator: ">" + description: "is too high" + dependency: major diff --git a/modules/smart-agent_elasticsearch/conf/10-cluster_JVM_memory_old_usage.yaml b/modules/smart-agent_elasticsearch/conf/10-cluster_JVM_memory_old_usage.yaml new file mode 100644 index 000000000..7be566329 --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/10-cluster_JVM_memory_old_usage.yaml @@ -0,0 +1,24 @@ +module: ElasticSearch +name: "JVM memory old usage" +aggregation: true +transformation: ".mean(over='10m')" +filtering: "filter('node_name', '*')" +signals: + A: + metric: "elasticsearch.jvm.mem.pools.old.used_in_bytes" + rollup: average + B: + metric: "elasticsearch.jvm.mem.pools.old.max_in_bytes" + rollup: average + signal: + formula: "(A/B).fill(0).scale(100)" +rules: + major: + threshold: 90 + comparator: ">" + description: "is too high" + 
minor: + threshold: 80 + comparator: ">" + description: "is too high" + dependency: major diff --git a/modules/smart-agent_elasticsearch/conf/11-cluster_old-generation_garbage_collections_latency.yaml b/modules/smart-agent_elasticsearch/conf/11-cluster_old-generation_garbage_collections_latency.yaml new file mode 100644 index 000000000..71c35f2f2 --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/11-cluster_old-generation_garbage_collections_latency.yaml @@ -0,0 +1,26 @@ +module: ElasticSearch +name: "old-generation garbage collections latency" +aggregation: true +transformation: ".mean(over='15m')" +filtering: "filter('node_name', '*')" +signals: + A: + metric: "elasticsearch.jvm.gc.old-time" + extrapolation: zero + rollup: delta + B: + metric: "elasticsearch.jvm.gc.old-count" + extrapolation: zero + rollup: delta + signal: + formula: "(A/B).fill(0)" +rules: + major: + threshold: 300 + comparator: ">" + description: "is too high" + minor: + threshold: 200 + comparator: ">" + description: "is too high" + dependency: major diff --git a/modules/smart-agent_elasticsearch/conf/12-cluster_young-generation_garbage_collections_latency.yaml b/modules/smart-agent_elasticsearch/conf/12-cluster_young-generation_garbage_collections_latency.yaml new file mode 100644 index 000000000..92b778786 --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/12-cluster_young-generation_garbage_collections_latency.yaml @@ -0,0 +1,26 @@ +module: ElasticSearch +name: "young-generation garbage collections latency" +aggregation: true +transformation: ".mean(over='15m')" +filtering: "filter('node_name', '*')" +signals: + A: + metric: "elasticsearch.jvm.gc.time" + extrapolation: zero + rollup: delta + B: + metric: "elasticsearch.jvm.gc.count" + extrapolation: zero + rollup: delta + signal: + formula: "(A/B).fill(0)" +rules: + major: + threshold: 40 + comparator: ">" + description: "is too high" + minor: + threshold: 20 + comparator: ">" + description: "is too high" + dependency: major 
diff --git a/modules/smart-agent_elasticsearch/conf/13-cluster_indexing_latency.yaml b/modules/smart-agent_elasticsearch/conf/13-cluster_indexing_latency.yaml new file mode 100644 index 000000000..a72ab1cce --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/13-cluster_indexing_latency.yaml @@ -0,0 +1,26 @@ +module: ElasticSearch +name: "indexing latency" +aggregation: true +transformation: ".mean(over='15m')" +filtering: "filter('node_name', '*')" +signals: + A: + metric: "elasticsearch.indices.indexing.index-time" + extrapolation: zero + rollup: delta + B: + metric: "elasticsearch.indices.indexing.index-total" + extrapolation: zero + rollup: delta + signal: + formula: "(A/B).fill(0)" +rules: + major: + threshold: 30 + comparator: ">" + description: "is too high" + minor: + threshold: 15 + comparator: ">" + description: "is too high" + dependency: major diff --git a/modules/smart-agent_elasticsearch/conf/14-cluster_flush_latency.yaml b/modules/smart-agent_elasticsearch/conf/14-cluster_flush_latency.yaml new file mode 100644 index 000000000..5b342e70e --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/14-cluster_flush_latency.yaml @@ -0,0 +1,26 @@ +module: ElasticSearch +name: "index flushing to disk latency" +aggregation: true +transformation: ".mean(over='15m')" +filtering: "filter('node_name', '*')" +signals: + A: + metric: "elasticsearch.indices.flush.total-time" + extrapolation: zero + rollup: delta + B: + metric: "elasticsearch.indices.flush.total" + extrapolation: zero + rollup: delta + signal: + formula: "(A/B).fill(0)" +rules: + major: + threshold: 150 + comparator: ">" + description: "is too high" + minor: + threshold: 100 + comparator: ">" + description: "is too high" + dependency: major diff --git a/modules/smart-agent_elasticsearch/conf/15-cluster_search_latency.yaml b/modules/smart-agent_elasticsearch/conf/15-cluster_search_latency.yaml new file mode 100644 index 000000000..52ec8afe2 --- /dev/null +++ 
b/modules/smart-agent_elasticsearch/conf/15-cluster_search_latency.yaml @@ -0,0 +1,26 @@ +module: ElasticSearch +name: "search query latency" +aggregation: true +transformation: ".min(over='30m')" +filtering: "filter('node_name', '*')" +signals: + A: + metric: "elasticsearch.indices.search.query-time" + extrapolation: zero + rollup: delta + B: + metric: "elasticsearch.indices.search.query-total" + extrapolation: zero + rollup: delta + signal: + formula: "(A/B).fill(0)" +rules: + major: + threshold: 20 + comparator: ">" + description: "is too high" + minor: + threshold: 10 + comparator: ">" + description: "is too high" + dependency: major diff --git a/modules/smart-agent_elasticsearch/conf/16-cluster_fetch_latency.yaml b/modules/smart-agent_elasticsearch/conf/16-cluster_fetch_latency.yaml new file mode 100644 index 000000000..a5a21f406 --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/16-cluster_fetch_latency.yaml @@ -0,0 +1,26 @@ +module: ElasticSearch +name: "search fetch latency" +aggregation: true +transformation: ".min(over='15m')" +filtering: "filter('node_name', '*')" +signals: + A: + metric: "elasticsearch.indices.search.fetch-time" + extrapolation: zero + rollup: delta + B: + metric: "elasticsearch.indices.search.fetch-total" + extrapolation: zero + rollup: delta + signal: + formula: "(A/B).fill(0)" +rules: + major: + threshold: 20 + comparator: ">" + description: "is too high" + minor: + threshold: 10 + comparator: ">" + description: "is too high" + dependency: major diff --git a/modules/smart-agent_elasticsearch/conf/17-cluster_fielddata_cache_evictions_rate.yaml b/modules/smart-agent_elasticsearch/conf/17-cluster_fielddata_cache_evictions_rate.yaml new file mode 100644 index 000000000..e7a3fb5cb --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/17-cluster_fielddata_cache_evictions_rate.yaml @@ -0,0 +1,22 @@ +module: ElasticSearch +name: "fielddata cache evictions rate of change" +aggregation: true +transformation: ".mean(over='15m')" 
+filtering: "filter('node_name', '*')" +signals: + A: + metric: "elasticsearch.indices.fielddata.evictions" + extrapolation: zero + rollup: delta + signal: + formula: A.rateofchange() +rules: + major: + threshold: 120 + comparator: ">" + description: "is too high" + minor: + threshold: 60 + comparator: ">" + description: "is too high" + dependency: major diff --git a/modules/smart-agent_elasticsearch/conf/18-cluster_time_in_queue_change.yaml b/modules/smart-agent_elasticsearch/conf/18-cluster_time_in_queue_change.yaml new file mode 100644 index 000000000..01d451399 --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/18-cluster_time_in_queue_change.yaml @@ -0,0 +1,20 @@ +module: ElasticSearch +name: "max time spent by task in queue rate of change" +aggregation: true +transformation: ".mean(over='15m')" +signals: + A: + metric: "elasticsearch.cluster.task-max-wait-time" + rollup: average + signal: + formula: A.rateofchange() +rules: + major: + threshold: 200 + comparator: ">" + description: "is too high" + minor: + threshold: 100 + comparator: ">" + description: "is too high" + dependency: major diff --git a/modules/smart-agent_elasticsearch/detectors-gen.tf b/modules/smart-agent_elasticsearch/detectors-gen.tf new file mode 100644 index 000000000..dd192f000 --- /dev/null +++ b/modules/smart-agent_elasticsearch/detectors-gen.tf @@ -0,0 +1,780 @@ +resource "signalfx_detector" "heartbeat" { + name = format("%s %s", local.detector_name_prefix, "Elasticsearch heartbeat") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + from signalfx.detectors.not_reporting import not_reporting + signal = data('elasticsearch.cluster.number-of-nodes', filter=${local.not_running_vm_filters} and ${module.filtering.signalflow})${var.heartbeat_aggregation_function}.publish('signal') + 
not_reporting.detector(stream=signal, resource_identifier=None, duration='${var.heartbeat_timeframe}', auto_resolve_after='${local.heartbeat_auto_resolve_after}').publish('CRIT') +EOF + + rule { + description = "has not reported in ${var.heartbeat_timeframe}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.heartbeat_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.heartbeat_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.heartbeat_runbook_url, var.runbook_url), "") + tip = var.heartbeat_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject_novalue : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + max_delay = var.heartbeat_max_delay +} + +resource "signalfx_detector" "cluster_status" { + name = format("%s %s", local.detector_name_prefix, "ElasticSearch cluster status") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + signal = data('elasticsearch.cluster.status', filter=${module.filtering.signalflow})${var.cluster_status_aggregation_function}${var.cluster_status_transformation_function}.publish('signal') + detect(when(signal == ${var.cluster_status_threshold_critical}, lasting=%{if var.cluster_status_lasting_duration_critical == null}None%{else}'${var.cluster_status_lasting_duration_critical}'%{endif}, at_least=${var.cluster_status_at_least_percentage_critical})).publish('CRIT') + detect(when(signal == ${var.cluster_status_threshold_major}, lasting=%{if var.cluster_status_lasting_duration_major == null}None%{else}'${var.cluster_status_lasting_duration_major}'%{endif}, at_least=${var.cluster_status_at_least_percentage_major}) and (not when(signal == ${var.cluster_status_threshold_critical}, 
lasting=%{if var.cluster_status_lasting_duration_critical == null}None%{else}'${var.cluster_status_lasting_duration_critical}'%{endif}, at_least=${var.cluster_status_at_least_percentage_critical}))).publish('MAJOR') +EOF + + rule { + description = "is red == ${var.cluster_status_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.cluster_status_disabled_critical, var.cluster_status_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.cluster_status_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.cluster_status_runbook_url, var.runbook_url), "") + tip = var.cluster_status_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is yellow == ${var.cluster_status_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.cluster_status_disabled_major, var.cluster_status_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.cluster_status_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.cluster_status_runbook_url, var.runbook_url), "") + tip = var.cluster_status_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.cluster_status_max_delay +} + +resource "signalfx_detector" "cluster_initializing_shards" { + name = format("%s %s", local.detector_name_prefix, "ElasticSearch cluster initializing shards") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + signal = data('elasticsearch.cluster.initializing-shards', filter=${module.filtering.signalflow}, rollup='average')${var.cluster_initializing_shards_aggregation_function}${var.cluster_initializing_shards_transformation_function}.publish('signal') + detect(when(signal > ${var.cluster_initializing_shards_threshold_critical}, lasting=%{if var.cluster_initializing_shards_lasting_duration_critical == null}None%{else}'${var.cluster_initializing_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_initializing_shards_at_least_percentage_critical})).publish('CRIT') + detect(when(signal > ${var.cluster_initializing_shards_threshold_major}, lasting=%{if var.cluster_initializing_shards_lasting_duration_major == null}None%{else}'${var.cluster_initializing_shards_lasting_duration_major}'%{endif}, at_least=${var.cluster_initializing_shards_at_least_percentage_major}) and (not when(signal > ${var.cluster_initializing_shards_threshold_critical}, lasting=%{if var.cluster_initializing_shards_lasting_duration_critical == null}None%{else}'${var.cluster_initializing_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_initializing_shards_at_least_percentage_critical}))).publish('MAJOR') +EOF + + rule { + description = "is too high > ${var.cluster_initializing_shards_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.cluster_initializing_shards_disabled_critical, var.cluster_initializing_shards_disabled, var.detectors_disabled) + notifications 
= try(coalescelist(lookup(var.cluster_initializing_shards_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.cluster_initializing_shards_runbook_url, var.runbook_url), "") + tip = var.cluster_initializing_shards_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.cluster_initializing_shards_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.cluster_initializing_shards_disabled_major, var.cluster_initializing_shards_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.cluster_initializing_shards_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.cluster_initializing_shards_runbook_url, var.runbook_url), "") + tip = var.cluster_initializing_shards_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.cluster_initializing_shards_max_delay +} + +resource "signalfx_detector" "cluster_relocating_shards" { + name = format("%s %s", local.detector_name_prefix, "ElasticSearch cluster relocating shards") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + signal = data('elasticsearch.cluster.relocating-shards', filter=${module.filtering.signalflow}, rollup='average')${var.cluster_relocating_shards_aggregation_function}${var.cluster_relocating_shards_transformation_function}.publish('signal') + detect(when(signal > ${var.cluster_relocating_shards_threshold_critical}, lasting=%{if var.cluster_relocating_shards_lasting_duration_critical == null}None%{else}'${var.cluster_relocating_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_relocating_shards_at_least_percentage_critical})).publish('CRIT') + detect(when(signal > ${var.cluster_relocating_shards_threshold_major}, lasting=%{if var.cluster_relocating_shards_lasting_duration_major == null}None%{else}'${var.cluster_relocating_shards_lasting_duration_major}'%{endif}, at_least=${var.cluster_relocating_shards_at_least_percentage_major}) and (not when(signal > ${var.cluster_relocating_shards_threshold_critical}, lasting=%{if var.cluster_relocating_shards_lasting_duration_critical == null}None%{else}'${var.cluster_relocating_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_relocating_shards_at_least_percentage_critical}))).publish('MAJOR') +EOF + + rule { + description = "is too high > ${var.cluster_relocating_shards_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.cluster_relocating_shards_disabled_critical, var.cluster_relocating_shards_disabled, var.detectors_disabled) + notifications = 
try(coalescelist(lookup(var.cluster_relocating_shards_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.cluster_relocating_shards_runbook_url, var.runbook_url), "") + tip = var.cluster_relocating_shards_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.cluster_relocating_shards_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.cluster_relocating_shards_disabled_major, var.cluster_relocating_shards_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.cluster_relocating_shards_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.cluster_relocating_shards_runbook_url, var.runbook_url), "") + tip = var.cluster_relocating_shards_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.cluster_relocating_shards_max_delay +} + +resource "signalfx_detector" "cluster_unassigned_shards" { + name = format("%s %s", local.detector_name_prefix, "ElasticSearch cluster unassigned shards") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + signal = data('elasticsearch.cluster.unassigned-shards', filter=${module.filtering.signalflow}, rollup='average')${var.cluster_unassigned_shards_aggregation_function}${var.cluster_unassigned_shards_transformation_function}.publish('signal') + detect(when(signal > ${var.cluster_unassigned_shards_threshold_critical}, lasting=%{if var.cluster_unassigned_shards_lasting_duration_critical == null}None%{else}'${var.cluster_unassigned_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_unassigned_shards_at_least_percentage_critical})).publish('CRIT') + detect(when(signal > ${var.cluster_unassigned_shards_threshold_major}, lasting=%{if var.cluster_unassigned_shards_lasting_duration_major == null}None%{else}'${var.cluster_unassigned_shards_lasting_duration_major}'%{endif}, at_least=${var.cluster_unassigned_shards_at_least_percentage_major}) and (not when(signal > ${var.cluster_unassigned_shards_threshold_critical}, lasting=%{if var.cluster_unassigned_shards_lasting_duration_critical == null}None%{else}'${var.cluster_unassigned_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_unassigned_shards_at_least_percentage_critical}))).publish('MAJOR') +EOF + + rule { + description = "is too high > ${var.cluster_unassigned_shards_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.cluster_unassigned_shards_disabled_critical, var.cluster_unassigned_shards_disabled, var.detectors_disabled) + notifications = 
try(coalescelist(lookup(var.cluster_unassigned_shards_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.cluster_unassigned_shards_runbook_url, var.runbook_url), "") + tip = var.cluster_unassigned_shards_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.cluster_unassigned_shards_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.cluster_unassigned_shards_disabled_major, var.cluster_unassigned_shards_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.cluster_unassigned_shards_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.cluster_unassigned_shards_runbook_url, var.runbook_url), "") + tip = var.cluster_unassigned_shards_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.cluster_unassigned_shards_max_delay +} + +resource "signalfx_detector" "cluster_pending_tasks" { + name = format("%s %s", local.detector_name_prefix, "ElasticSearch cluster pending tasks") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + signal = data('elasticsearch.cluster.pending-tasks', filter=${module.filtering.signalflow}, rollup='average')${var.cluster_pending_tasks_aggregation_function}${var.cluster_pending_tasks_transformation_function}.publish('signal') + detect(when(signal > ${var.cluster_pending_tasks_threshold_critical}, lasting=%{if var.cluster_pending_tasks_lasting_duration_critical == null}None%{else}'${var.cluster_pending_tasks_lasting_duration_critical}'%{endif}, at_least=${var.cluster_pending_tasks_at_least_percentage_critical})).publish('CRIT') + detect(when(signal > ${var.cluster_pending_tasks_threshold_major}, lasting=%{if var.cluster_pending_tasks_lasting_duration_major == null}None%{else}'${var.cluster_pending_tasks_lasting_duration_major}'%{endif}, at_least=${var.cluster_pending_tasks_at_least_percentage_major}) and (not when(signal > ${var.cluster_pending_tasks_threshold_critical}, lasting=%{if var.cluster_pending_tasks_lasting_duration_critical == null}None%{else}'${var.cluster_pending_tasks_lasting_duration_critical}'%{endif}, at_least=${var.cluster_pending_tasks_at_least_percentage_critical}))).publish('MAJOR') +EOF + + rule { + description = "are too high > ${var.cluster_pending_tasks_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.cluster_pending_tasks_disabled_critical, var.cluster_pending_tasks_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.cluster_pending_tasks_notifications, "critical", []), 
var.notifications.critical), null) + runbook_url = try(coalesce(var.cluster_pending_tasks_runbook_url, var.runbook_url), "") + tip = var.cluster_pending_tasks_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "are too high > ${var.cluster_pending_tasks_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.cluster_pending_tasks_disabled_major, var.cluster_pending_tasks_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.cluster_pending_tasks_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.cluster_pending_tasks_runbook_url, var.runbook_url), "") + tip = var.cluster_pending_tasks_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.cluster_pending_tasks_max_delay +} + +resource "signalfx_detector" "cpu_usage" { + name = format("%s %s", local.detector_name_prefix, "ElasticSearch cpu usage") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + base_filtering = filter('node_name', '*') + signal = data('elasticsearch.process.cpu.percent', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.cpu_usage_aggregation_function}${var.cpu_usage_transformation_function}.publish('signal') + detect(when(signal > ${var.cpu_usage_threshold_critical}, lasting=%{if var.cpu_usage_lasting_duration_critical == null}None%{else}'${var.cpu_usage_lasting_duration_critical}'%{endif}, at_least=${var.cpu_usage_at_least_percentage_critical})).publish('CRIT') + detect(when(signal > ${var.cpu_usage_threshold_major}, lasting=%{if var.cpu_usage_lasting_duration_major == null}None%{else}'${var.cpu_usage_lasting_duration_major}'%{endif}, at_least=${var.cpu_usage_at_least_percentage_major}) and (not when(signal > ${var.cpu_usage_threshold_critical}, lasting=%{if var.cpu_usage_lasting_duration_critical == null}None%{else}'${var.cpu_usage_lasting_duration_critical}'%{endif}, at_least=${var.cpu_usage_at_least_percentage_critical}))).publish('MAJOR') +EOF + + rule { + description = "is too high > ${var.cpu_usage_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.cpu_usage_disabled_critical, var.cpu_usage_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.cpu_usage_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.cpu_usage_runbook_url, var.runbook_url), "") + tip = var.cpu_usage_tip + parameterized_subject = var.message_subject == "" ? 
local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  max_delay = var.cpu_usage_max_delay
}

# Node file descriptor exhaustion: percentage of used file descriptors
# (open / max * 100) per node, alerting Critical then Major above the
# configured thresholds.
# NOTE(review): unlike the other ratio-based detectors in this file, this
# signal has no .fill(0) before .scale(100) — confirm the generator intends
# that difference.
resource "signalfx_detector" "file_descriptors_usage" {
  name = format("%s %s", local.detector_name_prefix, "ElasticSearch file descriptors usage")

  authorized_writer_teams = var.authorized_writer_teams
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))

  program_text = <<-EOF
    base_filtering = filter('node_name', '*')
    A = data('elasticsearch.process.open_file_descriptors', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.file_descriptors_usage_aggregation_function}${var.file_descriptors_usage_transformation_function}
    B = data('elasticsearch.process.max_file_descriptors', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.file_descriptors_usage_aggregation_function}${var.file_descriptors_usage_transformation_function}
    signal = (A/B).scale(100).publish('signal')
    detect(when(signal > ${var.file_descriptors_usage_threshold_critical}, lasting=%{if var.file_descriptors_usage_lasting_duration_critical == null}None%{else}'${var.file_descriptors_usage_lasting_duration_critical}'%{endif}, at_least=${var.file_descriptors_usage_at_least_percentage_critical})).publish('CRIT')
    detect(when(signal > ${var.file_descriptors_usage_threshold_major}, lasting=%{if var.file_descriptors_usage_lasting_duration_major == null}None%{else}'${var.file_descriptors_usage_lasting_duration_major}'%{endif}, at_least=${var.file_descriptors_usage_at_least_percentage_major}) and (not when(signal > ${var.file_descriptors_usage_threshold_critical}, lasting=%{if var.file_descriptors_usage_lasting_duration_critical == null}None%{else}'${var.file_descriptors_usage_lasting_duration_critical}'%{endif}, at_least=${var.file_descriptors_usage_at_least_percentage_critical}))).publish('MAJOR')
EOF

  rule {
    description           = "is too high > ${var.file_descriptors_usage_threshold_critical}"
    severity              = "Critical"
    detect_label          = "CRIT"
    disabled              = coalesce(var.file_descriptors_usage_disabled_critical, var.file_descriptors_usage_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.file_descriptors_usage_notifications, "critical", []), var.notifications.critical), null)
    runbook_url           = try(coalesce(var.file_descriptors_usage_runbook_url, var.runbook_url), "")
    tip                   = var.file_descriptors_usage_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  rule {
    description           = "is too high > ${var.file_descriptors_usage_threshold_major}"
    severity              = "Major"
    detect_label          = "MAJOR"
    disabled              = coalesce(var.file_descriptors_usage_disabled_major, var.file_descriptors_usage_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.file_descriptors_usage_notifications, "major", []), var.notifications.major), null)
    runbook_url           = try(coalesce(var.file_descriptors_usage_runbook_url, var.runbook_url), "")
    tip                   = var.file_descriptors_usage_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  max_delay = var.file_descriptors_usage_max_delay
}

# JVM heap pressure: the percentage reported directly by
# 'elasticsearch.jvm.mem.heap-used-percent' per node (no ratio computed here).
resource "signalfx_detector" "jvm_heap_memory_usage" {
  name = format("%s %s", local.detector_name_prefix, "ElasticSearch jvm heap memory usage")

  authorized_writer_teams = var.authorized_writer_teams
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))

  program_text = <<-EOF
    base_filtering = filter('node_name', '*')
    signal = data('elasticsearch.jvm.mem.heap-used-percent', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.jvm_heap_memory_usage_aggregation_function}${var.jvm_heap_memory_usage_transformation_function}.publish('signal')
    detect(when(signal > ${var.jvm_heap_memory_usage_threshold_critical}, lasting=%{if var.jvm_heap_memory_usage_lasting_duration_critical == null}None%{else}'${var.jvm_heap_memory_usage_lasting_duration_critical}'%{endif}, at_least=${var.jvm_heap_memory_usage_at_least_percentage_critical})).publish('CRIT')
    detect(when(signal > ${var.jvm_heap_memory_usage_threshold_major}, lasting=%{if var.jvm_heap_memory_usage_lasting_duration_major == null}None%{else}'${var.jvm_heap_memory_usage_lasting_duration_major}'%{endif}, at_least=${var.jvm_heap_memory_usage_at_least_percentage_major}) and (not when(signal > ${var.jvm_heap_memory_usage_threshold_critical}, lasting=%{if var.jvm_heap_memory_usage_lasting_duration_critical == null}None%{else}'${var.jvm_heap_memory_usage_lasting_duration_critical}'%{endif}, at_least=${var.jvm_heap_memory_usage_at_least_percentage_critical}))).publish('MAJOR')
EOF

  rule {
    description           = "is too high > ${var.jvm_heap_memory_usage_threshold_critical}"
    severity              = "Critical"
    detect_label          = "CRIT"
    disabled              = coalesce(var.jvm_heap_memory_usage_disabled_critical, 
var.jvm_heap_memory_usage_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.jvm_heap_memory_usage_notifications, "critical", []), var.notifications.critical), null)
    runbook_url           = try(coalesce(var.jvm_heap_memory_usage_runbook_url, var.runbook_url), "")
    tip                   = var.jvm_heap_memory_usage_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  rule {
    description           = "is too high > ${var.jvm_heap_memory_usage_threshold_major}"
    severity              = "Major"
    detect_label          = "MAJOR"
    disabled              = coalesce(var.jvm_heap_memory_usage_disabled_major, var.jvm_heap_memory_usage_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.jvm_heap_memory_usage_notifications, "major", []), var.notifications.major), null)
    runbook_url           = try(coalesce(var.jvm_heap_memory_usage_runbook_url, var.runbook_url), "")
    tip                   = var.jvm_heap_memory_usage_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  max_delay = var.jvm_heap_memory_usage_max_delay
}

# JVM young-generation pool usage: used / max * 100 per node; missing
# datapoints are treated as 0 via .fill(0). Major/Minor severities only.
resource "signalfx_detector" "jvm_memory_young_usage" {
  name = format("%s %s", local.detector_name_prefix, "ElasticSearch jvm memory young usage")

  authorized_writer_teams = var.authorized_writer_teams
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))

  program_text = <<-EOF
    base_filtering = filter('node_name', '*')
    A = data('elasticsearch.jvm.mem.pools.young.used_in_bytes', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_young_usage_aggregation_function}${var.jvm_memory_young_usage_transformation_function}
    B = data('elasticsearch.jvm.mem.pools.young.max_in_bytes', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_young_usage_aggregation_function}${var.jvm_memory_young_usage_transformation_function}
    signal = (A/B).fill(0).scale(100).publish('signal')
    detect(when(signal > ${var.jvm_memory_young_usage_threshold_major}, lasting=%{if var.jvm_memory_young_usage_lasting_duration_major == null}None%{else}'${var.jvm_memory_young_usage_lasting_duration_major}'%{endif}, at_least=${var.jvm_memory_young_usage_at_least_percentage_major})).publish('MAJOR')
    detect(when(signal > ${var.jvm_memory_young_usage_threshold_minor}, lasting=%{if var.jvm_memory_young_usage_lasting_duration_minor == null}None%{else}'${var.jvm_memory_young_usage_lasting_duration_minor}'%{endif}, at_least=${var.jvm_memory_young_usage_at_least_percentage_minor}) and (not when(signal > ${var.jvm_memory_young_usage_threshold_major}, lasting=%{if var.jvm_memory_young_usage_lasting_duration_major == null}None%{else}'${var.jvm_memory_young_usage_lasting_duration_major}'%{endif}, at_least=${var.jvm_memory_young_usage_at_least_percentage_major}))).publish('MINOR')
EOF

  rule {
    description           = "is too high > ${var.jvm_memory_young_usage_threshold_major}"
    severity              = "Major"
    detect_label          = "MAJOR"
    disabled              = coalesce(var.jvm_memory_young_usage_disabled_major, var.jvm_memory_young_usage_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.jvm_memory_young_usage_notifications, "major", []), var.notifications.major), null)
    runbook_url           = try(coalesce(var.jvm_memory_young_usage_runbook_url, var.runbook_url), "")
    tip                   = var.jvm_memory_young_usage_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  rule {
    description           = "is too high > ${var.jvm_memory_young_usage_threshold_minor}"
    severity              = "Minor"
    detect_label          = "MINOR"
    disabled              = coalesce(var.jvm_memory_young_usage_disabled_minor, var.jvm_memory_young_usage_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.jvm_memory_young_usage_notifications, "minor", []), var.notifications.minor), null)
    runbook_url           = try(coalesce(var.jvm_memory_young_usage_runbook_url, var.runbook_url), "")
    tip                   = var.jvm_memory_young_usage_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  max_delay = var.jvm_memory_young_usage_max_delay
}

# JVM old-generation pool usage: used / max * 100 per node, .fill(0) applied.
resource "signalfx_detector" "jvm_memory_old_usage" {
  name = format("%s %s", local.detector_name_prefix, "ElasticSearch jvm memory old usage")

  authorized_writer_teams = var.authorized_writer_teams
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))

  program_text = <<-EOF
    base_filtering = filter('node_name', '*')
    A = data('elasticsearch.jvm.mem.pools.old.used_in_bytes', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_old_usage_aggregation_function}${var.jvm_memory_old_usage_transformation_function}
    B = data('elasticsearch.jvm.mem.pools.old.max_in_bytes', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_old_usage_aggregation_function}${var.jvm_memory_old_usage_transformation_function}
    signal = (A/B).fill(0).scale(100).publish('signal')
    detect(when(signal > ${var.jvm_memory_old_usage_threshold_major}, lasting=%{if var.jvm_memory_old_usage_lasting_duration_major == null}None%{else}'${var.jvm_memory_old_usage_lasting_duration_major}'%{endif}, at_least=${var.jvm_memory_old_usage_at_least_percentage_major})).publish('MAJOR')
    detect(when(signal > ${var.jvm_memory_old_usage_threshold_minor}, lasting=%{if var.jvm_memory_old_usage_lasting_duration_minor == null}None%{else}'${var.jvm_memory_old_usage_lasting_duration_minor}'%{endif}, at_least=${var.jvm_memory_old_usage_at_least_percentage_minor}) and (not when(signal > ${var.jvm_memory_old_usage_threshold_major}, lasting=%{if var.jvm_memory_old_usage_lasting_duration_major == null}None%{else}'${var.jvm_memory_old_usage_lasting_duration_major}'%{endif}, at_least=${var.jvm_memory_old_usage_at_least_percentage_major}))).publish('MINOR')
EOF

  rule {
    description           = "is too high > ${var.jvm_memory_old_usage_threshold_major}"
    severity              = "Major"
    detect_label          = "MAJOR"
    disabled              = coalesce(var.jvm_memory_old_usage_disabled_major, var.jvm_memory_old_usage_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.jvm_memory_old_usage_notifications, "major", []), var.notifications.major), null)
    runbook_url           = try(coalesce(var.jvm_memory_old_usage_runbook_url, var.runbook_url), "")
    tip                   = var.jvm_memory_old_usage_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  rule {
    description           = "is too high > ${var.jvm_memory_old_usage_threshold_minor}"
    severity              = "Minor"
    detect_label          = "MINOR"
    disabled              = coalesce(var.jvm_memory_old_usage_disabled_minor, var.jvm_memory_old_usage_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.jvm_memory_old_usage_notifications, "minor", []), var.notifications.minor), null)
    runbook_url           = try(coalesce(var.jvm_memory_old_usage_runbook_url, var.runbook_url), "")
    tip                   = var.jvm_memory_old_usage_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  max_delay = var.jvm_memory_old_usage_max_delay
}

# Average old-generation GC pause: delta GC time over delta GC count per
# window, .fill(0) when no collections occurred.
# NOTE(review): the hyphen in this resource/variable prefix
# ("old-generation_...") is valid HCL but unusual — confirm it matches the
# variable names the generator emitted alongside this file.
resource "signalfx_detector" "old-generation_garbage_collections_latency" {
  name = format("%s %s", local.detector_name_prefix, "ElasticSearch old-generation garbage collections latency")

  authorized_writer_teams = var.authorized_writer_teams
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))

  program_text = <<-EOF
    base_filtering = filter('node_name', '*')
    A = data('elasticsearch.jvm.gc.old-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.old-generation_garbage_collections_latency_aggregation_function}${var.old-generation_garbage_collections_latency_transformation_function}
    B = data('elasticsearch.jvm.gc.old-count', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.old-generation_garbage_collections_latency_aggregation_function}${var.old-generation_garbage_collections_latency_transformation_function}
    signal = (A/B).fill(0).publish('signal')
    detect(when(signal > ${var.old-generation_garbage_collections_latency_threshold_major}, lasting=%{if var.old-generation_garbage_collections_latency_lasting_duration_major == null}None%{else}'${var.old-generation_garbage_collections_latency_lasting_duration_major}'%{endif}, at_least=${var.old-generation_garbage_collections_latency_at_least_percentage_major})).publish('MAJOR')
    detect(when(signal > ${var.old-generation_garbage_collections_latency_threshold_minor}, lasting=%{if var.old-generation_garbage_collections_latency_lasting_duration_minor == null}None%{else}'${var.old-generation_garbage_collections_latency_lasting_duration_minor}'%{endif}, at_least=${var.old-generation_garbage_collections_latency_at_least_percentage_minor}) and (not when(signal > ${var.old-generation_garbage_collections_latency_threshold_major}, lasting=%{if var.old-generation_garbage_collections_latency_lasting_duration_major == null}None%{else}'${var.old-generation_garbage_collections_latency_lasting_duration_major}'%{endif}, at_least=${var.old-generation_garbage_collections_latency_at_least_percentage_major}))).publish('MINOR')
EOF

  rule {
    description           = "is too high > ${var.old-generation_garbage_collections_latency_threshold_major}"
    severity              = "Major"
    detect_label          = "MAJOR"
    disabled              = coalesce(var.old-generation_garbage_collections_latency_disabled_major, var.old-generation_garbage_collections_latency_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.old-generation_garbage_collections_latency_notifications, "major", []), var.notifications.major), null)
    runbook_url           = try(coalesce(var.old-generation_garbage_collections_latency_runbook_url, var.runbook_url), "")
    tip                   = var.old-generation_garbage_collections_latency_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  rule {
    description           = "is too high > ${var.old-generation_garbage_collections_latency_threshold_minor}"
    severity              = "Minor"
    detect_label          = "MINOR"
    disabled              = coalesce(var.old-generation_garbage_collections_latency_disabled_minor, var.old-generation_garbage_collections_latency_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.old-generation_garbage_collections_latency_notifications, "minor", []), var.notifications.minor), null)
    runbook_url           = try(coalesce(var.old-generation_garbage_collections_latency_runbook_url, var.runbook_url), "")
    tip                   = var.old-generation_garbage_collections_latency_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  max_delay = var.old-generation_garbage_collections_latency_max_delay
}

# Average young-generation GC pause: delta 'jvm.gc.time' over delta
# 'jvm.gc.count' per node, .fill(0) when no collections occurred.
resource "signalfx_detector" "young-generation_garbage_collections_latency" {
  name = format("%s %s", local.detector_name_prefix, "ElasticSearch young-generation garbage collections latency")

  authorized_writer_teams = var.authorized_writer_teams
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))

  program_text = <<-EOF
    base_filtering = filter('node_name', '*')
    A = data('elasticsearch.jvm.gc.time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.young-generation_garbage_collections_latency_aggregation_function}${var.young-generation_garbage_collections_latency_transformation_function}
    B = data('elasticsearch.jvm.gc.count', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.young-generation_garbage_collections_latency_aggregation_function}${var.young-generation_garbage_collections_latency_transformation_function}
    signal = (A/B).fill(0).publish('signal')
    detect(when(signal > ${var.young-generation_garbage_collections_latency_threshold_major}, lasting=%{if var.young-generation_garbage_collections_latency_lasting_duration_major == null}None%{else}'${var.young-generation_garbage_collections_latency_lasting_duration_major}'%{endif}, at_least=${var.young-generation_garbage_collections_latency_at_least_percentage_major})).publish('MAJOR')
    detect(when(signal > ${var.young-generation_garbage_collections_latency_threshold_minor}, lasting=%{if var.young-generation_garbage_collections_latency_lasting_duration_minor == null}None%{else}'${var.young-generation_garbage_collections_latency_lasting_duration_minor}'%{endif}, at_least=${var.young-generation_garbage_collections_latency_at_least_percentage_minor}) and (not when(signal > 
${var.young-generation_garbage_collections_latency_threshold_major}, lasting=%{if var.young-generation_garbage_collections_latency_lasting_duration_major == null}None%{else}'${var.young-generation_garbage_collections_latency_lasting_duration_major}'%{endif}, at_least=${var.young-generation_garbage_collections_latency_at_least_percentage_major}))).publish('MINOR')
EOF

  rule {
    description           = "is too high > ${var.young-generation_garbage_collections_latency_threshold_major}"
    severity              = "Major"
    detect_label          = "MAJOR"
    disabled              = coalesce(var.young-generation_garbage_collections_latency_disabled_major, var.young-generation_garbage_collections_latency_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.young-generation_garbage_collections_latency_notifications, "major", []), var.notifications.major), null)
    runbook_url           = try(coalesce(var.young-generation_garbage_collections_latency_runbook_url, var.runbook_url), "")
    tip                   = var.young-generation_garbage_collections_latency_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  rule {
    description           = "is too high > ${var.young-generation_garbage_collections_latency_threshold_minor}"
    severity              = "Minor"
    detect_label          = "MINOR"
    disabled              = coalesce(var.young-generation_garbage_collections_latency_disabled_minor, var.young-generation_garbage_collections_latency_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.young-generation_garbage_collections_latency_notifications, "minor", []), var.notifications.minor), null)
    runbook_url           = try(coalesce(var.young-generation_garbage_collections_latency_runbook_url, var.runbook_url), "")
    tip                   = var.young-generation_garbage_collections_latency_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  max_delay = var.young-generation_garbage_collections_latency_max_delay
}

# Average time per indexing operation: delta index-time over delta
# index-total per node, .fill(0) when no indexing occurred.
resource "signalfx_detector" "indexing_latency" {
  name = format("%s %s", local.detector_name_prefix, "ElasticSearch indexing latency")

  authorized_writer_teams = var.authorized_writer_teams
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))

  program_text = <<-EOF
    base_filtering = filter('node_name', '*')
    A = data('elasticsearch.indices.indexing.index-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.indexing_latency_aggregation_function}${var.indexing_latency_transformation_function}
    B = data('elasticsearch.indices.indexing.index-total', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.indexing_latency_aggregation_function}${var.indexing_latency_transformation_function}
    signal = (A/B).fill(0).publish('signal')
    detect(when(signal > ${var.indexing_latency_threshold_major}, lasting=%{if var.indexing_latency_lasting_duration_major == null}None%{else}'${var.indexing_latency_lasting_duration_major}'%{endif}, at_least=${var.indexing_latency_at_least_percentage_major})).publish('MAJOR')
    detect(when(signal > ${var.indexing_latency_threshold_minor}, lasting=%{if var.indexing_latency_lasting_duration_minor == null}None%{else}'${var.indexing_latency_lasting_duration_minor}'%{endif}, at_least=${var.indexing_latency_at_least_percentage_minor}) and (not when(signal > ${var.indexing_latency_threshold_major}, lasting=%{if var.indexing_latency_lasting_duration_major == null}None%{else}'${var.indexing_latency_lasting_duration_major}'%{endif}, at_least=${var.indexing_latency_at_least_percentage_major}))).publish('MINOR')
EOF

  rule {
    description           = "is too high > ${var.indexing_latency_threshold_major}"
    severity              = "Major"
    detect_label          = "MAJOR"
    disabled              = coalesce(var.indexing_latency_disabled_major, var.indexing_latency_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.indexing_latency_notifications, "major", []), var.notifications.major), null)
    runbook_url           = try(coalesce(var.indexing_latency_runbook_url, var.runbook_url), "")
    tip                   = var.indexing_latency_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  rule {
    description           = "is too high > ${var.indexing_latency_threshold_minor}"
    severity              = "Minor"
    detect_label          = "MINOR"
    disabled              = coalesce(var.indexing_latency_disabled_minor, var.indexing_latency_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.indexing_latency_notifications, "minor", []), var.notifications.minor), null)
    runbook_url           = try(coalesce(var.indexing_latency_runbook_url, var.runbook_url), "")
    tip                   = var.indexing_latency_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  max_delay = var.indexing_latency_max_delay
}

# Average flush-to-disk time: delta flush.total-time over delta flush.total
# per node, .fill(0) when no flushes occurred.
resource "signalfx_detector" "index_flushing_to_disk_latency" {
  name = format("%s %s", local.detector_name_prefix, "ElasticSearch index flushing to disk latency")

  authorized_writer_teams = var.authorized_writer_teams
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))

  program_text = <<-EOF
    base_filtering = filter('node_name', '*')
    A = data('elasticsearch.indices.flush.total-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.index_flushing_to_disk_latency_aggregation_function}${var.index_flushing_to_disk_latency_transformation_function}
    B = data('elasticsearch.indices.flush.total', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.index_flushing_to_disk_latency_aggregation_function}${var.index_flushing_to_disk_latency_transformation_function}
    signal = (A/B).fill(0).publish('signal')
    detect(when(signal > ${var.index_flushing_to_disk_latency_threshold_major}, lasting=%{if var.index_flushing_to_disk_latency_lasting_duration_major == null}None%{else}'${var.index_flushing_to_disk_latency_lasting_duration_major}'%{endif}, at_least=${var.index_flushing_to_disk_latency_at_least_percentage_major})).publish('MAJOR')
    detect(when(signal > ${var.index_flushing_to_disk_latency_threshold_minor}, lasting=%{if var.index_flushing_to_disk_latency_lasting_duration_minor == null}None%{else}'${var.index_flushing_to_disk_latency_lasting_duration_minor}'%{endif}, at_least=${var.index_flushing_to_disk_latency_at_least_percentage_minor}) and (not when(signal > ${var.index_flushing_to_disk_latency_threshold_major}, lasting=%{if var.index_flushing_to_disk_latency_lasting_duration_major == null}None%{else}'${var.index_flushing_to_disk_latency_lasting_duration_major}'%{endif}, 
at_least=${var.index_flushing_to_disk_latency_at_least_percentage_major}))).publish('MINOR')
EOF

  rule {
    description           = "is too high > ${var.index_flushing_to_disk_latency_threshold_major}"
    severity              = "Major"
    detect_label          = "MAJOR"
    disabled              = coalesce(var.index_flushing_to_disk_latency_disabled_major, var.index_flushing_to_disk_latency_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.index_flushing_to_disk_latency_notifications, "major", []), var.notifications.major), null)
    runbook_url           = try(coalesce(var.index_flushing_to_disk_latency_runbook_url, var.runbook_url), "")
    tip                   = var.index_flushing_to_disk_latency_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  rule {
    description           = "is too high > ${var.index_flushing_to_disk_latency_threshold_minor}"
    severity              = "Minor"
    detect_label          = "MINOR"
    disabled              = coalesce(var.index_flushing_to_disk_latency_disabled_minor, var.index_flushing_to_disk_latency_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.index_flushing_to_disk_latency_notifications, "minor", []), var.notifications.minor), null)
    runbook_url           = try(coalesce(var.index_flushing_to_disk_latency_runbook_url, var.runbook_url), "")
    tip                   = var.index_flushing_to_disk_latency_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  max_delay = var.index_flushing_to_disk_latency_max_delay
}

# Average search query time: delta query-time over delta query-total per
# node, .fill(0) when no queries ran.
resource "signalfx_detector" "search_query_latency" {
  name = format("%s %s", local.detector_name_prefix, "ElasticSearch search query latency")

  authorized_writer_teams = var.authorized_writer_teams
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))

  program_text = <<-EOF
    base_filtering = filter('node_name', '*')
    A = data('elasticsearch.indices.search.query-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.search_query_latency_aggregation_function}${var.search_query_latency_transformation_function}
    B = data('elasticsearch.indices.search.query-total', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.search_query_latency_aggregation_function}${var.search_query_latency_transformation_function}
    signal = (A/B).fill(0).publish('signal')
    detect(when(signal > ${var.search_query_latency_threshold_major}, lasting=%{if var.search_query_latency_lasting_duration_major == null}None%{else}'${var.search_query_latency_lasting_duration_major}'%{endif}, at_least=${var.search_query_latency_at_least_percentage_major})).publish('MAJOR')
    detect(when(signal > ${var.search_query_latency_threshold_minor}, lasting=%{if var.search_query_latency_lasting_duration_minor == null}None%{else}'${var.search_query_latency_lasting_duration_minor}'%{endif}, at_least=${var.search_query_latency_at_least_percentage_minor}) and (not when(signal > ${var.search_query_latency_threshold_major}, lasting=%{if var.search_query_latency_lasting_duration_major == null}None%{else}'${var.search_query_latency_lasting_duration_major}'%{endif}, at_least=${var.search_query_latency_at_least_percentage_major}))).publish('MINOR')
EOF

  rule {
    description           = "is too high > ${var.search_query_latency_threshold_major}"
    severity              = "Major"
    detect_label          = "MAJOR"
    disabled              = coalesce(var.search_query_latency_disabled_major, var.search_query_latency_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.search_query_latency_notifications, "major", []), var.notifications.major), null)
    runbook_url           = try(coalesce(var.search_query_latency_runbook_url, var.runbook_url), "")
    tip                   = var.search_query_latency_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  rule {
    description           = "is too high > ${var.search_query_latency_threshold_minor}"
    severity              = "Minor"
    detect_label          = "MINOR"
    disabled              = coalesce(var.search_query_latency_disabled_minor, var.search_query_latency_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.search_query_latency_notifications, "minor", []), var.notifications.minor), null)
    runbook_url           = try(coalesce(var.search_query_latency_runbook_url, var.runbook_url), "")
    tip                   = var.search_query_latency_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  max_delay = var.search_query_latency_max_delay
}

# Average search fetch time: delta fetch-time over delta fetch-total per
# node, .fill(0) when no fetches ran.
resource "signalfx_detector" "search_fetch_latency" {
  name = format("%s %s", local.detector_name_prefix, "ElasticSearch search fetch latency")

  authorized_writer_teams = var.authorized_writer_teams
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))

  program_text = <<-EOF
    base_filtering = filter('node_name', '*')
    A = data('elasticsearch.indices.search.fetch-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.search_fetch_latency_aggregation_function}${var.search_fetch_latency_transformation_function}
    B = data('elasticsearch.indices.search.fetch-total', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.search_fetch_latency_aggregation_function}${var.search_fetch_latency_transformation_function}
    signal = (A/B).fill(0).publish('signal')
    detect(when(signal > ${var.search_fetch_latency_threshold_major}, lasting=%{if var.search_fetch_latency_lasting_duration_major == null}None%{else}'${var.search_fetch_latency_lasting_duration_major}'%{endif}, at_least=${var.search_fetch_latency_at_least_percentage_major})).publish('MAJOR')
    detect(when(signal > ${var.search_fetch_latency_threshold_minor}, lasting=%{if var.search_fetch_latency_lasting_duration_minor == null}None%{else}'${var.search_fetch_latency_lasting_duration_minor}'%{endif}, at_least=${var.search_fetch_latency_at_least_percentage_minor}) and (not when(signal > ${var.search_fetch_latency_threshold_major}, lasting=%{if var.search_fetch_latency_lasting_duration_major == null}None%{else}'${var.search_fetch_latency_lasting_duration_major}'%{endif}, at_least=${var.search_fetch_latency_at_least_percentage_major}))).publish('MINOR')
EOF

  rule {
    description           = "is too high > 
${var.search_fetch_latency_threshold_major}"
    severity              = "Major"
    detect_label          = "MAJOR"
    disabled              = coalesce(var.search_fetch_latency_disabled_major, var.search_fetch_latency_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.search_fetch_latency_notifications, "major", []), var.notifications.major), null)
    runbook_url           = try(coalesce(var.search_fetch_latency_runbook_url, var.runbook_url), "")
    tip                   = var.search_fetch_latency_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  rule {
    description           = "is too high > ${var.search_fetch_latency_threshold_minor}"
    severity              = "Minor"
    detect_label          = "MINOR"
    disabled              = coalesce(var.search_fetch_latency_disabled_minor, var.search_fetch_latency_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.search_fetch_latency_notifications, "minor", []), var.notifications.minor), null)
    runbook_url           = try(coalesce(var.search_fetch_latency_runbook_url, var.runbook_url), "")
    tip                   = var.search_fetch_latency_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  max_delay = var.search_fetch_latency_max_delay
}

# Fielddata cache eviction pressure: rate of change of the per-window delta
# of fielddata evictions per node.
resource "signalfx_detector" "fielddata_cache_evictions_rate_of_change" {
  name = format("%s %s", local.detector_name_prefix, "ElasticSearch fielddata cache evictions rate of change")

  authorized_writer_teams = var.authorized_writer_teams
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))

  program_text = <<-EOF
    base_filtering = filter('node_name', '*')
    A = data('elasticsearch.indices.fielddata.evictions', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.fielddata_cache_evictions_rate_of_change_aggregation_function}${var.fielddata_cache_evictions_rate_of_change_transformation_function}
    signal = A.rateofchange().publish('signal')
    detect(when(signal > ${var.fielddata_cache_evictions_rate_of_change_threshold_major}, lasting=%{if var.fielddata_cache_evictions_rate_of_change_lasting_duration_major == null}None%{else}'${var.fielddata_cache_evictions_rate_of_change_lasting_duration_major}'%{endif}, at_least=${var.fielddata_cache_evictions_rate_of_change_at_least_percentage_major})).publish('MAJOR')
    detect(when(signal > ${var.fielddata_cache_evictions_rate_of_change_threshold_minor}, lasting=%{if var.fielddata_cache_evictions_rate_of_change_lasting_duration_minor == null}None%{else}'${var.fielddata_cache_evictions_rate_of_change_lasting_duration_minor}'%{endif}, at_least=${var.fielddata_cache_evictions_rate_of_change_at_least_percentage_minor}) and (not when(signal > ${var.fielddata_cache_evictions_rate_of_change_threshold_major}, lasting=%{if var.fielddata_cache_evictions_rate_of_change_lasting_duration_major == null}None%{else}'${var.fielddata_cache_evictions_rate_of_change_lasting_duration_major}'%{endif}, at_least=${var.fielddata_cache_evictions_rate_of_change_at_least_percentage_major}))).publish('MINOR')
EOF

  rule {
    description           = "is too high > ${var.fielddata_cache_evictions_rate_of_change_threshold_major}"
    severity              = "Major"
    detect_label          = "MAJOR"
    disabled              = coalesce(var.fielddata_cache_evictions_rate_of_change_disabled_major, var.fielddata_cache_evictions_rate_of_change_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.fielddata_cache_evictions_rate_of_change_notifications, "major", []), var.notifications.major), null)
    runbook_url           = try(coalesce(var.fielddata_cache_evictions_rate_of_change_runbook_url, var.runbook_url), "")
    tip                   = var.fielddata_cache_evictions_rate_of_change_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  rule {
    description           = "is too high > ${var.fielddata_cache_evictions_rate_of_change_threshold_minor}"
    severity              = "Minor"
    detect_label          = "MINOR"
    disabled              = coalesce(var.fielddata_cache_evictions_rate_of_change_disabled_minor, var.fielddata_cache_evictions_rate_of_change_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.fielddata_cache_evictions_rate_of_change_notifications, "minor", []), var.notifications.minor), null)
    runbook_url           = try(coalesce(var.fielddata_cache_evictions_rate_of_change_runbook_url, var.runbook_url), "")
    tip                   = var.fielddata_cache_evictions_rate_of_change_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  max_delay = var.fielddata_cache_evictions_rate_of_change_max_delay
}

# Cluster task queue pressure: rate of change of the max task wait time.
# NOTE(review): no 'node_name' base_filtering here, unlike the node-level
# detectors above — the metric is 'elasticsearch.cluster.task-max-wait-time'
# (cluster-scoped), which appears deliberate; confirm with the generator.
resource "signalfx_detector" "max_time_spent_by_task_in_queue_rate_of_change" {
  name = format("%s %s", local.detector_name_prefix, "ElasticSearch max time spent by task in queue rate of change")

  authorized_writer_teams = var.authorized_writer_teams
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))

  program_text = <<-EOF
    A = data('elasticsearch.cluster.task-max-wait-time', filter=${module.filtering.signalflow}, rollup='average')${var.max_time_spent_by_task_in_queue_rate_of_change_aggregation_function}${var.max_time_spent_by_task_in_queue_rate_of_change_transformation_function}
    signal = A.rateofchange().publish('signal')
    detect(when(signal > ${var.max_time_spent_by_task_in_queue_rate_of_change_threshold_major}, lasting=%{if var.max_time_spent_by_task_in_queue_rate_of_change_lasting_duration_major == null}None%{else}'${var.max_time_spent_by_task_in_queue_rate_of_change_lasting_duration_major}'%{endif}, at_least=${var.max_time_spent_by_task_in_queue_rate_of_change_at_least_percentage_major})).publish('MAJOR')
    detect(when(signal > ${var.max_time_spent_by_task_in_queue_rate_of_change_threshold_minor}, lasting=%{if var.max_time_spent_by_task_in_queue_rate_of_change_lasting_duration_minor == null}None%{else}'${var.max_time_spent_by_task_in_queue_rate_of_change_lasting_duration_minor}'%{endif}, at_least=${var.max_time_spent_by_task_in_queue_rate_of_change_at_least_percentage_minor}) and (not when(signal > ${var.max_time_spent_by_task_in_queue_rate_of_change_threshold_major}, lasting=%{if var.max_time_spent_by_task_in_queue_rate_of_change_lasting_duration_major == null}None%{else}'${var.max_time_spent_by_task_in_queue_rate_of_change_lasting_duration_major}'%{endif}, 
at_least=${var.max_time_spent_by_task_in_queue_rate_of_change_at_least_percentage_major}))).publish('MINOR') +EOF + + rule { + description = "is too high > ${var.max_time_spent_by_task_in_queue_rate_of_change_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.max_time_spent_by_task_in_queue_rate_of_change_disabled_major, var.max_time_spent_by_task_in_queue_rate_of_change_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.max_time_spent_by_task_in_queue_rate_of_change_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.max_time_spent_by_task_in_queue_rate_of_change_runbook_url, var.runbook_url), "") + tip = var.max_time_spent_by_task_in_queue_rate_of_change_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.max_time_spent_by_task_in_queue_rate_of_change_threshold_minor}" + severity = "Minor" + detect_label = "MINOR" + disabled = coalesce(var.max_time_spent_by_task_in_queue_rate_of_change_disabled_minor, var.max_time_spent_by_task_in_queue_rate_of_change_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.max_time_spent_by_task_in_queue_rate_of_change_notifications, "minor", []), var.notifications.minor), null) + runbook_url = try(coalesce(var.max_time_spent_by_task_in_queue_rate_of_change_runbook_url, var.runbook_url), "") + tip = var.max_time_spent_by_task_in_queue_rate_of_change_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.max_time_spent_by_task_in_queue_rate_of_change_max_delay +} + diff --git a/modules/smart-agent_elasticsearch/variables-gen.tf b/modules/smart-agent_elasticsearch/variables-gen.tf new file mode 100644 index 000000000..29ddb9969 --- /dev/null +++ b/modules/smart-agent_elasticsearch/variables-gen.tf @@ -0,0 +1,1664 @@ +# heartbeat detector + +variable "heartbeat_notifications" { + description = "Notification recipients list per severity overridden for heartbeat detector" + type = map(list(string)) + default = {} +} + +variable "heartbeat_aggregation_function" { + description = "Aggregation function and group by for heartbeat detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "heartbeat_max_delay" { + description = "Enforce max delay for heartbeat detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = 900 +} + +variable "heartbeat_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "heartbeat_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "heartbeat_disabled" { + description = "Disable all alerting rules for heartbeat detector" + type = bool + default = null +} + +variable "heartbeat_timeframe" { + description = "Timeframe for heartbeat detector (i.e. \"10m\")" + type = string + default = "10m" +} + +# cluster_status detector + +variable "cluster_status_notifications" { + description = "Notification recipients list per severity overridden for cluster_status detector" + type = map(list(string)) + default = {} +} + +variable "cluster_status_aggregation_function" { + description = "Aggregation function and group by for cluster_status detector (i.e. 
\".mean(by=['host'])\")" + type = string + default = "" +} + +variable "cluster_status_transformation_function" { + description = "Transformation function for cluster_status detector (i.e. \".mean(over='5m')\")" + type = string + default = ".mean(over='5m')" +} + +variable "cluster_status_max_delay" { + description = "Enforce max delay for cluster_status detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "cluster_status_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "cluster_status_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "cluster_status_disabled" { + description = "Disable all alerting rules for cluster_status detector" + type = bool + default = null +} + +variable "cluster_status_disabled_critical" { + description = "Disable critical alerting rule for cluster_status detector" + type = bool + default = null +} + +variable "cluster_status_disabled_major" { + description = "Disable major alerting rule for cluster_status detector" + type = bool + default = null +} + +variable "cluster_status_threshold_critical" { + description = "Critical threshold for cluster_status detector" + type = number + default = 1 +} + +variable "cluster_status_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "cluster_status_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "cluster_status_threshold_major" { + description = "Major threshold for cluster_status detector" + type = number + default = 2 +} + +variable "cluster_status_lasting_duration_major" { + description = "Minimum 
duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "cluster_status_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# cluster_initializing_shards detector + +variable "cluster_initializing_shards_notifications" { + description = "Notification recipients list per severity overridden for cluster_initializing_shards detector" + type = map(list(string)) + default = {} +} + +variable "cluster_initializing_shards_aggregation_function" { + description = "Aggregation function and group by for cluster_initializing_shards detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "cluster_initializing_shards_transformation_function" { + description = "Transformation function for cluster_initializing_shards detector (i.e. \".mean(over='5m')\")" + type = string + default = ".min(over='15m')" +} + +variable "cluster_initializing_shards_max_delay" { + description = "Enforce max delay for cluster_initializing_shards detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "cluster_initializing_shards_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "cluster_initializing_shards_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "cluster_initializing_shards_disabled" { + description = "Disable all alerting rules for cluster_initializing_shards detector" + type = bool + default = null +} + +variable "cluster_initializing_shards_disabled_critical" { + description = "Disable critical alerting rule for cluster_initializing_shards detector" + type = bool + default = null +} + +variable "cluster_initializing_shards_disabled_major" { + 
description = "Disable major alerting rule for cluster_initializing_shards detector" + type = bool + default = null +} + +variable "cluster_initializing_shards_threshold_critical" { + description = "Critical threshold for cluster_initializing_shards detector" + type = number + default = 0 +} + +variable "cluster_initializing_shards_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "cluster_initializing_shards_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "cluster_initializing_shards_threshold_major" { + description = "Major threshold for cluster_initializing_shards detector" + type = number + default = -1 +} + +variable "cluster_initializing_shards_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "cluster_initializing_shards_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# cluster_relocating_shards detector + +variable "cluster_relocating_shards_notifications" { + description = "Notification recipients list per severity overridden for cluster_relocating_shards detector" + type = map(list(string)) + default = {} +} + +variable "cluster_relocating_shards_aggregation_function" { + description = "Aggregation function and group by for cluster_relocating_shards detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "cluster_relocating_shards_transformation_function" { + description = "Transformation function for cluster_relocating_shards detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".min(over='15m')" +} + +variable "cluster_relocating_shards_max_delay" { + description = "Enforce max delay for cluster_relocating_shards detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "cluster_relocating_shards_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "cluster_relocating_shards_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "cluster_relocating_shards_disabled" { + description = "Disable all alerting rules for cluster_relocating_shards detector" + type = bool + default = null +} + +variable "cluster_relocating_shards_disabled_critical" { + description = "Disable critical alerting rule for cluster_relocating_shards detector" + type = bool + default = null +} + +variable "cluster_relocating_shards_disabled_major" { + description = "Disable major alerting rule for cluster_relocating_shards detector" + type = bool + default = null +} + +variable "cluster_relocating_shards_threshold_critical" { + description = "Critical threshold for cluster_relocating_shards detector" + type = number + default = 1 +} + +variable "cluster_relocating_shards_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "cluster_relocating_shards_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "cluster_relocating_shards_threshold_major" { + description = "Major threshold for cluster_relocating_shards detector" + type = number + default = 0 +} + +variable "cluster_relocating_shards_lasting_duration_major" { + description = "Minimum 
duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "cluster_relocating_shards_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# cluster_unassigned_shards detector + +variable "cluster_unassigned_shards_notifications" { + description = "Notification recipients list per severity overridden for cluster_unassigned_shards detector" + type = map(list(string)) + default = {} +} + +variable "cluster_unassigned_shards_aggregation_function" { + description = "Aggregation function and group by for cluster_unassigned_shards detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "cluster_unassigned_shards_transformation_function" { + description = "Transformation function for cluster_unassigned_shards detector (i.e. \".mean(over='5m')\")" + type = string + default = ".min(over='10m')" +} + +variable "cluster_unassigned_shards_max_delay" { + description = "Enforce max delay for cluster_unassigned_shards detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "cluster_unassigned_shards_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "cluster_unassigned_shards_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "cluster_unassigned_shards_disabled" { + description = "Disable all alerting rules for cluster_unassigned_shards detector" + type = bool + default = null +} + +variable "cluster_unassigned_shards_disabled_critical" { + description = "Disable critical alerting rule for cluster_unassigned_shards detector" + type = bool + default = null +} + +variable "cluster_unassigned_shards_disabled_major" { + description = 
"Disable major alerting rule for cluster_unassigned_shards detector" + type = bool + default = null +} + +variable "cluster_unassigned_shards_threshold_critical" { + description = "Critical threshold for cluster_unassigned_shards detector" + type = number + default = 0 +} + +variable "cluster_unassigned_shards_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "cluster_unassigned_shards_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "cluster_unassigned_shards_threshold_major" { + description = "Major threshold for cluster_unassigned_shards detector" + type = number + default = -1 +} + +variable "cluster_unassigned_shards_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "cluster_unassigned_shards_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# cluster_pending_tasks detector + +variable "cluster_pending_tasks_notifications" { + description = "Notification recipients list per severity overridden for cluster_pending_tasks detector" + type = map(list(string)) + default = {} +} + +variable "cluster_pending_tasks_aggregation_function" { + description = "Aggregation function and group by for cluster_pending_tasks detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "cluster_pending_tasks_transformation_function" { + description = "Transformation function for cluster_pending_tasks detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".min(over='15m')" +} + +variable "cluster_pending_tasks_max_delay" { + description = "Enforce max delay for cluster_pending_tasks detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "cluster_pending_tasks_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "cluster_pending_tasks_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "cluster_pending_tasks_disabled" { + description = "Disable all alerting rules for cluster_pending_tasks detector" + type = bool + default = null +} + +variable "cluster_pending_tasks_disabled_critical" { + description = "Disable critical alerting rule for cluster_pending_tasks detector" + type = bool + default = null +} + +variable "cluster_pending_tasks_disabled_major" { + description = "Disable major alerting rule for cluster_pending_tasks detector" + type = bool + default = null +} + +variable "cluster_pending_tasks_threshold_critical" { + description = "Critical threshold for cluster_pending_tasks detector" + type = number + default = 5 +} + +variable "cluster_pending_tasks_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "cluster_pending_tasks_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "cluster_pending_tasks_threshold_major" { + description = "Major threshold for cluster_pending_tasks detector" + type = number + default = 0 +} + +variable "cluster_pending_tasks_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = 
string + default = null +} + +variable "cluster_pending_tasks_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# cpu_usage detector + +variable "cpu_usage_notifications" { + description = "Notification recipients list per severity overridden for cpu_usage detector" + type = map(list(string)) + default = {} +} + +variable "cpu_usage_aggregation_function" { + description = "Aggregation function and group by for cpu_usage detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "cpu_usage_transformation_function" { + description = "Transformation function for cpu_usage detector (i.e. \".mean(over='5m')\")" + type = string + default = ".min(over='30m')" +} + +variable "cpu_usage_max_delay" { + description = "Enforce max delay for cpu_usage detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "cpu_usage_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "cpu_usage_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "cpu_usage_disabled" { + description = "Disable all alerting rules for cpu_usage detector" + type = bool + default = null +} + +variable "cpu_usage_disabled_critical" { + description = "Disable critical alerting rule for cpu_usage detector" + type = bool + default = null +} + +variable "cpu_usage_disabled_major" { + description = "Disable major alerting rule for cpu_usage detector" + type = bool + default = null +} + +variable "cpu_usage_threshold_critical" { + description = "Critical threshold for cpu_usage detector" + type = number + default = 95 +} + +variable "cpu_usage_lasting_duration_critical" { + description = "Minimum duration that conditions must be 
true before raising alert" + type = string + default = null +} + +variable "cpu_usage_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "cpu_usage_threshold_major" { + description = "Major threshold for cpu_usage detector" + type = number + default = 85 +} + +variable "cpu_usage_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "cpu_usage_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# file_descriptors_usage detector + +variable "file_descriptors_usage_notifications" { + description = "Notification recipients list per severity overridden for file_descriptors_usage detector" + type = map(list(string)) + default = {} +} + +variable "file_descriptors_usage_aggregation_function" { + description = "Aggregation function and group by for file_descriptors_usage detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "file_descriptors_usage_transformation_function" { + description = "Transformation function for file_descriptors_usage detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".max(over='15m')" +} + +variable "file_descriptors_usage_max_delay" { + description = "Enforce max delay for file_descriptors_usage detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "file_descriptors_usage_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "file_descriptors_usage_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "file_descriptors_usage_disabled" { + description = "Disable all alerting rules for file_descriptors_usage detector" + type = bool + default = null +} + +variable "file_descriptors_usage_disabled_critical" { + description = "Disable critical alerting rule for file_descriptors_usage detector" + type = bool + default = null +} + +variable "file_descriptors_usage_disabled_major" { + description = "Disable major alerting rule for file_descriptors_usage detector" + type = bool + default = null +} + +variable "file_descriptors_usage_threshold_critical" { + description = "Critical threshold for file_descriptors_usage detector" + type = number + default = 95 +} + +variable "file_descriptors_usage_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "file_descriptors_usage_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "file_descriptors_usage_threshold_major" { + description = "Major threshold for file_descriptors_usage detector" + type = number + default = 90 +} + +variable "file_descriptors_usage_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising 
alert" + type = string + default = null +} + +variable "file_descriptors_usage_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# jvm_heap_memory_usage detector + +variable "jvm_heap_memory_usage_notifications" { + description = "Notification recipients list per severity overridden for jvm_heap_memory_usage detector" + type = map(list(string)) + default = {} +} + +variable "jvm_heap_memory_usage_aggregation_function" { + description = "Aggregation function and group by for jvm_heap_memory_usage detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "jvm_heap_memory_usage_transformation_function" { + description = "Transformation function for jvm_heap_memory_usage detector (i.e. \".mean(over='5m')\")" + type = string + default = ".mean(over='5m')" +} + +variable "jvm_heap_memory_usage_max_delay" { + description = "Enforce max delay for jvm_heap_memory_usage detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "jvm_heap_memory_usage_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "jvm_heap_memory_usage_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "jvm_heap_memory_usage_disabled" { + description = "Disable all alerting rules for jvm_heap_memory_usage detector" + type = bool + default = null +} + +variable "jvm_heap_memory_usage_disabled_critical" { + description = "Disable critical alerting rule for jvm_heap_memory_usage detector" + type = bool + default = null +} + +variable "jvm_heap_memory_usage_disabled_major" { + description = "Disable major alerting rule for jvm_heap_memory_usage detector" + type = bool + default = null +} + +variable 
"jvm_heap_memory_usage_threshold_critical" { + description = "Critical threshold for jvm_heap_memory_usage detector" + type = number + default = 90 +} + +variable "jvm_heap_memory_usage_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "jvm_heap_memory_usage_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "jvm_heap_memory_usage_threshold_major" { + description = "Major threshold for jvm_heap_memory_usage detector" + type = number + default = 80 +} + +variable "jvm_heap_memory_usage_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "jvm_heap_memory_usage_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# jvm_memory_young_usage detector + +variable "jvm_memory_young_usage_notifications" { + description = "Notification recipients list per severity overridden for jvm_memory_young_usage detector" + type = map(list(string)) + default = {} +} + +variable "jvm_memory_young_usage_aggregation_function" { + description = "Aggregation function and group by for jvm_memory_young_usage detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "jvm_memory_young_usage_transformation_function" { + description = "Transformation function for jvm_memory_young_usage detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".mean(over='10m')" +} + +variable "jvm_memory_young_usage_max_delay" { + description = "Enforce max delay for jvm_memory_young_usage detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "jvm_memory_young_usage_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "jvm_memory_young_usage_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "jvm_memory_young_usage_disabled" { + description = "Disable all alerting rules for jvm_memory_young_usage detector" + type = bool + default = null +} + +variable "jvm_memory_young_usage_disabled_major" { + description = "Disable major alerting rule for jvm_memory_young_usage detector" + type = bool + default = null +} + +variable "jvm_memory_young_usage_disabled_minor" { + description = "Disable minor alerting rule for jvm_memory_young_usage detector" + type = bool + default = null +} + +variable "jvm_memory_young_usage_threshold_major" { + description = "Major threshold for jvm_memory_young_usage detector" + type = number + default = 90 +} + +variable "jvm_memory_young_usage_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "jvm_memory_young_usage_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "jvm_memory_young_usage_threshold_minor" { + description = "Minor threshold for jvm_memory_young_usage detector" + type = number + default = 80 +} + +variable "jvm_memory_young_usage_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" + type = 
string + default = null +} + +variable "jvm_memory_young_usage_at_least_percentage_minor" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# jvm_memory_old_usage detector + +variable "jvm_memory_old_usage_notifications" { + description = "Notification recipients list per severity overridden for jvm_memory_old_usage detector" + type = map(list(string)) + default = {} +} + +variable "jvm_memory_old_usage_aggregation_function" { + description = "Aggregation function and group by for jvm_memory_old_usage detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "jvm_memory_old_usage_transformation_function" { + description = "Transformation function for jvm_memory_old_usage detector (i.e. \".mean(over='5m')\")" + type = string + default = ".mean(over='10m')" +} + +variable "jvm_memory_old_usage_max_delay" { + description = "Enforce max delay for jvm_memory_old_usage detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "jvm_memory_old_usage_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "jvm_memory_old_usage_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "jvm_memory_old_usage_disabled" { + description = "Disable all alerting rules for jvm_memory_old_usage detector" + type = bool + default = null +} + +variable "jvm_memory_old_usage_disabled_major" { + description = "Disable major alerting rule for jvm_memory_old_usage detector" + type = bool + default = null +} + +variable "jvm_memory_old_usage_disabled_minor" { + description = "Disable minor alerting rule for jvm_memory_old_usage detector" + type = bool + default = null +} + +variable "jvm_memory_old_usage_threshold_major" { + description 
= "Major threshold for jvm_memory_old_usage detector" + type = number + default = 90 +} + +variable "jvm_memory_old_usage_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "jvm_memory_old_usage_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "jvm_memory_old_usage_threshold_minor" { + description = "Minor threshold for jvm_memory_old_usage detector" + type = number + default = 80 +} + +variable "jvm_memory_old_usage_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "jvm_memory_old_usage_at_least_percentage_minor" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# old-generation_garbage_collections_latency detector + +variable "old-generation_garbage_collections_latency_notifications" { + description = "Notification recipients list per severity overridden for old-generation_garbage_collections_latency detector" + type = map(list(string)) + default = {} +} + +variable "old-generation_garbage_collections_latency_aggregation_function" { + description = "Aggregation function and group by for old-generation_garbage_collections_latency detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "old-generation_garbage_collections_latency_transformation_function" { + description = "Transformation function for old-generation_garbage_collections_latency detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".mean(over='15m')" +} + +variable "old-generation_garbage_collections_latency_max_delay" { + description = "Enforce max delay for old-generation_garbage_collections_latency detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "old-generation_garbage_collections_latency_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "old-generation_garbage_collections_latency_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "old-generation_garbage_collections_latency_disabled" { + description = "Disable all alerting rules for old-generation_garbage_collections_latency detector" + type = bool + default = null +} + +variable "old-generation_garbage_collections_latency_disabled_major" { + description = "Disable major alerting rule for old-generation_garbage_collections_latency detector" + type = bool + default = null +} + +variable "old-generation_garbage_collections_latency_disabled_minor" { + description = "Disable minor alerting rule for old-generation_garbage_collections_latency detector" + type = bool + default = null +} + +variable "old-generation_garbage_collections_latency_threshold_major" { + description = "Major threshold for old-generation_garbage_collections_latency detector" + type = number + default = 300 +} + +variable "old-generation_garbage_collections_latency_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "old-generation_garbage_collections_latency_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable 
"old-generation_garbage_collections_latency_threshold_minor" { + description = "Minor threshold for old-generation_garbage_collections_latency detector" + type = number + default = 200 +} + +variable "old-generation_garbage_collections_latency_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "old-generation_garbage_collections_latency_at_least_percentage_minor" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# young-generation_garbage_collections_latency detector + +variable "young-generation_garbage_collections_latency_notifications" { + description = "Notification recipients list per severity overridden for young-generation_garbage_collections_latency detector" + type = map(list(string)) + default = {} +} + +variable "young-generation_garbage_collections_latency_aggregation_function" { + description = "Aggregation function and group by for young-generation_garbage_collections_latency detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "young-generation_garbage_collections_latency_transformation_function" { + description = "Transformation function for young-generation_garbage_collections_latency detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".mean(over='15m')" +} + +variable "young-generation_garbage_collections_latency_max_delay" { + description = "Enforce max delay for young-generation_garbage_collections_latency detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "young-generation_garbage_collections_latency_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "young-generation_garbage_collections_latency_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "young-generation_garbage_collections_latency_disabled" { + description = "Disable all alerting rules for young-generation_garbage_collections_latency detector" + type = bool + default = null +} + +variable "young-generation_garbage_collections_latency_disabled_major" { + description = "Disable major alerting rule for young-generation_garbage_collections_latency detector" + type = bool + default = null +} + +variable "young-generation_garbage_collections_latency_disabled_minor" { + description = "Disable minor alerting rule for young-generation_garbage_collections_latency detector" + type = bool + default = null +} + +variable "young-generation_garbage_collections_latency_threshold_major" { + description = "Major threshold for young-generation_garbage_collections_latency detector" + type = number + default = 40 +} + +variable "young-generation_garbage_collections_latency_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "young-generation_garbage_collections_latency_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} 
+variable "young-generation_garbage_collections_latency_threshold_minor" { + description = "Minor threshold for young-generation_garbage_collections_latency detector" + type = number + default = 20 +} + +variable "young-generation_garbage_collections_latency_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "young-generation_garbage_collections_latency_at_least_percentage_minor" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# indexing_latency detector + +variable "indexing_latency_notifications" { + description = "Notification recipients list per severity overridden for indexing_latency detector" + type = map(list(string)) + default = {} +} + +variable "indexing_latency_aggregation_function" { + description = "Aggregation function and group by for indexing_latency detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "indexing_latency_transformation_function" { + description = "Transformation function for indexing_latency detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".mean(over='15m')" +} + +variable "indexing_latency_max_delay" { + description = "Enforce max delay for indexing_latency detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "indexing_latency_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "indexing_latency_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "indexing_latency_disabled" { + description = "Disable all alerting rules for indexing_latency detector" + type = bool + default = null +} + +variable "indexing_latency_disabled_major" { + description = "Disable major alerting rule for indexing_latency detector" + type = bool + default = null +} + +variable "indexing_latency_disabled_minor" { + description = "Disable minor alerting rule for indexing_latency detector" + type = bool + default = null +} + +variable "indexing_latency_threshold_major" { + description = "Major threshold for indexing_latency detector" + type = number + default = 30 +} + +variable "indexing_latency_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "indexing_latency_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "indexing_latency_threshold_minor" { + description = "Minor threshold for indexing_latency detector" + type = number + default = 15 +} + +variable "indexing_latency_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "indexing_latency_at_least_percentage_minor" { + description = 
"Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# index_flushing_to_disk_latency detector + +variable "index_flushing_to_disk_latency_notifications" { + description = "Notification recipients list per severity overridden for index_flushing_to_disk_latency detector" + type = map(list(string)) + default = {} +} + +variable "index_flushing_to_disk_latency_aggregation_function" { + description = "Aggregation function and group by for index_flushing_to_disk_latency detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "index_flushing_to_disk_latency_transformation_function" { + description = "Transformation function for index_flushing_to_disk_latency detector (i.e. \".mean(over='5m')\")" + type = string + default = ".mean(over='15m')" +} + +variable "index_flushing_to_disk_latency_max_delay" { + description = "Enforce max delay for index_flushing_to_disk_latency detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "index_flushing_to_disk_latency_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "index_flushing_to_disk_latency_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "index_flushing_to_disk_latency_disabled" { + description = "Disable all alerting rules for index_flushing_to_disk_latency detector" + type = bool + default = null +} + +variable "index_flushing_to_disk_latency_disabled_major" { + description = "Disable major alerting rule for index_flushing_to_disk_latency detector" + type = bool + default = null +} + +variable "index_flushing_to_disk_latency_disabled_minor" { + description = "Disable minor alerting rule for index_flushing_to_disk_latency detector" + type = bool + default = null +} + 
+variable "index_flushing_to_disk_latency_threshold_major" { + description = "Major threshold for index_flushing_to_disk_latency detector" + type = number + default = 150 +} + +variable "index_flushing_to_disk_latency_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "index_flushing_to_disk_latency_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "index_flushing_to_disk_latency_threshold_minor" { + description = "Minor threshold for index_flushing_to_disk_latency detector" + type = number + default = 100 +} + +variable "index_flushing_to_disk_latency_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "index_flushing_to_disk_latency_at_least_percentage_minor" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# search_query_latency detector + +variable "search_query_latency_notifications" { + description = "Notification recipients list per severity overridden for search_query_latency detector" + type = map(list(string)) + default = {} +} + +variable "search_query_latency_aggregation_function" { + description = "Aggregation function and group by for search_query_latency detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "search_query_latency_transformation_function" { + description = "Transformation function for search_query_latency detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".min(over='30m')" +} + +variable "search_query_latency_max_delay" { + description = "Enforce max delay for search_query_latency detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "search_query_latency_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "search_query_latency_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "search_query_latency_disabled" { + description = "Disable all alerting rules for search_query_latency detector" + type = bool + default = null +} + +variable "search_query_latency_disabled_major" { + description = "Disable major alerting rule for search_query_latency detector" + type = bool + default = null +} + +variable "search_query_latency_disabled_minor" { + description = "Disable minor alerting rule for search_query_latency detector" + type = bool + default = null +} + +variable "search_query_latency_threshold_major" { + description = "Major threshold for search_query_latency detector" + type = number + default = 20 +} + +variable "search_query_latency_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "search_query_latency_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "search_query_latency_threshold_minor" { + description = "Minor threshold for search_query_latency detector" + type = number + default = 10 +} + +variable "search_query_latency_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + 
+variable "search_query_latency_at_least_percentage_minor" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# search_fetch_latency detector + +variable "search_fetch_latency_notifications" { + description = "Notification recipients list per severity overridden for search_fetch_latency detector" + type = map(list(string)) + default = {} +} + +variable "search_fetch_latency_aggregation_function" { + description = "Aggregation function and group by for search_fetch_latency detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "search_fetch_latency_transformation_function" { + description = "Transformation function for search_fetch_latency detector (i.e. \".mean(over='5m')\")" + type = string + default = ".min(over='15m')" +} + +variable "search_fetch_latency_max_delay" { + description = "Enforce max delay for search_fetch_latency detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "search_fetch_latency_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "search_fetch_latency_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "search_fetch_latency_disabled" { + description = "Disable all alerting rules for search_fetch_latency detector" + type = bool + default = null +} + +variable "search_fetch_latency_disabled_major" { + description = "Disable major alerting rule for search_fetch_latency detector" + type = bool + default = null +} + +variable "search_fetch_latency_disabled_minor" { + description = "Disable minor alerting rule for search_fetch_latency detector" + type = bool + default = null +} + +variable "search_fetch_latency_threshold_major" { + description = "Major threshold for 
search_fetch_latency detector" + type = number + default = 20 +} + +variable "search_fetch_latency_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "search_fetch_latency_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "search_fetch_latency_threshold_minor" { + description = "Minor threshold for search_fetch_latency detector" + type = number + default = 10 +} + +variable "search_fetch_latency_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "search_fetch_latency_at_least_percentage_minor" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# fielddata_cache_evictions_rate_of_change detector + +variable "fielddata_cache_evictions_rate_of_change_notifications" { + description = "Notification recipients list per severity overridden for fielddata_cache_evictions_rate_of_change detector" + type = map(list(string)) + default = {} +} + +variable "fielddata_cache_evictions_rate_of_change_aggregation_function" { + description = "Aggregation function and group by for fielddata_cache_evictions_rate_of_change detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "fielddata_cache_evictions_rate_of_change_transformation_function" { + description = "Transformation function for fielddata_cache_evictions_rate_of_change detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".mean(over='15m')" +} + +variable "fielddata_cache_evictions_rate_of_change_max_delay" { + description = "Enforce max delay for fielddata_cache_evictions_rate_of_change detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "fielddata_cache_evictions_rate_of_change_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "fielddata_cache_evictions_rate_of_change_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "fielddata_cache_evictions_rate_of_change_disabled" { + description = "Disable all alerting rules for fielddata_cache_evictions_rate_of_change detector" + type = bool + default = null +} + +variable "fielddata_cache_evictions_rate_of_change_disabled_major" { + description = "Disable major alerting rule for fielddata_cache_evictions_rate_of_change detector" + type = bool + default = null +} + +variable "fielddata_cache_evictions_rate_of_change_disabled_minor" { + description = "Disable minor alerting rule for fielddata_cache_evictions_rate_of_change detector" + type = bool + default = null +} + +variable "fielddata_cache_evictions_rate_of_change_threshold_major" { + description = "Major threshold for fielddata_cache_evictions_rate_of_change detector" + type = number + default = 120 +} + +variable "fielddata_cache_evictions_rate_of_change_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "fielddata_cache_evictions_rate_of_change_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable 
"fielddata_cache_evictions_rate_of_change_threshold_minor" { + description = "Minor threshold for fielddata_cache_evictions_rate_of_change detector" + type = number + default = 60 +} + +variable "fielddata_cache_evictions_rate_of_change_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "fielddata_cache_evictions_rate_of_change_at_least_percentage_minor" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# max_time_spent_by_task_in_queue_rate_of_change detector + +variable "max_time_spent_by_task_in_queue_rate_of_change_notifications" { + description = "Notification recipients list per severity overridden for max_time_spent_by_task_in_queue_rate_of_change detector" + type = map(list(string)) + default = {} +} + +variable "max_time_spent_by_task_in_queue_rate_of_change_aggregation_function" { + description = "Aggregation function and group by for max_time_spent_by_task_in_queue_rate_of_change detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "max_time_spent_by_task_in_queue_rate_of_change_transformation_function" { + description = "Transformation function for max_time_spent_by_task_in_queue_rate_of_change detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".mean(over='15m')" +} + +variable "max_time_spent_by_task_in_queue_rate_of_change_max_delay" { + description = "Enforce max delay for max_time_spent_by_task_in_queue_rate_of_change detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "max_time_spent_by_task_in_queue_rate_of_change_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "max_time_spent_by_task_in_queue_rate_of_change_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "max_time_spent_by_task_in_queue_rate_of_change_disabled" { + description = "Disable all alerting rules for max_time_spent_by_task_in_queue_rate_of_change detector" + type = bool + default = null +} + +variable "max_time_spent_by_task_in_queue_rate_of_change_disabled_major" { + description = "Disable major alerting rule for max_time_spent_by_task_in_queue_rate_of_change detector" + type = bool + default = null +} + +variable "max_time_spent_by_task_in_queue_rate_of_change_disabled_minor" { + description = "Disable minor alerting rule for max_time_spent_by_task_in_queue_rate_of_change detector" + type = bool + default = null +} + +variable "max_time_spent_by_task_in_queue_rate_of_change_threshold_major" { + description = "Major threshold for max_time_spent_by_task_in_queue_rate_of_change detector" + type = number + default = 200 +} + +variable "max_time_spent_by_task_in_queue_rate_of_change_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "max_time_spent_by_task_in_queue_rate_of_change_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = 
number + default = 1 +} +variable "max_time_spent_by_task_in_queue_rate_of_change_threshold_minor" { + description = "Minor threshold for max_time_spent_by_task_in_queue_rate_of_change detector" + type = number + default = 100 +} + +variable "max_time_spent_by_task_in_queue_rate_of_change_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "max_time_spent_by_task_in_queue_rate_of_change_at_least_percentage_minor" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} From 1ab2f624e252c940c000c9d2ef71f959b18ceccf Mon Sep 17 00:00:00 2001 From: Gauthier AMPE Date: Tue, 9 Jan 2024 18:38:44 +0100 Subject: [PATCH 3/6] update mutliple detector name because of recreation --- docs/severity.md | 18 +- modules/smart-agent_elasticsearch/README.md | 18 +- .../conf/02-cluster_initializing_shards.yaml | 5 +- .../conf/05-cluster_pending_tasks.yaml | 2 +- .../conf/07-cluster_file_descriptor.yaml | 2 +- ...eneration_garbage_collections_latency.yaml | 2 +- ...eneration_garbage_collections_latency.yaml | 2 +- .../conf/14-cluster_flush_latency.yaml | 2 +- .../conf/15-cluster_search_latency.yaml | 2 +- .../conf/16-cluster_fetch_latency.yaml | 2 +- ...luster_fielddata_cache_evictions_rate.yaml | 2 +- .../conf/18-cluster_time_in_queue_change.yaml | 2 +- .../detectors-gen.tf | 303 ++++++------ modules/smart-agent_elasticsearch/outputs.tf | 70 +-- .../variables-gen.tf | 454 +++++++++--------- 15 files changed, 444 insertions(+), 442 deletions(-) diff --git a/docs/severity.md b/docs/severity.md index 70a1de7bc..b2f09502a 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -980,20 +980,20 @@ |ElasticSearch cluster initializing shards|X|X|-|-|-| |ElasticSearch cluster relocating shards|X|X|-|-|-| |ElasticSearch cluster unassigned shards|X|X|-|-|-| -|ElasticSearch cluster pending tasks|X|X|-|-|-| 
+|ElasticSearch pending tasks|X|X|-|-|-| |ElasticSearch cpu usage|X|X|-|-|-| -|ElasticSearch file descriptors usage|X|X|-|-|-| +|ElasticSearch file descriptors|X|X|-|-|-| |ElasticSearch jvm heap memory usage|X|X|-|-|-| |ElasticSearch jvm memory young usage|-|X|X|-|-| |ElasticSearch jvm memory old usage|-|X|X|-|-| -|ElasticSearch old-generation garbage collections latency|-|X|X|-|-| -|ElasticSearch young-generation garbage collections latency|-|X|X|-|-| +|ElasticSearch jvm gc old collection latency|-|X|X|-|-| +|ElasticSearch jvm gc young collection latency|-|X|X|-|-| |ElasticSearch indexing latency|-|X|X|-|-| -|ElasticSearch index flushing to disk latency|-|X|X|-|-| -|ElasticSearch search query latency|-|X|X|-|-| -|ElasticSearch search fetch latency|-|X|X|-|-| -|ElasticSearch fielddata cache evictions rate of change|-|X|X|-|-| -|ElasticSearch max time spent by task in queue rate of change|-|X|X|-|-| +|ElasticSearch flushing latency|-|X|X|-|-| +|ElasticSearch search latency|-|X|X|-|-| +|ElasticSearch fetch latency|-|X|X|-|-| +|ElasticSearch field_data evictions change|-|X|X|-|-| +|ElasticSearch task time in queue change|-|X|X|-|-| ## smart-agent_genericjmx diff --git a/modules/smart-agent_elasticsearch/README.md b/modules/smart-agent_elasticsearch/README.md index 598dc631d..2f944c3b9 100644 --- a/modules/smart-agent_elasticsearch/README.md +++ b/modules/smart-agent_elasticsearch/README.md @@ -82,20 +82,20 @@ This module creates the following SignalFx detectors which could contain one or |ElasticSearch cluster initializing shards|X|X|-|-|-| |ElasticSearch cluster relocating shards|X|X|-|-|-| |ElasticSearch cluster unassigned shards|X|X|-|-|-| -|ElasticSearch cluster pending tasks|X|X|-|-|-| +|ElasticSearch pending tasks|X|X|-|-|-| |ElasticSearch cpu usage|X|X|-|-|-| -|ElasticSearch file descriptors usage|X|X|-|-|-| +|ElasticSearch file descriptors|X|X|-|-|-| |ElasticSearch jvm heap memory usage|X|X|-|-|-| |ElasticSearch jvm memory young usage|-|X|X|-|-| |ElasticSearch 
jvm memory old usage|-|X|X|-|-| -|ElasticSearch old-generation garbage collections latency|-|X|X|-|-| -|ElasticSearch young-generation garbage collections latency|-|X|X|-|-| +|ElasticSearch jvm gc old collection latency|-|X|X|-|-| +|ElasticSearch jvm gc young collection latency|-|X|X|-|-| |ElasticSearch indexing latency|-|X|X|-|-| -|ElasticSearch index flushing to disk latency|-|X|X|-|-| -|ElasticSearch search query latency|-|X|X|-|-| -|ElasticSearch search fetch latency|-|X|X|-|-| -|ElasticSearch fielddata cache evictions rate of change|-|X|X|-|-| -|ElasticSearch max time spent by task in queue rate of change|-|X|X|-|-| +|ElasticSearch flushing latency|-|X|X|-|-| +|ElasticSearch search latency|-|X|X|-|-| +|ElasticSearch fetch latency|-|X|X|-|-| +|ElasticSearch field_data evictions change|-|X|X|-|-| +|ElasticSearch task time in queue change|-|X|X|-|-| ## How to collect required metrics? diff --git a/modules/smart-agent_elasticsearch/conf/02-cluster_initializing_shards.yaml b/modules/smart-agent_elasticsearch/conf/02-cluster_initializing_shards.yaml index 4e0d26f11..de406b1cb 100644 --- a/modules/smart-agent_elasticsearch/conf/02-cluster_initializing_shards.yaml +++ b/modules/smart-agent_elasticsearch/conf/02-cluster_initializing_shards.yaml @@ -2,17 +2,18 @@ module: ElasticSearch name: "cluster initializing shards" aggregation: true transformation: ".min(over='15m')" +filtering: "filter('plugin', 'elasticsearch')" signals: signal: metric: "elasticsearch.cluster.initializing-shards" rollup: average rules: critical: - threshold: 0 + threshold: 1 comparator: ">" description: "is too high" major: - threshold: -1 + threshold: 0 comparator: ">" dependency: critical description: "is too high" diff --git a/modules/smart-agent_elasticsearch/conf/05-cluster_pending_tasks.yaml b/modules/smart-agent_elasticsearch/conf/05-cluster_pending_tasks.yaml index 128fd855a..da7ae06cd 100644 --- a/modules/smart-agent_elasticsearch/conf/05-cluster_pending_tasks.yaml +++ 
b/modules/smart-agent_elasticsearch/conf/05-cluster_pending_tasks.yaml @@ -1,5 +1,5 @@ module: ElasticSearch -name: "cluster pending tasks" +name: "pending tasks" aggregation: true transformation: ".min(over='15m')" signals: diff --git a/modules/smart-agent_elasticsearch/conf/07-cluster_file_descriptor.yaml b/modules/smart-agent_elasticsearch/conf/07-cluster_file_descriptor.yaml index bcd222bb6..36a0ddb46 100644 --- a/modules/smart-agent_elasticsearch/conf/07-cluster_file_descriptor.yaml +++ b/modules/smart-agent_elasticsearch/conf/07-cluster_file_descriptor.yaml @@ -1,5 +1,5 @@ module: ElasticSearch -name: "file descriptors usage" +name: "file descriptors" aggregation: true transformation: ".max(over='15m')" filtering: "filter('node_name', '*')" diff --git a/modules/smart-agent_elasticsearch/conf/11-cluster_old-generation_garbage_collections_latency.yaml b/modules/smart-agent_elasticsearch/conf/11-cluster_old-generation_garbage_collections_latency.yaml index 71c35f2f2..427ca8e46 100644 --- a/modules/smart-agent_elasticsearch/conf/11-cluster_old-generation_garbage_collections_latency.yaml +++ b/modules/smart-agent_elasticsearch/conf/11-cluster_old-generation_garbage_collections_latency.yaml @@ -1,5 +1,5 @@ module: ElasticSearch -name: "old-generation garbage collections latency" +name: "jvm gc old collection latency" aggregation: true transformation: ".mean(over='15m')" filtering: "filter('node_name', '*')" diff --git a/modules/smart-agent_elasticsearch/conf/12-cluster_young-generation_garbage_collections_latency.yaml b/modules/smart-agent_elasticsearch/conf/12-cluster_young-generation_garbage_collections_latency.yaml index 92b778786..a497687aa 100644 --- a/modules/smart-agent_elasticsearch/conf/12-cluster_young-generation_garbage_collections_latency.yaml +++ b/modules/smart-agent_elasticsearch/conf/12-cluster_young-generation_garbage_collections_latency.yaml @@ -1,5 +1,5 @@ module: ElasticSearch -name: "young-generation garbage collections latency" +name: "jvm gc 
young collection latency" aggregation: true transformation: ".mean(over='15m')" filtering: "filter('node_name', '*')" diff --git a/modules/smart-agent_elasticsearch/conf/14-cluster_flush_latency.yaml b/modules/smart-agent_elasticsearch/conf/14-cluster_flush_latency.yaml index 5b342e70e..b162c269e 100644 --- a/modules/smart-agent_elasticsearch/conf/14-cluster_flush_latency.yaml +++ b/modules/smart-agent_elasticsearch/conf/14-cluster_flush_latency.yaml @@ -1,5 +1,5 @@ module: ElasticSearch -name: "index flushing to disk latency" +name: "flushing latency" aggregation: true transformation: ".mean(over='15m')" filtering: "filter('node_name', '*')" diff --git a/modules/smart-agent_elasticsearch/conf/15-cluster_search_latency.yaml b/modules/smart-agent_elasticsearch/conf/15-cluster_search_latency.yaml index 52ec8afe2..0e592b254 100644 --- a/modules/smart-agent_elasticsearch/conf/15-cluster_search_latency.yaml +++ b/modules/smart-agent_elasticsearch/conf/15-cluster_search_latency.yaml @@ -1,5 +1,5 @@ module: ElasticSearch -name: "search query latency" +name: "search latency" aggregation: true transformation: ".min(over='30m')" filtering: "filter('node_name', '*')" diff --git a/modules/smart-agent_elasticsearch/conf/16-cluster_fetch_latency.yaml b/modules/smart-agent_elasticsearch/conf/16-cluster_fetch_latency.yaml index a5a21f406..cad903c49 100644 --- a/modules/smart-agent_elasticsearch/conf/16-cluster_fetch_latency.yaml +++ b/modules/smart-agent_elasticsearch/conf/16-cluster_fetch_latency.yaml @@ -1,5 +1,5 @@ module: ElasticSearch -name: "search fetch latency" +name: "fetch latency" aggregation: true transformation: ".min(over='15m')" filtering: "filter('node_name', '*')" diff --git a/modules/smart-agent_elasticsearch/conf/17-cluster_fielddata_cache_evictions_rate.yaml b/modules/smart-agent_elasticsearch/conf/17-cluster_fielddata_cache_evictions_rate.yaml index e7a3fb5cb..ff160ea43 100644 --- 
a/modules/smart-agent_elasticsearch/conf/17-cluster_fielddata_cache_evictions_rate.yaml +++ b/modules/smart-agent_elasticsearch/conf/17-cluster_fielddata_cache_evictions_rate.yaml @@ -1,5 +1,5 @@ module: ElasticSearch -name: "fielddata cache evictions rate of change" +name: "field_data evictions change" aggregation: true transformation: ".mean(over='15m')" filtering: "filter('node_name', '*')" diff --git a/modules/smart-agent_elasticsearch/conf/18-cluster_time_in_queue_change.yaml b/modules/smart-agent_elasticsearch/conf/18-cluster_time_in_queue_change.yaml index 01d451399..faaa7909a 100644 --- a/modules/smart-agent_elasticsearch/conf/18-cluster_time_in_queue_change.yaml +++ b/modules/smart-agent_elasticsearch/conf/18-cluster_time_in_queue_change.yaml @@ -1,5 +1,5 @@ module: ElasticSearch -name: "max time spent by task in queue rate of change" +name: "task time in queue change" aggregation: true transformation: ".mean(over='15m')" signals: diff --git a/modules/smart-agent_elasticsearch/detectors-gen.tf b/modules/smart-agent_elasticsearch/detectors-gen.tf index dd192f000..732a3d8c2 100644 --- a/modules/smart-agent_elasticsearch/detectors-gen.tf +++ b/modules/smart-agent_elasticsearch/detectors-gen.tf @@ -74,7 +74,8 @@ resource "signalfx_detector" "cluster_initializing_shards" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('elasticsearch.cluster.initializing-shards', filter=${module.filtering.signalflow}, rollup='average')${var.cluster_initializing_shards_aggregation_function}${var.cluster_initializing_shards_transformation_function}.publish('signal') + base_filtering = filter('plugin', 'elasticsearch') + signal = data('elasticsearch.cluster.initializing-shards', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.cluster_initializing_shards_aggregation_function}${var.cluster_initializing_shards_transformation_function}.publish('signal') detect(when(signal > 
${var.cluster_initializing_shards_threshold_critical}, lasting=%{if var.cluster_initializing_shards_lasting_duration_critical == null}None%{else}'${var.cluster_initializing_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_initializing_shards_at_least_percentage_critical})).publish('CRIT') detect(when(signal > ${var.cluster_initializing_shards_threshold_major}, lasting=%{if var.cluster_initializing_shards_lasting_duration_major == null}None%{else}'${var.cluster_initializing_shards_lasting_duration_major}'%{endif}, at_least=${var.cluster_initializing_shards_at_least_percentage_major}) and (not when(signal > ${var.cluster_initializing_shards_threshold_critical}, lasting=%{if var.cluster_initializing_shards_lasting_duration_critical == null}None%{else}'${var.cluster_initializing_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_initializing_shards_at_least_percentage_critical}))).publish('MAJOR') EOF @@ -186,44 +187,44 @@ EOF max_delay = var.cluster_unassigned_shards_max_delay } -resource "signalfx_detector" "cluster_pending_tasks" { - name = format("%s %s", local.detector_name_prefix, "ElasticSearch cluster pending tasks") +resource "signalfx_detector" "pending_tasks" { + name = format("%s %s", local.detector_name_prefix, "ElasticSearch pending tasks") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('elasticsearch.cluster.pending-tasks', filter=${module.filtering.signalflow}, rollup='average')${var.cluster_pending_tasks_aggregation_function}${var.cluster_pending_tasks_transformation_function}.publish('signal') - detect(when(signal > ${var.cluster_pending_tasks_threshold_critical}, lasting=%{if var.cluster_pending_tasks_lasting_duration_critical == null}None%{else}'${var.cluster_pending_tasks_lasting_duration_critical}'%{endif}, 
at_least=${var.cluster_pending_tasks_at_least_percentage_critical})).publish('CRIT') - detect(when(signal > ${var.cluster_pending_tasks_threshold_major}, lasting=%{if var.cluster_pending_tasks_lasting_duration_major == null}None%{else}'${var.cluster_pending_tasks_lasting_duration_major}'%{endif}, at_least=${var.cluster_pending_tasks_at_least_percentage_major}) and (not when(signal > ${var.cluster_pending_tasks_threshold_critical}, lasting=%{if var.cluster_pending_tasks_lasting_duration_critical == null}None%{else}'${var.cluster_pending_tasks_lasting_duration_critical}'%{endif}, at_least=${var.cluster_pending_tasks_at_least_percentage_critical}))).publish('MAJOR') + signal = data('elasticsearch.cluster.pending-tasks', filter=${module.filtering.signalflow}, rollup='average')${var.pending_tasks_aggregation_function}${var.pending_tasks_transformation_function}.publish('signal') + detect(when(signal > ${var.pending_tasks_threshold_critical}, lasting=%{if var.pending_tasks_lasting_duration_critical == null}None%{else}'${var.pending_tasks_lasting_duration_critical}'%{endif}, at_least=${var.pending_tasks_at_least_percentage_critical})).publish('CRIT') + detect(when(signal > ${var.pending_tasks_threshold_major}, lasting=%{if var.pending_tasks_lasting_duration_major == null}None%{else}'${var.pending_tasks_lasting_duration_major}'%{endif}, at_least=${var.pending_tasks_at_least_percentage_major}) and (not when(signal > ${var.pending_tasks_threshold_critical}, lasting=%{if var.pending_tasks_lasting_duration_critical == null}None%{else}'${var.pending_tasks_lasting_duration_critical}'%{endif}, at_least=${var.pending_tasks_at_least_percentage_critical}))).publish('MAJOR') EOF rule { - description = "are too high > ${var.cluster_pending_tasks_threshold_critical}" + description = "are too high > ${var.pending_tasks_threshold_critical}" severity = "Critical" detect_label = "CRIT" - disabled = coalesce(var.cluster_pending_tasks_disabled_critical, var.cluster_pending_tasks_disabled, 
var.detectors_disabled) - notifications = try(coalescelist(lookup(var.cluster_pending_tasks_notifications, "critical", []), var.notifications.critical), null) - runbook_url = try(coalesce(var.cluster_pending_tasks_runbook_url, var.runbook_url), "") - tip = var.cluster_pending_tasks_tip + disabled = coalesce(var.pending_tasks_disabled_critical, var.pending_tasks_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.pending_tasks_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.pending_tasks_runbook_url, var.runbook_url), "") + tip = var.pending_tasks_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? local.rule_body : var.message_body } rule { - description = "are too high > ${var.cluster_pending_tasks_threshold_major}" + description = "are too high > ${var.pending_tasks_threshold_major}" severity = "Major" detect_label = "MAJOR" - disabled = coalesce(var.cluster_pending_tasks_disabled_major, var.cluster_pending_tasks_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.cluster_pending_tasks_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.cluster_pending_tasks_runbook_url, var.runbook_url), "") - tip = var.cluster_pending_tasks_tip + disabled = coalesce(var.pending_tasks_disabled_major, var.pending_tasks_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.pending_tasks_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.pending_tasks_runbook_url, var.runbook_url), "") + tip = var.pending_tasks_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body } - max_delay = var.cluster_pending_tasks_max_delay + max_delay = var.pending_tasks_max_delay } resource "signalfx_detector" "cpu_usage" { @@ -267,8 +268,8 @@ EOF max_delay = var.cpu_usage_max_delay } -resource "signalfx_detector" "file_descriptors_usage" { - name = format("%s %s", local.detector_name_prefix, "ElasticSearch file descriptors usage") +resource "signalfx_detector" "file_descriptors" { + name = format("%s %s", local.detector_name_prefix, "ElasticSearch file descriptors") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) @@ -276,38 +277,38 @@ resource "signalfx_detector" "file_descriptors_usage" { program_text = <<-EOF base_filtering = filter('node_name', '*') - A = data('elasticsearch.process.open_file_descriptors', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.file_descriptors_usage_aggregation_function}${var.file_descriptors_usage_transformation_function} - B = data('elasticsearch.process.max_file_descriptors', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.file_descriptors_usage_aggregation_function}${var.file_descriptors_usage_transformation_function} + A = data('elasticsearch.process.open_file_descriptors', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.file_descriptors_aggregation_function}${var.file_descriptors_transformation_function} + B = data('elasticsearch.process.max_file_descriptors', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.file_descriptors_aggregation_function}${var.file_descriptors_transformation_function} signal = (A/B).scale(100).publish('signal') - detect(when(signal > ${var.file_descriptors_usage_threshold_critical}, lasting=%{if var.file_descriptors_usage_lasting_duration_critical == null}None%{else}'${var.file_descriptors_usage_lasting_duration_critical}'%{endif}, 
at_least=${var.file_descriptors_usage_at_least_percentage_critical})).publish('CRIT') - detect(when(signal > ${var.file_descriptors_usage_threshold_major}, lasting=%{if var.file_descriptors_usage_lasting_duration_major == null}None%{else}'${var.file_descriptors_usage_lasting_duration_major}'%{endif}, at_least=${var.file_descriptors_usage_at_least_percentage_major}) and (not when(signal > ${var.file_descriptors_usage_threshold_critical}, lasting=%{if var.file_descriptors_usage_lasting_duration_critical == null}None%{else}'${var.file_descriptors_usage_lasting_duration_critical}'%{endif}, at_least=${var.file_descriptors_usage_at_least_percentage_critical}))).publish('MAJOR') + detect(when(signal > ${var.file_descriptors_threshold_critical}, lasting=%{if var.file_descriptors_lasting_duration_critical == null}None%{else}'${var.file_descriptors_lasting_duration_critical}'%{endif}, at_least=${var.file_descriptors_at_least_percentage_critical})).publish('CRIT') + detect(when(signal > ${var.file_descriptors_threshold_major}, lasting=%{if var.file_descriptors_lasting_duration_major == null}None%{else}'${var.file_descriptors_lasting_duration_major}'%{endif}, at_least=${var.file_descriptors_at_least_percentage_major}) and (not when(signal > ${var.file_descriptors_threshold_critical}, lasting=%{if var.file_descriptors_lasting_duration_critical == null}None%{else}'${var.file_descriptors_lasting_duration_critical}'%{endif}, at_least=${var.file_descriptors_at_least_percentage_critical}))).publish('MAJOR') EOF rule { - description = "is too high > ${var.file_descriptors_usage_threshold_critical}" + description = "is too high > ${var.file_descriptors_threshold_critical}" severity = "Critical" detect_label = "CRIT" - disabled = coalesce(var.file_descriptors_usage_disabled_critical, var.file_descriptors_usage_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.file_descriptors_usage_notifications, "critical", []), var.notifications.critical), null) - 
runbook_url = try(coalesce(var.file_descriptors_usage_runbook_url, var.runbook_url), "") - tip = var.file_descriptors_usage_tip + disabled = coalesce(var.file_descriptors_disabled_critical, var.file_descriptors_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.file_descriptors_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.file_descriptors_runbook_url, var.runbook_url), "") + tip = var.file_descriptors_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? local.rule_body : var.message_body } rule { - description = "is too high > ${var.file_descriptors_usage_threshold_major}" + description = "is too high > ${var.file_descriptors_threshold_major}" severity = "Major" detect_label = "MAJOR" - disabled = coalesce(var.file_descriptors_usage_disabled_major, var.file_descriptors_usage_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.file_descriptors_usage_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.file_descriptors_usage_runbook_url, var.runbook_url), "") - tip = var.file_descriptors_usage_tip + disabled = coalesce(var.file_descriptors_disabled_major, var.file_descriptors_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.file_descriptors_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.file_descriptors_runbook_url, var.runbook_url), "") + tip = var.file_descriptors_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body } - max_delay = var.file_descriptors_usage_max_delay + max_delay = var.file_descriptors_max_delay } resource "signalfx_detector" "jvm_heap_memory_usage" { @@ -437,8 +438,8 @@ EOF max_delay = var.jvm_memory_old_usage_max_delay } -resource "signalfx_detector" "old-generation_garbage_collections_latency" { - name = format("%s %s", local.detector_name_prefix, "ElasticSearch old-generation garbage collections latency") +resource "signalfx_detector" "jvm_gc_old_collection_latency" { + name = format("%s %s", local.detector_name_prefix, "ElasticSearch jvm gc old collection latency") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) @@ -446,42 +447,42 @@ resource "signalfx_detector" "old-generation_garbage_collections_latency" { program_text = <<-EOF base_filtering = filter('node_name', '*') - A = data('elasticsearch.jvm.gc.old-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.old-generation_garbage_collections_latency_aggregation_function}${var.old-generation_garbage_collections_latency_transformation_function} - B = data('elasticsearch.jvm.gc.old-count', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.old-generation_garbage_collections_latency_aggregation_function}${var.old-generation_garbage_collections_latency_transformation_function} + A = data('elasticsearch.jvm.gc.old-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.jvm_gc_old_collection_latency_aggregation_function}${var.jvm_gc_old_collection_latency_transformation_function} + B = data('elasticsearch.jvm.gc.old-count', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.jvm_gc_old_collection_latency_aggregation_function}${var.jvm_gc_old_collection_latency_transformation_function} signal = 
(A/B).fill(0).publish('signal') - detect(when(signal > ${var.old-generation_garbage_collections_latency_threshold_major}, lasting=%{if var.old-generation_garbage_collections_latency_lasting_duration_major == null}None%{else}'${var.old-generation_garbage_collections_latency_lasting_duration_major}'%{endif}, at_least=${var.old-generation_garbage_collections_latency_at_least_percentage_major})).publish('MAJOR') - detect(when(signal > ${var.old-generation_garbage_collections_latency_threshold_minor}, lasting=%{if var.old-generation_garbage_collections_latency_lasting_duration_minor == null}None%{else}'${var.old-generation_garbage_collections_latency_lasting_duration_minor}'%{endif}, at_least=${var.old-generation_garbage_collections_latency_at_least_percentage_minor}) and (not when(signal > ${var.old-generation_garbage_collections_latency_threshold_major}, lasting=%{if var.old-generation_garbage_collections_latency_lasting_duration_major == null}None%{else}'${var.old-generation_garbage_collections_latency_lasting_duration_major}'%{endif}, at_least=${var.old-generation_garbage_collections_latency_at_least_percentage_major}))).publish('MINOR') + detect(when(signal > ${var.jvm_gc_old_collection_latency_threshold_major}, lasting=%{if var.jvm_gc_old_collection_latency_lasting_duration_major == null}None%{else}'${var.jvm_gc_old_collection_latency_lasting_duration_major}'%{endif}, at_least=${var.jvm_gc_old_collection_latency_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.jvm_gc_old_collection_latency_threshold_minor}, lasting=%{if var.jvm_gc_old_collection_latency_lasting_duration_minor == null}None%{else}'${var.jvm_gc_old_collection_latency_lasting_duration_minor}'%{endif}, at_least=${var.jvm_gc_old_collection_latency_at_least_percentage_minor}) and (not when(signal > ${var.jvm_gc_old_collection_latency_threshold_major}, lasting=%{if var.jvm_gc_old_collection_latency_lasting_duration_major == 
null}None%{else}'${var.jvm_gc_old_collection_latency_lasting_duration_major}'%{endif}, at_least=${var.jvm_gc_old_collection_latency_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.old-generation_garbage_collections_latency_threshold_major}" + description = "is too high > ${var.jvm_gc_old_collection_latency_threshold_major}" severity = "Major" detect_label = "MAJOR" - disabled = coalesce(var.old-generation_garbage_collections_latency_disabled_major, var.old-generation_garbage_collections_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.old-generation_garbage_collections_latency_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.old-generation_garbage_collections_latency_runbook_url, var.runbook_url), "") - tip = var.old-generation_garbage_collections_latency_tip + disabled = coalesce(var.jvm_gc_old_collection_latency_disabled_major, var.jvm_gc_old_collection_latency_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.jvm_gc_old_collection_latency_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.jvm_gc_old_collection_latency_runbook_url, var.runbook_url), "") + tip = var.jvm_gc_old_collection_latency_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body } rule { - description = "is too high > ${var.old-generation_garbage_collections_latency_threshold_minor}" + description = "is too high > ${var.jvm_gc_old_collection_latency_threshold_minor}" severity = "Minor" detect_label = "MINOR" - disabled = coalesce(var.old-generation_garbage_collections_latency_disabled_minor, var.old-generation_garbage_collections_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.old-generation_garbage_collections_latency_notifications, "minor", []), var.notifications.minor), null) - runbook_url = try(coalesce(var.old-generation_garbage_collections_latency_runbook_url, var.runbook_url), "") - tip = var.old-generation_garbage_collections_latency_tip + disabled = coalesce(var.jvm_gc_old_collection_latency_disabled_minor, var.jvm_gc_old_collection_latency_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.jvm_gc_old_collection_latency_notifications, "minor", []), var.notifications.minor), null) + runbook_url = try(coalesce(var.jvm_gc_old_collection_latency_runbook_url, var.runbook_url), "") + tip = var.jvm_gc_old_collection_latency_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body } - max_delay = var.old-generation_garbage_collections_latency_max_delay + max_delay = var.jvm_gc_old_collection_latency_max_delay } -resource "signalfx_detector" "young-generation_garbage_collections_latency" { - name = format("%s %s", local.detector_name_prefix, "ElasticSearch young-generation garbage collections latency") +resource "signalfx_detector" "jvm_gc_young_collection_latency" { + name = format("%s %s", local.detector_name_prefix, "ElasticSearch jvm gc young collection latency") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) @@ -489,38 +490,38 @@ resource "signalfx_detector" "young-generation_garbage_collections_latency" { program_text = <<-EOF base_filtering = filter('node_name', '*') - A = data('elasticsearch.jvm.gc.time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.young-generation_garbage_collections_latency_aggregation_function}${var.young-generation_garbage_collections_latency_transformation_function} - B = data('elasticsearch.jvm.gc.count', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.young-generation_garbage_collections_latency_aggregation_function}${var.young-generation_garbage_collections_latency_transformation_function} + A = data('elasticsearch.jvm.gc.time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.jvm_gc_young_collection_latency_aggregation_function}${var.jvm_gc_young_collection_latency_transformation_function} + B = data('elasticsearch.jvm.gc.count', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.jvm_gc_young_collection_latency_aggregation_function}${var.jvm_gc_young_collection_latency_transformation_function} signal = (A/B).fill(0).publish('signal') - detect(when(signal > 
${var.young-generation_garbage_collections_latency_threshold_major}, lasting=%{if var.young-generation_garbage_collections_latency_lasting_duration_major == null}None%{else}'${var.young-generation_garbage_collections_latency_lasting_duration_major}'%{endif}, at_least=${var.young-generation_garbage_collections_latency_at_least_percentage_major})).publish('MAJOR') - detect(when(signal > ${var.young-generation_garbage_collections_latency_threshold_minor}, lasting=%{if var.young-generation_garbage_collections_latency_lasting_duration_minor == null}None%{else}'${var.young-generation_garbage_collections_latency_lasting_duration_minor}'%{endif}, at_least=${var.young-generation_garbage_collections_latency_at_least_percentage_minor}) and (not when(signal > ${var.young-generation_garbage_collections_latency_threshold_major}, lasting=%{if var.young-generation_garbage_collections_latency_lasting_duration_major == null}None%{else}'${var.young-generation_garbage_collections_latency_lasting_duration_major}'%{endif}, at_least=${var.young-generation_garbage_collections_latency_at_least_percentage_major}))).publish('MINOR') + detect(when(signal > ${var.jvm_gc_young_collection_latency_threshold_major}, lasting=%{if var.jvm_gc_young_collection_latency_lasting_duration_major == null}None%{else}'${var.jvm_gc_young_collection_latency_lasting_duration_major}'%{endif}, at_least=${var.jvm_gc_young_collection_latency_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.jvm_gc_young_collection_latency_threshold_minor}, lasting=%{if var.jvm_gc_young_collection_latency_lasting_duration_minor == null}None%{else}'${var.jvm_gc_young_collection_latency_lasting_duration_minor}'%{endif}, at_least=${var.jvm_gc_young_collection_latency_at_least_percentage_minor}) and (not when(signal > ${var.jvm_gc_young_collection_latency_threshold_major}, lasting=%{if var.jvm_gc_young_collection_latency_lasting_duration_major == 
null}None%{else}'${var.jvm_gc_young_collection_latency_lasting_duration_major}'%{endif}, at_least=${var.jvm_gc_young_collection_latency_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.young-generation_garbage_collections_latency_threshold_major}" + description = "is too high > ${var.jvm_gc_young_collection_latency_threshold_major}" severity = "Major" detect_label = "MAJOR" - disabled = coalesce(var.young-generation_garbage_collections_latency_disabled_major, var.young-generation_garbage_collections_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.young-generation_garbage_collections_latency_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.young-generation_garbage_collections_latency_runbook_url, var.runbook_url), "") - tip = var.young-generation_garbage_collections_latency_tip + disabled = coalesce(var.jvm_gc_young_collection_latency_disabled_major, var.jvm_gc_young_collection_latency_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.jvm_gc_young_collection_latency_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.jvm_gc_young_collection_latency_runbook_url, var.runbook_url), "") + tip = var.jvm_gc_young_collection_latency_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body } rule { - description = "is too high > ${var.young-generation_garbage_collections_latency_threshold_minor}" + description = "is too high > ${var.jvm_gc_young_collection_latency_threshold_minor}" severity = "Minor" detect_label = "MINOR" - disabled = coalesce(var.young-generation_garbage_collections_latency_disabled_minor, var.young-generation_garbage_collections_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.young-generation_garbage_collections_latency_notifications, "minor", []), var.notifications.minor), null) - runbook_url = try(coalesce(var.young-generation_garbage_collections_latency_runbook_url, var.runbook_url), "") - tip = var.young-generation_garbage_collections_latency_tip + disabled = coalesce(var.jvm_gc_young_collection_latency_disabled_minor, var.jvm_gc_young_collection_latency_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.jvm_gc_young_collection_latency_notifications, "minor", []), var.notifications.minor), null) + runbook_url = try(coalesce(var.jvm_gc_young_collection_latency_runbook_url, var.runbook_url), "") + tip = var.jvm_gc_young_collection_latency_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body } - max_delay = var.young-generation_garbage_collections_latency_max_delay + max_delay = var.jvm_gc_young_collection_latency_max_delay } resource "signalfx_detector" "indexing_latency" { @@ -566,8 +567,8 @@ EOF max_delay = var.indexing_latency_max_delay } -resource "signalfx_detector" "index_flushing_to_disk_latency" { - name = format("%s %s", local.detector_name_prefix, "ElasticSearch index flushing to disk latency") +resource "signalfx_detector" "flushing_latency" { + name = format("%s %s", local.detector_name_prefix, "ElasticSearch flushing latency") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) @@ -575,42 +576,42 @@ resource "signalfx_detector" "index_flushing_to_disk_latency" { program_text = <<-EOF base_filtering = filter('node_name', '*') - A = data('elasticsearch.indices.flush.total-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.index_flushing_to_disk_latency_aggregation_function}${var.index_flushing_to_disk_latency_transformation_function} - B = data('elasticsearch.indices.flush.total', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.index_flushing_to_disk_latency_aggregation_function}${var.index_flushing_to_disk_latency_transformation_function} + A = data('elasticsearch.indices.flush.total-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.flushing_latency_aggregation_function}${var.flushing_latency_transformation_function} + B = data('elasticsearch.indices.flush.total', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.flushing_latency_aggregation_function}${var.flushing_latency_transformation_function} signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.index_flushing_to_disk_latency_threshold_major}, 
lasting=%{if var.index_flushing_to_disk_latency_lasting_duration_major == null}None%{else}'${var.index_flushing_to_disk_latency_lasting_duration_major}'%{endif}, at_least=${var.index_flushing_to_disk_latency_at_least_percentage_major})).publish('MAJOR') - detect(when(signal > ${var.index_flushing_to_disk_latency_threshold_minor}, lasting=%{if var.index_flushing_to_disk_latency_lasting_duration_minor == null}None%{else}'${var.index_flushing_to_disk_latency_lasting_duration_minor}'%{endif}, at_least=${var.index_flushing_to_disk_latency_at_least_percentage_minor}) and (not when(signal > ${var.index_flushing_to_disk_latency_threshold_major}, lasting=%{if var.index_flushing_to_disk_latency_lasting_duration_major == null}None%{else}'${var.index_flushing_to_disk_latency_lasting_duration_major}'%{endif}, at_least=${var.index_flushing_to_disk_latency_at_least_percentage_major}))).publish('MINOR') + detect(when(signal > ${var.flushing_latency_threshold_major}, lasting=%{if var.flushing_latency_lasting_duration_major == null}None%{else}'${var.flushing_latency_lasting_duration_major}'%{endif}, at_least=${var.flushing_latency_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.flushing_latency_threshold_minor}, lasting=%{if var.flushing_latency_lasting_duration_minor == null}None%{else}'${var.flushing_latency_lasting_duration_minor}'%{endif}, at_least=${var.flushing_latency_at_least_percentage_minor}) and (not when(signal > ${var.flushing_latency_threshold_major}, lasting=%{if var.flushing_latency_lasting_duration_major == null}None%{else}'${var.flushing_latency_lasting_duration_major}'%{endif}, at_least=${var.flushing_latency_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.index_flushing_to_disk_latency_threshold_major}" + description = "is too high > ${var.flushing_latency_threshold_major}" severity = "Major" detect_label = "MAJOR" - disabled = coalesce(var.index_flushing_to_disk_latency_disabled_major, 
var.index_flushing_to_disk_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.index_flushing_to_disk_latency_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.index_flushing_to_disk_latency_runbook_url, var.runbook_url), "") - tip = var.index_flushing_to_disk_latency_tip + disabled = coalesce(var.flushing_latency_disabled_major, var.flushing_latency_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.flushing_latency_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.flushing_latency_runbook_url, var.runbook_url), "") + tip = var.flushing_latency_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? local.rule_body : var.message_body } rule { - description = "is too high > ${var.index_flushing_to_disk_latency_threshold_minor}" + description = "is too high > ${var.flushing_latency_threshold_minor}" severity = "Minor" detect_label = "MINOR" - disabled = coalesce(var.index_flushing_to_disk_latency_disabled_minor, var.index_flushing_to_disk_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.index_flushing_to_disk_latency_notifications, "minor", []), var.notifications.minor), null) - runbook_url = try(coalesce(var.index_flushing_to_disk_latency_runbook_url, var.runbook_url), "") - tip = var.index_flushing_to_disk_latency_tip + disabled = coalesce(var.flushing_latency_disabled_minor, var.flushing_latency_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.flushing_latency_notifications, "minor", []), var.notifications.minor), null) + runbook_url = try(coalesce(var.flushing_latency_runbook_url, var.runbook_url), "") + tip = var.flushing_latency_tip parameterized_subject = var.message_subject == "" ? 
local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? local.rule_body : var.message_body } - max_delay = var.index_flushing_to_disk_latency_max_delay + max_delay = var.flushing_latency_max_delay } -resource "signalfx_detector" "search_query_latency" { - name = format("%s %s", local.detector_name_prefix, "ElasticSearch search query latency") +resource "signalfx_detector" "search_latency" { + name = format("%s %s", local.detector_name_prefix, "ElasticSearch search latency") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) @@ -618,42 +619,42 @@ resource "signalfx_detector" "search_query_latency" { program_text = <<-EOF base_filtering = filter('node_name', '*') - A = data('elasticsearch.indices.search.query-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.search_query_latency_aggregation_function}${var.search_query_latency_transformation_function} - B = data('elasticsearch.indices.search.query-total', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.search_query_latency_aggregation_function}${var.search_query_latency_transformation_function} + A = data('elasticsearch.indices.search.query-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.search_latency_aggregation_function}${var.search_latency_transformation_function} + B = data('elasticsearch.indices.search.query-total', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.search_latency_aggregation_function}${var.search_latency_transformation_function} signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.search_query_latency_threshold_major}, lasting=%{if var.search_query_latency_lasting_duration_major == 
null}None%{else}'${var.search_query_latency_lasting_duration_major}'%{endif}, at_least=${var.search_query_latency_at_least_percentage_major})).publish('MAJOR') - detect(when(signal > ${var.search_query_latency_threshold_minor}, lasting=%{if var.search_query_latency_lasting_duration_minor == null}None%{else}'${var.search_query_latency_lasting_duration_minor}'%{endif}, at_least=${var.search_query_latency_at_least_percentage_minor}) and (not when(signal > ${var.search_query_latency_threshold_major}, lasting=%{if var.search_query_latency_lasting_duration_major == null}None%{else}'${var.search_query_latency_lasting_duration_major}'%{endif}, at_least=${var.search_query_latency_at_least_percentage_major}))).publish('MINOR') + detect(when(signal > ${var.search_latency_threshold_major}, lasting=%{if var.search_latency_lasting_duration_major == null}None%{else}'${var.search_latency_lasting_duration_major}'%{endif}, at_least=${var.search_latency_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.search_latency_threshold_minor}, lasting=%{if var.search_latency_lasting_duration_minor == null}None%{else}'${var.search_latency_lasting_duration_minor}'%{endif}, at_least=${var.search_latency_at_least_percentage_minor}) and (not when(signal > ${var.search_latency_threshold_major}, lasting=%{if var.search_latency_lasting_duration_major == null}None%{else}'${var.search_latency_lasting_duration_major}'%{endif}, at_least=${var.search_latency_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.search_query_latency_threshold_major}" + description = "is too high > ${var.search_latency_threshold_major}" severity = "Major" detect_label = "MAJOR" - disabled = coalesce(var.search_query_latency_disabled_major, var.search_query_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.search_query_latency_notifications, "major", []), var.notifications.major), null) - runbook_url = 
try(coalesce(var.search_query_latency_runbook_url, var.runbook_url), "") - tip = var.search_query_latency_tip + disabled = coalesce(var.search_latency_disabled_major, var.search_latency_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.search_latency_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.search_latency_runbook_url, var.runbook_url), "") + tip = var.search_latency_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? local.rule_body : var.message_body } rule { - description = "is too high > ${var.search_query_latency_threshold_minor}" + description = "is too high > ${var.search_latency_threshold_minor}" severity = "Minor" detect_label = "MINOR" - disabled = coalesce(var.search_query_latency_disabled_minor, var.search_query_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.search_query_latency_notifications, "minor", []), var.notifications.minor), null) - runbook_url = try(coalesce(var.search_query_latency_runbook_url, var.runbook_url), "") - tip = var.search_query_latency_tip + disabled = coalesce(var.search_latency_disabled_minor, var.search_latency_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.search_latency_notifications, "minor", []), var.notifications.minor), null) + runbook_url = try(coalesce(var.search_latency_runbook_url, var.runbook_url), "") + tip = var.search_latency_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body } - max_delay = var.search_query_latency_max_delay + max_delay = var.search_latency_max_delay } -resource "signalfx_detector" "search_fetch_latency" { - name = format("%s %s", local.detector_name_prefix, "ElasticSearch search fetch latency") +resource "signalfx_detector" "fetch_latency" { + name = format("%s %s", local.detector_name_prefix, "ElasticSearch fetch latency") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) @@ -661,42 +662,42 @@ resource "signalfx_detector" "search_fetch_latency" { program_text = <<-EOF base_filtering = filter('node_name', '*') - A = data('elasticsearch.indices.search.fetch-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.search_fetch_latency_aggregation_function}${var.search_fetch_latency_transformation_function} - B = data('elasticsearch.indices.search.fetch-total', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.search_fetch_latency_aggregation_function}${var.search_fetch_latency_transformation_function} + A = data('elasticsearch.indices.search.fetch-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.fetch_latency_aggregation_function}${var.fetch_latency_transformation_function} + B = data('elasticsearch.indices.search.fetch-total', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.fetch_latency_aggregation_function}${var.fetch_latency_transformation_function} signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.search_fetch_latency_threshold_major}, lasting=%{if var.search_fetch_latency_lasting_duration_major == null}None%{else}'${var.search_fetch_latency_lasting_duration_major}'%{endif}, at_least=${var.search_fetch_latency_at_least_percentage_major})).publish('MAJOR') - detect(when(signal 
> ${var.search_fetch_latency_threshold_minor}, lasting=%{if var.search_fetch_latency_lasting_duration_minor == null}None%{else}'${var.search_fetch_latency_lasting_duration_minor}'%{endif}, at_least=${var.search_fetch_latency_at_least_percentage_minor}) and (not when(signal > ${var.search_fetch_latency_threshold_major}, lasting=%{if var.search_fetch_latency_lasting_duration_major == null}None%{else}'${var.search_fetch_latency_lasting_duration_major}'%{endif}, at_least=${var.search_fetch_latency_at_least_percentage_major}))).publish('MINOR') + detect(when(signal > ${var.fetch_latency_threshold_major}, lasting=%{if var.fetch_latency_lasting_duration_major == null}None%{else}'${var.fetch_latency_lasting_duration_major}'%{endif}, at_least=${var.fetch_latency_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.fetch_latency_threshold_minor}, lasting=%{if var.fetch_latency_lasting_duration_minor == null}None%{else}'${var.fetch_latency_lasting_duration_minor}'%{endif}, at_least=${var.fetch_latency_at_least_percentage_minor}) and (not when(signal > ${var.fetch_latency_threshold_major}, lasting=%{if var.fetch_latency_lasting_duration_major == null}None%{else}'${var.fetch_latency_lasting_duration_major}'%{endif}, at_least=${var.fetch_latency_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.search_fetch_latency_threshold_major}" + description = "is too high > ${var.fetch_latency_threshold_major}" severity = "Major" detect_label = "MAJOR" - disabled = coalesce(var.search_fetch_latency_disabled_major, var.search_fetch_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.search_fetch_latency_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.search_fetch_latency_runbook_url, var.runbook_url), "") - tip = var.search_fetch_latency_tip + disabled = coalesce(var.fetch_latency_disabled_major, var.fetch_latency_disabled, 
var.detectors_disabled) + notifications = try(coalescelist(lookup(var.fetch_latency_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.fetch_latency_runbook_url, var.runbook_url), "") + tip = var.fetch_latency_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? local.rule_body : var.message_body } rule { - description = "is too high > ${var.search_fetch_latency_threshold_minor}" + description = "is too high > ${var.fetch_latency_threshold_minor}" severity = "Minor" detect_label = "MINOR" - disabled = coalesce(var.search_fetch_latency_disabled_minor, var.search_fetch_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.search_fetch_latency_notifications, "minor", []), var.notifications.minor), null) - runbook_url = try(coalesce(var.search_fetch_latency_runbook_url, var.runbook_url), "") - tip = var.search_fetch_latency_tip + disabled = coalesce(var.fetch_latency_disabled_minor, var.fetch_latency_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.fetch_latency_notifications, "minor", []), var.notifications.minor), null) + runbook_url = try(coalesce(var.fetch_latency_runbook_url, var.runbook_url), "") + tip = var.fetch_latency_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body } - max_delay = var.search_fetch_latency_max_delay + max_delay = var.fetch_latency_max_delay } -resource "signalfx_detector" "fielddata_cache_evictions_rate_of_change" { - name = format("%s %s", local.detector_name_prefix, "ElasticSearch fielddata cache evictions rate of change") +resource "signalfx_detector" "field_data_evictions_change" { + name = format("%s %s", local.detector_name_prefix, "ElasticSearch field_data evictions change") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) @@ -704,77 +705,77 @@ resource "signalfx_detector" "fielddata_cache_evictions_rate_of_change" { program_text = <<-EOF base_filtering = filter('node_name', '*') - A = data('elasticsearch.indices.fielddata.evictions', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.fielddata_cache_evictions_rate_of_change_aggregation_function}${var.fielddata_cache_evictions_rate_of_change_transformation_function} + A = data('elasticsearch.indices.fielddata.evictions', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.field_data_evictions_change_aggregation_function}${var.field_data_evictions_change_transformation_function} signal = A.rateofchange().publish('signal') - detect(when(signal > ${var.fielddata_cache_evictions_rate_of_change_threshold_major}, lasting=%{if var.fielddata_cache_evictions_rate_of_change_lasting_duration_major == null}None%{else}'${var.fielddata_cache_evictions_rate_of_change_lasting_duration_major}'%{endif}, at_least=${var.fielddata_cache_evictions_rate_of_change_at_least_percentage_major})).publish('MAJOR') - detect(when(signal > ${var.fielddata_cache_evictions_rate_of_change_threshold_minor}, lasting=%{if var.fielddata_cache_evictions_rate_of_change_lasting_duration_minor == 
null}None%{else}'${var.fielddata_cache_evictions_rate_of_change_lasting_duration_minor}'%{endif}, at_least=${var.fielddata_cache_evictions_rate_of_change_at_least_percentage_minor}) and (not when(signal > ${var.fielddata_cache_evictions_rate_of_change_threshold_major}, lasting=%{if var.fielddata_cache_evictions_rate_of_change_lasting_duration_major == null}None%{else}'${var.fielddata_cache_evictions_rate_of_change_lasting_duration_major}'%{endif}, at_least=${var.fielddata_cache_evictions_rate_of_change_at_least_percentage_major}))).publish('MINOR') + detect(when(signal > ${var.field_data_evictions_change_threshold_major}, lasting=%{if var.field_data_evictions_change_lasting_duration_major == null}None%{else}'${var.field_data_evictions_change_lasting_duration_major}'%{endif}, at_least=${var.field_data_evictions_change_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.field_data_evictions_change_threshold_minor}, lasting=%{if var.field_data_evictions_change_lasting_duration_minor == null}None%{else}'${var.field_data_evictions_change_lasting_duration_minor}'%{endif}, at_least=${var.field_data_evictions_change_at_least_percentage_minor}) and (not when(signal > ${var.field_data_evictions_change_threshold_major}, lasting=%{if var.field_data_evictions_change_lasting_duration_major == null}None%{else}'${var.field_data_evictions_change_lasting_duration_major}'%{endif}, at_least=${var.field_data_evictions_change_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.fielddata_cache_evictions_rate_of_change_threshold_major}" + description = "is too high > ${var.field_data_evictions_change_threshold_major}" severity = "Major" detect_label = "MAJOR" - disabled = coalesce(var.fielddata_cache_evictions_rate_of_change_disabled_major, var.fielddata_cache_evictions_rate_of_change_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.fielddata_cache_evictions_rate_of_change_notifications, 
"major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.fielddata_cache_evictions_rate_of_change_runbook_url, var.runbook_url), "") - tip = var.fielddata_cache_evictions_rate_of_change_tip + disabled = coalesce(var.field_data_evictions_change_disabled_major, var.field_data_evictions_change_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.field_data_evictions_change_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.field_data_evictions_change_runbook_url, var.runbook_url), "") + tip = var.field_data_evictions_change_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? local.rule_body : var.message_body } rule { - description = "is too high > ${var.fielddata_cache_evictions_rate_of_change_threshold_minor}" + description = "is too high > ${var.field_data_evictions_change_threshold_minor}" severity = "Minor" detect_label = "MINOR" - disabled = coalesce(var.fielddata_cache_evictions_rate_of_change_disabled_minor, var.fielddata_cache_evictions_rate_of_change_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.fielddata_cache_evictions_rate_of_change_notifications, "minor", []), var.notifications.minor), null) - runbook_url = try(coalesce(var.fielddata_cache_evictions_rate_of_change_runbook_url, var.runbook_url), "") - tip = var.fielddata_cache_evictions_rate_of_change_tip + disabled = coalesce(var.field_data_evictions_change_disabled_minor, var.field_data_evictions_change_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.field_data_evictions_change_notifications, "minor", []), var.notifications.minor), null) + runbook_url = try(coalesce(var.field_data_evictions_change_runbook_url, var.runbook_url), "") + tip = var.field_data_evictions_change_tip parameterized_subject = var.message_subject == "" ? 
local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? local.rule_body : var.message_body } - max_delay = var.fielddata_cache_evictions_rate_of_change_max_delay + max_delay = var.field_data_evictions_change_max_delay } -resource "signalfx_detector" "max_time_spent_by_task_in_queue_rate_of_change" { - name = format("%s %s", local.detector_name_prefix, "ElasticSearch max time spent by task in queue rate of change") +resource "signalfx_detector" "task_time_in_queue_change" { + name = format("%s %s", local.detector_name_prefix, "ElasticSearch task time in queue change") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - A = data('elasticsearch.cluster.task-max-wait-time', filter=${module.filtering.signalflow}, rollup='average')${var.max_time_spent_by_task_in_queue_rate_of_change_aggregation_function}${var.max_time_spent_by_task_in_queue_rate_of_change_transformation_function} + A = data('elasticsearch.cluster.task-max-wait-time', filter=${module.filtering.signalflow}, rollup='average')${var.task_time_in_queue_change_aggregation_function}${var.task_time_in_queue_change_transformation_function} signal = A.rateofchange().publish('signal') - detect(when(signal > ${var.max_time_spent_by_task_in_queue_rate_of_change_threshold_major}, lasting=%{if var.max_time_spent_by_task_in_queue_rate_of_change_lasting_duration_major == null}None%{else}'${var.max_time_spent_by_task_in_queue_rate_of_change_lasting_duration_major}'%{endif}, at_least=${var.max_time_spent_by_task_in_queue_rate_of_change_at_least_percentage_major})).publish('MAJOR') - detect(when(signal > ${var.max_time_spent_by_task_in_queue_rate_of_change_threshold_minor}, lasting=%{if var.max_time_spent_by_task_in_queue_rate_of_change_lasting_duration_minor == 
null}None%{else}'${var.max_time_spent_by_task_in_queue_rate_of_change_lasting_duration_minor}'%{endif}, at_least=${var.max_time_spent_by_task_in_queue_rate_of_change_at_least_percentage_minor}) and (not when(signal > ${var.max_time_spent_by_task_in_queue_rate_of_change_threshold_major}, lasting=%{if var.max_time_spent_by_task_in_queue_rate_of_change_lasting_duration_major == null}None%{else}'${var.max_time_spent_by_task_in_queue_rate_of_change_lasting_duration_major}'%{endif}, at_least=${var.max_time_spent_by_task_in_queue_rate_of_change_at_least_percentage_major}))).publish('MINOR') + detect(when(signal > ${var.task_time_in_queue_change_threshold_major}, lasting=%{if var.task_time_in_queue_change_lasting_duration_major == null}None%{else}'${var.task_time_in_queue_change_lasting_duration_major}'%{endif}, at_least=${var.task_time_in_queue_change_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.task_time_in_queue_change_threshold_minor}, lasting=%{if var.task_time_in_queue_change_lasting_duration_minor == null}None%{else}'${var.task_time_in_queue_change_lasting_duration_minor}'%{endif}, at_least=${var.task_time_in_queue_change_at_least_percentage_minor}) and (not when(signal > ${var.task_time_in_queue_change_threshold_major}, lasting=%{if var.task_time_in_queue_change_lasting_duration_major == null}None%{else}'${var.task_time_in_queue_change_lasting_duration_major}'%{endif}, at_least=${var.task_time_in_queue_change_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.max_time_spent_by_task_in_queue_rate_of_change_threshold_major}" + description = "is too high > ${var.task_time_in_queue_change_threshold_major}" severity = "Major" detect_label = "MAJOR" - disabled = coalesce(var.max_time_spent_by_task_in_queue_rate_of_change_disabled_major, var.max_time_spent_by_task_in_queue_rate_of_change_disabled, var.detectors_disabled) - notifications = 
try(coalescelist(lookup(var.max_time_spent_by_task_in_queue_rate_of_change_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.max_time_spent_by_task_in_queue_rate_of_change_runbook_url, var.runbook_url), "") - tip = var.max_time_spent_by_task_in_queue_rate_of_change_tip + disabled = coalesce(var.task_time_in_queue_change_disabled_major, var.task_time_in_queue_change_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.task_time_in_queue_change_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.task_time_in_queue_change_runbook_url, var.runbook_url), "") + tip = var.task_time_in_queue_change_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? local.rule_body : var.message_body } rule { - description = "is too high > ${var.max_time_spent_by_task_in_queue_rate_of_change_threshold_minor}" + description = "is too high > ${var.task_time_in_queue_change_threshold_minor}" severity = "Minor" detect_label = "MINOR" - disabled = coalesce(var.max_time_spent_by_task_in_queue_rate_of_change_disabled_minor, var.max_time_spent_by_task_in_queue_rate_of_change_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.max_time_spent_by_task_in_queue_rate_of_change_notifications, "minor", []), var.notifications.minor), null) - runbook_url = try(coalesce(var.max_time_spent_by_task_in_queue_rate_of_change_runbook_url, var.runbook_url), "") - tip = var.max_time_spent_by_task_in_queue_rate_of_change_tip + disabled = coalesce(var.task_time_in_queue_change_disabled_minor, var.task_time_in_queue_change_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.task_time_in_queue_change_notifications, "minor", []), var.notifications.minor), null) + runbook_url = try(coalesce(var.task_time_in_queue_change_runbook_url, var.runbook_url), "") + tip = 
var.task_time_in_queue_change_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? local.rule_body : var.message_body } - max_delay = var.max_time_spent_by_task_in_queue_rate_of_change_max_delay + max_delay = var.task_time_in_queue_change_max_delay } diff --git a/modules/smart-agent_elasticsearch/outputs.tf b/modules/smart-agent_elasticsearch/outputs.tf index 877821ae0..4464fe0c2 100644 --- a/modules/smart-agent_elasticsearch/outputs.tf +++ b/modules/smart-agent_elasticsearch/outputs.tf @@ -3,11 +3,6 @@ output "cluster_initializing_shards" { value = signalfx_detector.cluster_initializing_shards } -output "cluster_pending_tasks" { - description = "Detector resource for cluster_pending_tasks" - value = signalfx_detector.cluster_pending_tasks -} - output "cluster_relocating_shards" { description = "Detector resource for cluster_relocating_shards" value = signalfx_detector.cluster_relocating_shards @@ -28,14 +23,24 @@ output "cpu_usage" { value = signalfx_detector.cpu_usage } -output "fielddata_cache_evictions_rate_of_change" { - description = "Detector resource for fielddata_cache_evictions_rate_of_change" - value = signalfx_detector.fielddata_cache_evictions_rate_of_change +output "fetch_latency" { + description = "Detector resource for fetch_latency" + value = signalfx_detector.fetch_latency +} + +output "field_data_evictions_change" { + description = "Detector resource for field_data_evictions_change" + value = signalfx_detector.field_data_evictions_change } -output "file_descriptors_usage" { - description = "Detector resource for file_descriptors_usage" - value = signalfx_detector.file_descriptors_usage +output "file_descriptors" { + description = "Detector resource for file_descriptors" + value = signalfx_detector.file_descriptors +} + +output "flushing_latency" { + description = "Detector resource for flushing_latency" + value = signalfx_detector.flushing_latency } output 
"heartbeat" { @@ -43,16 +48,21 @@ output "heartbeat" { value = signalfx_detector.heartbeat } -output "index_flushing_to_disk_latency" { - description = "Detector resource for index_flushing_to_disk_latency" - value = signalfx_detector.index_flushing_to_disk_latency -} - output "indexing_latency" { description = "Detector resource for indexing_latency" value = signalfx_detector.indexing_latency } +output "jvm_gc_old_collection_latency" { + description = "Detector resource for jvm_gc_old_collection_latency" + value = signalfx_detector.jvm_gc_old_collection_latency +} + +output "jvm_gc_young_collection_latency" { + description = "Detector resource for jvm_gc_young_collection_latency" + value = signalfx_detector.jvm_gc_young_collection_latency +} + output "jvm_heap_memory_usage" { description = "Detector resource for jvm_heap_memory_usage" value = signalfx_detector.jvm_heap_memory_usage @@ -68,28 +78,18 @@ output "jvm_memory_young_usage" { value = signalfx_detector.jvm_memory_young_usage } -output "max_time_spent_by_task_in_queue_rate_of_change" { - description = "Detector resource for max_time_spent_by_task_in_queue_rate_of_change" - value = signalfx_detector.max_time_spent_by_task_in_queue_rate_of_change -} - -output "old-generation_garbage_collections_latency" { - description = "Detector resource for old-generation_garbage_collections_latency" - value = signalfx_detector.old-generation_garbage_collections_latency -} - -output "search_fetch_latency" { - description = "Detector resource for search_fetch_latency" - value = signalfx_detector.search_fetch_latency +output "pending_tasks" { + description = "Detector resource for pending_tasks" + value = signalfx_detector.pending_tasks } -output "search_query_latency" { - description = "Detector resource for search_query_latency" - value = signalfx_detector.search_query_latency +output "search_latency" { + description = "Detector resource for search_latency" + value = signalfx_detector.search_latency } -output 
"young-generation_garbage_collections_latency" { - description = "Detector resource for young-generation_garbage_collections_latency" - value = signalfx_detector.young-generation_garbage_collections_latency +output "task_time_in_queue_change" { + description = "Detector resource for task_time_in_queue_change" + value = signalfx_detector.task_time_in_queue_change } diff --git a/modules/smart-agent_elasticsearch/variables-gen.tf b/modules/smart-agent_elasticsearch/variables-gen.tf index 29ddb9969..c4fbf9ef3 100644 --- a/modules/smart-agent_elasticsearch/variables-gen.tf +++ b/modules/smart-agent_elasticsearch/variables-gen.tf @@ -191,7 +191,7 @@ variable "cluster_initializing_shards_disabled_major" { variable "cluster_initializing_shards_threshold_critical" { description = "Critical threshold for cluster_initializing_shards detector" type = number - default = 0 + default = 1 } variable "cluster_initializing_shards_lasting_duration_critical" { @@ -208,7 +208,7 @@ variable "cluster_initializing_shards_at_least_percentage_critical" { variable "cluster_initializing_shards_threshold_major" { description = "Major threshold for cluster_initializing_shards detector" type = number - default = -1 + default = 0 } variable "cluster_initializing_shards_lasting_duration_major" { @@ -402,92 +402,92 @@ variable "cluster_unassigned_shards_at_least_percentage_major" { type = number default = 1 } -# cluster_pending_tasks detector +# pending_tasks detector -variable "cluster_pending_tasks_notifications" { - description = "Notification recipients list per severity overridden for cluster_pending_tasks detector" +variable "pending_tasks_notifications" { + description = "Notification recipients list per severity overridden for pending_tasks detector" type = map(list(string)) default = {} } -variable "cluster_pending_tasks_aggregation_function" { - description = "Aggregation function and group by for cluster_pending_tasks detector (i.e. 
\".mean(by=['host'])\")" +variable "pending_tasks_aggregation_function" { + description = "Aggregation function and group by for pending_tasks detector (i.e. \".mean(by=['host'])\")" type = string default = "" } -variable "cluster_pending_tasks_transformation_function" { - description = "Transformation function for cluster_pending_tasks detector (i.e. \".mean(over='5m')\")" +variable "pending_tasks_transformation_function" { + description = "Transformation function for pending_tasks detector (i.e. \".mean(over='5m')\")" type = string default = ".min(over='15m')" } -variable "cluster_pending_tasks_max_delay" { - description = "Enforce max delay for cluster_pending_tasks detector (use \"0\" or \"null\" for \"Auto\")" +variable "pending_tasks_max_delay" { + description = "Enforce max delay for pending_tasks detector (use \"0\" or \"null\" for \"Auto\")" type = number default = null } -variable "cluster_pending_tasks_tip" { +variable "pending_tasks_tip" { description = "Suggested first course of action or any note useful for incident handling" type = string default = "" } -variable "cluster_pending_tasks_runbook_url" { +variable "pending_tasks_runbook_url" { description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" type = string default = "" } -variable "cluster_pending_tasks_disabled" { - description = "Disable all alerting rules for cluster_pending_tasks detector" +variable "pending_tasks_disabled" { + description = "Disable all alerting rules for pending_tasks detector" type = bool default = null } -variable "cluster_pending_tasks_disabled_critical" { - description = "Disable critical alerting rule for cluster_pending_tasks detector" +variable "pending_tasks_disabled_critical" { + description = "Disable critical alerting rule for pending_tasks detector" type = bool default = null } -variable "cluster_pending_tasks_disabled_major" { - description = "Disable major alerting rule for cluster_pending_tasks detector" 
+variable "pending_tasks_disabled_major" { + description = "Disable major alerting rule for pending_tasks detector" type = bool default = null } -variable "cluster_pending_tasks_threshold_critical" { - description = "Critical threshold for cluster_pending_tasks detector" +variable "pending_tasks_threshold_critical" { + description = "Critical threshold for pending_tasks detector" type = number default = 5 } -variable "cluster_pending_tasks_lasting_duration_critical" { +variable "pending_tasks_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string default = null } -variable "cluster_pending_tasks_at_least_percentage_critical" { +variable "pending_tasks_at_least_percentage_critical" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -variable "cluster_pending_tasks_threshold_major" { - description = "Major threshold for cluster_pending_tasks detector" +variable "pending_tasks_threshold_major" { + description = "Major threshold for pending_tasks detector" type = number default = 0 } -variable "cluster_pending_tasks_lasting_duration_major" { +variable "pending_tasks_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string default = null } -variable "cluster_pending_tasks_at_least_percentage_major" { +variable "pending_tasks_at_least_percentage_major" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 @@ -582,92 +582,92 @@ variable "cpu_usage_at_least_percentage_major" { type = number default = 1 } -# file_descriptors_usage detector +# file_descriptors detector -variable "file_descriptors_usage_notifications" { - description = "Notification recipients list per severity overridden for file_descriptors_usage detector" +variable "file_descriptors_notifications" { + 
description = "Notification recipients list per severity overridden for file_descriptors detector" type = map(list(string)) default = {} } -variable "file_descriptors_usage_aggregation_function" { - description = "Aggregation function and group by for file_descriptors_usage detector (i.e. \".mean(by=['host'])\")" +variable "file_descriptors_aggregation_function" { + description = "Aggregation function and group by for file_descriptors detector (i.e. \".mean(by=['host'])\")" type = string default = "" } -variable "file_descriptors_usage_transformation_function" { - description = "Transformation function for file_descriptors_usage detector (i.e. \".mean(over='5m')\")" +variable "file_descriptors_transformation_function" { + description = "Transformation function for file_descriptors detector (i.e. \".mean(over='5m')\")" type = string default = ".max(over='15m')" } -variable "file_descriptors_usage_max_delay" { - description = "Enforce max delay for file_descriptors_usage detector (use \"0\" or \"null\" for \"Auto\")" +variable "file_descriptors_max_delay" { + description = "Enforce max delay for file_descriptors detector (use \"0\" or \"null\" for \"Auto\")" type = number default = null } -variable "file_descriptors_usage_tip" { +variable "file_descriptors_tip" { description = "Suggested first course of action or any note useful for incident handling" type = string default = "" } -variable "file_descriptors_usage_runbook_url" { +variable "file_descriptors_runbook_url" { description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" type = string default = "" } -variable "file_descriptors_usage_disabled" { - description = "Disable all alerting rules for file_descriptors_usage detector" +variable "file_descriptors_disabled" { + description = "Disable all alerting rules for file_descriptors detector" type = bool default = null } -variable "file_descriptors_usage_disabled_critical" { - description = "Disable critical alerting 
rule for file_descriptors_usage detector" +variable "file_descriptors_disabled_critical" { + description = "Disable critical alerting rule for file_descriptors detector" type = bool default = null } -variable "file_descriptors_usage_disabled_major" { - description = "Disable major alerting rule for file_descriptors_usage detector" +variable "file_descriptors_disabled_major" { + description = "Disable major alerting rule for file_descriptors detector" type = bool default = null } -variable "file_descriptors_usage_threshold_critical" { - description = "Critical threshold for file_descriptors_usage detector" +variable "file_descriptors_threshold_critical" { + description = "Critical threshold for file_descriptors detector" type = number default = 95 } -variable "file_descriptors_usage_lasting_duration_critical" { +variable "file_descriptors_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string default = null } -variable "file_descriptors_usage_at_least_percentage_critical" { +variable "file_descriptors_at_least_percentage_critical" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -variable "file_descriptors_usage_threshold_major" { - description = "Major threshold for file_descriptors_usage detector" +variable "file_descriptors_threshold_major" { + description = "Major threshold for file_descriptors detector" type = number default = 90 } -variable "file_descriptors_usage_lasting_duration_major" { +variable "file_descriptors_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string default = null } -variable "file_descriptors_usage_at_least_percentage_major" { +variable "file_descriptors_at_least_percentage_major" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 
@@ -942,182 +942,182 @@ variable "jvm_memory_old_usage_at_least_percentage_minor" { type = number default = 1 } -# old-generation_garbage_collections_latency detector +# jvm_gc_old_collection_latency detector -variable "old-generation_garbage_collections_latency_notifications" { - description = "Notification recipients list per severity overridden for old-generation_garbage_collections_latency detector" +variable "jvm_gc_old_collection_latency_notifications" { + description = "Notification recipients list per severity overridden for jvm_gc_old_collection_latency detector" type = map(list(string)) default = {} } -variable "old-generation_garbage_collections_latency_aggregation_function" { - description = "Aggregation function and group by for old-generation_garbage_collections_latency detector (i.e. \".mean(by=['host'])\")" +variable "jvm_gc_old_collection_latency_aggregation_function" { + description = "Aggregation function and group by for jvm_gc_old_collection_latency detector (i.e. \".mean(by=['host'])\")" type = string default = "" } -variable "old-generation_garbage_collections_latency_transformation_function" { - description = "Transformation function for old-generation_garbage_collections_latency detector (i.e. \".mean(over='5m')\")" +variable "jvm_gc_old_collection_latency_transformation_function" { + description = "Transformation function for jvm_gc_old_collection_latency detector (i.e. 
\".mean(over='5m')\")" type = string default = ".mean(over='15m')" } -variable "old-generation_garbage_collections_latency_max_delay" { - description = "Enforce max delay for old-generation_garbage_collections_latency detector (use \"0\" or \"null\" for \"Auto\")" +variable "jvm_gc_old_collection_latency_max_delay" { + description = "Enforce max delay for jvm_gc_old_collection_latency detector (use \"0\" or \"null\" for \"Auto\")" type = number default = null } -variable "old-generation_garbage_collections_latency_tip" { +variable "jvm_gc_old_collection_latency_tip" { description = "Suggested first course of action or any note useful for incident handling" type = string default = "" } -variable "old-generation_garbage_collections_latency_runbook_url" { +variable "jvm_gc_old_collection_latency_runbook_url" { description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" type = string default = "" } -variable "old-generation_garbage_collections_latency_disabled" { - description = "Disable all alerting rules for old-generation_garbage_collections_latency detector" +variable "jvm_gc_old_collection_latency_disabled" { + description = "Disable all alerting rules for jvm_gc_old_collection_latency detector" type = bool default = null } -variable "old-generation_garbage_collections_latency_disabled_major" { - description = "Disable major alerting rule for old-generation_garbage_collections_latency detector" +variable "jvm_gc_old_collection_latency_disabled_major" { + description = "Disable major alerting rule for jvm_gc_old_collection_latency detector" type = bool default = null } -variable "old-generation_garbage_collections_latency_disabled_minor" { - description = "Disable minor alerting rule for old-generation_garbage_collections_latency detector" +variable "jvm_gc_old_collection_latency_disabled_minor" { + description = "Disable minor alerting rule for jvm_gc_old_collection_latency detector" type = bool default = null } 
-variable "old-generation_garbage_collections_latency_threshold_major" { - description = "Major threshold for old-generation_garbage_collections_latency detector" +variable "jvm_gc_old_collection_latency_threshold_major" { + description = "Major threshold for jvm_gc_old_collection_latency detector" type = number default = 300 } -variable "old-generation_garbage_collections_latency_lasting_duration_major" { +variable "jvm_gc_old_collection_latency_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string default = null } -variable "old-generation_garbage_collections_latency_at_least_percentage_major" { +variable "jvm_gc_old_collection_latency_at_least_percentage_major" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -variable "old-generation_garbage_collections_latency_threshold_minor" { - description = "Minor threshold for old-generation_garbage_collections_latency detector" +variable "jvm_gc_old_collection_latency_threshold_minor" { + description = "Minor threshold for jvm_gc_old_collection_latency detector" type = number default = 200 } -variable "old-generation_garbage_collections_latency_lasting_duration_minor" { +variable "jvm_gc_old_collection_latency_lasting_duration_minor" { description = "Minimum duration that conditions must be true before raising alert" type = string default = null } -variable "old-generation_garbage_collections_latency_at_least_percentage_minor" { +variable "jvm_gc_old_collection_latency_at_least_percentage_minor" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -# young-generation_garbage_collections_latency detector +# jvm_gc_young_collection_latency detector -variable "young-generation_garbage_collections_latency_notifications" { - description = "Notification recipients list per severity 
overridden for young-generation_garbage_collections_latency detector" +variable "jvm_gc_young_collection_latency_notifications" { + description = "Notification recipients list per severity overridden for jvm_gc_young_collection_latency detector" type = map(list(string)) default = {} } -variable "young-generation_garbage_collections_latency_aggregation_function" { - description = "Aggregation function and group by for young-generation_garbage_collections_latency detector (i.e. \".mean(by=['host'])\")" +variable "jvm_gc_young_collection_latency_aggregation_function" { + description = "Aggregation function and group by for jvm_gc_young_collection_latency detector (i.e. \".mean(by=['host'])\")" type = string default = "" } -variable "young-generation_garbage_collections_latency_transformation_function" { - description = "Transformation function for young-generation_garbage_collections_latency detector (i.e. \".mean(over='5m')\")" +variable "jvm_gc_young_collection_latency_transformation_function" { + description = "Transformation function for jvm_gc_young_collection_latency detector (i.e. 
\".mean(over='5m')\")" type = string default = ".mean(over='15m')" } -variable "young-generation_garbage_collections_latency_max_delay" { - description = "Enforce max delay for young-generation_garbage_collections_latency detector (use \"0\" or \"null\" for \"Auto\")" +variable "jvm_gc_young_collection_latency_max_delay" { + description = "Enforce max delay for jvm_gc_young_collection_latency detector (use \"0\" or \"null\" for \"Auto\")" type = number default = null } -variable "young-generation_garbage_collections_latency_tip" { +variable "jvm_gc_young_collection_latency_tip" { description = "Suggested first course of action or any note useful for incident handling" type = string default = "" } -variable "young-generation_garbage_collections_latency_runbook_url" { +variable "jvm_gc_young_collection_latency_runbook_url" { description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" type = string default = "" } -variable "young-generation_garbage_collections_latency_disabled" { - description = "Disable all alerting rules for young-generation_garbage_collections_latency detector" +variable "jvm_gc_young_collection_latency_disabled" { + description = "Disable all alerting rules for jvm_gc_young_collection_latency detector" type = bool default = null } -variable "young-generation_garbage_collections_latency_disabled_major" { - description = "Disable major alerting rule for young-generation_garbage_collections_latency detector" +variable "jvm_gc_young_collection_latency_disabled_major" { + description = "Disable major alerting rule for jvm_gc_young_collection_latency detector" type = bool default = null } -variable "young-generation_garbage_collections_latency_disabled_minor" { - description = "Disable minor alerting rule for young-generation_garbage_collections_latency detector" +variable "jvm_gc_young_collection_latency_disabled_minor" { + description = "Disable minor alerting rule for jvm_gc_young_collection_latency 
detector" type = bool default = null } -variable "young-generation_garbage_collections_latency_threshold_major" { - description = "Major threshold for young-generation_garbage_collections_latency detector" +variable "jvm_gc_young_collection_latency_threshold_major" { + description = "Major threshold for jvm_gc_young_collection_latency detector" type = number default = 40 } -variable "young-generation_garbage_collections_latency_lasting_duration_major" { +variable "jvm_gc_young_collection_latency_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string default = null } -variable "young-generation_garbage_collections_latency_at_least_percentage_major" { +variable "jvm_gc_young_collection_latency_at_least_percentage_major" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -variable "young-generation_garbage_collections_latency_threshold_minor" { - description = "Minor threshold for young-generation_garbage_collections_latency detector" +variable "jvm_gc_young_collection_latency_threshold_minor" { + description = "Minor threshold for jvm_gc_young_collection_latency detector" type = number default = 20 } -variable "young-generation_garbage_collections_latency_lasting_duration_minor" { +variable "jvm_gc_young_collection_latency_lasting_duration_minor" { description = "Minimum duration that conditions must be true before raising alert" type = string default = null } -variable "young-generation_garbage_collections_latency_at_least_percentage_minor" { +variable "jvm_gc_young_collection_latency_at_least_percentage_minor" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 @@ -1212,452 +1212,452 @@ variable "indexing_latency_at_least_percentage_minor" { type = number default = 1 } -# index_flushing_to_disk_latency detector +# flushing_latency 
detector -variable "index_flushing_to_disk_latency_notifications" { - description = "Notification recipients list per severity overridden for index_flushing_to_disk_latency detector" +variable "flushing_latency_notifications" { + description = "Notification recipients list per severity overridden for flushing_latency detector" type = map(list(string)) default = {} } -variable "index_flushing_to_disk_latency_aggregation_function" { - description = "Aggregation function and group by for index_flushing_to_disk_latency detector (i.e. \".mean(by=['host'])\")" +variable "flushing_latency_aggregation_function" { + description = "Aggregation function and group by for flushing_latency detector (i.e. \".mean(by=['host'])\")" type = string default = "" } -variable "index_flushing_to_disk_latency_transformation_function" { - description = "Transformation function for index_flushing_to_disk_latency detector (i.e. \".mean(over='5m')\")" +variable "flushing_latency_transformation_function" { + description = "Transformation function for flushing_latency detector (i.e. 
\".mean(over='5m')\")" type = string default = ".mean(over='15m')" } -variable "index_flushing_to_disk_latency_max_delay" { - description = "Enforce max delay for index_flushing_to_disk_latency detector (use \"0\" or \"null\" for \"Auto\")" +variable "flushing_latency_max_delay" { + description = "Enforce max delay for flushing_latency detector (use \"0\" or \"null\" for \"Auto\")" type = number default = null } -variable "index_flushing_to_disk_latency_tip" { +variable "flushing_latency_tip" { description = "Suggested first course of action or any note useful for incident handling" type = string default = "" } -variable "index_flushing_to_disk_latency_runbook_url" { +variable "flushing_latency_runbook_url" { description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" type = string default = "" } -variable "index_flushing_to_disk_latency_disabled" { - description = "Disable all alerting rules for index_flushing_to_disk_latency detector" +variable "flushing_latency_disabled" { + description = "Disable all alerting rules for flushing_latency detector" type = bool default = null } -variable "index_flushing_to_disk_latency_disabled_major" { - description = "Disable major alerting rule for index_flushing_to_disk_latency detector" +variable "flushing_latency_disabled_major" { + description = "Disable major alerting rule for flushing_latency detector" type = bool default = null } -variable "index_flushing_to_disk_latency_disabled_minor" { - description = "Disable minor alerting rule for index_flushing_to_disk_latency detector" +variable "flushing_latency_disabled_minor" { + description = "Disable minor alerting rule for flushing_latency detector" type = bool default = null } -variable "index_flushing_to_disk_latency_threshold_major" { - description = "Major threshold for index_flushing_to_disk_latency detector" +variable "flushing_latency_threshold_major" { + description = "Major threshold for flushing_latency detector" type 
= number default = 150 } -variable "index_flushing_to_disk_latency_lasting_duration_major" { +variable "flushing_latency_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string default = null } -variable "index_flushing_to_disk_latency_at_least_percentage_major" { +variable "flushing_latency_at_least_percentage_major" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -variable "index_flushing_to_disk_latency_threshold_minor" { - description = "Minor threshold for index_flushing_to_disk_latency detector" +variable "flushing_latency_threshold_minor" { + description = "Minor threshold for flushing_latency detector" type = number default = 100 } -variable "index_flushing_to_disk_latency_lasting_duration_minor" { +variable "flushing_latency_lasting_duration_minor" { description = "Minimum duration that conditions must be true before raising alert" type = string default = null } -variable "index_flushing_to_disk_latency_at_least_percentage_minor" { +variable "flushing_latency_at_least_percentage_minor" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -# search_query_latency detector +# search_latency detector -variable "search_query_latency_notifications" { - description = "Notification recipients list per severity overridden for search_query_latency detector" +variable "search_latency_notifications" { + description = "Notification recipients list per severity overridden for search_latency detector" type = map(list(string)) default = {} } -variable "search_query_latency_aggregation_function" { - description = "Aggregation function and group by for search_query_latency detector (i.e. 
\".mean(by=['host'])\")" +variable "search_latency_aggregation_function" { + description = "Aggregation function and group by for search_latency detector (i.e. \".mean(by=['host'])\")" type = string default = "" } -variable "search_query_latency_transformation_function" { - description = "Transformation function for search_query_latency detector (i.e. \".mean(over='5m')\")" +variable "search_latency_transformation_function" { + description = "Transformation function for search_latency detector (i.e. \".mean(over='5m')\")" type = string default = ".min(over='30m')" } -variable "search_query_latency_max_delay" { - description = "Enforce max delay for search_query_latency detector (use \"0\" or \"null\" for \"Auto\")" +variable "search_latency_max_delay" { + description = "Enforce max delay for search_latency detector (use \"0\" or \"null\" for \"Auto\")" type = number default = null } -variable "search_query_latency_tip" { +variable "search_latency_tip" { description = "Suggested first course of action or any note useful for incident handling" type = string default = "" } -variable "search_query_latency_runbook_url" { +variable "search_latency_runbook_url" { description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" type = string default = "" } -variable "search_query_latency_disabled" { - description = "Disable all alerting rules for search_query_latency detector" +variable "search_latency_disabled" { + description = "Disable all alerting rules for search_latency detector" type = bool default = null } -variable "search_query_latency_disabled_major" { - description = "Disable major alerting rule for search_query_latency detector" +variable "search_latency_disabled_major" { + description = "Disable major alerting rule for search_latency detector" type = bool default = null } -variable "search_query_latency_disabled_minor" { - description = "Disable minor alerting rule for search_query_latency detector" +variable 
"search_latency_disabled_minor" { + description = "Disable minor alerting rule for search_latency detector" type = bool default = null } -variable "search_query_latency_threshold_major" { - description = "Major threshold for search_query_latency detector" +variable "search_latency_threshold_major" { + description = "Major threshold for search_latency detector" type = number default = 20 } -variable "search_query_latency_lasting_duration_major" { +variable "search_latency_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string default = null } -variable "search_query_latency_at_least_percentage_major" { +variable "search_latency_at_least_percentage_major" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -variable "search_query_latency_threshold_minor" { - description = "Minor threshold for search_query_latency detector" +variable "search_latency_threshold_minor" { + description = "Minor threshold for search_latency detector" type = number default = 10 } -variable "search_query_latency_lasting_duration_minor" { +variable "search_latency_lasting_duration_minor" { description = "Minimum duration that conditions must be true before raising alert" type = string default = null } -variable "search_query_latency_at_least_percentage_minor" { +variable "search_latency_at_least_percentage_minor" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -# search_fetch_latency detector +# fetch_latency detector -variable "search_fetch_latency_notifications" { - description = "Notification recipients list per severity overridden for search_fetch_latency detector" +variable "fetch_latency_notifications" { + description = "Notification recipients list per severity overridden for fetch_latency detector" type = map(list(string)) default = {} } 
-variable "search_fetch_latency_aggregation_function" { - description = "Aggregation function and group by for search_fetch_latency detector (i.e. \".mean(by=['host'])\")" +variable "fetch_latency_aggregation_function" { + description = "Aggregation function and group by for fetch_latency detector (i.e. \".mean(by=['host'])\")" type = string default = "" } -variable "search_fetch_latency_transformation_function" { - description = "Transformation function for search_fetch_latency detector (i.e. \".mean(over='5m')\")" +variable "fetch_latency_transformation_function" { + description = "Transformation function for fetch_latency detector (i.e. \".mean(over='5m')\")" type = string default = ".min(over='15m')" } -variable "search_fetch_latency_max_delay" { - description = "Enforce max delay for search_fetch_latency detector (use \"0\" or \"null\" for \"Auto\")" +variable "fetch_latency_max_delay" { + description = "Enforce max delay for fetch_latency detector (use \"0\" or \"null\" for \"Auto\")" type = number default = null } -variable "search_fetch_latency_tip" { +variable "fetch_latency_tip" { description = "Suggested first course of action or any note useful for incident handling" type = string default = "" } -variable "search_fetch_latency_runbook_url" { +variable "fetch_latency_runbook_url" { description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" type = string default = "" } -variable "search_fetch_latency_disabled" { - description = "Disable all alerting rules for search_fetch_latency detector" +variable "fetch_latency_disabled" { + description = "Disable all alerting rules for fetch_latency detector" type = bool default = null } -variable "search_fetch_latency_disabled_major" { - description = "Disable major alerting rule for search_fetch_latency detector" +variable "fetch_latency_disabled_major" { + description = "Disable major alerting rule for fetch_latency detector" type = bool default = null } -variable 
"search_fetch_latency_disabled_minor" { - description = "Disable minor alerting rule for search_fetch_latency detector" +variable "fetch_latency_disabled_minor" { + description = "Disable minor alerting rule for fetch_latency detector" type = bool default = null } -variable "search_fetch_latency_threshold_major" { - description = "Major threshold for search_fetch_latency detector" +variable "fetch_latency_threshold_major" { + description = "Major threshold for fetch_latency detector" type = number default = 20 } -variable "search_fetch_latency_lasting_duration_major" { +variable "fetch_latency_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string default = null } -variable "search_fetch_latency_at_least_percentage_major" { +variable "fetch_latency_at_least_percentage_major" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -variable "search_fetch_latency_threshold_minor" { - description = "Minor threshold for search_fetch_latency detector" +variable "fetch_latency_threshold_minor" { + description = "Minor threshold for fetch_latency detector" type = number default = 10 } -variable "search_fetch_latency_lasting_duration_minor" { +variable "fetch_latency_lasting_duration_minor" { description = "Minimum duration that conditions must be true before raising alert" type = string default = null } -variable "search_fetch_latency_at_least_percentage_minor" { +variable "fetch_latency_at_least_percentage_minor" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -# fielddata_cache_evictions_rate_of_change detector +# field_data_evictions_change detector -variable "fielddata_cache_evictions_rate_of_change_notifications" { - description = "Notification recipients list per severity overridden for fielddata_cache_evictions_rate_of_change 
detector" +variable "field_data_evictions_change_notifications" { + description = "Notification recipients list per severity overridden for field_data_evictions_change detector" type = map(list(string)) default = {} } -variable "fielddata_cache_evictions_rate_of_change_aggregation_function" { - description = "Aggregation function and group by for fielddata_cache_evictions_rate_of_change detector (i.e. \".mean(by=['host'])\")" +variable "field_data_evictions_change_aggregation_function" { + description = "Aggregation function and group by for field_data_evictions_change detector (i.e. \".mean(by=['host'])\")" type = string default = "" } -variable "fielddata_cache_evictions_rate_of_change_transformation_function" { - description = "Transformation function for fielddata_cache_evictions_rate_of_change detector (i.e. \".mean(over='5m')\")" +variable "field_data_evictions_change_transformation_function" { + description = "Transformation function for field_data_evictions_change detector (i.e. 
\".mean(over='5m')\")" type = string default = ".mean(over='15m')" } -variable "fielddata_cache_evictions_rate_of_change_max_delay" { - description = "Enforce max delay for fielddata_cache_evictions_rate_of_change detector (use \"0\" or \"null\" for \"Auto\")" +variable "field_data_evictions_change_max_delay" { + description = "Enforce max delay for field_data_evictions_change detector (use \"0\" or \"null\" for \"Auto\")" type = number default = null } -variable "fielddata_cache_evictions_rate_of_change_tip" { +variable "field_data_evictions_change_tip" { description = "Suggested first course of action or any note useful for incident handling" type = string default = "" } -variable "fielddata_cache_evictions_rate_of_change_runbook_url" { +variable "field_data_evictions_change_runbook_url" { description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" type = string default = "" } -variable "fielddata_cache_evictions_rate_of_change_disabled" { - description = "Disable all alerting rules for fielddata_cache_evictions_rate_of_change detector" +variable "field_data_evictions_change_disabled" { + description = "Disable all alerting rules for field_data_evictions_change detector" type = bool default = null } -variable "fielddata_cache_evictions_rate_of_change_disabled_major" { - description = "Disable major alerting rule for fielddata_cache_evictions_rate_of_change detector" +variable "field_data_evictions_change_disabled_major" { + description = "Disable major alerting rule for field_data_evictions_change detector" type = bool default = null } -variable "fielddata_cache_evictions_rate_of_change_disabled_minor" { - description = "Disable minor alerting rule for fielddata_cache_evictions_rate_of_change detector" +variable "field_data_evictions_change_disabled_minor" { + description = "Disable minor alerting rule for field_data_evictions_change detector" type = bool default = null } -variable 
"fielddata_cache_evictions_rate_of_change_threshold_major" { - description = "Major threshold for fielddata_cache_evictions_rate_of_change detector" +variable "field_data_evictions_change_threshold_major" { + description = "Major threshold for field_data_evictions_change detector" type = number default = 120 } -variable "fielddata_cache_evictions_rate_of_change_lasting_duration_major" { +variable "field_data_evictions_change_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string default = null } -variable "fielddata_cache_evictions_rate_of_change_at_least_percentage_major" { +variable "field_data_evictions_change_at_least_percentage_major" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -variable "fielddata_cache_evictions_rate_of_change_threshold_minor" { - description = "Minor threshold for fielddata_cache_evictions_rate_of_change detector" +variable "field_data_evictions_change_threshold_minor" { + description = "Minor threshold for field_data_evictions_change detector" type = number default = 60 } -variable "fielddata_cache_evictions_rate_of_change_lasting_duration_minor" { +variable "field_data_evictions_change_lasting_duration_minor" { description = "Minimum duration that conditions must be true before raising alert" type = string default = null } -variable "fielddata_cache_evictions_rate_of_change_at_least_percentage_minor" { +variable "field_data_evictions_change_at_least_percentage_minor" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -# max_time_spent_by_task_in_queue_rate_of_change detector +# task_time_in_queue_change detector -variable "max_time_spent_by_task_in_queue_rate_of_change_notifications" { - description = "Notification recipients list per severity overridden for 
max_time_spent_by_task_in_queue_rate_of_change detector" +variable "task_time_in_queue_change_notifications" { + description = "Notification recipients list per severity overridden for task_time_in_queue_change detector" type = map(list(string)) default = {} } -variable "max_time_spent_by_task_in_queue_rate_of_change_aggregation_function" { - description = "Aggregation function and group by for max_time_spent_by_task_in_queue_rate_of_change detector (i.e. \".mean(by=['host'])\")" +variable "task_time_in_queue_change_aggregation_function" { + description = "Aggregation function and group by for task_time_in_queue_change detector (i.e. \".mean(by=['host'])\")" type = string default = "" } -variable "max_time_spent_by_task_in_queue_rate_of_change_transformation_function" { - description = "Transformation function for max_time_spent_by_task_in_queue_rate_of_change detector (i.e. \".mean(over='5m')\")" +variable "task_time_in_queue_change_transformation_function" { + description = "Transformation function for task_time_in_queue_change detector (i.e. 
\".mean(over='5m')\")" type = string default = ".mean(over='15m')" } -variable "max_time_spent_by_task_in_queue_rate_of_change_max_delay" { - description = "Enforce max delay for max_time_spent_by_task_in_queue_rate_of_change detector (use \"0\" or \"null\" for \"Auto\")" +variable "task_time_in_queue_change_max_delay" { + description = "Enforce max delay for task_time_in_queue_change detector (use \"0\" or \"null\" for \"Auto\")" type = number default = null } -variable "max_time_spent_by_task_in_queue_rate_of_change_tip" { +variable "task_time_in_queue_change_tip" { description = "Suggested first course of action or any note useful for incident handling" type = string default = "" } -variable "max_time_spent_by_task_in_queue_rate_of_change_runbook_url" { +variable "task_time_in_queue_change_runbook_url" { description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" type = string default = "" } -variable "max_time_spent_by_task_in_queue_rate_of_change_disabled" { - description = "Disable all alerting rules for max_time_spent_by_task_in_queue_rate_of_change detector" +variable "task_time_in_queue_change_disabled" { + description = "Disable all alerting rules for task_time_in_queue_change detector" type = bool default = null } -variable "max_time_spent_by_task_in_queue_rate_of_change_disabled_major" { - description = "Disable major alerting rule for max_time_spent_by_task_in_queue_rate_of_change detector" +variable "task_time_in_queue_change_disabled_major" { + description = "Disable major alerting rule for task_time_in_queue_change detector" type = bool default = null } -variable "max_time_spent_by_task_in_queue_rate_of_change_disabled_minor" { - description = "Disable minor alerting rule for max_time_spent_by_task_in_queue_rate_of_change detector" +variable "task_time_in_queue_change_disabled_minor" { + description = "Disable minor alerting rule for task_time_in_queue_change detector" type = bool default = null } 
-variable "max_time_spent_by_task_in_queue_rate_of_change_threshold_major" { - description = "Major threshold for max_time_spent_by_task_in_queue_rate_of_change detector" +variable "task_time_in_queue_change_threshold_major" { + description = "Major threshold for task_time_in_queue_change detector" type = number default = 200 } -variable "max_time_spent_by_task_in_queue_rate_of_change_lasting_duration_major" { +variable "task_time_in_queue_change_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string default = null } -variable "max_time_spent_by_task_in_queue_rate_of_change_at_least_percentage_major" { +variable "task_time_in_queue_change_at_least_percentage_major" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -variable "max_time_spent_by_task_in_queue_rate_of_change_threshold_minor" { - description = "Minor threshold for max_time_spent_by_task_in_queue_rate_of_change detector" +variable "task_time_in_queue_change_threshold_minor" { + description = "Minor threshold for task_time_in_queue_change detector" type = number default = 100 } -variable "max_time_spent_by_task_in_queue_rate_of_change_lasting_duration_minor" { +variable "task_time_in_queue_change_lasting_duration_minor" { description = "Minimum duration that conditions must be true before raising alert" type = string default = null } -variable "max_time_spent_by_task_in_queue_rate_of_change_at_least_percentage_minor" { +variable "task_time_in_queue_change_at_least_percentage_minor" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 From 0da82709310a79b309bfd1a8823cabfe6fca0bb7 Mon Sep 17 00:00:00 2001 From: Gauthier AMPE Date: Wed, 10 Jan 2024 11:06:05 +0100 Subject: [PATCH 4/6] correction --- docs/severity.md | 2 +- modules/smart-agent_elasticsearch/README.md | 2 +- 
.../conf/00-heartbeat.yaml | 1 + .../conf/01-cluster-status.yaml | 8 +- .../conf/02-cluster_initializing_shards.yaml | 2 + .../conf/03-cluster_relocating_shards.yaml | 8 +- .../conf/04-cluster_unassigned_shards.yaml | 4 + .../conf/05-cluster_pending_tasks.yaml | 3 + .../conf/06-cluster_cpu_usage.yaml | 4 +- .../conf/07-cluster_file_descriptor.yaml | 4 +- .../conf/08-cluster_JVM_heap_memory.yaml | 4 +- .../09-cluster_JVM_memory_young_usage.yaml | 4 +- .../conf/10-cluster_JVM_memory_old_usage.yaml | 4 +- ...eneration_garbage_collections_latency.yaml | 4 +- ...eneration_garbage_collections_latency.yaml | 4 +- .../conf/13-cluster_indexing_latency.yaml | 4 +- .../conf/14-cluster_flush_latency.yaml | 8 +- .../conf/15-cluster_search_latency.yaml | 4 +- .../conf/16-cluster_fetch_latency.yaml | 4 +- ...luster_fielddata_cache_evictions_rate.yaml | 4 +- .../conf/18-cluster_time_in_queue_change.yaml | 3 + .../detectors-gen.tf | 78 +++++----- modules/smart-agent_elasticsearch/outputs.tf | 6 +- .../variables-gen.tf | 136 +++++++++--------- 24 files changed, 177 insertions(+), 128 deletions(-) diff --git a/docs/severity.md b/docs/severity.md index b2f09502a..c562bbd69 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -989,7 +989,7 @@ |ElasticSearch jvm gc old collection latency|-|X|X|-|-| |ElasticSearch jvm gc young collection latency|-|X|X|-|-| |ElasticSearch indexing latency|-|X|X|-|-| -|ElasticSearch flushing latency|-|X|X|-|-| +|ElasticSearch flush latency|-|X|X|-|-| |ElasticSearch search latency|-|X|X|-|-| |ElasticSearch fetch latency|-|X|X|-|-| |ElasticSearch field_data evictions change|-|X|X|-|-| diff --git a/modules/smart-agent_elasticsearch/README.md b/modules/smart-agent_elasticsearch/README.md index 2f944c3b9..217d6e21c 100644 --- a/modules/smart-agent_elasticsearch/README.md +++ b/modules/smart-agent_elasticsearch/README.md @@ -91,7 +91,7 @@ This module creates the following SignalFx detectors which could contain one or |ElasticSearch jvm gc old collection 
latency|-|X|X|-|-| |ElasticSearch jvm gc young collection latency|-|X|X|-|-| |ElasticSearch indexing latency|-|X|X|-|-| -|ElasticSearch flushing latency|-|X|X|-|-| +|ElasticSearch flush latency|-|X|X|-|-| |ElasticSearch search latency|-|X|X|-|-| |ElasticSearch fetch latency|-|X|X|-|-| |ElasticSearch field_data evictions change|-|X|X|-|-| diff --git a/modules/smart-agent_elasticsearch/conf/00-heartbeat.yaml b/modules/smart-agent_elasticsearch/conf/00-heartbeat.yaml index e4e662a69..4eddf7003 100644 --- a/modules/smart-agent_elasticsearch/conf/00-heartbeat.yaml +++ b/modules/smart-agent_elasticsearch/conf/00-heartbeat.yaml @@ -4,6 +4,7 @@ name: heartbeat transformation: false aggregation: true exclude_not_running_vm: true +filtering: "filter('plugin', 'elasticsearch')" signals: signal: diff --git a/modules/smart-agent_elasticsearch/conf/01-cluster-status.yaml b/modules/smart-agent_elasticsearch/conf/01-cluster-status.yaml index eb7d9cd94..6e354211e 100644 --- a/modules/smart-agent_elasticsearch/conf/01-cluster-status.yaml +++ b/modules/smart-agent_elasticsearch/conf/01-cluster-status.yaml @@ -2,16 +2,18 @@ module: ElasticSearch name: "cluster status" aggregation: true transformation: ".mean(over='5m')" +filtering: "filter('plugin', 'elasticsearch')" signals: signal: metric: "elasticsearch.cluster.status" rules: critical: - threshold: 1 + threshold: 2 comparator: "==" description: "is red" + lasting_duration: '5m' major: - threshold: 2 + threshold: 1 comparator: "==" - dependency: critical description: "is yellow" + lasting_duration: '5m' diff --git a/modules/smart-agent_elasticsearch/conf/02-cluster_initializing_shards.yaml b/modules/smart-agent_elasticsearch/conf/02-cluster_initializing_shards.yaml index de406b1cb..bfbca53f6 100644 --- a/modules/smart-agent_elasticsearch/conf/02-cluster_initializing_shards.yaml +++ b/modules/smart-agent_elasticsearch/conf/02-cluster_initializing_shards.yaml @@ -12,8 +12,10 @@ rules: threshold: 1 comparator: ">" description: "is too 
high" + lasting_duration: '15m' major: threshold: 0 comparator: ">" dependency: critical description: "is too high" + lasting_duration: '15m' diff --git a/modules/smart-agent_elasticsearch/conf/03-cluster_relocating_shards.yaml b/modules/smart-agent_elasticsearch/conf/03-cluster_relocating_shards.yaml index ba88e652d..05ff4bd09 100644 --- a/modules/smart-agent_elasticsearch/conf/03-cluster_relocating_shards.yaml +++ b/modules/smart-agent_elasticsearch/conf/03-cluster_relocating_shards.yaml @@ -2,17 +2,21 @@ module: ElasticSearch name: "cluster relocating shards" aggregation: true transformation: ".min(over='15m')" +filtering: "filter('plugin', 'elasticsearch')" signals: signal: metric: "elasticsearch.cluster.relocating-shards" rollup: average rules: critical: - threshold: 1 + threshold: 0 comparator: ">" description: "is too high" + lasting_duration: '15m' major: - threshold: 0 + threshold: -1 comparator: ">" dependency: critical description: "is too high" + lasting_duration: '15m' + disabled: true diff --git a/modules/smart-agent_elasticsearch/conf/04-cluster_unassigned_shards.yaml b/modules/smart-agent_elasticsearch/conf/04-cluster_unassigned_shards.yaml index 2bb6203ae..d38bad121 100644 --- a/modules/smart-agent_elasticsearch/conf/04-cluster_unassigned_shards.yaml +++ b/modules/smart-agent_elasticsearch/conf/04-cluster_unassigned_shards.yaml @@ -2,6 +2,7 @@ module: ElasticSearch name: "cluster unassigned shards" aggregation: true transformation: ".min(over='10m')" +filtering: "filter('plugin', 'elasticsearch')" signals: signal: metric: "elasticsearch.cluster.unassigned-shards" @@ -11,8 +12,11 @@ rules: threshold: 0 comparator: ">" description: "is too high" + lasting_duration: '10m' major: threshold: -1 comparator: ">" dependency: critical description: "is too high" + lasting_duration: '10m' + disabled: true \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/05-cluster_pending_tasks.yaml 
b/modules/smart-agent_elasticsearch/conf/05-cluster_pending_tasks.yaml index da7ae06cd..d152c5f2f 100644 --- a/modules/smart-agent_elasticsearch/conf/05-cluster_pending_tasks.yaml +++ b/modules/smart-agent_elasticsearch/conf/05-cluster_pending_tasks.yaml @@ -2,6 +2,7 @@ module: ElasticSearch name: "pending tasks" aggregation: true transformation: ".min(over='15m')" +filtering: "filter('plugin', 'elasticsearch')" signals: signal: metric: "elasticsearch.cluster.pending-tasks" @@ -11,8 +12,10 @@ rules: threshold: 5 comparator: ">" description: "are too high" + lasting_duration: '15m' major: threshold: 0 comparator: ">" dependency: critical description: "are too high" + lasting_duration: '15m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/06-cluster_cpu_usage.yaml b/modules/smart-agent_elasticsearch/conf/06-cluster_cpu_usage.yaml index 4869ffb98..cf5421821 100644 --- a/modules/smart-agent_elasticsearch/conf/06-cluster_cpu_usage.yaml +++ b/modules/smart-agent_elasticsearch/conf/06-cluster_cpu_usage.yaml @@ -2,7 +2,7 @@ module: ElasticSearch name: "cpu usage" aggregation: true transformation: ".min(over='30m')" -filtering: "filter('node_name', '*')" +filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: signal: metric: "elasticsearch.process.cpu.percent" @@ -12,8 +12,10 @@ rules: threshold: 95 comparator: ">" description: "is too high" + lasting_duration: '30m' major: threshold: 85 comparator: ">" dependency: critical description: "is too high" + lasting_duration: '30m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/07-cluster_file_descriptor.yaml b/modules/smart-agent_elasticsearch/conf/07-cluster_file_descriptor.yaml index 36a0ddb46..e1c7894cb 100644 --- a/modules/smart-agent_elasticsearch/conf/07-cluster_file_descriptor.yaml +++ b/modules/smart-agent_elasticsearch/conf/07-cluster_file_descriptor.yaml @@ -2,7 +2,7 @@ module: ElasticSearch name: "file descriptors" 
aggregation: true transformation: ".max(over='15m')" -filtering: "filter('node_name', '*')" +filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: A: metric: "elasticsearch.process.open_file_descriptors" @@ -17,8 +17,10 @@ rules: threshold: 95 comparator: ">" description: "is too high" + lasting_duration: '15m' major: threshold: 90 comparator: ">" dependency: critical description: "is too high" + lasting_duration: '15m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/08-cluster_JVM_heap_memory.yaml b/modules/smart-agent_elasticsearch/conf/08-cluster_JVM_heap_memory.yaml index 946276c3f..483adc0ee 100644 --- a/modules/smart-agent_elasticsearch/conf/08-cluster_JVM_heap_memory.yaml +++ b/modules/smart-agent_elasticsearch/conf/08-cluster_JVM_heap_memory.yaml @@ -2,7 +2,7 @@ module: ElasticSearch name: "JVM heap memory usage" aggregation: true transformation: ".mean(over='5m')" -filtering: "filter('node_name', '*')" +filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: signal: metric: "elasticsearch.jvm.mem.heap-used-percent" @@ -12,8 +12,10 @@ rules: threshold: 90 comparator: ">" description: "is too high" + lasting_duration: '5m' major: threshold: 80 comparator: ">" dependency: critical description: "is too high" + lasting_duration: '5m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/09-cluster_JVM_memory_young_usage.yaml b/modules/smart-agent_elasticsearch/conf/09-cluster_JVM_memory_young_usage.yaml index f8940b635..d669c465a 100644 --- a/modules/smart-agent_elasticsearch/conf/09-cluster_JVM_memory_young_usage.yaml +++ b/modules/smart-agent_elasticsearch/conf/09-cluster_JVM_memory_young_usage.yaml @@ -2,7 +2,7 @@ module: ElasticSearch name: "JVM memory young usage" aggregation: true transformation: ".mean(over='10m')" -filtering: "filter('node_name', '*')" +filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: 
A: metric: "elasticsearch.jvm.mem.pools.young.used_in_bytes" @@ -17,8 +17,10 @@ rules: threshold: 90 comparator: ">" description: "is too high" + lasting_duration: '10m' minor: threshold: 80 comparator: ">" description: "is too high" dependency: major + lasting_duration: '10m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/10-cluster_JVM_memory_old_usage.yaml b/modules/smart-agent_elasticsearch/conf/10-cluster_JVM_memory_old_usage.yaml index 7be566329..ab617e0d1 100644 --- a/modules/smart-agent_elasticsearch/conf/10-cluster_JVM_memory_old_usage.yaml +++ b/modules/smart-agent_elasticsearch/conf/10-cluster_JVM_memory_old_usage.yaml @@ -2,7 +2,7 @@ module: ElasticSearch name: "JVM memory old usage" aggregation: true transformation: ".mean(over='10m')" -filtering: "filter('node_name', '*')" +filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: A: metric: "elasticsearch.jvm.mem.pools.old.used_in_bytes" @@ -17,8 +17,10 @@ rules: threshold: 90 comparator: ">" description: "is too high" + lasting_duration: '10m' minor: threshold: 80 comparator: ">" description: "is too high" dependency: major + lasting_duration: '10m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/11-cluster_old-generation_garbage_collections_latency.yaml b/modules/smart-agent_elasticsearch/conf/11-cluster_old-generation_garbage_collections_latency.yaml index 427ca8e46..65da726cd 100644 --- a/modules/smart-agent_elasticsearch/conf/11-cluster_old-generation_garbage_collections_latency.yaml +++ b/modules/smart-agent_elasticsearch/conf/11-cluster_old-generation_garbage_collections_latency.yaml @@ -2,7 +2,7 @@ module: ElasticSearch name: "jvm gc old collection latency" aggregation: true transformation: ".mean(over='15m')" -filtering: "filter('node_name', '*')" +filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: A: metric: "elasticsearch.jvm.gc.old-time" @@ -19,8 +19,10 @@ rules: 
threshold: 300 comparator: ">" description: "is too high" + lasting_duration: '15m' minor: threshold: 200 comparator: ">" description: "is too high" dependency: major + lasting_duration: '15m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/12-cluster_young-generation_garbage_collections_latency.yaml b/modules/smart-agent_elasticsearch/conf/12-cluster_young-generation_garbage_collections_latency.yaml index a497687aa..bc1f8bb7d 100644 --- a/modules/smart-agent_elasticsearch/conf/12-cluster_young-generation_garbage_collections_latency.yaml +++ b/modules/smart-agent_elasticsearch/conf/12-cluster_young-generation_garbage_collections_latency.yaml @@ -2,7 +2,7 @@ module: ElasticSearch name: "jvm gc young collection latency" aggregation: true transformation: ".mean(over='15m')" -filtering: "filter('node_name', '*')" +filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: A: metric: "elasticsearch.jvm.gc.time" @@ -19,8 +19,10 @@ rules: threshold: 40 comparator: ">" description: "is too high" + lasting_duration: '15m' minor: threshold: 20 comparator: ">" description: "is too high" dependency: major + lasting_duration: '15m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/13-cluster_indexing_latency.yaml b/modules/smart-agent_elasticsearch/conf/13-cluster_indexing_latency.yaml index a72ab1cce..b892a4314 100644 --- a/modules/smart-agent_elasticsearch/conf/13-cluster_indexing_latency.yaml +++ b/modules/smart-agent_elasticsearch/conf/13-cluster_indexing_latency.yaml @@ -2,7 +2,7 @@ module: ElasticSearch name: "indexing latency" aggregation: true transformation: ".mean(over='15m')" -filtering: "filter('node_name', '*')" +filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: A: metric: "elasticsearch.indices.indexing.index-time" @@ -19,8 +19,10 @@ rules: threshold: 30 comparator: ">" description: "is too high" + lasting_duration: '1h' minor: threshold: 15 
comparator: ">" description: "is too high" dependency: major + lasting_duration: '1h' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/14-cluster_flush_latency.yaml b/modules/smart-agent_elasticsearch/conf/14-cluster_flush_latency.yaml index b162c269e..0b5fa3bca 100644 --- a/modules/smart-agent_elasticsearch/conf/14-cluster_flush_latency.yaml +++ b/modules/smart-agent_elasticsearch/conf/14-cluster_flush_latency.yaml @@ -1,8 +1,8 @@ module: ElasticSearch -name: "flushing latency" +name: "flush latency" aggregation: true -transformation: ".mean(over='15m')" -filtering: "filter('node_name', '*')" +transformation: ".mean(over='30m')" +filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: A: metric: "elasticsearch.indices.flush.total-time" @@ -19,8 +19,10 @@ rules: threshold: 150 comparator: ">" description: "is too high" + lasting_duration: '15m' minor: threshold: 100 comparator: ">" description: "is too high" dependency: major + lasting_duration: '15m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/15-cluster_search_latency.yaml b/modules/smart-agent_elasticsearch/conf/15-cluster_search_latency.yaml index 0e592b254..c8e17fa29 100644 --- a/modules/smart-agent_elasticsearch/conf/15-cluster_search_latency.yaml +++ b/modules/smart-agent_elasticsearch/conf/15-cluster_search_latency.yaml @@ -2,7 +2,7 @@ module: ElasticSearch name: "search latency" aggregation: true transformation: ".min(over='30m')" -filtering: "filter('node_name', '*')" +filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: A: metric: "elasticsearch.indices.search.query-time" @@ -19,8 +19,10 @@ rules: threshold: 20 comparator: ">" description: "is too high" + lasting_duration: '30m' minor: threshold: 10 comparator: ">" description: "is too high" dependency: major + lasting_duration: '30m' \ No newline at end of file diff --git 
a/modules/smart-agent_elasticsearch/conf/16-cluster_fetch_latency.yaml b/modules/smart-agent_elasticsearch/conf/16-cluster_fetch_latency.yaml index cad903c49..34d4b7791 100644 --- a/modules/smart-agent_elasticsearch/conf/16-cluster_fetch_latency.yaml +++ b/modules/smart-agent_elasticsearch/conf/16-cluster_fetch_latency.yaml @@ -2,7 +2,7 @@ module: ElasticSearch name: "fetch latency" aggregation: true transformation: ".min(over='15m')" -filtering: "filter('node_name', '*')" +filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: A: metric: "elasticsearch.indices.search.fetch-time" @@ -19,8 +19,10 @@ rules: threshold: 20 comparator: ">" description: "is too high" + lasting_duration: '15m' minor: threshold: 10 comparator: ">" description: "is too high" dependency: major + lasting_duration: '15m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/17-cluster_fielddata_cache_evictions_rate.yaml b/modules/smart-agent_elasticsearch/conf/17-cluster_fielddata_cache_evictions_rate.yaml index ff160ea43..f0c3dbfe4 100644 --- a/modules/smart-agent_elasticsearch/conf/17-cluster_fielddata_cache_evictions_rate.yaml +++ b/modules/smart-agent_elasticsearch/conf/17-cluster_fielddata_cache_evictions_rate.yaml @@ -2,7 +2,7 @@ module: ElasticSearch name: "field_data evictions change" aggregation: true transformation: ".mean(over='15m')" -filtering: "filter('node_name', '*')" +filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: A: metric: "elasticsearch.indices.fielddata.evictions" @@ -15,8 +15,10 @@ rules: threshold: 120 comparator: ">" description: "is too high" + lasting_duration: '15m' minor: threshold: 60 comparator: ">" description: "is too high" dependency: major + lasting_duration: '15m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/18-cluster_time_in_queue_change.yaml b/modules/smart-agent_elasticsearch/conf/18-cluster_time_in_queue_change.yaml index 
faaa7909a..a4cbcb916 100644 --- a/modules/smart-agent_elasticsearch/conf/18-cluster_time_in_queue_change.yaml +++ b/modules/smart-agent_elasticsearch/conf/18-cluster_time_in_queue_change.yaml @@ -2,6 +2,7 @@ module: ElasticSearch name: "task time in queue change" aggregation: true transformation: ".mean(over='15m')" +filtering: "filter('plugin', 'elasticsearch')" signals: A: metric: "elasticsearch.cluster.task-max-wait-time" @@ -13,8 +14,10 @@ rules: threshold: 200 comparator: ">" description: "is too high" + lasting_duration: '15m' minor: threshold: 100 comparator: ">" description: "is too high" dependency: major + lasting_duration: '15m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/detectors-gen.tf b/modules/smart-agent_elasticsearch/detectors-gen.tf index 732a3d8c2..0ebacf323 100644 --- a/modules/smart-agent_elasticsearch/detectors-gen.tf +++ b/modules/smart-agent_elasticsearch/detectors-gen.tf @@ -7,7 +7,8 @@ resource "signalfx_detector" "heartbeat" { program_text = <<-EOF from signalfx.detectors.not_reporting import not_reporting - signal = data('elasticsearch.cluster.number-of-nodes', filter=${local.not_running_vm_filters} and ${module.filtering.signalflow})${var.heartbeat_aggregation_function}.publish('signal') + base_filtering = filter('plugin', 'elasticsearch') + signal = data('elasticsearch.cluster.number-of-nodes', filter=${local.not_running_vm_filters} and base_filtering and ${module.filtering.signalflow})${var.heartbeat_aggregation_function}.publish('signal') not_reporting.detector(stream=signal, resource_identifier=None, duration='${var.heartbeat_timeframe}', auto_resolve_after='${local.heartbeat_auto_resolve_after}').publish('CRIT') EOF @@ -34,9 +35,10 @@ resource "signalfx_detector" "cluster_status" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('elasticsearch.cluster.status', 
filter=${module.filtering.signalflow})${var.cluster_status_aggregation_function}${var.cluster_status_transformation_function}.publish('signal') + base_filtering = filter('plugin', 'elasticsearch') + signal = data('elasticsearch.cluster.status', filter=base_filtering and ${module.filtering.signalflow})${var.cluster_status_aggregation_function}${var.cluster_status_transformation_function}.publish('signal') detect(when(signal == ${var.cluster_status_threshold_critical}, lasting=%{if var.cluster_status_lasting_duration_critical == null}None%{else}'${var.cluster_status_lasting_duration_critical}'%{endif}, at_least=${var.cluster_status_at_least_percentage_critical})).publish('CRIT') - detect(when(signal == ${var.cluster_status_threshold_major}, lasting=%{if var.cluster_status_lasting_duration_major == null}None%{else}'${var.cluster_status_lasting_duration_major}'%{endif}, at_least=${var.cluster_status_at_least_percentage_major}) and (not when(signal == ${var.cluster_status_threshold_critical}, lasting=%{if var.cluster_status_lasting_duration_critical == null}None%{else}'${var.cluster_status_lasting_duration_critical}'%{endif}, at_least=${var.cluster_status_at_least_percentage_critical}))).publish('MAJOR') + detect(when(signal == ${var.cluster_status_threshold_major}, lasting=%{if var.cluster_status_lasting_duration_major == null}None%{else}'${var.cluster_status_lasting_duration_major}'%{endif}, at_least=${var.cluster_status_at_least_percentage_major})).publish('MAJOR') EOF rule { @@ -115,7 +117,8 @@ resource "signalfx_detector" "cluster_relocating_shards" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('elasticsearch.cluster.relocating-shards', filter=${module.filtering.signalflow}, rollup='average')${var.cluster_relocating_shards_aggregation_function}${var.cluster_relocating_shards_transformation_function}.publish('signal') + base_filtering = filter('plugin', 'elasticsearch') + signal = 
data('elasticsearch.cluster.relocating-shards', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.cluster_relocating_shards_aggregation_function}${var.cluster_relocating_shards_transformation_function}.publish('signal') detect(when(signal > ${var.cluster_relocating_shards_threshold_critical}, lasting=%{if var.cluster_relocating_shards_lasting_duration_critical == null}None%{else}'${var.cluster_relocating_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_relocating_shards_at_least_percentage_critical})).publish('CRIT') detect(when(signal > ${var.cluster_relocating_shards_threshold_major}, lasting=%{if var.cluster_relocating_shards_lasting_duration_major == null}None%{else}'${var.cluster_relocating_shards_lasting_duration_major}'%{endif}, at_least=${var.cluster_relocating_shards_at_least_percentage_major}) and (not when(signal > ${var.cluster_relocating_shards_threshold_critical}, lasting=%{if var.cluster_relocating_shards_lasting_duration_critical == null}None%{else}'${var.cluster_relocating_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_relocating_shards_at_least_percentage_critical}))).publish('MAJOR') EOF @@ -155,7 +158,8 @@ resource "signalfx_detector" "cluster_unassigned_shards" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('elasticsearch.cluster.unassigned-shards', filter=${module.filtering.signalflow}, rollup='average')${var.cluster_unassigned_shards_aggregation_function}${var.cluster_unassigned_shards_transformation_function}.publish('signal') + base_filtering = filter('plugin', 'elasticsearch') + signal = data('elasticsearch.cluster.unassigned-shards', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.cluster_unassigned_shards_aggregation_function}${var.cluster_unassigned_shards_transformation_function}.publish('signal') detect(when(signal > ${var.cluster_unassigned_shards_threshold_critical}, 
lasting=%{if var.cluster_unassigned_shards_lasting_duration_critical == null}None%{else}'${var.cluster_unassigned_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_unassigned_shards_at_least_percentage_critical})).publish('CRIT') detect(when(signal > ${var.cluster_unassigned_shards_threshold_major}, lasting=%{if var.cluster_unassigned_shards_lasting_duration_major == null}None%{else}'${var.cluster_unassigned_shards_lasting_duration_major}'%{endif}, at_least=${var.cluster_unassigned_shards_at_least_percentage_major}) and (not when(signal > ${var.cluster_unassigned_shards_threshold_critical}, lasting=%{if var.cluster_unassigned_shards_lasting_duration_critical == null}None%{else}'${var.cluster_unassigned_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_unassigned_shards_at_least_percentage_critical}))).publish('MAJOR') EOF @@ -195,7 +199,8 @@ resource "signalfx_detector" "pending_tasks" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('elasticsearch.cluster.pending-tasks', filter=${module.filtering.signalflow}, rollup='average')${var.pending_tasks_aggregation_function}${var.pending_tasks_transformation_function}.publish('signal') + base_filtering = filter('plugin', 'elasticsearch') + signal = data('elasticsearch.cluster.pending-tasks', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.pending_tasks_aggregation_function}${var.pending_tasks_transformation_function}.publish('signal') detect(when(signal > ${var.pending_tasks_threshold_critical}, lasting=%{if var.pending_tasks_lasting_duration_critical == null}None%{else}'${var.pending_tasks_lasting_duration_critical}'%{endif}, at_least=${var.pending_tasks_at_least_percentage_critical})).publish('CRIT') detect(when(signal > ${var.pending_tasks_threshold_major}, lasting=%{if var.pending_tasks_lasting_duration_major == null}None%{else}'${var.pending_tasks_lasting_duration_major}'%{endif}, 
at_least=${var.pending_tasks_at_least_percentage_major}) and (not when(signal > ${var.pending_tasks_threshold_critical}, lasting=%{if var.pending_tasks_lasting_duration_critical == null}None%{else}'${var.pending_tasks_lasting_duration_critical}'%{endif}, at_least=${var.pending_tasks_at_least_percentage_critical}))).publish('MAJOR') EOF @@ -235,7 +240,7 @@ resource "signalfx_detector" "cpu_usage" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - base_filtering = filter('node_name', '*') + base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') signal = data('elasticsearch.process.cpu.percent', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.cpu_usage_aggregation_function}${var.cpu_usage_transformation_function}.publish('signal') detect(when(signal > ${var.cpu_usage_threshold_critical}, lasting=%{if var.cpu_usage_lasting_duration_critical == null}None%{else}'${var.cpu_usage_lasting_duration_critical}'%{endif}, at_least=${var.cpu_usage_at_least_percentage_critical})).publish('CRIT') detect(when(signal > ${var.cpu_usage_threshold_major}, lasting=%{if var.cpu_usage_lasting_duration_major == null}None%{else}'${var.cpu_usage_lasting_duration_major}'%{endif}, at_least=${var.cpu_usage_at_least_percentage_major}) and (not when(signal > ${var.cpu_usage_threshold_critical}, lasting=%{if var.cpu_usage_lasting_duration_critical == null}None%{else}'${var.cpu_usage_lasting_duration_critical}'%{endif}, at_least=${var.cpu_usage_at_least_percentage_critical}))).publish('MAJOR') @@ -276,7 +281,7 @@ resource "signalfx_detector" "file_descriptors" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - base_filtering = filter('node_name', '*') + base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') A = data('elasticsearch.process.open_file_descriptors', filter=base_filtering and ${module.filtering.signalflow}, 
rollup='average')${var.file_descriptors_aggregation_function}${var.file_descriptors_transformation_function} B = data('elasticsearch.process.max_file_descriptors', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.file_descriptors_aggregation_function}${var.file_descriptors_transformation_function} signal = (A/B).scale(100).publish('signal') @@ -319,7 +324,7 @@ resource "signalfx_detector" "jvm_heap_memory_usage" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - base_filtering = filter('node_name', '*') + base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') signal = data('elasticsearch.jvm.mem.heap-used-percent', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.jvm_heap_memory_usage_aggregation_function}${var.jvm_heap_memory_usage_transformation_function}.publish('signal') detect(when(signal > ${var.jvm_heap_memory_usage_threshold_critical}, lasting=%{if var.jvm_heap_memory_usage_lasting_duration_critical == null}None%{else}'${var.jvm_heap_memory_usage_lasting_duration_critical}'%{endif}, at_least=${var.jvm_heap_memory_usage_at_least_percentage_critical})).publish('CRIT') detect(when(signal > ${var.jvm_heap_memory_usage_threshold_major}, lasting=%{if var.jvm_heap_memory_usage_lasting_duration_major == null}None%{else}'${var.jvm_heap_memory_usage_lasting_duration_major}'%{endif}, at_least=${var.jvm_heap_memory_usage_at_least_percentage_major}) and (not when(signal > ${var.jvm_heap_memory_usage_threshold_critical}, lasting=%{if var.jvm_heap_memory_usage_lasting_duration_critical == null}None%{else}'${var.jvm_heap_memory_usage_lasting_duration_critical}'%{endif}, at_least=${var.jvm_heap_memory_usage_at_least_percentage_critical}))).publish('MAJOR') @@ -360,7 +365,7 @@ resource "signalfx_detector" "jvm_memory_young_usage" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - base_filtering = 
filter('node_name', '*') + base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') A = data('elasticsearch.jvm.mem.pools.young.used_in_bytes', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_young_usage_aggregation_function}${var.jvm_memory_young_usage_transformation_function} B = data('elasticsearch.jvm.mem.pools.young.max_in_bytes', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_young_usage_aggregation_function}${var.jvm_memory_young_usage_transformation_function} signal = (A/B).fill(0).scale(100).publish('signal') @@ -403,7 +408,7 @@ resource "signalfx_detector" "jvm_memory_old_usage" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - base_filtering = filter('node_name', '*') + base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') A = data('elasticsearch.jvm.mem.pools.old.used_in_bytes', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_old_usage_aggregation_function}${var.jvm_memory_old_usage_transformation_function} B = data('elasticsearch.jvm.mem.pools.old.max_in_bytes', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_old_usage_aggregation_function}${var.jvm_memory_old_usage_transformation_function} signal = (A/B).fill(0).scale(100).publish('signal') @@ -446,7 +451,7 @@ resource "signalfx_detector" "jvm_gc_old_collection_latency" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - base_filtering = filter('node_name', '*') + base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') A = data('elasticsearch.jvm.gc.old-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', 
extrapolation='zero')${var.jvm_gc_old_collection_latency_aggregation_function}${var.jvm_gc_old_collection_latency_transformation_function} B = data('elasticsearch.jvm.gc.old-count', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.jvm_gc_old_collection_latency_aggregation_function}${var.jvm_gc_old_collection_latency_transformation_function} signal = (A/B).fill(0).publish('signal') @@ -489,7 +494,7 @@ resource "signalfx_detector" "jvm_gc_young_collection_latency" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - base_filtering = filter('node_name', '*') + base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') A = data('elasticsearch.jvm.gc.time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.jvm_gc_young_collection_latency_aggregation_function}${var.jvm_gc_young_collection_latency_transformation_function} B = data('elasticsearch.jvm.gc.count', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.jvm_gc_young_collection_latency_aggregation_function}${var.jvm_gc_young_collection_latency_transformation_function} signal = (A/B).fill(0).publish('signal') @@ -532,7 +537,7 @@ resource "signalfx_detector" "indexing_latency" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - base_filtering = filter('node_name', '*') + base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') A = data('elasticsearch.indices.indexing.index-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.indexing_latency_aggregation_function}${var.indexing_latency_transformation_function} B = data('elasticsearch.indices.indexing.index-total', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', 
extrapolation='zero')${var.indexing_latency_aggregation_function}${var.indexing_latency_transformation_function} signal = (A/B).fill(0).publish('signal') @@ -567,47 +572,47 @@ EOF max_delay = var.indexing_latency_max_delay } -resource "signalfx_detector" "flushing_latency" { - name = format("%s %s", local.detector_name_prefix, "ElasticSearch flushing latency") +resource "signalfx_detector" "flush_latency" { + name = format("%s %s", local.detector_name_prefix, "ElasticSearch flush latency") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - base_filtering = filter('node_name', '*') - A = data('elasticsearch.indices.flush.total-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.flushing_latency_aggregation_function}${var.flushing_latency_transformation_function} - B = data('elasticsearch.indices.flush.total', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.flushing_latency_aggregation_function}${var.flushing_latency_transformation_function} + base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') + A = data('elasticsearch.indices.flush.total-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.flush_latency_aggregation_function}${var.flush_latency_transformation_function} + B = data('elasticsearch.indices.flush.total', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.flush_latency_aggregation_function}${var.flush_latency_transformation_function} signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.flushing_latency_threshold_major}, lasting=%{if var.flushing_latency_lasting_duration_major == 
null}None%{else}'${var.flushing_latency_lasting_duration_major}'%{endif}, at_least=${var.flushing_latency_at_least_percentage_major})).publish('MAJOR') - detect(when(signal > ${var.flushing_latency_threshold_minor}, lasting=%{if var.flushing_latency_lasting_duration_minor == null}None%{else}'${var.flushing_latency_lasting_duration_minor}'%{endif}, at_least=${var.flushing_latency_at_least_percentage_minor}) and (not when(signal > ${var.flushing_latency_threshold_major}, lasting=%{if var.flushing_latency_lasting_duration_major == null}None%{else}'${var.flushing_latency_lasting_duration_major}'%{endif}, at_least=${var.flushing_latency_at_least_percentage_major}))).publish('MINOR') + detect(when(signal > ${var.flush_latency_threshold_major}, lasting=%{if var.flush_latency_lasting_duration_major == null}None%{else}'${var.flush_latency_lasting_duration_major}'%{endif}, at_least=${var.flush_latency_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.flush_latency_threshold_minor}, lasting=%{if var.flush_latency_lasting_duration_minor == null}None%{else}'${var.flush_latency_lasting_duration_minor}'%{endif}, at_least=${var.flush_latency_at_least_percentage_minor}) and (not when(signal > ${var.flush_latency_threshold_major}, lasting=%{if var.flush_latency_lasting_duration_major == null}None%{else}'${var.flush_latency_lasting_duration_major}'%{endif}, at_least=${var.flush_latency_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.flushing_latency_threshold_major}" + description = "is too high > ${var.flush_latency_threshold_major}" severity = "Major" detect_label = "MAJOR" - disabled = coalesce(var.flushing_latency_disabled_major, var.flushing_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.flushing_latency_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.flushing_latency_runbook_url, var.runbook_url), "") - tip = 
var.flushing_latency_tip + disabled = coalesce(var.flush_latency_disabled_major, var.flush_latency_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.flush_latency_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.flush_latency_runbook_url, var.runbook_url), "") + tip = var.flush_latency_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? local.rule_body : var.message_body } rule { - description = "is too high > ${var.flushing_latency_threshold_minor}" + description = "is too high > ${var.flush_latency_threshold_minor}" severity = "Minor" detect_label = "MINOR" - disabled = coalesce(var.flushing_latency_disabled_minor, var.flushing_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.flushing_latency_notifications, "minor", []), var.notifications.minor), null) - runbook_url = try(coalesce(var.flushing_latency_runbook_url, var.runbook_url), "") - tip = var.flushing_latency_tip + disabled = coalesce(var.flush_latency_disabled_minor, var.flush_latency_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.flush_latency_notifications, "minor", []), var.notifications.minor), null) + runbook_url = try(coalesce(var.flush_latency_runbook_url, var.runbook_url), "") + tip = var.flush_latency_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body } - max_delay = var.flushing_latency_max_delay + max_delay = var.flush_latency_max_delay } resource "signalfx_detector" "search_latency" { @@ -618,7 +623,7 @@ resource "signalfx_detector" "search_latency" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - base_filtering = filter('node_name', '*') + base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') A = data('elasticsearch.indices.search.query-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.search_latency_aggregation_function}${var.search_latency_transformation_function} B = data('elasticsearch.indices.search.query-total', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.search_latency_aggregation_function}${var.search_latency_transformation_function} signal = (A/B).fill(0).publish('signal') @@ -661,7 +666,7 @@ resource "signalfx_detector" "fetch_latency" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - base_filtering = filter('node_name', '*') + base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') A = data('elasticsearch.indices.search.fetch-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.fetch_latency_aggregation_function}${var.fetch_latency_transformation_function} B = data('elasticsearch.indices.search.fetch-total', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.fetch_latency_aggregation_function}${var.fetch_latency_transformation_function} signal = (A/B).fill(0).publish('signal') @@ -704,7 +709,7 @@ resource "signalfx_detector" "field_data_evictions_change" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - base_filtering = filter('node_name', '*') + base_filtering 
= filter('node_name', '*') and filter('plugin', 'elasticsearch') A = data('elasticsearch.indices.fielddata.evictions', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.field_data_evictions_change_aggregation_function}${var.field_data_evictions_change_transformation_function} signal = A.rateofchange().publish('signal') detect(when(signal > ${var.field_data_evictions_change_threshold_major}, lasting=%{if var.field_data_evictions_change_lasting_duration_major == null}None%{else}'${var.field_data_evictions_change_lasting_duration_major}'%{endif}, at_least=${var.field_data_evictions_change_at_least_percentage_major})).publish('MAJOR') @@ -746,7 +751,8 @@ resource "signalfx_detector" "task_time_in_queue_change" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - A = data('elasticsearch.cluster.task-max-wait-time', filter=${module.filtering.signalflow}, rollup='average')${var.task_time_in_queue_change_aggregation_function}${var.task_time_in_queue_change_transformation_function} + base_filtering = filter('plugin', 'elasticsearch') + A = data('elasticsearch.cluster.task-max-wait-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.task_time_in_queue_change_aggregation_function}${var.task_time_in_queue_change_transformation_function} signal = A.rateofchange().publish('signal') detect(when(signal > ${var.task_time_in_queue_change_threshold_major}, lasting=%{if var.task_time_in_queue_change_lasting_duration_major == null}None%{else}'${var.task_time_in_queue_change_lasting_duration_major}'%{endif}, at_least=${var.task_time_in_queue_change_at_least_percentage_major})).publish('MAJOR') detect(when(signal > ${var.task_time_in_queue_change_threshold_minor}, lasting=%{if var.task_time_in_queue_change_lasting_duration_minor == null}None%{else}'${var.task_time_in_queue_change_lasting_duration_minor}'%{endif}, 
at_least=${var.task_time_in_queue_change_at_least_percentage_minor}) and (not when(signal > ${var.task_time_in_queue_change_threshold_major}, lasting=%{if var.task_time_in_queue_change_lasting_duration_major == null}None%{else}'${var.task_time_in_queue_change_lasting_duration_major}'%{endif}, at_least=${var.task_time_in_queue_change_at_least_percentage_major}))).publish('MINOR') diff --git a/modules/smart-agent_elasticsearch/outputs.tf b/modules/smart-agent_elasticsearch/outputs.tf index 4464fe0c2..3ccfbe78c 100644 --- a/modules/smart-agent_elasticsearch/outputs.tf +++ b/modules/smart-agent_elasticsearch/outputs.tf @@ -38,9 +38,9 @@ output "file_descriptors" { value = signalfx_detector.file_descriptors } -output "flushing_latency" { - description = "Detector resource for flushing_latency" - value = signalfx_detector.flushing_latency +output "flush_latency" { + description = "Detector resource for flush_latency" + value = signalfx_detector.flush_latency } output "heartbeat" { diff --git a/modules/smart-agent_elasticsearch/variables-gen.tf b/modules/smart-agent_elasticsearch/variables-gen.tf index c4fbf9ef3..258c8904e 100644 --- a/modules/smart-agent_elasticsearch/variables-gen.tf +++ b/modules/smart-agent_elasticsearch/variables-gen.tf @@ -101,13 +101,13 @@ variable "cluster_status_disabled_major" { variable "cluster_status_threshold_critical" { description = "Critical threshold for cluster_status detector" type = number - default = 1 + default = 2 } variable "cluster_status_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "5m" } variable "cluster_status_at_least_percentage_critical" { @@ -118,13 +118,13 @@ variable "cluster_status_at_least_percentage_critical" { variable "cluster_status_threshold_major" { description = "Major threshold for cluster_status detector" type = number - default = 2 + default = 1 } variable "cluster_status_lasting_duration_major" { 
description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "5m" } variable "cluster_status_at_least_percentage_major" { @@ -197,7 +197,7 @@ variable "cluster_initializing_shards_threshold_critical" { variable "cluster_initializing_shards_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "cluster_initializing_shards_at_least_percentage_critical" { @@ -214,7 +214,7 @@ variable "cluster_initializing_shards_threshold_major" { variable "cluster_initializing_shards_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "cluster_initializing_shards_at_least_percentage_major" { @@ -275,19 +275,19 @@ variable "cluster_relocating_shards_disabled_critical" { variable "cluster_relocating_shards_disabled_major" { description = "Disable major alerting rule for cluster_relocating_shards detector" type = bool - default = null + default = true } variable "cluster_relocating_shards_threshold_critical" { description = "Critical threshold for cluster_relocating_shards detector" type = number - default = 1 + default = 0 } variable "cluster_relocating_shards_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "cluster_relocating_shards_at_least_percentage_critical" { @@ -298,13 +298,13 @@ variable "cluster_relocating_shards_at_least_percentage_critical" { variable "cluster_relocating_shards_threshold_major" { description = "Major threshold for cluster_relocating_shards detector" type = number - default = 0 + default = -1 } variable "cluster_relocating_shards_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = 
string - default = null + default = "15m" } variable "cluster_relocating_shards_at_least_percentage_major" { @@ -365,7 +365,7 @@ variable "cluster_unassigned_shards_disabled_critical" { variable "cluster_unassigned_shards_disabled_major" { description = "Disable major alerting rule for cluster_unassigned_shards detector" type = bool - default = null + default = true } variable "cluster_unassigned_shards_threshold_critical" { @@ -377,7 +377,7 @@ variable "cluster_unassigned_shards_threshold_critical" { variable "cluster_unassigned_shards_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "10m" } variable "cluster_unassigned_shards_at_least_percentage_critical" { @@ -394,7 +394,7 @@ variable "cluster_unassigned_shards_threshold_major" { variable "cluster_unassigned_shards_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "10m" } variable "cluster_unassigned_shards_at_least_percentage_major" { @@ -467,7 +467,7 @@ variable "pending_tasks_threshold_critical" { variable "pending_tasks_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "pending_tasks_at_least_percentage_critical" { @@ -484,7 +484,7 @@ variable "pending_tasks_threshold_major" { variable "pending_tasks_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "pending_tasks_at_least_percentage_major" { @@ -557,7 +557,7 @@ variable "cpu_usage_threshold_critical" { variable "cpu_usage_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "30m" } variable 
"cpu_usage_at_least_percentage_critical" { @@ -574,7 +574,7 @@ variable "cpu_usage_threshold_major" { variable "cpu_usage_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "30m" } variable "cpu_usage_at_least_percentage_major" { @@ -647,7 +647,7 @@ variable "file_descriptors_threshold_critical" { variable "file_descriptors_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "file_descriptors_at_least_percentage_critical" { @@ -664,7 +664,7 @@ variable "file_descriptors_threshold_major" { variable "file_descriptors_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "file_descriptors_at_least_percentage_major" { @@ -737,7 +737,7 @@ variable "jvm_heap_memory_usage_threshold_critical" { variable "jvm_heap_memory_usage_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "5m" } variable "jvm_heap_memory_usage_at_least_percentage_critical" { @@ -754,7 +754,7 @@ variable "jvm_heap_memory_usage_threshold_major" { variable "jvm_heap_memory_usage_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "5m" } variable "jvm_heap_memory_usage_at_least_percentage_major" { @@ -827,7 +827,7 @@ variable "jvm_memory_young_usage_threshold_major" { variable "jvm_memory_young_usage_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "10m" } variable "jvm_memory_young_usage_at_least_percentage_major" { @@ -844,7 +844,7 @@ variable 
"jvm_memory_young_usage_threshold_minor" { variable "jvm_memory_young_usage_lasting_duration_minor" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "10m" } variable "jvm_memory_young_usage_at_least_percentage_minor" { @@ -917,7 +917,7 @@ variable "jvm_memory_old_usage_threshold_major" { variable "jvm_memory_old_usage_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "10m" } variable "jvm_memory_old_usage_at_least_percentage_major" { @@ -934,7 +934,7 @@ variable "jvm_memory_old_usage_threshold_minor" { variable "jvm_memory_old_usage_lasting_duration_minor" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "10m" } variable "jvm_memory_old_usage_at_least_percentage_minor" { @@ -1007,7 +1007,7 @@ variable "jvm_gc_old_collection_latency_threshold_major" { variable "jvm_gc_old_collection_latency_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "jvm_gc_old_collection_latency_at_least_percentage_major" { @@ -1024,7 +1024,7 @@ variable "jvm_gc_old_collection_latency_threshold_minor" { variable "jvm_gc_old_collection_latency_lasting_duration_minor" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "jvm_gc_old_collection_latency_at_least_percentage_minor" { @@ -1097,7 +1097,7 @@ variable "jvm_gc_young_collection_latency_threshold_major" { variable "jvm_gc_young_collection_latency_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "jvm_gc_young_collection_latency_at_least_percentage_major" { @@ 
-1114,7 +1114,7 @@ variable "jvm_gc_young_collection_latency_threshold_minor" { variable "jvm_gc_young_collection_latency_lasting_duration_minor" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "jvm_gc_young_collection_latency_at_least_percentage_minor" { @@ -1187,7 +1187,7 @@ variable "indexing_latency_threshold_major" { variable "indexing_latency_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "1h" } variable "indexing_latency_at_least_percentage_major" { @@ -1204,7 +1204,7 @@ variable "indexing_latency_threshold_minor" { variable "indexing_latency_lasting_duration_minor" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "1h" } variable "indexing_latency_at_least_percentage_minor" { @@ -1212,92 +1212,92 @@ variable "indexing_latency_at_least_percentage_minor" { type = number default = 1 } -# flushing_latency detector +# flush_latency detector -variable "flushing_latency_notifications" { - description = "Notification recipients list per severity overridden for flushing_latency detector" +variable "flush_latency_notifications" { + description = "Notification recipients list per severity overridden for flush_latency detector" type = map(list(string)) default = {} } -variable "flushing_latency_aggregation_function" { - description = "Aggregation function and group by for flushing_latency detector (i.e. \".mean(by=['host'])\")" +variable "flush_latency_aggregation_function" { + description = "Aggregation function and group by for flush_latency detector (i.e. \".mean(by=['host'])\")" type = string default = "" } -variable "flushing_latency_transformation_function" { - description = "Transformation function for flushing_latency detector (i.e. 
\".mean(over='5m')\")" +variable "flush_latency_transformation_function" { + description = "Transformation function for flush_latency detector (i.e. \".mean(over='5m')\")" type = string - default = ".mean(over='15m')" + default = ".mean(over='30m')" } -variable "flushing_latency_max_delay" { - description = "Enforce max delay for flushing_latency detector (use \"0\" or \"null\" for \"Auto\")" +variable "flush_latency_max_delay" { + description = "Enforce max delay for flush_latency detector (use \"0\" or \"null\" for \"Auto\")" type = number default = null } -variable "flushing_latency_tip" { +variable "flush_latency_tip" { description = "Suggested first course of action or any note useful for incident handling" type = string default = "" } -variable "flushing_latency_runbook_url" { +variable "flush_latency_runbook_url" { description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" type = string default = "" } -variable "flushing_latency_disabled" { - description = "Disable all alerting rules for flushing_latency detector" +variable "flush_latency_disabled" { + description = "Disable all alerting rules for flush_latency detector" type = bool default = null } -variable "flushing_latency_disabled_major" { - description = "Disable major alerting rule for flushing_latency detector" +variable "flush_latency_disabled_major" { + description = "Disable major alerting rule for flush_latency detector" type = bool default = null } -variable "flushing_latency_disabled_minor" { - description = "Disable minor alerting rule for flushing_latency detector" +variable "flush_latency_disabled_minor" { + description = "Disable minor alerting rule for flush_latency detector" type = bool default = null } -variable "flushing_latency_threshold_major" { - description = "Major threshold for flushing_latency detector" +variable "flush_latency_threshold_major" { + description = "Major threshold for flush_latency detector" type = number default = 
150 } -variable "flushing_latency_lasting_duration_major" { +variable "flush_latency_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } -variable "flushing_latency_at_least_percentage_major" { +variable "flush_latency_at_least_percentage_major" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -variable "flushing_latency_threshold_minor" { - description = "Minor threshold for flushing_latency detector" +variable "flush_latency_threshold_minor" { + description = "Minor threshold for flush_latency detector" type = number default = 100 } -variable "flushing_latency_lasting_duration_minor" { +variable "flush_latency_lasting_duration_minor" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } -variable "flushing_latency_at_least_percentage_minor" { +variable "flush_latency_at_least_percentage_minor" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 @@ -1367,7 +1367,7 @@ variable "search_latency_threshold_major" { variable "search_latency_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "30m" } variable "search_latency_at_least_percentage_major" { @@ -1384,7 +1384,7 @@ variable "search_latency_threshold_minor" { variable "search_latency_lasting_duration_minor" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "30m" } variable "search_latency_at_least_percentage_minor" { @@ -1457,7 +1457,7 @@ variable "fetch_latency_threshold_major" { variable "fetch_latency_lasting_duration_major" { description = "Minimum duration that conditions must 
be true before raising alert" type = string - default = null + default = "15m" } variable "fetch_latency_at_least_percentage_major" { @@ -1474,7 +1474,7 @@ variable "fetch_latency_threshold_minor" { variable "fetch_latency_lasting_duration_minor" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "fetch_latency_at_least_percentage_minor" { @@ -1547,7 +1547,7 @@ variable "field_data_evictions_change_threshold_major" { variable "field_data_evictions_change_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "field_data_evictions_change_at_least_percentage_major" { @@ -1564,7 +1564,7 @@ variable "field_data_evictions_change_threshold_minor" { variable "field_data_evictions_change_lasting_duration_minor" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "field_data_evictions_change_at_least_percentage_minor" { @@ -1637,7 +1637,7 @@ variable "task_time_in_queue_change_threshold_major" { variable "task_time_in_queue_change_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "task_time_in_queue_change_at_least_percentage_major" { @@ -1654,7 +1654,7 @@ variable "task_time_in_queue_change_threshold_minor" { variable "task_time_in_queue_change_lasting_duration_minor" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "task_time_in_queue_change_at_least_percentage_minor" { From 10419a7b3401067e94bcfa3f60070a899e98dcd9 Mon Sep 17 00:00:00 2001 From: Gauthier AMPE Date: Wed, 10 Jan 2024 11:22:54 +0100 Subject: [PATCH 5/6] add heartbeat_aggregation_function --- 
modules/smart-agent_elasticsearch/conf/00-heartbeat.yaml | 2 +- modules/smart-agent_elasticsearch/variables-gen.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/smart-agent_elasticsearch/conf/00-heartbeat.yaml b/modules/smart-agent_elasticsearch/conf/00-heartbeat.yaml index 4eddf7003..b16dfc27f 100644 --- a/modules/smart-agent_elasticsearch/conf/00-heartbeat.yaml +++ b/modules/smart-agent_elasticsearch/conf/00-heartbeat.yaml @@ -2,7 +2,7 @@ module: Elasticsearch name: heartbeat transformation: false -aggregation: true +aggregation: ".mean(by=['cluster'])" exclude_not_running_vm: true filtering: "filter('plugin', 'elasticsearch')" diff --git a/modules/smart-agent_elasticsearch/variables-gen.tf b/modules/smart-agent_elasticsearch/variables-gen.tf index 258c8904e..391d2bc31 100644 --- a/modules/smart-agent_elasticsearch/variables-gen.tf +++ b/modules/smart-agent_elasticsearch/variables-gen.tf @@ -9,7 +9,7 @@ variable "heartbeat_notifications" { variable "heartbeat_aggregation_function" { description = "Aggregation function and group by for heartbeat detector (i.e. 
\".mean(by=['host'])\")" type = string - default = "" + default = ".mean(by=['cluster'])" } variable "heartbeat_max_delay" { From 89ec70d51142b1f7d334ef9e5ed824f095e9312f Mon Sep 17 00:00:00 2001 From: Gauthier AMPE Date: Fri, 12 Jan 2024 16:05:12 +0100 Subject: [PATCH 6/6] Elasticsearch improve detectors --- docs/severity.md | 4 +- modules/smart-agent_elasticsearch/README.md | 4 +- .../conf/01-cluster-status.yaml | 11 +- .../conf/02-cluster_initializing_shards.yaml | 5 +- .../conf/03-cluster_relocating_shards.yaml | 12 +- .../conf/04-cluster_unassigned_shards.yaml | 12 +- .../conf/05-cluster_pending_tasks.yaml | 5 +- .../conf/06-cluster_cpu_usage.yaml | 5 +- .../conf/07-cluster_file_descriptor.yaml | 5 +- .../conf/08-cluster_JVM_heap_memory.yaml | 5 +- .../09-cluster_JVM_memory_young_usage.yaml | 5 +- .../conf/10-cluster_JVM_memory_old_usage.yaml | 5 +- ...eneration_garbage_collections_latency.yaml | 5 +- ...eneration_garbage_collections_latency.yaml | 5 +- .../conf/13-cluster_indexing_latency.yaml | 5 +- .../conf/14-cluster_flush_latency.yaml | 5 +- .../conf/15-cluster_search_latency.yaml | 5 +- .../conf/16-cluster_fetch_latency.yaml | 5 +- ...luster_fielddata_cache_evictions_rate.yaml | 5 +- .../conf/18-cluster_time_in_queue_change.yaml | 5 +- .../detectors-gen.tf | 128 +++++++---------- .../variables-gen.tf | 130 +++++------------- 22 files changed, 130 insertions(+), 246 deletions(-) diff --git a/docs/severity.md b/docs/severity.md index c562bbd69..d73143208 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -978,8 +978,8 @@ |Elasticsearch heartbeat|X|-|-|-|-| |ElasticSearch cluster status|X|X|-|-|-| |ElasticSearch cluster initializing shards|X|X|-|-|-| -|ElasticSearch cluster relocating shards|X|X|-|-|-| -|ElasticSearch cluster unassigned shards|X|X|-|-|-| +|ElasticSearch cluster relocating shards|X|-|-|-|-| +|ElasticSearch cluster unassigned shards|X|-|-|-|-| |ElasticSearch pending tasks|X|X|-|-|-| |ElasticSearch cpu usage|X|X|-|-|-| |ElasticSearch file 
descriptors|X|X|-|-|-| diff --git a/modules/smart-agent_elasticsearch/README.md b/modules/smart-agent_elasticsearch/README.md index 217d6e21c..b639402b1 100644 --- a/modules/smart-agent_elasticsearch/README.md +++ b/modules/smart-agent_elasticsearch/README.md @@ -80,8 +80,8 @@ This module creates the following SignalFx detectors which could contain one or |Elasticsearch heartbeat|X|-|-|-|-| |ElasticSearch cluster status|X|X|-|-|-| |ElasticSearch cluster initializing shards|X|X|-|-|-| -|ElasticSearch cluster relocating shards|X|X|-|-|-| -|ElasticSearch cluster unassigned shards|X|X|-|-|-| +|ElasticSearch cluster relocating shards|X|-|-|-|-| +|ElasticSearch cluster unassigned shards|X|-|-|-|-| |ElasticSearch pending tasks|X|X|-|-|-| |ElasticSearch cpu usage|X|X|-|-|-| |ElasticSearch file descriptors|X|X|-|-|-| diff --git a/modules/smart-agent_elasticsearch/conf/01-cluster-status.yaml b/modules/smart-agent_elasticsearch/conf/01-cluster-status.yaml index 6e354211e..25976153d 100644 --- a/modules/smart-agent_elasticsearch/conf/01-cluster-status.yaml +++ b/modules/smart-agent_elasticsearch/conf/01-cluster-status.yaml @@ -1,7 +1,6 @@ module: ElasticSearch name: "cluster status" -aggregation: true -transformation: ".mean(over='5m')" +aggregation: ".max(by=['cluster'])" filtering: "filter('plugin', 'elasticsearch')" signals: signal: @@ -9,11 +8,11 @@ signals: rules: critical: threshold: 2 - comparator: "==" + comparator: ">=" description: "is red" - lasting_duration: '5m' + lasting_duration: '5m' major: threshold: 1 - comparator: "==" + comparator: ">=" description: "is yellow" - lasting_duration: '5m' + lasting_duration: '5m' diff --git a/modules/smart-agent_elasticsearch/conf/02-cluster_initializing_shards.yaml b/modules/smart-agent_elasticsearch/conf/02-cluster_initializing_shards.yaml index bfbca53f6..2c6bce1fc 100644 --- a/modules/smart-agent_elasticsearch/conf/02-cluster_initializing_shards.yaml +++ 
b/modules/smart-agent_elasticsearch/conf/02-cluster_initializing_shards.yaml @@ -1,7 +1,6 @@ module: ElasticSearch name: "cluster initializing shards" -aggregation: true -transformation: ".min(over='15m')" +aggregation: ".max(by=['cluster'])" filtering: "filter('plugin', 'elasticsearch')" signals: signal: @@ -10,7 +9,7 @@ signals: rules: critical: threshold: 1 - comparator: ">" + comparator: ">=" description: "is too high" lasting_duration: '15m' major: diff --git a/modules/smart-agent_elasticsearch/conf/03-cluster_relocating_shards.yaml b/modules/smart-agent_elasticsearch/conf/03-cluster_relocating_shards.yaml index 05ff4bd09..262398179 100644 --- a/modules/smart-agent_elasticsearch/conf/03-cluster_relocating_shards.yaml +++ b/modules/smart-agent_elasticsearch/conf/03-cluster_relocating_shards.yaml @@ -1,7 +1,6 @@ module: ElasticSearch name: "cluster relocating shards" -aggregation: true -transformation: ".min(over='15m')" +aggregation: ".max(by=['cluster'])" filtering: "filter('plugin', 'elasticsearch')" signals: signal: @@ -12,11 +11,4 @@ rules: threshold: 0 comparator: ">" description: "is too high" - lasting_duration: '15m' - major: - threshold: -1 - comparator: ">" - dependency: critical - description: "is too high" - lasting_duration: '15m' - disabled: true + lasting_duration: '15m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/04-cluster_unassigned_shards.yaml b/modules/smart-agent_elasticsearch/conf/04-cluster_unassigned_shards.yaml index d38bad121..ff045141d 100644 --- a/modules/smart-agent_elasticsearch/conf/04-cluster_unassigned_shards.yaml +++ b/modules/smart-agent_elasticsearch/conf/04-cluster_unassigned_shards.yaml @@ -1,7 +1,6 @@ module: ElasticSearch name: "cluster unassigned shards" -aggregation: true -transformation: ".min(over='10m')" +aggregation: ".max(by=['cluster'])" filtering: "filter('plugin', 'elasticsearch')" signals: signal: @@ -12,11 +11,4 @@ rules: threshold: 0 comparator: ">" description: "is too 
high" - lasting_duration: '10m' - major: - threshold: -1 - comparator: ">" - dependency: critical - description: "is too high" - lasting_duration: '10m' - disabled: true \ No newline at end of file + lasting_duration: '10m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/05-cluster_pending_tasks.yaml b/modules/smart-agent_elasticsearch/conf/05-cluster_pending_tasks.yaml index d152c5f2f..63aab518b 100644 --- a/modules/smart-agent_elasticsearch/conf/05-cluster_pending_tasks.yaml +++ b/modules/smart-agent_elasticsearch/conf/05-cluster_pending_tasks.yaml @@ -1,7 +1,6 @@ module: ElasticSearch name: "pending tasks" -aggregation: true -transformation: ".min(over='15m')" +aggregation: ".max(by=['cluster'])" filtering: "filter('plugin', 'elasticsearch')" signals: signal: @@ -10,7 +9,7 @@ signals: rules: critical: threshold: 5 - comparator: ">" + comparator: ">=" description: "are too high" lasting_duration: '15m' major: diff --git a/modules/smart-agent_elasticsearch/conf/06-cluster_cpu_usage.yaml b/modules/smart-agent_elasticsearch/conf/06-cluster_cpu_usage.yaml index cf5421821..26f29a1a2 100644 --- a/modules/smart-agent_elasticsearch/conf/06-cluster_cpu_usage.yaml +++ b/modules/smart-agent_elasticsearch/conf/06-cluster_cpu_usage.yaml @@ -1,7 +1,6 @@ module: ElasticSearch name: "cpu usage" -aggregation: true -transformation: ".min(over='30m')" +aggregation: ".max(by=['cluster'])" filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: signal: @@ -10,7 +9,7 @@ signals: rules: critical: threshold: 95 - comparator: ">" + comparator: ">=" description: "is too high" lasting_duration: '30m' major: diff --git a/modules/smart-agent_elasticsearch/conf/07-cluster_file_descriptor.yaml b/modules/smart-agent_elasticsearch/conf/07-cluster_file_descriptor.yaml index e1c7894cb..5b55150b4 100644 --- a/modules/smart-agent_elasticsearch/conf/07-cluster_file_descriptor.yaml +++ 
b/modules/smart-agent_elasticsearch/conf/07-cluster_file_descriptor.yaml @@ -1,7 +1,6 @@ module: ElasticSearch name: "file descriptors" -aggregation: true -transformation: ".max(over='15m')" +aggregation: ".max(by=['cluster'])" filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: A: @@ -15,7 +14,7 @@ signals: rules: critical: threshold: 95 - comparator: ">" + comparator: ">=" description: "is too high" lasting_duration: '15m' major: diff --git a/modules/smart-agent_elasticsearch/conf/08-cluster_JVM_heap_memory.yaml b/modules/smart-agent_elasticsearch/conf/08-cluster_JVM_heap_memory.yaml index 483adc0ee..fb589bbef 100644 --- a/modules/smart-agent_elasticsearch/conf/08-cluster_JVM_heap_memory.yaml +++ b/modules/smart-agent_elasticsearch/conf/08-cluster_JVM_heap_memory.yaml @@ -1,7 +1,6 @@ module: ElasticSearch name: "JVM heap memory usage" -aggregation: true -transformation: ".mean(over='5m')" +aggregation: ".max(by=['cluster'])" filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: signal: @@ -10,7 +9,7 @@ signals: rules: critical: threshold: 90 - comparator: ">" + comparator: ">=" description: "is too high" lasting_duration: '5m' major: diff --git a/modules/smart-agent_elasticsearch/conf/09-cluster_JVM_memory_young_usage.yaml b/modules/smart-agent_elasticsearch/conf/09-cluster_JVM_memory_young_usage.yaml index d669c465a..ee42abe48 100644 --- a/modules/smart-agent_elasticsearch/conf/09-cluster_JVM_memory_young_usage.yaml +++ b/modules/smart-agent_elasticsearch/conf/09-cluster_JVM_memory_young_usage.yaml @@ -1,7 +1,6 @@ module: ElasticSearch name: "JVM memory young usage" -aggregation: true -transformation: ".mean(over='10m')" +aggregation: ".max(by=['cluster'])" filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: A: @@ -15,7 +14,7 @@ signals: rules: major: threshold: 90 - comparator: ">" + comparator: ">=" description: "is too high" lasting_duration: '10m' minor: diff 
--git a/modules/smart-agent_elasticsearch/conf/10-cluster_JVM_memory_old_usage.yaml b/modules/smart-agent_elasticsearch/conf/10-cluster_JVM_memory_old_usage.yaml index ab617e0d1..b00693737 100644 --- a/modules/smart-agent_elasticsearch/conf/10-cluster_JVM_memory_old_usage.yaml +++ b/modules/smart-agent_elasticsearch/conf/10-cluster_JVM_memory_old_usage.yaml @@ -1,7 +1,6 @@ module: ElasticSearch name: "JVM memory old usage" -aggregation: true -transformation: ".mean(over='10m')" +aggregation: ".max(by=['cluster'])" filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: A: @@ -15,7 +14,7 @@ signals: rules: major: threshold: 90 - comparator: ">" + comparator: ">=" description: "is too high" lasting_duration: '10m' minor: diff --git a/modules/smart-agent_elasticsearch/conf/11-cluster_old-generation_garbage_collections_latency.yaml b/modules/smart-agent_elasticsearch/conf/11-cluster_old-generation_garbage_collections_latency.yaml index 65da726cd..63be00353 100644 --- a/modules/smart-agent_elasticsearch/conf/11-cluster_old-generation_garbage_collections_latency.yaml +++ b/modules/smart-agent_elasticsearch/conf/11-cluster_old-generation_garbage_collections_latency.yaml @@ -1,7 +1,6 @@ module: ElasticSearch name: "jvm gc old collection latency" -aggregation: true -transformation: ".mean(over='15m')" +aggregation: ".max(by=['cluster'])" filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: A: @@ -17,7 +16,7 @@ signals: rules: major: threshold: 300 - comparator: ">" + comparator: ">=" description: "is too high" lasting_duration: '15m' minor: diff --git a/modules/smart-agent_elasticsearch/conf/12-cluster_young-generation_garbage_collections_latency.yaml b/modules/smart-agent_elasticsearch/conf/12-cluster_young-generation_garbage_collections_latency.yaml index bc1f8bb7d..6fa6dfa99 100644 --- a/modules/smart-agent_elasticsearch/conf/12-cluster_young-generation_garbage_collections_latency.yaml +++ 
b/modules/smart-agent_elasticsearch/conf/12-cluster_young-generation_garbage_collections_latency.yaml @@ -1,7 +1,6 @@ module: ElasticSearch name: "jvm gc young collection latency" -aggregation: true -transformation: ".mean(over='15m')" +aggregation: ".max(by=['cluster'])" filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: A: @@ -17,7 +16,7 @@ signals: rules: major: threshold: 40 - comparator: ">" + comparator: ">=" description: "is too high" lasting_duration: '15m' minor: diff --git a/modules/smart-agent_elasticsearch/conf/13-cluster_indexing_latency.yaml b/modules/smart-agent_elasticsearch/conf/13-cluster_indexing_latency.yaml index b892a4314..f881d43cb 100644 --- a/modules/smart-agent_elasticsearch/conf/13-cluster_indexing_latency.yaml +++ b/modules/smart-agent_elasticsearch/conf/13-cluster_indexing_latency.yaml @@ -1,7 +1,6 @@ module: ElasticSearch name: "indexing latency" -aggregation: true -transformation: ".mean(over='15m')" +aggregation: ".max(by=['cluster'])" filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: A: @@ -17,7 +16,7 @@ signals: rules: major: threshold: 30 - comparator: ">" + comparator: ">=" description: "is too high" lasting_duration: '1h' minor: diff --git a/modules/smart-agent_elasticsearch/conf/14-cluster_flush_latency.yaml b/modules/smart-agent_elasticsearch/conf/14-cluster_flush_latency.yaml index 0b5fa3bca..3dc7b354b 100644 --- a/modules/smart-agent_elasticsearch/conf/14-cluster_flush_latency.yaml +++ b/modules/smart-agent_elasticsearch/conf/14-cluster_flush_latency.yaml @@ -1,7 +1,6 @@ module: ElasticSearch name: "flush latency" -aggregation: true -transformation: ".mean(over='30m')" +aggregation: ".max(by=['cluster'])" filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: A: @@ -17,7 +16,7 @@ signals: rules: major: threshold: 150 - comparator: ">" + comparator: ">=" description: "is too high" lasting_duration: '15m' minor: diff --git 
a/modules/smart-agent_elasticsearch/conf/15-cluster_search_latency.yaml b/modules/smart-agent_elasticsearch/conf/15-cluster_search_latency.yaml index c8e17fa29..cb6872764 100644 --- a/modules/smart-agent_elasticsearch/conf/15-cluster_search_latency.yaml +++ b/modules/smart-agent_elasticsearch/conf/15-cluster_search_latency.yaml @@ -1,7 +1,6 @@ module: ElasticSearch name: "search latency" -aggregation: true -transformation: ".min(over='30m')" +aggregation: ".max(by=['cluster'])" filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: A: @@ -17,7 +16,7 @@ signals: rules: major: threshold: 20 - comparator: ">" + comparator: ">=" description: "is too high" lasting_duration: '30m' minor: diff --git a/modules/smart-agent_elasticsearch/conf/16-cluster_fetch_latency.yaml b/modules/smart-agent_elasticsearch/conf/16-cluster_fetch_latency.yaml index 34d4b7791..6dd286303 100644 --- a/modules/smart-agent_elasticsearch/conf/16-cluster_fetch_latency.yaml +++ b/modules/smart-agent_elasticsearch/conf/16-cluster_fetch_latency.yaml @@ -1,7 +1,6 @@ module: ElasticSearch name: "fetch latency" -aggregation: true -transformation: ".min(over='15m')" +aggregation: ".max(by=['cluster'])" filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: A: @@ -17,7 +16,7 @@ signals: rules: major: threshold: 20 - comparator: ">" + comparator: ">=" description: "is too high" lasting_duration: '15m' minor: diff --git a/modules/smart-agent_elasticsearch/conf/17-cluster_fielddata_cache_evictions_rate.yaml b/modules/smart-agent_elasticsearch/conf/17-cluster_fielddata_cache_evictions_rate.yaml index f0c3dbfe4..ba8bd48b7 100644 --- a/modules/smart-agent_elasticsearch/conf/17-cluster_fielddata_cache_evictions_rate.yaml +++ b/modules/smart-agent_elasticsearch/conf/17-cluster_fielddata_cache_evictions_rate.yaml @@ -1,7 +1,6 @@ module: ElasticSearch name: "field_data evictions change" -aggregation: true -transformation: ".mean(over='15m')" +aggregation: 
".max(by=['cluster'])" filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" signals: A: @@ -13,7 +12,7 @@ signals: rules: major: threshold: 120 - comparator: ">" + comparator: ">=" description: "is too high" lasting_duration: '15m' minor: diff --git a/modules/smart-agent_elasticsearch/conf/18-cluster_time_in_queue_change.yaml b/modules/smart-agent_elasticsearch/conf/18-cluster_time_in_queue_change.yaml index a4cbcb916..d8701b683 100644 --- a/modules/smart-agent_elasticsearch/conf/18-cluster_time_in_queue_change.yaml +++ b/modules/smart-agent_elasticsearch/conf/18-cluster_time_in_queue_change.yaml @@ -1,7 +1,6 @@ module: ElasticSearch name: "task time in queue change" -aggregation: true -transformation: ".mean(over='15m')" +aggregation: ".max(by=['cluster'])" filtering: "filter('plugin', 'elasticsearch')" signals: A: @@ -12,7 +11,7 @@ signals: rules: major: threshold: 200 - comparator: ">" + comparator: ">=" description: "is too high" lasting_duration: '15m' minor: diff --git a/modules/smart-agent_elasticsearch/detectors-gen.tf b/modules/smart-agent_elasticsearch/detectors-gen.tf index 0ebacf323..b0fbd51ce 100644 --- a/modules/smart-agent_elasticsearch/detectors-gen.tf +++ b/modules/smart-agent_elasticsearch/detectors-gen.tf @@ -37,12 +37,12 @@ resource "signalfx_detector" "cluster_status" { program_text = <<-EOF base_filtering = filter('plugin', 'elasticsearch') signal = data('elasticsearch.cluster.status', filter=base_filtering and ${module.filtering.signalflow})${var.cluster_status_aggregation_function}${var.cluster_status_transformation_function}.publish('signal') - detect(when(signal == ${var.cluster_status_threshold_critical}, lasting=%{if var.cluster_status_lasting_duration_critical == null}None%{else}'${var.cluster_status_lasting_duration_critical}'%{endif}, at_least=${var.cluster_status_at_least_percentage_critical})).publish('CRIT') - detect(when(signal == ${var.cluster_status_threshold_major}, lasting=%{if 
var.cluster_status_lasting_duration_major == null}None%{else}'${var.cluster_status_lasting_duration_major}'%{endif}, at_least=${var.cluster_status_at_least_percentage_major})).publish('MAJOR') + detect(when(signal >= ${var.cluster_status_threshold_critical}, lasting=%{if var.cluster_status_lasting_duration_critical == null}None%{else}'${var.cluster_status_lasting_duration_critical}'%{endif}, at_least=${var.cluster_status_at_least_percentage_critical})).publish('CRIT') + detect(when(signal >= ${var.cluster_status_threshold_major}, lasting=%{if var.cluster_status_lasting_duration_major == null}None%{else}'${var.cluster_status_lasting_duration_major}'%{endif}, at_least=${var.cluster_status_at_least_percentage_major})).publish('MAJOR') EOF rule { - description = "is red == ${var.cluster_status_threshold_critical}" + description = "is red >= ${var.cluster_status_threshold_critical}" severity = "Critical" detect_label = "CRIT" disabled = coalesce(var.cluster_status_disabled_critical, var.cluster_status_disabled, var.detectors_disabled) @@ -54,7 +54,7 @@ EOF } rule { - description = "is yellow == ${var.cluster_status_threshold_major}" + description = "is yellow >= ${var.cluster_status_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.cluster_status_disabled_major, var.cluster_status_disabled, var.detectors_disabled) @@ -78,12 +78,12 @@ resource "signalfx_detector" "cluster_initializing_shards" { program_text = <<-EOF base_filtering = filter('plugin', 'elasticsearch') signal = data('elasticsearch.cluster.initializing-shards', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.cluster_initializing_shards_aggregation_function}${var.cluster_initializing_shards_transformation_function}.publish('signal') - detect(when(signal > ${var.cluster_initializing_shards_threshold_critical}, lasting=%{if var.cluster_initializing_shards_lasting_duration_critical == 
null}None%{else}'${var.cluster_initializing_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_initializing_shards_at_least_percentage_critical})).publish('CRIT') - detect(when(signal > ${var.cluster_initializing_shards_threshold_major}, lasting=%{if var.cluster_initializing_shards_lasting_duration_major == null}None%{else}'${var.cluster_initializing_shards_lasting_duration_major}'%{endif}, at_least=${var.cluster_initializing_shards_at_least_percentage_major}) and (not when(signal > ${var.cluster_initializing_shards_threshold_critical}, lasting=%{if var.cluster_initializing_shards_lasting_duration_critical == null}None%{else}'${var.cluster_initializing_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_initializing_shards_at_least_percentage_critical}))).publish('MAJOR') + detect(when(signal >= ${var.cluster_initializing_shards_threshold_critical}, lasting=%{if var.cluster_initializing_shards_lasting_duration_critical == null}None%{else}'${var.cluster_initializing_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_initializing_shards_at_least_percentage_critical})).publish('CRIT') + detect(when(signal > ${var.cluster_initializing_shards_threshold_major}, lasting=%{if var.cluster_initializing_shards_lasting_duration_major == null}None%{else}'${var.cluster_initializing_shards_lasting_duration_major}'%{endif}, at_least=${var.cluster_initializing_shards_at_least_percentage_major}) and (not when(signal >= ${var.cluster_initializing_shards_threshold_critical}, lasting=%{if var.cluster_initializing_shards_lasting_duration_critical == null}None%{else}'${var.cluster_initializing_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_initializing_shards_at_least_percentage_critical}))).publish('MAJOR') EOF rule { - description = "is too high > ${var.cluster_initializing_shards_threshold_critical}" + description = "is too high >= ${var.cluster_initializing_shards_threshold_critical}" severity = "Critical" 
detect_label = "CRIT" disabled = coalesce(var.cluster_initializing_shards_disabled_critical, var.cluster_initializing_shards_disabled, var.detectors_disabled) @@ -120,14 +120,13 @@ resource "signalfx_detector" "cluster_relocating_shards" { base_filtering = filter('plugin', 'elasticsearch') signal = data('elasticsearch.cluster.relocating-shards', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.cluster_relocating_shards_aggregation_function}${var.cluster_relocating_shards_transformation_function}.publish('signal') detect(when(signal > ${var.cluster_relocating_shards_threshold_critical}, lasting=%{if var.cluster_relocating_shards_lasting_duration_critical == null}None%{else}'${var.cluster_relocating_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_relocating_shards_at_least_percentage_critical})).publish('CRIT') - detect(when(signal > ${var.cluster_relocating_shards_threshold_major}, lasting=%{if var.cluster_relocating_shards_lasting_duration_major == null}None%{else}'${var.cluster_relocating_shards_lasting_duration_major}'%{endif}, at_least=${var.cluster_relocating_shards_at_least_percentage_major}) and (not when(signal > ${var.cluster_relocating_shards_threshold_critical}, lasting=%{if var.cluster_relocating_shards_lasting_duration_critical == null}None%{else}'${var.cluster_relocating_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_relocating_shards_at_least_percentage_critical}))).publish('MAJOR') EOF rule { description = "is too high > ${var.cluster_relocating_shards_threshold_critical}" severity = "Critical" detect_label = "CRIT" - disabled = coalesce(var.cluster_relocating_shards_disabled_critical, var.cluster_relocating_shards_disabled, var.detectors_disabled) + disabled = coalesce(var.cluster_relocating_shards_disabled, var.detectors_disabled) notifications = try(coalescelist(lookup(var.cluster_relocating_shards_notifications, "critical", []), var.notifications.critical), null) runbook_url = 
try(coalesce(var.cluster_relocating_shards_runbook_url, var.runbook_url), "") tip = var.cluster_relocating_shards_tip @@ -135,18 +134,6 @@ EOF parameterized_body = var.message_body == "" ? local.rule_body : var.message_body } - rule { - description = "is too high > ${var.cluster_relocating_shards_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.cluster_relocating_shards_disabled_major, var.cluster_relocating_shards_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.cluster_relocating_shards_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.cluster_relocating_shards_runbook_url, var.runbook_url), "") - tip = var.cluster_relocating_shards_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - max_delay = var.cluster_relocating_shards_max_delay } @@ -161,14 +148,13 @@ resource "signalfx_detector" "cluster_unassigned_shards" { base_filtering = filter('plugin', 'elasticsearch') signal = data('elasticsearch.cluster.unassigned-shards', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.cluster_unassigned_shards_aggregation_function}${var.cluster_unassigned_shards_transformation_function}.publish('signal') detect(when(signal > ${var.cluster_unassigned_shards_threshold_critical}, lasting=%{if var.cluster_unassigned_shards_lasting_duration_critical == null}None%{else}'${var.cluster_unassigned_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_unassigned_shards_at_least_percentage_critical})).publish('CRIT') - detect(when(signal > ${var.cluster_unassigned_shards_threshold_major}, lasting=%{if var.cluster_unassigned_shards_lasting_duration_major == null}None%{else}'${var.cluster_unassigned_shards_lasting_duration_major}'%{endif}, 
at_least=${var.cluster_unassigned_shards_at_least_percentage_major}) and (not when(signal > ${var.cluster_unassigned_shards_threshold_critical}, lasting=%{if var.cluster_unassigned_shards_lasting_duration_critical == null}None%{else}'${var.cluster_unassigned_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_unassigned_shards_at_least_percentage_critical}))).publish('MAJOR') EOF rule { description = "is too high > ${var.cluster_unassigned_shards_threshold_critical}" severity = "Critical" detect_label = "CRIT" - disabled = coalesce(var.cluster_unassigned_shards_disabled_critical, var.cluster_unassigned_shards_disabled, var.detectors_disabled) + disabled = coalesce(var.cluster_unassigned_shards_disabled, var.detectors_disabled) notifications = try(coalescelist(lookup(var.cluster_unassigned_shards_notifications, "critical", []), var.notifications.critical), null) runbook_url = try(coalesce(var.cluster_unassigned_shards_runbook_url, var.runbook_url), "") tip = var.cluster_unassigned_shards_tip @@ -176,18 +162,6 @@ EOF parameterized_body = var.message_body == "" ? local.rule_body : var.message_body } - rule { - description = "is too high > ${var.cluster_unassigned_shards_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.cluster_unassigned_shards_disabled_major, var.cluster_unassigned_shards_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.cluster_unassigned_shards_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.cluster_unassigned_shards_runbook_url, var.runbook_url), "") - tip = var.cluster_unassigned_shards_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body - } - max_delay = var.cluster_unassigned_shards_max_delay } @@ -201,12 +175,12 @@ resource "signalfx_detector" "pending_tasks" { program_text = <<-EOF base_filtering = filter('plugin', 'elasticsearch') signal = data('elasticsearch.cluster.pending-tasks', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.pending_tasks_aggregation_function}${var.pending_tasks_transformation_function}.publish('signal') - detect(when(signal > ${var.pending_tasks_threshold_critical}, lasting=%{if var.pending_tasks_lasting_duration_critical == null}None%{else}'${var.pending_tasks_lasting_duration_critical}'%{endif}, at_least=${var.pending_tasks_at_least_percentage_critical})).publish('CRIT') - detect(when(signal > ${var.pending_tasks_threshold_major}, lasting=%{if var.pending_tasks_lasting_duration_major == null}None%{else}'${var.pending_tasks_lasting_duration_major}'%{endif}, at_least=${var.pending_tasks_at_least_percentage_major}) and (not when(signal > ${var.pending_tasks_threshold_critical}, lasting=%{if var.pending_tasks_lasting_duration_critical == null}None%{else}'${var.pending_tasks_lasting_duration_critical}'%{endif}, at_least=${var.pending_tasks_at_least_percentage_critical}))).publish('MAJOR') + detect(when(signal >= ${var.pending_tasks_threshold_critical}, lasting=%{if var.pending_tasks_lasting_duration_critical == null}None%{else}'${var.pending_tasks_lasting_duration_critical}'%{endif}, at_least=${var.pending_tasks_at_least_percentage_critical})).publish('CRIT') + detect(when(signal > ${var.pending_tasks_threshold_major}, lasting=%{if var.pending_tasks_lasting_duration_major == null}None%{else}'${var.pending_tasks_lasting_duration_major}'%{endif}, at_least=${var.pending_tasks_at_least_percentage_major}) and (not when(signal >= ${var.pending_tasks_threshold_critical}, lasting=%{if var.pending_tasks_lasting_duration_critical == null}None%{else}'${var.pending_tasks_lasting_duration_critical}'%{endif}, 
at_least=${var.pending_tasks_at_least_percentage_critical}))).publish('MAJOR') EOF rule { - description = "are too high > ${var.pending_tasks_threshold_critical}" + description = "are too high >= ${var.pending_tasks_threshold_critical}" severity = "Critical" detect_label = "CRIT" disabled = coalesce(var.pending_tasks_disabled_critical, var.pending_tasks_disabled, var.detectors_disabled) @@ -242,12 +216,12 @@ resource "signalfx_detector" "cpu_usage" { program_text = <<-EOF base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') signal = data('elasticsearch.process.cpu.percent', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.cpu_usage_aggregation_function}${var.cpu_usage_transformation_function}.publish('signal') - detect(when(signal > ${var.cpu_usage_threshold_critical}, lasting=%{if var.cpu_usage_lasting_duration_critical == null}None%{else}'${var.cpu_usage_lasting_duration_critical}'%{endif}, at_least=${var.cpu_usage_at_least_percentage_critical})).publish('CRIT') - detect(when(signal > ${var.cpu_usage_threshold_major}, lasting=%{if var.cpu_usage_lasting_duration_major == null}None%{else}'${var.cpu_usage_lasting_duration_major}'%{endif}, at_least=${var.cpu_usage_at_least_percentage_major}) and (not when(signal > ${var.cpu_usage_threshold_critical}, lasting=%{if var.cpu_usage_lasting_duration_critical == null}None%{else}'${var.cpu_usage_lasting_duration_critical}'%{endif}, at_least=${var.cpu_usage_at_least_percentage_critical}))).publish('MAJOR') + detect(when(signal >= ${var.cpu_usage_threshold_critical}, lasting=%{if var.cpu_usage_lasting_duration_critical == null}None%{else}'${var.cpu_usage_lasting_duration_critical}'%{endif}, at_least=${var.cpu_usage_at_least_percentage_critical})).publish('CRIT') + detect(when(signal > ${var.cpu_usage_threshold_major}, lasting=%{if var.cpu_usage_lasting_duration_major == null}None%{else}'${var.cpu_usage_lasting_duration_major}'%{endif}, 
at_least=${var.cpu_usage_at_least_percentage_major}) and (not when(signal >= ${var.cpu_usage_threshold_critical}, lasting=%{if var.cpu_usage_lasting_duration_critical == null}None%{else}'${var.cpu_usage_lasting_duration_critical}'%{endif}, at_least=${var.cpu_usage_at_least_percentage_critical}))).publish('MAJOR') EOF rule { - description = "is too high > ${var.cpu_usage_threshold_critical}" + description = "is too high >= ${var.cpu_usage_threshold_critical}" severity = "Critical" detect_label = "CRIT" disabled = coalesce(var.cpu_usage_disabled_critical, var.cpu_usage_disabled, var.detectors_disabled) @@ -285,12 +259,12 @@ resource "signalfx_detector" "file_descriptors" { A = data('elasticsearch.process.open_file_descriptors', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.file_descriptors_aggregation_function}${var.file_descriptors_transformation_function} B = data('elasticsearch.process.max_file_descriptors', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.file_descriptors_aggregation_function}${var.file_descriptors_transformation_function} signal = (A/B).scale(100).publish('signal') - detect(when(signal > ${var.file_descriptors_threshold_critical}, lasting=%{if var.file_descriptors_lasting_duration_critical == null}None%{else}'${var.file_descriptors_lasting_duration_critical}'%{endif}, at_least=${var.file_descriptors_at_least_percentage_critical})).publish('CRIT') - detect(when(signal > ${var.file_descriptors_threshold_major}, lasting=%{if var.file_descriptors_lasting_duration_major == null}None%{else}'${var.file_descriptors_lasting_duration_major}'%{endif}, at_least=${var.file_descriptors_at_least_percentage_major}) and (not when(signal > ${var.file_descriptors_threshold_critical}, lasting=%{if var.file_descriptors_lasting_duration_critical == null}None%{else}'${var.file_descriptors_lasting_duration_critical}'%{endif}, 
at_least=${var.file_descriptors_at_least_percentage_critical}))).publish('MAJOR') + detect(when(signal >= ${var.file_descriptors_threshold_critical}, lasting=%{if var.file_descriptors_lasting_duration_critical == null}None%{else}'${var.file_descriptors_lasting_duration_critical}'%{endif}, at_least=${var.file_descriptors_at_least_percentage_critical})).publish('CRIT') + detect(when(signal > ${var.file_descriptors_threshold_major}, lasting=%{if var.file_descriptors_lasting_duration_major == null}None%{else}'${var.file_descriptors_lasting_duration_major}'%{endif}, at_least=${var.file_descriptors_at_least_percentage_major}) and (not when(signal >= ${var.file_descriptors_threshold_critical}, lasting=%{if var.file_descriptors_lasting_duration_critical == null}None%{else}'${var.file_descriptors_lasting_duration_critical}'%{endif}, at_least=${var.file_descriptors_at_least_percentage_critical}))).publish('MAJOR') EOF rule { - description = "is too high > ${var.file_descriptors_threshold_critical}" + description = "is too high >= ${var.file_descriptors_threshold_critical}" severity = "Critical" detect_label = "CRIT" disabled = coalesce(var.file_descriptors_disabled_critical, var.file_descriptors_disabled, var.detectors_disabled) @@ -326,12 +300,12 @@ resource "signalfx_detector" "jvm_heap_memory_usage" { program_text = <<-EOF base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') signal = data('elasticsearch.jvm.mem.heap-used-percent', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.jvm_heap_memory_usage_aggregation_function}${var.jvm_heap_memory_usage_transformation_function}.publish('signal') - detect(when(signal > ${var.jvm_heap_memory_usage_threshold_critical}, lasting=%{if var.jvm_heap_memory_usage_lasting_duration_critical == null}None%{else}'${var.jvm_heap_memory_usage_lasting_duration_critical}'%{endif}, at_least=${var.jvm_heap_memory_usage_at_least_percentage_critical})).publish('CRIT') - detect(when(signal > 
${var.jvm_heap_memory_usage_threshold_major}, lasting=%{if var.jvm_heap_memory_usage_lasting_duration_major == null}None%{else}'${var.jvm_heap_memory_usage_lasting_duration_major}'%{endif}, at_least=${var.jvm_heap_memory_usage_at_least_percentage_major}) and (not when(signal > ${var.jvm_heap_memory_usage_threshold_critical}, lasting=%{if var.jvm_heap_memory_usage_lasting_duration_critical == null}None%{else}'${var.jvm_heap_memory_usage_lasting_duration_critical}'%{endif}, at_least=${var.jvm_heap_memory_usage_at_least_percentage_critical}))).publish('MAJOR') + detect(when(signal >= ${var.jvm_heap_memory_usage_threshold_critical}, lasting=%{if var.jvm_heap_memory_usage_lasting_duration_critical == null}None%{else}'${var.jvm_heap_memory_usage_lasting_duration_critical}'%{endif}, at_least=${var.jvm_heap_memory_usage_at_least_percentage_critical})).publish('CRIT') + detect(when(signal > ${var.jvm_heap_memory_usage_threshold_major}, lasting=%{if var.jvm_heap_memory_usage_lasting_duration_major == null}None%{else}'${var.jvm_heap_memory_usage_lasting_duration_major}'%{endif}, at_least=${var.jvm_heap_memory_usage_at_least_percentage_major}) and (not when(signal >= ${var.jvm_heap_memory_usage_threshold_critical}, lasting=%{if var.jvm_heap_memory_usage_lasting_duration_critical == null}None%{else}'${var.jvm_heap_memory_usage_lasting_duration_critical}'%{endif}, at_least=${var.jvm_heap_memory_usage_at_least_percentage_critical}))).publish('MAJOR') EOF rule { - description = "is too high > ${var.jvm_heap_memory_usage_threshold_critical}" + description = "is too high >= ${var.jvm_heap_memory_usage_threshold_critical}" severity = "Critical" detect_label = "CRIT" disabled = coalesce(var.jvm_heap_memory_usage_disabled_critical, var.jvm_heap_memory_usage_disabled, var.detectors_disabled) @@ -369,12 +343,12 @@ resource "signalfx_detector" "jvm_memory_young_usage" { A = data('elasticsearch.jvm.mem.pools.young.used_in_bytes', filter=base_filtering and ${module.filtering.signalflow}, 
rollup='average')${var.jvm_memory_young_usage_aggregation_function}${var.jvm_memory_young_usage_transformation_function} B = data('elasticsearch.jvm.mem.pools.young.max_in_bytes', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_young_usage_aggregation_function}${var.jvm_memory_young_usage_transformation_function} signal = (A/B).fill(0).scale(100).publish('signal') - detect(when(signal > ${var.jvm_memory_young_usage_threshold_major}, lasting=%{if var.jvm_memory_young_usage_lasting_duration_major == null}None%{else}'${var.jvm_memory_young_usage_lasting_duration_major}'%{endif}, at_least=${var.jvm_memory_young_usage_at_least_percentage_major})).publish('MAJOR') - detect(when(signal > ${var.jvm_memory_young_usage_threshold_minor}, lasting=%{if var.jvm_memory_young_usage_lasting_duration_minor == null}None%{else}'${var.jvm_memory_young_usage_lasting_duration_minor}'%{endif}, at_least=${var.jvm_memory_young_usage_at_least_percentage_minor}) and (not when(signal > ${var.jvm_memory_young_usage_threshold_major}, lasting=%{if var.jvm_memory_young_usage_lasting_duration_major == null}None%{else}'${var.jvm_memory_young_usage_lasting_duration_major}'%{endif}, at_least=${var.jvm_memory_young_usage_at_least_percentage_major}))).publish('MINOR') + detect(when(signal >= ${var.jvm_memory_young_usage_threshold_major}, lasting=%{if var.jvm_memory_young_usage_lasting_duration_major == null}None%{else}'${var.jvm_memory_young_usage_lasting_duration_major}'%{endif}, at_least=${var.jvm_memory_young_usage_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.jvm_memory_young_usage_threshold_minor}, lasting=%{if var.jvm_memory_young_usage_lasting_duration_minor == null}None%{else}'${var.jvm_memory_young_usage_lasting_duration_minor}'%{endif}, at_least=${var.jvm_memory_young_usage_at_least_percentage_minor}) and (not when(signal >= ${var.jvm_memory_young_usage_threshold_major}, lasting=%{if 
var.jvm_memory_young_usage_lasting_duration_major == null}None%{else}'${var.jvm_memory_young_usage_lasting_duration_major}'%{endif}, at_least=${var.jvm_memory_young_usage_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.jvm_memory_young_usage_threshold_major}" + description = "is too high >= ${var.jvm_memory_young_usage_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.jvm_memory_young_usage_disabled_major, var.jvm_memory_young_usage_disabled, var.detectors_disabled) @@ -412,12 +386,12 @@ resource "signalfx_detector" "jvm_memory_old_usage" { A = data('elasticsearch.jvm.mem.pools.old.used_in_bytes', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_old_usage_aggregation_function}${var.jvm_memory_old_usage_transformation_function} B = data('elasticsearch.jvm.mem.pools.old.max_in_bytes', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_old_usage_aggregation_function}${var.jvm_memory_old_usage_transformation_function} signal = (A/B).fill(0).scale(100).publish('signal') - detect(when(signal > ${var.jvm_memory_old_usage_threshold_major}, lasting=%{if var.jvm_memory_old_usage_lasting_duration_major == null}None%{else}'${var.jvm_memory_old_usage_lasting_duration_major}'%{endif}, at_least=${var.jvm_memory_old_usage_at_least_percentage_major})).publish('MAJOR') - detect(when(signal > ${var.jvm_memory_old_usage_threshold_minor}, lasting=%{if var.jvm_memory_old_usage_lasting_duration_minor == null}None%{else}'${var.jvm_memory_old_usage_lasting_duration_minor}'%{endif}, at_least=${var.jvm_memory_old_usage_at_least_percentage_minor}) and (not when(signal > ${var.jvm_memory_old_usage_threshold_major}, lasting=%{if var.jvm_memory_old_usage_lasting_duration_major == null}None%{else}'${var.jvm_memory_old_usage_lasting_duration_major}'%{endif}, 
at_least=${var.jvm_memory_old_usage_at_least_percentage_major}))).publish('MINOR') + detect(when(signal >= ${var.jvm_memory_old_usage_threshold_major}, lasting=%{if var.jvm_memory_old_usage_lasting_duration_major == null}None%{else}'${var.jvm_memory_old_usage_lasting_duration_major}'%{endif}, at_least=${var.jvm_memory_old_usage_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.jvm_memory_old_usage_threshold_minor}, lasting=%{if var.jvm_memory_old_usage_lasting_duration_minor == null}None%{else}'${var.jvm_memory_old_usage_lasting_duration_minor}'%{endif}, at_least=${var.jvm_memory_old_usage_at_least_percentage_minor}) and (not when(signal >= ${var.jvm_memory_old_usage_threshold_major}, lasting=%{if var.jvm_memory_old_usage_lasting_duration_major == null}None%{else}'${var.jvm_memory_old_usage_lasting_duration_major}'%{endif}, at_least=${var.jvm_memory_old_usage_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.jvm_memory_old_usage_threshold_major}" + description = "is too high >= ${var.jvm_memory_old_usage_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.jvm_memory_old_usage_disabled_major, var.jvm_memory_old_usage_disabled, var.detectors_disabled) @@ -455,12 +429,12 @@ resource "signalfx_detector" "jvm_gc_old_collection_latency" { A = data('elasticsearch.jvm.gc.old-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.jvm_gc_old_collection_latency_aggregation_function}${var.jvm_gc_old_collection_latency_transformation_function} B = data('elasticsearch.jvm.gc.old-count', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.jvm_gc_old_collection_latency_aggregation_function}${var.jvm_gc_old_collection_latency_transformation_function} signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.jvm_gc_old_collection_latency_threshold_major}, lasting=%{if 
var.jvm_gc_old_collection_latency_lasting_duration_major == null}None%{else}'${var.jvm_gc_old_collection_latency_lasting_duration_major}'%{endif}, at_least=${var.jvm_gc_old_collection_latency_at_least_percentage_major})).publish('MAJOR') - detect(when(signal > ${var.jvm_gc_old_collection_latency_threshold_minor}, lasting=%{if var.jvm_gc_old_collection_latency_lasting_duration_minor == null}None%{else}'${var.jvm_gc_old_collection_latency_lasting_duration_minor}'%{endif}, at_least=${var.jvm_gc_old_collection_latency_at_least_percentage_minor}) and (not when(signal > ${var.jvm_gc_old_collection_latency_threshold_major}, lasting=%{if var.jvm_gc_old_collection_latency_lasting_duration_major == null}None%{else}'${var.jvm_gc_old_collection_latency_lasting_duration_major}'%{endif}, at_least=${var.jvm_gc_old_collection_latency_at_least_percentage_major}))).publish('MINOR') + detect(when(signal >= ${var.jvm_gc_old_collection_latency_threshold_major}, lasting=%{if var.jvm_gc_old_collection_latency_lasting_duration_major == null}None%{else}'${var.jvm_gc_old_collection_latency_lasting_duration_major}'%{endif}, at_least=${var.jvm_gc_old_collection_latency_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.jvm_gc_old_collection_latency_threshold_minor}, lasting=%{if var.jvm_gc_old_collection_latency_lasting_duration_minor == null}None%{else}'${var.jvm_gc_old_collection_latency_lasting_duration_minor}'%{endif}, at_least=${var.jvm_gc_old_collection_latency_at_least_percentage_minor}) and (not when(signal >= ${var.jvm_gc_old_collection_latency_threshold_major}, lasting=%{if var.jvm_gc_old_collection_latency_lasting_duration_major == null}None%{else}'${var.jvm_gc_old_collection_latency_lasting_duration_major}'%{endif}, at_least=${var.jvm_gc_old_collection_latency_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.jvm_gc_old_collection_latency_threshold_major}" + description = "is too high >= 
${var.jvm_gc_old_collection_latency_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.jvm_gc_old_collection_latency_disabled_major, var.jvm_gc_old_collection_latency_disabled, var.detectors_disabled) @@ -498,12 +472,12 @@ resource "signalfx_detector" "jvm_gc_young_collection_latency" { A = data('elasticsearch.jvm.gc.time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.jvm_gc_young_collection_latency_aggregation_function}${var.jvm_gc_young_collection_latency_transformation_function} B = data('elasticsearch.jvm.gc.count', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.jvm_gc_young_collection_latency_aggregation_function}${var.jvm_gc_young_collection_latency_transformation_function} signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.jvm_gc_young_collection_latency_threshold_major}, lasting=%{if var.jvm_gc_young_collection_latency_lasting_duration_major == null}None%{else}'${var.jvm_gc_young_collection_latency_lasting_duration_major}'%{endif}, at_least=${var.jvm_gc_young_collection_latency_at_least_percentage_major})).publish('MAJOR') - detect(when(signal > ${var.jvm_gc_young_collection_latency_threshold_minor}, lasting=%{if var.jvm_gc_young_collection_latency_lasting_duration_minor == null}None%{else}'${var.jvm_gc_young_collection_latency_lasting_duration_minor}'%{endif}, at_least=${var.jvm_gc_young_collection_latency_at_least_percentage_minor}) and (not when(signal > ${var.jvm_gc_young_collection_latency_threshold_major}, lasting=%{if var.jvm_gc_young_collection_latency_lasting_duration_major == null}None%{else}'${var.jvm_gc_young_collection_latency_lasting_duration_major}'%{endif}, at_least=${var.jvm_gc_young_collection_latency_at_least_percentage_major}))).publish('MINOR') + detect(when(signal >= ${var.jvm_gc_young_collection_latency_threshold_major}, lasting=%{if 
var.jvm_gc_young_collection_latency_lasting_duration_major == null}None%{else}'${var.jvm_gc_young_collection_latency_lasting_duration_major}'%{endif}, at_least=${var.jvm_gc_young_collection_latency_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.jvm_gc_young_collection_latency_threshold_minor}, lasting=%{if var.jvm_gc_young_collection_latency_lasting_duration_minor == null}None%{else}'${var.jvm_gc_young_collection_latency_lasting_duration_minor}'%{endif}, at_least=${var.jvm_gc_young_collection_latency_at_least_percentage_minor}) and (not when(signal >= ${var.jvm_gc_young_collection_latency_threshold_major}, lasting=%{if var.jvm_gc_young_collection_latency_lasting_duration_major == null}None%{else}'${var.jvm_gc_young_collection_latency_lasting_duration_major}'%{endif}, at_least=${var.jvm_gc_young_collection_latency_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.jvm_gc_young_collection_latency_threshold_major}" + description = "is too high >= ${var.jvm_gc_young_collection_latency_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.jvm_gc_young_collection_latency_disabled_major, var.jvm_gc_young_collection_latency_disabled, var.detectors_disabled) @@ -541,12 +515,12 @@ resource "signalfx_detector" "indexing_latency" { A = data('elasticsearch.indices.indexing.index-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.indexing_latency_aggregation_function}${var.indexing_latency_transformation_function} B = data('elasticsearch.indices.indexing.index-total', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.indexing_latency_aggregation_function}${var.indexing_latency_transformation_function} signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.indexing_latency_threshold_major}, lasting=%{if var.indexing_latency_lasting_duration_major == 
null}None%{else}'${var.indexing_latency_lasting_duration_major}'%{endif}, at_least=${var.indexing_latency_at_least_percentage_major})).publish('MAJOR') - detect(when(signal > ${var.indexing_latency_threshold_minor}, lasting=%{if var.indexing_latency_lasting_duration_minor == null}None%{else}'${var.indexing_latency_lasting_duration_minor}'%{endif}, at_least=${var.indexing_latency_at_least_percentage_minor}) and (not when(signal > ${var.indexing_latency_threshold_major}, lasting=%{if var.indexing_latency_lasting_duration_major == null}None%{else}'${var.indexing_latency_lasting_duration_major}'%{endif}, at_least=${var.indexing_latency_at_least_percentage_major}))).publish('MINOR') + detect(when(signal >= ${var.indexing_latency_threshold_major}, lasting=%{if var.indexing_latency_lasting_duration_major == null}None%{else}'${var.indexing_latency_lasting_duration_major}'%{endif}, at_least=${var.indexing_latency_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.indexing_latency_threshold_minor}, lasting=%{if var.indexing_latency_lasting_duration_minor == null}None%{else}'${var.indexing_latency_lasting_duration_minor}'%{endif}, at_least=${var.indexing_latency_at_least_percentage_minor}) and (not when(signal >= ${var.indexing_latency_threshold_major}, lasting=%{if var.indexing_latency_lasting_duration_major == null}None%{else}'${var.indexing_latency_lasting_duration_major}'%{endif}, at_least=${var.indexing_latency_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.indexing_latency_threshold_major}" + description = "is too high >= ${var.indexing_latency_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.indexing_latency_disabled_major, var.indexing_latency_disabled, var.detectors_disabled) @@ -584,12 +558,12 @@ resource "signalfx_detector" "flush_latency" { A = data('elasticsearch.indices.flush.total-time', filter=base_filtering and ${module.filtering.signalflow}, 
rollup='delta', extrapolation='zero')${var.flush_latency_aggregation_function}${var.flush_latency_transformation_function} B = data('elasticsearch.indices.flush.total', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.flush_latency_aggregation_function}${var.flush_latency_transformation_function} signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.flush_latency_threshold_major}, lasting=%{if var.flush_latency_lasting_duration_major == null}None%{else}'${var.flush_latency_lasting_duration_major}'%{endif}, at_least=${var.flush_latency_at_least_percentage_major})).publish('MAJOR') - detect(when(signal > ${var.flush_latency_threshold_minor}, lasting=%{if var.flush_latency_lasting_duration_minor == null}None%{else}'${var.flush_latency_lasting_duration_minor}'%{endif}, at_least=${var.flush_latency_at_least_percentage_minor}) and (not when(signal > ${var.flush_latency_threshold_major}, lasting=%{if var.flush_latency_lasting_duration_major == null}None%{else}'${var.flush_latency_lasting_duration_major}'%{endif}, at_least=${var.flush_latency_at_least_percentage_major}))).publish('MINOR') + detect(when(signal >= ${var.flush_latency_threshold_major}, lasting=%{if var.flush_latency_lasting_duration_major == null}None%{else}'${var.flush_latency_lasting_duration_major}'%{endif}, at_least=${var.flush_latency_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.flush_latency_threshold_minor}, lasting=%{if var.flush_latency_lasting_duration_minor == null}None%{else}'${var.flush_latency_lasting_duration_minor}'%{endif}, at_least=${var.flush_latency_at_least_percentage_minor}) and (not when(signal >= ${var.flush_latency_threshold_major}, lasting=%{if var.flush_latency_lasting_duration_major == null}None%{else}'${var.flush_latency_lasting_duration_major}'%{endif}, at_least=${var.flush_latency_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > 
${var.flush_latency_threshold_major}" + description = "is too high >= ${var.flush_latency_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.flush_latency_disabled_major, var.flush_latency_disabled, var.detectors_disabled) @@ -627,12 +601,12 @@ resource "signalfx_detector" "search_latency" { A = data('elasticsearch.indices.search.query-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.search_latency_aggregation_function}${var.search_latency_transformation_function} B = data('elasticsearch.indices.search.query-total', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.search_latency_aggregation_function}${var.search_latency_transformation_function} signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.search_latency_threshold_major}, lasting=%{if var.search_latency_lasting_duration_major == null}None%{else}'${var.search_latency_lasting_duration_major}'%{endif}, at_least=${var.search_latency_at_least_percentage_major})).publish('MAJOR') - detect(when(signal > ${var.search_latency_threshold_minor}, lasting=%{if var.search_latency_lasting_duration_minor == null}None%{else}'${var.search_latency_lasting_duration_minor}'%{endif}, at_least=${var.search_latency_at_least_percentage_minor}) and (not when(signal > ${var.search_latency_threshold_major}, lasting=%{if var.search_latency_lasting_duration_major == null}None%{else}'${var.search_latency_lasting_duration_major}'%{endif}, at_least=${var.search_latency_at_least_percentage_major}))).publish('MINOR') + detect(when(signal >= ${var.search_latency_threshold_major}, lasting=%{if var.search_latency_lasting_duration_major == null}None%{else}'${var.search_latency_lasting_duration_major}'%{endif}, at_least=${var.search_latency_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.search_latency_threshold_minor}, lasting=%{if 
var.search_latency_lasting_duration_minor == null}None%{else}'${var.search_latency_lasting_duration_minor}'%{endif}, at_least=${var.search_latency_at_least_percentage_minor}) and (not when(signal >= ${var.search_latency_threshold_major}, lasting=%{if var.search_latency_lasting_duration_major == null}None%{else}'${var.search_latency_lasting_duration_major}'%{endif}, at_least=${var.search_latency_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.search_latency_threshold_major}" + description = "is too high >= ${var.search_latency_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.search_latency_disabled_major, var.search_latency_disabled, var.detectors_disabled) @@ -670,12 +644,12 @@ resource "signalfx_detector" "fetch_latency" { A = data('elasticsearch.indices.search.fetch-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.fetch_latency_aggregation_function}${var.fetch_latency_transformation_function} B = data('elasticsearch.indices.search.fetch-total', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.fetch_latency_aggregation_function}${var.fetch_latency_transformation_function} signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.fetch_latency_threshold_major}, lasting=%{if var.fetch_latency_lasting_duration_major == null}None%{else}'${var.fetch_latency_lasting_duration_major}'%{endif}, at_least=${var.fetch_latency_at_least_percentage_major})).publish('MAJOR') - detect(when(signal > ${var.fetch_latency_threshold_minor}, lasting=%{if var.fetch_latency_lasting_duration_minor == null}None%{else}'${var.fetch_latency_lasting_duration_minor}'%{endif}, at_least=${var.fetch_latency_at_least_percentage_minor}) and (not when(signal > ${var.fetch_latency_threshold_major}, lasting=%{if var.fetch_latency_lasting_duration_major == 
null}None%{else}'${var.fetch_latency_lasting_duration_major}'%{endif}, at_least=${var.fetch_latency_at_least_percentage_major}))).publish('MINOR') + detect(when(signal >= ${var.fetch_latency_threshold_major}, lasting=%{if var.fetch_latency_lasting_duration_major == null}None%{else}'${var.fetch_latency_lasting_duration_major}'%{endif}, at_least=${var.fetch_latency_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.fetch_latency_threshold_minor}, lasting=%{if var.fetch_latency_lasting_duration_minor == null}None%{else}'${var.fetch_latency_lasting_duration_minor}'%{endif}, at_least=${var.fetch_latency_at_least_percentage_minor}) and (not when(signal >= ${var.fetch_latency_threshold_major}, lasting=%{if var.fetch_latency_lasting_duration_major == null}None%{else}'${var.fetch_latency_lasting_duration_major}'%{endif}, at_least=${var.fetch_latency_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.fetch_latency_threshold_major}" + description = "is too high >= ${var.fetch_latency_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.fetch_latency_disabled_major, var.fetch_latency_disabled, var.detectors_disabled) @@ -712,12 +686,12 @@ resource "signalfx_detector" "field_data_evictions_change" { base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') A = data('elasticsearch.indices.fielddata.evictions', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.field_data_evictions_change_aggregation_function}${var.field_data_evictions_change_transformation_function} signal = A.rateofchange().publish('signal') - detect(when(signal > ${var.field_data_evictions_change_threshold_major}, lasting=%{if var.field_data_evictions_change_lasting_duration_major == null}None%{else}'${var.field_data_evictions_change_lasting_duration_major}'%{endif}, 
at_least=${var.field_data_evictions_change_at_least_percentage_major})).publish('MAJOR') - detect(when(signal > ${var.field_data_evictions_change_threshold_minor}, lasting=%{if var.field_data_evictions_change_lasting_duration_minor == null}None%{else}'${var.field_data_evictions_change_lasting_duration_minor}'%{endif}, at_least=${var.field_data_evictions_change_at_least_percentage_minor}) and (not when(signal > ${var.field_data_evictions_change_threshold_major}, lasting=%{if var.field_data_evictions_change_lasting_duration_major == null}None%{else}'${var.field_data_evictions_change_lasting_duration_major}'%{endif}, at_least=${var.field_data_evictions_change_at_least_percentage_major}))).publish('MINOR') + detect(when(signal >= ${var.field_data_evictions_change_threshold_major}, lasting=%{if var.field_data_evictions_change_lasting_duration_major == null}None%{else}'${var.field_data_evictions_change_lasting_duration_major}'%{endif}, at_least=${var.field_data_evictions_change_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.field_data_evictions_change_threshold_minor}, lasting=%{if var.field_data_evictions_change_lasting_duration_minor == null}None%{else}'${var.field_data_evictions_change_lasting_duration_minor}'%{endif}, at_least=${var.field_data_evictions_change_at_least_percentage_minor}) and (not when(signal >= ${var.field_data_evictions_change_threshold_major}, lasting=%{if var.field_data_evictions_change_lasting_duration_major == null}None%{else}'${var.field_data_evictions_change_lasting_duration_major}'%{endif}, at_least=${var.field_data_evictions_change_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.field_data_evictions_change_threshold_major}" + description = "is too high >= ${var.field_data_evictions_change_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.field_data_evictions_change_disabled_major, var.field_data_evictions_change_disabled, 
var.detectors_disabled) @@ -754,12 +728,12 @@ resource "signalfx_detector" "task_time_in_queue_change" { base_filtering = filter('plugin', 'elasticsearch') A = data('elasticsearch.cluster.task-max-wait-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.task_time_in_queue_change_aggregation_function}${var.task_time_in_queue_change_transformation_function} signal = A.rateofchange().publish('signal') - detect(when(signal > ${var.task_time_in_queue_change_threshold_major}, lasting=%{if var.task_time_in_queue_change_lasting_duration_major == null}None%{else}'${var.task_time_in_queue_change_lasting_duration_major}'%{endif}, at_least=${var.task_time_in_queue_change_at_least_percentage_major})).publish('MAJOR') - detect(when(signal > ${var.task_time_in_queue_change_threshold_minor}, lasting=%{if var.task_time_in_queue_change_lasting_duration_minor == null}None%{else}'${var.task_time_in_queue_change_lasting_duration_minor}'%{endif}, at_least=${var.task_time_in_queue_change_at_least_percentage_minor}) and (not when(signal > ${var.task_time_in_queue_change_threshold_major}, lasting=%{if var.task_time_in_queue_change_lasting_duration_major == null}None%{else}'${var.task_time_in_queue_change_lasting_duration_major}'%{endif}, at_least=${var.task_time_in_queue_change_at_least_percentage_major}))).publish('MINOR') + detect(when(signal >= ${var.task_time_in_queue_change_threshold_major}, lasting=%{if var.task_time_in_queue_change_lasting_duration_major == null}None%{else}'${var.task_time_in_queue_change_lasting_duration_major}'%{endif}, at_least=${var.task_time_in_queue_change_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.task_time_in_queue_change_threshold_minor}, lasting=%{if var.task_time_in_queue_change_lasting_duration_minor == null}None%{else}'${var.task_time_in_queue_change_lasting_duration_minor}'%{endif}, at_least=${var.task_time_in_queue_change_at_least_percentage_minor}) and (not when(signal >= 
${var.task_time_in_queue_change_threshold_major}, lasting=%{if var.task_time_in_queue_change_lasting_duration_major == null}None%{else}'${var.task_time_in_queue_change_lasting_duration_major}'%{endif}, at_least=${var.task_time_in_queue_change_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.task_time_in_queue_change_threshold_major}" + description = "is too high >= ${var.task_time_in_queue_change_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.task_time_in_queue_change_disabled_major, var.task_time_in_queue_change_disabled, var.detectors_disabled) diff --git a/modules/smart-agent_elasticsearch/variables-gen.tf b/modules/smart-agent_elasticsearch/variables-gen.tf index 391d2bc31..7d1383245 100644 --- a/modules/smart-agent_elasticsearch/variables-gen.tf +++ b/modules/smart-agent_elasticsearch/variables-gen.tf @@ -53,13 +53,13 @@ variable "cluster_status_notifications" { variable "cluster_status_aggregation_function" { description = "Aggregation function and group by for cluster_status detector (i.e. \".mean(by=['host'])\")" type = string - default = "" + default = ".max(by=['cluster'])" } variable "cluster_status_transformation_function" { description = "Transformation function for cluster_status detector (i.e. \".mean(over='5m')\")" type = string - default = ".mean(over='5m')" + default = "" } variable "cluster_status_max_delay" { @@ -143,13 +143,13 @@ variable "cluster_initializing_shards_notifications" { variable "cluster_initializing_shards_aggregation_function" { description = "Aggregation function and group by for cluster_initializing_shards detector (i.e. \".mean(by=['host'])\")" type = string - default = "" + default = ".max(by=['cluster'])" } variable "cluster_initializing_shards_transformation_function" { description = "Transformation function for cluster_initializing_shards detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".min(over='15m')" + default = "" } variable "cluster_initializing_shards_max_delay" { @@ -233,13 +233,13 @@ variable "cluster_relocating_shards_notifications" { variable "cluster_relocating_shards_aggregation_function" { description = "Aggregation function and group by for cluster_relocating_shards detector (i.e. \".mean(by=['host'])\")" type = string - default = "" + default = ".max(by=['cluster'])" } variable "cluster_relocating_shards_transformation_function" { description = "Transformation function for cluster_relocating_shards detector (i.e. \".mean(over='5m')\")" type = string - default = ".min(over='15m')" + default = "" } variable "cluster_relocating_shards_max_delay" { @@ -266,18 +266,6 @@ variable "cluster_relocating_shards_disabled" { default = null } -variable "cluster_relocating_shards_disabled_critical" { - description = "Disable critical alerting rule for cluster_relocating_shards detector" - type = bool - default = null -} - -variable "cluster_relocating_shards_disabled_major" { - description = "Disable major alerting rule for cluster_relocating_shards detector" - type = bool - default = true -} - variable "cluster_relocating_shards_threshold_critical" { description = "Critical threshold for cluster_relocating_shards detector" type = number @@ -295,23 +283,6 @@ variable "cluster_relocating_shards_at_least_percentage_critical" { type = number default = 1 } -variable "cluster_relocating_shards_threshold_major" { - description = "Major threshold for cluster_relocating_shards detector" - type = number - default = -1 -} - -variable "cluster_relocating_shards_lasting_duration_major" { - description = "Minimum duration that conditions must be true before raising alert" - type = string - default = "15m" -} - -variable "cluster_relocating_shards_at_least_percentage_major" { - description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" - type = number - default = 
1 -} # cluster_unassigned_shards detector variable "cluster_unassigned_shards_notifications" { @@ -323,13 +294,13 @@ variable "cluster_unassigned_shards_notifications" { variable "cluster_unassigned_shards_aggregation_function" { description = "Aggregation function and group by for cluster_unassigned_shards detector (i.e. \".mean(by=['host'])\")" type = string - default = "" + default = ".max(by=['cluster'])" } variable "cluster_unassigned_shards_transformation_function" { description = "Transformation function for cluster_unassigned_shards detector (i.e. \".mean(over='5m')\")" type = string - default = ".min(over='10m')" + default = "" } variable "cluster_unassigned_shards_max_delay" { @@ -356,18 +327,6 @@ variable "cluster_unassigned_shards_disabled" { default = null } -variable "cluster_unassigned_shards_disabled_critical" { - description = "Disable critical alerting rule for cluster_unassigned_shards detector" - type = bool - default = null -} - -variable "cluster_unassigned_shards_disabled_major" { - description = "Disable major alerting rule for cluster_unassigned_shards detector" - type = bool - default = true -} - variable "cluster_unassigned_shards_threshold_critical" { description = "Critical threshold for cluster_unassigned_shards detector" type = number @@ -385,23 +344,6 @@ variable "cluster_unassigned_shards_at_least_percentage_critical" { type = number default = 1 } -variable "cluster_unassigned_shards_threshold_major" { - description = "Major threshold for cluster_unassigned_shards detector" - type = number - default = -1 -} - -variable "cluster_unassigned_shards_lasting_duration_major" { - description = "Minimum duration that conditions must be true before raising alert" - type = string - default = "10m" -} - -variable "cluster_unassigned_shards_at_least_percentage_major" { - description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" - type = number - default = 1 -} # pending_tasks detector variable 
"pending_tasks_notifications" { @@ -413,13 +355,13 @@ variable "pending_tasks_notifications" { variable "pending_tasks_aggregation_function" { description = "Aggregation function and group by for pending_tasks detector (i.e. \".mean(by=['host'])\")" type = string - default = "" + default = ".max(by=['cluster'])" } variable "pending_tasks_transformation_function" { description = "Transformation function for pending_tasks detector (i.e. \".mean(over='5m')\")" type = string - default = ".min(over='15m')" + default = "" } variable "pending_tasks_max_delay" { @@ -503,13 +445,13 @@ variable "cpu_usage_notifications" { variable "cpu_usage_aggregation_function" { description = "Aggregation function and group by for cpu_usage detector (i.e. \".mean(by=['host'])\")" type = string - default = "" + default = ".max(by=['cluster'])" } variable "cpu_usage_transformation_function" { description = "Transformation function for cpu_usage detector (i.e. \".mean(over='5m')\")" type = string - default = ".min(over='30m')" + default = "" } variable "cpu_usage_max_delay" { @@ -593,13 +535,13 @@ variable "file_descriptors_notifications" { variable "file_descriptors_aggregation_function" { description = "Aggregation function and group by for file_descriptors detector (i.e. \".mean(by=['host'])\")" type = string - default = "" + default = ".max(by=['cluster'])" } variable "file_descriptors_transformation_function" { description = "Transformation function for file_descriptors detector (i.e. \".mean(over='5m')\")" type = string - default = ".max(over='15m')" + default = "" } variable "file_descriptors_max_delay" { @@ -683,13 +625,13 @@ variable "jvm_heap_memory_usage_notifications" { variable "jvm_heap_memory_usage_aggregation_function" { description = "Aggregation function and group by for jvm_heap_memory_usage detector (i.e. 
\".mean(by=['host'])\")" type = string - default = "" + default = ".max(by=['cluster'])" } variable "jvm_heap_memory_usage_transformation_function" { description = "Transformation function for jvm_heap_memory_usage detector (i.e. \".mean(over='5m')\")" type = string - default = ".mean(over='5m')" + default = "" } variable "jvm_heap_memory_usage_max_delay" { @@ -773,13 +715,13 @@ variable "jvm_memory_young_usage_notifications" { variable "jvm_memory_young_usage_aggregation_function" { description = "Aggregation function and group by for jvm_memory_young_usage detector (i.e. \".mean(by=['host'])\")" type = string - default = "" + default = ".max(by=['cluster'])" } variable "jvm_memory_young_usage_transformation_function" { description = "Transformation function for jvm_memory_young_usage detector (i.e. \".mean(over='5m')\")" type = string - default = ".mean(over='10m')" + default = "" } variable "jvm_memory_young_usage_max_delay" { @@ -863,13 +805,13 @@ variable "jvm_memory_old_usage_notifications" { variable "jvm_memory_old_usage_aggregation_function" { description = "Aggregation function and group by for jvm_memory_old_usage detector (i.e. \".mean(by=['host'])\")" type = string - default = "" + default = ".max(by=['cluster'])" } variable "jvm_memory_old_usage_transformation_function" { description = "Transformation function for jvm_memory_old_usage detector (i.e. \".mean(over='5m')\")" type = string - default = ".mean(over='10m')" + default = "" } variable "jvm_memory_old_usage_max_delay" { @@ -953,13 +895,13 @@ variable "jvm_gc_old_collection_latency_notifications" { variable "jvm_gc_old_collection_latency_aggregation_function" { description = "Aggregation function and group by for jvm_gc_old_collection_latency detector (i.e. 
\".mean(by=['host'])\")" type = string - default = "" + default = ".max(by=['cluster'])" } variable "jvm_gc_old_collection_latency_transformation_function" { description = "Transformation function for jvm_gc_old_collection_latency detector (i.e. \".mean(over='5m')\")" type = string - default = ".mean(over='15m')" + default = "" } variable "jvm_gc_old_collection_latency_max_delay" { @@ -1043,13 +985,13 @@ variable "jvm_gc_young_collection_latency_notifications" { variable "jvm_gc_young_collection_latency_aggregation_function" { description = "Aggregation function and group by for jvm_gc_young_collection_latency detector (i.e. \".mean(by=['host'])\")" type = string - default = "" + default = ".max(by=['cluster'])" } variable "jvm_gc_young_collection_latency_transformation_function" { description = "Transformation function for jvm_gc_young_collection_latency detector (i.e. \".mean(over='5m')\")" type = string - default = ".mean(over='15m')" + default = "" } variable "jvm_gc_young_collection_latency_max_delay" { @@ -1133,13 +1075,13 @@ variable "indexing_latency_notifications" { variable "indexing_latency_aggregation_function" { description = "Aggregation function and group by for indexing_latency detector (i.e. \".mean(by=['host'])\")" type = string - default = "" + default = ".max(by=['cluster'])" } variable "indexing_latency_transformation_function" { description = "Transformation function for indexing_latency detector (i.e. \".mean(over='5m')\")" type = string - default = ".mean(over='15m')" + default = "" } variable "indexing_latency_max_delay" { @@ -1223,13 +1165,13 @@ variable "flush_latency_notifications" { variable "flush_latency_aggregation_function" { description = "Aggregation function and group by for flush_latency detector (i.e. \".mean(by=['host'])\")" type = string - default = "" + default = ".max(by=['cluster'])" } variable "flush_latency_transformation_function" { description = "Transformation function for flush_latency detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".mean(over='30m')" + default = "" } variable "flush_latency_max_delay" { @@ -1313,13 +1255,13 @@ variable "search_latency_notifications" { variable "search_latency_aggregation_function" { description = "Aggregation function and group by for search_latency detector (i.e. \".mean(by=['host'])\")" type = string - default = "" + default = ".max(by=['cluster'])" } variable "search_latency_transformation_function" { description = "Transformation function for search_latency detector (i.e. \".mean(over='5m')\")" type = string - default = ".min(over='30m')" + default = "" } variable "search_latency_max_delay" { @@ -1403,13 +1345,13 @@ variable "fetch_latency_notifications" { variable "fetch_latency_aggregation_function" { description = "Aggregation function and group by for fetch_latency detector (i.e. \".mean(by=['host'])\")" type = string - default = "" + default = ".max(by=['cluster'])" } variable "fetch_latency_transformation_function" { description = "Transformation function for fetch_latency detector (i.e. \".mean(over='5m')\")" type = string - default = ".min(over='15m')" + default = "" } variable "fetch_latency_max_delay" { @@ -1493,13 +1435,13 @@ variable "field_data_evictions_change_notifications" { variable "field_data_evictions_change_aggregation_function" { description = "Aggregation function and group by for field_data_evictions_change detector (i.e. \".mean(by=['host'])\")" type = string - default = "" + default = ".max(by=['cluster'])" } variable "field_data_evictions_change_transformation_function" { description = "Transformation function for field_data_evictions_change detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".mean(over='15m')" + default = "" } variable "field_data_evictions_change_max_delay" { @@ -1583,13 +1525,13 @@ variable "task_time_in_queue_change_notifications" { variable "task_time_in_queue_change_aggregation_function" { description = "Aggregation function and group by for task_time_in_queue_change detector (i.e. \".mean(by=['host'])\")" type = string - default = "" + default = ".max(by=['cluster'])" } variable "task_time_in_queue_change_transformation_function" { description = "Transformation function for task_time_in_queue_change detector (i.e. \".mean(over='5m')\")" type = string - default = ".mean(over='15m')" + default = "" } variable "task_time_in_queue_change_max_delay" {