diff --git a/docs/severity.md b/docs/severity.md index 30be3a0cd..d73143208 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -975,25 +975,25 @@ |Detector|Critical|Major|Minor|Warning|Info| |---|---|---|---|---|---| -|ElasticSearch heartbeat|X|-|-|-|-| +|Elasticsearch heartbeat|X|-|-|-|-| |ElasticSearch cluster status|X|X|-|-|-| |ElasticSearch cluster initializing shards|X|X|-|-|-| -|ElasticSearch cluster relocating shards|X|X|-|-|-| -|ElasticSearch Cluster unassigned shards|X|X|-|-|-| -|ElasticSearch Pending tasks|X|X|-|-|-| -|Elasticsearch CPU usage|X|X|-|-|-| -|Elasticsearch file descriptors usage|X|X|-|-|-| -|Elasticsearch JVM heap memory usage|X|X|-|-|-| -|Elasticsearch JVM memory young usage|-|X|X|-|-| -|Elasticsearch JVM memory old usage|-|X|X|-|-| -|Elasticsearch old-generation garbage collections latency|-|X|X|-|-| -|Elasticsearch young-generation garbage collections latency|-|X|X|-|-| -|Elasticsearch indexing latency|-|X|X|-|-| -|Elasticsearch index flushing to disk latency|-|X|X|-|-| -|Elasticsearch search query latency|-|X|X|-|-| -|Elasticsearch search fetch latency|-|X|X|-|-| -|Elasticsearch fielddata cache evictions rate of change|-|X|X|-|-| -|Elasticsearch max time spent by task in queue rate of change|-|X|X|-|-| +|ElasticSearch cluster relocating shards|X|-|-|-|-| +|ElasticSearch cluster unassigned shards|X|-|-|-|-| +|ElasticSearch pending tasks|X|X|-|-|-| +|ElasticSearch cpu usage|X|X|-|-|-| +|ElasticSearch file descriptors|X|X|-|-|-| +|ElasticSearch jvm heap memory usage|X|X|-|-|-| +|ElasticSearch jvm memory young usage|-|X|X|-|-| +|ElasticSearch jvm memory old usage|-|X|X|-|-| +|ElasticSearch jvm gc old collection latency|-|X|X|-|-| +|ElasticSearch jvm gc young collection latency|-|X|X|-|-| +|ElasticSearch indexing latency|-|X|X|-|-| +|ElasticSearch flush latency|-|X|X|-|-| +|ElasticSearch search latency|-|X|X|-|-| +|ElasticSearch fetch latency|-|X|X|-|-| +|ElasticSearch field_data evictions change|-|X|X|-|-| +|ElasticSearch task time in 
queue change|-|X|X|-|-| ## smart-agent_genericjmx diff --git a/modules/smart-agent_elasticsearch/README.md b/modules/smart-agent_elasticsearch/README.md index 89fe7ef26..b639402b1 100644 --- a/modules/smart-agent_elasticsearch/README.md +++ b/modules/smart-agent_elasticsearch/README.md @@ -59,7 +59,7 @@ Note the following parameters: These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all [modules](../) in this repository. Other variables, specific to this module, are available in -[variables.tf](variables.tf). +[variables-gen.tf](variables-gen.tf). In general, the default configuration "works" but all of these Terraform [variables](https://www.terraform.io/language/values/variables) make it possible to customize the detectors behavior to better fit your needs. @@ -77,25 +77,25 @@ This module creates the following SignalFx detectors which could contain one or |Detector|Critical|Major|Minor|Warning|Info| |---|---|---|---|---|---| -|ElasticSearch heartbeat|X|-|-|-|-| +|Elasticsearch heartbeat|X|-|-|-|-| |ElasticSearch cluster status|X|X|-|-|-| |ElasticSearch cluster initializing shards|X|X|-|-|-| -|ElasticSearch cluster relocating shards|X|X|-|-|-| -|ElasticSearch Cluster unassigned shards|X|X|-|-|-| -|ElasticSearch Pending tasks|X|X|-|-|-| -|Elasticsearch CPU usage|X|X|-|-|-| -|Elasticsearch file descriptors usage|X|X|-|-|-| -|Elasticsearch JVM heap memory usage|X|X|-|-|-| -|Elasticsearch JVM memory young usage|-|X|X|-|-| -|Elasticsearch JVM memory old usage|-|X|X|-|-| -|Elasticsearch old-generation garbage collections latency|-|X|X|-|-| -|Elasticsearch young-generation garbage collections latency|-|X|X|-|-| -|Elasticsearch indexing latency|-|X|X|-|-| -|Elasticsearch index flushing to disk latency|-|X|X|-|-| -|Elasticsearch search query latency|-|X|X|-|-| -|Elasticsearch search fetch latency|-|X|X|-|-| -|Elasticsearch fielddata cache evictions rate of change|-|X|X|-|-| -|Elasticsearch max time spent by task 
in queue rate of change|-|X|X|-|-|
+|ElasticSearch cluster relocating shards|X|-|-|-|-|
+|ElasticSearch cluster unassigned shards|X|-|-|-|-|
+|ElasticSearch pending tasks|X|X|-|-|-|
+|ElasticSearch cpu usage|X|X|-|-|-|
+|ElasticSearch file descriptors|X|X|-|-|-|
+|ElasticSearch jvm heap memory usage|X|X|-|-|-|
+|ElasticSearch jvm memory young usage|-|X|X|-|-|
+|ElasticSearch jvm memory old usage|-|X|X|-|-|
+|ElasticSearch jvm gc old collection latency|-|X|X|-|-|
+|ElasticSearch jvm gc young collection latency|-|X|X|-|-|
+|ElasticSearch indexing latency|-|X|X|-|-|
+|ElasticSearch flush latency|-|X|X|-|-|
+|ElasticSearch search latency|-|X|X|-|-|
+|ElasticSearch fetch latency|-|X|X|-|-|
+|ElasticSearch field_data evictions change|-|X|X|-|-|
+|ElasticSearch task time in queue change|-|X|X|-|-|
 
 ## How to collect required metrics?
 
diff --git a/modules/smart-agent_elasticsearch/conf/00-heartbeat.yaml b/modules/smart-agent_elasticsearch/conf/00-heartbeat.yaml
new file mode 100644
index 000000000..b16dfc27f
--- /dev/null
+++ b/modules/smart-agent_elasticsearch/conf/00-heartbeat.yaml
@@ -0,0 +1,13 @@
+module: Elasticsearch
+name: heartbeat
+
+transformation: false
+aggregation: ".mean(by=['cluster'])"
+exclude_not_running_vm: true
+filtering: "filter('plugin', 'elasticsearch')"
+
+signals:
+  signal:
+    metric: elasticsearch.cluster.number-of-nodes
+rules:
+  critical:
\ No newline at end of file
diff --git a/modules/smart-agent_elasticsearch/conf/01-cluster-status.yaml b/modules/smart-agent_elasticsearch/conf/01-cluster-status.yaml
new file mode 100644
index 000000000..25976153d
--- /dev/null
+++ b/modules/smart-agent_elasticsearch/conf/01-cluster-status.yaml
@@ -0,0 +1,19 @@
+module: ElasticSearch
+name: "cluster status"
+aggregation: ".max(by=['cluster'])"
+filtering: "filter('plugin', 'elasticsearch')"
+signals:
+  signal:
+    metric: "elasticsearch.cluster.status"
+rules:
+  critical:
+    threshold: 2
+    comparator: ">="
+    description: "is red"
+    lasting_duration: '5m'
+  major:
+    threshold: 1
+    comparator: ">="
+    dependency: critical  # yellow only: a red cluster (>= 2) is reported by the critical rule, not by both
+    description: "is yellow"
+    lasting_duration: '5m'
diff --git a/modules/smart-agent_elasticsearch/conf/02-cluster_initializing_shards.yaml b/modules/smart-agent_elasticsearch/conf/02-cluster_initializing_shards.yaml
new file mode 100644
index 000000000..2c6bce1fc
--- /dev/null
+++ b/modules/smart-agent_elasticsearch/conf/02-cluster_initializing_shards.yaml
@@ -0,0 +1,20 @@
+module: ElasticSearch
+name: "cluster initializing shards"
+aggregation: ".max(by=['cluster'])"
+filtering: "filter('plugin', 'elasticsearch')"
+signals:
+  signal:
+    metric: "elasticsearch.cluster.initializing-shards"
+    rollup: average
+rules:
+  critical:
+    threshold: 1
+    comparator: ">="
+    description: "is too high"
+    lasting_duration: '15m'
+  major:
+    threshold: 0
+    comparator: ">"
+    dependency: critical
+    description: "is too high"
+    lasting_duration: '15m'
diff --git a/modules/smart-agent_elasticsearch/conf/03-cluster_relocating_shards.yaml b/modules/smart-agent_elasticsearch/conf/03-cluster_relocating_shards.yaml
new file mode 100644
index 000000000..262398179
--- /dev/null
+++ b/modules/smart-agent_elasticsearch/conf/03-cluster_relocating_shards.yaml
@@ -0,0 +1,14 @@
+module: ElasticSearch
+name: "cluster relocating shards"
+aggregation: ".max(by=['cluster'])"
+filtering: "filter('plugin', 'elasticsearch')"
+signals:
+  signal:
+    metric: "elasticsearch.cluster.relocating-shards"
+    rollup: average
+rules:
+  critical:
+    threshold: 0
+    comparator: ">"
+    description: "is too high"
+    lasting_duration: '15m'
\ No newline at end of file
diff --git a/modules/smart-agent_elasticsearch/conf/04-cluster_unassigned_shards.yaml b/modules/smart-agent_elasticsearch/conf/04-cluster_unassigned_shards.yaml
new file mode 100644
index 000000000..ff045141d
--- /dev/null
+++ b/modules/smart-agent_elasticsearch/conf/04-cluster_unassigned_shards.yaml
@@ -0,0 +1,14 @@
+module: ElasticSearch
+name: "cluster unassigned shards"
+aggregation: ".max(by=['cluster'])"
+filtering:
"filter('plugin', 'elasticsearch')" +signals: + signal: + metric: "elasticsearch.cluster.unassigned-shards" + rollup: average +rules: + critical: + threshold: 0 + comparator: ">" + description: "is too high" + lasting_duration: '10m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/05-cluster_pending_tasks.yaml b/modules/smart-agent_elasticsearch/conf/05-cluster_pending_tasks.yaml new file mode 100644 index 000000000..63aab518b --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/05-cluster_pending_tasks.yaml @@ -0,0 +1,20 @@ +module: ElasticSearch +name: "pending tasks" +aggregation: ".max(by=['cluster'])" +filtering: "filter('plugin', 'elasticsearch')" +signals: + signal: + metric: "elasticsearch.cluster.pending-tasks" + rollup: average +rules: + critical: + threshold: 5 + comparator: ">=" + description: "are too high" + lasting_duration: '15m' + major: + threshold: 0 + comparator: ">" + dependency: critical + description: "are too high" + lasting_duration: '15m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/06-cluster_cpu_usage.yaml b/modules/smart-agent_elasticsearch/conf/06-cluster_cpu_usage.yaml new file mode 100644 index 000000000..26f29a1a2 --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/06-cluster_cpu_usage.yaml @@ -0,0 +1,20 @@ +module: ElasticSearch +name: "cpu usage" +aggregation: ".max(by=['cluster'])" +filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" +signals: + signal: + metric: "elasticsearch.process.cpu.percent" + rollup: average +rules: + critical: + threshold: 95 + comparator: ">=" + description: "is too high" + lasting_duration: '30m' + major: + threshold: 85 + comparator: ">" + dependency: critical + description: "is too high" + lasting_duration: '30m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/07-cluster_file_descriptor.yaml b/modules/smart-agent_elasticsearch/conf/07-cluster_file_descriptor.yaml new 
file mode 100644 index 000000000..5b55150b4 --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/07-cluster_file_descriptor.yaml @@ -0,0 +1,25 @@ +module: ElasticSearch +name: "file descriptors" +aggregation: ".max(by=['cluster'])" +filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" +signals: + A: + metric: "elasticsearch.process.open_file_descriptors" + rollup: average + B: + metric: "elasticsearch.process.max_file_descriptors" + rollup: average + signal: + formula: "(A/B).scale(100)" +rules: + critical: + threshold: 95 + comparator: ">=" + description: "is too high" + lasting_duration: '15m' + major: + threshold: 90 + comparator: ">" + dependency: critical + description: "is too high" + lasting_duration: '15m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/08-cluster_JVM_heap_memory.yaml b/modules/smart-agent_elasticsearch/conf/08-cluster_JVM_heap_memory.yaml new file mode 100644 index 000000000..fb589bbef --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/08-cluster_JVM_heap_memory.yaml @@ -0,0 +1,20 @@ +module: ElasticSearch +name: "JVM heap memory usage" +aggregation: ".max(by=['cluster'])" +filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" +signals: + signal: + metric: "elasticsearch.jvm.mem.heap-used-percent" + rollup: average +rules: + critical: + threshold: 90 + comparator: ">=" + description: "is too high" + lasting_duration: '5m' + major: + threshold: 80 + comparator: ">" + dependency: critical + description: "is too high" + lasting_duration: '5m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/09-cluster_JVM_memory_young_usage.yaml b/modules/smart-agent_elasticsearch/conf/09-cluster_JVM_memory_young_usage.yaml new file mode 100644 index 000000000..ee42abe48 --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/09-cluster_JVM_memory_young_usage.yaml @@ -0,0 +1,25 @@ +module: ElasticSearch +name: "JVM memory young usage" 
+aggregation: ".max(by=['cluster'])" +filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" +signals: + A: + metric: "elasticsearch.jvm.mem.pools.young.used_in_bytes" + rollup: average + B: + metric: "elasticsearch.jvm.mem.pools.young.max_in_bytes" + rollup: average + signal: + formula: "(A/B).fill(0).scale(100)" +rules: + major: + threshold: 90 + comparator: ">=" + description: "is too high" + lasting_duration: '10m' + minor: + threshold: 80 + comparator: ">" + description: "is too high" + dependency: major + lasting_duration: '10m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/10-cluster_JVM_memory_old_usage.yaml b/modules/smart-agent_elasticsearch/conf/10-cluster_JVM_memory_old_usage.yaml new file mode 100644 index 000000000..b00693737 --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/10-cluster_JVM_memory_old_usage.yaml @@ -0,0 +1,25 @@ +module: ElasticSearch +name: "JVM memory old usage" +aggregation: ".max(by=['cluster'])" +filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" +signals: + A: + metric: "elasticsearch.jvm.mem.pools.old.used_in_bytes" + rollup: average + B: + metric: "elasticsearch.jvm.mem.pools.old.max_in_bytes" + rollup: average + signal: + formula: "(A/B).fill(0).scale(100)" +rules: + major: + threshold: 90 + comparator: ">=" + description: "is too high" + lasting_duration: '10m' + minor: + threshold: 80 + comparator: ">" + description: "is too high" + dependency: major + lasting_duration: '10m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/11-cluster_old-generation_garbage_collections_latency.yaml b/modules/smart-agent_elasticsearch/conf/11-cluster_old-generation_garbage_collections_latency.yaml new file mode 100644 index 000000000..63be00353 --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/11-cluster_old-generation_garbage_collections_latency.yaml @@ -0,0 +1,27 @@ +module: ElasticSearch +name: "jvm gc old 
collection latency" +aggregation: ".max(by=['cluster'])" +filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" +signals: + A: + metric: "elasticsearch.jvm.gc.old-time" + extrapolation: zero + rollup: delta + B: + metric: "elasticsearch.jvm.gc.old-count" + extrapolation: zero + rollup: delta + signal: + formula: "(A/B).fill(0)" +rules: + major: + threshold: 300 + comparator: ">=" + description: "is too high" + lasting_duration: '15m' + minor: + threshold: 200 + comparator: ">" + description: "is too high" + dependency: major + lasting_duration: '15m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/12-cluster_young-generation_garbage_collections_latency.yaml b/modules/smart-agent_elasticsearch/conf/12-cluster_young-generation_garbage_collections_latency.yaml new file mode 100644 index 000000000..6fa6dfa99 --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/12-cluster_young-generation_garbage_collections_latency.yaml @@ -0,0 +1,27 @@ +module: ElasticSearch +name: "jvm gc young collection latency" +aggregation: ".max(by=['cluster'])" +filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" +signals: + A: + metric: "elasticsearch.jvm.gc.time" + extrapolation: zero + rollup: delta + B: + metric: "elasticsearch.jvm.gc.count" + extrapolation: zero + rollup: delta + signal: + formula: "(A/B).fill(0)" +rules: + major: + threshold: 40 + comparator: ">=" + description: "is too high" + lasting_duration: '15m' + minor: + threshold: 20 + comparator: ">" + description: "is too high" + dependency: major + lasting_duration: '15m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/13-cluster_indexing_latency.yaml b/modules/smart-agent_elasticsearch/conf/13-cluster_indexing_latency.yaml new file mode 100644 index 000000000..f881d43cb --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/13-cluster_indexing_latency.yaml @@ -0,0 +1,27 @@ +module: ElasticSearch +name: 
"indexing latency" +aggregation: ".max(by=['cluster'])" +filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" +signals: + A: + metric: "elasticsearch.indices.indexing.index-time" + extrapolation: zero + rollup: delta + B: + metric: "elasticsearch.indices.indexing.index-total" + extrapolation: zero + rollup: delta + signal: + formula: "(A/B).fill(0)" +rules: + major: + threshold: 30 + comparator: ">=" + description: "is too high" + lasting_duration: '1h' + minor: + threshold: 15 + comparator: ">" + description: "is too high" + dependency: major + lasting_duration: '1h' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/14-cluster_flush_latency.yaml b/modules/smart-agent_elasticsearch/conf/14-cluster_flush_latency.yaml new file mode 100644 index 000000000..3dc7b354b --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/14-cluster_flush_latency.yaml @@ -0,0 +1,27 @@ +module: ElasticSearch +name: "flush latency" +aggregation: ".max(by=['cluster'])" +filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" +signals: + A: + metric: "elasticsearch.indices.flush.total-time" + extrapolation: zero + rollup: delta + B: + metric: "elasticsearch.indices.flush.total" + extrapolation: zero + rollup: delta + signal: + formula: "(A/B).fill(0)" +rules: + major: + threshold: 150 + comparator: ">=" + description: "is too high" + lasting_duration: '15m' + minor: + threshold: 100 + comparator: ">" + description: "is too high" + dependency: major + lasting_duration: '15m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/15-cluster_search_latency.yaml b/modules/smart-agent_elasticsearch/conf/15-cluster_search_latency.yaml new file mode 100644 index 000000000..cb6872764 --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/15-cluster_search_latency.yaml @@ -0,0 +1,27 @@ +module: ElasticSearch +name: "search latency" +aggregation: ".max(by=['cluster'])" +filtering: 
"filter('node_name', '*') and filter('plugin', 'elasticsearch')" +signals: + A: + metric: "elasticsearch.indices.search.query-time" + extrapolation: zero + rollup: delta + B: + metric: "elasticsearch.indices.search.query-total" + extrapolation: zero + rollup: delta + signal: + formula: "(A/B).fill(0)" +rules: + major: + threshold: 20 + comparator: ">=" + description: "is too high" + lasting_duration: '30m' + minor: + threshold: 10 + comparator: ">" + description: "is too high" + dependency: major + lasting_duration: '30m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/16-cluster_fetch_latency.yaml b/modules/smart-agent_elasticsearch/conf/16-cluster_fetch_latency.yaml new file mode 100644 index 000000000..6dd286303 --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/16-cluster_fetch_latency.yaml @@ -0,0 +1,27 @@ +module: ElasticSearch +name: "fetch latency" +aggregation: ".max(by=['cluster'])" +filtering: "filter('node_name', '*') and filter('plugin', 'elasticsearch')" +signals: + A: + metric: "elasticsearch.indices.search.fetch-time" + extrapolation: zero + rollup: delta + B: + metric: "elasticsearch.indices.search.fetch-total" + extrapolation: zero + rollup: delta + signal: + formula: "(A/B).fill(0)" +rules: + major: + threshold: 20 + comparator: ">=" + description: "is too high" + lasting_duration: '15m' + minor: + threshold: 10 + comparator: ">" + description: "is too high" + dependency: major + lasting_duration: '15m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/17-cluster_fielddata_cache_evictions_rate.yaml b/modules/smart-agent_elasticsearch/conf/17-cluster_fielddata_cache_evictions_rate.yaml new file mode 100644 index 000000000..ba8bd48b7 --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/17-cluster_fielddata_cache_evictions_rate.yaml @@ -0,0 +1,23 @@ +module: ElasticSearch +name: "field_data evictions change" +aggregation: ".max(by=['cluster'])" +filtering: 
"filter('node_name', '*') and filter('plugin', 'elasticsearch')" +signals: + A: + metric: "elasticsearch.indices.fielddata.evictions" + extrapolation: zero + rollup: delta + signal: + formula: A.rateofchange() +rules: + major: + threshold: 120 + comparator: ">=" + description: "is too high" + lasting_duration: '15m' + minor: + threshold: 60 + comparator: ">" + description: "is too high" + dependency: major + lasting_duration: '15m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/conf/18-cluster_time_in_queue_change.yaml b/modules/smart-agent_elasticsearch/conf/18-cluster_time_in_queue_change.yaml new file mode 100644 index 000000000..d8701b683 --- /dev/null +++ b/modules/smart-agent_elasticsearch/conf/18-cluster_time_in_queue_change.yaml @@ -0,0 +1,22 @@ +module: ElasticSearch +name: "task time in queue change" +aggregation: ".max(by=['cluster'])" +filtering: "filter('plugin', 'elasticsearch')" +signals: + A: + metric: "elasticsearch.cluster.task-max-wait-time" + rollup: average + signal: + formula: A.rateofchange() +rules: + major: + threshold: 200 + comparator: ">=" + description: "is too high" + lasting_duration: '15m' + minor: + threshold: 100 + comparator: ">" + description: "is too high" + dependency: major + lasting_duration: '15m' \ No newline at end of file diff --git a/modules/smart-agent_elasticsearch/detectors-elasticsearch.tf b/modules/smart-agent_elasticsearch/detectors-gen.tf similarity index 60% rename from modules/smart-agent_elasticsearch/detectors-elasticsearch.tf rename to modules/smart-agent_elasticsearch/detectors-gen.tf index b62d490a3..b0fbd51ce 100644 --- a/modules/smart-agent_elasticsearch/detectors-elasticsearch.tf +++ b/modules/smart-agent_elasticsearch/detectors-gen.tf @@ -1,5 +1,5 @@ resource "signalfx_detector" "heartbeat" { - name = format("%s %s", local.detector_name_prefix, "ElasticSearch heartbeat") + name = format("%s %s", local.detector_name_prefix, "Elasticsearch heartbeat") authorized_writer_teams = 
var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) @@ -7,7 +7,8 @@ resource "signalfx_detector" "heartbeat" { program_text = <<-EOF from signalfx.detectors.not_reporting import not_reporting - signal = data('elasticsearch.cluster.number-of-nodes', filter=filter('plugin', 'elasticsearch') and ${local.not_running_vm_filters} and ${module.filtering.signalflow})${var.heartbeat_aggregation_function}.publish('signal') + base_filtering = filter('plugin', 'elasticsearch') + signal = data('elasticsearch.cluster.number-of-nodes', filter=${local.not_running_vm_filters} and base_filtering and ${module.filtering.signalflow})${var.heartbeat_aggregation_function}.publish('signal') not_reporting.detector(stream=signal, resource_identifier=None, duration='${var.heartbeat_timeframe}', auto_resolve_after='${local.heartbeat_auto_resolve_after}').publish('CRIT') EOF @@ -34,13 +35,14 @@ resource "signalfx_detector" "cluster_status" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('elasticsearch.cluster.status', filter=filter('plugin', 'elasticsearch') and ${module.filtering.signalflow})${var.cluster_status_aggregation_function}${var.cluster_status_transformation_function}.publish('signal') - detect(when(signal == 1)).publish('MAJOR') - detect(when(signal == 2)).publish('CRIT') + base_filtering = filter('plugin', 'elasticsearch') + signal = data('elasticsearch.cluster.status', filter=base_filtering and ${module.filtering.signalflow})${var.cluster_status_aggregation_function}${var.cluster_status_transformation_function}.publish('signal') + detect(when(signal >= ${var.cluster_status_threshold_critical}, lasting=%{if var.cluster_status_lasting_duration_critical == null}None%{else}'${var.cluster_status_lasting_duration_critical}'%{endif}, at_least=${var.cluster_status_at_least_percentage_critical})).publish('CRIT') + detect(when(signal >= ${var.cluster_status_threshold_major}, 
lasting=%{if var.cluster_status_lasting_duration_major == null}None%{else}'${var.cluster_status_lasting_duration_major}'%{endif}, at_least=${var.cluster_status_at_least_percentage_major})).publish('MAJOR') EOF rule { - description = "is red" + description = "is red >= ${var.cluster_status_threshold_critical}" severity = "Critical" detect_label = "CRIT" disabled = coalesce(var.cluster_status_disabled_critical, var.cluster_status_disabled, var.detectors_disabled) @@ -52,7 +54,7 @@ EOF } rule { - description = "is yellow" + description = "is yellow >= ${var.cluster_status_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.cluster_status_disabled_major, var.cluster_status_disabled, var.detectors_disabled) @@ -74,13 +76,14 @@ resource "signalfx_detector" "cluster_initializing_shards" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('elasticsearch.cluster.initializing-shards', filter=filter('plugin', 'elasticsearch') and ${module.filtering.signalflow}, rollup='average')${var.cluster_initializing_shards_aggregation_function}${var.cluster_initializing_shards_transformation_function}.publish('signal') - detect(when(signal > ${var.cluster_initializing_shards_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.cluster_initializing_shards_threshold_major}) and (not when(signal > ${var.cluster_initializing_shards_threshold_critical}))).publish('MAJOR') + base_filtering = filter('plugin', 'elasticsearch') + signal = data('elasticsearch.cluster.initializing-shards', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.cluster_initializing_shards_aggregation_function}${var.cluster_initializing_shards_transformation_function}.publish('signal') + detect(when(signal >= ${var.cluster_initializing_shards_threshold_critical}, lasting=%{if var.cluster_initializing_shards_lasting_duration_critical == 
null}None%{else}'${var.cluster_initializing_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_initializing_shards_at_least_percentage_critical})).publish('CRIT') + detect(when(signal > ${var.cluster_initializing_shards_threshold_major}, lasting=%{if var.cluster_initializing_shards_lasting_duration_major == null}None%{else}'${var.cluster_initializing_shards_lasting_duration_major}'%{endif}, at_least=${var.cluster_initializing_shards_at_least_percentage_major}) and (not when(signal >= ${var.cluster_initializing_shards_threshold_critical}, lasting=%{if var.cluster_initializing_shards_lasting_duration_critical == null}None%{else}'${var.cluster_initializing_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_initializing_shards_at_least_percentage_critical}))).publish('MAJOR') EOF rule { - description = "is too high > ${var.cluster_initializing_shards_threshold_critical}" + description = "is too high >= ${var.cluster_initializing_shards_threshold_critical}" severity = "Critical" detect_label = "CRIT" disabled = coalesce(var.cluster_initializing_shards_disabled_critical, var.cluster_initializing_shards_disabled, var.detectors_disabled) @@ -114,16 +117,16 @@ resource "signalfx_detector" "cluster_relocating_shards" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('elasticsearch.cluster.relocating-shards', filter=filter('plugin', 'elasticsearch') and ${module.filtering.signalflow}, rollup='average')${var.cluster_relocating_shards_aggregation_function}${var.cluster_relocating_shards_transformation_function}.publish('signal') - detect(when(signal > ${var.cluster_relocating_shards_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.cluster_relocating_shards_threshold_major}) and (not when(signal > ${var.cluster_relocating_shards_threshold_critical}))).publish('MAJOR') + base_filtering = filter('plugin', 'elasticsearch') + signal = 
data('elasticsearch.cluster.relocating-shards', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.cluster_relocating_shards_aggregation_function}${var.cluster_relocating_shards_transformation_function}.publish('signal') + detect(when(signal > ${var.cluster_relocating_shards_threshold_critical}, lasting=%{if var.cluster_relocating_shards_lasting_duration_critical == null}None%{else}'${var.cluster_relocating_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_relocating_shards_at_least_percentage_critical})).publish('CRIT') EOF rule { description = "is too high > ${var.cluster_relocating_shards_threshold_critical}" severity = "Critical" detect_label = "CRIT" - disabled = coalesce(var.cluster_relocating_shards_disabled_critical, var.cluster_relocating_shards_disabled, var.detectors_disabled) + disabled = coalesce(var.cluster_relocating_shards_disabled, var.detectors_disabled) notifications = try(coalescelist(lookup(var.cluster_relocating_shards_notifications, "critical", []), var.notifications.critical), null) runbook_url = try(coalesce(var.cluster_relocating_shards_runbook_url, var.runbook_url), "") tip = var.cluster_relocating_shards_tip @@ -131,39 +134,27 @@ EOF parameterized_body = var.message_body == "" ? local.rule_body : var.message_body } - rule { - description = "is too high > ${var.cluster_relocating_shards_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.cluster_relocating_shards_disabled_major, var.cluster_relocating_shards_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.cluster_relocating_shards_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.cluster_relocating_shards_runbook_url, var.runbook_url), "") - tip = var.cluster_relocating_shards_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body - } - max_delay = var.cluster_relocating_shards_max_delay } resource "signalfx_detector" "cluster_unassigned_shards" { - name = format("%s %s", local.detector_name_prefix, "ElasticSearch Cluster unassigned shards") + name = format("%s %s", local.detector_name_prefix, "ElasticSearch cluster unassigned shards") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('elasticsearch.cluster.unassigned-shards', filter=filter('plugin', 'elasticsearch') and ${module.filtering.signalflow}, rollup='average')${var.cluster_unassigned_shards_aggregation_function}${var.cluster_unassigned_shards_transformation_function}.publish('signal') - detect(when(signal > ${var.cluster_unassigned_shards_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.cluster_unassigned_shards_threshold_major}) and (not when(signal > ${var.cluster_unassigned_shards_threshold_critical}))).publish('MAJOR') + base_filtering = filter('plugin', 'elasticsearch') + signal = data('elasticsearch.cluster.unassigned-shards', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.cluster_unassigned_shards_aggregation_function}${var.cluster_unassigned_shards_transformation_function}.publish('signal') + detect(when(signal > ${var.cluster_unassigned_shards_threshold_critical}, lasting=%{if var.cluster_unassigned_shards_lasting_duration_critical == null}None%{else}'${var.cluster_unassigned_shards_lasting_duration_critical}'%{endif}, at_least=${var.cluster_unassigned_shards_at_least_percentage_critical})).publish('CRIT') EOF rule { - description = "are too high > ${var.cluster_unassigned_shards_threshold_critical}" + description = "is too high > ${var.cluster_unassigned_shards_threshold_critical}" severity = "Critical" detect_label = "CRIT" - disabled = 
coalesce(var.cluster_unassigned_shards_disabled_critical, var.cluster_unassigned_shards_disabled, var.detectors_disabled) + disabled = coalesce(var.cluster_unassigned_shards_disabled, var.detectors_disabled) notifications = try(coalescelist(lookup(var.cluster_unassigned_shards_notifications, "critical", []), var.notifications.critical), null) runbook_url = try(coalesce(var.cluster_unassigned_shards_runbook_url, var.runbook_url), "") tip = var.cluster_unassigned_shards_tip @@ -171,36 +162,25 @@ EOF parameterized_body = var.message_body == "" ? local.rule_body : var.message_body } - rule { - description = "are too high > ${var.cluster_unassigned_shards_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.cluster_unassigned_shards_disabled_major, var.cluster_unassigned_shards_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.cluster_unassigned_shards_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.cluster_unassigned_shards_runbook_url, var.runbook_url), "") - tip = var.cluster_unassigned_shards_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body - } - max_delay = var.cluster_unassigned_shards_max_delay } resource "signalfx_detector" "pending_tasks" { - name = format("%s %s", local.detector_name_prefix, "ElasticSearch Pending tasks") + name = format("%s %s", local.detector_name_prefix, "ElasticSearch pending tasks") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('elasticsearch.cluster.pending-tasks', filter=filter('plugin', 'elasticsearch') and ${module.filtering.signalflow}, rollup='average')${var.pending_tasks_aggregation_function}${var.pending_tasks_transformation_function}.publish('signal') - detect(when(signal > ${var.pending_tasks_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.pending_tasks_threshold_major}) and (not when(signal > ${var.pending_tasks_threshold_critical}))).publish('MAJOR') + base_filtering = filter('plugin', 'elasticsearch') + signal = data('elasticsearch.cluster.pending-tasks', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.pending_tasks_aggregation_function}${var.pending_tasks_transformation_function}.publish('signal') + detect(when(signal >= ${var.pending_tasks_threshold_critical}, lasting=%{if var.pending_tasks_lasting_duration_critical == null}None%{else}'${var.pending_tasks_lasting_duration_critical}'%{endif}, at_least=${var.pending_tasks_at_least_percentage_critical})).publish('CRIT') + detect(when(signal > ${var.pending_tasks_threshold_major}, lasting=%{if var.pending_tasks_lasting_duration_major == null}None%{else}'${var.pending_tasks_lasting_duration_major}'%{endif}, at_least=${var.pending_tasks_at_least_percentage_major}) and (not when(signal >= ${var.pending_tasks_threshold_critical}, lasting=%{if var.pending_tasks_lasting_duration_critical == 
null}None%{else}'${var.pending_tasks_lasting_duration_critical}'%{endif}, at_least=${var.pending_tasks_at_least_percentage_critical}))).publish('MAJOR') EOF rule { - description = "are too high > ${var.pending_tasks_threshold_critical}" + description = "are too high >= ${var.pending_tasks_threshold_critical}" severity = "Critical" detect_label = "CRIT" disabled = coalesce(var.pending_tasks_disabled_critical, var.pending_tasks_disabled, var.detectors_disabled) @@ -227,20 +207,21 @@ EOF } resource "signalfx_detector" "cpu_usage" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch CPU usage") + name = format("%s %s", local.detector_name_prefix, "ElasticSearch cpu usage") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('elasticsearch.process.cpu.percent', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, rollup='average')${var.cpu_usage_aggregation_function}${var.cpu_usage_transformation_function}.publish('signal') - detect(when(signal > ${var.cpu_usage_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.cpu_usage_threshold_major}) and (not when(signal > ${var.cpu_usage_threshold_critical}))).publish('MAJOR') + base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') + signal = data('elasticsearch.process.cpu.percent', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.cpu_usage_aggregation_function}${var.cpu_usage_transformation_function}.publish('signal') + detect(when(signal >= ${var.cpu_usage_threshold_critical}, lasting=%{if var.cpu_usage_lasting_duration_critical == null}None%{else}'${var.cpu_usage_lasting_duration_critical}'%{endif}, at_least=${var.cpu_usage_at_least_percentage_critical})).publish('CRIT') + detect(when(signal > 
${var.cpu_usage_threshold_major}, lasting=%{if var.cpu_usage_lasting_duration_major == null}None%{else}'${var.cpu_usage_lasting_duration_major}'%{endif}, at_least=${var.cpu_usage_at_least_percentage_major}) and (not when(signal >= ${var.cpu_usage_threshold_critical}, lasting=%{if var.cpu_usage_lasting_duration_critical == null}None%{else}'${var.cpu_usage_lasting_duration_critical}'%{endif}, at_least=${var.cpu_usage_at_least_percentage_critical}))).publish('MAJOR') EOF rule { - description = "is too high > ${var.cpu_usage_threshold_critical}%" + description = "is too high >= ${var.cpu_usage_threshold_critical}" severity = "Critical" detect_label = "CRIT" disabled = coalesce(var.cpu_usage_disabled_critical, var.cpu_usage_disabled, var.detectors_disabled) @@ -252,7 +233,7 @@ EOF } rule { - description = "is too high > ${var.cpu_usage_threshold_major}%" + description = "is too high > ${var.cpu_usage_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.cpu_usage_disabled_major, var.cpu_usage_disabled, var.detectors_disabled) @@ -267,22 +248,23 @@ EOF } resource "signalfx_detector" "file_descriptors" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch file descriptors usage") + name = format("%s %s", local.detector_name_prefix, "ElasticSearch file descriptors") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - A = data('elasticsearch.process.open_file_descriptors', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, rollup='average')${var.file_descriptors_aggregation_function}${var.file_descriptors_transformation_function} - B = data('elasticsearch.process.max_file_descriptors', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, 
rollup='average')${var.file_descriptors_aggregation_function}${var.file_descriptors_transformation_function} + base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') + A = data('elasticsearch.process.open_file_descriptors', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.file_descriptors_aggregation_function}${var.file_descriptors_transformation_function} + B = data('elasticsearch.process.max_file_descriptors', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.file_descriptors_aggregation_function}${var.file_descriptors_transformation_function} signal = (A/B).scale(100).publish('signal') - detect(when(signal > ${var.file_descriptors_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.file_descriptors_threshold_major}) and (not when(signal > ${var.file_descriptors_threshold_critical}))).publish('MAJOR') + detect(when(signal >= ${var.file_descriptors_threshold_critical}, lasting=%{if var.file_descriptors_lasting_duration_critical == null}None%{else}'${var.file_descriptors_lasting_duration_critical}'%{endif}, at_least=${var.file_descriptors_at_least_percentage_critical})).publish('CRIT') + detect(when(signal > ${var.file_descriptors_threshold_major}, lasting=%{if var.file_descriptors_lasting_duration_major == null}None%{else}'${var.file_descriptors_lasting_duration_major}'%{endif}, at_least=${var.file_descriptors_at_least_percentage_major}) and (not when(signal >= ${var.file_descriptors_threshold_critical}, lasting=%{if var.file_descriptors_lasting_duration_critical == null}None%{else}'${var.file_descriptors_lasting_duration_critical}'%{endif}, at_least=${var.file_descriptors_at_least_percentage_critical}))).publish('MAJOR') EOF rule { - description = "is too high > ${var.file_descriptors_threshold_critical}%" + description = "is too high >= ${var.file_descriptors_threshold_critical}" severity = "Critical" detect_label = "CRIT" disabled = 
coalesce(var.file_descriptors_disabled_critical, var.file_descriptors_disabled, var.detectors_disabled) @@ -294,7 +276,7 @@ EOF } rule { - description = "is too high > ${var.file_descriptors_threshold_major}%" + description = "is too high > ${var.file_descriptors_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.file_descriptors_disabled_major, var.file_descriptors_disabled, var.detectors_disabled) @@ -309,20 +291,21 @@ EOF } resource "signalfx_detector" "jvm_heap_memory_usage" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch JVM heap memory usage") + name = format("%s %s", local.detector_name_prefix, "ElasticSearch jvm heap memory usage") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('elasticsearch.jvm.mem.heap-used-percent', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, rollup='average')${var.jvm_heap_memory_usage_aggregation_function}${var.jvm_heap_memory_usage_transformation_function}.publish('signal') - detect(when(signal > ${var.jvm_heap_memory_usage_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.jvm_heap_memory_usage_threshold_major}) and (not when(signal > ${var.jvm_heap_memory_usage_threshold_critical}))).publish('MAJOR') + base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') + signal = data('elasticsearch.jvm.mem.heap-used-percent', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.jvm_heap_memory_usage_aggregation_function}${var.jvm_heap_memory_usage_transformation_function}.publish('signal') + detect(when(signal >= ${var.jvm_heap_memory_usage_threshold_critical}, lasting=%{if var.jvm_heap_memory_usage_lasting_duration_critical == 
null}None%{else}'${var.jvm_heap_memory_usage_lasting_duration_critical}'%{endif}, at_least=${var.jvm_heap_memory_usage_at_least_percentage_critical})).publish('CRIT') + detect(when(signal > ${var.jvm_heap_memory_usage_threshold_major}, lasting=%{if var.jvm_heap_memory_usage_lasting_duration_major == null}None%{else}'${var.jvm_heap_memory_usage_lasting_duration_major}'%{endif}, at_least=${var.jvm_heap_memory_usage_at_least_percentage_major}) and (not when(signal >= ${var.jvm_heap_memory_usage_threshold_critical}, lasting=%{if var.jvm_heap_memory_usage_lasting_duration_critical == null}None%{else}'${var.jvm_heap_memory_usage_lasting_duration_critical}'%{endif}, at_least=${var.jvm_heap_memory_usage_at_least_percentage_critical}))).publish('MAJOR') EOF rule { - description = "is too high > ${var.jvm_heap_memory_usage_threshold_critical}%" + description = "is too high >= ${var.jvm_heap_memory_usage_threshold_critical}" severity = "Critical" detect_label = "CRIT" disabled = coalesce(var.jvm_heap_memory_usage_disabled_critical, var.jvm_heap_memory_usage_disabled, var.detectors_disabled) @@ -334,7 +317,7 @@ EOF } rule { - description = "is too high > ${var.jvm_heap_memory_usage_threshold_major}%" + description = "is too high > ${var.jvm_heap_memory_usage_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.jvm_heap_memory_usage_disabled_major, var.jvm_heap_memory_usage_disabled, var.detectors_disabled) @@ -349,22 +332,23 @@ EOF } resource "signalfx_detector" "jvm_memory_young_usage" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch JVM memory young usage") + name = format("%s %s", local.detector_name_prefix, "ElasticSearch jvm memory young usage") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - A = data('elasticsearch.jvm.mem.pools.young.used_in_bytes', 
filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_young_usage_aggregation_function}${var.jvm_memory_young_usage_transformation_function} - B = data('elasticsearch.jvm.mem.pools.young.max_in_bytes', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_young_usage_aggregation_function}${var.jvm_memory_young_usage_transformation_function} + base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') + A = data('elasticsearch.jvm.mem.pools.young.used_in_bytes', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_young_usage_aggregation_function}${var.jvm_memory_young_usage_transformation_function} + B = data('elasticsearch.jvm.mem.pools.young.max_in_bytes', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_young_usage_aggregation_function}${var.jvm_memory_young_usage_transformation_function} signal = (A/B).fill(0).scale(100).publish('signal') - detect(when(signal > ${var.jvm_memory_young_usage_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.jvm_memory_young_usage_threshold_minor}) and (not when(signal > ${var.jvm_memory_young_usage_threshold_major}))).publish('MINOR') + detect(when(signal >= ${var.jvm_memory_young_usage_threshold_major}, lasting=%{if var.jvm_memory_young_usage_lasting_duration_major == null}None%{else}'${var.jvm_memory_young_usage_lasting_duration_major}'%{endif}, at_least=${var.jvm_memory_young_usage_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.jvm_memory_young_usage_threshold_minor}, lasting=%{if var.jvm_memory_young_usage_lasting_duration_minor == null}None%{else}'${var.jvm_memory_young_usage_lasting_duration_minor}'%{endif}, at_least=${var.jvm_memory_young_usage_at_least_percentage_minor}) and (not when(signal >= 
${var.jvm_memory_young_usage_threshold_major}, lasting=%{if var.jvm_memory_young_usage_lasting_duration_major == null}None%{else}'${var.jvm_memory_young_usage_lasting_duration_major}'%{endif}, at_least=${var.jvm_memory_young_usage_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.jvm_memory_young_usage_threshold_major}" + description = "is too high >= ${var.jvm_memory_young_usage_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.jvm_memory_young_usage_disabled_major, var.jvm_memory_young_usage_disabled, var.detectors_disabled) @@ -391,22 +375,23 @@ EOF } resource "signalfx_detector" "jvm_memory_old_usage" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch JVM memory old usage") + name = format("%s %s", local.detector_name_prefix, "ElasticSearch jvm memory old usage") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - A = data('elasticsearch.jvm.mem.pools.old.used_in_bytes', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_old_usage_aggregation_function}${var.jvm_memory_old_usage_transformation_function} - B = data('elasticsearch.jvm.mem.pools.old.max_in_bytes', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_old_usage_aggregation_function}${var.jvm_memory_old_usage_transformation_function} + base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') + A = data('elasticsearch.jvm.mem.pools.old.used_in_bytes', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_old_usage_aggregation_function}${var.jvm_memory_old_usage_transformation_function} + B = 
data('elasticsearch.jvm.mem.pools.old.max_in_bytes', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_old_usage_aggregation_function}${var.jvm_memory_old_usage_transformation_function} signal = (A/B).fill(0).scale(100).publish('signal') - detect(when(signal > ${var.jvm_memory_old_usage_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.jvm_memory_old_usage_threshold_minor}) and (not when(signal > ${var.jvm_memory_old_usage_threshold_major}))).publish('MINOR') + detect(when(signal >= ${var.jvm_memory_old_usage_threshold_major}, lasting=%{if var.jvm_memory_old_usage_lasting_duration_major == null}None%{else}'${var.jvm_memory_old_usage_lasting_duration_major}'%{endif}, at_least=${var.jvm_memory_old_usage_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.jvm_memory_old_usage_threshold_minor}, lasting=%{if var.jvm_memory_old_usage_lasting_duration_minor == null}None%{else}'${var.jvm_memory_old_usage_lasting_duration_minor}'%{endif}, at_least=${var.jvm_memory_old_usage_at_least_percentage_minor}) and (not when(signal >= ${var.jvm_memory_old_usage_threshold_major}, lasting=%{if var.jvm_memory_old_usage_lasting_duration_major == null}None%{else}'${var.jvm_memory_old_usage_lasting_duration_major}'%{endif}, at_least=${var.jvm_memory_old_usage_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.jvm_memory_old_usage_threshold_major}" + description = "is too high >= ${var.jvm_memory_old_usage_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.jvm_memory_old_usage_disabled_major, var.jvm_memory_old_usage_disabled, var.detectors_disabled) @@ -433,22 +418,23 @@ EOF } resource "signalfx_detector" "jvm_gc_old_collection_latency" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch old-generation garbage collections latency") + name = format("%s %s", local.detector_name_prefix, "ElasticSearch jvm gc old 
collection latency") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - A = data('elasticsearch.jvm.gc.old-time', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.jvm_gc_old_collection_latency_aggregation_function}${var.jvm_gc_old_collection_latency_transformation_function} - B = data('elasticsearch.jvm.gc.old-count', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.jvm_gc_old_collection_latency_aggregation_function}${var.jvm_gc_old_collection_latency_transformation_function} + base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') + A = data('elasticsearch.jvm.gc.old-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.jvm_gc_old_collection_latency_aggregation_function}${var.jvm_gc_old_collection_latency_transformation_function} + B = data('elasticsearch.jvm.gc.old-count', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.jvm_gc_old_collection_latency_aggregation_function}${var.jvm_gc_old_collection_latency_transformation_function} signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.jvm_gc_old_collection_latency_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.jvm_gc_old_collection_latency_threshold_minor}) and (not when(signal > ${var.jvm_gc_old_collection_latency_threshold_major}))).publish('MINOR') + detect(when(signal >= ${var.jvm_gc_old_collection_latency_threshold_major}, lasting=%{if var.jvm_gc_old_collection_latency_lasting_duration_major == null}None%{else}'${var.jvm_gc_old_collection_latency_lasting_duration_major}'%{endif}, 
at_least=${var.jvm_gc_old_collection_latency_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.jvm_gc_old_collection_latency_threshold_minor}, lasting=%{if var.jvm_gc_old_collection_latency_lasting_duration_minor == null}None%{else}'${var.jvm_gc_old_collection_latency_lasting_duration_minor}'%{endif}, at_least=${var.jvm_gc_old_collection_latency_at_least_percentage_minor}) and (not when(signal >= ${var.jvm_gc_old_collection_latency_threshold_major}, lasting=%{if var.jvm_gc_old_collection_latency_lasting_duration_major == null}None%{else}'${var.jvm_gc_old_collection_latency_lasting_duration_major}'%{endif}, at_least=${var.jvm_gc_old_collection_latency_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.jvm_gc_old_collection_latency_threshold_major}" + description = "is too high >= ${var.jvm_gc_old_collection_latency_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.jvm_gc_old_collection_latency_disabled_major, var.jvm_gc_old_collection_latency_disabled, var.detectors_disabled) @@ -475,22 +461,23 @@ EOF } resource "signalfx_detector" "jvm_gc_young_collection_latency" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch young-generation garbage collections latency") + name = format("%s %s", local.detector_name_prefix, "ElasticSearch jvm gc young collection latency") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - A = data('elasticsearch.jvm.gc.time', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.jvm_gc_young_collection_latency_aggregation_function}${var.jvm_gc_young_collection_latency_transformation_function} - B = data('elasticsearch.jvm.gc.count', filter=filter('plugin', 
'elasticsearch') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.jvm_gc_young_collection_latency_aggregation_function}${var.jvm_gc_young_collection_latency_transformation_function} + base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') + A = data('elasticsearch.jvm.gc.time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.jvm_gc_young_collection_latency_aggregation_function}${var.jvm_gc_young_collection_latency_transformation_function} + B = data('elasticsearch.jvm.gc.count', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.jvm_gc_young_collection_latency_aggregation_function}${var.jvm_gc_young_collection_latency_transformation_function} signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.jvm_gc_young_collection_latency_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.jvm_gc_young_collection_latency_threshold_minor}) and (not when(signal > ${var.jvm_gc_young_collection_latency_threshold_major}))).publish('MINOR') + detect(when(signal >= ${var.jvm_gc_young_collection_latency_threshold_major}, lasting=%{if var.jvm_gc_young_collection_latency_lasting_duration_major == null}None%{else}'${var.jvm_gc_young_collection_latency_lasting_duration_major}'%{endif}, at_least=${var.jvm_gc_young_collection_latency_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.jvm_gc_young_collection_latency_threshold_minor}, lasting=%{if var.jvm_gc_young_collection_latency_lasting_duration_minor == null}None%{else}'${var.jvm_gc_young_collection_latency_lasting_duration_minor}'%{endif}, at_least=${var.jvm_gc_young_collection_latency_at_least_percentage_minor}) and (not when(signal >= ${var.jvm_gc_young_collection_latency_threshold_major}, lasting=%{if var.jvm_gc_young_collection_latency_lasting_duration_major == 
null}None%{else}'${var.jvm_gc_young_collection_latency_lasting_duration_major}'%{endif}, at_least=${var.jvm_gc_young_collection_latency_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.jvm_gc_young_collection_latency_threshold_major}" + description = "is too high >= ${var.jvm_gc_young_collection_latency_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.jvm_gc_young_collection_latency_disabled_major, var.jvm_gc_young_collection_latency_disabled, var.detectors_disabled) @@ -517,22 +504,23 @@ EOF } resource "signalfx_detector" "indexing_latency" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch indexing latency") + name = format("%s %s", local.detector_name_prefix, "ElasticSearch indexing latency") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - A = data('elasticsearch.indices.indexing.index-time', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.indexing_latency_aggregation_function}${var.indexing_latency_transformation_function} - B = data('elasticsearch.indices.indexing.index-total', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.indexing_latency_aggregation_function}${var.indexing_latency_transformation_function} + base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') + A = data('elasticsearch.indices.indexing.index-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.indexing_latency_aggregation_function}${var.indexing_latency_transformation_function} + B = data('elasticsearch.indices.indexing.index-total', 
filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.indexing_latency_aggregation_function}${var.indexing_latency_transformation_function} signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.indexing_latency_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.indexing_latency_threshold_minor}) and (not when(signal > ${var.indexing_latency_threshold_major}))).publish('MINOR') + detect(when(signal >= ${var.indexing_latency_threshold_major}, lasting=%{if var.indexing_latency_lasting_duration_major == null}None%{else}'${var.indexing_latency_lasting_duration_major}'%{endif}, at_least=${var.indexing_latency_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.indexing_latency_threshold_minor}, lasting=%{if var.indexing_latency_lasting_duration_minor == null}None%{else}'${var.indexing_latency_lasting_duration_minor}'%{endif}, at_least=${var.indexing_latency_at_least_percentage_minor}) and (not when(signal >= ${var.indexing_latency_threshold_major}, lasting=%{if var.indexing_latency_lasting_duration_major == null}None%{else}'${var.indexing_latency_lasting_duration_major}'%{endif}, at_least=${var.indexing_latency_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.indexing_latency_threshold_major}" + description = "is too high >= ${var.indexing_latency_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.indexing_latency_disabled_major, var.indexing_latency_disabled, var.detectors_disabled) @@ -559,22 +547,23 @@ EOF } resource "signalfx_detector" "flush_latency" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch index flushing to disk latency") + name = format("%s %s", local.detector_name_prefix, "ElasticSearch flush latency") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = 
compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - A = data('elasticsearch.indices.flush.total-time', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.flush_latency_aggregation_function}${var.flush_latency_transformation_function} - B = data('elasticsearch.indices.flush.total', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.flush_latency_aggregation_function}${var.flush_latency_transformation_function} + base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') + A = data('elasticsearch.indices.flush.total-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.flush_latency_aggregation_function}${var.flush_latency_transformation_function} + B = data('elasticsearch.indices.flush.total', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.flush_latency_aggregation_function}${var.flush_latency_transformation_function} signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.flush_latency_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.flush_latency_threshold_minor}) and (not when(signal > ${var.flush_latency_threshold_major}))).publish('MINOR') + detect(when(signal >= ${var.flush_latency_threshold_major}, lasting=%{if var.flush_latency_lasting_duration_major == null}None%{else}'${var.flush_latency_lasting_duration_major}'%{endif}, at_least=${var.flush_latency_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.flush_latency_threshold_minor}, lasting=%{if var.flush_latency_lasting_duration_minor == null}None%{else}'${var.flush_latency_lasting_duration_minor}'%{endif}, at_least=${var.flush_latency_at_least_percentage_minor}) and (not when(signal >= 
${var.flush_latency_threshold_major}, lasting=%{if var.flush_latency_lasting_duration_major == null}None%{else}'${var.flush_latency_lasting_duration_major}'%{endif}, at_least=${var.flush_latency_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.flush_latency_threshold_major}" + description = "is too high >= ${var.flush_latency_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.flush_latency_disabled_major, var.flush_latency_disabled, var.detectors_disabled) @@ -601,22 +590,23 @@ EOF } resource "signalfx_detector" "search_latency" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch search query latency") + name = format("%s %s", local.detector_name_prefix, "ElasticSearch search latency") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - A = data('elasticsearch.indices.search.query-time', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.search_latency_aggregation_function}${var.search_latency_transformation_function} - B = data('elasticsearch.indices.search.query-total', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.search_latency_aggregation_function}${var.search_latency_transformation_function} + base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') + A = data('elasticsearch.indices.search.query-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.search_latency_aggregation_function}${var.search_latency_transformation_function} + B = data('elasticsearch.indices.search.query-total', filter=base_filtering and 
${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.search_latency_aggregation_function}${var.search_latency_transformation_function} signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.search_latency_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.search_latency_threshold_minor}) and (not when(signal > ${var.search_latency_threshold_major}))).publish('MINOR') + detect(when(signal >= ${var.search_latency_threshold_major}, lasting=%{if var.search_latency_lasting_duration_major == null}None%{else}'${var.search_latency_lasting_duration_major}'%{endif}, at_least=${var.search_latency_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.search_latency_threshold_minor}, lasting=%{if var.search_latency_lasting_duration_minor == null}None%{else}'${var.search_latency_lasting_duration_minor}'%{endif}, at_least=${var.search_latency_at_least_percentage_minor}) and (not when(signal >= ${var.search_latency_threshold_major}, lasting=%{if var.search_latency_lasting_duration_major == null}None%{else}'${var.search_latency_lasting_duration_major}'%{endif}, at_least=${var.search_latency_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.search_latency_threshold_major}" + description = "is too high >= ${var.search_latency_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.search_latency_disabled_major, var.search_latency_disabled, var.detectors_disabled) @@ -643,22 +633,23 @@ EOF } resource "signalfx_detector" "fetch_latency" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch search fetch latency") + name = format("%s %s", local.detector_name_prefix, "ElasticSearch fetch latency") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - A = 
data('elasticsearch.indices.search.fetch-time', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.fetch_latency_aggregation_function}${var.fetch_latency_transformation_function} - B = data('elasticsearch.indices.search.fetch-total', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.fetch_latency_aggregation_function}${var.fetch_latency_transformation_function} + base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') + A = data('elasticsearch.indices.search.fetch-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.fetch_latency_aggregation_function}${var.fetch_latency_transformation_function} + B = data('elasticsearch.indices.search.fetch-total', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.fetch_latency_aggregation_function}${var.fetch_latency_transformation_function} signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.fetch_latency_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.fetch_latency_threshold_minor}) and (not when(signal > ${var.fetch_latency_threshold_major}))).publish('MINOR') + detect(when(signal >= ${var.fetch_latency_threshold_major}, lasting=%{if var.fetch_latency_lasting_duration_major == null}None%{else}'${var.fetch_latency_lasting_duration_major}'%{endif}, at_least=${var.fetch_latency_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.fetch_latency_threshold_minor}, lasting=%{if var.fetch_latency_lasting_duration_minor == null}None%{else}'${var.fetch_latency_lasting_duration_minor}'%{endif}, at_least=${var.fetch_latency_at_least_percentage_minor}) and (not when(signal >= ${var.fetch_latency_threshold_major}, lasting=%{if 
var.fetch_latency_lasting_duration_major == null}None%{else}'${var.fetch_latency_lasting_duration_major}'%{endif}, at_least=${var.fetch_latency_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.fetch_latency_threshold_major}" + description = "is too high >= ${var.fetch_latency_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.fetch_latency_disabled_major, var.fetch_latency_disabled, var.detectors_disabled) @@ -685,20 +676,22 @@ EOF } resource "signalfx_detector" "field_data_evictions_change" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch fielddata cache evictions rate of change") + name = format("%s %s", local.detector_name_prefix, "ElasticSearch field_data evictions change") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('elasticsearch.indices.fielddata.evictions', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta').rateofchange()${var.field_data_evictions_change_aggregation_function}${var.field_data_evictions_change_transformation_function}.publish('signal') - detect(when(signal > ${var.field_data_evictions_change_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.field_data_evictions_change_threshold_minor}) and (not when(signal > ${var.field_data_evictions_change_threshold_major}))).publish('MINOR') + base_filtering = filter('node_name', '*') and filter('plugin', 'elasticsearch') + A = data('elasticsearch.indices.fielddata.evictions', filter=base_filtering and ${module.filtering.signalflow}, rollup='delta', extrapolation='zero')${var.field_data_evictions_change_aggregation_function}${var.field_data_evictions_change_transformation_function} + signal = 
A.rateofchange().publish('signal') + detect(when(signal >= ${var.field_data_evictions_change_threshold_major}, lasting=%{if var.field_data_evictions_change_lasting_duration_major == null}None%{else}'${var.field_data_evictions_change_lasting_duration_major}'%{endif}, at_least=${var.field_data_evictions_change_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.field_data_evictions_change_threshold_minor}, lasting=%{if var.field_data_evictions_change_lasting_duration_minor == null}None%{else}'${var.field_data_evictions_change_lasting_duration_minor}'%{endif}, at_least=${var.field_data_evictions_change_at_least_percentage_minor}) and (not when(signal >= ${var.field_data_evictions_change_threshold_major}, lasting=%{if var.field_data_evictions_change_lasting_duration_major == null}None%{else}'${var.field_data_evictions_change_lasting_duration_major}'%{endif}, at_least=${var.field_data_evictions_change_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.field_data_evictions_change_threshold_major}" + description = "is too high >= ${var.field_data_evictions_change_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.field_data_evictions_change_disabled_major, var.field_data_evictions_change_disabled, var.detectors_disabled) @@ -725,20 +718,22 @@ EOF } resource "signalfx_detector" "task_time_in_queue_change" { - name = format("%s %s", local.detector_name_prefix, "Elasticsearch max time spent by task in queue rate of change") + name = format("%s %s", local.detector_name_prefix, "ElasticSearch task time in queue change") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('elasticsearch.cluster.task-max-wait-time', filter=filter('plugin', 'elasticsearch') and ${module.filtering.signalflow}, 
rollup='average').rateofchange()${var.task_time_in_queue_change_aggregation_function}${var.task_time_in_queue_change_transformation_function}.publish('signal') - detect(when(signal > ${var.task_time_in_queue_change_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.task_time_in_queue_change_threshold_minor}) and (not when(signal > ${var.task_time_in_queue_change_threshold_major}))).publish('MINOR') + base_filtering = filter('plugin', 'elasticsearch') + A = data('elasticsearch.cluster.task-max-wait-time', filter=base_filtering and ${module.filtering.signalflow}, rollup='average')${var.task_time_in_queue_change_aggregation_function}${var.task_time_in_queue_change_transformation_function} + signal = A.rateofchange().publish('signal') + detect(when(signal >= ${var.task_time_in_queue_change_threshold_major}, lasting=%{if var.task_time_in_queue_change_lasting_duration_major == null}None%{else}'${var.task_time_in_queue_change_lasting_duration_major}'%{endif}, at_least=${var.task_time_in_queue_change_at_least_percentage_major})).publish('MAJOR') + detect(when(signal > ${var.task_time_in_queue_change_threshold_minor}, lasting=%{if var.task_time_in_queue_change_lasting_duration_minor == null}None%{else}'${var.task_time_in_queue_change_lasting_duration_minor}'%{endif}, at_least=${var.task_time_in_queue_change_at_least_percentage_minor}) and (not when(signal >= ${var.task_time_in_queue_change_threshold_major}, lasting=%{if var.task_time_in_queue_change_lasting_duration_major == null}None%{else}'${var.task_time_in_queue_change_lasting_duration_major}'%{endif}, at_least=${var.task_time_in_queue_change_at_least_percentage_major}))).publish('MINOR') EOF rule { - description = "is too high > ${var.task_time_in_queue_change_threshold_major}" + description = "is too high >= ${var.task_time_in_queue_change_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.task_time_in_queue_change_disabled_major, var.task_time_in_queue_change_disabled, 
var.detectors_disabled) diff --git a/modules/smart-agent_elasticsearch/variables.tf b/modules/smart-agent_elasticsearch/variables-gen.tf similarity index 67% rename from modules/smart-agent_elasticsearch/variables.tf rename to modules/smart-agent_elasticsearch/variables-gen.tf index 3cf6ee9bb..7d1383245 100644 --- a/modules/smart-agent_elasticsearch/variables.tf +++ b/modules/smart-agent_elasticsearch/variables-gen.tf @@ -1,6 +1,16 @@ -# Module specific +# heartbeat detector -# Heartbeat detector +variable "heartbeat_notifications" { + description = "Notification recipients list per severity overridden for heartbeat detector" + type = map(list(string)) + default = {} +} + +variable "heartbeat_aggregation_function" { + description = "Aggregation function and group by for heartbeat detector (i.e. \".mean(by=['host'])\")" + type = string + default = ".mean(by=['cluster'])" +} variable "heartbeat_max_delay" { description = "Enforce max delay for heartbeat detector (use \"0\" or \"null\" for \"Auto\")" @@ -26,26 +36,32 @@ variable "heartbeat_disabled" { default = null } -variable "heartbeat_notifications" { - description = "Notification recipients list per severity overridden for heartbeat detector" +variable "heartbeat_timeframe" { + description = "Timeframe for heartbeat detector (i.e. \"10m\")" + type = string + default = "10m" +} + +# cluster_status detector + +variable "cluster_status_notifications" { + description = "Notification recipients list per severity overridden for cluster_status detector" type = map(list(string)) default = {} } -variable "heartbeat_timeframe" { - description = "Timeframe for heartbeat detector (i.e. \"10m\")" +variable "cluster_status_aggregation_function" { + description = "Aggregation function and group by for cluster_status detector (i.e. 
\".mean(by=['host'])\")" type = string - default = "10m" + default = ".max(by=['cluster'])" } -variable "heartbeat_aggregation_function" { - description = "Aggregation function and group by for heartbeat detector (i.e. \".mean(by=['host'])\")" +variable "cluster_status_transformation_function" { + description = "Transformation function for cluster_status detector (i.e. \".mean(over='5m')\")" type = string default = "" } -# Cluster_status detector - variable "cluster_status_max_delay" { description = "Enforce max delay for cluster_status detector (use \"0\" or \"null\" for \"Auto\")" type = number @@ -65,43 +81,77 @@ variable "cluster_status_runbook_url" { } variable "cluster_status_disabled" { - description = "Disable all alerting rules for cluster_status_not_green detector" + description = "Disable all alerting rules for cluster_status detector" type = bool default = null } variable "cluster_status_disabled_critical" { - description = "Disable critical alerting rule for cluster_status_not_green detector" + description = "Disable critical alerting rule for cluster_status detector" type = bool default = null } variable "cluster_status_disabled_major" { - description = "Disable major alerting rule for cluster_status_not_green detector" + description = "Disable major alerting rule for cluster_status detector" type = bool default = null } -variable "cluster_status_notifications" { - description = "Notification recipients list per severity overridden for cluster_status_not_green detector" +variable "cluster_status_threshold_critical" { + description = "Critical threshold for cluster_status detector" + type = number + default = 2 +} + +variable "cluster_status_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "5m" +} + +variable "cluster_status_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 
1.0)" + type = number + default = 1 +} +variable "cluster_status_threshold_major" { + description = "Major threshold for cluster_status detector" + type = number + default = 1 +} + +variable "cluster_status_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "5m" +} + +variable "cluster_status_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# cluster_initializing_shards detector + +variable "cluster_initializing_shards_notifications" { + description = "Notification recipients list per severity overridden for cluster_initializing_shards detector" type = map(list(string)) default = {} } -variable "cluster_status_aggregation_function" { - description = "Aggregation function and group by for cluster_status_not_green detector (i.e. \".mean(by=['host'])\")" +variable "cluster_initializing_shards_aggregation_function" { + description = "Aggregation function and group by for cluster_initializing_shards detector (i.e. \".mean(by=['host'])\")" type = string - default = "" + default = ".max(by=['cluster'])" } -variable "cluster_status_transformation_function" { - description = "Transformation function for cluster_status_not_green detector (i.e. \".mean(over='5m')\")" +variable "cluster_initializing_shards_transformation_function" { + description = "Transformation function for cluster_initializing_shards detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".mean(over='5m')" + default = "" } -# Cluster_initializing_shards detector - variable "cluster_initializing_shards_max_delay" { description = "Enforce max delay for cluster_initializing_shards detector (use \"0\" or \"null\" for \"Auto\")" type = number @@ -138,37 +188,59 @@ variable "cluster_initializing_shards_disabled_major" { default = null } -variable "cluster_initializing_shards_notifications" { - description = "Notification recipients list per severity overridden for cluster_initializing_shards detector" - type = map(list(string)) - default = {} -} - -variable "cluster_initializing_shards_aggregation_function" { - description = "Aggregation function and group by for cluster_initializing_shards detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" +variable "cluster_initializing_shards_threshold_critical" { + description = "Critical threshold for cluster_initializing_shards detector" + type = number + default = 1 } -variable "cluster_initializing_shards_transformation_function" { - description = "Transformation function for cluster_initializing_shards detector (i.e. 
\".mean(over='5m')\")" +variable "cluster_initializing_shards_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = ".min(over='15m')" + default = "15m" } -variable "cluster_initializing_shards_threshold_critical" { - description = "Critical threshold for cluster_initializing_shards detector" +variable "cluster_initializing_shards_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } - variable "cluster_initializing_shards_threshold_major" { description = "Major threshold for cluster_initializing_shards detector" type = number default = 0 } -# Cluster_relocating_shards detector +variable "cluster_initializing_shards_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "15m" +} + +variable "cluster_initializing_shards_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# cluster_relocating_shards detector + +variable "cluster_relocating_shards_notifications" { + description = "Notification recipients list per severity overridden for cluster_relocating_shards detector" + type = map(list(string)) + default = {} +} + +variable "cluster_relocating_shards_aggregation_function" { + description = "Aggregation function and group by for cluster_relocating_shards detector (i.e. \".mean(by=['host'])\")" + type = string + default = ".max(by=['cluster'])" +} + +variable "cluster_relocating_shards_transformation_function" { + description = "Transformation function for cluster_relocating_shards detector (i.e. 
\".mean(over='5m')\")" + type = string + default = "" +} variable "cluster_relocating_shards_max_delay" { description = "Enforce max delay for cluster_relocating_shards detector (use \"0\" or \"null\" for \"Auto\")" @@ -194,50 +266,43 @@ variable "cluster_relocating_shards_disabled" { default = null } -variable "cluster_relocating_shards_disabled_critical" { - description = "Disable critical alerting rule for cluster_relocating_shards detector" - type = bool - default = null +variable "cluster_relocating_shards_threshold_critical" { + description = "Critical threshold for cluster_relocating_shards detector" + type = number + default = 0 } -variable "cluster_relocating_shards_disabled_major" { - description = "Disable major alerting rule for cluster_relocating_shards detector" - type = bool - default = true +variable "cluster_relocating_shards_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "15m" } -variable "cluster_relocating_shards_notifications" { - description = "Notification recipients list per severity overridden for cluster_relocating_shards detector" +variable "cluster_relocating_shards_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# cluster_unassigned_shards detector + +variable "cluster_unassigned_shards_notifications" { + description = "Notification recipients list per severity overridden for cluster_unassigned_shards detector" type = map(list(string)) default = {} } -variable "cluster_relocating_shards_aggregation_function" { - description = "Aggregation function and group by for cluster_relocating_shards detector (i.e. \".mean(by=['host'])\")" +variable "cluster_unassigned_shards_aggregation_function" { + description = "Aggregation function and group by for cluster_unassigned_shards detector (i.e. 
\".mean(by=['host'])\")" type = string - default = "" + default = ".max(by=['cluster'])" } -variable "cluster_relocating_shards_transformation_function" { - description = "Transformation function for cluster_relocating_shards detector (i.e. \".mean(over='5m')\")" +variable "cluster_unassigned_shards_transformation_function" { + description = "Transformation function for cluster_unassigned_shards detector (i.e. \".mean(over='5m')\")" type = string - default = ".min(over='15m')" -} - -variable "cluster_relocating_shards_threshold_critical" { - description = "Critical threshold for cluster_relocating_shards detector" - type = number - default = 0 -} - -variable "cluster_relocating_shards_threshold_major" { - description = "Major threshold for cluster_relocating_shards detector" - type = number - default = -1 + default = "" } -# Cluster_unassigned_shards detector - variable "cluster_unassigned_shards_max_delay" { description = "Enforce max delay for cluster_unassigned_shards detector (use \"0\" or \"null\" for \"Auto\")" type = number @@ -262,50 +327,43 @@ variable "cluster_unassigned_shards_disabled" { default = null } -variable "cluster_unassigned_shards_disabled_critical" { - description = "Disable critical alerting rule for cluster_unassigned_shards detector" - type = bool - default = null +variable "cluster_unassigned_shards_threshold_critical" { + description = "Critical threshold for cluster_unassigned_shards detector" + type = number + default = 0 } -variable "cluster_unassigned_shards_disabled_major" { - description = "Disable major alerting rule for cluster_unassigned_shards detector" - type = bool - default = true +variable "cluster_unassigned_shards_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "10m" } -variable "cluster_unassigned_shards_notifications" { - description = "Notification recipients list per severity overridden for cluster_unassigned_shards detector" 
+variable "cluster_unassigned_shards_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# pending_tasks detector + +variable "pending_tasks_notifications" { + description = "Notification recipients list per severity overridden for pending_tasks detector" type = map(list(string)) default = {} } -variable "cluster_unassigned_shards_aggregation_function" { - description = "Aggregation function and group by for cluster_unassigned_shards detector (i.e. \".mean(by=['host'])\")" +variable "pending_tasks_aggregation_function" { + description = "Aggregation function and group by for pending_tasks detector (i.e. \".mean(by=['host'])\")" type = string - default = "" + default = ".max(by=['cluster'])" } -variable "cluster_unassigned_shards_transformation_function" { - description = "Transformation function for cluster_unassigned_shards detector (i.e. \".mean(over='5m')\")" +variable "pending_tasks_transformation_function" { + description = "Transformation function for pending_tasks detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".min(over='10m')" -} - -variable "cluster_unassigned_shards_threshold_critical" { - description = "Critical threshold for cluster_unassigned_shards detector" - type = number - default = 0 -} - -variable "cluster_unassigned_shards_threshold_major" { - description = "Major threshold for cluster_unassigned_shards detector" - type = number - default = -1 + default = "" } -# pending_tasks detector - variable "pending_tasks_max_delay" { description = "Enforce max delay for pending_tasks detector (use \"0\" or \"null\" for \"Auto\")" type = number @@ -342,241 +400,329 @@ variable "pending_tasks_disabled_major" { default = null } -variable "pending_tasks_notifications" { - description = "Notification recipients list per severity overridden for pending_tasks detector" - type = map(list(string)) - default = {} -} - -variable "pending_tasks_aggregation_function" { - description = "Aggregation function and group by for pending_tasks detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" +variable "pending_tasks_threshold_critical" { + description = "Critical threshold for pending_tasks detector" + type = number + default = 5 } -variable "pending_tasks_transformation_function" { - description = "Transformation function for pending_tasks detector (i.e. 
\".mean(over='5m')\")" +variable "pending_tasks_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = ".min(over='15m')" + default = "15m" } -variable "pending_tasks_threshold_critical" { - description = "Critical threshold for pending_tasks detector" +variable "pending_tasks_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number - default = 5 + default = 1 } - variable "pending_tasks_threshold_major" { description = "Major threshold for pending_tasks detector" type = number default = 0 } -# Jvm_heap_memory_usage detector +variable "pending_tasks_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "15m" +} + +variable "pending_tasks_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# cpu_usage detector -variable "jvm_heap_memory_usage_max_delay" { - description = "Enforce max delay for jvm_heap_memory_usage detector (use \"0\" or \"null\" for \"Auto\")" +variable "cpu_usage_notifications" { + description = "Notification recipients list per severity overridden for cpu_usage detector" + type = map(list(string)) + default = {} +} + +variable "cpu_usage_aggregation_function" { + description = "Aggregation function and group by for cpu_usage detector (i.e. \".mean(by=['host'])\")" + type = string + default = ".max(by=['cluster'])" +} + +variable "cpu_usage_transformation_function" { + description = "Transformation function for cpu_usage detector (i.e. 
\".mean(over='5m')\")" + type = string + default = "" +} + +variable "cpu_usage_max_delay" { + description = "Enforce max delay for cpu_usage detector (use \"0\" or \"null\" for \"Auto\")" type = number default = null } -variable "jvm_heap_memory_usage_tip" { +variable "cpu_usage_tip" { description = "Suggested first course of action or any note useful for incident handling" type = string default = "" } -variable "jvm_heap_memory_usage_runbook_url" { +variable "cpu_usage_runbook_url" { description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" type = string default = "" } -variable "jvm_heap_memory_usage_disabled" { - description = "Disable all alerting rules for jvm_heap_memory_usage detector" +variable "cpu_usage_disabled" { + description = "Disable all alerting rules for cpu_usage detector" type = bool default = null } -variable "jvm_heap_memory_usage_disabled_critical" { - description = "Disable critical alerting rule for jvm_heap_memory_usage detector" +variable "cpu_usage_disabled_critical" { + description = "Disable critical alerting rule for cpu_usage detector" type = bool default = null } -variable "jvm_heap_memory_usage_disabled_major" { - description = "Disable major alerting rule for jvm_heap_memory_usage detector" +variable "cpu_usage_disabled_major" { + description = "Disable major alerting rule for cpu_usage detector" type = bool default = null } -variable "jvm_heap_memory_usage_notifications" { - description = "Notification recipients list per severity overridden for jvm_heap_memory_usage detector" - type = map(list(string)) - default = {} +variable "cpu_usage_threshold_critical" { + description = "Critical threshold for cpu_usage detector" + type = number + default = 95 } -variable "jvm_heap_memory_usage_aggregation_function" { - description = "Aggregation function and group by for jvm_heap_memory_usage detector (i.e. 
\".mean(by=['host'])\")" +variable "cpu_usage_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = "" + default = "30m" } -variable "jvm_heap_memory_usage_transformation_function" { - description = "Transformation function for jvm_heap_memory_usage detector (i.e. \".mean(over='5m')\")" - type = string - default = ".mean(over='5m')" +variable "cpu_usage_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 } - -variable "jvm_heap_memory_usage_threshold_critical" { - description = "Critical threshold for jvm_heap_memory_usage detector" +variable "cpu_usage_threshold_major" { + description = "Major threshold for cpu_usage detector" type = number - default = 90 + default = 85 } -variable "jvm_heap_memory_usage_threshold_major" { - description = "Major threshold for jvm_heap_memory_usage detector" +variable "cpu_usage_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "30m" +} + +variable "cpu_usage_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number - default = 80 + default = 1 } +# file_descriptors detector -# cpu_usage detector +variable "file_descriptors_notifications" { + description = "Notification recipients list per severity overridden for file_descriptors detector" + type = map(list(string)) + default = {} +} -variable "cpu_usage_max_delay" { - description = "Enforce max delay for cpu_usage detector (use \"0\" or \"null\" for \"Auto\")" +variable "file_descriptors_aggregation_function" { + description = "Aggregation function and group by for file_descriptors detector (i.e. 
\".mean(by=['host'])\")" + type = string + default = ".max(by=['cluster'])" +} + +variable "file_descriptors_transformation_function" { + description = "Transformation function for file_descriptors detector (i.e. \".mean(over='5m')\")" + type = string + default = "" +} + +variable "file_descriptors_max_delay" { + description = "Enforce max delay for file_descriptors detector (use \"0\" or \"null\" for \"Auto\")" type = number default = null } -variable "cpu_usage_tip" { +variable "file_descriptors_tip" { description = "Suggested first course of action or any note useful for incident handling" type = string default = "" } -variable "cpu_usage_runbook_url" { +variable "file_descriptors_runbook_url" { description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" type = string default = "" } -variable "cpu_usage_disabled" { - description = "Disable all alerting rules for cpu_usage detector" +variable "file_descriptors_disabled" { + description = "Disable all alerting rules for file_descriptors detector" type = bool default = null } -variable "cpu_usage_disabled_critical" { - description = "Disable critical alerting rule for cpu_usage detector" +variable "file_descriptors_disabled_critical" { + description = "Disable critical alerting rule for file_descriptors detector" type = bool default = null } -variable "cpu_usage_disabled_major" { - description = "Disable major alerting rule for cpu_usage detector" +variable "file_descriptors_disabled_major" { + description = "Disable major alerting rule for file_descriptors detector" type = bool default = null } -variable "cpu_usage_notifications" { - description = "Notification recipients list per severity overridden for cpu_usage detector" - type = map(list(string)) - default = {} +variable "file_descriptors_threshold_critical" { + description = "Critical threshold for file_descriptors detector" + type = number + default = 95 } -variable "cpu_usage_aggregation_function" { - 
description = "Aggregation function and group by for cpu_usage detector (i.e. \".mean(by=['host'])\")" +variable "file_descriptors_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = "" + default = "15m" } -variable "cpu_usage_transformation_function" { - description = "Transformation function for cpu_usage detector (i.e. \".mean(over='5m')\")" +variable "file_descriptors_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "file_descriptors_threshold_major" { + description = "Major threshold for file_descriptors detector" + type = number + default = 90 +} + +variable "file_descriptors_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = ".min(over='30m')" + default = "15m" } -variable "cpu_usage_threshold_critical" { - description = "Critical threshold for cpu_usage detector" +variable "file_descriptors_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number - default = 95 + default = 1 } +# jvm_heap_memory_usage detector -variable "cpu_usage_threshold_major" { - description = "Major threshold for cpu_usage detector" - type = number - default = 85 +variable "jvm_heap_memory_usage_notifications" { + description = "Notification recipients list per severity overridden for jvm_heap_memory_usage detector" + type = map(list(string)) + default = {} } -# file_descriptors detector +variable "jvm_heap_memory_usage_aggregation_function" { + description = "Aggregation function and group by for jvm_heap_memory_usage detector (i.e. 
\".mean(by=['host'])\")" + type = string + default = ".max(by=['cluster'])" +} -variable "file_descriptors_max_delay" { - description = "Enforce max delay for file_descriptors detector (use \"0\" or \"null\" for \"Auto\")" +variable "jvm_heap_memory_usage_transformation_function" { + description = "Transformation function for jvm_heap_memory_usage detector (i.e. \".mean(over='5m')\")" + type = string + default = "" +} + +variable "jvm_heap_memory_usage_max_delay" { + description = "Enforce max delay for jvm_heap_memory_usage detector (use \"0\" or \"null\" for \"Auto\")" type = number default = null } -variable "file_descriptors_tip" { +variable "jvm_heap_memory_usage_tip" { description = "Suggested first course of action or any note useful for incident handling" type = string default = "" } -variable "file_descriptors_runbook_url" { +variable "jvm_heap_memory_usage_runbook_url" { description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" type = string default = "" } -variable "file_descriptors_disabled" { - description = "Disable all alerting rules for file_descriptors detector" +variable "jvm_heap_memory_usage_disabled" { + description = "Disable all alerting rules for jvm_heap_memory_usage detector" type = bool default = null } -variable "file_descriptors_disabled_critical" { - description = "Disable critical alerting rule for file_descriptors detector" +variable "jvm_heap_memory_usage_disabled_critical" { + description = "Disable critical alerting rule for jvm_heap_memory_usage detector" type = bool default = null } -variable "file_descriptors_disabled_major" { - description = "Disable major alerting rule for file_descriptors detector" +variable "jvm_heap_memory_usage_disabled_major" { + description = "Disable major alerting rule for jvm_heap_memory_usage detector" type = bool default = null } -variable "file_descriptors_notifications" { - description = "Notification recipients list per severity overridden for 
file_descriptors detector" - type = map(list(string)) - default = {} +variable "jvm_heap_memory_usage_threshold_critical" { + description = "Critical threshold for jvm_heap_memory_usage detector" + type = number + default = 90 } -variable "file_descriptors_aggregation_function" { - description = "Aggregation function and group by for file_descriptors detector (i.e. \".mean(by=['host'])\")" +variable "jvm_heap_memory_usage_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = "" + default = "5m" } -variable "file_descriptors_transformation_function" { - description = "Transformation function for file_descriptors detector (i.e. \".mean(over='5m')\")" +variable "jvm_heap_memory_usage_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "jvm_heap_memory_usage_threshold_major" { + description = "Major threshold for jvm_heap_memory_usage detector" + type = number + default = 80 +} + +variable "jvm_heap_memory_usage_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = ".max(over='15m')" + default = "5m" } -variable "file_descriptors_threshold_critical" { - description = "Critical threshold for file_descriptors detector" +variable "jvm_heap_memory_usage_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number - default = 95 + default = 1 } +# jvm_memory_young_usage detector -variable "file_descriptors_threshold_major" { - description = "Major threshold for file_descriptors detector" - type = number - default = 90 +variable "jvm_memory_young_usage_notifications" { + description = "Notification recipients list per severity overridden for jvm_memory_young_usage detector" + type = 
map(list(string)) + default = {} +} + +variable "jvm_memory_young_usage_aggregation_function" { + description = "Aggregation function and group by for jvm_memory_young_usage detector (i.e. \".mean(by=['host'])\")" + type = string + default = ".max(by=['cluster'])" } -# Jvm_memory_young_usage detector +variable "jvm_memory_young_usage_transformation_function" { + description = "Transformation function for jvm_memory_young_usage detector (i.e. \".mean(over='5m')\")" + type = string + default = "" +} variable "jvm_memory_young_usage_max_delay" { description = "Enforce max delay for jvm_memory_young_usage detector (use \"0\" or \"null\" for \"Auto\")" @@ -614,37 +760,59 @@ variable "jvm_memory_young_usage_disabled_minor" { default = null } -variable "jvm_memory_young_usage_notifications" { - description = "Notification recipients list per severity overridden for jvm_memory_young_usage detector" - type = map(list(string)) - default = {} +variable "jvm_memory_young_usage_threshold_major" { + description = "Major threshold for jvm_memory_young_usage detector" + type = number + default = 90 } -variable "jvm_memory_young_usage_aggregation_function" { - description = "Aggregation function and group by for jvm_memory_young_usage detector (i.e. \".mean(by=['host'])\")" +variable "jvm_memory_young_usage_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = "" + default = "10m" } -variable "jvm_memory_young_usage_transformation_function" { - description = "Transformation function for jvm_memory_young_usage detector (i.e. 
\".mean(over='5m')\")" +variable "jvm_memory_young_usage_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "jvm_memory_young_usage_threshold_minor" { + description = "Minor threshold for jvm_memory_young_usage detector" + type = number + default = 80 +} + +variable "jvm_memory_young_usage_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = ".mean(over='10m')" + default = "10m" } -variable "jvm_memory_young_usage_threshold_major" { - description = "major threshold for jvm_memory_young_usage detector" +variable "jvm_memory_young_usage_at_least_percentage_minor" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number - default = 90 + default = 1 } +# jvm_memory_old_usage detector -variable "jvm_memory_young_usage_threshold_minor" { - description = "minor threshold for jvm_memory_young_usage detector" - type = number - default = 80 +variable "jvm_memory_old_usage_notifications" { + description = "Notification recipients list per severity overridden for jvm_memory_old_usage detector" + type = map(list(string)) + default = {} } -# Jvm_memory_old_usage detector +variable "jvm_memory_old_usage_aggregation_function" { + description = "Aggregation function and group by for jvm_memory_old_usage detector (i.e. \".mean(by=['host'])\")" + type = string + default = ".max(by=['cluster'])" +} + +variable "jvm_memory_old_usage_transformation_function" { + description = "Transformation function for jvm_memory_old_usage detector (i.e. 
\".mean(over='5m')\")" + type = string + default = "" +} variable "jvm_memory_old_usage_max_delay" { description = "Enforce max delay for jvm_memory_old_usage detector (use \"0\" or \"null\" for \"Auto\")" @@ -682,37 +850,59 @@ variable "jvm_memory_old_usage_disabled_minor" { default = null } -variable "jvm_memory_old_usage_notifications" { - description = "Notification recipients list per severity overridden for jvm_memory_old_usage detector" - type = map(list(string)) - default = {} +variable "jvm_memory_old_usage_threshold_major" { + description = "Major threshold for jvm_memory_old_usage detector" + type = number + default = 90 } -variable "jvm_memory_old_usage_aggregation_function" { - description = "Aggregation function and group by for jvm_memory_old_usage detector (i.e. \".mean(by=['host'])\")" +variable "jvm_memory_old_usage_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = "" + default = "10m" } -variable "jvm_memory_old_usage_transformation_function" { - description = "Transformation function for jvm_memory_old_usage detector (i.e. 
\".mean(over='5m')\")" +variable "jvm_memory_old_usage_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "jvm_memory_old_usage_threshold_minor" { + description = "Minor threshold for jvm_memory_old_usage detector" + type = number + default = 80 +} + +variable "jvm_memory_old_usage_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = ".mean(over='10m')" + default = "10m" } -variable "jvm_memory_old_usage_threshold_major" { - description = "major threshold for jvm_memory_old_usage detector" +variable "jvm_memory_old_usage_at_least_percentage_minor" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number - default = 90 + default = 1 } +# jvm_gc_old_collection_latency detector -variable "jvm_memory_old_usage_threshold_minor" { - description = "minor threshold for jvm_memory_old_usage detector" - type = number - default = 80 +variable "jvm_gc_old_collection_latency_notifications" { + description = "Notification recipients list per severity overridden for jvm_gc_old_collection_latency detector" + type = map(list(string)) + default = {} +} + +variable "jvm_gc_old_collection_latency_aggregation_function" { + description = "Aggregation function and group by for jvm_gc_old_collection_latency detector (i.e. \".mean(by=['host'])\")" + type = string + default = ".max(by=['cluster'])" } -# Jvm_gc_old_collection_latency detector +variable "jvm_gc_old_collection_latency_transformation_function" { + description = "Transformation function for jvm_gc_old_collection_latency detector (i.e. 
\".mean(over='5m')\")" + type = string + default = "" +} variable "jvm_gc_old_collection_latency_max_delay" { description = "Enforce max delay for jvm_gc_old_collection_latency detector (use \"0\" or \"null\" for \"Auto\")" @@ -750,37 +940,59 @@ variable "jvm_gc_old_collection_latency_disabled_minor" { default = null } -variable "jvm_gc_old_collection_latency_notifications" { - description = "Notification recipients list per severity overridden for jvm_gc_old_collection_latency detector" - type = map(list(string)) - default = {} +variable "jvm_gc_old_collection_latency_threshold_major" { + description = "Major threshold for jvm_gc_old_collection_latency detector" + type = number + default = 300 } -variable "jvm_gc_old_collection_latency_aggregation_function" { - description = "Aggregation function and group by for jvm_gc_old_collection_latency detector (i.e. \".mean(by=['host'])\")" +variable "jvm_gc_old_collection_latency_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = "" + default = "15m" } -variable "jvm_gc_old_collection_latency_transformation_function" { - description = "Transformation function for jvm_gc_old_collection_latency detector (i.e. 
\".mean(over='5m')\")" +variable "jvm_gc_old_collection_latency_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "jvm_gc_old_collection_latency_threshold_minor" { + description = "Minor threshold for jvm_gc_old_collection_latency detector" + type = number + default = 200 +} + +variable "jvm_gc_old_collection_latency_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = ".mean(over='15m')" + default = "15m" } -variable "jvm_gc_old_collection_latency_threshold_major" { - description = "major threshold for jvm_gc_old_collection_latency detector" +variable "jvm_gc_old_collection_latency_at_least_percentage_minor" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number - default = 300 + default = 1 } +# jvm_gc_young_collection_latency detector -variable "jvm_gc_old_collection_latency_threshold_minor" { - description = "minor threshold for jvm_gc_old_collection_latency detector" - type = number - default = 200 +variable "jvm_gc_young_collection_latency_notifications" { + description = "Notification recipients list per severity overridden for jvm_gc_young_collection_latency detector" + type = map(list(string)) + default = {} +} + +variable "jvm_gc_young_collection_latency_aggregation_function" { + description = "Aggregation function and group by for jvm_gc_young_collection_latency detector (i.e. \".mean(by=['host'])\")" + type = string + default = ".max(by=['cluster'])" } -# Jvm_gc_young_collection_latency detector +variable "jvm_gc_young_collection_latency_transformation_function" { + description = "Transformation function for jvm_gc_young_collection_latency detector (i.e. 
\".mean(over='5m')\")" + type = string + default = "" +} variable "jvm_gc_young_collection_latency_max_delay" { description = "Enforce max delay for jvm_gc_young_collection_latency detector (use \"0\" or \"null\" for \"Auto\")" @@ -812,44 +1024,66 @@ variable "jvm_gc_young_collection_latency_disabled_major" { default = null } -variable "jvm_gc_young_collection_latency_disabled_minor" { - description = "Disable minor alerting rule for jvm_gc_young_collection_latency detector" - type = bool - default = null +variable "jvm_gc_young_collection_latency_disabled_minor" { + description = "Disable minor alerting rule for jvm_gc_young_collection_latency detector" + type = bool + default = null +} + +variable "jvm_gc_young_collection_latency_threshold_major" { + description = "Major threshold for jvm_gc_young_collection_latency detector" + type = number + default = 40 +} + +variable "jvm_gc_young_collection_latency_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "15m" +} + +variable "jvm_gc_young_collection_latency_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "jvm_gc_young_collection_latency_threshold_minor" { + description = "Minor threshold for jvm_gc_young_collection_latency detector" + type = number + default = 20 +} + +variable "jvm_gc_young_collection_latency_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "15m" +} + +variable "jvm_gc_young_collection_latency_at_least_percentage_minor" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 } +# indexing_latency detector -variable "jvm_gc_young_collection_latency_notifications" { - description = "Notification recipients 
list per severity overridden for jvm_gc_young_collection_latency detector" +variable "indexing_latency_notifications" { + description = "Notification recipients list per severity overridden for indexing_latency detector" type = map(list(string)) default = {} } -variable "jvm_gc_young_collection_latency_aggregation_function" { - description = "Aggregation function and group by for jvm_gc_young_collection_latency detector (i.e. \".mean(by=['host'])\")" +variable "indexing_latency_aggregation_function" { + description = "Aggregation function and group by for indexing_latency detector (i.e. \".mean(by=['host'])\")" type = string - default = "" + default = ".max(by=['cluster'])" } -variable "jvm_gc_young_collection_latency_transformation_function" { - description = "Transformation function for jvm_gc_young_collection_latency detector (i.e. \".mean(over='5m')\")" +variable "indexing_latency_transformation_function" { + description = "Transformation function for indexing_latency detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".mean(over='15m')" -} - -variable "jvm_gc_young_collection_latency_threshold_major" { - description = "major threshold for jvm_gc_young_collection_latency detector" - type = number - default = 40 -} - -variable "jvm_gc_young_collection_latency_threshold_minor" { - description = "minor threshold for jvm_gc_young_collection_latency detector" - type = number - default = 20 + default = "" } -# Indexing_latency detector - variable "indexing_latency_max_delay" { description = "Enforce max delay for indexing_latency detector (use \"0\" or \"null\" for \"Auto\")" type = number @@ -886,37 +1120,59 @@ variable "indexing_latency_disabled_minor" { default = null } -variable "indexing_latency_notifications" { - description = "Notification recipients list per severity overridden for indexing_latency detector" - type = map(list(string)) - default = {} +variable "indexing_latency_threshold_major" { + description = "Major threshold for indexing_latency detector" + type = number + default = 30 } -variable "indexing_latency_aggregation_function" { - description = "Aggregation function and group by for indexing_latency detector (i.e. \".mean(by=['host'])\")" +variable "indexing_latency_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = "" + default = "1h" } -variable "indexing_latency_transformation_function" { - description = "Transformation function for indexing_latency detector (i.e. 
\".mean(over='5m')\")" +variable "indexing_latency_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "indexing_latency_threshold_minor" { + description = "Minor threshold for indexing_latency detector" + type = number + default = 15 +} + +variable "indexing_latency_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = ".mean(over='15m')" + default = "1h" } -variable "indexing_latency_threshold_major" { - description = "major threshold for indexing_latency detector" +variable "indexing_latency_at_least_percentage_minor" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number - default = 30 + default = 1 } +# flush_latency detector -variable "indexing_latency_threshold_minor" { - description = "minor threshold for indexing_latency detector" - type = number - default = 15 +variable "flush_latency_notifications" { + description = "Notification recipients list per severity overridden for flush_latency detector" + type = map(list(string)) + default = {} +} + +variable "flush_latency_aggregation_function" { + description = "Aggregation function and group by for flush_latency detector (i.e. \".mean(by=['host'])\")" + type = string + default = ".max(by=['cluster'])" } -# Flush_latency detector +variable "flush_latency_transformation_function" { + description = "Transformation function for flush_latency detector (i.e. 
\".mean(over='5m')\")" + type = string + default = "" +} variable "flush_latency_max_delay" { description = "Enforce max delay for flush_latency detector (use \"0\" or \"null\" for \"Auto\")" @@ -954,37 +1210,59 @@ variable "flush_latency_disabled_minor" { default = null } -variable "flush_latency_notifications" { - description = "Notification recipients list per severity overridden for flush_latency detector" - type = map(list(string)) - default = {} +variable "flush_latency_threshold_major" { + description = "Major threshold for flush_latency detector" + type = number + default = 150 } -variable "flush_latency_aggregation_function" { - description = "Aggregation function and group by for flush_latency detector (i.e. \".mean(by=['host'])\")" +variable "flush_latency_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = "" + default = "15m" } -variable "flush_latency_transformation_function" { - description = "Transformation function for flush_latency detector (i.e. 
\".mean(over='5m')\")" +variable "flush_latency_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "flush_latency_threshold_minor" { + description = "Minor threshold for flush_latency detector" + type = number + default = 100 +} + +variable "flush_latency_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = ".mean(over='15m')" + default = "15m" } -variable "flush_latency_threshold_major" { - description = "major threshold for flush_latency detector" +variable "flush_latency_at_least_percentage_minor" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number - default = 150 + default = 1 } +# search_latency detector -variable "flush_latency_threshold_minor" { - description = "minor threshold for flush_latency detector" - type = number - default = 100 +variable "search_latency_notifications" { + description = "Notification recipients list per severity overridden for search_latency detector" + type = map(list(string)) + default = {} +} + +variable "search_latency_aggregation_function" { + description = "Aggregation function and group by for search_latency detector (i.e. \".mean(by=['host'])\")" + type = string + default = ".max(by=['cluster'])" } -# Search_latency detector +variable "search_latency_transformation_function" { + description = "Transformation function for search_latency detector (i.e. 
\".mean(over='5m')\")" + type = string + default = "" +} variable "search_latency_max_delay" { description = "Enforce max delay for search_latency detector (use \"0\" or \"null\" for \"Auto\")" @@ -1022,37 +1300,59 @@ variable "search_latency_disabled_minor" { default = null } -variable "search_latency_notifications" { - description = "Notification recipients list per severity overridden for search_latency detector" - type = map(list(string)) - default = {} +variable "search_latency_threshold_major" { + description = "Major threshold for search_latency detector" + type = number + default = 20 } -variable "search_latency_aggregation_function" { - description = "Aggregation function and group by for search_latency detector (i.e. \".mean(by=['host'])\")" +variable "search_latency_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = "" + default = "30m" } -variable "search_latency_transformation_function" { - description = "Transformation function for search_latency detector (i.e. 
\".mean(over='5m')\")" +variable "search_latency_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "search_latency_threshold_minor" { + description = "Minor threshold for search_latency detector" + type = number + default = 10 +} + +variable "search_latency_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = ".min(over='30m')" + default = "30m" } -variable "search_latency_threshold_major" { - description = "major threshold for search_latency detector" +variable "search_latency_at_least_percentage_minor" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number - default = 20 + default = 1 } +# fetch_latency detector -variable "search_latency_threshold_minor" { - description = "minor threshold for search_latency detector" - type = number - default = 10 +variable "fetch_latency_notifications" { + description = "Notification recipients list per severity overridden for fetch_latency detector" + type = map(list(string)) + default = {} +} + +variable "fetch_latency_aggregation_function" { + description = "Aggregation function and group by for fetch_latency detector (i.e. \".mean(by=['host'])\")" + type = string + default = ".max(by=['cluster'])" } -# Fetch_latency detector +variable "fetch_latency_transformation_function" { + description = "Transformation function for fetch_latency detector (i.e. 
\".mean(over='5m')\")" + type = string + default = "" +} variable "fetch_latency_max_delay" { description = "Enforce max delay for fetch_latency detector (use \"0\" or \"null\" for \"Auto\")" @@ -1090,37 +1390,59 @@ variable "fetch_latency_disabled_minor" { default = null } -variable "fetch_latency_notifications" { - description = "Notification recipients list per severity overridden for fetch_latency detector" - type = map(list(string)) - default = {} +variable "fetch_latency_threshold_major" { + description = "Major threshold for fetch_latency detector" + type = number + default = 20 } -variable "fetch_latency_aggregation_function" { - description = "Aggregation function and group by for fetch_latency detector (i.e. \".mean(by=['host'])\")" +variable "fetch_latency_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = "" + default = "15m" } -variable "fetch_latency_transformation_function" { - description = "Transformation function for fetch_latency detector (i.e. 
\".mean(over='5m')\")" +variable "fetch_latency_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "fetch_latency_threshold_minor" { + description = "Minor threshold for fetch_latency detector" + type = number + default = 10 +} + +variable "fetch_latency_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = ".min(over='15m')" + default = "15m" } -variable "fetch_latency_threshold_major" { - description = "major threshold for fetch_latency detector" +variable "fetch_latency_at_least_percentage_minor" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number - default = 20 + default = 1 } +# field_data_evictions_change detector -variable "fetch_latency_threshold_minor" { - description = "minor threshold for fetch_latency detector" - type = number - default = 10 +variable "field_data_evictions_change_notifications" { + description = "Notification recipients list per severity overridden for field_data_evictions_change detector" + type = map(list(string)) + default = {} +} + +variable "field_data_evictions_change_aggregation_function" { + description = "Aggregation function and group by for field_data_evictions_change detector (i.e. \".mean(by=['host'])\")" + type = string + default = ".max(by=['cluster'])" } -# Field_data_evictions_change detector +variable "field_data_evictions_change_transformation_function" { + description = "Transformation function for field_data_evictions_change detector (i.e. 
\".mean(over='5m')\")" + type = string + default = "" +} variable "field_data_evictions_change_max_delay" { description = "Enforce max delay for field_data_evictions_change detector (use \"0\" or \"null\" for \"Auto\")" @@ -1158,37 +1480,59 @@ variable "field_data_evictions_change_disabled_minor" { default = null } -variable "field_data_evictions_change_notifications" { - description = "Notification recipients list per severity overridden for field_data_evictions_change detector" - type = map(list(string)) - default = {} +variable "field_data_evictions_change_threshold_major" { + description = "Major threshold for field_data_evictions_change detector" + type = number + default = 120 } -variable "field_data_evictions_change_aggregation_function" { - description = "Aggregation function and group by for field_data_evictions_change detector (i.e. \".mean(by=['host'])\")" +variable "field_data_evictions_change_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = "" + default = "15m" } -variable "field_data_evictions_change_transformation_function" { - description = "Transformation function for field_data_evictions_change detector (i.e. 
\".mean(over='5m')\")" +variable "field_data_evictions_change_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "field_data_evictions_change_threshold_minor" { + description = "Minor threshold for field_data_evictions_change detector" + type = number + default = 60 +} + +variable "field_data_evictions_change_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = ".mean(over='15m')" + default = "15m" } -variable "field_data_evictions_change_threshold_major" { - description = "major threshold for field_data_evictions_change detector" +variable "field_data_evictions_change_at_least_percentage_minor" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number - default = 120 + default = 1 } +# task_time_in_queue_change detector -variable "field_data_evictions_change_threshold_minor" { - description = "minor threshold for field_data_evictions_change detector" - type = number - default = 60 +variable "task_time_in_queue_change_notifications" { + description = "Notification recipients list per severity overridden for task_time_in_queue_change detector" + type = map(list(string)) + default = {} +} + +variable "task_time_in_queue_change_aggregation_function" { + description = "Aggregation function and group by for task_time_in_queue_change detector (i.e. \".mean(by=['host'])\")" + type = string + default = ".max(by=['cluster'])" } -# Task_time_in_queue_change detector +variable "task_time_in_queue_change_transformation_function" { + description = "Transformation function for task_time_in_queue_change detector (i.e. 
\".mean(over='5m')\")" + type = string + default = "" +} variable "task_time_in_queue_change_max_delay" { description = "Enforce max delay for task_time_in_queue_change detector (use \"0\" or \"null\" for \"Auto\")" @@ -1226,33 +1570,37 @@ variable "task_time_in_queue_change_disabled_minor" { default = null } -variable "task_time_in_queue_change_notifications" { - description = "Notification recipients list per severity overridden for task_time_in_queue_change detector" - type = map(list(string)) - default = {} -} - -variable "task_time_in_queue_change_aggregation_function" { - description = "Aggregation function and group by for task_time_in_queue_change detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" +variable "task_time_in_queue_change_threshold_major" { + description = "Major threshold for task_time_in_queue_change detector" + type = number + default = 200 } -variable "task_time_in_queue_change_transformation_function" { - description = "Transformation function for task_time_in_queue_change detector (i.e. 
\".mean(over='5m')\")" +variable "task_time_in_queue_change_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = ".mean(over='15m')" + default = "15m" } -variable "task_time_in_queue_change_threshold_major" { - description = "major threshold for task_time_in_queue_change detector" +variable "task_time_in_queue_change_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number - default = 200 + default = 1 } - variable "task_time_in_queue_change_threshold_minor" { - description = "minor threshold for task_time_in_queue_change detector" + description = "Minor threshold for task_time_in_queue_change detector" type = number default = 100 } +variable "task_time_in_queue_change_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "15m" +} + +variable "task_time_in_queue_change_at_least_percentage_minor" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +}