Skip to content

Commit a6de7f8

Browse files
authored
[system/{diskio,network}]: Add alerting rule templates (#15998)
1 parent af0d889 commit a6de7f8

File tree

7 files changed

+129
-3
lines changed

7 files changed

+129
-3
lines changed

packages/system/changelog.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
11
# newer versions go on top
2+
- version: "2.8.0"
3+
changes:
4+
- description: Add alerting rule templates (for diskio and network datastreams)
5+
type: enhancement
6+
link: https://github.com/elastic/integrations/pull/15998
27
- version: "2.7.2"
38
changes:
49
- description: Fixed parsing of SidList field in Windows Security event 4908 (Special Groups Logon table modified) by normalizing whitespace separators.
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
{
2+
"id": "system-disk-io-saturation",
3+
"type": "alerting_rule_template",
4+
"attributes": {
5+
"name": "[System] Disk I/O Saturation",
6+
"tags": [
7+
"System"
8+
],
9+
"ruleTypeId": ".es-query",
10+
"schedule": {
11+
"interval": "1m"
12+
},
13+
"params": {
14+
"searchType": "esqlQuery",
15+
"timeWindowSize": 5,
16+
"timeWindowUnit": "m",
17+
"esqlQuery": {
18+
"esql": "// Alert when disk devices show sustained saturation via utilization and queue depth metrics.\n// Groups by host/device, enforces minimum work, and uses Little's Law to estimate queue depth.\n// Adjust utilization or queue depth thresholds in the final WHERE clause as needed.\n// RLIKE must match the entire value, so the device filter is wrapped in .* to exclude any name containing loop, ram, sr or dm-.\nFROM metrics-system.diskio-*\n| WHERE `system.diskio.name` NOT RLIKE \".*(loop|ram|sr|dm-).*\"\n| STATS\nmax_io_time = MAX(`system.diskio.io.time`::LONG),\nmin_io_time = MIN(`system.diskio.io.time`::LONG),\nmax_read_time = MAX(`system.diskio.read.time`::LONG),\nmin_read_time = MIN(`system.diskio.read.time`::LONG),\nmax_write_time = MAX(`system.diskio.write.time`::LONG),\nmin_write_time = MIN(`system.diskio.write.time`::LONG),\nmax_read_ops = MAX(`system.diskio.read.count`::LONG),\nmin_read_ops = MIN(`system.diskio.read.count`::LONG),\nmax_write_ops = MAX(`system.diskio.write.count`::LONG),\nmin_write_ops = MIN(`system.diskio.write.count`::LONG),\nfirst_ts = MIN(@timestamp),\nlast_ts = MAX(@timestamp)\nBY `host.name`, `system.diskio.name`\n| EVAL elapsed_ms = TO_DOUBLE(TO_LONG(last_ts) - TO_LONG(first_ts))\n| WHERE elapsed_ms >= 60000\n| EVAL io_time_delta = CASE(max_io_time >= min_io_time, max_io_time - min_io_time, 0)\n| EVAL read_time_delta = CASE(max_read_time >= min_read_time, max_read_time - min_read_time, 0)\n| EVAL write_time_delta = CASE(max_write_time >= min_write_time, max_write_time - min_write_time, 0)\n| EVAL read_ops_delta = CASE(max_read_ops >= min_read_ops, max_read_ops - min_read_ops, 0)\n| EVAL write_ops_delta = CASE(max_write_ops >= min_write_ops, max_write_ops - min_write_ops, 0)\n| EVAL total_ops = read_ops_delta + write_ops_delta\n| WHERE total_ops >= 100\n| EVAL io_util_pct = CASE(elapsed_ms > 0, 100.0 * io_time_delta / elapsed_ms, 0.0)\n| EVAL avg_iops = CASE(elapsed_ms > 0, total_ops / (elapsed_ms / 1000.0), 0.0)\n| EVAL total_io_time_ms = read_time_delta + write_time_delta\n| EVAL avg_latency_ms = CASE(\ntotal_ops > 0 AND total_io_time_ms > 0,\ntotal_io_time_ms / TO_DOUBLE(total_ops),\n0\n)\n| EVAL est_queue_depth = CASE(\navg_latency_ms > 0 AND avg_iops > 0,\n(avg_iops * avg_latency_ms) / 1000,\n0\n)\n| WHERE io_util_pct > 90\nOR est_queue_depth > 10\nOR (io_util_pct > 70 AND est_queue_depth > 5)"
19+
},
20+
"groupBy": "row",
21+
"timeField": "@timestamp"
22+
},
23+
"alertDelay": {
24+
"active": 3
25+
}
26+
},
27+
"managed": true,
28+
"coreMigrationVersion": "8.8.0",
29+
"typeMigrationVersion": "10.1.0"
30+
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
{
2+
"id": "system-high-disk-io-latency",
3+
"type": "alerting_rule_template",
4+
"attributes": {
5+
"name": "[System] High Disk I/O Latency",
6+
"tags": [
7+
"System"
8+
],
9+
"ruleTypeId": ".es-query",
10+
"schedule": {
11+
"interval": "1m"
12+
},
13+
"params": {
14+
"searchType": "esqlQuery",
15+
"timeWindowSize": 5,
16+
"timeWindowUnit": "m",
17+
"esqlQuery": {
18+
"esql": "// Alert when average disk I/O latency exceeds storage-type thresholds within the configured lookback window.\n// Groups by host and device, excludes loop/ram/sr/dm devices, and clamps counter resets.\n// Adjust latency thresholds in the final WHERE clause as needed.\n// RLIKE must match the entire value and has no ^ anchor, so patterns carry explicit .* wildcards (contains for the device filter, prefix for nvme).\nFROM metrics-system.diskio-*\n| WHERE `system.diskio.name` NOT RLIKE \".*(loop|ram|sr|dm-).*\"\n| STATS\nmax_read_time = MAX(`system.diskio.read.time`::LONG),\nmin_read_time = MIN(`system.diskio.read.time`::LONG),\nmax_write_time = MAX(`system.diskio.write.time`::LONG),\nmin_write_time = MIN(`system.diskio.write.time`::LONG),\nmax_read_ops = MAX(`system.diskio.read.count`::LONG),\nmin_read_ops = MIN(`system.diskio.read.count`::LONG),\nmax_write_ops = MAX(`system.diskio.write.count`::LONG),\nmin_write_ops = MIN(`system.diskio.write.count`::LONG),\nfirst_ts = MIN(@timestamp),\nlast_ts = MAX(@timestamp)\nBY `host.name`, `system.diskio.name`\n| EVAL elapsed_ms = TO_DOUBLE(TO_LONG(last_ts) - TO_LONG(first_ts))\n| WHERE elapsed_ms >= 60000\n| EVAL read_time_delta = CASE(max_read_time >= min_read_time, max_read_time - min_read_time, 0)\n| EVAL write_time_delta = CASE(max_write_time >= min_write_time, max_write_time - min_write_time, 0)\n| EVAL read_ops_delta = CASE(max_read_ops >= min_read_ops, max_read_ops - min_read_ops, 0)\n| EVAL write_ops_delta = CASE(max_write_ops >= min_write_ops, max_write_ops - min_write_ops, 0)\n| EVAL total_ops = read_ops_delta + write_ops_delta\n| WHERE total_ops >= 100\n| EVAL io_time_ms = read_time_delta + write_time_delta\n| WHERE io_time_ms > 0\n| EVAL avg_latency_ms = io_time_ms / TO_DOUBLE(total_ops)\n| EVAL avg_iops = CASE(elapsed_ms > 0, total_ops / (elapsed_ms / 1000.0), 0.0)\n| EVAL device_type = CASE(\n`system.diskio.name` RLIKE \"nvme[0-9].*\", \"ssd\",\navg_latency_ms < 3 AND avg_iops > 100, \"ssd\",\navg_latency_ms > 15, \"hdd\",\n\"unknown\"\n)\n| WHERE (device_type == \"ssd\" AND avg_latency_ms > 5)\nOR (device_type == \"hdd\" AND avg_latency_ms > 30)\nOR (device_type == \"unknown\" AND avg_latency_ms > 10)"
19+
},
20+
"groupBy": "row",
21+
"timeField": "@timestamp"
22+
},
23+
"alertDelay": {
24+
"active": 3
25+
}
26+
},
27+
"managed": true,
28+
"coreMigrationVersion": "8.8.0",
29+
"typeMigrationVersion": "10.1.0"
30+
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
{
2+
"id": "system-high-network-error-rate",
3+
"type": "alerting_rule_template",
4+
"attributes": {
5+
"name": "[System] High Network Error Rate",
6+
"tags": [
7+
"System"
8+
],
9+
"ruleTypeId": ".es-query",
10+
"schedule": {
11+
"interval": "1m"
12+
},
13+
"params": {
14+
"searchType": "esqlQuery",
15+
"timeWindowSize": 5,
16+
"timeWindowUnit": "m",
17+
"esqlQuery": {
18+
"esql": "// Alert when network interfaces show sustained error rates (>5 errors/sec or >0.5% of packets).\n// Excludes loopback/container interfaces, enforces minimum traffic, and clamps counter resets.\n// Tune thresholds in the final WHERE clause for your environment.\n// RLIKE must match the entire value and has no ^ anchor, so the interface filter ends in .* to act as a prefix match.\nFROM metrics-system.network-*\n| WHERE `system.network.name` NOT RLIKE \"(lo|docker|veth|br-).*\"\n| STATS\n max_in_errors = MAX(`system.network.in.errors`::LONG),\n min_in_errors = MIN(`system.network.in.errors`::LONG),\n max_out_errors = MAX(`system.network.out.errors`::LONG),\n min_out_errors = MIN(`system.network.out.errors`::LONG),\n max_in_packets = MAX(`system.network.in.packets`::LONG),\n min_in_packets = MIN(`system.network.in.packets`::LONG),\n max_out_packets = MAX(`system.network.out.packets`::LONG),\n min_out_packets = MIN(`system.network.out.packets`::LONG),\n first_ts = MIN(@timestamp),\n last_ts = MAX(@timestamp)\n BY `host.name`, `system.network.name`\n| EVAL elapsed_ms = TO_DOUBLE(TO_LONG(last_ts) - TO_LONG(first_ts))\n| EVAL elapsed_sec = CASE(elapsed_ms > 0, elapsed_ms / 1000.0, 0.0)\n| WHERE elapsed_sec >= 60\n| EVAL in_error_delta = CASE(max_in_errors >= min_in_errors, max_in_errors - min_in_errors, 0)\n| EVAL out_error_delta = CASE(max_out_errors >= min_out_errors, max_out_errors - min_out_errors, 0)\n| EVAL in_packet_delta = CASE(max_in_packets >= min_in_packets, max_in_packets - min_in_packets, 0)\n| EVAL out_packet_delta = CASE(max_out_packets >= min_out_packets, max_out_packets - min_out_packets, 0)\n| EVAL total_packets = in_packet_delta + out_packet_delta\n| WHERE total_packets >= 1000\n| EVAL in_error_rate = CASE(elapsed_sec > 0, in_error_delta / elapsed_sec, 0.0)\n| EVAL out_error_rate = CASE(elapsed_sec > 0, out_error_delta / elapsed_sec, 0.0)\n| EVAL in_error_pct = CASE(in_packet_delta > 100, 100.0 * in_error_delta / TO_DOUBLE(in_packet_delta), 0.0)\n| EVAL out_error_pct = CASE(out_packet_delta > 100, 100.0 * out_error_delta / TO_DOUBLE(out_packet_delta), 0.0)\n| WHERE in_error_rate > 5\n OR out_error_rate > 5\n OR in_error_pct > 0.5\n OR out_error_pct > 0.5"
19+
},
20+
"groupBy": "row",
21+
"timeField": "@timestamp"
22+
},
23+
"alertDelay": {
24+
"active": 3
25+
}
26+
},
27+
"managed": true,
28+
"coreMigrationVersion": "8.8.0",
29+
"typeMigrationVersion": "10.1.0"
30+
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
{
2+
"id": "system-high-packet-drop-rate",
3+
"type": "alerting_rule_template",
4+
"attributes": {
5+
"name": "[System] High Packet Drop Rate",
6+
"tags": [
7+
"System"
8+
],
9+
"ruleTypeId": ".es-query",
10+
"schedule": {
11+
"interval": "1m"
12+
},
13+
"params": {
14+
"searchType": "esqlQuery",
15+
"timeWindowSize": 5,
16+
"timeWindowUnit": "m",
17+
"esqlQuery": {
18+
"esql": "// Alert when interfaces drop >25 packets/sec or more than 1% of traffic over the configured time window.\n// Excludes loopback/container interfaces, requires minimum packet volume, and guards against counter resets.\n// Adjust absolute or percentage thresholds in the final WHERE clause as needed.\n// RLIKE must match the entire value and has no ^ anchor, so the interface filter ends in .* to act as a prefix match.\nFROM metrics-system.network-*\n| WHERE `system.network.name` NOT RLIKE \"(lo|docker|veth|br-).*\"\n| STATS\nmax_in_dropped = MAX(`system.network.in.dropped`::LONG),\nmin_in_dropped = MIN(`system.network.in.dropped`::LONG),\nmax_out_dropped = MAX(`system.network.out.dropped`::LONG),\nmin_out_dropped = MIN(`system.network.out.dropped`::LONG),\nmax_in_packets = MAX(`system.network.in.packets`::LONG),\nmin_in_packets = MIN(`system.network.in.packets`::LONG),\nmax_out_packets = MAX(`system.network.out.packets`::LONG),\nmin_out_packets = MIN(`system.network.out.packets`::LONG),\nfirst_ts = MIN(@timestamp),\nlast_ts = MAX(@timestamp)\nBY `host.name`, `system.network.name`\n| EVAL elapsed_ms = TO_DOUBLE(TO_LONG(last_ts) - TO_LONG(first_ts))\n| EVAL elapsed_sec = CASE(elapsed_ms > 0, elapsed_ms / 1000.0, 0.0)\n| WHERE elapsed_sec >= 60\n| EVAL in_drop_delta = CASE(max_in_dropped >= min_in_dropped, max_in_dropped - min_in_dropped, 0)\n| EVAL out_drop_delta = CASE(max_out_dropped >= min_out_dropped, max_out_dropped - min_out_dropped, 0)\n| EVAL in_packet_delta = CASE(max_in_packets >= min_in_packets, max_in_packets - min_in_packets, 0)\n| EVAL out_packet_delta = CASE(max_out_packets >= min_out_packets, max_out_packets - min_out_packets, 0)\n| EVAL total_packets = in_packet_delta + out_packet_delta\n| WHERE total_packets >= 1000\n| EVAL in_drop_rate = CASE(elapsed_sec > 0, in_drop_delta / elapsed_sec, 0.0)\n| EVAL out_drop_rate = CASE(elapsed_sec > 0, out_drop_delta / elapsed_sec, 0.0)\n| EVAL in_drop_pct = CASE(in_packet_delta > 100, 100.0 * in_drop_delta / TO_DOUBLE(in_packet_delta), 0.0)\n| EVAL out_drop_pct = CASE(out_packet_delta > 100, 100.0 * out_drop_delta / TO_DOUBLE(out_packet_delta), 0.0)\n| WHERE in_drop_rate > 25\nOR out_drop_rate > 25\nOR in_drop_pct > 1\nOR out_drop_pct > 1"
19+
},
20+
"groupBy": "row",
21+
"timeField": "@timestamp"
22+
},
23+
"alertDelay": {
24+
"active": 3
25+
}
26+
},
27+
"managed": true,
28+
"coreMigrationVersion": "8.8.0",
29+
"typeMigrationVersion": "10.1.0"
30+
}

packages/system/manifest.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
format_version: 3.0.2
1+
format_version: 3.4.0
22
name: system
33
title: System
4-
version: "2.7.2"
4+
version: "2.8.0"
55
description: Collect system logs and metrics from your servers with Elastic Agent.
66
type: integration
77
categories:
@@ -12,7 +12,7 @@ categories:
1212
- observability
1313
conditions:
1414
kibana:
15-
version: "^9.2.0"
15+
version: "^9.2.1"
1616
screenshots:
1717
- src: /img/policy-and-object-monitoring-dashboard.png
1818
title: policy and object monitoring

packages/system/validation.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@ errors:
22
exclude_checks:
33
- SVR00002 # expected filter in dashboard.
44
- SVR00004 # references found in dashboard.
5+
- JSE00001

0 commit comments

Comments
 (0)