From f6b54970f556da452eac526500c1c352c99be17b Mon Sep 17 00:00:00 2001 From: Richard Klose Date: Fri, 21 Feb 2025 15:29:06 +0100 Subject: [PATCH 1/2] feat: add support for _health_report In elasticsearch 8.7 a new endpoint for cluster health has been added. See https://www.elastic.co/docs/api/doc/elasticsearch/v8/operation/operation-health-report Signed-off-by: Richard Klose --- README.md | 26 ++ collector/health_report.go | 472 +++++++++++++++++++++++++++++++ collector/health_report_test.go | 169 +++++++++++ fixtures/healthreport/8.7.0.json | 111 ++++++++ 4 files changed, 778 insertions(+) create mode 100644 collector/health_report.go create mode 100644 collector/health_report_test.go create mode 100644 fixtures/healthreport/8.7.0.json diff --git a/README.md b/README.md index 3ae31ed8..9cc6f173 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,7 @@ elasticsearch_exporter --help | es.ilm | 1.6.0 | If true, query index lifecycle policies for indices in the cluster. | es.shards | 1.0.3rc1 | If true, query stats for all indices in the cluster, including shard-level stats (implies `es.indices=true`). | false | | collector.snapshots | 1.0.4rc1 | If true, query stats for the cluster snapshots. (As of v1.7.0, this flag has replaced "es.snapshots"). | false | +| collector.health-report | 1.9.0 | If true, query the health report (requires elasticsearch 8.7.0 or later) | false | | es.slm | | If true, query stats for SLM. | false | | es.data_stream | | If true, query state for Data Steams. | false | | es.timeout | 1.0.2 | Timeout for trying to get stats from Elasticsearch. 
(ex: 20s) | 5s | @@ -270,6 +271,31 @@ Further Information | elasticsearch_data_stream_stats_json_parse_failures | counter | 0 | Number of parsing failures for Data Stream stats | | elasticsearch_data_stream_backing_indices_total | gauge | 1 | Number of backing indices for Data Stream | | elasticsearch_data_stream_store_size_bytes | gauge | 1 | Current size of data stream backing indices in bytes | +| elasticsearch_health_report_creating_primaries | gauge | 1 | The number of creating primary shards | +| elasticsearch_health_report_creating_replicas | gauge | 1 | The number of creating replica shards | +| elasticsearch_health_report_data_stream_lifecycle_status | gauge | 1 | Data stream lifecycle status | +| elasticsearch_health_report_disk_status | gauge | 3 | disk status | +| elasticsearch_health_report_ilm_policies | gauge | 1 | The number of ILM Policies | +| elasticsearch_health_report_ilm_stagnating_indices | gauge | 1 | The number of stagnating indices | +| elasticsearch_health_report_ilm_status | gauge | 3 | ILM status | +| elasticsearch_health_report_initializing_primaries | gauge | 1 | The number of initializing primary shards | +| elasticsearch_health_report_initializing_replicas | gauge | 1 | The number of initializing replica shards | +| elasticsearch_health_report_master_is_stable_status | gauge | 3 | Master is stable status | +| elasticsearch_health_report_max_shards_in_cluster_data | gauge | 1 | The number of maximum shards in a cluster | +| elasticsearch_health_report_max_shards_in_cluster_frozen | gauge | 1 | The number of maximum frozen shards in a cluster | +| elasticsearch_health_report_repository_integrity_status | gauge | 3 | Repository integrity status | +| elasticsearch_health_report_restarting_primaries | gauge | 1 | The number of restarting primary shards | +| elasticsearch_health_report_restarting_replicas | gauge | 1 | The number of restarting replica shards | +| elasticsearch_health_report_shards_availabilty_status | gauge | 3 | Shards 
availabilty status | +| elasticsearch_health_report_shards_capacity_status | gauge | 3 | Shards capacity status | +| elasticsearch_health_report_slm_policies | gauge | 1 | The number of SLM policies | +| elasticsearch_health_report_slm_status | gauge | 3 | SLM status | +| elasticsearch_health_report_started_primaries | gauge | 1 | The number of started primary shards | +| elasticsearch_health_report_started_replicas | gauge | 1 | The number of started replica shards | +| elasticsearch_health_report_status | gauge | 3 | Overall cluster status | +| elasticsearch_health_report_total_repositories | gauge | 1 | The number of snapshot repositories | +| elasticsearch_health_report_unassigned_primaries | gauge | 1 | The number of unassigned primary shards | +| elasticsearch_health_report_unassigned_replicas | gauge | 1 | The number of unassigned replica shards | ### Alerts & Recording Rules diff --git a/collector/health_report.go b/collector/health_report.go new file mode 100644 index 00000000..4933d98c --- /dev/null +++ b/collector/health_report.go @@ -0,0 +1,472 @@ +// Copyright 2025 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +package collector + +import ( + "context" + "encoding/json" + "log/slog" + "net/http" + "net/url" + + "github.com/prometheus/client_golang/prometheus" +) + +var ( + statusColors = []string{"green", "yellow", "red"} + defaultHealthReportLabels = []string{"cluster"} +) + +var ( + healthReportTotalRepositories = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "total_repositories"), + "The number of snapshot repositories", + defaultHealthReportLabels, nil, + ) + healthReportMaxShardsInClusterData = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "max_shards_in_cluster_data"), + "The number of maximum shards in a cluster", + defaultHealthReportLabels, nil, + ) + healthReportMaxShardsInClusterFrozen = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "max_shards_in_cluster_frozen"), + "The number of maximum frozen shards in a cluster", + defaultHealthReportLabels, nil, + ) + healthReportRestartingReplicas = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "restarting_replicas"), + "The number of restarting replica shards", + defaultHealthReportLabels, nil, + ) + healthReportCreatingPrimaries = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "creating_primaries"), + "The number of creating primary shards", + defaultHealthReportLabels, nil, + ) + healthReportInitializingReplicas = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "initializing_replicas"), + "The number of initializing replica shards", + defaultHealthReportLabels, nil, + ) + healthReportUnassignedReplicas = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "unassigned_replicas"), + "The number of unassigned replica shards", + defaultHealthReportLabels, nil, + ) + healthReportStartedPrimaries = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "started_primaries"), + "The number of started primary shards", + 
defaultHealthReportLabels, nil, + ) + healthReportRestartingPrimaries = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "restarting_primaries"), + "The number of restarting primary shards", + defaultHealthReportLabels, nil, + ) + healthReportInitializingPrimaries = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "initializing_primaries"), + "The number of initializing primary shards", + defaultHealthReportLabels, nil, + ) + healthReportCreatingReplicas = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "creating_replicas"), + "The number of creating replica shards", + defaultHealthReportLabels, nil, + ) + healthReportStartedReplicas = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "started_replicas"), + "The number of started replica shards", + defaultHealthReportLabels, nil, + ) + healthReportUnassignedPrimaries = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "unassigned_primaries"), + "The number of unassigned primary shards", + defaultHealthReportLabels, nil, + ) + healthReportSlmPolicies = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "slm_policies"), + "The number of SLM policies", + defaultHealthReportLabels, nil, + ) + healthReportIlmPolicies = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "ilm_policies"), + "The number of ILM Policies", + defaultHealthReportLabels, nil, + ) + healthReportIlmStagnatingIndices = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "ilm_stagnating_indices"), + "The number of stagnating indices", + defaultHealthReportLabels, nil, + ) + healthReportStatus = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "status"), + "Overall cluster status", + []string{"cluster", "color"}, nil, + ) + healthReportMasterIsStableStatus = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", 
"master_is_stable_status"), + "Master is stable status", + []string{"cluster", "color"}, nil, + ) + healthReportRepositoryIntegrityStatus = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "repository_integrity_status"), + "Repository integrity status", + []string{"cluster", "color"}, nil, + ) + healthReportDiskStatus = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "disk_status"), + "Disk status", + []string{"cluster", "color"}, nil, + ) + healthReportShardsCapacityStatus = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "shards_capacity_status"), + "Shards capacity status", + []string{"cluster", "color"}, nil, + ) + healthReportShardsAvailabiltystatus = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "shards_availabilty_status"), + "Shards availabilty status", + []string{"cluster", "color"}, nil, + ) + healthReportDataStreamLifecycleStatus = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "data_stream_lifecycle_status"), + "Data stream lifecycle status", + []string{"cluster", "color"}, nil, + ) + healthReportSlmStatus = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "slm_status"), + "SLM status", + []string{"cluster", "color"}, nil, + ) + healthReportIlmStatus = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "health_report", "ilm_status"), + "ILM status", + []string{"cluster", "color"}, nil, + ) +) + +func init() { + registerCollector("health-report", defaultDisabled, NewHealthReport) +} + +type HealthReport struct { + logger *slog.Logger + client *http.Client + url *url.URL +} + +func NewHealthReport(logger *slog.Logger, url *url.URL, client *http.Client) (Collector, error) { + return &HealthReport{ + logger: logger, + client: client, + url: url, + }, nil +} + +type HealthReportResponse struct { + ClusterName string `json:"cluster_name"` + Status string `json:"status"` + Indicators HealthReportIndicators 
`json:"indicators"` +} + +type HealthReportIndicators struct { + MasterIsStable HealthReportMasterIsStable `json:"master_is_stable"` + RepositoryIntegrity HealthReportRepositoryIntegrity `json:"repository_integrity"` + Disk HealthReportDisk `json:"disk"` + ShardsCapacity HealthReportShardsCapacity `json:"shards_capacity"` + ShardsAvailability HealthReportShardsAvailability `json:"shards_availability"` + DataStreamLifecycle HealthReportDataStreamLifecycle `json:"data_stream_lifecycle"` + Slm HealthReportSlm `json:"slm"` + Ilm HealthReportIlm `json:"ilm"` +} + +type HealthReportMasterIsStable struct { + Status string `json:"status"` + Symptom string `json:"symptom"` + Details HealthReportMasterIsStableDetails `json:"details"` +} + +type HealthReportMasterIsStableDetails struct { + CurrentMaster HealthReportMasterIsStableDetailsNode `json:"current_master"` + RecentMasters []HealthReportMasterIsStableDetailsNode `json:"recent_masters"` +} + +type HealthReportMasterIsStableDetailsNode struct { + NodeID string `json:"node_id"` + Name string `json:"name"` +} + +type HealthReportRepositoryIntegrity struct { + Status string `json:"status"` + Symptom string `json:"symptom"` + Details HealthReportRepositoriyIntegrityDetails `json:"details"` +} + +type HealthReportRepositoriyIntegrityDetails struct { + TotalRepositories int `json:"total_repositories"` +} + +type HealthReportDisk struct { + Status string `json:"status"` + Symptom string `json:"symptom"` + Details HealthReportDiskDetails `json:"details"` +} + +type HealthReportDiskDetails struct { + IndicesWithReadonlyBlock int `json:"indices_with_readonly_block"` + NodesWithEnoughDiskSpace int `json:"nodes_with_enough_disk_space"` + NodesWithUnknownDiskStatus int `json:"nodes_with_unknown_disk_status"` + NodesOverHighWatermark int `json:"nodes_over_high_watermark"` + NodesOverFloodStageWatermark int `json:"nodes_over_flood_stage_watermark"` +} + +type HealthReportShardsCapacity struct { + Status string `json:"status"` + Symptom 
string `json:"symptom"` + Details HealthReportShardsCapacityDetails `json:"details"` +} + +type HealthReportShardsCapacityDetails struct { + Data HealthReportShardsCapacityDetailsMaxShards `json:"data"` + Frozen HealthReportShardsCapacityDetailsMaxShards `json:"frozen"` +} + +type HealthReportShardsCapacityDetailsMaxShards struct { + MaxShardsInCluster int `json:"max_shards_in_cluster"` +} + +type HealthReportShardsAvailability struct { + Status string `json:"status"` + Symptom string `json:"symptom"` + Details HealthReportShardsAvailabilityDetails `json:"details"` +} + +type HealthReportShardsAvailabilityDetails struct { + RestartingReplicas int `json:"restarting_replicas"` + CreatingPrimaries int `json:"creating_primaries"` + InitializingReplicas int `json:"initializing_replicas"` + UnassignedReplicas int `json:"unassigned_replicas"` + StartedPrimaries int `json:"started_primaries"` + RestartingPrimaries int `json:"restarting_primaries"` + InitializingPrimaries int `json:"initializing_primaries"` + CreatingReplicas int `json:"creating_replicas"` + StartedReplicas int `json:"started_replicas"` + UnassignedPrimaries int `json:"unassigned_primaries"` +} + +type HealthReportDataStreamLifecycle struct { + Status string `json:"status"` + Symptom string `json:"symptom"` +} + +type HealthReportSlm struct { + Status string `json:"status"` + Symptom string `json:"symptom"` + Details HealthReportSlmDetails `json:"details"` +} + +type HealthReportSlmDetails struct { + SlmStatus string `json:"slm_status"` + Policies int `json:"policies"` +} + +type HealthReportIlm struct { + Status string `json:"status"` + Symptom string `json:"symptom"` + Details HealthReportIlmDetails `json:"details"` +} + +type HealthReportIlmDetails struct { + Policies int `json:"policies"` + StagnatingIndices int `json:"stagnating_indices"` + IlmStatus string `json:"ilm_status"` +} + +func statusValue(value string, color string) float64 { + if value == color { + return 1 + } + return 0 +} + +func (c 
*HealthReport) Update(ctx context.Context, ch chan<- prometheus.Metric) error { + u := c.url.ResolveReference(&url.URL{Path: "/_health_report"}) + var healthReportResponse HealthReportResponse + + resp, err := getURL(ctx, c.client, c.logger, u.String()) + if err != nil { + return err + } + + err = json.Unmarshal(resp, &healthReportResponse) + if err != nil { + return err + } + + ch <- prometheus.MustNewConstMetric( + healthReportTotalRepositories, + prometheus.GaugeValue, + float64(healthReportResponse.Indicators.RepositoryIntegrity.Details.TotalRepositories), + healthReportResponse.ClusterName, + ) + ch <- prometheus.MustNewConstMetric( + healthReportMaxShardsInClusterData, + prometheus.GaugeValue, + float64(healthReportResponse.Indicators.ShardsCapacity.Details.Data.MaxShardsInCluster), + healthReportResponse.ClusterName, + ) + ch <- prometheus.MustNewConstMetric( + healthReportMaxShardsInClusterFrozen, + prometheus.GaugeValue, + float64(healthReportResponse.Indicators.ShardsCapacity.Details.Frozen.MaxShardsInCluster), + healthReportResponse.ClusterName, + ) + ch <- prometheus.MustNewConstMetric( + healthReportRestartingReplicas, + prometheus.GaugeValue, + float64(healthReportResponse.Indicators.ShardsAvailability.Details.RestartingReplicas), + healthReportResponse.ClusterName, + ) + ch <- prometheus.MustNewConstMetric( + healthReportCreatingPrimaries, + prometheus.GaugeValue, + float64(healthReportResponse.Indicators.ShardsAvailability.Details.CreatingPrimaries), + healthReportResponse.ClusterName, + ) + ch <- prometheus.MustNewConstMetric( + healthReportInitializingReplicas, + prometheus.GaugeValue, + float64(healthReportResponse.Indicators.ShardsAvailability.Details.InitializingReplicas), + healthReportResponse.ClusterName, + ) + ch <- prometheus.MustNewConstMetric( + healthReportUnassignedReplicas, + prometheus.GaugeValue, + float64(healthReportResponse.Indicators.ShardsAvailability.Details.UnassignedReplicas), + healthReportResponse.ClusterName, + ) + ch <- 
prometheus.MustNewConstMetric( + healthReportStartedPrimaries, + prometheus.GaugeValue, + float64(healthReportResponse.Indicators.ShardsAvailability.Details.StartedPrimaries), + healthReportResponse.ClusterName, + ) + ch <- prometheus.MustNewConstMetric( + healthReportRestartingPrimaries, + prometheus.GaugeValue, + float64(healthReportResponse.Indicators.ShardsAvailability.Details.RestartingPrimaries), + healthReportResponse.ClusterName, + ) + ch <- prometheus.MustNewConstMetric( + healthReportInitializingPrimaries, + prometheus.GaugeValue, + float64(healthReportResponse.Indicators.ShardsAvailability.Details.InitializingPrimaries), + healthReportResponse.ClusterName, + ) + ch <- prometheus.MustNewConstMetric( + healthReportCreatingReplicas, + prometheus.GaugeValue, + float64(healthReportResponse.Indicators.ShardsAvailability.Details.CreatingReplicas), + healthReportResponse.ClusterName, + ) + ch <- prometheus.MustNewConstMetric( + healthReportStartedReplicas, + prometheus.GaugeValue, + float64(healthReportResponse.Indicators.ShardsAvailability.Details.StartedReplicas), + healthReportResponse.ClusterName, + ) + ch <- prometheus.MustNewConstMetric( + healthReportUnassignedPrimaries, + prometheus.GaugeValue, + float64(healthReportResponse.Indicators.ShardsAvailability.Details.UnassignedPrimaries), + healthReportResponse.ClusterName, + ) + ch <- prometheus.MustNewConstMetric( + healthReportSlmPolicies, + prometheus.GaugeValue, + float64(healthReportResponse.Indicators.Slm.Details.Policies), + healthReportResponse.ClusterName, + ) + ch <- prometheus.MustNewConstMetric( + healthReportIlmPolicies, + prometheus.GaugeValue, + float64(healthReportResponse.Indicators.Ilm.Details.Policies), + healthReportResponse.ClusterName, + ) + ch <- prometheus.MustNewConstMetric( + healthReportIlmStagnatingIndices, + prometheus.GaugeValue, + float64(healthReportResponse.Indicators.Ilm.Details.StagnatingIndices), + healthReportResponse.ClusterName, + ) + + for _, color := range 
statusColors { + ch <- prometheus.MustNewConstMetric( + healthReportStatus, + prometheus.GaugeValue, + statusValue(healthReportResponse.Status, color), + healthReportResponse.ClusterName, color, + ) + ch <- prometheus.MustNewConstMetric( + healthReportMasterIsStableStatus, + prometheus.GaugeValue, + statusValue(healthReportResponse.Indicators.MasterIsStable.Status, color), + healthReportResponse.ClusterName, color, + ) + ch <- prometheus.MustNewConstMetric( + healthReportRepositoryIntegrityStatus, + prometheus.GaugeValue, + statusValue(healthReportResponse.Indicators.RepositoryIntegrity.Status, color), + healthReportResponse.ClusterName, color, + ) + ch <- prometheus.MustNewConstMetric( + healthReportDiskStatus, + prometheus.GaugeValue, + statusValue(healthReportResponse.Indicators.Disk.Status, color), + healthReportResponse.ClusterName, color, + ) + ch <- prometheus.MustNewConstMetric( + healthReportShardsCapacityStatus, + prometheus.GaugeValue, + statusValue(healthReportResponse.Indicators.ShardsCapacity.Status, color), + healthReportResponse.ClusterName, color, + ) + ch <- prometheus.MustNewConstMetric( + healthReportShardsAvailabiltystatus, + prometheus.GaugeValue, + statusValue(healthReportResponse.Indicators.ShardsAvailability.Status, color), + healthReportResponse.ClusterName, color, + ) + ch <- prometheus.MustNewConstMetric( + healthReportDataStreamLifecycleStatus, + prometheus.GaugeValue, + statusValue(healthReportResponse.Indicators.DataStreamLifecycle.Status, color), + healthReportResponse.ClusterName, color, + ) + ch <- prometheus.MustNewConstMetric( + healthReportSlmStatus, + prometheus.GaugeValue, + statusValue(healthReportResponse.Indicators.Slm.Status, color), + healthReportResponse.ClusterName, color, + ) + ch <- prometheus.MustNewConstMetric( + healthReportIlmStatus, + prometheus.GaugeValue, + statusValue(healthReportResponse.Indicators.Ilm.Status, color), + healthReportResponse.ClusterName, color, + ) + } + + return nil +} diff --git 
a/collector/health_report_test.go b/collector/health_report_test.go new file mode 100644 index 00000000..012afbfd --- /dev/null +++ b/collector/health_report_test.go @@ -0,0 +1,169 @@ +// Copyright 2025 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package collector + +import ( + "io" + "net/http" + "net/http/httptest" + "net/url" + "os" + "strings" + "testing" + + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/prometheus/common/promslog" +) + +func TestHealthReport(t *testing.T) { + // Testcases created using: + // docker run -d -p 9200:9200 elasticsearch:VERSION + // curl -XPUT http://localhost:9200/twitter + // curl http://localhost:9200/_health_report + + tests := []struct { + name string + file string + want string + }{ + { + name: "8.7.0", + file: "../fixtures/healthreport/8.7.0.json", + want: ` + # HELP elasticsearch_health_report_creating_primaries The number of creating primary shards + # TYPE elasticsearch_health_report_creating_primaries gauge + elasticsearch_health_report_creating_primaries{cluster="docker-cluster"} 0 + # HELP elasticsearch_health_report_creating_replicas The number of creating replica shards + # TYPE elasticsearch_health_report_creating_replicas gauge + elasticsearch_health_report_creating_replicas{cluster="docker-cluster"} 0 + # HELP elasticsearch_health_report_data_stream_lifecycle_status Data stream lifecycle status + # TYPE 
elasticsearch_health_report_data_stream_lifecycle_status gauge + elasticsearch_health_report_data_stream_lifecycle_status{cluster="docker-cluster",color="green"} 1 + elasticsearch_health_report_data_stream_lifecycle_status{cluster="docker-cluster",color="red"} 0 + elasticsearch_health_report_data_stream_lifecycle_status{cluster="docker-cluster",color="yellow"} 0 + # HELP elasticsearch_health_report_disk_status Disk status + # TYPE elasticsearch_health_report_disk_status gauge + elasticsearch_health_report_disk_status{cluster="docker-cluster",color="green"} 1 + elasticsearch_health_report_disk_status{cluster="docker-cluster",color="red"} 0 + elasticsearch_health_report_disk_status{cluster="docker-cluster",color="yellow"} 0 + # HELP elasticsearch_health_report_ilm_policies The number of ILM Policies + # TYPE elasticsearch_health_report_ilm_policies gauge + elasticsearch_health_report_ilm_policies{cluster="docker-cluster"} 17 + # HELP elasticsearch_health_report_ilm_stagnating_indices The number of stagnating indices + # TYPE elasticsearch_health_report_ilm_stagnating_indices gauge + elasticsearch_health_report_ilm_stagnating_indices{cluster="docker-cluster"} 0 + # HELP elasticsearch_health_report_ilm_status ILM status + # TYPE elasticsearch_health_report_ilm_status gauge + elasticsearch_health_report_ilm_status{cluster="docker-cluster",color="green"} 1 + elasticsearch_health_report_ilm_status{cluster="docker-cluster",color="red"} 0 + elasticsearch_health_report_ilm_status{cluster="docker-cluster",color="yellow"} 0 + # HELP elasticsearch_health_report_initializing_primaries The number of initializing primary shards + # TYPE elasticsearch_health_report_initializing_primaries gauge + elasticsearch_health_report_initializing_primaries{cluster="docker-cluster"} 0 + # HELP elasticsearch_health_report_initializing_replicas The number of initializing replica shards + # TYPE elasticsearch_health_report_initializing_replicas gauge + 
elasticsearch_health_report_initializing_replicas{cluster="docker-cluster"} 0 + # HELP elasticsearch_health_report_master_is_stable_status Master is stable status + # TYPE elasticsearch_health_report_master_is_stable_status gauge + elasticsearch_health_report_master_is_stable_status{cluster="docker-cluster",color="green"} 1 + elasticsearch_health_report_master_is_stable_status{cluster="docker-cluster",color="red"} 0 + elasticsearch_health_report_master_is_stable_status{cluster="docker-cluster",color="yellow"} 0 + # HELP elasticsearch_health_report_max_shards_in_cluster_data The number of maximum shards in a cluster + # TYPE elasticsearch_health_report_max_shards_in_cluster_data gauge + elasticsearch_health_report_max_shards_in_cluster_data{cluster="docker-cluster"} 13500 + # HELP elasticsearch_health_report_max_shards_in_cluster_frozen The number of maximum frozen shards in a cluster + # TYPE elasticsearch_health_report_max_shards_in_cluster_frozen gauge + elasticsearch_health_report_max_shards_in_cluster_frozen{cluster="docker-cluster"} 9000 + # HELP elasticsearch_health_report_repository_integrity_status Repository integrity status + # TYPE elasticsearch_health_report_repository_integrity_status gauge + elasticsearch_health_report_repository_integrity_status{cluster="docker-cluster",color="green"} 1 + elasticsearch_health_report_repository_integrity_status{cluster="docker-cluster",color="red"} 0 + elasticsearch_health_report_repository_integrity_status{cluster="docker-cluster",color="yellow"} 0 + # HELP elasticsearch_health_report_restarting_primaries The number of restarting primary shards + # TYPE elasticsearch_health_report_restarting_primaries gauge + elasticsearch_health_report_restarting_primaries{cluster="docker-cluster"} 0 + # HELP elasticsearch_health_report_restarting_replicas The number of restarting replica shards + # TYPE elasticsearch_health_report_restarting_replicas gauge + elasticsearch_health_report_restarting_replicas{cluster="docker-cluster"} 
0 + # HELP elasticsearch_health_report_shards_availabilty_status Shards availabilty status + # TYPE elasticsearch_health_report_shards_availabilty_status gauge + elasticsearch_health_report_shards_availabilty_status{cluster="docker-cluster",color="green"} 1 + elasticsearch_health_report_shards_availabilty_status{cluster="docker-cluster",color="red"} 0 + elasticsearch_health_report_shards_availabilty_status{cluster="docker-cluster",color="yellow"} 0 + # HELP elasticsearch_health_report_shards_capacity_status Shards capacity status + # TYPE elasticsearch_health_report_shards_capacity_status gauge + elasticsearch_health_report_shards_capacity_status{cluster="docker-cluster",color="green"} 1 + elasticsearch_health_report_shards_capacity_status{cluster="docker-cluster",color="red"} 0 + elasticsearch_health_report_shards_capacity_status{cluster="docker-cluster",color="yellow"} 0 + # HELP elasticsearch_health_report_slm_policies The number of SLM policies + # TYPE elasticsearch_health_report_slm_policies gauge + elasticsearch_health_report_slm_policies{cluster="docker-cluster"} 0 + # HELP elasticsearch_health_report_slm_status SLM status + # TYPE elasticsearch_health_report_slm_status gauge + elasticsearch_health_report_slm_status{cluster="docker-cluster",color="green"} 1 + elasticsearch_health_report_slm_status{cluster="docker-cluster",color="red"} 0 + elasticsearch_health_report_slm_status{cluster="docker-cluster",color="yellow"} 0 + # HELP elasticsearch_health_report_started_primaries The number of started primary shards + # TYPE elasticsearch_health_report_started_primaries gauge + elasticsearch_health_report_started_primaries{cluster="docker-cluster"} 11703 + # HELP elasticsearch_health_report_started_replicas The number of started replica shards + # TYPE elasticsearch_health_report_started_replicas gauge + elasticsearch_health_report_started_replicas{cluster="docker-cluster"} 1701 + # HELP elasticsearch_health_report_status Overall cluster status + # TYPE 
elasticsearch_health_report_status gauge + elasticsearch_health_report_status{cluster="docker-cluster",color="green"} 1 + elasticsearch_health_report_status{cluster="docker-cluster",color="red"} 0 + elasticsearch_health_report_status{cluster="docker-cluster",color="yellow"} 0 + # HELP elasticsearch_health_report_total_repositories The number of snapshot repositories + # TYPE elasticsearch_health_report_total_repositories gauge + elasticsearch_health_report_total_repositories{cluster="docker-cluster"} 1 + # HELP elasticsearch_health_report_unassigned_primaries The number of unassigned primary shards + # TYPE elasticsearch_health_report_unassigned_primaries gauge + elasticsearch_health_report_unassigned_primaries{cluster="docker-cluster"} 0 + # HELP elasticsearch_health_report_unassigned_replicas The number of unassigned replica shards + # TYPE elasticsearch_health_report_unassigned_replicas gauge + elasticsearch_health_report_unassigned_replicas{cluster="docker-cluster"} 0 + `, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + f, err := os.Open(tt.file) + if err != nil { + t.Fatal(err) + } + defer f.Close() + + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + io.Copy(w, f) + })) + defer ts.Close() + + u, err := url.Parse(ts.URL) + if err != nil { + t.Fatal(err) + } + + c, err := NewHealthReport(promslog.NewNopLogger(), u, http.DefaultClient) + if err != nil { + t.Fatal(err) + } + + if err := testutil.CollectAndCompare(wrapCollector{c}, strings.NewReader(tt.want)); err != nil { + t.Fatal(err) + } + }) + } +} diff --git a/fixtures/healthreport/8.7.0.json b/fixtures/healthreport/8.7.0.json new file mode 100644 index 00000000..337142d5 --- /dev/null +++ b/fixtures/healthreport/8.7.0.json @@ -0,0 +1,111 @@ +{ + "status": "green", + "cluster_name": "docker-cluster", + "indicators": { + "master_is_stable": { + "status": "green", + "symptom": "The cluster has a stable master node", + "details": { + 
"current_master": { + "node_id": "X8BAj1mfQ3qgcSoAlG3HHw", + "name": "5da1610e99a7" + }, + "recent_masters": [ + { + "node_id": "X8BAj1mfQ3qgcSoAlG3HHw", + "name": "5da1610e99a7" + } + ] + } + }, + "repository_integrity": { + "status": "green", + "symptom": "All repositories are healthy.", + "details": { + "total_repositories": 1 + } + }, + "shards_capacity": { + "status": "green", + "symptom": "The cluster has enough room to add new shards.", + "details": { + "data": { + "max_shards_in_cluster": 13500 + }, + "frozen": { + "max_shards_in_cluster": 9000 + } + } + }, + "shards_availability": { + "status": "green", + "symptom": "This cluster has all shards available.", + "details": { + "restarting_replicas": 0, + "creating_primaries": 0, + "initializing_replicas": 0, + "unassigned_replicas": 0, + "started_primaries": 11703, + "restarting_primaries": 0, + "initializing_primaries": 0, + "creating_replicas": 0, + "started_replicas": 1701, + "unassigned_primaries": 0 + }, + "impacts": [ + { + "id": "elasticsearch:health:shards_availability:impact:replica_unassigned", + "severity": 2, + "description": "Searches might be slower than usual. 
Fewer redundant copies of the data exist on 1 index [twitter].", + "impact_areas": [ + "search" + ] + } + ], + "diagnosis": [ + { + "id": "elasticsearch:health:shards_availability:diagnosis:increase_tier_capacity_for_allocations:tier:data_content", + "cause": "Elasticsearch isn't allowed to allocate some shards from these indices to any of the nodes in the desired data tier because there are not enough nodes in the [data_content] tier to allocate each shard copy on a different node.", + "action": "Increase the number of nodes in this tier or decrease the number of replica shards in the affected indices.", + "help_url": "https://ela.st/tier-capacity", + "affected_resources": { + "indices": [ + "twitter" + ] + } + } + ] + }, + "disk": { + "status": "green", + "symptom": "The cluster has enough available disk space.", + "details": { + "indices_with_readonly_block": 0, + "nodes_with_enough_disk_space": 1, + "nodes_with_unknown_disk_status": 0, + "nodes_over_high_watermark": 0, + "nodes_over_flood_stage_watermark": 0 + } + }, + "data_stream_lifecycle": { + "status": "green", + "symptom": "No data stream lifecycle health data available yet. Health information will be reported after the first run." 
+ }, + "ilm": { + "status": "green", + "symptom": "Index Lifecycle Management is running", + "details": { + "policies": 17, + "ilm_status": "RUNNING" + } + }, + "slm": { + "status": "green", + "symptom": "No Snapshot Lifecycle Management policies configured", + "details": { + "slm_status": "RUNNING", + "policies": 0 + } + } + } +} From 47231fb107b30646f0dc86c24e6bc63deae0a64a Mon Sep 17 00:00:00 2001 From: Richard Klose Date: Fri, 21 Mar 2025 11:17:28 +0100 Subject: [PATCH 2/2] docs: update cardinality in README and release version Co-authored-by: Joe Adams Signed-off-by: Richard Klose --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 9cc6f173..382c25f2 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ elasticsearch_exporter --help | es.ilm | 1.6.0 | If true, query index lifecycle policies for indices in the cluster. | es.shards | 1.0.3rc1 | If true, query stats for all indices in the cluster, including shard-level stats (implies `es.indices=true`). | false | | collector.snapshots | 1.0.4rc1 | If true, query stats for the cluster snapshots. (As of v1.7.0, this flag has replaced "es.snapshots"). | false | -| collector.health-report | 1.9.0 | If true, query the health report (requires elasticsearch 8.7.0 or later) | false | +| collector.health-report | 1.10.0 | If true, query the health report (requires elasticsearch 8.7.0 or later) | false | | es.slm | | If true, query stats for SLM. | false | | es.data_stream | | If true, query state for Data Steams. | false | | es.timeout | 1.0.2 | Timeout for trying to get stats from Elasticsearch. 
(ex: 20s) | 5s | @@ -273,26 +273,26 @@ Further Information | elasticsearch_data_stream_store_size_bytes | gauge | 1 | Current size of data stream backing indices in bytes | | elasticsearch_health_report_creating_primaries | gauge | 1 | The number of creating primary shards | | elasticsearch_health_report_creating_replicas | gauge | 1 | The number of creating replica shards | -| elasticsearch_health_report_data_stream_lifecycle_status | gauge | 1 | Data stream lifecycle status | -| elasticsearch_health_report_disk_status | gauge | 3 | disk status | +| elasticsearch_health_report_data_stream_lifecycle_status | gauge | 2 | Data stream lifecycle status | +| elasticsearch_health_report_disk_status | gauge | 2 | disk status | | elasticsearch_health_report_ilm_policies | gauge | 1 | The number of ILM Policies | | elasticsearch_health_report_ilm_stagnating_indices | gauge | 1 | The number of stagnating indices | -| elasticsearch_health_report_ilm_status | gauge | 3 | ILM status | +| elasticsearch_health_report_ilm_status | gauge | 2 | ILM status | | elasticsearch_health_report_initializing_primaries | gauge | 1 | The number of initializing primary shards | | elasticsearch_health_report_initializing_replicas | gauge | 1 | The number of initializing replica shards | -| elasticsearch_health_report_master_is_stable_status | gauge | 3 | Master is stable status | +| elasticsearch_health_report_master_is_stable_status | gauge | 2 | Master is stable status | | elasticsearch_health_report_max_shards_in_cluster_data | gauge | 1 | The number of maximum shards in a cluster | | elasticsearch_health_report_max_shards_in_cluster_frozen | gauge | 1 | The number of maximum frozen shards in a cluster | -| elasticsearch_health_report_repository_integrity_status | gauge | 3 | Repository integrity status | +| elasticsearch_health_report_repository_integrity_status | gauge | 2 | Repository integrity status | | elasticsearch_health_report_restarting_primaries | gauge | 1 | The number of 
restarting primary shards | | elasticsearch_health_report_restarting_replicas | gauge | 1 | The number of restarting replica shards | -| elasticsearch_health_report_shards_availabilty_status | gauge | 3 | Shards availabilty status | -| elasticsearch_health_report_shards_capacity_status | gauge | 3 | Shards capacity status | +| elasticsearch_health_report_shards_availabilty_status | gauge | 2 | Shards availabilty status | +| elasticsearch_health_report_shards_capacity_status | gauge | 2 | Shards capacity status | | elasticsearch_health_report_slm_policies | gauge | 1 | The number of SLM policies | -| elasticsearch_health_report_slm_status | gauge | 3 | SLM status | +| elasticsearch_health_report_slm_status | gauge | 2 | SLM status | | elasticsearch_health_report_started_primaries | gauge | 1 | The number of started primary shards | | elasticsearch_health_report_started_replicas | gauge | 1 | The number of started replica shards | -| elasticsearch_health_report_status | gauge | 3 | Overall cluster status | +| elasticsearch_health_report_status | gauge | 2 | Overall cluster status | | elasticsearch_health_report_total_repositories | gauge | 1 | The number snapshot repositories | | elasticsearch_health_report_unassigned_primaries | gauge | 1 | The number of unassigned primary shards | | elasticsearch_health_report_unassigned_replicas | gauge | 1 | The number of unassigned replica shards |