Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ elasticsearch_exporter --help
| collector.health-report | 1.10.0 | If true, query the health report (requires elasticsearch 8.7.0 or later) | false |
| es.slm | | If true, query stats for SLM. | false |
| es.data_stream | | If true, query state for Data Streams. | false |
| es.remote_info | 2.x.x | If true, query stats for configured remote clusters in the Elasticsearch cluster. Exposes connection metrics for cross-cluster search and replication. | false |
| es.timeout | 1.0.2 | Timeout for trying to get stats from Elasticsearch. (ex: 20s) | 5s |
| es.ca | 1.0.2 | Path to PEM file that contains trusted Certificate Authorities for the Elasticsearch connection. | |
| es.client-private-key | 1.0.2 | Path to PEM file that contains the private key for client auth when connecting to Elasticsearch. | |
Expand Down Expand Up @@ -107,6 +108,7 @@ es.shards | not sure if `indices` or `cluster` `monitor` or both |
collector.snapshots | `cluster:admin/snapshot/status` and `cluster:admin/repository/get` | [ES Forum Post](https://discuss.elastic.co/t/permissions-for-backup-user-with-x-pack/88057)
es.slm | `manage_slm`
es.data_stream | `monitor` or `manage` (per index or `*`) |
es.remote_info | `cluster` `monitor` | Required for accessing remote cluster connection information via the `/_remote/info` endpoint

Further Information

Expand Down Expand Up @@ -175,6 +177,49 @@ Notes:
- Any `options:` under an auth module will be appended as URL query parameters to the target URL.
- The `tls` auth module (client certificate authentication) is intended for self‑managed Elasticsearch/OpenSearch deployments. Amazon OpenSearch Service typically authenticates at the domain edge with IAM/SigV4 and does not support client certificate authentication; use the `aws` auth module instead when scraping Amazon OpenSearch Service domains.

### Remote Cluster Monitoring

The remote info collector (`es.remote_info`) provides monitoring capabilities for Elasticsearch cross-cluster search and cross-cluster replication configurations. This collector queries the `/_remote/info` endpoint to gather connection statistics for configured remote clusters.

#### When to Enable

Enable this collector when you have:
- Cross-cluster search configured
- Cross-cluster replication set up
- Multiple Elasticsearch clusters connected via remote cluster connections
- A need to monitor the health and connectivity of remote cluster connections

#### Metrics Provided

The collector provides connection metrics labeled by `remote_cluster` name, including:
- Active node connections to remote clusters
- Proxy socket connections (for clusters behind proxies)
- Maximum connection limits per cluster
- Connection health and scrape statistics

#### Prerequisites

- Remote clusters must be properly configured in your Elasticsearch cluster
- The user account must have `cluster:monitor` privileges to access the `/_remote/info` endpoint
- Remote clusters should be accessible and properly configured with seeds

#### Example Configuration

To enable remote cluster monitoring:

```bash
./elasticsearch_exporter --es.uri=http://localhost:9200 --es.remote_info
```

The collector will automatically discover all configured remote clusters and expose metrics for each one.

The remote info collector can also be enabled via the `ES_REMOTE_INFO` environment variable:

```bash
export ES_REMOTE_INFO=true
./elasticsearch_exporter --es.uri=http://localhost:9200
```

### Metrics

See the [metrics documentation](metrics.md)
Expand Down
187 changes: 187 additions & 0 deletions collector/remote_info.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package collector

import (
"encoding/json"
"fmt"
"log/slog"
"net/http"
"net/url"
"path"

"github.com/prometheus/client_golang/prometheus"
)

// Label names and the matching label-value builder shared by the remote info
// metrics.
var (
	// defaulRemoteInfoLabels holds the label names attached to each remote
	// info metric.
	defaulRemoteInfoLabels = []string{"remote_cluster"}

	// defaultRemoteInfoLabelValues returns the label values for one remote
	// cluster, in the same order as defaulRemoteInfoLabels.
	defaultRemoteInfoLabelValues = func(remoteCluster string) []string {
		values := make([]string, 0, 1)
		return append(values, remoteCluster)
	}
)

// remoteInfoMetric couples a Prometheus descriptor with the functions that
// produce its sample value and label values for a single remote cluster.
type remoteInfoMetric struct {
	// Type is the Prometheus value type the sample is emitted as.
	Type prometheus.ValueType
	// Desc is the descriptor the sample is emitted under.
	Desc *prometheus.Desc
	// Value extracts the sample value from one remote cluster entry.
	Value func(cluster RemoteCluster) float64
	// Labels builds the label values from the remote cluster's name.
	Labels func(remoteCluster string) []string
}

// RemoteInfo holds the dependencies and metric definitions used to scrape
// remote cluster information from Elasticsearch.
type RemoteInfo struct {
	logger *slog.Logger
	client *http.Client
	url    *url.URL

	// Metrics describing the scrape itself.
	up                prometheus.Gauge
	totalScrapes      prometheus.Counter
	jsonParseFailures prometheus.Counter

	// Metrics emitted once per remote cluster.
	remoteInfoMetrics []*remoteInfoMetric
}

// NewRemoteInfo returns a RemoteInfo that queries the Elasticsearch instance at
// url through client and reports problems through logger.
//
// It defines three scrape-level metrics (up, total_scrapes,
// json_parse_failures) and three gauges emitted per remote cluster
// (num_nodes_connected, num_proxy_sockets_connected,
// max_connections_per_cluster), each labelled with remote_cluster.
func NewRemoteInfo(logger *slog.Logger, client *http.Client, url *url.URL) *RemoteInfo {
	// perClusterDesc builds a descriptor in the "remote_info" subsystem that
	// carries the shared remote info label set.
	perClusterDesc := func(name, help string) *prometheus.Desc {
		return prometheus.NewDesc(
			prometheus.BuildFQName(namespace, "remote_info", name),
			help, defaulRemoteInfoLabels, nil,
		)
	}

	return &RemoteInfo{
		logger: logger,
		client: client,
		url:    url,

		up: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: prometheus.BuildFQName(namespace, "remote_info_stats", "up"),
			Help: "Was the last scrape of the ElasticSearch remote info endpoint successful.",
		}),
		totalScrapes: prometheus.NewCounter(prometheus.CounterOpts{
			Name: prometheus.BuildFQName(namespace, "remote_info_stats", "total_scrapes"),
			Help: "Current total ElasticSearch remote info scrapes.",
		}),
		jsonParseFailures: prometheus.NewCounter(prometheus.CounterOpts{
			Name: prometheus.BuildFQName(namespace, "remote_info_stats", "json_parse_failures"),
			Help: "Number of errors while parsing JSON.",
		}),
		// Per-remote-cluster gauges; one sample of each is sent for every
		// cluster in the response.
		remoteInfoMetrics: []*remoteInfoMetric{
			{
				Type: prometheus.GaugeValue,
				Desc: perClusterDesc("num_nodes_connected", "Number of nodes connected"),
				Value: func(remoteStats RemoteCluster) float64 {
					return float64(remoteStats.NumNodesConnected)
				},
				Labels: defaultRemoteInfoLabelValues,
			},
			{
				Type: prometheus.GaugeValue,
				Desc: perClusterDesc("num_proxy_sockets_connected", "Number of proxy sockets connected"),
				Value: func(remoteStats RemoteCluster) float64 {
					return float64(remoteStats.NumProxySocketsConnected)
				},
				Labels: defaultRemoteInfoLabelValues,
			},
			{
				Type: prometheus.GaugeValue,
				Desc: perClusterDesc("max_connections_per_cluster", "Max connections per cluster"),
				Value: func(remoteStats RemoteCluster) float64 {
					return float64(remoteStats.MaxConnectionsPerCluster)
				},
				Labels: defaultRemoteInfoLabelValues,
			},
		},
	}
}

// fetchAndDecodeRemoteInfoStats requests "/_remote/info", joined onto the path
// of the configured base URL, and decodes the JSON response body.
//
// It returns an error when the request fails, when the response status is not
// 200 OK, or when the body cannot be decoded. A decode failure also increments
// the jsonParseFailures counter. The returned response is nil whenever the
// error is non-nil.
func (ri *RemoteInfo) fetchAndDecodeRemoteInfoStats() (RemoteInfoResponse, error) {
	// Work on a copy so the configured base URL is never mutated.
	u := *ri.url
	u.Path = path.Join(u.Path, "/_remote/info")

	res, err := ri.client.Get(u.String())
	if err != nil {
		// u.Host keeps the message well formed when no port is configured or
		// the host is an IPv6 literal; %w keeps the cause inspectable with
		// errors.Is / errors.As.
		return nil, fmt.Errorf("failed to get remote info from %s://%s%s: %w",
			u.Scheme, u.Host, u.Path, err)
	}

	defer func() {
		// A dedicated variable keeps the close result out of the function's err.
		if closeErr := res.Body.Close(); closeErr != nil {
			ri.logger.Warn(
				"failed to close http.Client",
				"err", closeErr,
			)
		}
	}()

	if res.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP Request failed with code %d", res.StatusCode)
	}

	var rir RemoteInfoResponse
	if err := json.NewDecoder(res.Body).Decode(&rir); err != nil {
		ri.jsonParseFailures.Inc()
		// Do not hand back a partially decoded response.
		return nil, err
	}
	return rir, nil
}

// Collect fetches the remote info response and sends one sample per configured
// metric for every remote cluster in it. The up, total scrapes and JSON parse
// failure metrics are sent last, whether or not the fetch succeeded.
func (ri *RemoteInfo) Collect(ch chan<- prometheus.Metric) {
	ri.totalScrapes.Inc()

	// Deferred so the scrape-level metrics go out on every exit path.
	defer func() {
		for _, own := range []prometheus.Metric{ri.up, ri.totalScrapes, ri.jsonParseFailures} {
			ch <- own
		}
	}()

	clusters, err := ri.fetchAndDecodeRemoteInfoStats()
	if err != nil {
		ri.up.Set(0)
		ri.logger.Warn("failed to fetch and decode remote info", "err", err)
		return
	}
	ri.up.Set(1)

	for name, stats := range clusters {
		for _, m := range ri.remoteInfoMetrics {
			sample := prometheus.MustNewConstMetric(m.Desc, m.Type, m.Value(stats), m.Labels(name)...)
			ch <- sample
		}
	}
}

// Describe sends the descriptor of every per-remote-cluster metric, followed by
// the descriptors of the up, total scrapes and JSON parse failure metrics.
func (ri *RemoteInfo) Describe(ch chan<- *prometheus.Desc) {
	for i := range ri.remoteInfoMetrics {
		ch <- ri.remoteInfoMetrics[i].Desc
	}

	ownDescs := [...]*prometheus.Desc{
		ri.up.Desc(),
		ri.totalScrapes.Desc(),
		ri.jsonParseFailures.Desc(),
	}
	for _, d := range ownDescs {
		ch <- d
	}
}
28 changes: 28 additions & 0 deletions collector/remote_info_response.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package collector

// RemoteInfoResponse is a representation of an Elasticsearch _remote/info
// response: a JSON object decoded into a map with one RemoteCluster value per
// key.
type RemoteInfoResponse map[string]RemoteCluster

// RemoteCluster defines the struct of the tree for the Remote Cluster, i.e. one
// entry of a RemoteInfoResponse. Each field is decoded from the JSON key named
// in its tag.
type RemoteCluster struct {
	// Connection configuration and state.
	Seeds     []string `json:"seeds"`
	Connected bool     `json:"connected"`

	// Connection counts and limits.
	NumNodesConnected        int64 `json:"num_nodes_connected"`
	NumProxySocketsConnected int64 `json:"num_proxy_sockets_connected"`
	MaxConnectionsPerCluster int64 `json:"max_connections_per_cluster"`

	// Remaining settings reported for the remote cluster.
	InitialConnectTimeout string `json:"initial_connect_timeout"`
	SkipUnavailable       bool   `json:"skip_unavailable"`
}
Loading