diff --git a/.chloggen/elasticsearch-cluster-health-shards.yaml b/.chloggen/elasticsearch-cluster-health-shards.yaml new file mode 100644 index 0000000000000..0353d8f3e616a --- /dev/null +++ b/.chloggen/elasticsearch-cluster-health-shards.yaml @@ -0,0 +1,16 @@ +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: enhancement + +# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) +component: elasticsearchreceiver + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Add cluster health metrics for two more shards types + +# One or more tracking issues related to the change +issues: [14635] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: diff --git a/receiver/elasticsearchreceiver/README.md b/receiver/elasticsearchreceiver/README.md index d47a59459d290..7ca81bb58873e 100644 --- a/receiver/elasticsearchreceiver/README.md +++ b/receiver/elasticsearchreceiver/README.md @@ -55,5 +55,21 @@ The following metric are available with versions: - `elasticsearch.cluster.state_update.time` >= [7.16.0](https://www.elastic.co/guide/en/elasticsearch/reference/7.16/release-notes-7.16.0.html) Details about the metrics produced by this receiver can be found in [metadata.yaml](./metadata.yaml) + +## Feature gate configurations + +See the [Collector feature gates](https://github.com/open-telemetry/opentelemetry-collector/blob/main/featuregate/README.md#collector-feature-gates) for an overview of feature gates in the collector. + +**ALPHA**: `receiver.elasticsearch.emitClusterHealthDetailedShardMetrics` + +The feature gate `receiver.elasticsearch.emitClusterHealthDetailedShardMetrics` once enabled starts emitting the metric `elasticsearch.cluster.shards` +with two additional data points - one with `state` equal to `active_primary` and one with `state` equal to `unassigned_delayed`. + +This is considered a breaking change for existing users of this receiver, and it is recommended to migrate to the new implementation when possible. Any new users planning to adopt this receiver should enable this feature gate to avoid having to migrate any visualisations or alerts. + +This feature gate will eventually be enabled by default, and eventually the old implementation will be removed. It aims +to give users time to migrate to the new implementation. The target release for this featuregate to be enabled by default +is 0.68.0. + [beta]:https://github.com/open-telemetry/opentelemetry-collector#beta [contrib]:https://github.com/open-telemetry/opentelemetry-collector-releases/tree/main/distributions/otelcol-contrib diff --git a/receiver/elasticsearchreceiver/documentation.md b/receiver/elasticsearchreceiver/documentation.md index 4b3545ca80c79..a05eb9f8f5624 100644 --- a/receiver/elasticsearchreceiver/documentation.md +++ b/receiver/elasticsearchreceiver/documentation.md @@ -136,7 +136,7 @@ metrics: | operation (operation) | The type of operation. | index, delete, get, query, fetch, scroll, suggest, merge, refresh, flush, warmer | | query_cache_count_type (type) | Type of query cache count | hit, miss | | segments_memory_object_type (object) | Type of object in segment | term, doc_value, index_writer, fixed_bit_set | -| shard_state (state) | The state of the shard. | active, relocating, initializing, unassigned | +| shard_state (state) | The state of the shard. | active, active_primary, relocating, initializing, unassigned, unassigned_delayed | | task_state (state) | The state of the task. | rejected, completed | | thread_pool_name | The name of the thread pool. | | | thread_state (state) | The state of the thread. | active, idle | diff --git a/receiver/elasticsearchreceiver/internal/metadata/generated_metrics.go b/receiver/elasticsearchreceiver/internal/metadata/generated_metrics.go index a28571425cb46..81c323dc6cd0e 100644 --- a/receiver/elasticsearchreceiver/internal/metadata/generated_metrics.go +++ b/receiver/elasticsearchreceiver/internal/metadata/generated_metrics.go @@ -874,9 +874,11 @@ type AttributeShardState int const ( _ AttributeShardState = iota AttributeShardStateActive + AttributeShardStateActivePrimary AttributeShardStateRelocating AttributeShardStateInitializing AttributeShardStateUnassigned + AttributeShardStateUnassignedDelayed ) // String returns the string representation of the AttributeShardState. @@ -884,22 +886,28 @@ func (av AttributeShardState) String() string { switch av { case AttributeShardStateActive: return "active" + case AttributeShardStateActivePrimary: + return "active_primary" case AttributeShardStateRelocating: return "relocating" case AttributeShardStateInitializing: return "initializing" case AttributeShardStateUnassigned: return "unassigned" + case AttributeShardStateUnassignedDelayed: + return "unassigned_delayed" } return "" } // MapAttributeShardState is a helper map of string to AttributeShardState attribute value. var MapAttributeShardState = map[string]AttributeShardState{ - "active": AttributeShardStateActive, - "relocating": AttributeShardStateRelocating, - "initializing": AttributeShardStateInitializing, - "unassigned": AttributeShardStateUnassigned, + "active": AttributeShardStateActive, + "active_primary": AttributeShardStateActivePrimary, + "relocating": AttributeShardStateRelocating, + "initializing": AttributeShardStateInitializing, + "unassigned": AttributeShardStateUnassigned, + "unassigned_delayed": AttributeShardStateUnassignedDelayed, } // AttributeTaskState specifies the a value task_state attribute. diff --git a/receiver/elasticsearchreceiver/internal/model/clusterhealth.go b/receiver/elasticsearchreceiver/internal/model/clusterhealth.go index 7e05e1acb97a7..9f383b441440b 100644 --- a/receiver/elasticsearchreceiver/internal/model/clusterhealth.go +++ b/receiver/elasticsearchreceiver/internal/model/clusterhealth.go @@ -18,14 +18,16 @@ package model // import "github.com/open-telemetry/opentelemetry-collector-contr // The struct is not exhaustive; It does not provide all values returned by elasticsearch, // only the ones relevant to the metrics retrieved by the scraper. type ClusterHealth struct { - ClusterName string `json:"cluster_name"` - ActiveShards int64 `json:"active_shards"` - RelocatingShards int64 `json:"relocating_shards"` - InitializingShards int64 `json:"initializing_shards"` - UnassignedShards int64 `json:"unassigned_shards"` - NodeCount int64 `json:"number_of_nodes"` - DataNodeCount int64 `json:"number_of_data_nodes"` - PendingTasksCount int64 `json:"number_of_pending_tasks"` - InFlightFetchCount int64 `json:"number_of_in_flight_fetch"` - Status string `json:"status"` + ClusterName string `json:"cluster_name"` + ActiveShards int64 `json:"active_shards"` + ActivePrimaryShards int64 `json:"active_primary_shards"` + RelocatingShards int64 `json:"relocating_shards"` + InitializingShards int64 `json:"initializing_shards"` + UnassignedShards int64 `json:"unassigned_shards"` + DelayedUnassignedShards int64 `json:"delayed_unassigned_shards"` + NodeCount int64 `json:"number_of_nodes"` + DataNodeCount int64 `json:"number_of_data_nodes"` + PendingTasksCount int64 `json:"number_of_pending_tasks"` + InFlightFetchCount int64 `json:"number_of_in_flight_fetch"` + Status string `json:"status"` } diff --git a/receiver/elasticsearchreceiver/metadata.yaml b/receiver/elasticsearchreceiver/metadata.yaml index c8b8c0e66a9b0..57d32d94318bb 100644 --- a/receiver/elasticsearchreceiver/metadata.yaml +++ b/receiver/elasticsearchreceiver/metadata.yaml @@ -45,9 +45,11 @@ attributes: description: The state of the shard. enum: - active + - active_primary - relocating - initializing - unassigned + - unassigned_delayed operation: value: operation description: The type of operation. diff --git a/receiver/elasticsearchreceiver/scraper.go b/receiver/elasticsearchreceiver/scraper.go index fc52a7fdef739..6fcfad6113d30 100644 --- a/receiver/elasticsearchreceiver/scraper.go +++ b/receiver/elasticsearchreceiver/scraper.go @@ -22,6 +22,7 @@ import ( "github.com/hashicorp/go-version" "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/featuregate" "go.opentelemetry.io/collector/pdata/pcommon" "go.opentelemetry.io/collector/pdata/pmetric" "go.opentelemetry.io/collector/receiver/scrapererror" @@ -41,6 +42,19 @@ var ( }() ) +const ( + readmeURL = "https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/elasticsearchreceiver/README.md" + emitClusterHealthDetailedShardMetricsID = "receiver.elasticsearch.emitClusterHealthDetailedShardMetrics" +) + +func init() { + featuregate.GetRegistry().MustRegisterID( + emitClusterHealthDetailedShardMetricsID, + featuregate.StageAlpha, + featuregate.WithRegisterDescription("When enabled, the elasticsearch.cluster.shards metric will be emitted with two more datapoints."), + ) +} + var errUnknownClusterStatus = errors.New("unknown cluster status") type elasticsearchScraper struct { @@ -50,17 +64,29 @@ type elasticsearchScraper struct { mb *metadata.MetricsBuilder version *version.Version clusterName string + + // Feature gates + emitClusterHealthDetailedShardMetrics bool } func newElasticSearchScraper( settings component.ReceiverCreateSettings, cfg *Config, ) *elasticsearchScraper { - return &elasticsearchScraper{ - settings: settings.TelemetrySettings, - cfg: cfg, - mb: metadata.NewMetricsBuilder(cfg.Metrics, settings.BuildInfo), + e := &elasticsearchScraper{ + settings: settings.TelemetrySettings, + cfg: cfg, + mb: metadata.NewMetricsBuilder(cfg.Metrics, settings.BuildInfo), + emitClusterHealthDetailedShardMetrics: featuregate.GetRegistry().IsEnabled(emitClusterHealthDetailedShardMetricsID), } + + if !e.emitClusterHealthDetailedShardMetrics { + settings.Logger.Warn( + fmt.Sprintf("Feature gate %s is not enabled. Please see the README for more information: %s", emitClusterHealthDetailedShardMetricsID, readmeURL), + ) + } + + return e } func (r *elasticsearchScraper) start(_ context.Context, host component.Host) (err error) { @@ -323,6 +349,11 @@ func (r *elasticsearchScraper) scrapeClusterMetrics(ctx context.Context, now pco r.mb.RecordElasticsearchClusterShardsDataPoint(now, clusterHealth.RelocatingShards, metadata.AttributeShardStateRelocating) r.mb.RecordElasticsearchClusterShardsDataPoint(now, clusterHealth.UnassignedShards, metadata.AttributeShardStateUnassigned) + if r.emitClusterHealthDetailedShardMetrics { + r.mb.RecordElasticsearchClusterShardsDataPoint(now, clusterHealth.ActivePrimaryShards, metadata.AttributeShardStateActivePrimary) + r.mb.RecordElasticsearchClusterShardsDataPoint(now, clusterHealth.DelayedUnassignedShards, metadata.AttributeShardStateUnassignedDelayed) + } + r.mb.RecordElasticsearchClusterPendingTasksDataPoint(now, clusterHealth.PendingTasksCount) r.mb.RecordElasticsearchClusterInFlightFetchDataPoint(now, clusterHealth.InFlightFetchCount) diff --git a/receiver/elasticsearchreceiver/scraper_test.go b/receiver/elasticsearchreceiver/scraper_test.go index c0fde98fb0d8c..e09f523c13bdf 100644 --- a/receiver/elasticsearchreceiver/scraper_test.go +++ b/receiver/elasticsearchreceiver/scraper_test.go @@ -26,6 +26,7 @@ import ( "go.opentelemetry.io/collector/component/componenttest" "go.opentelemetry.io/collector/config/confighttp" "go.opentelemetry.io/collector/config/configtls" + "go.opentelemetry.io/collector/featuregate" "go.opentelemetry.io/collector/receiver/scrapererror" "github.com/open-telemetry/opentelemetry-collector-contrib/internal/scrapertest" @@ -38,6 +39,13 @@ const fullExpectedMetricsPath = "./testdata/expected_metrics/full.json" const skipClusterExpectedMetricsPath = "./testdata/expected_metrics/clusterSkip.json" const noNodesExpectedMetricsPath = "./testdata/expected_metrics/noNodes.json" +func TestMain(m *testing.M) { + // Enable the feature gates before all tests to avoid flaky tests. + _ = featuregate.GetRegistry().Apply(map[string]bool{emitClusterHealthDetailedShardMetricsID: true}) + code := m.Run() + os.Exit(code) +} + func TestScraper(t *testing.T) { t.Parallel() diff --git a/receiver/elasticsearchreceiver/testdata/expected_metrics/full.json b/receiver/elasticsearchreceiver/testdata/expected_metrics/full.json index 267b0936742a9..2af62881cb1e0 100644 --- a/receiver/elasticsearchreceiver/testdata/expected_metrics/full.json +++ b/receiver/elasticsearchreceiver/testdata/expected_metrics/full.json @@ -2515,6 +2515,32 @@ ], "startTimeUnixNano": "1661811689941624000", "timeUnixNano": "1661811689943245000" + }, + { + "asInt": "23", + "attributes": [ + { + "key": "state", + "value": { + "stringValue": "active_primary" + } + } + ], + "startTimeUnixNano": "1661811689941624000", + "timeUnixNano": "1661811689943245000" + }, + { + "asInt": "1", + "attributes": [ + { + "key": "state", + "value": { + "stringValue": "unassigned_delayed" + } + } + ], + "startTimeUnixNano": "1661811689941624000", + "timeUnixNano": "1661811689943245000" } ] }, diff --git a/receiver/elasticsearchreceiver/testdata/expected_metrics/noNodes.json b/receiver/elasticsearchreceiver/testdata/expected_metrics/noNodes.json index 7efecefdfa2ec..c9930d7c4d380 100644 --- a/receiver/elasticsearchreceiver/testdata/expected_metrics/noNodes.json +++ b/receiver/elasticsearchreceiver/testdata/expected_metrics/noNodes.json @@ -180,6 +180,32 @@ ], "startTimeUnixNano": "1662458370557980000", "timeUnixNano": "1662458370559258000" + }, + { + "asInt": "23", + "attributes": [ + { + "key": "state", + "value": { + "stringValue": "active_primary" + } + } + ], + "startTimeUnixNano": "1661811689941624000", + "timeUnixNano": "1661811689943245000" + }, + { + "asInt": "1", + "attributes": [ + { + "key": "state", + "value": { + "stringValue": "unassigned_delayed" + } + } + ], + "startTimeUnixNano": "1661811689941624000", + "timeUnixNano": "1661811689943245000" } ] }, diff --git a/receiver/elasticsearchreceiver/testdata/sample_payloads/health.json b/receiver/elasticsearchreceiver/testdata/sample_payloads/health.json index 35f14df3362dc..ac526ca34c3e4 100644 --- a/receiver/elasticsearchreceiver/testdata/sample_payloads/health.json +++ b/receiver/elasticsearchreceiver/testdata/sample_payloads/health.json @@ -9,7 +9,7 @@ "relocating_shards": 10, "initializing_shards": 2, "unassigned_shards": 3, - "delayed_unassigned_shards": 0, + "delayed_unassigned_shards": 1, "number_of_pending_tasks": 0, "number_of_in_flight_fetch": 0, "task_max_waiting_in_queue_millis": 0,