[pkg/stanza] Add an option to resend logs instead of dropping (open-t…

…elemetry#20864) Add a `retry_on_failure` config option (disabled by default) to stanza receivers that can be used to slow down reading logs instead of dropping if downstream components return a non-permanent error. The configuration has the following options: - `enabled`: Enable or disable the retry mechanism. Default is `false`. - `initial_interval`: The initial interval to wait before retrying. Default is `1s`. - `max_interval`: The maximum interval to wait before retrying. Default is `30s`. - `max_elapsed_time`: The maximum amount of time to wait before giving up. Default is `5m`. The configuration interface is inspired by https://github.com/open-telemetry/opentelemetry-collector/tree/main/exporter/exporterhelper#configuration which potentially can be exposed in as another package not specific to exporter and used by any components
amazon-contributing · Apr 24, 2023 · 7bf5d66 · 7bf5d66
1 parent 5616ef8
commit 7bf5d66
Show file tree

Hide file tree

Showing 43 changed files with 586 additions and 134 deletions.
diff --git a/.chloggen/filelog-receiver-retry.yaml b/.chloggen/filelog-receiver-retry.yaml
@@ -0,0 +1,22 @@
+# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
+change_type: enhancement
+
+# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
+component: pkg/stanza
+
+# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
+note: Add and option to pause reading a file and attempt to resend the current batch of logs if it encounters an error from downstream components.
+
+# One or more tracking issues related to the change
+issues: [20511]
+
+# (Optional) One or more lines of additional information to render under the primary note.
+# These lines will be padded with 2 spaces and then inserted directly into the document.
+# Use pipe (|) for multiline entries.
+subtext: |
+ Add a `retry_on_failure` config option (disabled by default) that can be used to slow down reading logs instead of 
+ dropping logs if downstream components return a non-permanent error. The configuration has the following options:
+ - `enabled`: Enable or disable the retry mechanism. Default is `false`.
+ - `initial_interval`: The initial interval to wait before retrying. Default is `1s`.
+ - `max_interval`: The maximum interval to wait before retrying. Default is `30s`.
+ - `max_elapsed_time`: The maximum amount of time to wait before giving up. Default is `5m`.
diff --git a/internal/coreinternal/consumerretry/config.go b/internal/coreinternal/consumerretry/config.go
@@ -0,0 +1,44 @@
+// Copyright The OpenTelemetry Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package consumerretry // import "github.com/open-telemetry/opentelemetry-collector-contrib/internal/coreinternal/consumerretry"
+
+import "time"
+
+// Config defines configuration for retrying batches in case of receiving a retryable error from a downstream
+// consumer. If the retryable error doesn't provide a delay, exponential backoff is applied.
+type Config struct {
+ // Enabled indicates whether to not retry sending logs in case of receiving a retryable error from a downstream
+ // consumer. Default is false.
+ Enabled bool `mapstructure:"enabled"`
+ // InitialInterval the time to wait after the first failure before retrying. Default value is 1 second.
+ InitialInterval time.Duration `mapstructure:"initial_interval"`
+ // MaxInterval is the upper bound on backoff interval. Once this value is reached the delay between
+ // consecutive retries will always be `MaxInterval`. Default value is 30 seconds.
+ MaxInterval time.Duration `mapstructure:"max_interval"`
+ // MaxElapsedTime is the maximum amount of time (including retries) spent trying to send a logs batch to
+ // a downstream consumer. Once this value is reached, the data is discarded. It never stops if MaxElapsedTime == 0.
+ // Default value is 5 minutes.
+ MaxElapsedTime time.Duration `mapstructure:"max_elapsed_time"`
+}
+
+// NewDefaultConfig returns the default Config.
+func NewDefaultConfig() Config {
+ return Config{
+ Enabled: false,
+ InitialInterval: 1 * time.Second,
+ MaxInterval: 30 * time.Second,
+ MaxElapsedTime: 5 * time.Minute,
+ }
+}
diff --git a/internal/coreinternal/consumerretry/logs.go b/internal/coreinternal/consumerretry/logs.go
@@ -0,0 +1,124 @@
+// Copyright The OpenTelemetry Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package consumerretry // import "github.com/open-telemetry/opentelemetry-collector-contrib/internal/coreinternal/consumerretry"
+
+import (
+ "context"
+ "errors"
+ "fmt"
+ "time"
+
+ "github.com/cenkalti/backoff/v4"
+ "go.opentelemetry.io/collector/consumer"
+ "go.opentelemetry.io/collector/consumer/consumererror"
+ "go.opentelemetry.io/collector/pdata/plog"
+ "go.opentelemetry.io/otel/attribute"
+ "go.opentelemetry.io/otel/trace"
+ "go.uber.org/zap"
+)
+
+type logsConsumer struct {
+ consumer.Logs
+ cfg Config
+ logger *zap.Logger
+}
+
+func NewLogs(config Config, logger *zap.Logger, next consumer.Logs) consumer.Logs {
+ return &logsConsumer{
+ Logs: next,
+ cfg: config,
+ logger: logger,
+ }
+}
+
+func (lc *logsConsumer) ConsumeLogs(ctx context.Context, logs plog.Logs) error {
+ if !lc.cfg.Enabled {
+ err := lc.Logs.ConsumeLogs(ctx, logs)
+ if err != nil {
+ lc.logger.Error("ConsumeLogs() failed. "+
+ "Enable retry_on_failure to slow down reading logs and avoid dropping.", zap.Error(err))
+ }
+ return err
+ }
+
+ // Do not use NewExponentialBackOff since it calls Reset and the code here must
+ // call Reset after changing the InitialInterval (this saves an unnecessary call to Now).
+ expBackoff := backoff.ExponentialBackOff{
+ MaxElapsedTime: lc.cfg.MaxElapsedTime,
+ InitialInterval: lc.cfg.InitialInterval,
+ MaxInterval: lc.cfg.MaxInterval,
+ RandomizationFactor: backoff.DefaultRandomizationFactor,
+ Multiplier: backoff.DefaultMultiplier,
+ Stop: backoff.Stop,
+ Clock: backoff.SystemClock,
+ }
+ expBackoff.Reset()
+
+ span := trace.SpanFromContext(ctx)
+ retryNum := int64(0)
+ retryableErr := consumererror.Logs{}
+ for {
+ span.AddEvent(
+ "Sending logs.",
+ trace.WithAttributes(attribute.Int64("retry_num", retryNum)))
+
+ err := lc.Logs.ConsumeLogs(ctx, logs)
+ if err == nil {
+ return nil
+ }
+
+ if consumererror.IsPermanent(err) {
+ lc.logger.Error(
+ "ConsumeLogs() failed. The error is not retryable. Dropping data.",
+ zap.Error(err),
+ zap.Int("dropped_items", logs.LogRecordCount()),
+ )
+ return err
+ }
+
+ if errors.As(err, &retryableErr) {
+ logs = retryableErr.Data()
+ }
+
+ // TODO: take delay from the error once it is available in the consumererror package.
+ backoffDelay := expBackoff.NextBackOff()
+ if backoffDelay == backoff.Stop {
+ lc.logger.Error("Max elapsed time expired. Dropping data.", zap.Error(err), zap.Int("dropped_items",
+ logs.LogRecordCount()))
+ return err
+ }
+
+ backoffDelayStr := backoffDelay.String()
+ span.AddEvent(
+ "ConsumeLogs() failed. Will retry the request after interval.",
+ trace.WithAttributes(
+ attribute.String("interval", backoffDelayStr),
+ attribute.String("error", err.Error())))
+ lc.logger.Debug(
+ "ConsumeLogs() failed. Will retry the request after interval.",
+ zap.Error(err),
+ zap.String("interval", backoffDelayStr),
+ zap.Int("logs_count", logs.LogRecordCount()),
+ )
+ retryNum++
+
+ // back-off, but get interrupted when shutting down or request is cancelled or timed out.
+ select {
+ case <-ctx.Done():
+ return fmt.Errorf("context is cancelled or timed out %w", err)
+ case <-time.After(backoffDelay):
+ }
+ }
+}
diff --git a/internal/coreinternal/consumerretry/logs_test.go b/internal/coreinternal/consumerretry/logs_test.go
@@ -0,0 +1,125 @@
+// Copyright The OpenTelemetry Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package consumerretry
+
+import (
+ "context"
+ "errors"
+ "testing"
+ "time"
+
+ "github.com/stretchr/testify/assert"
+ "go.opentelemetry.io/collector/consumer/consumererror"
+ "go.uber.org/zap"
+
+ "github.com/open-telemetry/opentelemetry-collector-contrib/internal/coreinternal/testdata"
+)
+
+func TestConsumeLogs(t *testing.T) {
+ tests := []struct {
+ name string
+ cfg Config
+ consumer *MockLogsRejecter
+ expectedErr error
+ }{
+ {
+ name: "no_retry_success",
+ expectedErr: nil,
+ cfg: NewDefaultConfig(),
+ consumer: NewMockLogsRejecter(0),
+ },
+ {
+ name: "permanent_error",
+ expectedErr: consumererror.NewPermanent(errors.New("permanent error")),
+ cfg: Config{Enabled: true},
+ consumer: NewMockLogsRejecter(-1),
+ },
+ {
+ name: "timeout_error",
+ expectedErr: errors.New("retry later"),
+ cfg: Config{
+ Enabled: true,
+ InitialInterval: 1 * time.Millisecond,
+ MaxInterval: 5 * time.Millisecond,
+ MaxElapsedTime: 10 * time.Millisecond,
+ },
+ consumer: NewMockLogsRejecter(20),
+ },
+ {
+ name: "retry_success",
+ expectedErr: nil,
+ cfg: Config{
+ Enabled: true,
+ InitialInterval: 1 * time.Millisecond,
+ MaxInterval: 2 * time.Millisecond,
+ MaxElapsedTime: 100 * time.Millisecond,
+ },
+ consumer: NewMockLogsRejecter(5),
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ consumer := NewLogs(tt.cfg, zap.NewNop(), tt.consumer)
+ err := consumer.ConsumeLogs(context.Background(), testdata.GenerateLogsTwoLogRecordsSameResource())
+ assert.Equal(t, tt.expectedErr, err)
+ if err == nil {
+ assert.Equal(t, 1, len(tt.consumer.AllLogs()))
+ assert.Equal(t, 2, tt.consumer.AllLogs()[0].LogRecordCount())
+ if tt.consumer.acceptAfter > 0 {
+ assert.Equal(t, tt.consumer.rejectCount.Load(), tt.consumer.acceptAfter)
+ }
+ } else if tt.consumer.acceptAfter > 0 {
+ assert.Less(t, tt.consumer.rejectCount.Load(), tt.consumer.acceptAfter)
+ }
+ })
+ }
+}
+
+func TestConsumeLogs_ContextDeadline(t *testing.T) {
+ consumer := NewLogs(Config{
+ Enabled: true,
+ InitialInterval: 1 * time.Millisecond,
+ MaxInterval: 5 * time.Millisecond,
+ MaxElapsedTime: 50 * time.Millisecond,
+ }, zap.NewNop(), NewMockLogsRejecter(10))
+
+ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Millisecond)
+ defer cancel()
+ err := consumer.ConsumeLogs(ctx, testdata.GenerateLogsTwoLogRecordsSameResource())
+ assert.Error(t, err)
+ assert.Contains(t, err.Error(), "context is cancelled or timed out retry later")
+}
+
+func TestConsumeLogs_PartialRetry(t *testing.T) {
+ sink := &mockPartialLogsRejecter{}
+ consumer := NewLogs(Config{
+ Enabled: true,
+ InitialInterval: 1 * time.Millisecond,
+ MaxInterval: 5 * time.Millisecond,
+ MaxElapsedTime: 50 * time.Millisecond,
+ }, zap.NewNop(), sink)
+
+ logs := testdata.GenerateLogsTwoLogRecordsSameResource()
+ testdata.GenerateLogsOneLogRecordNoResource().ResourceLogs().MoveAndAppendTo(logs.ResourceLogs())
+ assert.NoError(t, consumer.ConsumeLogs(context.Background(), logs))
+
+ // Verify the logs batch is broken into two parts, one with the partial error and one without.
+ assert.Equal(t, 2, len(sink.AllLogs()))
+ assert.Equal(t, 1, sink.AllLogs()[0].ResourceLogs().Len())
+ assert.Equal(t, 2, sink.AllLogs()[0].LogRecordCount())
+ assert.Equal(t, 1, sink.AllLogs()[1].ResourceLogs().Len())
+ assert.Equal(t, 1, sink.AllLogs()[1].LogRecordCount())
+}
diff --git a/internal/coreinternal/consumerretry/testutil.go b/internal/coreinternal/consumerretry/testutil.go
@@ -0,0 +1,70 @@
+// Copyright The OpenTelemetry Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package consumerretry // import "github.com/open-telemetry/opentelemetry-collector-contrib/internal/coreinternal/consumerretry"
+
+import (
+ "context"
+ "errors"
+ "sync/atomic"
+
+ "go.opentelemetry.io/collector/consumer/consumererror"
+ "go.opentelemetry.io/collector/consumer/consumertest"
+ "go.opentelemetry.io/collector/pdata/plog"
+)
+
+type MockLogsRejecter struct {
+ consumertest.LogsSink
+ rejectCount *atomic.Int32
+ acceptAfter int32
+}
+
+// NewMockLogsRejecter creates new MockLogsRejecter. acceptAfter is a number of rejects before accepting,
+// 0 means always accept, -1 means always reject with permanent error
+func NewMockLogsRejecter(acceptAfter int32) *MockLogsRejecter {
+ return &MockLogsRejecter{
+ acceptAfter: acceptAfter,
+ rejectCount: &atomic.Int32{},
+ }
+}
+
+func (m *MockLogsRejecter) ConsumeLogs(ctx context.Context, logs plog.Logs) error {
+ if m.acceptAfter < 0 {
+ return consumererror.NewPermanent(errors.New("permanent error"))
+ }
+ if m.rejectCount.Load() < m.acceptAfter {
+ m.rejectCount.Add(1)
+ return errors.New("retry later")
+ }
+ return m.LogsSink.ConsumeLogs(ctx, logs)
+}
+
+// mockPartialLogsRejecter is a mock LogsConsumer that accepts only one logs object and rejects the rest.
+type mockPartialLogsRejecter struct {
+ consumertest.LogsSink
+}
+
+func (m *mockPartialLogsRejecter) ConsumeLogs(ctx context.Context, logs plog.Logs) error {
+ if logs.ResourceLogs().Len() <= 1 {
+ return m.LogsSink.ConsumeLogs(ctx, logs)
+ }
+ accepted := plog.NewLogs()
+ rejected := plog.NewLogs()
+ logs.ResourceLogs().At(0).CopyTo(accepted.ResourceLogs().AppendEmpty())
+ for i := 1; i < logs.ResourceLogs().Len(); i++ {
+ logs.ResourceLogs().At(i).CopyTo(rejected.ResourceLogs().AppendEmpty())
+ }
+ _ = m.LogsSink.ConsumeLogs(ctx, accepted)
+ return consumererror.NewLogs(errors.New("partial error"), rejected)
+}