forked from open-telemetry/opentelemetry-collector-contrib
-
Notifications
You must be signed in to change notification settings - Fork 21
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[pkg/stanza] Add an option to resend logs instead of dropping (open-t…
…elemetry#20864) Add a `retry_on_failure` config option (disabled by default) to stanza receivers that can be used to slow down reading logs instead of dropping if downstream components return a non-permanent error. The configuration has the following options: - `enabled`: Enable or disable the retry mechanism. Default is `false`. - `initial_interval`: The initial interval to wait before retrying. Default is `1s`. - `max_interval`: The maximum interval to wait before retrying. Default is `30s`. - `max_elapsed_time`: The maximum amount of time to wait before giving up. Default is `5m`. The configuration interface is inspired by https://github.com/open-telemetry/opentelemetry-collector/tree/main/exporter/exporterhelper#configuration which potentially can be exposed in as another package not specific to exporter and used by any components
- Loading branch information
Showing
43 changed files
with
586 additions
and
134 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' | ||
change_type: enhancement | ||
|
||
# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) | ||
component: pkg/stanza | ||
|
||
# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). | ||
note: Add and option to pause reading a file and attempt to resend the current batch of logs if it encounters an error from downstream components. | ||
|
||
# One or more tracking issues related to the change | ||
issues: [20511] | ||
|
||
# (Optional) One or more lines of additional information to render under the primary note. | ||
# These lines will be padded with 2 spaces and then inserted directly into the document. | ||
# Use pipe (|) for multiline entries. | ||
subtext: | | ||
Add a `retry_on_failure` config option (disabled by default) that can be used to slow down reading logs instead of | ||
dropping logs if downstream components return a non-permanent error. The configuration has the following options: | ||
- `enabled`: Enable or disable the retry mechanism. Default is `false`. | ||
- `initial_interval`: The initial interval to wait before retrying. Default is `1s`. | ||
- `max_interval`: The maximum interval to wait before retrying. Default is `30s`. | ||
- `max_elapsed_time`: The maximum amount of time to wait before giving up. Default is `5m`. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
// Copyright The OpenTelemetry Authors | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
package consumerretry // import "github.com/open-telemetry/opentelemetry-collector-contrib/internal/coreinternal/consumerretry" | ||
|
||
import "time" | ||
|
||
// Config defines configuration for retrying batches in case of receiving a retryable error from a downstream | ||
// consumer. If the retryable error doesn't provide a delay, exponential backoff is applied. | ||
type Config struct { | ||
// Enabled indicates whether to not retry sending logs in case of receiving a retryable error from a downstream | ||
// consumer. Default is false. | ||
Enabled bool `mapstructure:"enabled"` | ||
// InitialInterval the time to wait after the first failure before retrying. Default value is 1 second. | ||
InitialInterval time.Duration `mapstructure:"initial_interval"` | ||
// MaxInterval is the upper bound on backoff interval. Once this value is reached the delay between | ||
// consecutive retries will always be `MaxInterval`. Default value is 30 seconds. | ||
MaxInterval time.Duration `mapstructure:"max_interval"` | ||
// MaxElapsedTime is the maximum amount of time (including retries) spent trying to send a logs batch to | ||
// a downstream consumer. Once this value is reached, the data is discarded. It never stops if MaxElapsedTime == 0. | ||
// Default value is 5 minutes. | ||
MaxElapsedTime time.Duration `mapstructure:"max_elapsed_time"` | ||
} | ||
|
||
// NewDefaultConfig returns the default Config. | ||
func NewDefaultConfig() Config { | ||
return Config{ | ||
Enabled: false, | ||
InitialInterval: 1 * time.Second, | ||
MaxInterval: 30 * time.Second, | ||
MaxElapsedTime: 5 * time.Minute, | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
// Copyright The OpenTelemetry Authors | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
package consumerretry // import "github.com/open-telemetry/opentelemetry-collector-contrib/internal/coreinternal/consumerretry" | ||
|
||
import ( | ||
"context" | ||
"errors" | ||
"fmt" | ||
"time" | ||
|
||
"github.com/cenkalti/backoff/v4" | ||
"go.opentelemetry.io/collector/consumer" | ||
"go.opentelemetry.io/collector/consumer/consumererror" | ||
"go.opentelemetry.io/collector/pdata/plog" | ||
"go.opentelemetry.io/otel/attribute" | ||
"go.opentelemetry.io/otel/trace" | ||
"go.uber.org/zap" | ||
) | ||
|
||
type logsConsumer struct { | ||
consumer.Logs | ||
cfg Config | ||
logger *zap.Logger | ||
} | ||
|
||
func NewLogs(config Config, logger *zap.Logger, next consumer.Logs) consumer.Logs { | ||
return &logsConsumer{ | ||
Logs: next, | ||
cfg: config, | ||
logger: logger, | ||
} | ||
} | ||
|
||
func (lc *logsConsumer) ConsumeLogs(ctx context.Context, logs plog.Logs) error { | ||
if !lc.cfg.Enabled { | ||
err := lc.Logs.ConsumeLogs(ctx, logs) | ||
if err != nil { | ||
lc.logger.Error("ConsumeLogs() failed. "+ | ||
"Enable retry_on_failure to slow down reading logs and avoid dropping.", zap.Error(err)) | ||
} | ||
return err | ||
} | ||
|
||
// Do not use NewExponentialBackOff since it calls Reset and the code here must | ||
// call Reset after changing the InitialInterval (this saves an unnecessary call to Now). | ||
expBackoff := backoff.ExponentialBackOff{ | ||
MaxElapsedTime: lc.cfg.MaxElapsedTime, | ||
InitialInterval: lc.cfg.InitialInterval, | ||
MaxInterval: lc.cfg.MaxInterval, | ||
RandomizationFactor: backoff.DefaultRandomizationFactor, | ||
Multiplier: backoff.DefaultMultiplier, | ||
Stop: backoff.Stop, | ||
Clock: backoff.SystemClock, | ||
} | ||
expBackoff.Reset() | ||
|
||
span := trace.SpanFromContext(ctx) | ||
retryNum := int64(0) | ||
retryableErr := consumererror.Logs{} | ||
for { | ||
span.AddEvent( | ||
"Sending logs.", | ||
trace.WithAttributes(attribute.Int64("retry_num", retryNum))) | ||
|
||
err := lc.Logs.ConsumeLogs(ctx, logs) | ||
if err == nil { | ||
return nil | ||
} | ||
|
||
if consumererror.IsPermanent(err) { | ||
lc.logger.Error( | ||
"ConsumeLogs() failed. The error is not retryable. Dropping data.", | ||
zap.Error(err), | ||
zap.Int("dropped_items", logs.LogRecordCount()), | ||
) | ||
return err | ||
} | ||
|
||
if errors.As(err, &retryableErr) { | ||
logs = retryableErr.Data() | ||
} | ||
|
||
// TODO: take delay from the error once it is available in the consumererror package. | ||
backoffDelay := expBackoff.NextBackOff() | ||
if backoffDelay == backoff.Stop { | ||
lc.logger.Error("Max elapsed time expired. Dropping data.", zap.Error(err), zap.Int("dropped_items", | ||
logs.LogRecordCount())) | ||
return err | ||
} | ||
|
||
backoffDelayStr := backoffDelay.String() | ||
span.AddEvent( | ||
"ConsumeLogs() failed. Will retry the request after interval.", | ||
trace.WithAttributes( | ||
attribute.String("interval", backoffDelayStr), | ||
attribute.String("error", err.Error()))) | ||
lc.logger.Debug( | ||
"ConsumeLogs() failed. Will retry the request after interval.", | ||
zap.Error(err), | ||
zap.String("interval", backoffDelayStr), | ||
zap.Int("logs_count", logs.LogRecordCount()), | ||
) | ||
retryNum++ | ||
|
||
// back-off, but get interrupted when shutting down or request is cancelled or timed out. | ||
select { | ||
case <-ctx.Done(): | ||
return fmt.Errorf("context is cancelled or timed out %w", err) | ||
case <-time.After(backoffDelay): | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
// Copyright The OpenTelemetry Authors | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
package consumerretry | ||
|
||
import ( | ||
"context" | ||
"errors" | ||
"testing" | ||
"time" | ||
|
||
"github.com/stretchr/testify/assert" | ||
"go.opentelemetry.io/collector/consumer/consumererror" | ||
"go.uber.org/zap" | ||
|
||
"github.com/open-telemetry/opentelemetry-collector-contrib/internal/coreinternal/testdata" | ||
) | ||
|
||
func TestConsumeLogs(t *testing.T) { | ||
tests := []struct { | ||
name string | ||
cfg Config | ||
consumer *MockLogsRejecter | ||
expectedErr error | ||
}{ | ||
{ | ||
name: "no_retry_success", | ||
expectedErr: nil, | ||
cfg: NewDefaultConfig(), | ||
consumer: NewMockLogsRejecter(0), | ||
}, | ||
{ | ||
name: "permanent_error", | ||
expectedErr: consumererror.NewPermanent(errors.New("permanent error")), | ||
cfg: Config{Enabled: true}, | ||
consumer: NewMockLogsRejecter(-1), | ||
}, | ||
{ | ||
name: "timeout_error", | ||
expectedErr: errors.New("retry later"), | ||
cfg: Config{ | ||
Enabled: true, | ||
InitialInterval: 1 * time.Millisecond, | ||
MaxInterval: 5 * time.Millisecond, | ||
MaxElapsedTime: 10 * time.Millisecond, | ||
}, | ||
consumer: NewMockLogsRejecter(20), | ||
}, | ||
{ | ||
name: "retry_success", | ||
expectedErr: nil, | ||
cfg: Config{ | ||
Enabled: true, | ||
InitialInterval: 1 * time.Millisecond, | ||
MaxInterval: 2 * time.Millisecond, | ||
MaxElapsedTime: 100 * time.Millisecond, | ||
}, | ||
consumer: NewMockLogsRejecter(5), | ||
}, | ||
} | ||
|
||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
consumer := NewLogs(tt.cfg, zap.NewNop(), tt.consumer) | ||
err := consumer.ConsumeLogs(context.Background(), testdata.GenerateLogsTwoLogRecordsSameResource()) | ||
assert.Equal(t, tt.expectedErr, err) | ||
if err == nil { | ||
assert.Equal(t, 1, len(tt.consumer.AllLogs())) | ||
assert.Equal(t, 2, tt.consumer.AllLogs()[0].LogRecordCount()) | ||
if tt.consumer.acceptAfter > 0 { | ||
assert.Equal(t, tt.consumer.rejectCount.Load(), tt.consumer.acceptAfter) | ||
} | ||
} else if tt.consumer.acceptAfter > 0 { | ||
assert.Less(t, tt.consumer.rejectCount.Load(), tt.consumer.acceptAfter) | ||
} | ||
}) | ||
} | ||
} | ||
|
||
func TestConsumeLogs_ContextDeadline(t *testing.T) { | ||
consumer := NewLogs(Config{ | ||
Enabled: true, | ||
InitialInterval: 1 * time.Millisecond, | ||
MaxInterval: 5 * time.Millisecond, | ||
MaxElapsedTime: 50 * time.Millisecond, | ||
}, zap.NewNop(), NewMockLogsRejecter(10)) | ||
|
||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Millisecond) | ||
defer cancel() | ||
err := consumer.ConsumeLogs(ctx, testdata.GenerateLogsTwoLogRecordsSameResource()) | ||
assert.Error(t, err) | ||
assert.Contains(t, err.Error(), "context is cancelled or timed out retry later") | ||
} | ||
|
||
func TestConsumeLogs_PartialRetry(t *testing.T) { | ||
sink := &mockPartialLogsRejecter{} | ||
consumer := NewLogs(Config{ | ||
Enabled: true, | ||
InitialInterval: 1 * time.Millisecond, | ||
MaxInterval: 5 * time.Millisecond, | ||
MaxElapsedTime: 50 * time.Millisecond, | ||
}, zap.NewNop(), sink) | ||
|
||
logs := testdata.GenerateLogsTwoLogRecordsSameResource() | ||
testdata.GenerateLogsOneLogRecordNoResource().ResourceLogs().MoveAndAppendTo(logs.ResourceLogs()) | ||
assert.NoError(t, consumer.ConsumeLogs(context.Background(), logs)) | ||
|
||
// Verify the logs batch is broken into two parts, one with the partial error and one without. | ||
assert.Equal(t, 2, len(sink.AllLogs())) | ||
assert.Equal(t, 1, sink.AllLogs()[0].ResourceLogs().Len()) | ||
assert.Equal(t, 2, sink.AllLogs()[0].LogRecordCount()) | ||
assert.Equal(t, 1, sink.AllLogs()[1].ResourceLogs().Len()) | ||
assert.Equal(t, 1, sink.AllLogs()[1].LogRecordCount()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
// Copyright The OpenTelemetry Authors | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
package consumerretry // import "github.com/open-telemetry/opentelemetry-collector-contrib/internal/coreinternal/consumerretry" | ||
|
||
import ( | ||
"context" | ||
"errors" | ||
"sync/atomic" | ||
|
||
"go.opentelemetry.io/collector/consumer/consumererror" | ||
"go.opentelemetry.io/collector/consumer/consumertest" | ||
"go.opentelemetry.io/collector/pdata/plog" | ||
) | ||
|
||
type MockLogsRejecter struct { | ||
consumertest.LogsSink | ||
rejectCount *atomic.Int32 | ||
acceptAfter int32 | ||
} | ||
|
||
// NewMockLogsRejecter creates new MockLogsRejecter. acceptAfter is a number of rejects before accepting, | ||
// 0 means always accept, -1 means always reject with permanent error | ||
func NewMockLogsRejecter(acceptAfter int32) *MockLogsRejecter { | ||
return &MockLogsRejecter{ | ||
acceptAfter: acceptAfter, | ||
rejectCount: &atomic.Int32{}, | ||
} | ||
} | ||
|
||
func (m *MockLogsRejecter) ConsumeLogs(ctx context.Context, logs plog.Logs) error { | ||
if m.acceptAfter < 0 { | ||
return consumererror.NewPermanent(errors.New("permanent error")) | ||
} | ||
if m.rejectCount.Load() < m.acceptAfter { | ||
m.rejectCount.Add(1) | ||
return errors.New("retry later") | ||
} | ||
return m.LogsSink.ConsumeLogs(ctx, logs) | ||
} | ||
|
||
// mockPartialLogsRejecter is a mock LogsConsumer that accepts only one logs object and rejects the rest. | ||
type mockPartialLogsRejecter struct { | ||
consumertest.LogsSink | ||
} | ||
|
||
func (m *mockPartialLogsRejecter) ConsumeLogs(ctx context.Context, logs plog.Logs) error { | ||
if logs.ResourceLogs().Len() <= 1 { | ||
return m.LogsSink.ConsumeLogs(ctx, logs) | ||
} | ||
accepted := plog.NewLogs() | ||
rejected := plog.NewLogs() | ||
logs.ResourceLogs().At(0).CopyTo(accepted.ResourceLogs().AppendEmpty()) | ||
for i := 1; i < logs.ResourceLogs().Len(); i++ { | ||
logs.ResourceLogs().At(i).CopyTo(rejected.ResourceLogs().AppendEmpty()) | ||
} | ||
_ = m.LogsSink.ConsumeLogs(ctx, accepted) | ||
return consumererror.NewLogs(errors.New("partial error"), rejected) | ||
} |
Oops, something went wrong.