[receiver/filelog] Limit files consumed per polling cycle (#18477)
* Added max_batches configuration parameter to fileconsumer
Corbin Phelps committed Feb 9, 2023
1 parent 9312c51 commit 3aeced4
Showing 8 changed files with 159 additions and 1 deletion.
16 changes: 16 additions & 0 deletions .chloggen/filelog-max-batches.yaml
@@ -0,0 +1,16 @@
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: 'enhancement'

# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
component: filelogreceiver

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Added `max_batches` configuration parameter to limit the number of batches processed in a polling interval.

# One or more tracking issues related to the change
issues: [18476]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext:
1 change: 1 addition & 0 deletions pkg/stanza/docs/operators/file_input.md
@@ -22,6 +22,7 @@ The `file_input` operator reads logs from files. It will place the lines read in
| `fingerprint_size` | `1kb` | The number of bytes with which to identify a file. The first bytes in the file are used as the fingerprint. Decreasing this value at any point will cause existing fingerprints to be forgotten, meaning that all files will be read from the beginning (one time). |
| `max_log_size` | `1MiB` | The maximum size of a log entry to read before failing. Protects against reading large amounts of data into memory. |
| `max_concurrent_files` | 1024 | The maximum number of log files from which logs will be read concurrently (minimum = 2). If the number of files matched in the `include` pattern exceeds half of this number, then files will be processed in batches. |
| `max_batches` | 0 | Only applicable when files must be batched in order to respect `max_concurrent_files`. This value limits the number of batches that will be processed during a single poll interval. A value of 0 indicates no limit. |
| `delete_after_read` | `false` | If `true`, each log file will be read and then immediately deleted. Requires that the `filelog.allowFileDeletion` feature gate is enabled. |
| `attributes` | {} | A map of `key: value` pairs to add to the entry's attributes. |
| `resource` | {} | A map of `key: value` pairs to add to the entry's resource. |
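
As an illustration of how `max_batches` interacts with `max_concurrent_files`, here is a minimal `file_input` operator sketch (paths and values are hypothetical, not part of this change). Because the manager consumes files in batches of `max_concurrent_files / 2`, this configuration reads at most 10 files per poll cycle and skips any remaining matches for that cycle:

```yaml
- type: file_input
  include:
    - ./logs/*.log
  start_at: beginning
  max_concurrent_files: 10  # files are consumed in batches of 10 / 2 = 5
  max_batches: 2            # at most 2 batches (10 files) per poll cycle; 0 = no limit
```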
8 changes: 8 additions & 0 deletions pkg/stanza/fileconsumer/config.go
@@ -16,6 +16,7 @@ package fileconsumer // import "github.com/open-telemetry/opentelemetry-collecto

import (
"bufio"
"errors"
"fmt"
"time"

@@ -51,6 +52,7 @@ func NewConfig() *Config {
FingerprintSize: DefaultFingerprintSize,
MaxLogSize: defaultMaxLogSize,
MaxConcurrentFiles: defaultMaxConcurrentFiles,
MaxBatches: 0,
}
}

@@ -66,6 +68,7 @@ type Config struct {
FingerprintSize helper.ByteSize `mapstructure:"fingerprint_size,omitempty"`
MaxLogSize helper.ByteSize `mapstructure:"max_log_size,omitempty"`
MaxConcurrentFiles int `mapstructure:"max_concurrent_files,omitempty"`
MaxBatches int `mapstructure:"max_batches,omitempty"`
DeleteAfterRead bool `mapstructure:"delete_after_read,omitempty"`
Splitter helper.SplitterConfig `mapstructure:",squash,omitempty"`
}
@@ -139,6 +142,7 @@ func (c Config) buildManager(logger *zap.SugaredLogger, emit EmitFunc, factory s
roller: newRoller(),
pollInterval: c.PollInterval,
maxBatchFiles: c.MaxConcurrentFiles / 2,
maxBatches: c.MaxBatches,
deleteAfterRead: c.DeleteAfterRead,
knownFiles: make([]*Reader, 0, 10),
seenPaths: make(map[string]struct{}, 100),
@@ -182,6 +186,10 @@ func (c Config) validate() error {
return fmt.Errorf("`delete_after_read` cannot be used with `start_at: end`")
}

if c.MaxBatches < 0 {
return errors.New("`max_batches` must not be negative")
}

_, err := c.Splitter.EncodingConfig.Build()
if err != nil {
return err
26 changes: 26 additions & 0 deletions pkg/stanza/fileconsumer/config_test.go
@@ -352,6 +352,14 @@ func TestUnmarshal(t *testing.T) {
return newMockOperatorConfig(cfg)
}(),
},
{
Name: "max_batches_1",
Expect: func() *mockOperatorConfig {
cfg := NewConfig()
cfg.MaxBatches = 1
return newMockOperatorConfig(cfg)
}(),
},
},
}.Run(t)
}
@@ -492,6 +500,24 @@ func TestBuild(t *testing.T) {
require.Error,
nil,
},
{
"InvalidMaxBatches",
func(f *Config) {
f.MaxBatches = -1
},
require.Error,
nil,
},
{
"ValidMaxBatches",
func(f *Config) {
f.MaxBatches = 6
},
require.NoError,
func(t *testing.T, m *Manager) {
require.Equal(t, 6, m.maxBatches)
},
},
}

for _, tc := range cases {
13 changes: 13 additions & 0 deletions pkg/stanza/fileconsumer/file.go
@@ -41,6 +41,7 @@ type Manager struct {
persister operator.Persister

pollInterval time.Duration
maxBatches int
maxBatchFiles int
deleteAfterRead bool

@@ -112,10 +113,22 @@ func (m *Manager) poll(ctx context.Context) {
m.knownFiles[i].generation++
}

// Used to keep track of the number of batches processed in this poll cycle
batchesProcessed := 0

// Get the list of paths on disk
matches := m.finder.FindFiles()
for len(matches) > m.maxBatchFiles {
m.consume(ctx, matches[:m.maxBatchFiles])

// If maxBatches is set, check whether we have hit the limit
if m.maxBatches != 0 {
batchesProcessed++
if batchesProcessed >= m.maxBatches {
return
}
}

matches = matches[m.maxBatchFiles:]
}
m.consume(ctx, matches)
90 changes: 90 additions & 0 deletions pkg/stanza/fileconsumer/file_test.go
@@ -856,13 +856,16 @@ func TestFileBatching(t *testing.T) {
linesPerFile := 10
maxConcurrentFiles := 20
maxBatchFiles := maxConcurrentFiles / 2
// Explicitly setting maxBatches to ensure a value of 0 does not enforce a limit
maxBatches := 0

expectedBatches := files / maxBatchFiles // assumes no remainder

tempDir := t.TempDir()
cfg := NewConfig().includeDir(tempDir)
cfg.StartAt = "beginning"
cfg.MaxConcurrentFiles = maxConcurrentFiles
cfg.MaxBatches = maxBatches
emitCalls := make(chan *emitParams, files*linesPerFile)
operator := buildTestManagerWithEmit(t, cfg, emitCalls)
operator.persister = testutil.NewMockPersister("test")
@@ -1232,3 +1235,90 @@ func TestDeleteAfterRead(t *testing.T) {
require.True(t, os.IsNotExist(err))
}
}

func TestMaxBatching(t *testing.T) {
t.Parallel()

files := 50
linesPerFile := 10
maxConcurrentFiles := 20
maxBatchFiles := maxConcurrentFiles / 2
maxBatches := 2

expectedBatches := maxBatches
expectedMaxFilesPerPoll := maxBatches * maxBatchFiles

tempDir := t.TempDir()
cfg := NewConfig().includeDir(tempDir)
cfg.StartAt = "beginning"
cfg.MaxConcurrentFiles = maxConcurrentFiles
cfg.MaxBatches = maxBatches
emitCalls := make(chan *emitParams, files*linesPerFile)
operator := buildTestManagerWithEmit(t, cfg, emitCalls)
operator.persister = testutil.NewMockPersister("test")

core, observedLogs := observer.New(zap.DebugLevel)
operator.SugaredLogger = zap.New(core).Sugar()

temps := make([]*os.File, 0, files)
for i := 0; i < files; i++ {
temps = append(temps, openTemp(t, tempDir))
}

// Write logs to each file
numExpectedTokens := expectedMaxFilesPerPoll * linesPerFile
for i, temp := range temps {
for j := 0; j < linesPerFile; j++ {
message := fmt.Sprintf("%s %d %d", tokenWithLength(100), i, j)
_, err := temp.WriteString(message + "\n")
require.NoError(t, err)
}
}

// Poll and wait for all lines
operator.poll(context.Background())
actualTokens := make([][]byte, 0, numExpectedTokens)
actualTokens = append(actualTokens, waitForNTokens(t, emitCalls, numExpectedTokens)...)
require.Len(t, actualTokens, numExpectedTokens)

// During the first poll, we expect one log per batch and one log per file
require.Equal(t, expectedMaxFilesPerPoll+expectedBatches, observedLogs.Len())
logNum := 0
for b := 0; b < expectedBatches; b++ {
log := observedLogs.All()[logNum]
require.Equal(t, "Consuming files", log.Message)
require.Equal(t, zapcore.DebugLevel, log.Level)
logNum++

for f := 0; f < maxBatchFiles; f++ {
log = observedLogs.All()[logNum]
require.Equal(t, "Started watching file", log.Message)
require.Equal(t, zapcore.InfoLevel, log.Level)
logNum++
}
}

// Write more logs to each file so we can validate that all files are still known
for i, temp := range temps {
for j := 0; j < linesPerFile; j++ {
message := fmt.Sprintf("%s %d %d", tokenWithLength(20), i, j)
_, err := temp.WriteString(message + "\n")
require.NoError(t, err)
}
}

// Poll again and wait for all new lines
operator.poll(context.Background())
actualTokens = make([][]byte, 0, numExpectedTokens)
actualTokens = append(actualTokens, waitForNTokens(t, emitCalls, numExpectedTokens)...)
require.Len(t, actualTokens, numExpectedTokens)

// During the second poll, we only expect one log per batch
require.Equal(t, expectedMaxFilesPerPoll+expectedBatches*2, observedLogs.Len())
for b := logNum; b < observedLogs.Len(); b++ {
log := observedLogs.All()[logNum]
require.Equal(t, "Consuming files", log.Message)
require.Equal(t, zapcore.DebugLevel, log.Level)
logNum++
}
}
3 changes: 3 additions & 0 deletions pkg/stanza/fileconsumer/testdata/config.yaml
@@ -154,3 +154,6 @@ poll_interval_no_units:
start_at_string:
type: mock
start_at: "beginning"
max_batches_1:
type: mock
max_batches: 1
3 changes: 2 additions & 1 deletion receiver/filelogreceiver/README.md
@@ -25,7 +25,8 @@ Tails and parses logs from files.
| `poll_interval` | 200ms | The duration between filesystem polls |
| `fingerprint_size` | `1kb` | The number of bytes with which to identify a file. The first bytes in the file are used as the fingerprint. Decreasing this value at any point will cause existing fingerprints to be forgotten, meaning that all files will be read from the beginning (one time) |
| `max_log_size` | `1MiB` | The maximum size of a log entry to read. A log entry will be truncated if it is larger than `max_log_size`. Protects against reading large amounts of data into memory |
| `max_concurrent_files` | 1024 | The maximum number of log files from which logs will be read concurrently. If the number of files matched in the `include` pattern exceeds this number, then files will be processed in batches. One batch will be processed per `poll_interval` |
| `max_concurrent_files` | 1024 | The maximum number of log files from which logs will be read concurrently. If the number of files matched in the `include` pattern exceeds this number, then files will be processed in batches. |
| `max_batches` | 0 | Only applicable when files must be batched in order to respect `max_concurrent_files`. This value limits the number of batches that will be processed during a single poll interval. A value of 0 indicates no limit. |
| `delete_after_read` | `false` | If `true`, each log file will be read and then immediately deleted. Requires that the `filelog.allowFileDeletion` feature gate is enabled. |
| `attributes` | {} | A map of `key: value` pairs to add to the entry's attributes |
| `resource` | {} | A map of `key: value` pairs to add to the entry's resource |
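
For the receiver, a minimal collector configuration sketch (illustrative values only, assuming an otherwise default pipeline) showing the same limit at the `filelog` level:

```yaml
receivers:
  filelog:
    include: [ /var/log/app/*.log ]
    start_at: beginning
    max_concurrent_files: 10  # batch size is max_concurrent_files / 2 = 5 files
    max_batches: 2            # cap each poll_interval at 2 batches; 0 means unlimited
```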
