Added poll_file_limit configuration parameter to stanza fileconsumer

Signed-off-by: Corbin Phelps <[email protected]>
open-telemetry · djaglowski · Feb 9, 2023 · Feb 9, 2023 · Feb 9, 2023 · Feb 9, 2023
commit b15daeb76189619b4ac93a36433c3980667f0421
@@ -61,6 +61,7 @@ type Config struct {
  IncludeFilePath bool `mapstructure:"include_file_path,omitempty"`
  IncludeFileNameResolved bool `mapstructure:"include_file_name_resolved,omitempty"`
  IncludeFilePathResolved bool `mapstructure:"include_file_path_resolved,omitempty"`
+ PollFileLimit int `mapstructure:"poll_cycle_file_limit,omitempty"`
  PollInterval time.Duration `mapstructure:"poll_interval,omitempty"`
  StartAt string `mapstructure:"start_at,omitempty"`
  FingerprintSize helper.ByteSize `mapstructure:"fingerprint_size,omitempty"`
@@ -139,6 +140,7 @@ func (c Config) buildManager(logger *zap.SugaredLogger, emit EmitFunc, factory s
  roller: newRoller(),
  pollInterval: c.PollInterval,
  maxBatchFiles: c.MaxConcurrentFiles / 2,
+ pollFileLimit: c.PollFileLimit,
  deleteAfterRead: c.DeleteAfterRead,
  knownFiles: make([]*Reader, 0, 10),
  seenPaths: make(map[string]struct{}, 100),
@@ -182,6 +184,12 @@ func (c Config) validate() error {
  return fmt.Errorf("`delete_after_read` cannot be used with `start_at: end`")
  }
 
+ // Poll file limit can be 0 to signal unlimited or it must be greater than or equal to
+ // max concurrent files to ensure we can read at least the configured max concurrent files in a polling cycle.
+ if c.PollFileLimit < 0 || (c.PollFileLimit < c.MaxConcurrentFiles && c.PollFileLimit != 0) {
+ return fmt.Errorf("`poll_file_limit` must be 0 or greater than or equal to `max_concurrent_files`")
+ }
+
  _, err := c.Splitter.EncodingConfig.Build()
  if err != nil {
  return err

@@ -492,6 +492,34 @@ func TestBuild(t *testing.T) {
  require.Error,
  nil,
  },
+ {
+ "InvalidNegativePollFileLimit",
+ func(f *Config) {
+ f.PollFileLimit = -1
+ },
+ require.Error,
+ nil,
+ },
+ {
+ "InvalidTooSmallPollFileLimit",
+ func(f *Config) {
+ f.MaxConcurrentFiles = 5
+ f.PollFileLimit = 4
+ },
+ require.Error,
+ nil,
+ },
+ {
+ "ValidPollFileLimit",
+ func(f *Config) {
+ f.MaxConcurrentFiles = 5
+ f.PollFileLimit = 6
+ },
+ require.NoError,
+ func(t *testing.T, m *Manager) {
+ require.Equal(t, 6, m.pollFileLimit)
+ },
+ },
  }
 
  for _, tc := range cases {

@@ -42,6 +42,7 @@ type Manager struct {
 
  pollInterval time.Duration
  maxBatchFiles int
+ pollFileLimit int
  deleteAfterRead bool
 
  knownFiles []*Reader
@@ -112,10 +113,23 @@ func (m *Manager) poll(ctx context.Context) {
  m.knownFiles[i].generation++
  }
 
+ // Used to keep track of the number of files consumed in this poll cycle
+ filesConsumed := 0
+
  // Get the list of paths on disk
  matches := m.finder.FindFiles()
  for len(matches) > m.maxBatchFiles {
- m.consume(ctx, matches[:m.maxBatchFiles])
+ matchesToConsume := matches[:m.maxBatchFiles]
+ m.consume(ctx, matchesToConsume)
+
+ // If a pollFileLimit is set, check if we have consumed enough files
+ if m.pollFileLimit != 0 {
+ filesConsumed += len(matchesToConsume)
+ if filesConsumed >= m.pollFileLimit {
+ return
+ }
+ }
+
  matches = matches[m.maxBatchFiles:]
  }
  m.consume(ctx, matches)

@@ -856,13 +856,16 @@ func TestFileBatching(t *testing.T) {
  linesPerFile := 10
  maxConcurrentFiles := 20
  maxBatchFiles := maxConcurrentFiles / 2
+ // Explicitly setting pollFileLimit to ensure a value of 0 does not enforce a limit
+ pollFileLimit := 0
 
  expectedBatches := files / maxBatchFiles // assumes no remainder
 
  tempDir := t.TempDir()
  cfg := NewConfig().includeDir(tempDir)
  cfg.StartAt = "beginning"
  cfg.MaxConcurrentFiles = maxConcurrentFiles
+ cfg.PollFileLimit = pollFileLimit
  emitCalls := make(chan *emitParams, files*linesPerFile)
  operator := buildTestManagerWithEmit(t, cfg, emitCalls)
  operator.persister = testutil.NewMockPersister("test")
@@ -1232,3 +1235,93 @@ func TestDeleteAfterRead(t *testing.T) {
  require.True(t, os.IsNotExist(err))
  }
 }
+
+func TestPollCycleLimiting(t *testing.T) {
+ t.Parallel()
+
+ files := 50
+ linesPerFile := 10
+ maxConcurrentFiles := 20
+ maxBatchFiles := maxConcurrentFiles / 2
+ pollFileLimit := 40
+
+ expectedBatches := pollFileLimit / maxBatchFiles // assumes no remainder
+
+ tempDir := t.TempDir()
+ cfg := NewConfig().includeDir(tempDir)
+ cfg.StartAt = "beginning"
+ cfg.MaxConcurrentFiles = maxConcurrentFiles
+ cfg.PollFileLimit = pollFileLimit
+ emitCalls := make(chan *emitParams, files*linesPerFile)
+ operator := buildTestManagerWithEmit(t, cfg, emitCalls)
+ operator.persister = testutil.NewMockPersister("test")
+
+ core, observedLogs := observer.New(zap.DebugLevel)
+ operator.SugaredLogger = zap.New(core).Sugar()
+
+ // We only expect that pollFileLimit files are consumed
+ temps := make([]*os.File, 0, pollFileLimit)
+ for i := 0; i < pollFileLimit; i++ {
+ temps = append(temps, openTemp(t, tempDir))
+ }
+
+ // Write logs to each file
+ expectedTokens := make([][]byte, 0, files*linesPerFile)
+ for i, temp := range temps {
+ for j := 0; j < linesPerFile; j++ {
+ message := fmt.Sprintf("%s %d %d", tokenWithLength(100), i, j)
+ _, err := temp.WriteString(message + "\n")
+ require.NoError(t, err)
+ expectedTokens = append(expectedTokens, []byte(message))
+ }
+ }
+
+ // Poll and wait for all lines
+ operator.poll(context.Background())
+ actualTokens := make([][]byte, 0, files*linesPerFile)
+ actualTokens = append(actualTokens, waitForNTokens(t, emitCalls, len(expectedTokens))...)
+ require.ElementsMatch(t, expectedTokens, actualTokens)
+
+ // During the first poll, we expect one log per batch and one log per file
+ require.Equal(t, pollFileLimit+expectedBatches, observedLogs.Len())
+ logNum := 0
+ for b := 0; b < expectedBatches; b++ {
+ log := observedLogs.All()[logNum]
+ require.Equal(t, "Consuming files", log.Message)
+ require.Equal(t, zapcore.DebugLevel, log.Level)
+ logNum++
+
+ for f := 0; f < maxBatchFiles; f++ {
+ log = observedLogs.All()[logNum]
+ require.Equal(t, "Started watching file", log.Message)
+ require.Equal(t, zapcore.InfoLevel, log.Level)
+ logNum++
+ }
+ }
+
+ // Write more logs to each file so we can validate that all files are still known
+ expectedTokens = make([][]byte, 0, files*linesPerFile)
+ for i, temp := range temps {
+ for j := 0; j < linesPerFile; j++ {
+ message := fmt.Sprintf("%s %d %d", tokenWithLength(20), i, j)
+ _, err := temp.WriteString(message + "\n")
+ require.NoError(t, err)
+ expectedTokens = append(expectedTokens, []byte(message))
+ }
+ }
+
+ // Poll again and wait for all new lines
+ operator.poll(context.Background())
+ actualTokens = make([][]byte, 0, files*linesPerFile)
+ actualTokens = append(actualTokens, waitForNTokens(t, emitCalls, len(expectedTokens))...)
+ require.ElementsMatch(t, expectedTokens, actualTokens)
+
+ // During the second poll, we only expect one log per batch
+ require.Equal(t, pollFileLimit+expectedBatches*2, observedLogs.Len())
+ for b := logNum; b < observedLogs.Len(); b++ {
+ log := observedLogs.All()[logNum]
+ require.Equal(t, "Consuming files", log.Message)
+ require.Equal(t, zapcore.DebugLevel, log.Level)
+ logNum++
+ }
+}