Skip to content

Commit

Permalink
[pkg/stanza/fileconsumer] Fix issue where buffer size could cause incorrect fingerprint update (open-telemetry#23183)
Browse files Browse the repository at this point in the history

The logic for determining when to update a fingerprint previously failed
to account for the case where the buffer size is smaller than the
fingerprint size. This allowed the fingerprint to be truncated in some cases.

This PR rewrites the logic to explicitly handle each expected case, with
optimization for handling the most common cases first.
  • Loading branch information
djaglowski committed Jun 26, 2023
1 parent 04327f5 commit fbe16e1
Show file tree
Hide file tree
Showing 9 changed files with 413 additions and 36 deletions.
20 changes: 20 additions & 0 deletions .chloggen/fileconsumer-fix-offset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Use this changelog template to create an entry for release notes.
# If your change doesn't affect end users, such as a test fix or a tooling change,
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.

# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: bug_fix

# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
component: pkg/stanza

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Fix issue where large fingerprint_size could cause duplication of logs

# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
issues: [22936]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext:
2 changes: 2 additions & 0 deletions pkg/stanza/fileconsumer/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
const (
defaultMaxLogSize = 1024 * 1024
defaultMaxConcurrentFiles = 1024
defaultBufSize = 16 * 1024
)

var allowFileDeletion = featuregate.GlobalRegistry().MustRegister(
Expand Down Expand Up @@ -148,6 +149,7 @@ func (c Config) buildManager(logger *zap.SugaredLogger, emit EmitFunc, factory s
readerConfig: &readerConfig{
fingerprintSize: int(c.FingerprintSize),
maxLogSize: int(c.MaxLogSize),
bufferSize: defaultBufSize,
emit: emit,
},
fromBeginning: startAtBeginning,
Expand Down
4 changes: 2 additions & 2 deletions pkg/stanza/fileconsumer/file_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -617,12 +617,12 @@ func TestIgnoreEmptyFiles(t *testing.T) {
temp4 := openTemp(t, tempDir)

writeString(t, temp, "testlog1\n")
writeString(t, temp3, "testlog2\n")
writeString(t, temp2, "testlog2\n")
operator.poll(context.Background())

waitForTokens(t, emitCalls, [][]byte{[]byte("testlog1"), []byte("testlog2")})

writeString(t, temp2, "testlog3\n")
writeString(t, temp3, "testlog3\n")
writeString(t, temp4, "testlog4\n")
operator.poll(context.Background())

Expand Down
87 changes: 62 additions & 25 deletions pkg/stanza/fileconsumer/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
type readerConfig struct {
fingerprintSize int
maxLogSize int
bufferSize int
emit EmitFunc
}

Expand Down Expand Up @@ -65,7 +66,11 @@ func (r *Reader) ReadToEnd(ctx context.Context) {
return
}

scanner := NewPositionalScanner(r, r.maxLogSize, r.Offset, r.splitFunc)
bufferSize := r.bufferSize
if r.bufferSize < r.fingerprintSize {
bufferSize = r.fingerprintSize
}
scanner := NewPositionalScanner(r, r.maxLogSize, bufferSize, r.Offset, r.splitFunc)

// Iterate over the tokenized file, emitting entries as we go
for {
Expand Down Expand Up @@ -104,7 +109,7 @@ func (r *Reader) ReadToEnd(ctx context.Context) {
return
}

scanner = NewPositionalScanner(r, r.maxLogSize, r.Offset, r.splitFunc)
scanner = NewPositionalScanner(r, r.maxLogSize, r.bufferSize, r.Offset, r.splitFunc)
}

r.Offset = scanner.Pos()
Expand Down Expand Up @@ -172,32 +177,64 @@ func (r *Reader) Close() {
}

// Read from the file and update the fingerprint if necessary
func (r *Reader) Read(dst []byte) (int, error) {
// Skip if fingerprint is already built
// or if fingerprint is behind Offset
if len(r.Fingerprint.FirstBytes) == r.fingerprintSize || int(r.Offset) > len(r.Fingerprint.FirstBytes) {
return r.file.Read(dst)
}
n, err := r.file.Read(dst)
appendCount := min0(n, r.fingerprintSize-int(r.Offset))
// return for n == 0 or r.Offset >= r.fileInput.fingerprintSize
if appendCount == 0 {
return n, err
}

// for appendCount==0, the following code would add `0` to fingerprint
r.Fingerprint.FirstBytes = append(r.Fingerprint.FirstBytes[:r.Offset], dst[:appendCount]...)
return n, err
}
func (r *Reader) Read(dst []byte) (n int, err error) {
n, err = r.file.Read(dst)

if len(r.Fingerprint.FirstBytes) == r.fingerprintSize {
// Steady state. Just return data to scanner.
return
}

func min0(a, b int) int {
if a < 0 || b < 0 {
return 0
if len(r.Fingerprint.FirstBytes) > r.fingerprintSize {
// Oversized fingerprint. The component was restarted with a decreased 'fingerprint_size'.
// Just return data to scanner.
return
}
if a < b {
return a

if int(r.Offset) > len(r.Fingerprint.FirstBytes) {
// Undersized fingerprint. The component was restarted with an increased 'fingerprint_size'.
// However, we've already read past the fingerprint. Just keep reading.
return
}

if len(r.Fingerprint.FirstBytes) == int(r.Offset) {
// The fingerprint is incomplete but is exactly aligned with the offset.
// Take advantage of the simple case and avoid some computation.
appendCount := r.fingerprintSize - len(r.Fingerprint.FirstBytes)
if appendCount > n {
appendCount = n
}
r.Fingerprint.FirstBytes = append(r.Fingerprint.FirstBytes, dst[:appendCount]...)
}

// The fingerprint is incomplete and is NOT aligned with the offset. This means the fingerprint
// contains data that hasn't yet been emitted. Either we observed an incomplete token at the end of the
// file, or we are running with 'start_at: beginning' in which case the fingerprint is initialized
// independently of the Reader.

// Allowing the fingerprint to run ahead of tokenization improves our ability to uniquely identify files.
// However, it also means we must compensate for the misalignment when appending to the fingerprint.

// WE MUST ASSUME that the fingerprint will never contain a token longer than the 'dst' buffer.
// The easiest way to enforce this is to ensure the buffer is at least as large as the fingerprint.
// Unfortunately, this must be enforced outside of this function.
// Without this guarantee, the scanner may call this function consecutively before we are able to update
// the offset, which means we cannot trust the offset to tell us which data in the 'dst' buffer has
// already been appended to the fingerprint.

newBytesIndex := len(r.Fingerprint.FirstBytes) - int(r.Offset)
if n <= newBytesIndex {
// Already have this data in the fingerprint. Just return data to scanner.
return
}

appendCount := r.fingerprintSize - len(r.Fingerprint.FirstBytes)
if appendCount > n-newBytesIndex {
// Not enough new data to complete the fingerprint, but append what we have.
appendCount = n - newBytesIndex
}
return b
r.Fingerprint.FirstBytes = append(r.Fingerprint.FirstBytes, dst[newBytesIndex:newBytesIndex+appendCount]...)
return
}

// mapCopy deep copies the provided attributes map.
Expand Down
Loading

0 comments on commit fbe16e1

Please sign in to comment.