Skip to content

Commit

Permalink
[pkg/stanza/fileconsumer] Fix issue where buffer size could cause incorrect fingerprint update (open-telemetry#23183)
Browse files Browse the repository at this point in the history

The logic for determining when to update a fingerprint previously failed
to account for the case where the buffer size is smaller than the
fingerprint size. This allowed the fingerprint to be truncated in some cases.

This PR rewrites the logic to explicitly handle each expected case, with
optimization for handling the most common cases first.
  • Loading branch information
djaglowski committed Jun 26, 2023
1 parent 04327f5 commit fbe16e1
Show file tree
Hide file tree
Showing 9 changed files with 413 additions and 36 deletions.
20 changes: 20 additions & 0 deletions .chloggen/fileconsumer-fix-offset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Use this changelog template to create an entry for release notes.
# If your change doesn't affect end users, such as a test fix or a tooling change,
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.

# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: bug_fix

# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
component: pkg/stanza

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Fix issue where large fingerprint_size could cause duplication of logs

# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
issues: [22936]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext:
2 changes: 2 additions & 0 deletions pkg/stanza/fileconsumer/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
const (
defaultMaxLogSize = 1024 * 1024
defaultMaxConcurrentFiles = 1024
defaultBufSize = 16 * 1024
)

var allowFileDeletion = featuregate.GlobalRegistry().MustRegister(
Expand Down Expand Up @@ -148,6 +149,7 @@ func (c Config) buildManager(logger *zap.SugaredLogger, emit EmitFunc, factory s
readerConfig: &readerConfig{
fingerprintSize: int(c.FingerprintSize),
maxLogSize: int(c.MaxLogSize),
bufferSize: defaultBufSize,
emit: emit,
},
fromBeginning: startAtBeginning,
Expand Down
4 changes: 2 additions & 2 deletions pkg/stanza/fileconsumer/file_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -617,12 +617,12 @@ func TestIgnoreEmptyFiles(t *testing.T) {
temp4 := openTemp(t, tempDir)

writeString(t, temp, "testlog1\n")
writeString(t, temp3, "testlog2\n")
writeString(t, temp2, "testlog2\n")
operator.poll(context.Background())

waitForTokens(t, emitCalls, [][]byte{[]byte("testlog1"), []byte("testlog2")})

writeString(t, temp2, "testlog3\n")
writeString(t, temp3, "testlog3\n")
writeString(t, temp4, "testlog4\n")
operator.poll(context.Background())

Expand Down
87 changes: 62 additions & 25 deletions pkg/stanza/fileconsumer/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
type readerConfig struct {
fingerprintSize int
maxLogSize int
bufferSize int
emit EmitFunc
}

Expand Down Expand Up @@ -65,7 +66,11 @@ func (r *Reader) ReadToEnd(ctx context.Context) {
return
}

scanner := NewPositionalScanner(r, r.maxLogSize, r.Offset, r.splitFunc)
bufferSize := r.bufferSize
if r.bufferSize < r.fingerprintSize {
bufferSize = r.fingerprintSize
}
scanner := NewPositionalScanner(r, r.maxLogSize, bufferSize, r.Offset, r.splitFunc)

// Iterate over the tokenized file, emitting entries as we go
for {
Expand Down Expand Up @@ -104,7 +109,7 @@ func (r *Reader) ReadToEnd(ctx context.Context) {
return
}

scanner = NewPositionalScanner(r, r.maxLogSize, r.Offset, r.splitFunc)
scanner = NewPositionalScanner(r, r.maxLogSize, r.bufferSize, r.Offset, r.splitFunc)
}

r.Offset = scanner.Pos()
Expand Down Expand Up @@ -172,32 +177,64 @@ func (r *Reader) Close() {
}

// Read from the file and update the fingerprint if necessary
func (r *Reader) Read(dst []byte) (int, error) {
// Skip if fingerprint is already built
// or if fingerprint is behind Offset
if len(r.Fingerprint.FirstBytes) == r.fingerprintSize || int(r.Offset) > len(r.Fingerprint.FirstBytes) {
return r.file.Read(dst)
}
n, err := r.file.Read(dst)
appendCount := min0(n, r.fingerprintSize-int(r.Offset))
// return for n == 0 or r.Offset >= r.fileInput.fingerprintSize
if appendCount == 0 {
return n, err
}

// for appendCount==0, the following code would add `0` to fingerprint
r.Fingerprint.FirstBytes = append(r.Fingerprint.FirstBytes[:r.Offset], dst[:appendCount]...)
return n, err
}
func (r *Reader) Read(dst []byte) (n int, err error) {
n, err = r.file.Read(dst)

if len(r.Fingerprint.FirstBytes) == r.fingerprintSize {
// Steady state. Just return data to scanner.
return
}

func min0(a, b int) int {
if a < 0 || b < 0 {
return 0
if len(r.Fingerprint.FirstBytes) > r.fingerprintSize {
// Oversized fingerprint. The component was restarted with a decreased 'fingerprint_size'.
// Just return data to scanner.
return
}
if a < b {
return a

if int(r.Offset) > len(r.Fingerprint.FirstBytes) {
// Undersized fingerprint. The component was restarted with an increased 'fingerprint_size'.
// However, we've already read past the fingerprint. Just keep reading.
return
}

if len(r.Fingerprint.FirstBytes) == int(r.Offset) {
// The fingerprint is incomplete but is exactly aligned with the offset.
// Take advantage of the simple case and avoid some computation.
appendCount := r.fingerprintSize - len(r.Fingerprint.FirstBytes)
if appendCount > n {
appendCount = n
}
r.Fingerprint.FirstBytes = append(r.Fingerprint.FirstBytes, dst[:appendCount]...)
}

// The fingerprint is incomplete and is NOT aligned with the offset. This means the fingerprint
// contains data that hasn't yet been emitted. Either we observed an incomplete token at the end of the
// file, or we are running with 'start_at: beginning' in which case the fingerprint is initialized
// independently of the Reader.

// Allowing the fingerprint to run ahead of tokenization improves our ability to uniquely identify files.
// However, it also means we must compensate for the misalignment when appending to the fingerprint.

// WE MUST ASSUME that the fingerprint will never contain a token longer than the 'dst' buffer.
// The easiest way to enforce this is to ensure the buffer is at least as large as the fingerprint.
// Unfortunately, this must be enforced outside of this function.
// Without this guarantee, the scanner may call this function consecutively before we are able to update
// the offset, which means we cannot trust the offset to tell us which data in the 'dst' buffer has
// already been appended to the fingerprint.

newBytesIndex := len(r.Fingerprint.FirstBytes) - int(r.Offset)
if n <= newBytesIndex {
// Already have this data in the fingerprint. Just return data to scanner.
return
}

appendCount := r.fingerprintSize - len(r.Fingerprint.FirstBytes)
if appendCount > n-newBytesIndex {
// Not enough new data to complete the fingerprint, but append what we have.
appendCount = n - newBytesIndex
}
return b
r.Fingerprint.FirstBytes = append(r.Fingerprint.FirstBytes, dst[newBytesIndex:newBytesIndex+appendCount]...)
return
}

// mapCopy deep copies the provided attributes map.
Expand Down
Loading

0 comments on commit fbe16e1

Please sign in to comment.