fix docker logs tailer with file rotation #4860

Merged: 7 commits merged on Feb 13, 2020. Changes shown are from 6 commits.
4 changes: 4 additions & 0 deletions pkg/logs/auditor/auditor.go
@@ -68,15 +68,18 @@ func New(runPath string, health *health.Handle) *Auditor {

// Start starts the Auditor
func (a *Auditor) Start() {
a.mu.Lock()
a.inputChan = make(chan *message.Message, config.ChanSize)
a.done = make(chan struct{})
a.mu.Unlock()
a.registry = a.recoverRegistry()
a.cleanupRegistry()
go a.run()
}

// Stop stops the Auditor
func (a *Auditor) Stop() {
a.mu.Lock()
Contributor:

https://github.com/DataDog/datadog-agent/pull/4860/files#diff-5142632c5dd9990aa6feb0918f9334f7L85 is in the critical path, but I think it should not be, because any method that acquires the mutex can be called concurrently with the run method.

Contributor:

You could probably create setup and reset methods that are protected by the mutex.

Member:

I guess there's technically a chance of a deadlock if one of these tickers ticks right after Stop has locked the mutex here, since cleanupRegistry and flushRegistry both need to hold the lock. The chances of running into it are likely extremely low, so I'm not sure it's worth addressing. To avoid this, we could have a separate mutex for the registry.

Contributor (author):

I was also thinking of moving it into the New() function.

Contributor (author):

Makes sense. I added the new mutex.

if a.inputChan != nil {
close(a.inputChan)
}
@@ -86,6 +89,7 @@ func (a *Auditor) Stop() {
a.done = nil
}
a.inputChan = nil
a.mu.Unlock()

a.cleanupRegistry()
err := a.flushRegistry()
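The review thread above weighs whether registry maintenance (cleanupRegistry, flushRegistry) should contend on the same mutex that guards the channel lifecycle. Below is a minimal, hypothetical sketch of the "separate mutex for the registry" option the reviewer mentions; the field types and method bodies are simplified assumptions, not the real Auditor.

```go
package auditor

import "sync"

// auditorSketch is a simplified stand-in for the real Auditor.
type auditorSketch struct {
	mu         sync.Mutex // guards the channel lifecycle (inputChan, done)
	registryMu sync.Mutex // guards the registry used by ticker-driven maintenance

	inputChan chan struct{}
	done      chan struct{}
	registry  map[string]string
}

// cleanupRegistry can be called from a ticker in run() without touching mu.
func (a *auditorSketch) cleanupRegistry() {
	a.registryMu.Lock()
	defer a.registryMu.Unlock()
	// prune expired entries here
}

// Stop closes the channel under mu, then does registry work under registryMu,
// so a ticker firing at the same moment cannot deadlock against mu.
func (a *auditorSketch) Stop() {
	a.mu.Lock()
	if a.inputChan != nil {
		close(a.inputChan)
		a.inputChan = nil
	}
	a.mu.Unlock()

	a.cleanupRegistry()
}
```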
22 changes: 17 additions & 5 deletions pkg/logs/input/docker/launcher.go
@@ -132,9 +132,15 @@ func (l *Launcher) Start() {
func (l *Launcher) Stop() {
l.stop <- struct{}{}
stopper := restart.NewParallelStopper()
l.lock.Lock()
var containerIDs []string
for _, tailer := range l.tailers {
stopper.Add(tailer)
l.removeTailer(tailer.ContainerID)
containerIDs = append(containerIDs, tailer.ContainerID)
}
l.lock.Unlock()
for _, containerID := range containerIDs {
l.removeTailer(containerID)
}
stopper.Stop()
}
@@ -236,7 +242,7 @@ func (l *Launcher) overrideSource(container *Container, source *config.LogSource
// startTailer starts a new tailer for the container matching with the source.
func (l *Launcher) startTailer(container *Container, source *config.LogSource) {
containerID := container.service.Identifier
if _, isTailed := l.tailers[containerID]; isTailed {
if _, isTailed := l.getTailer(containerID); isTailed {
log.Warnf("Can't tail twice the same container: %v", ShortContainerID(containerID))
return
}
@@ -270,7 +276,7 @@ func (l *Launcher) startTailer(container *Container, source *config.LogSource) {

// stopTailer stops the tailer matching the containerID.
func (l *Launcher) stopTailer(containerID string) {
if tailer, isTailed := l.tailers[containerID]; isTailed {
if tailer, isTailed := l.getTailer(containerID); isTailed {
// No-op if the tailer source came from AD
if l.collectAllSource != nil {
l.collectAllSource.RemoveInput(containerID)
@@ -285,8 +291,7 @@ func (l *Launcher) restartTailer(containerID string) {
cumulatedBackoff := 0 * time.Second
var source *config.LogSource

oldTailer, exists := l.tailers[containerID]
if exists {
if oldTailer, exists := l.getTailer(containerID); exists {
source = oldTailer.source
if l.collectAllSource != nil {
l.collectAllSource.RemoveInput(containerID)
@@ -343,3 +348,10 @@ func (l *Launcher) removeTailer(containerID string) {
delete(l.tailers, containerID)
l.lock.Unlock()
}

func (l *Launcher) getTailer(containerID string) (*Tailer, bool) {
l.lock.Lock()
defer l.lock.Unlock()
tailer, exist := l.tailers[containerID]
return tailer, exist
}
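Stop() now snapshots the container IDs while holding the lock and only afterwards calls removeTailer, which takes the lock itself; doing the removal inside the locked loop would self-deadlock because sync.Mutex is not reentrant. A standalone, hypothetical illustration of that copy-under-lock pattern (the map value is a placeholder, not the real Tailer):

```go
package main

import (
	"fmt"
	"sync"
)

type launcherSketch struct {
	lock    sync.Mutex
	tailers map[string]string // containerID -> placeholder tailer
}

func (l *launcherSketch) Stop() {
	// Snapshot the keys while holding the lock...
	l.lock.Lock()
	ids := make([]string, 0, len(l.tailers))
	for id := range l.tailers {
		ids = append(ids, id)
	}
	l.lock.Unlock()

	// ...then mutate the map through a method that takes the lock itself,
	// which would deadlock if we were still holding it here.
	for _, id := range ids {
		l.removeTailer(id)
	}
}

func (l *launcherSketch) removeTailer(id string) {
	l.lock.Lock()
	delete(l.tailers, id)
	l.lock.Unlock()
}

func main() {
	l := &launcherSketch{tailers: map[string]string{"abc123": "tailer"}}
	l.Stop()
	fmt.Println(len(l.tailers)) // 0
}
```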
34 changes: 33 additions & 1 deletion pkg/logs/input/docker/reader.go
@@ -10,16 +10,48 @@ package docker
import (
"errors"
"io"
"time"
)

var errReaderNotInitialized = errors.New("reader not initialized")

const defaultBackoffDuration = time.Second
const maxBackoffDuration = 30 * time.Second

type safeReader struct {
reader io.ReadCloser

err error

backoffRetry int
backoffWaitDuration time.Duration
backoffDefaultDuration time.Duration
}

func newSafeReader() *safeReader {
return &safeReader{}
return &safeReader{
backoffDefaultDuration: defaultBackoffDuration,
}
}

func (s *safeReader) Success() {
s.err = nil
s.backoffRetry = 0
s.backoffWaitDuration = 0
}

func (s *safeReader) getBackoffAndIncrement() time.Duration {
if s.backoffWaitDuration == maxBackoffDuration {
return s.backoffWaitDuration
}
duration := s.backoffWaitDuration
s.backoffRetry++
s.backoffWaitDuration += time.Duration(s.backoffRetry) * s.backoffDefaultDuration
if s.backoffWaitDuration > maxBackoffDuration {
s.backoffWaitDuration = maxBackoffDuration
}

return duration
}

func (s *safeReader) setUnsafeReader(reader io.ReadCloser) {
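The increment above grows by the retry count on every call (a triangular progression rather than exponential doubling) and is capped at maxBackoffDuration. A small, self-contained sketch of the resulting schedule, assuming the 1-second default and the 30-second cap; it re-implements the arithmetic for illustration instead of calling the real safeReader:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	const (
		defaultBackoff = time.Second
		maxBackoff     = 30 * time.Second
	)
	var (
		retry int
		wait  time.Duration
	)
	for i := 0; i < 10; i++ {
		returned := wait // what getBackoffAndIncrement would hand back
		if wait != maxBackoff {
			retry++
			wait += time.Duration(retry) * defaultBackoff
			if wait > maxBackoff {
				wait = maxBackoff
			}
		}
		fmt.Printf("attempt %d: sleep %v (next wait %v)\n", i+1, returned, wait)
	}
	// Prints the triangular progression 0s, 1s, 3s, 6s, 10s, 15s, 21s, 28s,
	// then 30s once the cap is reached.
}
```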
68 changes: 68 additions & 0 deletions pkg/logs/input/docker/reader_test.go
@@ -12,6 +12,7 @@ import (
"errors"
"io"
"testing"
"time"

"github.com/stretchr/testify/assert"
)
@@ -97,3 +98,70 @@ func TestSafeReaderClose(t *testing.T) {
err = reader.Close()
assert.Equal(t, errReaderNotInitialized, err)
}

func Test_safeReader_getBackoffAndIncrement(t *testing.T) {
type fields struct {
backoffRetry int
backoffWaitDuration time.Duration
backoffDefaultDuration time.Duration
}
tests := []struct {
name string
fields fields
want time.Duration
wantRetry int
wantWaitDuration time.Duration
}{
{
name: "init backoff, should return 0",
fields: fields{
backoffRetry: 0,
backoffWaitDuration: 0,
backoffDefaultDuration: time.Second,
},
want: 0,
wantRetry: 1,
wantWaitDuration: time.Second,
},
{
name: "second backoff, should return 1",
fields: fields{
backoffRetry: 1,
backoffWaitDuration: time.Second,
backoffDefaultDuration: time.Second,
},
want: time.Second,
wantRetry: 2,
wantWaitDuration: 3 * time.Second,
},
{
name: "third backoff, should return 3",
fields: fields{
backoffRetry: 2,
backoffWaitDuration: 3 * time.Second,
backoffDefaultDuration: time.Second,
},
want: 3 * time.Second,
wantRetry: 3,
wantWaitDuration: 6 * time.Second,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
s := &safeReader{
backoffRetry: tt.fields.backoffRetry,
backoffWaitDuration: tt.fields.backoffWaitDuration,
backoffDefaultDuration: tt.fields.backoffDefaultDuration,
}
if got := s.getBackoffAndIncrement(); got != tt.want {
t.Errorf("safeReader.getBackoffAndIncrement() = %v, want %v", got, tt.want)
}
if s.backoffRetry != tt.wantRetry {
t.Errorf("safeReader.backoffRetry = %v, want %v", s.backoffRetry, tt.wantRetry)
}
if s.backoffWaitDuration != tt.wantWaitDuration {
t.Errorf("safeReader.backoffWaitDuration = %v, want %v", s.backoffWaitDuration, tt.wantWaitDuration)
}
})
}
}
9 changes: 9 additions & 0 deletions pkg/logs/input/docker/since.go
@@ -21,6 +21,8 @@ func Since(registry auditor.Registry, identifier string, creationTime service.Cr
var err error
offset := registry.GetOffset(identifier)
switch {
case isEOFCorruptedOffset(offset):
since = time.Time{}
case offset != "":
// an offset was registered, tail from the offset
since, err = time.Parse(config.DateFormat, offset)
@@ -36,3 +38,10 @@ func Since(registry auditor.Registry, identifier string, creationTime service.Cr
}
return since, err
}

// isEOFCorruptedOffset return true if the offset doesn't contain a
// valid timestamp value due to a file rotation.
func isEOFCorruptedOffset(offset string) bool {
// check if the offset value is equal to EOF char
return len(offset) > 0 && offset[0] == 0x03
}
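A quick, hypothetical illustration of how this check feeds into Since: an offset whose first byte is the 0x03 control character is discarded and the tailer falls back to the zero time, which makes Docker return logs from the container's start. The empty-offset and creation-time branches of the real function are simplified away here, and the parse layout is an assumption standing in for config.DateFormat.

```go
package main

import (
	"fmt"
	"time"
)

// isEOFCorruptedOffset mirrors the helper added above.
func isEOFCorruptedOffset(offset string) bool {
	return len(offset) > 0 && offset[0] == 0x03
}

// sinceFromOffset is a simplified stand-in for Since: it only shows the
// corrupted-offset fallback, not the service-creation-time branches.
func sinceFromOffset(offset string) time.Time {
	if offset == "" || isEOFCorruptedOffset(offset) {
		return time.Time{} // zero time: tail from the beginning
	}
	since, err := time.Parse(time.RFC3339Nano, offset) // assumed layout
	if err != nil {
		return time.Time{}
	}
	return since
}

func main() {
	fmt.Println(sinceFromOffset("2020-02-13T10:00:00.000000000Z")) // parsed timestamp
	fmt.Println(sinceFromOffset("\x03garbage"))                    // zero time: corrupted offset
}
```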
44 changes: 35 additions & 9 deletions pkg/logs/input/docker/tailer.go
@@ -28,7 +28,11 @@ import (
)

const defaultSleepDuration = 1 * time.Second
const readTimeout = 30 * time.Second
const defaultReadTimeout = 30 * time.Second

type dockerContainerLogInterface interface {
ContainerLogs(ctx context.Context, container string, options types.ContainerLogsOptions) (io.ReadCloser, error)
}

// Tailer tails logs coming from stdout and stderr of a docker container
// Logs from stdout and stderr are multiplexed into a single channel and needs to be demultiplexed later one.
@@ -38,10 +42,11 @@ type Tailer struct {
outputChan chan *message.Message
decoder *decoder.Decoder
reader *safeReader
cli *client.Client
cli dockerContainerLogInterface
source *config.LogSource
tagProvider tag.Provider

readTimeout time.Duration
sleepDuration time.Duration
shouldStop bool
stop chan struct{}
@@ -61,6 +66,7 @@ func NewTailer(cli *client.Client, containerID string, source *config.LogSource,
source: source,
tagProvider: tag.NewProvider(dockerutil.ContainerIDToTaggerEntityName(containerID)),
cli: cli,
readTimeout: defaultReadTimeout,
sleepDuration: defaultSleepDuration,
stop: make(chan struct{}, 1),
done: make(chan struct{}, 1),
@@ -132,9 +138,16 @@ func (t *Tailer) setupReader() error {
reader, err := t.cli.ContainerLogs(ctx, t.ContainerID, options)
t.reader.setUnsafeReader(reader)
t.cancelFunc = cancelFunc

return err
}

func (t *Tailer) restartReader() error {
backoffDuration := t.reader.getBackoffAndIncrement()
time.Sleep(backoffDuration)
return t.setupReader()
}

// tail sets up and starts the tailer
func (t *Tailer) tail(since string) error {
t.setLastSince(since)
@@ -159,13 +172,20 @@ func (t *Tailer) readForever() {
func (t *Tailer) readForever() {
defer t.decoder.Stop()
for {
if t.reader.err != nil {
err := t.restartReader()
if err != nil {
log.Debugf("unable to restart the Reader for container %v, ", ShortContainerID(t.ContainerID))
continue
}
}
select {
case <-t.stop:
// stop reading new logs from container
return
default:
inBuf := make([]byte, 4096)
n, err := t.read(inBuf, readTimeout)
n, err := t.read(inBuf, t.readTimeout)
if err != nil { // an error occurred, stop from reading new logs
switch {
case isReaderClosed(err):
Expand All @@ -185,19 +205,25 @@ func (t *Tailer) readForever() {
// This error is raised when the agent is stopping
return
case err == io.EOF:
Contributor:

Here I would have reused the same logic as in case isContextCanceled(err): and reused the wait() method to avoid hammering the Docker socket and driving CPU usage up.

Contributor (author), @clamoriniere, Feb 13, 2020:

Should we try only once to set up the reader, then ask the launcher to restart the tailer?

Member:

In general, does it make sense to have different retry strategies depending on the error (isContextCanceled, io.EOF, and default)? If so, why? (This is an honest question; I don't know these error types well enough to answer.)

Otherwise, I feel all errors that require a retry could use the same overall logic: retry setting up the reader, and if that fails, ask the launcher to restart the tailer.

Contributor (author):

I have the same feeling. PR updated in this direction.

Contributor:

Agreed.

// This error is raised when the container is stopping
// or when the container has not started to output logs yet.
// Retry to read to make sure all logs are collected
// or stop reading on the next iteration
// if the tailer has been stopped.
log.Debugf("No new logs are available for container %v", ShortContainerID(t.ContainerID))
// This error is raised when:
// * the container is stopping.
// * when the container has not started to output logs yet.
// * during a file rotation.
// restart the reader (by providing the error the t.reader)
// the reader will be restarted with a backoff policy.
t.source.Status.Error(fmt.Errorf("log decoder returns an EOF error that will trigger a Reader restart"))
log.Debugf("log decoder returns an EOF error that will trigger a Reader restart for container %v", ShortContainerID(t.ContainerID))
t.reader.err = err
continue
default:
t.source.Status.Error(err)
log.Errorf("Could not tail logs for container %v: %v", ShortContainerID(t.ContainerID), err)
t.erroredContainerID <- t.ContainerID
return
}
}
t.reader.Success()
t.source.Status.Success()
if n == 0 {
// wait for new data to come
t.wait()
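Tying the pieces together, readForever now treats io.EOF as recoverable: the error is stored on the safeReader and the top of the loop restarts the reader after the backoff sleep, while unrecoverable errors still hand the container back to the launcher. A hypothetical, stripped-down version of that control flow, where the reader, read call, and decoder are stand-ins for the real types:

```go
package main

import (
	"errors"
	"io"
	"log"
	"time"
)

// fakeReader stands in for safeReader: it records the last error and how long
// to back off before the next restart.
type fakeReader struct {
	err  error
	wait time.Duration
}

func (r *fakeReader) restart() error {
	time.Sleep(r.wait) // the real code sleeps for getBackoffAndIncrement()
	r.err = nil
	return nil
}

// readOnce stands in for t.read; here it always reports EOF.
func readOnce() (int, error) { return 0, io.EOF }

func readLoop(stop <-chan struct{}) {
	r := &fakeReader{wait: time.Millisecond}
	for {
		// If the previous iteration flagged an error, restart the reader first.
		if r.err != nil {
			if err := r.restart(); err != nil {
				log.Print("unable to restart the reader")
				continue
			}
		}
		select {
		case <-stop:
			return
		default:
			if _, err := readOnce(); errors.Is(err, io.EOF) {
				// EOF may mean "container stopping", "no logs yet", or a
				// file rotation: flag it so the next iteration restarts.
				r.err = err
				continue
			} else if err != nil {
				return // unrecoverable: the real code asks the launcher to restart the tailer
			}
			// success path: reset the backoff and feed the decoder in the real code
		}
	}
}

func main() {
	stop := make(chan struct{})
	go func() { time.Sleep(20 * time.Millisecond); close(stop) }()
	readLoop(stop)
}
```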