[FLINK-4829] snapshot accumulators on a best-effort basis

Heartbeats should not fail when the accumulators of a task cannot be snapshotted. Instead, we simply skip reporting the accumulators of the task whose snapshot failed and log a warning. Eventually, those accumulators will be reported; at the latest, when the job finishes. This closes apache#2649.
deepfield · Oct 18, 2016 · d95929e · d95929e
1 parent 783dca5
commit d95929e
Showing 1 changed file with 10 additions and 3 deletions.
diff --git a/flink-runtime/src/main/scala/org/apache/flink/runtime/taskmanager/TaskManager.scala b/flink-runtime/src/main/scala/org/apache/flink/runtime/taskmanager/TaskManager.scala
@@ -35,6 +35,7 @@ import com.codahale.metrics.jvm.{BufferPoolMetricSet, GarbageCollectorMetricSet,
 import com.codahale.metrics.{Gauge, MetricFilter, MetricRegistry}
 import com.fasterxml.jackson.databind.ObjectMapper
 import grizzled.slf4j.Logger
+import org.apache.commons.lang3.exception.ExceptionUtils
 import org.apache.flink.configuration._
 import org.apache.flink.core.fs.FileSystem
 import org.apache.flink.core.memory.{HeapMemorySegment, HybridMemorySegment, MemorySegmentFactory, MemoryType}
@@ -1335,9 +1336,15 @@ class TaskManager(
 
  runningTasks.asScala foreach {
  case (execID, task) =>
- val registry = task.getAccumulatorRegistry
- val accumulators = registry.getSnapshot
- accumulatorEvents.append(accumulators)
+ try {
+ val registry = task.getAccumulatorRegistry
+ val accumulators = registry.getSnapshot
+ accumulatorEvents.append(accumulators)
+ } catch {
+ case e: Exception =>
+ log.warn("Failed to take accumulator snapshot for task {}.",
+ execID, ExceptionUtils.getRootCause(e))
+ }
  }
 
  currentJobManager foreach {