Skip to content

Commit

Permalink
[FLINK-12472][yarn] Support setting attemptFailuresValidityInterval of jobs on Yarn
Browse files Browse the repository at this point in the history

This closes apache#8400.
  • Loading branch information
jiasheng55 authored and tillrohrmann committed May 14, 2019
1 parent 2826ff8 commit 6231b18
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 2 deletions.
5 changes: 5 additions & 0 deletions docs/_includes/generated/yarn_config_configuration.html
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@
</tr>
</thead>
<tbody>
<tr>
<td><h5>yarn.application-attempt-failures-validity-interval</h5></td>
<td style="word-wrap: break-word;">10000</td>
<td>Time window in milliseconds which defines the number of application attempt failures when restarting the AM. Failures which fall outside of this window are not being considered. Set this value to -1 in order to count globally. See <a href="https://hortonworks.com/blog/apache-hadoop-yarn-hdp-2-2-fault-tolerance-features-long-running-services/">here</a> for more information.</td>
</tr>
<tr>
<td><h5>yarn.application-attempts</h5></td>
<td style="word-wrap: break-word;">(none)</td>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
import org.apache.flink.configuration.SecurityOptions;
import org.apache.flink.configuration.TaskManagerOptions;
import org.apache.flink.core.plugin.PluginUtils;
import org.apache.flink.runtime.akka.AkkaUtils;
import org.apache.flink.runtime.clusterframework.BootstrapTools;
import org.apache.flink.runtime.clusterframework.ContaineredTaskManagerParameters;
import org.apache.flink.runtime.entrypoint.ClusterEntrypoint;
Expand Down Expand Up @@ -1283,7 +1282,10 @@ private void activateHighAvailabilitySupport(ApplicationSubmissionContext appCon
ApplicationSubmissionContextReflector reflector = ApplicationSubmissionContextReflector.getInstance();

reflector.setKeepContainersAcrossApplicationAttempts(appContext, true);
reflector.setAttemptFailuresValidityInterval(appContext, AkkaUtils.getTimeout(flinkConfiguration).toMillis());

reflector.setAttemptFailuresValidityInterval(
appContext,
flinkConfiguration.getLong(YarnConfigOptions.APPLICATION_ATTEMPT_FAILURE_VALIDITY_INTERVAL));
}

private void setApplicationTags(final ApplicationSubmissionContext appContext) throws InvocationTargetException,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,19 @@ public class YarnConfigOptions {
" and the YARN Client will loose the connection. Also, the JobManager address will change and you’ll need" +
" to set the JM host:port manually. It is recommended to leave this option at 1.");

/**
 * Config option for the attempt-failures validity interval of a YARN application.
 *
 * <p>The value is a time window in milliseconds and defaults to 10000. As stated in the
 * option's description, application attempt failures that fall outside of this window are
 * not considered when restarting the AM, and a value of -1 counts failures globally.
 */
public static final ConfigOption<Long> APPLICATION_ATTEMPT_FAILURE_VALIDITY_INTERVAL =
key("yarn.application-attempt-failures-validity-interval")
.defaultValue(10000L)
.withDescription(Description.builder()
.text("Time window in milliseconds which defines the number of application attempt failures when restarting the AM. " +
"Failures which fall outside of this window are not being considered. " +
"Set this value to -1 in order to count globally. " +
"See %s for more information.", link("https://hortonworks.com/blog/apache-hadoop-yarn-hdp-2-2-fault-tolerance-features-long-running-services/", "here"))
.build());

/**
* The heartbeat interval between the Application Master and the YARN Resource Manager.
*/
Expand Down

0 comments on commit 6231b18

Please sign in to comment.