Skip to content

Commit

Permalink
[FLINK-21190][runtime] Introduces exception history to web UI
Browse files Browse the repository at this point in the history
  • Loading branch information
XComp authored and zentol committed Mar 23, 2021
1 parent b6e0074 commit dd80c07
Show file tree
Hide file tree
Showing 25 changed files with 1,770 additions and 165 deletions.
6 changes: 6 additions & 0 deletions docs/layouts/shortcodes/generated/all_jobmanager_section.html
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@
<td>String</td>
<td>Directory for JobManager to store the archives of completed jobs.</td>
</tr>
<tr>
<td><h5>jobmanager.exception-history-size</h5></td>
<td style="word-wrap: break-word;">16</td>
<td>Integer</td>
<td>The maximum number of failures collected by the exception history per job.</td>
</tr>
<tr>
<td><h5>jobmanager.execution.attempts-history-size</h5></td>
<td style="word-wrap: break-word;">16</td>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@
<td>String</td>
<td>The local address of the network interface that the job manager binds to. If not configured, '0.0.0.0' will be used.</td>
</tr>
<tr>
<td><h5>jobmanager.exception-history-size</h5></td>
<td style="word-wrap: break-word;">16</td>
<td>Integer</td>
<td>The maximum number of failures collected by the exception history per job.</td>
</tr>
<tr>
<td><h5>jobmanager.execution.attempts-history-size</h5></td>
<td style="word-wrap: break-word;">16</td>
Expand Down
37 changes: 35 additions & 2 deletions docs/layouts/shortcodes/generated/rest_v1_dispatcher.html
Original file line number Diff line number Diff line change
Expand Up @@ -2630,7 +2630,7 @@
<td class="text-left">Response code: <code>200 OK</code></td>
</tr>
<tr>
<td colspan="2">Returns the non-recoverable exceptions that have been observed by the job. The truncated flag defines whether more exceptions occurred, but are not listed, because the response would otherwise get too big.</td>
<td colspan="2">Returns the most recent exceptions that have been handled by Flink for this job. The 'exceptionHistory.truncated' flag defines whether exceptions were filtered out through the GET parameter. The backend collects only a specific amount of most recent exceptions per job. This can be configured through jobmanager.exception-history-size in the Flink configuration. The following first-level members are deprecated: 'root-exception', 'timestamp', 'all-exceptions', and 'truncated'. Use the data provided through 'exceptionHistory' instead.</td>
</tr>
<tr>
<td colspan="2">Path parameters</td>
Expand Down Expand Up @@ -2685,7 +2685,7 @@
<code>
{
"type" : "object",
"id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:JobExceptionsInfo",
"id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:JobExceptionsInfoWithHistory",
"properties" : {
"all-exceptions" : {
"type" : "array",
Expand All @@ -2708,6 +2708,39 @@
}
}
},
"exceptionHistory" : {
"type" : "object",
"id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:JobExceptionsInfoWithHistory:JobExceptionHistory",
"properties" : {
"entries" : {
"type" : "array",
"items" : {
"type" : "object",
"id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:JobExceptionsInfoWithHistory:ExceptionInfo",
"properties" : {
"exceptionName" : {
"type" : "string"
},
"location" : {
"type" : "string"
},
"stacktrace" : {
"type" : "string"
},
"taskName" : {
"type" : "string"
},
"timestamp" : {
"type" : "integer"
}
}
}
},
"truncated" : {
"type" : "boolean"
}
}
},
"root-exception" : {
"type" : "string"
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,15 @@ public class JobManagerOptions {
.withDescription(
"The maximum number of prior execution attempts kept in history.");

/**
 * The maximum number of failures kept in the exception history per job. Exposed under the key
 * {@code jobmanager.exception-history-size}; defaults to 16.
 */
@Documentation.Section(Documentation.Sections.ALL_JOB_MANAGER)
public static final ConfigOption<Integer> MAX_EXCEPTION_HISTORY_SIZE =
key("jobmanager.exception-history-size")
.intType()
.defaultValue(16)
.withDescription(
"The maximum number of failures collected by the exception history per job.");

/**
* This option specifies the failover strategy, i.e. how the job computation recovers from task
* failures.
Expand Down
35 changes: 34 additions & 1 deletion flink-runtime-web/src/test/resources/rest_api_v1.snapshot
Original file line number Diff line number Diff line change
Expand Up @@ -1572,7 +1572,7 @@
},
"response" : {
"type" : "object",
"id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:JobExceptionsInfo",
"id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:JobExceptionsInfoWithHistory",
"properties" : {
"root-exception" : {
"type" : "string"
Expand Down Expand Up @@ -1603,6 +1603,39 @@
},
"truncated" : {
"type" : "boolean"
},
"exceptionHistory" : {
"type" : "object",
"id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:JobExceptionsInfoWithHistory:JobExceptionHistory",
"properties" : {
"entries" : {
"type" : "array",
"items" : {
"type" : "object",
"id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:JobExceptionsInfoWithHistory:ExceptionInfo",
"properties" : {
"exceptionName" : {
"type" : "string"
},
"stacktrace" : {
"type" : "string"
},
"timestamp" : {
"type" : "integer"
},
"taskName" : {
"type" : "string"
},
"location" : {
"type" : "string"
}
}
}
},
"truncated" : {
"type" : "boolean"
}
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ export interface JobExceptionInterface {
timestamp: number;
truncated: boolean;
'all-exceptions': JobExceptionItemInterface[];
'exceptionHistory': JobExceptionHistoryInterface;
}

export interface JobExceptionItemInterface {
Expand All @@ -32,3 +33,16 @@ export interface JobExceptionItemInterface {
timestamp: number;
'vertex-id': string;
}

/**
 * Shape of the `exceptionHistory` member of the job exceptions REST response.
 */
export interface JobExceptionHistoryInterface {
/** Collected failures; the exceptions view treats the first entry as the most recent one. */
entries: ExceptionInfoInterface[];
/**
 * Per the REST description, signals that exceptions were filtered out through the GET
 * parameter; the exceptions view uses it to decide whether to offer "Load More".
 */
truncated: boolean;
}

/**
 * A single failure entry of the exception history as delivered by the REST API.
 */
export interface ExceptionInfoInterface {
/** Name of the exception; shown in the "Exception" column of the exceptions table. */
exceptionName: string;
/** Stack trace text; rendered in the expandable editor row of the exceptions table. */
stacktrace: string;
/** Time of the failure (integer in the REST schema) — presumably epoch milliseconds; TODO confirm. */
timestamp: number;
/** Name of the failing task; the view falls back to "(global failure)" when this is empty. */
taskName: string;
/** Where the failure happened; the view falls back to "(unassigned)" when this is empty. */
location: string;
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
<tr>
<th nzShowExpand></th>
<th>Time</th>
<th>Exception</th>
<th>Name</th>
<th>Location</th>
</tr>
Expand All @@ -40,21 +41,28 @@
<tr>
<td nzShowExpand [(nzExpand)]="exception.expand"></td>
<td>{{exception.timestamp | date:'yyyy-MM-dd HH:mm:ss'}}</td>
<td><div class="name">{{exception.exceptionName}}</div></td>
<td>
<div class="name">
{{exception.task}}
{{exception.taskName || "(global failure)"}}
</div>
</td>
<td>{{exception.location}}</td>
<td>{{exception.location || "(unassigned)"}}</td>
</tr>
<tr [nzExpand]="exception.expand">
<td colspan="6" class="expand-td">
<flink-monaco-editor *ngIf="exception.expand" class="subtask" [value]="exception.exception"></flink-monaco-editor>
<td colspan="5" class="expand-td">
<flink-monaco-editor *ngIf="exception.expand" class="subtask" [value]="exception.stacktrace"></flink-monaco-editor>
</td>
</tr>
</ng-container>
<tr *ngIf="truncated">
<tr *ngIf="listOfException.length > 0">
<td colspan="6">
<i nz-icon nzType="info-circle" nzTheme="fill"></i>&nbsp;
<i>The exception history is limited to the most recent failures that caused parts of the job or the entire job to restart. The maximum history size can be configured through the Flink configuration.</i>
</td>
</tr>
<tr *ngIf="truncated">
<td colspan="5">
<button nz-button nzBlock nzType="primary" nzGhost (click)="loadMore()" [nzLoading]="isLoading">Load More</button>
</td>
</tr>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

import { formatDate } from '@angular/common';
import { Component, OnInit, ChangeDetectionStrategy, ChangeDetectorRef } from '@angular/core';
import { JobExceptionItemInterface } from 'interfaces';
import { ExceptionInfoInterface } from 'interfaces';
import { distinctUntilChanged, flatMap, tap } from 'rxjs/operators';
import { JobService } from 'services';

Expand All @@ -30,12 +30,13 @@ import { JobService } from 'services';
})
export class JobExceptionsComponent implements OnInit {
rootException = '';
listOfException: JobExceptionItemInterface[] = [];
listOfException: ExceptionInfoInterface[] = [];
truncated = false;
isLoading = false;
maxExceptions = 0;
total = 0;

trackExceptionBy(_: number, node: JobExceptionItemInterface) {
trackExceptionBy(_: number, node: ExceptionInfoInterface) {
return node.timestamp;
}
loadMore() {
Expand All @@ -52,13 +53,15 @@ export class JobExceptionsComponent implements OnInit {
)
.subscribe(data => {
// @ts-ignore
if (data['root-exception']) {
this.rootException = formatDate(data.timestamp, 'yyyy-MM-dd HH:mm:ss', 'en') + '\n' + data['root-exception'];
var exceptionHistory = data.exceptionHistory
if (exceptionHistory.entries.length > 0) {
var mostRecentException = exceptionHistory.entries[0]
this.rootException = formatDate(mostRecentException.timestamp, 'yyyy-MM-dd HH:mm:ss', 'en') + '\n' + mostRecentException.stacktrace;
} else {
this.rootException = 'No Root Exception';
}
this.truncated = data.truncated;
this.listOfException = data['all-exceptions'];
this.truncated = exceptionHistory.truncated;
this.listOfException = exceptionHistory.entries;
});
}

Expand Down
Loading

0 comments on commit dd80c07

Please sign in to comment.