Allow containers to be retried. (argoproj#661)
* Allow containers to be retried.

This commit allows `container` templates to be retried. The user
simply adds a 'retries' section to the template. Currently, this
section only has a 'limit' field that specifies the number of times the
container should be retried in case of failure.

Subsequent commits will add more features such as retry policies,
retries on specific errors, etc.

When a container template has a retries section, the workflow
controller adds an intermediate node to the workflow. This node
acts as a parent of all the retries.

The argo command line tool's 'get' command correctly shows the
intermediate node and the child nodes along with their pod
names and statuses.

Added unit tests that check the various state transitions of the
intermediate node based on the status of the child nodes (which
actually execute the pod).
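
As a rough illustration of the transitions these tests exercise, here is a standalone Go sketch. It is not code from this commit: the helper name assessRetryParent and the trimmed-down types are hypothetical stand-ins for the wfv1 types introduced below. The intermediate node keeps running while an attempt is in flight or retries remain, succeeds as soon as any attempt succeeds, and fails once the retry limit is exhausted.

package main

import "fmt"

// Simplified stand-ins for the node phases and statuses used by the controller.
type NodePhase string

const (
	NodeRunning   NodePhase = "Running"
	NodeSucceeded NodePhase = "Succeeded"
	NodeFailed    NodePhase = "Failed"
)

type childStatus struct {
	Phase NodePhase
}

// assessRetryParent (hypothetical helper) derives the phase of the intermediate
// retry node from its children. A nil limit means the container is retried
// until it succeeds.
func assessRetryParent(children []childStatus, limit *int32) NodePhase {
	failed := 0
	for _, c := range children {
		switch c.Phase {
		case NodeSucceeded:
			return NodeSucceeded // any successful attempt completes the retry node
		case NodeRunning:
			return NodeRunning // an attempt is still in flight
		case NodeFailed:
			failed++
		}
	}
	// Every attempt so far has failed; fail the parent once the limit is exhausted.
	if limit != nil && failed > int(*limit) {
		return NodeFailed
	}
	return NodeRunning // another attempt may still be scheduled
}

func main() {
	limit := int32(1)
	attempts := []childStatus{{Phase: NodeFailed}, {Phase: NodeFailed}}
	// Two failed attempts against a limit of 1 retry: the parent is marked Failed.
	fmt.Println(assessRetryParent(attempts, &limit))
}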

Testing Done.

* Unit tests succeeded.

* Ran a simple workflow that had retries. It completed, and the status
  of the nodes was correct.

* Ran workflows with parallel steps, each of which had containers
  with retries. The workflows completed correctly, and the retries
  were applied to each individual container correctly.

* Ran other workflows which did not have retries. They completed
  correctly.

* `argo get <wf-name>` shows the retries and the pod names.

* Incorporate review comments.

* Rename example files for retries.

* Update autogenerated code to include RetryStrategy.

* Call processNodeRetries from executeTemplate.

Earlier, this was getting called during podReconciliation. Calling
this from executeTemplate is better since other non-leaf nodes are
also processed in executeTemplate. podReconciliation should only
process leaf nodes (which are based on the execution of pods).
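
A minimal, self-contained sketch of the call ordering described above (the types and function signatures are simplified and hypothetical, not the controller's actual API): executeTemplate handles non-leaf nodes such as the retry parent, while podReconciliation only touches leaf nodes that map to pods.

package main

import "fmt"

// Hypothetical, simplified stand-in for a workflow node; the real controller
// operates on wfv1.NodeStatus and pod state.
type node struct {
	name       string
	isLeaf     bool
	hasRetries bool
}

// processNodeRetries assesses an intermediate retry node from its children.
func processNodeRetries(n *node) {
	fmt.Println("assessed retry node:", n.name)
}

// executeTemplate processes non-leaf nodes, including retry parents.
func executeTemplate(n *node) {
	if n.hasRetries {
		processNodeRetries(n)
	}
	fmt.Println("executed template for:", n.name)
}

// podReconciliation only updates leaf nodes, which are backed by pods.
func podReconciliation(nodes []*node) {
	for _, n := range nodes {
		if n.isLeaf {
			fmt.Println("reconciled pod for:", n.name)
		}
	}
}

func main() {
	retryParent := &node{name: "example(retry)", hasRetries: true}
	attempt := &node{name: "example(0)", isLeaf: true}
	executeTemplate(retryParent)
	podReconciliation([]*node{attempt})
}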

* Reduce failure probabilities of retry examples.

This will prevent spurious failures during e2e tests. Users wanting
to experiment with retries will have to explicitly change the
failure probability in the yamls.
shrinandj committed Jan 11, 2018
1 parent 80f9b1b commit 290f679
Showing 9 changed files with 452 additions and 70 deletions.
105 changes: 62 additions & 43 deletions cmd/argo/commands/get.go
@@ -148,57 +148,76 @@ func printNodeTree(w *tabwriter.Writer, wf *wfv1.Workflow, node wfv1.NodeStatus,
fmt.Fprintf(w, "%s%s\t%s\t%s\n", args...)
}

	if node.RetryStrategy != nil {
		for i, childNodeID := range node.Children {
			var part1, subp1 string
			subp1 = "  "
			if i > 0 && i < len(node.Children)-1 {
				part1 = "├-"
			} else {
				part1 = "└-"
			}
			childNode := wf.Status.Nodes[childNodeID]
			var part2, subp2 string
			part2 = "--"
			childNodePrefix := childPrefix + part1 + part2
			childChldPrefix := childPrefix + subp1 + subp2
			printNodeTree(w, wf, childNode, depth+1, childNodePrefix, childChldPrefix)
		}
	} else {
		// If the node has children, the node is a workflow template and
		// node.Children represent a list of parallel steps. We skip
		// a generation when recursing since the children nodes of workflow
		// templates represent a virtual step group, which are not worth printing.
		for i, stepGroupNodeID := range node.Children {
			lastStepGroup := bool(i == len(node.Children)-1)
			var part1, subp1 string
			if lastStepGroup {
				part1 = "└-"
				subp1 = "  "
			} else {
				part1 = "├-"
				subp1 = "| "
			}
			stepGroupNode := wf.Status.Nodes[stepGroupNodeID]
			for j, childNodeID := range stepGroupNode.Children {
				childNode := wf.Status.Nodes[childNodeID]
				if j > 0 {
					if lastStepGroup {
						part1 = "  "
					} else {
						part1 = "| "
					}
				}
				firstParallel := bool(j == 0)
				lastParallel := bool(j == len(stepGroupNode.Children)-1)
				var part2, subp2 string
				if firstParallel {
					if len(stepGroupNode.Children) == 1 {
						part2 = "--"
					} else {
						part2 = "·-"
					}
					if !lastParallel {
						subp2 = "| "
					} else {
						subp2 = "  "
					}
				} else if lastParallel {
					part2 = "└-"
					subp2 = "  "
				} else {
					part2 = "├-"
					subp2 = "| "
				}
				childNodePrefix := childPrefix + part1 + part2
				childChldPrefix := childPrefix + subp1 + subp2
				// Remove stepgroup name from being displayed
				childNode.Name = strings.TrimPrefix(childNode.Name, stepGroupNode.Name+".")
				printNodeTree(w, wf, childNode, depth+1, childNodePrefix, childChldPrefix)
			}
		}
	}
}
19 changes: 19 additions & 0 deletions examples/retry-container-to-completion.yaml
@@ -0,0 +1,19 @@
# This example demonstrates the use of infinite retries for running
# the container to completion. It uses the `random-fail` container.
# For more details, see
# https://github.com/shrinandj/random-fail

apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
  generateName: container-retries-
spec:
  entrypoint: container-retries
  templates:
  - name: container-retries
    retryStrategy: {}
    container:
      image: shrinand/random-fail
      command: ["python"]
      args: ["/run.py", "40"]

19 changes: 19 additions & 0 deletions examples/retry-container.yaml
@@ -0,0 +1,19 @@
# This example demonstrates the use of retries for a single container.
# It uses the `random-fail` container. For more details, see
# https://github.com/shrinandj/random-fail

apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
  generateName: container-retries-
spec:
  entrypoint: container-retries
  templates:
  - name: container-retries
    retryStrategy:
      limit: 4
    container:
      image: shrinand/random-fail
      command: ["python"]
      args: ["/run.py", "0"]

41 changes: 41 additions & 0 deletions examples/retry-with-steps.yaml
@@ -0,0 +1,41 @@
# This example demonstrates the use of retries with steps.
# It uses the `random-fail` container. For more details, see
# https://github.com/shrinandj/random-fail

apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
  generateName: retry-with-steps-
spec:
  entrypoint: hello-hello-hello
  templates:
  - name: hello-hello-hello
    steps:
    - - name: hello1
        template: random-fail
        arguments:
          parameters:
          - name: failPct
            value: "0"
    - - name: hello2a
        template: random-fail
        arguments:
          parameters:
          - name: failPct
            value: "0"
      - name: hello2b
        template: random-fail
        arguments:
          parameters:
          - name: failPct
            value: "0"
  - name: random-fail
    inputs:
      parameters:
      - name: failPct
    retryStrategy:
      limit: 4
    container:
      image: shrinand/random-fail
      command: ["python"]
      args: ["/run.py", "{{inputs.parameters.failPct}}"]
25 changes: 25 additions & 0 deletions pkg/apis/workflow/v1alpha1/types.go
@@ -128,6 +128,8 @@ type Template struct {
	// before the system actively tries to terminate the pod; value must be positive integer
	// This field is only applicable to container and script templates.
	ActiveDeadlineSeconds *int64 `json:"activeDeadlineSeconds,omitempty"`

	RetryStrategy *RetryStrategy `json:"retryStrategy,omitempty"`
}

// Inputs are the mechanism for passing parameters, artifacts, volumes from one template to another
@@ -272,6 +274,21 @@ type WorkflowStatus struct {
	PersistentVolumeClaims []apiv1.Volume `json:"persistentVolumeClaims,omitempty"`
}

// GetNodesWithRetries returns a list of nodes that have retries.
func (wfs *WorkflowStatus) GetNodesWithRetries() []NodeStatus {
	var nodesWithRetries []NodeStatus
	for _, node := range wfs.Nodes {
		if node.RetryStrategy != nil {
			nodesWithRetries = append(nodesWithRetries, node)
		}
	}
	return nodesWithRetries
}

type RetryStrategy struct {
	Limit *int32 `json:"limit,omitempty"`
}

type NodeStatus struct {
	// ID is a unique identifier of a node within the workflow
	// It is implemented as a hash of the node name, which makes the ID deterministic
@@ -300,6 +317,8 @@ type NodeStatus struct {
	// Daemoned tracks whether or not this node was daemoned and needs to be terminated
	Daemoned *bool `json:"daemoned,omitempty"`

	RetryStrategy *RetryStrategy `json:"retryStrategy,omitempty"`

	// Outputs captures output parameter values and artifact locations
	Outputs *Outputs `json:"outputs,omitempty"`

@@ -332,6 +351,12 @@ func (n NodeStatus) Successful() bool {
	return n.Phase == NodeSucceeded || n.Phase == NodeSkipped
}

// CanRetry returns whether the node should be retried or not.
func (n NodeStatus) CanRetry() bool {
	// TODO(shri): Check if there are some 'unretryable' errors.
	return n.Completed() && !n.Successful()
}

type S3Bucket struct {
	Endpoint string `json:"endpoint"`
	Bucket   string `json:"bucket"`
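
To make the intent of RetryStrategy.Limit and CanRetry concrete, here is a standalone sketch of how a controller might decide whether to schedule another attempt. The trimmed-down types and the helper shouldRetry are hypothetical illustrations, not the actual controller logic: a nil Limit (as in retryStrategy: {}) means retrying until success, and Limit counts only the retries, not the original run.

package main

import "fmt"

// Simplified stand-ins for the types added in this commit.
type NodePhase string

const (
	NodeRunning   NodePhase = "Running"
	NodeSucceeded NodePhase = "Succeeded"
	NodeFailed    NodePhase = "Failed"
)

type NodeStatus struct {
	Phase NodePhase
}

func (n NodeStatus) Completed() bool  { return n.Phase == NodeSucceeded || n.Phase == NodeFailed }
func (n NodeStatus) Successful() bool { return n.Phase == NodeSucceeded }

// CanRetry mirrors the method above: only completed, unsuccessful attempts are retry candidates.
func (n NodeStatus) CanRetry() bool { return n.Completed() && !n.Successful() }

type RetryStrategy struct {
	Limit *int32 // nil means no cap on the number of retries
}

// shouldRetry (hypothetical helper) decides whether another attempt should be
// scheduled, given the last attempt and the number of attempts made so far.
func shouldRetry(lastAttempt NodeStatus, attemptsSoFar int, rs RetryStrategy) bool {
	if !lastAttempt.CanRetry() {
		return false // still running, or already succeeded
	}
	if rs.Limit == nil {
		return true // retryStrategy: {} retries until the container succeeds
	}
	// attemptsSoFar includes the original run, so retries used = attemptsSoFar - 1.
	return attemptsSoFar-1 < int(*rs.Limit)
}

func main() {
	limit := int32(4)
	failed := NodeStatus{Phase: NodeFailed}
	fmt.Println(shouldRetry(failed, 1, RetryStrategy{Limit: &limit})) // true: first retry allowed
	fmt.Println(shouldRetry(failed, 4, RetryStrategy{Limit: &limit})) // true: fourth retry allowed
	fmt.Println(shouldRetry(failed, 5, RetryStrategy{Limit: &limit})) // false: all 4 retries used
	fmt.Println(shouldRetry(failed, 2, RetryStrategy{Limit: nil}))    // true: unlimited retries
}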
43 changes: 43 additions & 0 deletions pkg/apis/workflow/v1alpha1/zz_generated.deepcopy.go
@@ -262,6 +262,15 @@ func (in *NodeStatus) DeepCopyInto(out *NodeStatus) {
			**out = **in
		}
	}
	if in.RetryStrategy != nil {
		in, out := &in.RetryStrategy, &out.RetryStrategy
		if *in == nil {
			*out = nil
		} else {
			*out = new(RetryStrategy)
			(*in).DeepCopyInto(*out)
		}
	}
	if in.Outputs != nil {
		in, out := &in.Outputs, &out.Outputs
		if *in == nil {
@@ -387,6 +396,31 @@ func (in *ResourceTemplate) DeepCopy() *ResourceTemplate {
	return out
}

// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *RetryStrategy) DeepCopyInto(out *RetryStrategy) {
	*out = *in
	if in.Limit != nil {
		in, out := &in.Limit, &out.Limit
		if *in == nil {
			*out = nil
		} else {
			*out = new(int32)
			**out = **in
		}
	}
	return
}

// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RetryStrategy.
func (in *RetryStrategy) DeepCopy() *RetryStrategy {
	if in == nil {
		return nil
	}
	out := new(RetryStrategy)
	in.DeepCopyInto(out)
	return out
}

// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *S3Artifact) DeepCopyInto(out *S3Artifact) {
	*out = *in
@@ -581,6 +615,15 @@ func (in *Template) DeepCopyInto(out *Template) {
			**out = **in
		}
	}
	if in.RetryStrategy != nil {
		in, out := &in.RetryStrategy, &out.RetryStrategy
		if *in == nil {
			*out = nil
		} else {
			*out = new(RetryStrategy)
			(*in).DeepCopyInto(*out)
		}
	}
	return
}

