Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Ensure ContainerStatus in PNS is terminated before continuing #4469

Merged
merged 6 commits into from
Nov 10, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions test/e2e/functional_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -912,6 +912,38 @@ spec:
DeleteMemoryQuota()
}

func (s *FunctionalSuite) TestExitCodePNSSleep() {
s.Given().
Workflow(`apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
name: cond
labels:
argo-e2e: true
spec:
entrypoint: conditional-example
templates:
- name: conditional-example
steps:
- - name: print-hello
template: whalesay
- name: whalesay
container:
image: argoproj/argosay:v2
args: [sleep, 5s]
`).
When().
SubmitWorkflow().
Wait(10 * time.Second).
Then().
ExpectWorkflow(func(t *testing.T, metadata *metav1.ObjectMeta, status *wfv1.WorkflowStatus) {
node := status.Nodes.FindByDisplayName("print-hello")
if assert.NotNil(t, node) && assert.NotNil(t, node.Outputs) && assert.NotNil(t, node.Outputs.ExitCode) {
assert.Equal(t, "0", *node.Outputs.ExitCode)
}
})
}

func TestFunctionalSuite(t *testing.T) {
suite.Run(t, new(FunctionalSuite))
}
42 changes: 25 additions & 17 deletions workflow/executor/pns/pns.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,15 @@ import (
log "github.com/sirupsen/logrus"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"

"github.com/argoproj/argo/errors"
"github.com/argoproj/argo/util/archive"
"github.com/argoproj/argo/workflow/common"
execcommon "github.com/argoproj/argo/workflow/executor/common"
"github.com/argoproj/argo/workflow/executor/common/wait"
os_specific "github.com/argoproj/argo/workflow/executor/os-specific"
argowait "github.com/argoproj/argo/workflow/executor/common/wait"
osspecific "github.com/argoproj/argo/workflow/executor/os-specific"
)

type PNSExecutor struct {
Expand Down Expand Up @@ -97,7 +98,7 @@ func (p *PNSExecutor) enterChroot() error {
if err := p.mainFS.Chdir(); err != nil {
return errors.InternalWrapErrorf(err, "failed to chdir to main filesystem: %v", err)
}
err := os_specific.CallChroot()
err := osspecific.CallChroot()
if err != nil {
return errors.InternalWrapErrorf(err, "failed to chroot to main filesystem: %v", err)
}
Expand All @@ -109,7 +110,7 @@ func (p *PNSExecutor) exitChroot() error {
if err := p.rootFS.Chdir(); err != nil {
return errors.InternalWrapError(err)
}
err := os_specific.CallChroot()
err := osspecific.CallChroot()
if err != nil {
return errors.InternalWrapError(err)
}
Expand Down Expand Up @@ -167,7 +168,7 @@ func (p *PNSExecutor) Wait(containerID string) error {
log.Warnf("Ignoring wait failure: %v. Process assumed to have completed", err)
return nil
}
return wait.UntilTerminated(p.clientset, p.namespace, p.podName, containerID)
return argowait.UntilTerminated(p.clientset, p.namespace, p.podName, containerID)
}
log.Infof("Main pid identified as %d", mainPID)
for pid, f := range p.pidFileHandles {
Expand Down Expand Up @@ -228,7 +229,7 @@ func (p *PNSExecutor) GetOutputStream(containerID string, combinedOutput bool) (

func (p *PNSExecutor) GetExitCode(containerID string) (string, error) {
log.Infof("Getting exit code of %s", containerID)
_, containerStatus, err := p.GetContainerStatus(containerID)
_, containerStatus, err := p.GetTerminatedContainerStatus(containerID)
if err != nil {
return "", fmt.Errorf("could not get container status: %s", err)
}
Expand Down Expand Up @@ -368,18 +369,25 @@ func (p *PNSExecutor) updateCtrIDMap() {
}
}

func (p *PNSExecutor) GetContainerStatus(containerID string) (*corev1.Pod, *corev1.ContainerStatus, error) {
pod, err := p.clientset.CoreV1().Pods(p.namespace).Get(p.podName, metav1.GetOptions{})
if err != nil {
return nil, nil, fmt.Errorf("could not get pod: %s", err)
}
for _, containerStatus := range pod.Status.ContainerStatuses {
if execcommon.GetContainerID(&containerStatus) != containerID {
continue
func (p *PNSExecutor) GetTerminatedContainerStatus(containerID string) (*corev1.Pod, *corev1.ContainerStatus, error) {
var pod *corev1.Pod
var containerStatus *corev1.ContainerStatus
err := wait.Poll(1*time.Second, 3*time.Second, func() (bool, error) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this try 3 times over 3 seconds?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct. According to my manual testing, this should succeed on the first try around 95% of the time. It should almost always succeed by the second try.

podRes, err := p.clientset.CoreV1().Pods(p.namespace).Get(p.podName, metav1.GetOptions{})
if err != nil {
return false, fmt.Errorf("could not get pod: %s", err)
}
return pod, &containerStatus, nil
}
return nil, nil, errors.New(errors.CodeNotFound, fmt.Sprintf("containerID %q is not found in the pod %s", containerID, p.podName))
for _, containerStatusRes := range podRes.Status.ContainerStatuses {
if execcommon.GetContainerID(&containerStatusRes) != containerID {
continue
}
pod = podRes
containerStatus = &containerStatusRes
return containerStatus.State.Terminated != nil, nil
}
return false, errors.New(errors.CodeNotFound, fmt.Sprintf("containerID %q is not found in the pod %s", containerID, p.podName))
})
return pod, containerStatus, err
}

// parseContainerID parses the containerID of a pid
Expand Down