Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(executor): Handle sidecar killing in a process-namespace-shared pod #4575

Merged
merged 6 commits into from
Nov 24, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cmd/argoexec/commands/wait.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,15 @@ func NewWaitCommand() *cobra.Command {

func waitContainer() error {
wfExecutor := initExecutor()
defer wfExecutor.HandleError()
defer wfExecutor.HandleError() // Must be placed at the bottom of defers stack.
defer stats.LogStats()
stats.StartStatsTicker(5 * time.Minute)

defer func() {
// Killing sidecar containers
err := wfExecutor.KillSidecars()
if err != nil {
log.Errorf("Failed to kill sidecars: %s", err.Error())
wfExecutor.AddError(err)
}
}()

Expand Down
3 changes: 3 additions & 0 deletions workflow/executor/common/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@ func TerminatePodWithContainerID(c KubernetesClientInterface, containerID string
log.Infof("Container %s is already terminated: %v", container.ContainerID, container.State.Terminated.String())
return nil
}
if pod.Spec.ShareProcessNamespace != nil && *pod.Spec.ShareProcessNamespace {
return fmt.Errorf("cannot terminate a process-namespace-shared Pod %s", pod.Name)
Copy link
Member Author

@dtaniwaki dtaniwaki Nov 20, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it immediately returns an error here, the wait container exit with 0 and the pod succeeds. It's also the same in other error returns in this method too. Was it expected behavior?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

test?

Copy link
Member Author

@dtaniwaki dtaniwaki Nov 21, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a unit test of this method. I think we should have e2e tests for all the executors. If we had, this functional test had found the problem.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have e2e smoke test for 3 of the executors?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, you're right. The problem occurs on the combination of k8sapi and shareProcessNamespace.

}
if pod.Spec.HostPID {
return fmt.Errorf("cannot terminate a hostPID Pod %s", pod.Name)
}
Expand Down
121 changes: 121 additions & 0 deletions workflow/executor/common/common_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
package common

import (
"bytes"
"syscall"
"testing"

"github.com/stretchr/testify/assert"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/pointer"
)

type MockKC struct {
getContainerStatusPod *v1.Pod
getContainerStatusContainerStatus *v1.ContainerStatus
getContainerStatusErr error
killContainerError error
}

func (m *MockKC) GetContainerStatus(containerID string) (*v1.Pod, *v1.ContainerStatus, error) {
return m.getContainerStatusPod, m.getContainerStatusContainerStatus, m.getContainerStatusErr
}

func (m *MockKC) KillContainer(pod *v1.Pod, container *v1.ContainerStatus, sig syscall.Signal) error {
return m.killContainerError
}

func (*MockKC) CreateArchive(containerID, sourcePath string) (*bytes.Buffer, error) {
return nil, nil
}

// TestScriptTemplateWithVolume ensure we can a script pod with input artifacts
func TestTerminatePodWithContainerID(t *testing.T) {
// Already terminated.
mock := &MockKC{
getContainerStatusContainerStatus: &v1.ContainerStatus{
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{},
},
},
}
err := TerminatePodWithContainerID(mock, "container-id", syscall.SIGTERM)
assert.NoError(t, err)

// w/ ShareProcessNamespace.
mock = &MockKC{
getContainerStatusPod: &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "foo",
},
Spec: v1.PodSpec{
ShareProcessNamespace: pointer.BoolPtr(true),
},
},
getContainerStatusContainerStatus: &v1.ContainerStatus{
State: v1.ContainerState{
Terminated: nil,
},
},
}
err = TerminatePodWithContainerID(mock, "container-id", syscall.SIGTERM)
assert.EqualError(t, err, "cannot terminate a process-namespace-shared Pod foo")

// w/ HostPID.
mock = &MockKC{
getContainerStatusPod: &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "foo",
},
Spec: v1.PodSpec{
HostPID: true,
},
},
getContainerStatusContainerStatus: &v1.ContainerStatus{
State: v1.ContainerState{
Terminated: nil,
},
},
}
err = TerminatePodWithContainerID(mock, "container-id", syscall.SIGTERM)
assert.EqualError(t, err, "cannot terminate a hostPID Pod foo")

// w/ RestartPolicy.
mock = &MockKC{
getContainerStatusPod: &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "foo",
},
Spec: v1.PodSpec{
RestartPolicy: "Always",
},
},
getContainerStatusContainerStatus: &v1.ContainerStatus{
State: v1.ContainerState{
Terminated: nil,
},
},
}
err = TerminatePodWithContainerID(mock, "container-id", syscall.SIGTERM)
assert.EqualError(t, err, "cannot terminate pod with a \"Always\" restart policy")

// Successfully call KillContainer of the client interface.
mock = &MockKC{
getContainerStatusPod: &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "foo",
},
Spec: v1.PodSpec{
RestartPolicy: "Never",
},
},
getContainerStatusContainerStatus: &v1.ContainerStatus{
State: v1.ContainerState{
Terminated: nil,
},
},
}
err = TerminatePodWithContainerID(mock, "container-id", syscall.SIGTERM)
assert.NoError(t, err)
}