Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make it possible to select supported devices in the operator #599

Merged
merged 7 commits into from
Apr 12, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions cmd/operator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,20 @@ NAME DESIRED READY NODE SELECTOR AGE
gpudeviceplugin-sample 1 1 5s
```

In order to limit the deployment to a specific device type,
use one of kustomizations under deployments/operator/device.

For example, to limit the deployment to FPGA, use:

```bash
$ kubectl apply -k deployments/operator/device/fpga
```

Operator also supports deployments with multiple selected device types.
In this case, create a new kustomization with the necessary resources
that passes the desired device types to the operator using `--device`
command line argument multiple times.

## Known issues

When the operator is run with leader election enabled, that is with the option
Expand All @@ -106,3 +120,7 @@ number of pods. Otherwise a heart beat used by the leader election code may trig
a timeout and crash. We are going to use different clients for the controller and
leader election code to alleviate the issue. See more details in
https://github.com/intel/intel-device-plugins-for-kubernetes/issues/476.

In case the deployment is limited to specific device type(s),
the CRDs for other device types are still created, but no controllers
for them are registered.
95 changes: 70 additions & 25 deletions cmd/operator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ package main

import (
"flag"
"fmt"
"os"
"strings"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
Expand Down Expand Up @@ -54,18 +56,58 @@ func init() {

type devicePluginControllerAndWebhook map[string](func(ctrl.Manager, string, bool) error)

type flagList []string

var supportedDevices = flagList{"dsa", "fpga", "gpu", "qat", "sgx"}
var devices flagList

func (flag *flagList) String() string {
return strings.Join(*flag, ", ")
}

func (flag *flagList) Set(value string) error {
if !contains(supportedDevices, value) {
setupLog.Error(nil, fmt.Sprintf("Unsupported device: %s", value))
os.Exit(1)
}

if contains(devices, value) {
setupLog.Error(nil, fmt.Sprintf("Duplicate device: %s", value))
os.Exit(1)
}

*flag = append(*flag, value)
return nil
}

func contains(arr []string, val string) bool {
for _, s := range arr {
mythi marked this conversation as resolved.
Show resolved Hide resolved
if s == val {
return true
}
}
return false
}

func main() {
var metricsAddr string
var devicePluginNamespace string
var enableLeaderElection bool
var pm *patcher.PatcherManager

ctrl.SetLogger(klogr.New())

flag.StringVar(&metricsAddr, "metrics-addr", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&devicePluginNamespace, "deviceplugin-namespace", metav1.NamespaceSystem, "The namespace where deviceplugin daemonsets are created")
flag.BoolVar(&enableLeaderElection, "enable-leader-election", false,
"Enable leader election for controller manager. "+
"Enabling this will ensure there is only one active controller manager.")
flag.Var(&devices, "devices", "Device(s) to set up.")
flag.Parse()

ctrl.SetLogger(klogr.New())
if len(devices) == 0 {
devices = supportedDevices
}

setupControllerAndWebhook := devicePluginControllerAndWebhook{
"dsa": dsa.SetupReconciler,
Expand Down Expand Up @@ -95,40 +137,43 @@ func main() {

withWebhook := true

// TODO(mythi): add a StringVar flag to select which controllers to enable
for _, device := range []string{"dsa", "fpga", "gpu", "qat", "sgx"} {
for _, device := range devices {
if err = setupControllerAndWebhook[device](mgr, ns, withWebhook); err != nil {
bart0sh marked this conversation as resolved.
Show resolved Hide resolved
setupLog.Error(err, "unable to initialize controller", "controller", device)
os.Exit(1)
}
}

pm := patcher.NewPatcherManager(mgr.GetLogger().WithName("webhooks").WithName("Fpga"))
if contains(devices, "sgx") {
mgr.GetWebhookServer().Register("/pods-sgx", &webhook.Admission{
Handler: &sgxwebhook.SgxMutator{Client: mgr.GetClient()},
})
}

mgr.GetWebhookServer().Register("/pods", &webhook.Admission{
Handler: admission.HandlerFunc(pm.GetPodMutator()),
})
if contains(devices, "fpga") {
pm = patcher.NewPatcherManager(mgr.GetLogger().WithName("webhooks").WithName("Fpga"))

mgr.GetWebhookServer().Register("/pods-sgx", &webhook.Admission{
Handler: &sgxwebhook.SgxMutator{Client: mgr.GetClient()},
})
mgr.GetWebhookServer().Register("/pods", &webhook.Admission{
Handler: admission.HandlerFunc(pm.GetPodMutator()),
})

if err = (&fpgacontroller.AcceleratorFunctionReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
PatcherManager: pm,
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "AcceleratorFunction")
os.Exit(1)
}
if err = (&fpgacontroller.AcceleratorFunctionReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
PatcherManager: pm,
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "AcceleratorFunction")
os.Exit(1)
}

if err = (&fpgacontroller.FpgaRegionReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
PatcherManager: pm,
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "FpgaRegion")
os.Exit(1)
if err = (&fpgacontroller.FpgaRegionReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
PatcherManager: pm,
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "FpgaRegion")
os.Exit(1)
}
}

setupLog.Info("starting manager")
Expand Down
14 changes: 14 additions & 0 deletions deployments/operator/device/dsa/dsa.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: inteldeviceplugins-controller-manager
namespace: inteldeviceplugins-system
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

was the namespace required for this to work?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have tried to omit it, however kustomize errors without namespace:

Error: no matches for IdId apps_v1_Deployment|~X|inteldeviceplugins-controller-manager; failed to find unique target for patch apps_v1_Deployment|inteldeviceplugins-controller-manager

spec:
template:
spec:
containers:
- args:
- --metrics-addr=127.0.0.1:8080
- --enable-leader-election
- --device=dsa
name: manager
5 changes: 5 additions & 0 deletions deployments/operator/device/dsa/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
bases:
- ../../default
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this also pulls in ../../crd/bases which creates the CRDs for all devices.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What would be a way to drop the unneeded CRDs?

I tried with the approach to pick only specific CRDs (for example for FPGA), however couldn't get it to work:

diff --git a/deployments/operator/device/fpga/kustomization.yaml b/deployments/operator/device/fpga/kustomization.yaml
index cea26f2..60de267 100644
--- a/deployments/operator/device/fpga/kustomization.yaml
+++ b/deployments/operator/device/fpga/kustomization.yaml
@@ -1,5 +1,16 @@
+namespace: inteldeviceplugins-system
+namePrefix: inteldeviceplugins-
+
+resources:
+- deviceplugin.intel.com_fpgadeviceplugins.yaml
+- fpga.intel.com_acceleratorfunctions.yaml
+- fpga.intel.com_fpgaregions.yaml
+
 bases:
- - ../../default
+- ../../rbac
+- ../../manager
+- ../../webhook
+- ../../certmanager
 
 patchesStrategicMerge:
  - fpga.yaml
# kustomize build deployments/operator/device/fpga
Error: no matches for IdId apps_v1_Deployment|inteldeviceplugins-system|inteldeviceplugins-controller-manager; failed to find unique target for patch apps_v1_Deployment|inteldeviceplugins-controller-manager

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What would be a way to drop the unneeded CRDs?

maybe the only way to do it is to have the CRDs created by the manager itself. This could follow the idea I demonstrate in #394 (comment)


patchesStrategicMerge:
- dsa.yaml
14 changes: 14 additions & 0 deletions deployments/operator/device/fpga/fpga.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: inteldeviceplugins-controller-manager
namespace: inteldeviceplugins-system
spec:
template:
spec:
containers:
- args:
- --metrics-addr=127.0.0.1:8080
- --enable-leader-election
- --device=fpga
name: manager
5 changes: 5 additions & 0 deletions deployments/operator/device/fpga/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
bases:
- ../../default

patchesStrategicMerge:
- fpga.yaml
14 changes: 14 additions & 0 deletions deployments/operator/device/gpu/gpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: inteldeviceplugins-controller-manager
namespace: inteldeviceplugins-system
spec:
template:
spec:
containers:
- args:
- --metrics-addr=127.0.0.1:8080
- --enable-leader-election
- --device=gpu
name: manager
5 changes: 5 additions & 0 deletions deployments/operator/device/gpu/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
bases:
- ../../default

patchesStrategicMerge:
- gpu.yaml
5 changes: 5 additions & 0 deletions deployments/operator/device/qat/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
bases:
- ../../default

patchesStrategicMerge:
- qat.yaml
14 changes: 14 additions & 0 deletions deployments/operator/device/qat/qat.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: inteldeviceplugins-controller-manager
namespace: inteldeviceplugins-system
spec:
template:
spec:
containers:
- args:
- --metrics-addr=127.0.0.1:8080
- --enable-leader-election
- --device=qat
name: manager
5 changes: 5 additions & 0 deletions deployments/operator/device/sgx/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
bases:
- ../../default

patchesStrategicMerge:
- sgx.yaml
14 changes: 14 additions & 0 deletions deployments/operator/device/sgx/sgx.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: inteldeviceplugins-controller-manager
namespace: inteldeviceplugins-system
spec:
template:
spec:
containers:
- args:
- --metrics-addr=127.0.0.1:8080
- --enable-leader-election
- --device=sgx
name: manager