Skip to content

Commit

Permalink
Fix device-plugin/worker pod labels validationsduring ordered-upgrade (
Browse files Browse the repository at this point in the history
…#592)

This PR inforces the correct ordering of add/removing version labels for
worker pods and device-plugin. The correct sequence should always be:
1) remove old version device-plugin label (if exists)
2) remove old version worker pod label (only in case no relevant device-plugin pod
   is present)
3) add new version worker pod label (only in case the kernel module is
   not loaded)
4) add new version device-plugin label ( if needed)

We are stalling for worker-pod labels in order to make sure that new
device-label pod will not be scheduled prior to new version of kernel
module being loaded. Otherwise, old version will never be able to be
removed from the kernel
  • Loading branch information
yevgeny-shnaidman authored Oct 5, 2023
1 parent 20fe966 commit 06f403b
Show file tree
Hide file tree
Showing 3 changed files with 163 additions and 25 deletions.
23 changes: 19 additions & 4 deletions internal/controllers/mock_node_label_module_version_reconciler.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

48 changes: 39 additions & 9 deletions internal/controllers/node_label_module_version_reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@ func (nlmvr *NodeLabelModuleVersionReconciler) Reconcile(ctx context.Context, re
return ctrl.Result{}, fmt.Errorf("could get device plugin pods for the node %s: %v", node.Name, err)
}

reconLabelsRes := nlmvr.helperAPI.reconcileLabels(modulesVersionLabels, devicePluginPods)
loadedKernelModules := nlmvr.helperAPI.getLoadedKernelModules(node.GetLabels())

reconLabelsRes := nlmvr.helperAPI.reconcileLabels(modulesVersionLabels, devicePluginPods, loadedKernelModules)

logger := log.FromContext(ctx).WithValues("node name", node.Name)
logLabelsUpdateData(logger, reconLabelsRes)
Expand All @@ -80,7 +82,8 @@ func (nlmvr *NodeLabelModuleVersionReconciler) Reconcile(ctx context.Context, re
type nodeLabelModuleVersionHelperAPI interface {
getLabelsPerModules(ctx context.Context, nodeLabels map[string]string) map[string]*modulesVersionLabels
getDevicePluginPods(ctx context.Context, nodeName string) ([]v1.Pod, error)
reconcileLabels(modulesLabels map[string]*modulesVersionLabels, devicePluginPods []v1.Pod) *reconcileLabelsResult
getLoadedKernelModules(labels map[string]string) []types.NamespacedName
reconcileLabels(modulesLabels map[string]*modulesVersionLabels, devicePluginPods []v1.Pod, kernelModuleReadyLabels []types.NamespacedName) *reconcileLabelsResult
updateNodeLabels(ctx context.Context, nodeName string, reconLabelsRes *reconcileLabelsResult) error
}

Expand Down Expand Up @@ -141,6 +144,17 @@ func (nlmvha *nodeLabelModuleVersionHelper) getDevicePluginPods(ctx context.Cont
return devicePluginPods, nil
}

func (nlmvha *nodeLabelModuleVersionHelper) getLoadedKernelModules(nodeLabels map[string]string) []types.NamespacedName {
loadedKernelModules := make([]types.NamespacedName, 0, len(nodeLabels))
for label := range nodeLabels {
isReadyLabel, namespace, name := utils.IsKernelModuleReadyNodeLabel(label)
if isReadyLabel {
loadedKernelModules = append(loadedKernelModules, types.NamespacedName{Namespace: namespace, Name: name})
}
}
return loadedKernelModules
}

func (nlmvha *nodeLabelModuleVersionHelper) updateNodeLabels(ctx context.Context, nodeName string, reconLabelsRes *reconcileLabelsResult) error {
node := v1.Node{}

Expand All @@ -165,7 +179,8 @@ func (nlmvha *nodeLabelModuleVersionHelper) updateNodeLabels(ctx context.Context
}

func (nlmvha *nodeLabelModuleVersionHelper) reconcileLabels(modulesLabels map[string]*modulesVersionLabels,
devicePluginPods []v1.Pod) *reconcileLabelsResult {
devicePluginPods []v1.Pod,
loadedKernelModules []types.NamespacedName) *reconcileLabelsResult {

reconRes := reconcileLabelsResult{
labelsToAdd: map[string]string{},
Expand All @@ -175,14 +190,17 @@ func (nlmvha *nodeLabelModuleVersionHelper) reconcileLabels(modulesLabels map[st
label, labelValue, action := getLabelAndAction(moduleLabels)
switch action {
case deleteAction:
validAction := verifyLabelActionValidity(moduleLabels.name, moduleLabels.namespace, devicePluginPods)
if validAction {
reconRes.labelsToDelete = append(reconRes.labelsToDelete, label)
} else {
if utils.IsWorkerPodVersionLabel(label) && !verifyLabelDeleteValidity(moduleLabels.name, moduleLabels.namespace, devicePluginPods) {
reconRes.requeue = true
} else {
reconRes.labelsToDelete = append(reconRes.labelsToDelete, label)
}
case addAction:
reconRes.labelsToAdd[label] = labelValue
if utils.IsWorkerPodVersionLabel(label) && !verifyLabelAddValidity(moduleLabels.name, moduleLabels.namespace, loadedKernelModules) {
reconRes.requeue = true
} else {
reconRes.labelsToAdd[label] = labelValue
}
}
}

Expand All @@ -192,7 +210,7 @@ func (nlmvha *nodeLabelModuleVersionHelper) reconcileLabels(modulesLabels map[st
// validity is checked by verifying that devicePlugin pod defined
// by name, namespace,role is missing. In case the pod is present, no matter in
// what state, then the action is invalid
func verifyLabelActionValidity(name, namespace string, devicePluginPods []v1.Pod) bool {
func verifyLabelDeleteValidity(name, namespace string, devicePluginPods []v1.Pod) bool {
for _, pod := range devicePluginPods {
podLabels := pod.GetLabels()
if pod.Namespace == namespace && podLabels[constants.ModuleNameLabel] == name {
Expand All @@ -202,6 +220,18 @@ func verifyLabelActionValidity(name, namespace string, devicePluginPods []v1.Pod
return true
}

// validity is checked by verifying that kernel module defined
// by name and namespace is not loaded. In case the kernel module is loaded, then the action is invalid
func verifyLabelAddValidity(name, namespace string, loadedKernelModules []types.NamespacedName) bool {
nsn := types.NamespacedName{Name: name, Namespace: namespace}
for _, kernelModule := range loadedKernelModules {
if kernelModule == nsn {
return false
}
}
return true
}

// returns the label, value of the label and action to execute on label (add or delete)
func getLabelAndAction(moduleLabels *modulesVersionLabels) (string, string, string) {
labelActionKey := getModuleVersionLabelsState(moduleLabels)
Expand Down
117 changes: 105 additions & 12 deletions internal/controllers/node_label_module_version_reconciler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,22 +44,29 @@ var _ = Describe("Reconcile", func() {
}
req := ctrl.Request{NamespacedName: nn}
nodeLabels := map[string]string{"some label": "some label value"}
testNode := v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: nodeName,
Labels: nodeLabels,
},
}

DescribeTable("reconciler flow", func(getNodeError, getMLPodsError, upDateNodeLabelsErrors, requeue bool) {
labelsPerModules := map[string]*modulesVersionLabels{
"moduleNameNamespace": &modulesVersionLabels{name: "name", namespace: "namespace"},
}
devicePluginPods := []v1.Pod{v1.Pod{}}
loadedKernelModules := []types.NamespacedName{{Name: "some name", Namespace: "some namespace"}}
reconcileLabelsResult := &reconcileLabelsResult{requeue: requeue}
expectedRes := ctrl.Result{Requeue: requeue}
if getNodeError {
kubeClient.EXPECT().Get(ctx, nn, gomock.AssignableToTypeOf(&v1.Node{})).Return(fmt.Errorf("some error"))
kubeClient.EXPECT().Get(ctx, nn, gomock.AssignableToTypeOf(&testNode)).Return(fmt.Errorf("some error"))
goto executeTestFunction
}
kubeClient.EXPECT().Get(ctx, req.NamespacedName, gomock.Any()).Do(
func(_ interface{}, _ interface{}, node *v1.Node, _ ...client.GetOption) {
node.SetLabels(nodeLabels)
node.Name = nodeName
node.SetLabels(testNode.Labels)
node.Name = testNode.Name
},
)
mockHelper.EXPECT().getLabelsPerModules(ctx, nodeLabels).Return(labelsPerModules)
Expand All @@ -68,7 +75,8 @@ var _ = Describe("Reconcile", func() {
goto executeTestFunction
}
mockHelper.EXPECT().getDevicePluginPods(ctx, nodeName).Return(devicePluginPods, nil)
mockHelper.EXPECT().reconcileLabels(labelsPerModules, devicePluginPods).Return(reconcileLabelsResult)
mockHelper.EXPECT().getLoadedKernelModules(nodeLabels).Return(loadedKernelModules)
mockHelper.EXPECT().reconcileLabels(labelsPerModules, devicePluginPods, loadedKernelModules).Return(reconcileLabelsResult)
if upDateNodeLabelsErrors {
mockHelper.EXPECT().updateNodeLabels(ctx, nodeName, reconcileLabelsResult).Return(fmt.Errorf("some error"))
goto executeTestFunction
Expand Down Expand Up @@ -259,7 +267,7 @@ var _ = Describe("reconcileLabels", func() {
ObjectMeta: metav1.ObjectMeta{Namespace: "moduleNamespace"},
}

It("delete label scenario with device plugin pod not present", func() {
It("delete device-plugin label, device plugin pod not present", func() {
moduleLabels := &modulesVersionLabels{
name: "moduleName",
namespace: "moduleNamespace",
Expand All @@ -268,14 +276,30 @@ var _ = Describe("reconcileLabels", func() {
devicePluginVersionLabel: "1",
}

res := helper.reconcileLabels(map[string]*modulesVersionLabels{"key": moduleLabels}, []v1.Pod{pod})
res := helper.reconcileLabels(map[string]*modulesVersionLabels{"key": moduleLabels}, []v1.Pod{pod}, nil)

Expect(res.requeue).To(BeFalse())
Expect(len(res.labelsToAdd)).To(Equal(0))
Expect(res.labelsToDelete).To(Equal([]string{utils.GetDevicePluginVersionLabelName("moduleNamespace", "moduleName")}))
})

It("delete label scenario with device plugin pod present", func() {
It("delete worker pod label, device plugin pod not present", func() {
moduleLabels := &modulesVersionLabels{
name: "moduleName",
namespace: "moduleNamespace",
moduleVersionLabel: "",
workerPodVersionLabel: "1",
devicePluginVersionLabel: "",
}

res := helper.reconcileLabels(map[string]*modulesVersionLabels{"key": moduleLabels}, []v1.Pod{pod}, nil)

Expect(res.requeue).To(BeFalse())
Expect(len(res.labelsToAdd)).To(Equal(0))
Expect(res.labelsToDelete).To(Equal([]string{utils.GetWorkerPodVersionLabelName("moduleNamespace", "moduleName")}))
})

It("delete device-plugin label, device plugin pod present", func() {
moduleLabels := &modulesVersionLabels{
name: "moduleName",
namespace: "moduleNamespace",
Expand All @@ -285,14 +309,31 @@ var _ = Describe("reconcileLabels", func() {
}
pod.SetLabels(map[string]string{constants.ModuleNameLabel: "moduleName"})

res := helper.reconcileLabels(map[string]*modulesVersionLabels{"key": moduleLabels}, []v1.Pod{pod})
res := helper.reconcileLabels(map[string]*modulesVersionLabels{"key": moduleLabels}, []v1.Pod{pod}, nil)

Expect(res.requeue).To(BeFalse())
Expect(res.labelsToDelete).To(Equal([]string{utils.GetDevicePluginVersionLabelName("moduleNamespace", "moduleName")}))
Expect(len(res.labelsToAdd)).To(Equal(0))
})

It("delete worker pod label,device plugin pod present", func() {
moduleLabels := &modulesVersionLabels{
name: "moduleName",
namespace: "moduleNamespace",
moduleVersionLabel: "",
workerPodVersionLabel: "1",
devicePluginVersionLabel: "",
}
pod.SetLabels(map[string]string{constants.ModuleNameLabel: "moduleName"})

res := helper.reconcileLabels(map[string]*modulesVersionLabels{"key": moduleLabels}, []v1.Pod{pod}, nil)

Expect(res.requeue).To(BeTrue())
Expect(len(res.labelsToDelete)).To(Equal(0))
Expect(len(res.labelsToAdd)).To(Equal(0))
Expect(len(res.labelsToDelete)).To(Equal(0))
})

It("add label scenario", func() {
It("add worker pod label, kernel module is not loaded", func() {
moduleLabels := &modulesVersionLabels{
name: "moduleName",
namespace: "moduleNamespace",
Expand All @@ -301,13 +342,65 @@ var _ = Describe("reconcileLabels", func() {
devicePluginVersionLabel: "",
}

res := helper.reconcileLabels(map[string]*modulesVersionLabels{"key": moduleLabels}, []v1.Pod{pod})
res := helper.reconcileLabels(map[string]*modulesVersionLabels{"key": moduleLabels}, []v1.Pod{pod}, nil)

Expect(res.requeue).To(BeFalse())
Expect(res.labelsToAdd).To(Equal(map[string]string{utils.GetWorkerPodVersionLabelName("moduleNamespace", "moduleName"): "1"}))
Expect(len(res.labelsToDelete)).To(Equal(0))
})

It("add worker pod label, kernel module is loaded", func() {
moduleLabels := &modulesVersionLabels{
name: "moduleName",
namespace: "moduleNamespace",
moduleVersionLabel: "1",
workerPodVersionLabel: "",
devicePluginVersionLabel: "",
}

loadedKernelModules := []types.NamespacedName{{Name: "moduleName", Namespace: "moduleNamespace"}}

res := helper.reconcileLabels(map[string]*modulesVersionLabels{"key": moduleLabels}, []v1.Pod{pod}, loadedKernelModules)

Expect(res.requeue).To(BeTrue())
Expect(len(res.labelsToDelete)).To(Equal(0))
Expect(len(res.labelsToAdd)).To(Equal(0))
})

It("add device-plugin label, kernel module is not loaded", func() {
moduleLabels := &modulesVersionLabels{
name: "moduleName",
namespace: "moduleNamespace",
moduleVersionLabel: "1",
workerPodVersionLabel: "1",
devicePluginVersionLabel: "",
}

res := helper.reconcileLabels(map[string]*modulesVersionLabels{"key": moduleLabels}, []v1.Pod{pod}, nil)

Expect(res.requeue).To(BeFalse())
Expect(res.labelsToAdd).To(Equal(map[string]string{utils.GetDevicePluginVersionLabelName("moduleNamespace", "moduleName"): "1"}))
Expect(len(res.labelsToDelete)).To(Equal(0))
})

It("add device-plugin label, kernel module is loaded", func() {
moduleLabels := &modulesVersionLabels{
name: "moduleName",
namespace: "moduleNamespace",
moduleVersionLabel: "1",
workerPodVersionLabel: "1",
devicePluginVersionLabel: "",
}

loadedKernelModules := []types.NamespacedName{{Name: "moduleName", Namespace: "moduleNamespace"}}

res := helper.reconcileLabels(map[string]*modulesVersionLabels{"key": moduleLabels}, []v1.Pod{pod}, loadedKernelModules)

Expect(res.requeue).To(BeFalse())
Expect(res.labelsToAdd).To(Equal(map[string]string{utils.GetDevicePluginVersionLabelName("moduleNamespace", "moduleName"): "1"}))
Expect(len(res.labelsToDelete)).To(Equal(0))
})

It("no label needs to be added due to none action", func() {
moduleLabels := &modulesVersionLabels{
name: "moduleName",
Expand All @@ -320,7 +413,7 @@ var _ = Describe("reconcileLabels", func() {
pod.Namespace = "moduleNamespace"
pod.SetLabels(map[string]string{constants.ModuleNameLabel: "moduleName"})

res := helper.reconcileLabels(map[string]*modulesVersionLabels{"key": moduleLabels}, []v1.Pod{pod})
res := helper.reconcileLabels(map[string]*modulesVersionLabels{"key": moduleLabels}, []v1.Pod{pod}, nil)

Expect(res.requeue).To(BeFalse())
Expect(len(res.labelsToAdd)).To(Equal(0))
Expand Down

0 comments on commit 06f403b

Please sign in to comment.