Skip to content

Commit

Permalink
Fix(ILB): Prevent deletion of nodes with empty zone labels during lab…
Browse files Browse the repository at this point in the history
…el patching

Previously, new nodes were created with all topology labels present, including the zone label. However, recently, labels have been patched/updated on the node resource *after* the resource is created. This can result in nodes having an empty zone field when initially added to an instance group.

The Internal Load Balancer controller was incorrectly removing these nodes with empty zone labels from their assigned instance groups. This occurred because the controller interpreted nodes with empty zones as not belonging to any zone.

This commit fixes the issue by:

- Storing nodes with empty zones during node processing.
- Preventing the controller from removing these nodes during zone processing.

This ensures that new nodes with initially empty zone labels, due to the asynchronous label patching, are not prematurely deleted from their instance groups.

The fix has been verified by adding a node with an empty zone label to an instance group and confirming that it is not deleted after a controller sync.
  • Loading branch information
08volt committed Oct 29, 2024
1 parent 8929def commit 67dd896
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 4 deletions.
27 changes: 23 additions & 4 deletions providers/gce/gce_loadbalancer_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -603,8 +603,8 @@ func (g *Cloud) ensureInternalHealthCheck(name string, svcName types.NamespacedN
return hc, nil
}

func (g *Cloud) ensureInternalInstanceGroup(name, zone string, nodes []*v1.Node) (string, error) {
klog.V(2).Infof("ensureInternalInstanceGroup(%v, %v): checking group that it contains %v nodes [node names limited, total number of nodes: %d]", name, zone, loggableNodeNames(nodes), len(nodes))
func (g *Cloud) ensureInternalInstanceGroup(name, zone string, nodes []*v1.Node, emptyZoneNodes []*v1.Node) (string, error) {
klog.V(2).Infof("ensureInternalInstanceGroup(%v, %v): checking group that it contains %v nodes [node names limited, total number of nodes: %d], the following nodes have empty string in the zone field and won't be deleted: %v", name, zone, loggableNodeNames(nodes), len(nodes), loggableNodeNames(emptyZoneNodes))
ig, err := g.GetInstanceGroup(name, zone)
if err != nil && !isNotFound(err) {
return "", err
Expand All @@ -615,6 +615,11 @@ func (g *Cloud) ensureInternalInstanceGroup(name, zone string, nodes []*v1.Node)
kubeNodes.Insert(n.Name)
}

emptyZoneNodesNames := sets.NewString()
for _, n := range emptyZoneNodes {
emptyZoneNodesNames.Insert(n.Name)
}

// Individual InstanceGroup has a limit for 1000 instances in it.
// As a result, it's not possible to add more to it.
// Given that the long-term fix (AlphaFeatureILBSubsets) is already in-progress,
Expand Down Expand Up @@ -650,7 +655,7 @@ func (g *Cloud) ensureInternalInstanceGroup(name, zone string, nodes []*v1.Node)
}
}

removeNodes := gceNodes.Difference(kubeNodes).List()
removeNodes := gceNodes.Difference(kubeNodes).Difference(emptyZoneNodesNames).List()
addNodes := kubeNodes.Difference(gceNodes).List()

if len(removeNodes) != 0 {
Expand Down Expand Up @@ -689,8 +694,22 @@ func (g *Cloud) ensureInternalInstanceGroups(name string, nodes []*v1.Node) ([]s

zonedNodes := splitNodesByZone(nodes)
klog.V(2).Infof("ensureInternalInstanceGroups(%v): %d nodes over %d zones in region %v", name, len(nodes), len(zonedNodes), g.region)

emptyZoneNodesNames := sets.NewString()
for _, n := range zonedNodes[""] {
emptyZoneNodesNames.Insert(n.Name)
}

if len(emptyZoneNodesNames) > 0 {
klog.V(2).Infof("%d nodes have empty zone: %v in region %v", len(emptyZoneNodesNames), emptyZoneNodesNames, g.region)
}

var igLinks []string
for zone, nodes := range zonedNodes {
if zone == "" {
continue // skip ensuring nodes with empty zone
}

if g.AlphaFeatureGate.Enabled(AlphaFeatureSkipIGsManagement) {
igs, err := g.FilterInstanceGroupsByNamePrefix(name, zone)
if err != nil {
Expand All @@ -700,7 +719,7 @@ func (g *Cloud) ensureInternalInstanceGroups(name string, nodes []*v1.Node) ([]s
igLinks = append(igLinks, ig.SelfLink)
}
} else {
igLink, err := g.ensureInternalInstanceGroup(name, zone, nodes)
igLink, err := g.ensureInternalInstanceGroup(name, zone, nodes, zonedNodes[""])
if err != nil {
return nil, err
}
Expand Down
52 changes: 52 additions & 0 deletions providers/gce/gce_loadbalancer_internal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -705,6 +705,58 @@ func TestUpdateInternalLoadBalancerNodes(t *testing.T) {
)
}

func TestUpdateInternalLoadBalancerNodesWithEmptyZone(t *testing.T) {
t.Parallel()

vals := DefaultTestClusterValues()
gce, err := fakeGCECloud(vals)
require.NoError(t, err)
nodeName := "test-node-1"
node1Name := []string{nodeName}

svc := fakeLoadbalancerService(string(LBTypeInternal))
svc, err = gce.client.CoreV1().Services(svc.Namespace).Create(context.TODO(), svc, metav1.CreateOptions{})
require.NoError(t, err)
nodes, err := createAndInsertNodes(gce, node1Name, vals.ZoneName)
require.NoError(t, err)

_, err = gce.ensureInternalLoadBalancer(vals.ClusterName, vals.ClusterID, svc, nil, nodes)
assert.NoError(t, err)

// Ensure Node has been added to instance group
igName := makeInstanceGroupName(vals.ClusterID)
instances, err := gce.ListInstancesInInstanceGroup(igName, vals.ZoneName, "ALL")
require.NoError(t, err)
assert.Equal(t, 1, len(instances))
assert.Contains(
t,
instances[0].Instance,
fmt.Sprintf("%s/zones/%s/instances/%s", vals.ProjectID, vals.ZoneName, nodeName),
)

// Remove Zone from node
nodes[0].Labels[v1.LabelTopologyZone] = "" // empty zone

lbName := gce.GetLoadBalancerName(context.TODO(), "", svc)
existingFwdRule := &compute.ForwardingRule{
Name: lbName,
IPAddress: "",
Ports: []string{"123"},
IPProtocol: "TCP",
LoadBalancingScheme: string(cloud.SchemeInternal),
Description: fmt.Sprintf(`{"kubernetes.io/service-name":"%s"}`, types.NamespacedName{Name: svc.Name, Namespace: svc.Namespace}.String()),
}

_, err = gce.ensureInternalLoadBalancer(vals.ClusterName, vals.ClusterID, svc, existingFwdRule, nodes)
assert.NoError(t, err)

// Expect load balancer to not have deleted node test-node-1
node := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}}
exist, err := gce.InstanceExists(context.TODO(), node)
require.NoError(t, err)
assert.Equal(t, exist, true)
}

func TestEnsureInternalLoadBalancerDeleted(t *testing.T) {
t.Parallel()

Expand Down

0 comments on commit 67dd896

Please sign in to comment.