Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 6 additions & 23 deletions felix/nftables/maps.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2024 Tigera, Inc. All rights reserved.
// Copyright (c) 2024-2026 Tigera, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -46,7 +46,7 @@ type MapsDataplane interface {

MapUpdates() *MapUpdates
FinishMapUpdates(updates *MapUpdates)
LoadDataplaneState() error
LoadDataplaneState(ctx context.Context, mapNames []string) error
}

var _ MapsDataplane = &Maps{}
Expand Down Expand Up @@ -278,10 +278,9 @@ func (s *Maps) filterAndCanonicaliseMembers(mtype MapType, members map[string][]
return filtered
}

// tryResync attempts to bring our state into sync with the dataplane. It scans the contents of the
// maps in the dataplane and queues up updates to any maps that are out-of-sync.
func (s *Maps) LoadDataplaneState() error {
// Log the time spent as we exit the function.
// LoadDataplaneState resyncs map state using the provided list of map names
// (typically obtained from a ListAll call by the caller).
func (s *Maps) LoadDataplaneState(ctx context.Context, maps []string) error {
resyncStart := time.Now()
defer func() {
s.logCxt.WithFields(logrus.Fields{
Expand All @@ -292,24 +291,8 @@ func (s *Maps) LoadDataplaneState() error {
}).Debug("Finished Maps resync")
}()

// Clear the dataplane metadata view, we'll build it back up again as we scan.
s.mapNameToProgrammedMetadata.Dataplane().DeleteAll()

// Load from the dataplane. Update our Dataplane() maps with the actual contents
// of the data plane.
//
// For any map that doesn't match the desired data plane state, we'll queue up an update.
ctx, cancel := context.WithTimeout(context.Background(), defaultTimeout)
defer cancel()
maps, err := s.nft.List(ctx, "map")
if err != nil {
if knftables.IsNotFound(err) {
// Table doesn't exist - nothing to resync.
return nil
}
return fmt.Errorf("error listing nftables maps: %s", err)
}

// We'll process each map in parallel, so we need a struct to hold the results.
// Once knftables is augmented to support reading many maps at once, we can remove this.
type mapData struct {
Expand Down Expand Up @@ -377,7 +360,7 @@ func (s *Maps) LoadDataplaneState() error {

memberTracker := s.getOrCreateMemberTracker(mapName)
numExtrasExpected := memberTracker.PendingDeletions().Len()
err = memberTracker.Dataplane().ReplaceFromIter(func(f func(k MapMember)) error {
err := memberTracker.Dataplane().ReplaceFromIter(func(f func(k MapMember)) error {
for item := range elemsSet.All() {
f(item)
}
Expand Down
23 changes: 16 additions & 7 deletions felix/nftables/maps_test.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2024 Tigera, Inc. All rights reserved.
// Copyright (c) 2024-2026 Tigera, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -176,7 +176,7 @@ var _ = Describe("Maps with empty data plane", func() {
Expect(f.Run(context.TODO(), tx)).NotTo(HaveOccurred())

// Resync with dataplane.
Expect(s.LoadDataplaneState()).NotTo(HaveOccurred())
Expect(loadMapsDataplaneState(f, s)).NotTo(HaveOccurred())

// Should be no work to do.
Expect(s.MapUpdates()).To(Equal(&nftables.MapUpdates{
Expand All @@ -195,7 +195,7 @@ var _ = Describe("Maps with empty data plane", func() {
Expect(f.Run(context.TODO(), tx)).NotTo(HaveOccurred())

// Resync with dataplane. We should now detect the new element and queue it for removal.
Expect(s.LoadDataplaneState()).NotTo(HaveOccurred())
Expect(loadMapsDataplaneState(f, s)).NotTo(HaveOccurred())
upd := s.MapUpdates()
Expect(upd.MapToAddedMembers).To(HaveLen(0))
Expect(upd.MapToDeletedMembers).To(HaveLen(1))
Expand Down Expand Up @@ -226,7 +226,7 @@ var _ = Describe("Maps with empty data plane", func() {
Expect(f.Run(context.TODO(), tx)).NotTo(HaveOccurred())

// A resync should fix both the map and the elements.
Expect(s.LoadDataplaneState()).NotTo(HaveOccurred())
Expect(loadMapsDataplaneState(f, s)).NotTo(HaveOccurred())
upd = s.MapUpdates()
Expect(upd.MapToAddedMembers).To(HaveLen(1))
Expect(upd.MapToDeletedMembers).To(HaveLen(0))
Expand All @@ -251,7 +251,7 @@ var _ = Describe("Maps with empty data plane", func() {
Expect(f.Run(context.Background(), tx)).NotTo(HaveOccurred())

// Trigger a resync.
Expect(s.LoadDataplaneState()).NotTo(HaveOccurred())
Expect(loadMapsDataplaneState(f, s)).NotTo(HaveOccurred())

// Expect queued deletions for all the maps.
upd := s.MapUpdates()
Expand Down Expand Up @@ -281,7 +281,7 @@ var _ = Describe("Maps with empty data plane", func() {
Expect(f.Run(context.Background(), tx)).NotTo(HaveOccurred())

// Trigger a resync. We should delete the unexpected map.
Expect(s.LoadDataplaneState()).NotTo(HaveOccurred())
Expect(loadMapsDataplaneState(f, s)).NotTo(HaveOccurred())

// Expect the set to be deleted.
upd := s.MapUpdates()
Expand Down Expand Up @@ -315,7 +315,7 @@ var _ = Describe("Maps with empty data plane", func() {
s.AddOrReplaceMap(meta, nil)

// Load the dataplane state. We should delete the unexpected map.
Expect(s.LoadDataplaneState()).NotTo(HaveOccurred())
Expect(loadMapsDataplaneState(f, s)).NotTo(HaveOccurred())

// Expect members to be correct. We should remove the unexpected members despite not knowing the type.
// NOTE: We currently have no way to know or change the type of the map via knftables.
Expand All @@ -330,6 +330,15 @@ var _ = Describe("Maps with empty data plane", func() {
})
})

func loadMapsDataplaneState(f *fakeNFT, s *nftables.Maps) error {
ctx := context.TODO()
mapNames, err := f.List(ctx, "map")
if err != nil {
return fmt.Errorf("listing maps: %w", err)
}
return s.LoadDataplaneState(ctx, mapNames)
}

func addMapToTx(tx *knftables.Transaction, m nftables.MapMetadata, elements map[string][]string) {
tx.Add(&knftables.Map{
Name: m.Name,
Expand Down
72 changes: 48 additions & 24 deletions felix/nftables/table.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2016-2024 Tigera, Inc. All rights reserved.
// Copyright (c) 2016-2026 Tigera, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -646,8 +646,25 @@ func (t *NftablesTable) decrefChain(chainName string) {
}

func (t *NftablesTable) loadDataplaneState() {
// Sync maps.
if err := t.LoadDataplaneState(); err != nil {
// Fetch all object names from the dataplane in a single nft invocation.
// This replaces separate List("map") and List("chain") calls, halving
// the number of nft subprocesses during resync.
ctx, cancel := context.WithTimeout(context.Background(), t.contextTimeout)
defer cancel()
Comment on lines +652 to +653
Copy link

Copilot AI Apr 15, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The same timeout-scoped ctx is used for both ListAll() and the subsequent map resync (which may issue many ListElements calls). This couples their time budgets and can cause premature context deadline exceeded during large resyncs. Consider using separate contexts/timeouts per operation (or let Maps.LoadDataplaneState derive its own per-call timeouts).

Copilot uses AI. Check for mistakes.
allObjects, err := t.nft.ListAll(ctx)
if err != nil {
if knftables.IsNotFound(err) {
t.logCxt.Debug("Table not found in dataplane, nothing to load.")
} else {
t.logCxt.WithError(err).Warn("Failed to list all nftables objects")
}
// Fall through — maps and chains will get empty slices, which is
// correct when the table doesn't exist yet.
allObjects = map[string][]string{}
Comment on lines +658 to +663
Copy link

Copilot AI Apr 15, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On ListAll() failure (non-NotFound), this falls through to an empty object map. That can make Felix treat an existing table as empty, clearing the in-memory dataplane view and potentially causing noisy/failed re-programming attempts. Consider returning early on unexpected ListAll errors, or falling back to separate List("map")/List("chain") calls instead of assuming empty state.

Suggested change
} else {
t.logCxt.WithError(err).Warn("Failed to list all nftables objects")
}
// Fall through — maps and chains will get empty slices, which is
// correct when the table doesn't exist yet.
allObjects = map[string][]string{}
// Fall through — maps and chains will get empty slices, which is
// correct when the table doesn't exist yet.
allObjects = map[string][]string{}
} else {
t.logCxt.WithError(err).Warn("Failed to list all nftables objects")
return
}

Copilot uses AI. Check for mistakes.
}

// Sync maps using the pre-fetched map names.
if err := t.LoadDataplaneState(ctx, allObjects["map"]); err != nil {
t.logCxt.WithError(err).Warn("Failed to load maps state")
}

Expand All @@ -660,7 +677,7 @@ func (t *NftablesTable) loadDataplaneState() {

t.lastReadTime = t.timeNow()

dataplaneHashes, dataplaneRules := t.getHashesAndRulesFromDataplane()
dataplaneHashes, dataplaneRules := t.getHashesAndRulesFromDataplane(allObjects["chain"])

// Check that the rules we think we've programmed are still there and mark any inconsistent
// chains for refresh.
Expand Down Expand Up @@ -766,15 +783,15 @@ func (t *NftablesTable) expectedHashesForInsertAppendChain(chainName string) (al
// represented by an empty string. The 'rules' map contains an entry for each non-Calico chain in the table that
// contains inserts. It is used to generate deletes using the full rule, rather than deletes by line number, to avoid
// race conditions on chains we don't fully control.
func (t *NftablesTable) getHashesAndRulesFromDataplane() (hashes map[string][]string, rules map[string][]*knftables.Rule) {
func (t *NftablesTable) getHashesAndRulesFromDataplane(chainNames []string) (hashes map[string][]string, rules map[string][]*knftables.Rule) {
retries := 3
retryDelay := 100 * time.Millisecond

// Retry a few times before we panic. This deals with any transient errors and it prevents
// us from spamming a panic into the log when we're being gracefully shut down by a SIGTERM.
for {
t.onStillAlive()
hashes, rules, err := t.attemptToGetHashesAndRulesFromDataplane()
hashes, rules, err := t.attemptToGetHashesAndRulesFromDataplane(chainNames)
if err != nil {
countNumListErrors.Inc()
var stderr string
Expand All @@ -796,8 +813,10 @@ func (t *NftablesTable) getHashesAndRulesFromDataplane() (hashes map[string][]st
}
}

// attemptToGetHashesAndRulesFromDataplane reads nftables state and loads it into memory.
func (t *NftablesTable) attemptToGetHashesAndRulesFromDataplane() (hashes map[string][]string, rules map[string][]*knftables.Rule, err error) {
// attemptToGetHashesAndRulesFromDataplane reads nftables state and loads it
// into memory. If chainNames is non-nil, it is used directly (from a prior
// ListAll call). If nil, chain names are fetched via List.
func (t *NftablesTable) attemptToGetHashesAndRulesFromDataplane(chainNames []string) (hashes map[string][]string, rules map[string][]*knftables.Rule, err error) {
startTime := t.timeNow()
defer func() {
saveDuration := t.timeNow().Sub(startTime)
Expand All @@ -808,26 +827,28 @@ func (t *NftablesTable) attemptToGetHashesAndRulesFromDataplane() (hashes map[st
}
}()

t.logCxt.Debug("Attmempting to get hashes and rules from nftables")
t.logCxt.Debug("Attempting to get hashes and rules from nftables")

hashes = make(map[string][]string)
rules = make(map[string][]*knftables.Rule)

ctx, cancel := context.WithTimeout(context.Background(), t.contextTimeout)
defer cancel()

// Add chains. We need to query this separately, as chains may exist without rules.
countNumListCalls.Inc()
allChains, err := t.nft.List(ctx, "chain")
if err != nil {
if knftables.IsNotFound(err) {
err = nil
return
if chainNames == nil {
countNumListCalls.Inc()
chainNames, err = t.nft.List(ctx, "chain")
if err != nil {
if knftables.IsNotFound(err) {
err = nil
return
}
countNumListErrors.Inc()
return nil, nil, err
}
countNumListErrors.Inc()
return nil, nil, err
}
for _, chain := range allChains {

for _, chain := range chainNames {
hashes[chain] = []string{}
rules[chain] = []*knftables.Rule{}
}
Expand Down Expand Up @@ -1182,13 +1203,16 @@ func (t *NftablesTable) applyUpdates() error {
}

if err := t.runTransaction(tx); err != nil {
// Let's just print out the entire ruleset for debugging purposes.
cmd := t.newCmd("nft", "list", "ruleset")
// Dump our table's state for debugging. We scope this to our
// own table rather than using "nft list ruleset" to avoid
// parsing objects from other tables that may contain udata
// written by a newer nft, which can crash older nft binaries.
cmd := t.newCmd("nft", "list", "table", t.name)
Comment on lines +1206 to +1210
Copy link

Copilot AI Apr 15, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nft list table is being invoked with only the table name. Since Felix creates both IPv4 and IPv6 tables with the same name (e.g. "calico"), this may dump the wrong table or fail depending on nft's argument parsing. Consider including the nftables family (ip/ip6/arp) in the command invocation (store the family on NftablesTable so it can be used here).

Copilot uses AI. Check for mistakes.
output, err2 := cmd.Output()
if err2 != nil {
t.logCxt.WithError(err2).Error("Failed to load nftables ruleset")
t.logCxt.WithError(err2).Error("Failed to load nftables table state")
} else {
t.logCxt.WithField("ruleset", string(output)).Error("Current ruleset after error")
t.logCxt.WithField("tableState", string(output)).Error("Current table state after error")
}

t.logCxt.WithError(err).WithField("tx", tx.String()).Error("Failed to run nft transaction")
Expand Down Expand Up @@ -1256,7 +1280,7 @@ func (t *NftablesTable) CheckRulesPresent(chain string, rules []generictables.Ru
features := t.featureDetector.GetFeatures()
hashes := CalculateRuleHashes(chain, rules, features)

dpHashes, _ := t.getHashesAndRulesFromDataplane()
dpHashes, _ := t.getHashesAndRulesFromDataplane(nil)
dpHashesSet := set.New[string]()
for _, h := range dpHashes[chain] {
dpHashesSet.Add(h)
Expand Down
7 changes: 4 additions & 3 deletions felix/nftables/table_layer.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2024 Tigera, Inc. All rights reserved.
// Copyright (c) 2024-2026 Tigera, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand All @@ -15,6 +15,7 @@
package nftables

import (
"context"
"fmt"
"strings"
"time"
Expand Down Expand Up @@ -183,6 +184,6 @@ func (t *tableLayer) FinishMapUpdates(updates *MapUpdates) {
t.maps.FinishMapUpdates(updates)
}

func (t *tableLayer) LoadDataplaneState() error {
return t.maps.LoadDataplaneState()
func (t *tableLayer) LoadDataplaneState(ctx context.Context, mapNames []string) error {
return t.maps.LoadDataplaneState(ctx, mapNames)
}
Loading