// Source: tailscale/util/linuxfw/nftables_runner.go (2056 lines, 62 KiB, Go)

// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
//go:build linux
package linuxfw
import (
"encoding/binary"
"encoding/hex"
"errors"
"fmt"
"net"
"net/netip"
"reflect"
"strings"
"github.com/google/nftables"
"github.com/google/nftables/expr"
"golang.org/x/sys/unix"
"tailscale.com/net/tsaddr"
"tailscale.com/types/logger"
"tailscale.com/types/ptr"
)
// Names of the Tailscale-managed chains. AddChains creates ts-forward and
// ts-input in the filter table and ts-postrouting in the nat table.
const (
chainNameForward = "ts-forward"
chainNameInput = "ts-input"
chainNamePostrouting = "ts-postrouting"
)
// chainTypeRegular is the chain type of an nftables chain that does not apply
// to a hook (a "regular" chain, as opposed to a base chain).
const chainTypeRegular = ""
// chainInfo describes a chain that getOrCreateChain should look up or create.
// Several call sites build it with a positional literal, so the field order
// is part of its contract.
type chainInfo struct {
table *nftables.Table // table that holds (or will hold) the chain
name string // chain name, looked up within table
chainType nftables.ChainType // chainTypeRegular for chains not attached to a hook
chainHook *nftables.ChainHook // nil for regular chains
chainPriority *nftables.ChainPriority // nil for regular chains
chainPolicy *nftables.ChainPolicy // nil for regular chains
}
// nftable contains nat and filter tables for the given IP family (Proto).
// Filter and Nat are populated by AddChains; until then they are nil.
type nftable struct {
Proto nftables.TableFamily // IPv4 or IPv6
Filter *nftables.Table
Nat *nftables.Table
}
// nftablesRunner implements a netfilterRunner using the netlink based nftables
// library. As nftables allows for arbitrary tables and chains, there is a need
// to follow conventions in order to integrate well with a surrounding
// ecosystem. The rules installed by nftablesRunner have the following
// properties:
//   - Install rules that intend to take precedence over rules installed by
//     other software. Tailscale provides packet filtering for tailnet traffic
//     inside the daemon based on the tailnet ACL rules.
//   - As nftables "accept" is not final, rules from high priority tables (low
//     numbers) will fall through to lower priority tables (high numbers). In
//     order to effectively be 'final', we install "jump" rules into conventional
//     tables and chains that will reach an accept verdict inside those tables.
//   - The table and chain conventions followed here are those used by
//     `iptables-nft` and `ufw`, so that those tools co-exist and do not
//     negatively affect Tailscale function.
//   - Be mindful that 1) all chains attached to a given hook (e.g. the forward
//     hook) will be processed in priority order until either a rule in one of
//     the chains issues a drop verdict or there are no more chains for that
//     hook; 2) processing of individual rules within a chain will stop once
//     one of them issues a final verdict (accept, drop).
//     https://wiki.nftables.org/wiki-nftables/index.php/Configuring_chains
type nftablesRunner struct {
conn *nftables.Conn
nft4 *nftable // IPv4 tables, never nil
nft6 *nftable // IPv6 tables or nil if the system does not support IPv6
v6Available bool // whether the host supports IPv6
}
// ensurePreroutingChain makes sure the "nat" table and its "PREROUTING" base
// chain exist for the IP family of dst, creating whichever is missing, and
// returns both. It fails if the family of dst is not available on this host.
func (n *nftablesRunner) ensurePreroutingChain(dst netip.Addr) (*nftables.Table, *nftables.Chain, error) {
	nft, err := n.getNFTByAddr(dst)
	if err != nil {
		return nil, nil, fmt.Errorf("error setting up nftables for IP family of %v: %w", dst, err)
	}

	natTable, err := createTableIfNotExist(n.conn, nft.Proto, "nat")
	if err != nil {
		return nil, nil, fmt.Errorf("error ensuring nat table: %w", err)
	}

	// Describe the conventional NAT prerouting base chain with an accept
	// policy; it is only created if it is not already there.
	acceptPolicy := nftables.ChainPolicyAccept
	spec := chainInfo{
		table:         natTable,
		name:          "PREROUTING",
		chainType:     nftables.ChainTypeNAT,
		chainHook:     nftables.ChainHookPrerouting,
		chainPriority: nftables.ChainPriorityNATDest,
		chainPolicy:   &acceptPolicy,
	}
	chain, err := getOrCreateChain(n.conn, spec)
	if err != nil {
		return nil, nil, fmt.Errorf("error ensuring prerouting chain: %w", err)
	}
	return natTable, chain, nil
}
// AddDNATRule inserts a rule into nat/PREROUTING (of the IP family of dst)
// that rewrites the destination of packets addressed to origDst to dst. The
// rule is inserted rather than appended and is flushed to the kernel before
// returning.
func (n *nftablesRunner) AddDNATRule(origDst netip.Addr, dst netip.Addr) error {
	natTable, prerouting, err := n.ensurePreroutingChain(dst)
	if err != nil {
		return err
	}

	// Location and size of the destination address within the network
	// header, and the NAT family, all chosen from origDst. IPv6 values are
	// the default; IPv4 overrides them.
	var (
		family  uint32 = unix.NFPROTO_IPV6
		offset  uint32 = 24
		addrLen uint32 = 16
	)
	if origDst.Is4() {
		family, offset, addrLen = unix.NFPROTO_IPV4, 16, 4
	}

	exprs := []expr.Any{
		// Load the packet's destination address into register 1...
		&expr.Payload{
			DestRegister: 1,
			Base:         expr.PayloadBaseNetworkHeader,
			Offset:       offset,
			Len:          addrLen,
		},
		// ...and continue only if it equals origDst.
		&expr.Cmp{
			Op:       expr.CmpOpEq,
			Register: 1,
			Data:     origDst.AsSlice(),
		},
		// Put the new destination into register 1 and DNAT to it.
		&expr.Immediate{
			Register: 1,
			Data:     dst.AsSlice(),
		},
		&expr.NAT{
			Type:       expr.NATTypeDestNAT,
			Family:     family,
			RegAddrMin: 1,
		},
	}

	n.conn.InsertRule(&nftables.Rule{
		Table: natTable,
		Chain: prerouting,
		Exprs: exprs,
	})
	return n.conn.Flush()
}
// DNATWithLoadBalancer currently just forwards all traffic destined for origDst
// to the first IP address from the backend targets; the remaining entries of
// dsts are ignored. It returns an error if dsts is empty.
// TODO (irbekrm): instead of doing this load balance traffic evenly to all
// backend destinations.
// https://github.com/tailscale/tailscale/commit/d37f2f508509c6c35ad724fd75a27685b90b575b#diff-a3bcbcd1ca198799f4f768dc56fea913e1945a6b3ec9dbec89325a84a19a85e7R148-R232
func (n *nftablesRunner) DNATWithLoadBalancer(origDst netip.Addr, dsts []netip.Addr) error {
	// Guard against indexing an empty slice, which would panic.
	if len(dsts) == 0 {
		return errors.New("no destination addresses provided for DNAT")
	}
	return n.AddDNATRule(origDst, dsts[0])
}
// DNATNonTailscaleTraffic inserts a rule into nat/PREROUTING (of the IP family
// of dst) that DNATs to dst every packet that did NOT arrive on the interface
// named tunname. The rule is flushed to the kernel before returning.
func (n *nftablesRunner) DNATNonTailscaleTraffic(tunname string, dst netip.Addr) error {
	nat, preroutingCh, err := n.ensurePreroutingChain(dst)
	if err != nil {
		return err
	}
	var famConst uint32
	if dst.Is4() {
		famConst = unix.NFPROTO_IPV4
	} else {
		famConst = unix.NFPROTO_IPV6
	}
	dnatRule := &nftables.Rule{
		Table: nat,
		Chain: preroutingCh,
		Exprs: []expr.Any{
			// Match on the INPUT interface: the rule must exempt traffic that
			// comes in over the Tailscale interface, and in a prerouting
			// chain no output interface has been chosen yet.
			&expr.Meta{Key: expr.MetaKeyIIFNAME, Register: 1},
			&expr.Cmp{
				Op:       expr.CmpOpNeq,
				Register: 1,
				Data:     []byte(tunname),
			},
			// Load the new destination and DNAT to it.
			&expr.Immediate{
				Register: 1,
				Data:     dst.AsSlice(),
			},
			&expr.NAT{
				Type:       expr.NATTypeDestNAT,
				Family:     famConst,
				RegAddrMin: 1,
			},
		},
	}
	n.conn.InsertRule(dnatRule)
	return n.conn.Flush()
}
// EnsureSNATForDst makes sure nat/POSTROUTING (of the IP family of dst)
// contains a SNAT rule for dst that uses src. Rules managed here are
// recognised by their UserData, which has the form "dst:<dst>,src:<src>":
//   - if a rule for dst with the same src is found, nothing more is done;
//   - rules for dst with a different src are deleted;
//   - otherwise a new rule built by snatRule is appended and flushed.
func (n *nftablesRunner) EnsureSNATForDst(src, dst netip.Addr) error {
	nft, err := n.getNFTByAddr(dst)
	if err != nil {
		return fmt.Errorf("error setting up nftables for IP family of %v: %w", dst, err)
	}
	natTable, err := createTableIfNotExist(n.conn, nft.Proto, "nat")
	if err != nil {
		return fmt.Errorf("error ensuring nat table exists: %w", err)
	}

	// Make sure the conventional NAT postrouting base chain exists.
	acceptPolicy := nftables.ChainPolicyAccept
	postrouting, err := getOrCreateChain(n.conn, chainInfo{
		table:         natTable,
		name:          "POSTROUTING",
		chainType:     nftables.ChainTypeNAT,
		chainHook:     nftables.ChainHookPostrouting,
		chainPriority: nftables.ChainPriorityNATSource,
		chainPolicy:   &acceptPolicy,
	})
	if err != nil {
		return fmt.Errorf("error ensuring postrouting chain: %w", err)
	}

	existing, err := n.conn.GetRules(natTable, postrouting)
	if err != nil {
		return fmt.Errorf("error listing rules: %w", err)
	}

	dstMarker := fmt.Sprintf("dst:%s,src:", dst.String())
	wantMarker := fmt.Sprintf("%s%s", dstMarker, src.String())
	for _, r := range existing {
		marker := string(r.UserData)
		if !strings.HasPrefix(marker, dstMarker) {
			continue // not a rule for this destination
		}
		if strings.EqualFold(marker, wantMarker) {
			return nil // already exists, do nothing
		}
		// Same destination, different source: stale rule.
		if err := n.conn.DelRule(r); err != nil {
			return fmt.Errorf("error deleting SNAT rule: %w", err)
		}
	}

	n.conn.AddRule(snatRule(natTable, postrouting, src, dst, []byte(wantMarker)))
	return n.conn.Flush()
}
// ClampMSSToPMTU appends a rule that rewrites the TCP MSS option of packets
// being forwarded out via the given interface (tun) to the MSS the kernel
// reports for the packet's route. Only TCP packets whose flags byte has bit
// 0x02 (SYN) set are matched. This can be useful if this tailscale instance is
// expected to run as a forwarding proxy, forwarding packets from an endpoint
// with higher MTU in an environment where path MTU discovery is expected to
// not work (such as the proxies created by the Tailscale Kubernetes operator).
//
// ClampMSSToPMTU creates a new base chain, ts-clamp, in the filter table
// (of the IP family of addr) hooked at forward, with accept policy and
// priority -150. In practice, this means that for SYN packets the clamp rule
// in this chain will likely run first and accept the packet. This is fine
// because 1) nftables runs ALL chains with the same hook type unless a rule in
// one of them drops the packet and 2) this chain does not have functionality
// to drop the packet - so in practice a matching clamp rule will always be
// followed by the custom tailscale filtering rules in the other chains
// attached to the filter hook (FORWARD, ts-forward).
//
// We do not want to place the clamping rule into FORWARD/ts-forward chains
// because wgengine populates those chains with rules that contain accept
// verdicts that would cause no further processing within that chain. This
// functionality is currently invoked from outside wgengine (containerboot), so
// we don't want to race with wgengine for rule ordering within chains.
func (n *nftablesRunner) ClampMSSToPMTU(tun string, addr netip.Addr) error {
	nft, err := n.getNFTByAddr(addr)
	if err != nil {
		return fmt.Errorf("error setting up nftables for IP family of %v: %w", addr, err)
	}
	filter, err := createTableIfNotExist(n.conn, nft.Proto, "filter")
	if err != nil {
		return fmt.Errorf("error ensuring filter table: %w", err)
	}

	// Make sure the ts-clamp base chain exists.
	acceptPolicy := nftables.ChainPolicyAccept
	clampChain, err := getOrCreateChain(n.conn, chainInfo{
		table:         filter,
		name:          "ts-clamp",
		chainType:     nftables.ChainTypeFilter,
		chainHook:     nftables.ChainHookForward,
		chainPriority: nftables.ChainPriorityMangle,
		chainPolicy:   &acceptPolicy,
	})
	if err != nil {
		return fmt.Errorf("error ensuring forward chain: %w", err)
	}

	exprs := []expr.Any{
		// Output interface must be tun.
		&expr.Meta{Key: expr.MetaKeyOIFNAME, Register: 1},
		&expr.Cmp{
			Op:       expr.CmpOpEq,
			Register: 1,
			Data:     []byte(tun),
		},
		// Layer 4 protocol must be TCP.
		&expr.Meta{Key: expr.MetaKeyL4PROTO, Register: 1},
		&expr.Cmp{
			Op:       expr.CmpOpEq,
			Register: 1,
			Data:     []byte{unix.IPPROTO_TCP},
		},
		// Load the TCP flags byte, keep only bit 0x02 (SYN) and require the
		// result to be non-zero.
		&expr.Payload{
			DestRegister: 1,
			Base:         expr.PayloadBaseTransportHeader,
			Offset:       13,
			Len:          1,
		},
		&expr.Bitwise{
			DestRegister:   1,
			SourceRegister: 1,
			Len:            1,
			Mask:           []byte{0x02},
			Xor:            []byte{0x00},
		},
		&expr.Cmp{
			Op:       expr.CmpOpNeq,
			Register: 1,
			Data:     []byte{0x00},
		},
		// Load the route's TCP MSS, convert it to network byte order and
		// write it into the 2-byte value of TCP option 2 (MSS).
		&expr.Rt{
			Register: 1,
			Key:      expr.RtTCPMSS,
		},
		&expr.Byteorder{
			DestRegister:   1,
			SourceRegister: 1,
			Op:             expr.ByteorderHton,
			Len:            2,
			Size:           2,
		},
		&expr.Exthdr{
			SourceRegister: 1,
			Type:           2,
			Offset:         2,
			Len:            2,
			Op:             expr.ExthdrOpTcpopt,
		},
	}

	n.conn.AddRule(&nftables.Rule{
		Table: filter,
		Chain: clampChain,
		Exprs: exprs,
	})
	return n.conn.Flush()
}
// deleteTableIfExists deletes a nftables table via connection c if it exists
// within the given family. A table that does not exist (or that disappears
// while it is being deleted, e.g. because a concurrent process removed it) is
// not an error.
func deleteTableIfExists(c *nftables.Conn, family nftables.TableFamily, name string) error {
	t, err := getTableIfExists(c, family, name)
	if err != nil {
		return fmt.Errorf("get table: %w", err)
	}
	if t == nil {
		// Table does not exist, so nothing to delete.
		return nil
	}
	c.DelTable(t)
	flushErr := c.Flush()
	if flushErr == nil {
		return nil
	}
	// The delete failed. Check whether the table still exists; if it does
	// not, the failure was due to the table already being gone and can be
	// ignored. The flush error is kept in its own variable so that it is the
	// one reported: reusing a single err for the re-check would overwrite it
	// with nil whenever the table is still present.
	remaining, lookupErr := getTableIfExists(c, family, name)
	if lookupErr == nil && remaining == nil {
		return nil
	}
	return fmt.Errorf("del table: %w", flushErr)
}
// getTableIfExists looks up the table called name in the given family among
// all tables listed via c. It returns (nil, nil) when there is no such table
// and a non-nil error only when listing the tables fails.
func getTableIfExists(c *nftables.Conn, family nftables.TableFamily, name string) (*nftables.Table, error) {
	all, err := c.ListTables()
	if err != nil {
		return nil, fmt.Errorf("get tables: %w", err)
	}
	for i := range all {
		candidate := all[i]
		if candidate.Name != name || candidate.Family != family {
			continue
		}
		return candidate, nil
	}
	return nil, nil
}
// createTableIfNotExist returns the table called name in the given family,
// creating it via connection c (and flushing the creation) if it does not
// exist yet.
func createTableIfNotExist(c *nftables.Conn, family nftables.TableFamily, name string) (*nftables.Table, error) {
	existing, err := getTableIfExists(c, family, name)
	if err != nil {
		return nil, fmt.Errorf("get table: %w", err)
	}
	if existing != nil {
		return existing, nil
	}

	created := c.AddTable(&nftables.Table{
		Family: family,
		Name:   name,
	})
	if err := c.Flush(); err != nil {
		return nil, fmt.Errorf("add table: %w", err)
	}
	return created, nil
}
// errorChainNotFound is the error reported when a chain lookup finds no chain
// with the requested name in the requested table.
//
// The value is comparable, so it can be matched with errors.Is against a
// value built from the same table and chain names. Code in this file builds
// it with a positional literal in (table name, chain name) order, so the
// field order below must stay tableName first; otherwise the two names end up
// swapped in the error message.
type errorChainNotFound struct {
	tableName string
	chainName string
}

// Error implements the error interface.
func (e errorChainNotFound) Error() string {
	return fmt.Sprintf("chain %s not found in table %s", e.chainName, e.tableName)
}
// getChainFromTable returns the chain called name from the given table, or an
// errorChainNotFound if there is none. Chains are listed for the table's
// family and then filtered by table name; a chain name is unique within a
// table.
//
// The not-found error is built positionally from (table.Name, name), exactly
// as callers build the value they compare against with errors.Is, so the
// argument order here must not be changed on its own.
func getChainFromTable(c *nftables.Conn, table *nftables.Table, name string) (*nftables.Chain, error) {
	familyChains, err := c.ListChainsOfTableFamily(table.Family)
	if err != nil {
		return nil, fmt.Errorf("list chains: %w", err)
	}
	for i := range familyChains {
		candidate := familyChains[i]
		// The family already matches, so the table name identifies the table.
		if candidate.Table.Name != table.Name {
			continue
		}
		if candidate.Name == name {
			return candidate, nil
		}
	}
	return nil, errorChainNotFound{table.Name, name}
}
// isTSChain reports whether name carries the "ts-" prefix that marks a chain
// as Tailscale-managed.
func isTSChain(name string) bool {
	const tsChainPrefix = "ts-"
	managed := strings.HasPrefix(name, tsChainPrefix)
	return managed
}
// createChainIfNotExist makes sure the chain described by cinfo exists,
// creating it if necessary. It is getOrCreateChain with the resulting chain
// discarded.
func createChainIfNotExist(c *nftables.Conn, cinfo chainInfo) error {
	if _, err := getOrCreateChain(c, cinfo); err != nil {
		return err
	}
	return nil
}
// getOrCreateChain returns the chain described by cinfo, creating it (and
// flushing the creation) if it does not exist in cinfo.table.
//
// If the chain already exists and is a Tailscale chain (see isTSChain), its
// type, hook and priority must agree with cinfo, otherwise an error is
// returned. Hook and priority are pointers that are nil for regular chains,
// so they are compared nil-safely: two nil values agree, a nil and a non-nil
// value disagree, and two non-nil values are compared by value.
func getOrCreateChain(c *nftables.Conn, cinfo chainInfo) (*nftables.Chain, error) {
	chain, err := getChainFromTable(c, cinfo.table, cinfo.name)
	if err != nil && !errors.Is(err, errorChainNotFound{cinfo.table.Name, cinfo.name}) {
		return nil, fmt.Errorf("get chain: %w", err)
	}
	if err == nil {
		// The chain already exists. If it is a TS chain, check the
		// type/hook/priority, but for "conventional chains" assume they're what
		// we expect (in case iptables-nft/ufw make minor behavior changes in
		// the future).
		if isTSChain(chain.Name) {
			hookDiffers := (chain.Hooknum == nil) != (cinfo.chainHook == nil) ||
				(chain.Hooknum != nil && *chain.Hooknum != *cinfo.chainHook)
			priorityDiffers := (chain.Priority == nil) != (cinfo.chainPriority == nil) ||
				(chain.Priority != nil && *chain.Priority != *cinfo.chainPriority)
			if chain.Type != cinfo.chainType || hookDiffers || priorityDiffers {
				return nil, fmt.Errorf("chain %s already exists with different type/hook/priority", cinfo.name)
			}
		}
		return chain, nil
	}

	// The chain was not found: create it.
	chain = c.AddChain(&nftables.Chain{
		Name:     cinfo.name,
		Table:    cinfo.table,
		Type:     cinfo.chainType,
		Hooknum:  cinfo.chainHook,
		Priority: cinfo.chainPriority,
		Policy:   cinfo.chainPolicy,
	})
	if err := c.Flush(); err != nil {
		return nil, fmt.Errorf("add chain: %w", err)
	}
	return chain, nil
}
// NetfilterRunner abstracts helpers to run netfilter commands. It is
// implemented by linuxfw.IPTablesRunner and linuxfw.NfTablesRunner.
type NetfilterRunner interface {
// AddLoopbackRule adds a rule to permit loopback traffic to addr. This rule
// is added only if it does not already exist.
AddLoopbackRule(addr netip.Addr) error
// DelLoopbackRule removes the rule added by AddLoopbackRule.
DelLoopbackRule(addr netip.Addr) error
// AddHooks adds rules to conventional chains like "FORWARD", "INPUT" and
// "POSTROUTING" to jump from those chains to tailscale chains.
AddHooks() error
// DelHooks deletes rules added by AddHooks.
DelHooks(logf logger.Logf) error
// AddChains creates custom Tailscale chains.
AddChains() error
// DelChains removes chains added by AddChains.
DelChains() error
// AddBase adds rules reused by different other rules.
AddBase(tunname string) error
// DelBase removes rules added by AddBase.
DelBase() error
// AddSNATRule adds the netfilter rule to SNAT incoming traffic over
// the Tailscale interface destined for local subnets. An error is
// returned if the rule already exists.
AddSNATRule() error
// DelSNATRule removes the rule added by AddSNATRule.
DelSNATRule() error
// AddStatefulRule adds a netfilter rule for stateful packet filtering
// using conntrack.
AddStatefulRule(tunname string) error
// DelStatefulRule removes a netfilter rule for stateful packet filtering
// using conntrack.
DelStatefulRule(tunname string) error
// HasIPV6 reports true if the system supports IPv6.
HasIPV6() bool
// HasIPV6NAT reports true if the system supports IPv6 NAT.
HasIPV6NAT() bool
// HasIPV6Filter reports true if the system supports IPv6 filter tables
// This is only meaningful for iptables implementation, where hosts have
// partial ipables support (i.e missing filter table). For nftables
// implementation, this will default to the value of HasIPv6().
HasIPV6Filter() bool
// AddDNATRule adds a rule to the nat/PREROUTING chain to DNAT traffic
// destined for the given original destination to the given new destination.
// This is used to forward all traffic destined for the Tailscale interface
// to the provided destination, as used in the Kubernetes ingress proxies.
AddDNATRule(origDst, dst netip.Addr) error
// DNATWithLoadBalancer adds a rule to the nat/PREROUTING chain to DNAT
// traffic destined for the given original destination to the given new
// destination(s) using round robin to load balance if more than one
// destination is provided. This is used to forward all traffic destined
// for the Tailscale interface to the provided destination(s), as used
// in the Kubernetes ingress proxies.
DNATWithLoadBalancer(origDst netip.Addr, dsts []netip.Addr) error
// EnsureSNATForDst sets up firewall to mask the source for traffic destined for dst to src:
// - creates a SNAT rule if it doesn't already exist
// - deletes any pre-existing rules matching the destination
// This is used to forward traffic destined for the local machine over
// the Tailscale interface, as used in the Kubernetes egress proxies.
EnsureSNATForDst(src, dst netip.Addr) error
// DNATNonTailscaleTraffic adds a rule to the nat/PREROUTING chain to DNAT
// all traffic inbound from any interface except exemptInterface to dst.
// This is used to forward traffic destined for the local machine over
// the Tailscale interface, as used in the Kubernetes egress proxies.
DNATNonTailscaleTraffic(exemptInterface string, dst netip.Addr) error
cmd/containerboot,kube,util/linuxfw: configure kube egress proxies to route to 1+ tailnet targets (#13531) * cmd/containerboot,kube,util/linuxfw: configure kube egress proxies to route to 1+ tailnet targets This commit is first part of the work to allow running multiple replicas of the Kubernetes operator egress proxies per tailnet service + to allow exposing multiple tailnet services via each proxy replica. This expands the existing iptables/nftables-based proxy configuration mechanism. A proxy can now be configured to route to one or more tailnet targets via a (mounted) config file that, for each tailnet target, specifies: - the target's tailnet IP or FQDN - mappings of container ports to which cluster workloads will send traffic to tailnet target ports where the traffic should be forwarded. Example configfile contents: { "some-svc": {"tailnetTarget":{"fqdn":"foo.tailnetxyz.ts.net","ports"{"tcp:4006:80":{"protocol":"tcp","matchPort":4006,"targetPort":80},"tcp:4007:443":{"protocol":"tcp","matchPort":4007,"targetPort":443}}}} } A proxy that is configured with this config file will configure firewall rules to route cluster traffic to the tailnet targets. It will then watch the config file for updates as well as monitor relevant netmap updates and reconfigure firewall as needed. This adds a bunch of new iptables/nftables functionality to make it easier to dynamically update the firewall rules without needing to restart the proxy Pod as well as to make it easier to debug/understand the rules: - for iptables, each portmapping is a DNAT rule with a comment pointing at the 'service',i.e: -A PREROUTING ! -i tailscale0 -p tcp -m tcp --dport 4006 -m comment --comment "some-svc:tcp:4006 -> tcp:80" -j DNAT --to-destination 100.64.1.18:80 Additionally there is a SNAT rule for each tailnet target, to mask the source address. - for nftables, a separate prerouting chain is created for each tailnet target and all the portmapping rules are placed in that chain. 
This makes it easier to look up rules and delete services when no longer needed. (nftables allows hooking a custom chain to a prerouting hook, so no extra work is needed to ensure that the rules in the service chains are evaluated). The next steps will be to get the Kubernetes Operator to generate the configfile and ensure it is mounted to the relevant proxy nodes. Updates tailscale/tailscale#13406 Signed-off-by: Irbe Krumina <irbe@tailscale.com>
2 months ago
EnsurePortMapRuleForSvc(svc, tun string, targetIP netip.Addr, pm PortMap) error
DeletePortMapRuleForSvc(svc, tun string, targetIP netip.Addr, pm PortMap) error
DeleteSvc(svc, tun string, targetIPs []netip.Addr, pm []PortMap) error
// ClampMSSToPMTU adds a rule to the mangle/FORWARD chain to clamp MSS for
// traffic destined for the provided tun interface.
ClampMSSToPMTU(tun string, addr netip.Addr) error
// AddMagicsockPortRule adds a rule to the ts-input chain to accept
// incoming traffic on the specified port, to allow magicsock to
// communicate.
AddMagicsockPortRule(port uint16, network string) error
// DelMagicsockPortRule removes the rule created by AddMagicsockPortRule,
// if it exists.
DelMagicsockPortRule(port uint16, network string) error
}
// New creates a NetfilterRunner, auto-detecting whether to use
// nftables or iptables.
// As nftables is still experimental, iptables will be used unless
// either the TS_DEBUG_FIREWALL_MODE environment variable, or the prefHint
// parameter, is set to one of "nftables" or "auto".
//
// In both branches the concrete runner is checked for an error before being
// returned: returning the constructor's results directly would, on failure,
// wrap a nil concrete pointer in a non-nil NetfilterRunner interface value.
func New(logf logger.Logf, prefHint string) (NetfilterRunner, error) {
	switch mode := detectFirewallMode(logf, prefHint); mode {
	case FirewallModeNfTables:
		runner, err := newNfTablesRunner(logf)
		if err != nil {
			return nil, err
		}
		return runner, nil
	case FirewallModeIPTables:
		runner, err := newIPTablesRunner(logf)
		if err != nil {
			return nil, err
		}
		return runner, nil
	default:
		return nil, fmt.Errorf("unknown firewall mode %v", mode)
	}
}
// newNfTablesRunner opens a new nftables connection and wraps it in an
// nftablesRunner. It does not guarantee that any tables or chains exist.
func newNfTablesRunner(logf logger.Logf) (*nftablesRunner, error) {
	c, err := nftables.New()
	if err != nil {
		return nil, fmt.Errorf("nftables connection: %w", err)
	}
	runner := newNfTablesRunnerWithConn(logf, c)
	return runner, nil
}
// newNfTablesRunnerWithConn builds an nftablesRunner on top of conn. The IPv4
// table set is always present; the IPv6 table set is only allocated when
// CheckIPv6 reports no error, otherwise IPv6 is disabled and the reason is
// logged. No tables or chains are created here.
func newNfTablesRunnerWithConn(logf logger.Logf, conn *nftables.Conn) *nftablesRunner {
	runner := &nftablesRunner{
		conn: conn,
		nft4: &nftable{Proto: nftables.TableFamilyIPv4},
	}
	if err := CheckIPv6(logf); err != nil {
		logf("disabling tunneled IPv6 due to system IPv6 config: %v", err)
	} else {
		runner.nft6 = &nftable{Proto: nftables.TableFamilyIPv6}
		runner.v6Available = true
	}
	logf("netfilter running in nftables mode, v6 = %v", runner.v6Available)
	// TODO(KevinLiang10): convert iptables rule to nftable rules if they exist in the iptables
	return runner
}
// newLoadSaddrExpr returns an nftables payload expression that loads the
// packet's source address from the network header into register destReg.
// The offset and length of the address depend on proto; any family other
// than IPv4 or IPv6 yields an error.
func newLoadSaddrExpr(proto nftables.TableFamily, destReg uint32) (expr.Any, error) {
	var offset, length uint32
	switch proto {
	case nftables.TableFamilyIPv4:
		offset, length = 12, 4
	case nftables.TableFamilyIPv6:
		offset, length = 8, 16
	default:
		return nil, fmt.Errorf("table family %v is neither IPv4 nor IPv6", proto)
	}
	return &expr.Payload{
		DestRegister: destReg,
		Base:         expr.PayloadBaseNetworkHeader,
		Offset:       offset,
		Len:          length,
	}, nil
}
// newLoadDportExpr returns an nftables payload expression that loads the
// destination port of a TCP/UDP packet (2 bytes at offset 2 of the transport
// header) into register destReg.
func newLoadDportExpr(destReg uint32) expr.Any {
	load := &expr.Payload{
		Base:         expr.PayloadBaseTransportHeader,
		Offset:       2,
		Len:          2,
		DestRegister: destReg,
	}
	return load
}
// HasIPV6 reports true if the system supports IPv6. The answer is the
// v6Available flag, which newNfTablesRunnerWithConn sets once from the result
// of CheckIPv6 when the runner is constructed.
func (n *nftablesRunner) HasIPV6() bool {
return n.v6Available
}
// HasIPV6NAT reports whether IPv6 NAT can be used, which for nftables is the
// same as whether the system supports IPv6 at all: kernel support for
// nftables was added after support for IPv6 NAT, so no separate IPv6 NAT
// check is needed like the one done for iptables.
// https://tldp.org/HOWTO/Linux+IPv6-HOWTO/ch18s04.html
// https://wiki.nftables.org/wiki-nftables/index.php/Building_and_installing_nftables_from_sources
func (n *nftablesRunner) HasIPV6NAT() bool {
	return n.HasIPV6()
}
// HasIPV6Filter reports whether IPv6 filter tables can be used, which for
// nftables is the same as whether the system supports IPv6: there are no
// known edge cases where nftables running on a host that supports IPv6 would
// not support the filter table.
func (n *nftablesRunner) HasIPV6Filter() bool {
	return n.HasIPV6()
}
// findRule searches the chain that rule belongs to for an installed rule
// whose expressions match those of rule, and returns the first one found or
// (nil, nil) if there is none. Two rules match when they have the same number
// of expressions and every expression is deeply equal to the one at the same
// position; counter expressions of the installed rule are not compared.
func findRule(conn *nftables.Conn, rule *nftables.Rule) (*nftables.Rule, error) {
	installed, err := conn.GetRules(rule.Table, rule.Chain)
	if err != nil {
		return nil, fmt.Errorf("get nftables rules: %w", err)
	}
	for _, candidate := range installed {
		if len(candidate.Exprs) != len(rule.Exprs) {
			continue
		}
		matches := true
		for i, e := range candidate.Exprs {
			// Skip counter expressions, as they will not match.
			if _, isCounter := e.(*expr.Counter); isCounter {
				continue
			}
			if !reflect.DeepEqual(e, rule.Exprs[i]) {
				matches = false
				break
			}
		}
		if matches {
			return candidate, nil
		}
	}
	return nil, nil
}
// createLoopbackRule builds (but does not install) the rule for the given
// table and chain that accepts packets arriving on interface "lo" whose
// source address equals addr. The rule also carries a counter. proto selects
// how the source address is loaded and must be IPv4 or IPv6.
func createLoopbackRule(
	proto nftables.TableFamily,
	table *nftables.Table,
	chain *nftables.Chain,
	addr netip.Addr,
) (*nftables.Rule, error) {
	loadSaddr, err := newLoadSaddrExpr(proto, 1)
	if err != nil {
		return nil, fmt.Errorf("newLoadSaddrExpr: %w", err)
	}

	exprs := []expr.Any{
		// Input interface must be "lo".
		&expr.Meta{
			Key:      expr.MetaKeyIIFNAME,
			Register: 1,
		},
		&expr.Cmp{
			Op:       expr.CmpOpEq,
			Register: 1,
			Data:     []byte("lo"),
		},
		// Source address must be addr.
		loadSaddr,
		&expr.Cmp{
			Op:       expr.CmpOpEq,
			Register: 1,
			Data:     addr.AsSlice(),
		},
		// Count and accept.
		&expr.Counter{},
		&expr.Verdict{
			Kind: expr.VerdictAccept,
		},
	}
	return &nftables.Rule{
		Table: table,
		Chain: chain,
		Exprs: exprs,
	}, nil
}
// insertLoopbackRule puts the Tailscale loopback rule for addr at the top of
// the given chain, unless an equivalent rule is already installed, and
// flushes the change.
func insertLoopbackRule(
	conn *nftables.Conn, proto nftables.TableFamily,
	table *nftables.Table, chain *nftables.Chain, addr netip.Addr) error {
	wanted, err := createLoopbackRule(proto, table, chain, addr)
	if err != nil {
		return fmt.Errorf("create loopback rule: %w", err)
	}

	// If TestDial is set, we are running in test mode and we should not
	// find rule because header will mismatch. Only look for an existing
	// rule on a real connection.
	if conn.TestDial == nil {
		existing, err := findRule(conn, wanted)
		if err != nil {
			return fmt.Errorf("find rule: %w", err)
		}
		if existing != nil {
			// Rule already exists, no need to insert.
			return nil
		}
	}

	// InsertRule places the rule at the top of the chain; its return value
	// is not needed here.
	_ = conn.InsertRule(wanted)
	if err := conn.Flush(); err != nil {
		return fmt.Errorf("insert rule: %w", err)
	}
	return nil
}
// getNFTByAddr returns the table set for the IP family of addr: the IPv6 set
// for IPv6 addresses, the IPv4 set for everything else. It returns an error
// for an IPv6 address when IPv6 is not available on this host.
func (n *nftablesRunner) getNFTByAddr(addr netip.Addr) (*nftable, error) {
	if !addr.Is6() {
		return n.nft4, nil
	}
	if !n.v6Available {
		return nil, fmt.Errorf("nftables for IPv6 are not available on this host")
	}
	return n.nft6, nil
}
// AddLoopbackRule installs, in the ts-input chain of the filter table for
// the IP family of addr, the rule that permits loopback traffic to a local
// Tailscale IP. The rule is added only if it does not already exist.
func (n *nftablesRunner) AddLoopbackRule(addr netip.Addr) error {
	nft, err := n.getNFTByAddr(addr)
	if err != nil {
		return fmt.Errorf("error setting up nftables for IP family of %v: %w", addr, err)
	}

	tsInput, err := getChainFromTable(n.conn, nft.Filter, chainNameInput)
	if err != nil {
		return fmt.Errorf("get input chain: %w", err)
	}

	err = insertLoopbackRule(n.conn, nft.Proto, nft.Filter, tsInput, addr)
	if err != nil {
		return fmt.Errorf("add loopback rule: %w", err)
	}
	return nil
}
// DelLoopbackRule removes, from the ts-input chain of the filter table for
// the IP family of addr, the rule permitting loopback traffic to a Tailscale
// IP. It is a no-op if no such rule is installed.
func (n *nftablesRunner) DelLoopbackRule(addr netip.Addr) error {
	nft, err := n.getNFTByAddr(addr)
	if err != nil {
		return fmt.Errorf("error setting up nftables for IP family of %v: %w", addr, err)
	}

	tsInput, err := getChainFromTable(n.conn, nft.Filter, chainNameInput)
	if err != nil {
		return fmt.Errorf("get input chain: %w", err)
	}

	// Rebuild the rule as it would have been added, then look for its
	// installed counterpart.
	wanted, err := createLoopbackRule(nft.Proto, nft.Filter, tsInput, addr)
	if err != nil {
		return fmt.Errorf("create loopback rule: %w", err)
	}
	installed, err := findRule(n.conn, wanted)
	if err != nil {
		return fmt.Errorf("find loop back rule: %w", err)
	}
	if installed == nil {
		// Rule does not exist, no need to delete.
		return nil
	}

	if err := n.conn.DelRule(installed); err != nil {
		return fmt.Errorf("delete rule: %w", err)
	}
	return n.conn.Flush()
}
// getTables returns the table sets for the IP families this host was
// determined to support: always IPv4, followed by IPv6 when HasIPV6 is true.
func (n *nftablesRunner) getTables() []*nftable {
	tables := []*nftable{n.nft4}
	if n.HasIPV6() {
		tables = append(tables, n.nft6)
	}
	return tables
}
// AddChains creates, for every supported IP family, the tables and chains
// Tailscale needs, skipping whatever already exists:
//   - the "filter" table with the conventional FORWARD and INPUT base chains
//     and the regular ts-forward and ts-input chains;
//   - the "nat" table with the conventional POSTROUTING base chain and the
//     regular ts-postrouting chain.
//
// The table names are the same as those used by iptables-nft and ufw. We
// install rules into the same conventional tables so that `accept` verdicts
// from our jump chains are conclusive. The resulting tables are recorded in
// the Filter and Nat fields of each family's nftable.
func (n *nftablesRunner) AddChains() error {
	acceptPolicy := nftables.ChainPolicyAccept

	// chainStep pairs a chain to ensure with the context used in its error.
	type chainStep struct {
		what string
		info chainInfo
	}

	for _, nft := range n.getTables() {
		filter, err := createTableIfNotExist(n.conn, nft.Proto, "filter")
		if err != nil {
			return fmt.Errorf("create table: %w", err)
		}
		nft.Filter = filter

		filterSteps := []chainStep{
			// The "conventional chains" that are used by iptables-nft and ufw.
			{"create forward chain", chainInfo{
				table:         filter,
				name:          "FORWARD",
				chainType:     nftables.ChainTypeFilter,
				chainHook:     nftables.ChainHookForward,
				chainPriority: nftables.ChainPriorityFilter,
				chainPolicy:   &acceptPolicy,
			}},
			{"create input chain", chainInfo{
				table:         filter,
				name:          "INPUT",
				chainType:     nftables.ChainTypeFilter,
				chainHook:     nftables.ChainHookInput,
				chainPriority: nftables.ChainPriorityFilter,
				chainPolicy:   &acceptPolicy,
			}},
			// The tailscale chains that contain our rules.
			{"create forward chain", chainInfo{table: filter, name: chainNameForward, chainType: chainTypeRegular}},
			{"create input chain", chainInfo{table: filter, name: chainNameInput, chainType: chainTypeRegular}},
		}
		for _, step := range filterSteps {
			if err := createChainIfNotExist(n.conn, step.info); err != nil {
				return fmt.Errorf("%s: %w", step.what, err)
			}
		}

		nat, err := createTableIfNotExist(n.conn, nft.Proto, "nat")
		if err != nil {
			return fmt.Errorf("create table: %w", err)
		}
		nft.Nat = nat

		natSteps := []chainStep{
			// The "conventional chain" that is used by iptables-nft and ufw.
			{"create postrouting chain", chainInfo{
				table:         nat,
				name:          "POSTROUTING",
				chainType:     nftables.ChainTypeNAT,
				chainHook:     nftables.ChainHookPostrouting,
				chainPriority: nftables.ChainPriorityNATSource,
				chainPolicy:   &acceptPolicy,
			}},
			// The tailscale chain that contains our rules.
			{"create postrouting chain", chainInfo{table: nat, name: chainNamePostrouting, chainType: chainTypeRegular}},
		}
		for _, step := range natSteps {
			if err := createChainIfNotExist(n.conn, step.info); err != nil {
				return fmt.Errorf("%s: %w", step.what, err)
			}
		}
	}
	return n.conn.Flush()
}
// These are the names of the dummy chain and table we create to detect if
// nftables is available. We create them, then delete them. If we can create
// and delete them, then we can use nftables. If we can't, then we assume that
// we're running on a system that doesn't support nftables. See
// createDummyPostroutingChains.
const (
tsDummyChainName = "ts-test-postrouting"
tsDummyTableName = "ts-test-nat"
)
// createDummyPostroutingChains is a last-resort probe for whether nftables
// can be used: for every supported IP family it creates a dummy NAT table
// with a dummy postrouting base chain and then deletes the chain again. The
// dummy tables are deleted when the function returns; a failure to delete
// one is reported only if no earlier error occurred.
//
// Note that the Nat field of each family's nftable is set to the dummy table
// along the way.
func (n *nftablesRunner) createDummyPostroutingChains() (retErr error) {
	acceptPolicy := ptr.To(nftables.ChainPolicyAccept)
	for _, nft := range n.getTables() {
		family := nft.Proto
		dummyTable, err := createTableIfNotExist(n.conn, family, tsDummyTableName)
		if err != nil {
			return fmt.Errorf("create nat table: %w", err)
		}
		// Deferred deliberately: the dummy tables are removed when the whole
		// probe returns, not at the end of each iteration.
		defer func() {
			if err := deleteTableIfExists(n.conn, family, tsDummyTableName); err != nil && retErr == nil {
				retErr = fmt.Errorf("delete %q table: %w", tsDummyTableName, err)
			}
		}()
		nft.Nat = dummyTable

		probe := chainInfo{
			table:         dummyTable,
			name:          tsDummyChainName,
			chainType:     nftables.ChainTypeNAT,
			chainHook:     nftables.ChainHookPostrouting,
			chainPriority: nftables.ChainPriorityNATSource,
			chainPolicy:   acceptPolicy,
		}
		if err := createChainIfNotExist(n.conn, probe); err != nil {
			return fmt.Errorf("create %q chain: %w", tsDummyChainName, err)
		}
		if err := deleteChainIfExists(n.conn, dummyTable, tsDummyChainName); err != nil {
			return fmt.Errorf("delete %q chain: %w", tsDummyChainName, err)
		}
	}
	return nil
}
// deleteChainIfExists flushes and deletes the chain called name in table.
// A chain that cannot be found is not an error: there is nothing to delete
// and nil is returned. Any other lookup failure, or a failure to flush the
// deletion to the connection, is returned wrapped.
func deleteChainIfExists(c *nftables.Conn, table *nftables.Table, name string) error {
	target, lookupErr := getChainFromTable(c, table, name)
	if lookupErr != nil {
		if errors.Is(lookupErr, errorChainNotFound{table.Name, name}) {
			// If the chain doesn't exist, we don't need to delete it.
			return nil
		}
		return fmt.Errorf("get chain: %w", lookupErr)
	}

	// Empty the chain of its rules, then remove the chain itself.
	c.FlushChain(target)
	c.DelChain(target)
	if flushErr := c.Flush(); flushErr != nil {
		return fmt.Errorf("flush and delete chain: %w", flushErr)
	}
	return nil
}
// DelChains removes the custom Tailscale chains from netfilter via nftables:
// the forward and input chains from the filter table of every table returned
// by getTables, the postrouting chain from the IPv4 nat table and, when
// HasIPV6NAT reports true, from the IPv6 nat table as well.
func (n *nftablesRunner) DelChains() error {
	for _, tbl := range n.getTables() {
		for _, chainName := range []string{chainNameForward, chainNameInput} {
			if err := deleteChainIfExists(n.conn, tbl.Filter, chainName); err != nil {
				return fmt.Errorf("delete chain: %w", err)
			}
		}
	}

	// delPostrouting removes the Tailscale postrouting chain from one nat table.
	delPostrouting := func(nat *nftables.Table) error {
		if err := deleteChainIfExists(n.conn, nat, chainNamePostrouting); err != nil {
			return fmt.Errorf("delete chain: %w", err)
		}
		return nil
	}
	if err := delPostrouting(n.nft4.Nat); err != nil {
		return err
	}
	if n.HasIPV6NAT() {
		if err := delPostrouting(n.nft6.Nat); err != nil {
			return err
		}
	}

	if flushErr := n.conn.Flush(); flushErr != nil {
		return fmt.Errorf("flush: %w", flushErr)
	}
	return nil
}
// createHookRule builds (but does not install) a rule for fromChain in table
// that counts every packet and then jumps to the chain named toChainName.
func createHookRule(table *nftables.Table, fromChain *nftables.Chain, toChainName string) *nftables.Rule {
	jump := &expr.Verdict{
		Kind:  expr.VerdictJump,
		Chain: toChainName,
	}
	return &nftables.Rule{
		Table: table,
		Chain: fromChain,
		Exprs: []expr.Any{&expr.Counter{}, jump},
	}
}
// addHookRule inserts, at the top of the hooked chain fromChain, a rule that
// jumps to the regular chain toChainName, and flushes the connection.
func addHookRule(conn *nftables.Conn, table *nftables.Table, fromChain *nftables.Chain, toChainName string) error {
	// InsertRule (rather than AddRule) is used so the jump goes first.
	_ = conn.InsertRule(createHookRule(table, fromChain, toChainName))
	flushErr := conn.Flush()
	if flushErr != nil {
		return fmt.Errorf("flush add rule: %w", flushErr)
	}
	return nil
}
// AddHooks adds rules to the conventional chains "INPUT" and "FORWARD" (in
// the filter table) and "POSTROUTING" (in the nat table) of every table
// returned by getTables, so that packets jump from those chains to the
// corresponding Tailscale chains.
//
// It returns an error naming the conventional chain that could not be looked
// up, or the wrapped addHookRule error.
func (n *nftablesRunner) AddHooks() error {
	conn := n.conn
	for _, table := range n.getTables() {
		hooks := []struct {
			table     *nftables.Table
			hookChain string // conventional chain the jump is installed in
			tsChain   string // Tailscale chain that is jumped to
		}{
			{table.Filter, "INPUT", chainNameInput},
			{table.Filter, "FORWARD", chainNameForward},
			{table.Nat, "POSTROUTING", chainNamePostrouting},
		}
		for _, h := range hooks {
			fromChain, err := getChainFromTable(conn, h.table, h.hookChain)
			if err != nil {
				// Name the chain that was actually looked up; a failed
				// POSTROUTING lookup used to be reported as "INPUT".
				return fmt.Errorf("get %s chain: %w", h.hookChain, err)
			}
			if err := addHookRule(conn, h.table, fromChain, h.tsChain); err != nil {
				return fmt.Errorf("Addhook: %w", err)
			}
		}
	}
	return nil
}
// delHookRule removes from fromChain the rule that jumps to the regular chain
// toChainName. When findRule reports no such rule, there is nothing to do and
// nil is returned.
func delHookRule(conn *nftables.Conn, table *nftables.Table, fromChain *nftables.Chain, toChainName string) error {
	wanted := createHookRule(table, fromChain, toChainName)
	installed, err := findRule(conn, wanted)
	switch {
	case err != nil:
		return fmt.Errorf("Failed to find hook rule: %w", err)
	case installed == nil:
		// No matching rule is installed.
		return nil
	}

	_ = conn.DelRule(installed)
	if flushErr := conn.Flush(); flushErr != nil {
		return fmt.Errorf("flush del hook rule: %w", flushErr)
	}
	return nil
}
// DelHooks deletes the rules that AddHooks added to the conventional chains
// "INPUT" and "FORWARD" (filter table) and "POSTROUTING" (nat table) to jump
// to the Tailscale chains, for every table returned by getTables.
//
// logf is currently unused. An error naming the conventional chain that could
// not be looked up, or the wrapped delHookRule error, is returned.
func (n *nftablesRunner) DelHooks(logf logger.Logf) error {
	conn := n.conn
	for _, table := range n.getTables() {
		hooks := []struct {
			table     *nftables.Table
			hookChain string // conventional chain holding the jump
			tsChain   string // Tailscale chain that is jumped to
		}{
			{table.Filter, "INPUT", chainNameInput},
			{table.Filter, "FORWARD", chainNameForward},
			{table.Nat, "POSTROUTING", chainNamePostrouting},
		}
		for _, h := range hooks {
			fromChain, err := getChainFromTable(conn, h.table, h.hookChain)
			if err != nil {
				// Name the chain that was actually looked up; a failed
				// POSTROUTING lookup used to be reported as "INPUT".
				return fmt.Errorf("get %s chain: %w", h.hookChain, err)
			}
			if err := delHookRule(conn, h.table, fromChain, h.tsChain); err != nil {
				return fmt.Errorf("delhook: %w", err)
			}
		}
	}
	return nil
}
// maskof returns the 4-byte network mask for pfx in big endian byte order,
// i.e. a value whose top pfx.Bits() bits are set (for example /10 yields
// ff c0 00 00).
func maskof(pfx netip.Prefix) []byte {
	var buf [4]byte
	hostBits := uint32(0xffff_ffff) >> pfx.Bits()
	binary.BigEndian.PutUint32(buf[:], ^hostBits)
	return buf[:]
}
// createRangeRule creates a rule that matches packets whose source IP is in
// the given range (like the CGNAT range or the ChromeOS VM range) and whose
// input interface is not tunname, and applies the given decision to them.
// Only IPv4 is supported: an IPv6 range yields an error.
func createRangeRule(
	table *nftables.Table, chain *nftables.Chain,
	tunname string, rng netip.Prefix, decision expr.VerdictKind,
) (*nftables.Rule, error) {
	if rng.Addr().Is6() {
		return nil, errors.New("IPv6 is not supported")
	}
	loadSaddr, err := newLoadSaddrExpr(nftables.TableFamilyIPv4, 1)
	if err != nil {
		return nil, fmt.Errorf("newLoadSaddrExpr: %w", err)
	}
	rangeAddr := rng.Addr().AsSlice()
	rangeMask := maskof(rng)

	exprs := []expr.Any{
		// Input interface name must differ from tunname.
		&expr.Meta{Key: expr.MetaKeyIIFNAME, Register: 1},
		&expr.Cmp{
			Op:       expr.CmpOpNeq,
			Register: 1,
			Data:     []byte(tunname),
		},
		// Source address, masked with the range's mask, must equal the
		// range's address.
		loadSaddr,
		&expr.Bitwise{
			SourceRegister: 1,
			DestRegister:   1,
			Len:            4,
			Mask:           rangeMask,
			Xor:            []byte{0x00, 0x00, 0x00, 0x00},
		},
		&expr.Cmp{
			Op:       expr.CmpOpEq,
			Register: 1,
			Data:     rangeAddr,
		},
		// Count the packet and apply the caller's verdict.
		&expr.Counter{},
		&expr.Verdict{
			Kind: decision,
		},
	}
	return &nftables.Rule{Table: table, Chain: chain, Exprs: exprs}, nil
}
// addReturnChromeOSVMRangeRule appends to chain a rule with a "return"
// verdict for packets whose source IP is in the ChromeOS VM range (see
// createRangeRule for the exact match), and flushes the connection.
func addReturnChromeOSVMRangeRule(c *nftables.Conn, table *nftables.Table, chain *nftables.Chain, tunname string) error {
	vmRange := tsaddr.ChromeOSVMRange()
	returnRule, err := createRangeRule(table, chain, tunname, vmRange, expr.VerdictReturn)
	if err != nil {
		return fmt.Errorf("create rule: %w", err)
	}
	_ = c.AddRule(returnRule)
	if flushErr := c.Flush(); flushErr != nil {
		return fmt.Errorf("add rule: %w", flushErr)
	}
	return nil
}
// addDropCGNATRangeRule appends to chain a rule with a "drop" verdict for
// packets whose source IP is in the CGNAT range (see createRangeRule for the
// exact match), and flushes the connection.
func addDropCGNATRangeRule(c *nftables.Conn, table *nftables.Table, chain *nftables.Chain, tunname string) error {
	cgnat := tsaddr.CGNATRange()
	dropRule, err := createRangeRule(table, chain, tunname, cgnat, expr.VerdictDrop)
	if err != nil {
		return fmt.Errorf("create rule: %w", err)
	}
	_ = c.AddRule(dropRule)
	if flushErr := c.Flush(); flushErr != nil {
		return fmt.Errorf("add rule: %w", flushErr)
	}
	return nil
}
// createSetSubnetRouteMarkRule builds a rule for chain that, for packets
// whose input interface is tunname, rewrites the packet mark: the current
// mark is loaded, combined with the values from getTailscaleFwmarkMaskNeg
// (as bitwise mask) and getTailscaleSubnetRouteMark (as bitwise xor), and
// stored back. The returned error is always nil.
func createSetSubnetRouteMarkRule(table *nftables.Table, chain *nftables.Chain, tunname string) (*nftables.Rule, error) {
	clearMask := getTailscaleFwmarkMaskNeg()
	subnetRouteMark := getTailscaleSubnetRouteMark()

	exprs := []expr.Any{
		// Only packets arriving on tunname are considered.
		&expr.Meta{Key: expr.MetaKeyIIFNAME, Register: 1},
		&expr.Cmp{
			Op:       expr.CmpOpEq,
			Register: 1,
			Data:     []byte(tunname),
		},
		&expr.Counter{},
		// Load the mark, apply mask and xor, then write it back.
		&expr.Meta{Key: expr.MetaKeyMARK, Register: 1},
		&expr.Bitwise{
			SourceRegister: 1,
			DestRegister:   1,
			Len:            4,
			Mask:           clearMask,
			Xor:            subnetRouteMark,
		},
		&expr.Meta{
			Key:            expr.MetaKeyMARK,
			SourceRegister: true,
			Register:       1,
		},
	}
	return &nftables.Rule{Table: table, Chain: chain, Exprs: exprs}, nil
}
// addSetSubnetRouteMarkRule appends to chain the rule built by
// createSetSubnetRouteMarkRule, which sets the subnet route mark on packets
// coming from the interface tunname, and flushes the connection.
func addSetSubnetRouteMarkRule(c *nftables.Conn, table *nftables.Table, chain *nftables.Chain, tunname string) error {
	markRule, err := createSetSubnetRouteMarkRule(table, chain, tunname)
	if err != nil {
		return fmt.Errorf("create rule: %w", err)
	}
	_ = c.AddRule(markRule)
	if flushErr := c.Flush(); flushErr != nil {
		return fmt.Errorf("add rule: %w", flushErr)
	}
	return nil
}
// createDropOutgoingPacketFromCGNATRangeRuleWithTunname creates a rule that
// drops packets whose output interface is tunname and whose IPv4 source
// address lies in the CGNAT range.
//
// Errors from parsing the range, decoding its mask, or building the source
// address load expression are returned wrapped with %w, so callers can
// inspect the underlying cause with errors.Is / errors.As.
func createDropOutgoingPacketFromCGNATRangeRuleWithTunname(table *nftables.Table, chain *nftables.Chain, tunname string) (*nftables.Rule, error) {
	_, cgnat, err := net.ParseCIDR(tsaddr.CGNATRange().String())
	if err != nil {
		return nil, fmt.Errorf("parse cidr: %w", err)
	}
	// net.IPMask.String renders the mask as hex digits without punctuation,
	// so decoding that string yields the raw mask bytes.
	mask, err := hex.DecodeString(cgnat.Mask.String())
	if err != nil {
		return nil, fmt.Errorf("decode mask: %w", err)
	}
	// Named "network" rather than "netip" to avoid shadowing the netip package.
	network := cgnat.IP.Mask(cgnat.Mask).To4()
	saddrExpr, err := newLoadSaddrExpr(nftables.TableFamilyIPv4, 1)
	if err != nil {
		return nil, fmt.Errorf("newLoadSaddrExpr: %w", err)
	}
	rule := &nftables.Rule{
		Table: table,
		Chain: chain,
		Exprs: []expr.Any{
			// Output interface must be tunname.
			&expr.Meta{Key: expr.MetaKeyOIFNAME, Register: 1},
			&expr.Cmp{
				Op:       expr.CmpOpEq,
				Register: 1,
				Data:     []byte(tunname),
			},
			// Source address, masked, must equal the CGNAT network address.
			saddrExpr,
			&expr.Bitwise{
				SourceRegister: 1,
				DestRegister:   1,
				Len:            4,
				Mask:           mask,
				Xor:            []byte{0x00, 0x00, 0x00, 0x00},
			},
			&expr.Cmp{
				Op:       expr.CmpOpEq,
				Register: 1,
				Data:     network,
			},
			&expr.Counter{},
			&expr.Verdict{
				Kind: expr.VerdictDrop,
			},
		},
	}
	return rule, nil
}
// addDropOutgoingPacketFromCGNATRangeRuleWithTunname appends to chain the
// rule built by createDropOutgoingPacketFromCGNATRangeRuleWithTunname, which
// drops outgoing packets from the CGNAT range, and flushes the connection.
func addDropOutgoingPacketFromCGNATRangeRuleWithTunname(conn *nftables.Conn, table *nftables.Table, chain *nftables.Chain, tunname string) error {
	dropRule, err := createDropOutgoingPacketFromCGNATRangeRuleWithTunname(table, chain, tunname)
	if err != nil {
		return fmt.Errorf("create rule: %w", err)
	}
	_ = conn.AddRule(dropRule)
	if flushErr := conn.Flush(); flushErr != nil {
		return fmt.Errorf("add rule: %w", flushErr)
	}
	return nil
}
// createAcceptOutgoingPacketRule builds a rule for chain that counts and
// accepts packets whose output interface is tunname.
func createAcceptOutgoingPacketRule(table *nftables.Table, chain *nftables.Chain, tunname string) *nftables.Rule {
	matchOutIface := []expr.Any{
		&expr.Meta{Key: expr.MetaKeyOIFNAME, Register: 1},
		&expr.Cmp{
			Op:       expr.CmpOpEq,
			Register: 1,
			Data:     []byte(tunname),
		},
	}
	accept := &expr.Verdict{Kind: expr.VerdictAccept}

	rule := &nftables.Rule{Table: table, Chain: chain}
	rule.Exprs = append(matchOutIface, &expr.Counter{}, accept)
	return rule
}
// addAcceptOutgoingPacketRule appends to chain the rule built by
// createAcceptOutgoingPacketRule, which accepts packets leaving through the
// interface tunname, and flushes the connection.
func addAcceptOutgoingPacketRule(conn *nftables.Conn, table *nftables.Table, chain *nftables.Chain, tunname string) error {
	_ = conn.AddRule(createAcceptOutgoingPacketRule(table, chain, tunname))
	flushErr := conn.Flush()
	if flushErr != nil {
		return fmt.Errorf("flush add rule: %w", flushErr)
	}
	return nil
}
// createAcceptOnPortRule builds a rule for chain that counts and accepts
// UDP packets whose destination port equals port.
func createAcceptOnPortRule(table *nftables.Table, chain *nftables.Chain, port uint16) *nftables.Rule {
	// The port is compared in big endian byte order.
	var dport [2]byte
	binary.BigEndian.PutUint16(dport[:], port)

	exprs := []expr.Any{
		// Layer 4 protocol must be UDP.
		&expr.Meta{
			Key:      expr.MetaKeyL4PROTO,
			Register: 1,
		},
		&expr.Cmp{
			Op:       expr.CmpOpEq,
			Register: 1,
			Data:     []byte{unix.IPPROTO_UDP},
		},
		// Destination port must match.
		newLoadDportExpr(1),
		&expr.Cmp{
			Op:       expr.CmpOpEq,
			Register: 1,
			Data:     dport[:],
		},
		&expr.Counter{},
		&expr.Verdict{
			Kind: expr.VerdictAccept,
		},
	}
	return &nftables.Rule{Table: table, Chain: chain, Exprs: exprs}
}
// addAcceptOnPortRule appends to chain the rule built by
// createAcceptOnPortRule, which accepts incoming packets to the given
// destination UDP port, and flushes the connection.
func addAcceptOnPortRule(conn *nftables.Conn, table *nftables.Table, chain *nftables.Chain, port uint16) error {
	_ = conn.AddRule(createAcceptOnPortRule(table, chain, port))
	flushErr := conn.Flush()
	if flushErr != nil {
		return fmt.Errorf("flush add rule: %w", flushErr)
	}
	return nil
}
// removeAcceptOnPortRule removes the rule that accepts incoming packets to
// a given destination UDP port (the rule built by createAcceptOnPortRule).
//
// If findRule reports no matching rule (a nil rule with a nil error, the
// same convention delHookRule and DelSNATRule check for), there is nothing
// to delete and nil is returned instead of handing a nil rule to DelRule.
func removeAcceptOnPortRule(conn *nftables.Conn, table *nftables.Table, chain *nftables.Chain, port uint16) error {
	want := createAcceptOnPortRule(table, chain, port)
	existing, err := findRule(conn, want)
	if err != nil {
		return fmt.Errorf("find rule: %w", err)
	}
	if existing == nil {
		// The rule is not installed; removal is a no-op.
		return nil
	}
	_ = conn.DelRule(existing)
	if err := conn.Flush(); err != nil {
		return fmt.Errorf("flush del rule: %w", err)
	}
	return nil
}
// AddMagicsockPortRule adds a rule to nftables to allow incoming traffic on
// the specified UDP port, so magicsock can accept incoming connections.
// network must be either "udp4" or "udp6" - this determines whether the rule
// is added for IPv4 or IPv6. Any other value yields an error.
//
// Underlying errors are wrapped with %w so callers can inspect them.
func (n *nftablesRunner) AddMagicsockPortRule(port uint16, network string) error {
	var filterTable *nftables.Table
	switch network {
	case "udp4":
		filterTable = n.nft4.Filter
	case "udp6":
		filterTable = n.nft6.Filter
	default:
		return fmt.Errorf("unsupported network %s", network)
	}
	// The rule goes into the Tailscale input chain of the selected family.
	inputChain, err := getChainFromTable(n.conn, filterTable, chainNameInput)
	if err != nil {
		return fmt.Errorf("get input chain: %w", err)
	}
	if err := addAcceptOnPortRule(n.conn, filterTable, inputChain, port); err != nil {
		return fmt.Errorf("add accept on port rule: %w", err)
	}
	return nil
}
// DelMagicsockPortRule removes a rule added by AddMagicsockPortRule to accept
// incoming traffic on a particular UDP port.
// network must be either "udp4" or "udp6" - this determines whether the rule
// is removed for IPv4 or IPv6. Any other value yields an error.
//
// Underlying errors are wrapped with %w so callers can inspect them.
func (n *nftablesRunner) DelMagicsockPortRule(port uint16, network string) error {
	var filterTable *nftables.Table
	switch network {
	case "udp4":
		filterTable = n.nft4.Filter
	case "udp6":
		filterTable = n.nft6.Filter
	default:
		return fmt.Errorf("unsupported network %s", network)
	}
	// The rule lives in the Tailscale input chain of the selected family.
	inputChain, err := getChainFromTable(n.conn, filterTable, chainNameInput)
	if err != nil {
		return fmt.Errorf("get input chain: %w", err)
	}
	if err := removeAcceptOnPortRule(n.conn, filterTable, inputChain, port); err != nil {
		// This is the removal path; the message used to say "add".
		return fmt.Errorf("remove accept on port rule: %w", err)
	}
	return nil
}
// createAcceptIncomingPacketRule builds a rule for chain that counts and
// accepts packets whose input interface is tunname.
func createAcceptIncomingPacketRule(table *nftables.Table, chain *nftables.Chain, tunname string) *nftables.Rule {
	matchInIface := []expr.Any{
		&expr.Meta{Key: expr.MetaKeyIIFNAME, Register: 1},
		&expr.Cmp{
			Op:       expr.CmpOpEq,
			Register: 1,
			Data:     []byte(tunname),
		},
	}
	accept := &expr.Verdict{Kind: expr.VerdictAccept}

	rule := &nftables.Rule{Table: table, Chain: chain}
	rule.Exprs = append(matchInIface, &expr.Counter{}, accept)
	return rule
}
// addAcceptIncomingPacketRule appends to chain the rule built by
// createAcceptIncomingPacketRule, which accepts packets arriving on the
// interface tunname, and flushes the connection.
func addAcceptIncomingPacketRule(conn *nftables.Conn, table *nftables.Table, chain *nftables.Chain, tunname string) error {
	_ = conn.AddRule(createAcceptIncomingPacketRule(table, chain, tunname))
	flushErr := conn.Flush()
	if flushErr != nil {
		return fmt.Errorf("flush add rule: %w", flushErr)
	}
	return nil
}
// AddBase adds some basic processing rules for the interface tunname: the
// IPv4 rules always, and the IPv6 rules only when HasIPV6 reports true.
func (n *nftablesRunner) AddBase(tunname string) error {
	err := n.addBase4(tunname)
	if err != nil {
		return fmt.Errorf("add base v4: %w", err)
	}
	if !n.HasIPV6() {
		return nil
	}
	if err = n.addBase6(tunname); err != nil {
		return fmt.Errorf("add base v6: %w", err)
	}
	return nil
}
// addBase4 adds some basic IPv4 processing rules for the interface tunname.
//
// To the Tailscale input chain it appends, in this order: the ChromeOS VM
// range "return" rule, the CGNAT range "drop" rule and the accept-incoming
// rule. To the Tailscale forward chain it appends, in this order: the
// set-subnet-route-mark rule, the match-subnet-route-mark accept rule, the
// drop-outgoing-from-CGNAT rule and the accept-outgoing rule.
//
// All underlying errors are wrapped with %w so callers can inspect them.
func (n *nftablesRunner) addBase4(tunname string) error {
	conn := n.conn

	inputChain, err := getChainFromTable(conn, n.nft4.Filter, chainNameInput)
	if err != nil {
		return fmt.Errorf("get input chain v4: %w", err)
	}
	if err = addReturnChromeOSVMRangeRule(conn, n.nft4.Filter, inputChain, tunname); err != nil {
		return fmt.Errorf("add return chromeos vm range rule v4: %w", err)
	}
	if err = addDropCGNATRangeRule(conn, n.nft4.Filter, inputChain, tunname); err != nil {
		return fmt.Errorf("add drop cgnat range rule v4: %w", err)
	}
	if err = addAcceptIncomingPacketRule(conn, n.nft4.Filter, inputChain, tunname); err != nil {
		return fmt.Errorf("add accept incoming packet rule v4: %w", err)
	}

	forwardChain, err := getChainFromTable(conn, n.nft4.Filter, chainNameForward)
	if err != nil {
		return fmt.Errorf("get forward chain v4: %w", err)
	}
	if err = addSetSubnetRouteMarkRule(conn, n.nft4.Filter, forwardChain, tunname); err != nil {
		return fmt.Errorf("add set subnet route mark rule v4: %w", err)
	}
	if err = addMatchSubnetRouteMarkRule(conn, n.nft4.Filter, forwardChain, Accept); err != nil {
		return fmt.Errorf("add match subnet route mark rule v4: %w", err)
	}
	if err = addDropOutgoingPacketFromCGNATRangeRuleWithTunname(conn, n.nft4.Filter, forwardChain, tunname); err != nil {
		return fmt.Errorf("add drop outgoing packet from cgnat range rule v4: %w", err)
	}
	if err = addAcceptOutgoingPacketRule(conn, n.nft4.Filter, forwardChain, tunname); err != nil {
		return fmt.Errorf("add accept outgoing packet rule v4: %w", err)
	}

	if err = conn.Flush(); err != nil {
		return fmt.Errorf("flush base v4: %w", err)
	}
	return nil
}
// addBase6 adds some basic IPv6 processing rules for the interface tunname.
//
// To the Tailscale input chain it appends the accept-incoming rule. To the
// Tailscale forward chain it appends, in this order: the
// set-subnet-route-mark rule, the match-subnet-route-mark accept rule and
// the accept-outgoing rule.
//
// All underlying errors are wrapped with %w so callers can inspect them.
func (n *nftablesRunner) addBase6(tunname string) error {
	conn := n.conn

	inputChain, err := getChainFromTable(conn, n.nft6.Filter, chainNameInput)
	if err != nil {
		// This is the IPv6 input chain; the message used to say "v4".
		return fmt.Errorf("get input chain v6: %w", err)
	}
	if err = addAcceptIncomingPacketRule(conn, n.nft6.Filter, inputChain, tunname); err != nil {
		return fmt.Errorf("add accept incoming packet rule v6: %w", err)
	}

	forwardChain, err := getChainFromTable(conn, n.nft6.Filter, chainNameForward)
	if err != nil {
		return fmt.Errorf("get forward chain v6: %w", err)
	}
	if err = addSetSubnetRouteMarkRule(conn, n.nft6.Filter, forwardChain, tunname); err != nil {
		return fmt.Errorf("add set subnet route mark rule v6: %w", err)
	}
	if err = addMatchSubnetRouteMarkRule(conn, n.nft6.Filter, forwardChain, Accept); err != nil {
		return fmt.Errorf("add match subnet route mark rule v6: %w", err)
	}
	if err = addAcceptOutgoingPacketRule(conn, n.nft6.Filter, forwardChain, tunname); err != nil {
		return fmt.Errorf("add accept outgoing packet rule v6: %w", err)
	}

	if err = conn.Flush(); err != nil {
		return fmt.Errorf("flush base v6: %w", err)
	}
	return nil
}
// DelBase empties, but does not remove, the custom Tailscale chains from
// netfilter via nftables: for every table returned by getTables it flushes
// the Tailscale input and forward chains of the filter table and the
// Tailscale postrouting chain of the nat table.
//
// Lookup errors are wrapped with %w; the result of the final connection
// flush is returned as is.
func (n *nftablesRunner) DelBase() error {
	conn := n.conn
	for _, table := range n.getTables() {
		inputChain, err := getChainFromTable(conn, table.Filter, chainNameInput)
		if err != nil {
			return fmt.Errorf("get input chain: %w", err)
		}
		conn.FlushChain(inputChain)

		forwardChain, err := getChainFromTable(conn, table.Filter, chainNameForward)
		if err != nil {
			return fmt.Errorf("get forward chain: %w", err)
		}
		conn.FlushChain(forwardChain)

		postrouteChain, err := getChainFromTable(conn, table.Nat, chainNamePostrouting)
		if err != nil {
			// The loop covers every table family, so the message no longer
			// claims the failure is IPv4-specific.
			return fmt.Errorf("get postrouting chain: %w", err)
		}
		conn.FlushChain(postrouteChain)
	}
	return conn.Flush()
}
// createMatchSubnetRouteMarkRule builds a rule for chain that matches packets
// whose mark, masked with getTailscaleFwmarkMask, equals
// getTailscaleSubnetRouteMark. Matching packets are counted and then
// masqueraded when action is Masq; for any other action they are accepted.
// The returned error is always nil.
func createMatchSubnetRouteMarkRule(table *nftables.Table, chain *nftables.Chain, action MatchDecision) (*nftables.Rule, error) {
	fwmarkMask := getTailscaleFwmarkMask()
	subnetRouteMark := getTailscaleSubnetRouteMark()

	var final expr.Any = &expr.Verdict{Kind: expr.VerdictAccept}
	if action == Masq {
		final = &expr.Masq{}
	}

	return &nftables.Rule{
		Table: table,
		Chain: chain,
		Exprs: []expr.Any{
			// Load the packet mark, keep only the masked bits, and compare
			// them with the subnet route mark.
			&expr.Meta{Key: expr.MetaKeyMARK, Register: 1},
			&expr.Bitwise{
				SourceRegister: 1,
				DestRegister:   1,
				Len:            4,
				Mask:           fwmarkMask,
				Xor:            []byte{0x00, 0x00, 0x00, 0x00},
			},
			&expr.Cmp{
				Op:       expr.CmpOpEq,
				Register: 1,
				Data:     subnetRouteMark,
			},
			&expr.Counter{},
			final,
		},
	}, nil
}
// addMatchSubnetRouteMarkRule appends to chain the rule built by
// createMatchSubnetRouteMarkRule, which matches packets carrying the subnet
// route mark and takes the specified action, and flushes the connection.
func addMatchSubnetRouteMarkRule(conn *nftables.Conn, table *nftables.Table, chain *nftables.Chain, action MatchDecision) error {
	matchRule, err := createMatchSubnetRouteMarkRule(table, chain, action)
	if err != nil {
		return fmt.Errorf("create match subnet route mark rule: %w", err)
	}
	_ = conn.AddRule(matchRule)
	if flushErr := conn.Flush(); flushErr != nil {
		return fmt.Errorf("flush add rule: %w", flushErr)
	}
	return nil
}
// AddSNATRule adds a netfilter rule to SNAT traffic destined for
// local subnets. For every table returned by getTables it appends, to the
// Tailscale postrouting chain of the nat table, a rule that masquerades
// packets carrying the subnet route mark.
func (n *nftablesRunner) AddSNATRule() error {
	conn := n.conn
	for _, table := range n.getTables() {
		chain, err := getChainFromTable(conn, table.Nat, chainNamePostrouting)
		if err != nil {
			// The loop covers every table family, so the messages no longer
			// claim the failure is IPv4-specific.
			return fmt.Errorf("get postrouting chain: %w", err)
		}
		if err = addMatchSubnetRouteMarkRule(conn, table.Nat, chain, Masq); err != nil {
			return fmt.Errorf("add match subnet route mark rule: %w", err)
		}
	}
	if err := conn.Flush(); err != nil {
		return fmt.Errorf("flush add SNAT rule: %w", err)
	}
	return nil
}
// DelSNATRule removes the netfilter rule to SNAT traffic destined for
// local subnets from the Tailscale postrouting chain of every nat table
// returned by getTables. If findRule reports no matching rule in a table,
// nothing is deleted there and no error is returned for it.
//
// The rule to look for is built by createMatchSubnetRouteMarkRule with the
// Masq action, the same builder AddSNATRule's rule comes from, so the lookup
// template cannot drift from what was installed. The previous hand-written
// copy omitted the Bitwise Xor field that the installed rule sets.
func (n *nftablesRunner) DelSNATRule() error {
	conn := n.conn
	for _, table := range n.getTables() {
		chain, err := getChainFromTable(conn, table.Nat, chainNamePostrouting)
		if err != nil {
			// The loop covers every table family, so the messages no longer
			// claim the failure is IPv4-specific.
			return fmt.Errorf("get postrouting chain: %w", err)
		}
		want, err := createMatchSubnetRouteMarkRule(table.Nat, chain, Masq)
		if err != nil {
			return fmt.Errorf("create match subnet route mark rule: %w", err)
		}
		existing, err := findRule(conn, want)
		if err != nil {
			return fmt.Errorf("find SNAT rule: %w", err)
		}
		if existing != nil {
			_ = conn.DelRule(existing)
		}
	}
	if err := conn.Flush(); err != nil {
		return fmt.Errorf("flush del SNAT rule: %w", err)
	}
	return nil
}
// nativeUint32 returns v encoded as a fresh 4-byte slice in the host's
// native byte order.
func nativeUint32(v uint32) []byte {
	return binary.NativeEndian.AppendUint32(make([]byte, 0, 4), v)
}
// makeStatefulRuleExprs returns the expressions of the stateful filtering
// rule: a packet whose output interface is tunname and whose conntrack state
// has any bit set other than ESTABLISHED, RELATED or UNTRACKED is counted
// and dropped. For every other packet one of the comparisons fails, which
// ends evaluation of this rule and lets processing continue with the next
// rule.
func makeStatefulRuleExprs(tunname string) []expr.Any {
	// TODO(andrew-d): iptables-nft writes a rule that dumps as:
	//
	//	match name conntrack rev 3
	//
	// I think this is using expr.Match against the following struct
	// (xt_conntrack_mtinfo3):
	//
	//	https://github.com/torvalds/linux/blob/master/include/uapi/linux/netfilter/xt_conntrack.h#L64-L77
	//
	// We could probably do something similar here, but I'm not sure if
	// there's any advantage. Below is an example Match statement if we
	// decide to do that, based on dumping the rule that iptables-nft
	// generates:
	//
	//	_ = expr.Match{
	//		Name: "conntrack",
	//		Rev:  3,
	//		Info: &xt.ConntrackMtinfo3{
	//			ConntrackMtinfo2: xt.ConntrackMtinfo2{
	//				ConntrackMtinfoBase: xt.ConntrackMtinfoBase{
	//					MatchFlags:  xt.ConntrackState,
	//					InvertFlags: xt.ConntrackState,
	//				},
	//				// Mask the state to remove ESTABLISHED and
	//				// RELATED before comparing.
	//				StateMask: expr.CtStateBitESTABLISHED | expr.CtStateBitRELATED,
	//			},
	//		},
	//	}

	// State bits that are expected and fine, and therefore hidden before
	// the comparison.
	//
	// TODO(andrew-d): for now, let's also allow CtStateBitUNTRACKED, which
	// is a state for packets that are not tracked (marked so explicitly with
	// an iptables rule using --notrack); we should figure out if we want to
	// allow this or not.
	var allowedStates uint32 = expr.CtStateBitESTABLISHED |
		expr.CtStateBitRELATED |
		expr.CtStateBitUNTRACKED

	// Step 1: check that the output interface is the Tailscale interface by
	// loading the OIFNAME into register 1 and comparing it against tunname.
	//
	// 'cmp' implicitly breaks from a rule if a comparison fails, so past
	// this point we know that the packet is going to our TUN.
	loadOutIface := &expr.Meta{Key: expr.MetaKeyOIFNAME, Register: 1}
	matchTun := &expr.Cmp{
		Op:       expr.CmpOpEq,
		Register: 1,
		Data:     []byte(tunname),
	}

	// Step 2: store the conntrack state in register 1.
	loadState := &expr.Ct{
		Register: 1,
		Key:      expr.CtKeySTATE,
	}

	// Step 3: mask the state in register 1 to "hide" the allowed bits; any
	// other bits remain.
	//
	// This operation is, in the kernel:
	//
	//	dst[i] = (src[i] & mask[i]) ^ xor[i]
	//
	// So the mask is the inverse of the bits we want to remove; i.e.
	// ESTABLISHED = 0b00000010, RELATED = 0b00000100, so, assuming an 8-bit
	// state (in reality it's 32-bit), masking with 0b11111001 clears those
	// bits and keeps everything else (e.g. the INVALID bit, 0b00000001).
	hideAllowed := &expr.Bitwise{
		SourceRegister: 1,
		DestRegister:   1,
		Len:            4,
		Mask:           nativeUint32(^allowedStates),
		// Xor is unused but must be specified
		Xor: nativeUint32(0),
	}

	// Step 4: compare against 0, i.e. "no bits set other than the allowed
	// ones". The comparison must fail in that case, so that evaluation of
	// this rule stops and we don't fall through to the "Drop" verdict.
	//
	// For example, if the state is ESTABLISHED (and we want to break from
	// this rule/accept this packet):
	//
	//	state     = ESTABLISHED
	//	register1 = 0b0 (the bitwise operation cleared the ESTABLISHED bit)
	//
	//	compare register1 (0b0) != 0: false
	//	  -> comparison implicitly breaks
	//	  -> continue to the next rule
	//
	// For example, if the state is NEW (and we want to continue to the next
	// expression and thus drop this packet):
	//
	//	state     = NEW
	//	register1 = 0b1000
	//
	//	compare register1 (0b1000) != 0: true
	//	  -> comparison continues to next expr
	requireOtherBits := &expr.Cmp{
		Op:       expr.CmpOpNeq,
		Register: 1,
		Data:     []byte{0, 0, 0, 0},
	}

	// Step 5: if we get here, the packet is going to our TUN device and has
	// a conntrack state set other than the allowed ones. We thus count and
	// drop the packet.
	return []expr.Any{
		loadOutIface,
		matchTun,
		loadState,
		hideAllowed,
		requireOtherBits,
		&expr.Counter{},
		&expr.Verdict{Kind: expr.VerdictDrop},
	}
}
// AddStatefulRule adds a netfilter rule for stateful packet filtering using
// conntrack. For every table returned by getTables, the rule built from
// makeStatefulRuleExprs is inserted into the Tailscale forward chain directly
// before the rule that accepts packets going out through tunname.
//
// An error is returned if that accept rule cannot be found in a forward
// chain: findRule signals "not found" with a nil rule and nil error, and
// without a rule there is no position to insert before.
func (n *nftablesRunner) AddStatefulRule(tunname string) error {
	conn := n.conn
	exprs := makeStatefulRuleExprs(tunname)
	for _, table := range n.getTables() {
		chain, err := getChainFromTable(conn, table.Filter, chainNameForward)
		if err != nil {
			return fmt.Errorf("get forward chain: %w", err)
		}
		// First, find the 'accept' rule that we want to insert our rule before.
		acceptRule := createAcceptOutgoingPacketRule(table.Filter, chain, tunname)
		rule, err := findRule(conn, acceptRule)
		if err != nil {
			return fmt.Errorf("find accept rule: %w", err)
		}
		if rule == nil {
			// Guard against dereferencing a nil rule below.
			return fmt.Errorf("find accept rule: no accept rule for %q in chain %q", tunname, chainNameForward)
		}
		conn.InsertRule(&nftables.Rule{
			Table: table.Filter,
			Chain: chain,
			Exprs: exprs,
			// Specifying Position in an Insert operation means to
			// insert this rule before the specified rule.
			Position: rule.Handle,
		})
	}
	if err := conn.Flush(); err != nil {
		return fmt.Errorf("flush add stateful rule: %w", err)
	}
	return nil
}
// DelStatefulRule removes the netfilter rule for stateful packet filtering
// using conntrack from the Tailscale forward chain of every table returned
// by getTables. Tables in which findRule reports no such rule are skipped.
func (n *nftablesRunner) DelStatefulRule(tunname string) error {
	conn := n.conn
	statefulExprs := makeStatefulRuleExprs(tunname)
	for _, tbl := range n.getTables() {
		fwdChain, err := getChainFromTable(conn, tbl.Filter, chainNameForward)
		if err != nil {
			return fmt.Errorf("get forward chain: %w", err)
		}
		wanted := &nftables.Rule{
			Table: tbl.Filter,
			Chain: fwdChain,
			Exprs: statefulExprs,
		}
		installed, err := findRule(conn, wanted)
		if err != nil {
			return fmt.Errorf("find stateful rule: %w", err)
		}
		if installed == nil {
			continue
		}
		conn.DelRule(installed)
	}
	if flushErr := conn.Flush(); flushErr != nil {
		return fmt.Errorf("flush del stateful rule: %w", flushErr)
	}
	return nil
}
// cleanupChain removes a jump rule from hookChainName to tsChainName, and then
// the entire chain tsChainName. Errors are logged, but attempts to remove both
// the jump rule and chain continue even if one errors. A chain that simply
// does not exist is not treated as an error and is not logged.
func cleanupChain(logf logger.Logf, conn *nftables.Conn, table *nftables.Table, hookChainName, tsChainName string) {
	// Remove the jump first, before removing the jump destination.
	hookChain, err := getChainFromTable(conn, table, hookChainName)
	switch {
	case err == nil:
		// Delete the hook in the conventional chain. This is attempted only
		// when the lookup succeeded, so delHookRule never gets a nil chain.
		if err := delHookRule(conn, table, hookChain, tsChainName); err != nil {
			logf("cleanup: delete hook rule from %s to %s: %s", hookChainName, tsChainName, err)
		}
	case !errors.Is(err, errorChainNotFound{table.Name, hookChainName}):
		logf("cleanup: did not find default chain: %s", err)
	}

	tsChain, err := getChainFromTable(conn, table, tsChainName)
	if err != nil {
		if !errors.Is(err, errorChainNotFound{table.Name, tsChainName}) {
			logf("cleanup: did not find ts-chain: %s", err)
		}
		return
	}
	if tsChain == nil {
		return
	}
	// Flush and delete the ts-chain; only report an actual failure.
	conn.FlushChain(tsChain)
	conn.DelChain(tsChain)
	if err := conn.Flush(); err != nil {
		logf("cleanup: delete and flush chain %s: %s", tsChainName, err)
	}
}
// NfTablesCleanUp removes all Tailscale added nftables rules.
// Any errors that occur are logged to the provided logf.
//
// If the nftables connection cannot be created or the tables cannot be
// listed, the error is logged and the cleanup stops, since there is nothing
// it can operate on.
func NfTablesCleanUp(logf logger.Logf) {
	conn, err := nftables.New()
	if err != nil {
		logf("cleanup: nftables connection: %s", err)
		// Without a connection no further call can be made.
		return
	}
	tables, err := conn.ListTables() // both v4 and v6
	if err != nil {
		logf("cleanup: list tables: %s", err)
		return
	}
	for _, table := range tables {
		switch table.Name {
		case "ts-filter", "ts-nat":
			// These table names were used briefly in 1.48.0.
			conn.DelTable(table)
			if err := conn.Flush(); err != nil {
				logf("cleanup: flush delete table %s: %s", table.Name, err)
			}
		case "filter":
			cleanupChain(logf, conn, table, "INPUT", chainNameInput)
			cleanupChain(logf, conn, table, "FORWARD", chainNameForward)
		case "nat":
			cleanupChain(logf, conn, table, "POSTROUTING", chainNamePostrouting)
		}
	}
}
// snatRule builds a rule for chain ch in table t that source-NATs packets
// whose destination address equals dst so that they appear to come from src.
// meta is stored as the rule's user data. The destination address is read
// from the network header at offset 16 (4 bytes) when dst is IPv4 and at
// offset 24 (16 bytes) otherwise.
func snatRule(t *nftables.Table, ch *nftables.Chain, src, dst netip.Addr, meta []byte) *nftables.Rule {
	// Start from the IPv6 values and override them for an IPv4 destination.
	var (
		daddrOffset uint32 = 24
		daddrLen    uint32 = 16
		fam         uint32 = unix.NFPROTO_IPV6
	)
	if dst.Is4() {
		daddrOffset, daddrLen, fam = 16, 4, unix.NFPROTO_IPV4
	}

	exprs := []expr.Any{
		// Load the destination address and require it to equal dst.
		&expr.Payload{
			DestRegister: 1,
			Base:         expr.PayloadBaseNetworkHeader,
			Offset:       daddrOffset,
			Len:          daddrLen,
		},
		&expr.Cmp{
			Op:       expr.CmpOpEq,
			Register: 1,
			Data:     dst.AsSlice(),
		},
		// Put src into register 1 and use it as the only NAT address.
		&expr.Immediate{
			Register: 1,
			Data:     src.AsSlice(),
		},
		&expr.NAT{
			Type:       expr.NATTypeSourceNAT,
			Family:     fam,
			RegAddrMin: 1,
			RegAddrMax: 1,
		},
	}
	return &nftables.Rule{
		Table:    t,
		Chain:    ch,
		Exprs:    exprs,
		UserData: meta,
	}
}