kernel: backport BPF-programmable network device support

Signed-off-by: Tianling Shen <cnsztl@immortalwrt.org>
(cherry picked from commit 41758671677bf8c85e5decf8fc1933c5f062769c)
This commit is contained in:
Tianling Shen 2024-11-04 16:16:06 +08:00
parent 9da8dfc1b9
commit 54a586a5b7
No known key found for this signature in database
GPG Key ID: 6850B6345C862176
12 changed files with 1775 additions and 4 deletions

View File

@ -563,6 +563,15 @@ config KERNEL_BPF_STREAM_PARSER
depends on KERNEL_CGROUP_BPF
default y if KERNEL_DEBUG_INFO_BTF
config KERNEL_NETKIT
bool "Compile the kernel with BPF-programmable network device support"
default y if KERNEL_DEBUG_INFO_BTF
help
The netkit device is a virtual networking device where BPF programs
can be attached to the device(s) transmission routine in order to
implement the driver's internal logic. The device can be configured
to operate in L3 or L2 mode.
config KERNEL_AIO
bool "Compile the kernel with asynchronous IO support"
default y if !SMALL_FLASH

View File

@ -0,0 +1,38 @@
From ea41b880cc85f0a992571f66e4554a69f7806246 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <razor@blackwall.org>
Date: Thu, 26 Oct 2023 12:41:05 +0300
Subject: [PATCH] netkit: Remove explicit active/peer ptr initialization
Remove the explicit NULLing of active/peer pointers and rely on the
implicit one done at net device allocation.
Suggested-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20231026094106.1505892-2-razor@blackwall.org
---
drivers/net/netkit.c | 4 ----
1 file changed, 4 deletions(-)
--- a/drivers/net/netkit.c
+++ b/drivers/net/netkit.c
@@ -371,8 +371,6 @@ static int netkit_new_link(struct net *s
nk->policy = default_peer;
nk->mode = mode;
bpf_mprog_bundle_init(&nk->bundle);
- RCU_INIT_POINTER(nk->active, NULL);
- RCU_INIT_POINTER(nk->peer, NULL);
err = register_netdevice(peer);
put_net(net);
@@ -398,8 +396,6 @@ static int netkit_new_link(struct net *s
nk->policy = default_prim;
nk->mode = mode;
bpf_mprog_bundle_init(&nk->bundle);
- RCU_INIT_POINTER(nk->active, NULL);
- RCU_INIT_POINTER(nk->peer, NULL);
err = register_netdevice(dev);
if (err < 0)

View File

@ -0,0 +1,90 @@
From ae1658272c6491a31ac968e39882fc569f312ac3 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 14 Nov 2023 01:42:15 +0100
Subject: [PATCH] netkit: Add tstats per-CPU traffic counters
Add dev->tstats traffic accounting to netkit. The latter contains per-CPU
RX and TX counters.
The dev's TX counters are bumped upon pass/unspec as well as redirect
verdicts, in other words, on everything except for drops.
The dev's RX counters are bumped upon successful __netif_rx(), as well
as from skb_do_redirect() (not part of this commit here).
Using dev->lstats with having just a single packets/bytes counter and
inferring one another's RX counters from the peer dev's lstats is not
possible given skb_do_redirect() can also bump the device's stats.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://lore.kernel.org/r/20231114004220.6495-4-daniel@iogearbox.net
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
drivers/net/netkit.c | 19 ++++++++++++++++++-
1 file changed, 18 insertions(+), 1 deletion(-)
--- a/drivers/net/netkit.c
+++ b/drivers/net/netkit.c
@@ -68,6 +68,7 @@ static netdev_tx_t netkit_xmit(struct sk
netdev_tx_t ret_dev = NET_XMIT_SUCCESS;
const struct bpf_mprog_entry *entry;
struct net_device *peer;
+ int len = skb->len;
rcu_read_lock();
peer = rcu_dereference(nk->peer);
@@ -85,15 +86,22 @@ static netdev_tx_t netkit_xmit(struct sk
case NETKIT_PASS:
skb->protocol = eth_type_trans(skb, skb->dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
- __netif_rx(skb);
+ if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) {
+ dev_sw_netstats_tx_add(dev, 1, len);
+ dev_sw_netstats_rx_add(peer, len);
+ } else {
+ goto drop_stats;
+ }
break;
case NETKIT_REDIRECT:
+ dev_sw_netstats_tx_add(dev, 1, len);
skb_do_redirect(skb);
break;
case NETKIT_DROP:
default:
drop:
kfree_skb(skb);
+drop_stats:
dev_core_stats_tx_dropped_inc(dev);
ret_dev = NET_XMIT_DROP;
break;
@@ -174,6 +182,13 @@ static struct net_device *netkit_peer_de
return rcu_dereference(netkit_priv(dev)->peer);
}
+static void netkit_get_stats(struct net_device *dev,
+ struct rtnl_link_stats64 *stats)
+{
+ dev_fetch_sw_netstats(stats, dev->tstats);
+ stats->tx_dropped = DEV_STATS_READ(dev, tx_dropped);
+}
+
static void netkit_uninit(struct net_device *dev);
static const struct net_device_ops netkit_netdev_ops = {
@@ -184,6 +199,7 @@ static const struct net_device_ops netki
.ndo_set_rx_headroom = netkit_set_headroom,
.ndo_get_iflink = netkit_get_iflink,
.ndo_get_peer_dev = netkit_peer_dev,
+ .ndo_get_stats64 = netkit_get_stats,
.ndo_uninit = netkit_uninit,
.ndo_features_check = passthru_features_check,
};
@@ -218,6 +234,7 @@ static void netkit_setup(struct net_devi
ether_setup(dev);
dev->max_mtu = ETH_MAX_MTU;
+ dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
dev->flags |= IFF_NOARP;
dev->priv_flags &= ~IFF_TX_SKB_SHARING;

View File

@ -0,0 +1,108 @@
From 2c225425704078282e152ba692649237f78b3d7a Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 14 Nov 2023 01:42:18 +0100
Subject: [PATCH] bpf, netkit: Add indirect call wrapper for fetching peer dev
ndo_get_peer_dev is used in tcx BPF fast path, therefore make use of
indirect call wrapper and therefore optimize the bpf_redirect_peer()
internal handling a bit. Add a small skb_get_peer_dev() wrapper which
utilizes the INDIRECT_CALL_1() macro instead of open coding.
Future work could potentially add a peer pointer directly into struct
net_device in future and convert veth and netkit over to use it so
that eventually ndo_get_peer_dev can be removed.
Co-developed-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20231114004220.6495-7-daniel@iogearbox.net
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
drivers/net/netkit.c | 3 ++-
include/net/netkit.h | 6 ++++++
net/core/filter.c | 18 +++++++++++++-----
3 files changed, 21 insertions(+), 6 deletions(-)
--- a/drivers/net/netkit.c
+++ b/drivers/net/netkit.c
@@ -7,6 +7,7 @@
#include <linux/filter.h>
#include <linux/netfilter_netdev.h>
#include <linux/bpf_mprog.h>
+#include <linux/indirect_call_wrapper.h>
#include <net/netkit.h>
#include <net/dst.h>
@@ -177,7 +178,7 @@ out:
rcu_read_unlock();
}
-static struct net_device *netkit_peer_dev(struct net_device *dev)
+INDIRECT_CALLABLE_SCOPE struct net_device *netkit_peer_dev(struct net_device *dev)
{
return rcu_dereference(netkit_priv(dev)->peer);
}
--- a/include/net/netkit.h
+++ b/include/net/netkit.h
@@ -10,6 +10,7 @@ int netkit_prog_attach(const union bpf_a
int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog);
int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr);
+INDIRECT_CALLABLE_DECLARE(struct net_device *netkit_peer_dev(struct net_device *dev));
#else
static inline int netkit_prog_attach(const union bpf_attr *attr,
struct bpf_prog *prog)
@@ -34,5 +35,10 @@ static inline int netkit_prog_query(cons
{
return -EINVAL;
}
+
+static inline struct net_device *netkit_peer_dev(struct net_device *dev)
+{
+ return NULL;
+}
#endif /* CONFIG_NETKIT */
#endif /* __NET_NETKIT_H */
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -81,6 +81,7 @@
#include <net/xdp.h>
#include <net/mptcp.h>
#include <net/netfilter/nf_conntrack_bpf.h>
+#include <net/netkit.h>
#include <linux/un.h>
#include <net/xdp_sock_drv.h>
@@ -2472,6 +2473,16 @@ static const struct bpf_func_proto bpf_c
DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
+static struct net_device *skb_get_peer_dev(struct net_device *dev)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (likely(ops->ndo_get_peer_dev))
+ return INDIRECT_CALL_1(ops->ndo_get_peer_dev,
+ netkit_peer_dev, dev);
+ return NULL;
+}
+
int skb_do_redirect(struct sk_buff *skb)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
@@ -2485,12 +2496,9 @@ int skb_do_redirect(struct sk_buff *skb)
if (unlikely(!dev))
goto out_drop;
if (flags & BPF_F_PEER) {
- const struct net_device_ops *ops = dev->netdev_ops;
-
- if (unlikely(!ops->ndo_get_peer_dev ||
- !skb_at_tc_ingress(skb)))
+ if (unlikely(!skb_at_tc_ingress(skb)))
goto out_drop;
- dev = ops->ndo_get_peer_dev(dev);
+ dev = skb_get_peer_dev(dev);
if (unlikely(!dev ||
!(dev->flags & IFF_UP) ||
net_eq(net, dev_net(dev))))

View File

@ -0,0 +1,34 @@
From 0bad281d0ecdf8391b0f42678b663336e7c3ceb0 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 27 Nov 2023 21:05:33 +0100
Subject: [PATCH] netkit: Reject IFLA_NETKIT_PEER_INFO in netkit_change_link
The IFLA_NETKIT_PEER_INFO attribute can only be used during device
creation, but not via changelink callback. Hence reject it there.
Fixes: 35dfaad7188c ("netkit, bpf: Add bpf programmable net device")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Cc: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/e86a277a1e8d3b19890312779e42f790b0605ea4.1701115314.git.daniel@iogearbox.net
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
drivers/net/netkit.c | 6 ++++++
1 file changed, 6 insertions(+)
--- a/drivers/net/netkit.c
+++ b/drivers/net/netkit.c
@@ -851,6 +851,12 @@ static int netkit_change_link(struct net
return -EACCES;
}
+ if (data[IFLA_NETKIT_PEER_INFO]) {
+ NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_PEER_INFO],
+ "netkit peer info cannot be changed after device creation");
+ return -EINVAL;
+ }
+
if (data[IFLA_NETKIT_POLICY]) {
attr = data[IFLA_NETKIT_POLICY];
policy = nla_get_u32(attr);

View File

@ -0,0 +1,91 @@
From d6fe532b7499e4575f9647879b7a34625817fe7f Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 24 May 2024 18:36:16 +0200
Subject: [PATCH] netkit: Fix setting mac address in l2 mode
When running Cilium connectivity test suite with netkit in L2 mode, we
found that it is expected to be able to specify a custom MAC address for
the devices, in particular, cilium-cni obtains the specified MAC address
by querying the endpoint and sets the MAC address of the interface inside
the Pod. Thus, fix the missing support in netkit for L2 mode.
Fixes: 35dfaad7188c ("netkit, bpf: Add bpf programmable net device")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20240524163619.26001-1-daniel@iogearbox.net
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
drivers/net/netkit.c | 26 +++++++++++++++++++++-----
1 file changed, 21 insertions(+), 5 deletions(-)
--- a/drivers/net/netkit.c
+++ b/drivers/net/netkit.c
@@ -155,6 +155,16 @@ static void netkit_set_multicast(struct
/* Nothing to do, we receive whatever gets pushed to us! */
}
+static int netkit_set_macaddr(struct net_device *dev, void *sa)
+{
+ struct netkit *nk = netkit_priv(dev);
+
+ if (nk->mode != NETKIT_L2)
+ return -EOPNOTSUPP;
+
+ return eth_mac_addr(dev, sa);
+}
+
static void netkit_set_headroom(struct net_device *dev, int headroom)
{
struct netkit *nk = netkit_priv(dev), *nk2;
@@ -198,6 +208,7 @@ static const struct net_device_ops netki
.ndo_start_xmit = netkit_xmit,
.ndo_set_rx_mode = netkit_set_multicast,
.ndo_set_rx_headroom = netkit_set_headroom,
+ .ndo_set_mac_address = netkit_set_macaddr,
.ndo_get_iflink = netkit_get_iflink,
.ndo_get_peer_dev = netkit_peer_dev,
.ndo_get_stats64 = netkit_get_stats,
@@ -300,9 +311,11 @@ static int netkit_validate(struct nlattr
if (!attr)
return 0;
- NL_SET_ERR_MSG_ATTR(extack, attr,
- "Setting Ethernet address is not supported");
- return -EOPNOTSUPP;
+ if (nla_len(attr) != ETH_ALEN)
+ return -EINVAL;
+ if (!is_valid_ether_addr(nla_data(attr)))
+ return -EADDRNOTAVAIL;
+ return 0;
}
static struct rtnl_link_ops netkit_link_ops;
@@ -365,6 +378,9 @@ static int netkit_new_link(struct net *s
strscpy(ifname, "nk%d", IFNAMSIZ);
ifname_assign_type = NET_NAME_ENUM;
}
+ if (mode != NETKIT_L2 &&
+ (tb[IFLA_ADDRESS] || tbp[IFLA_ADDRESS]))
+ return -EOPNOTSUPP;
net = rtnl_link_get_net(src_net, tbp);
if (IS_ERR(net))
@@ -379,7 +395,7 @@ static int netkit_new_link(struct net *s
netif_inherit_tso_max(peer, dev);
- if (mode == NETKIT_L2)
+ if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS]))
eth_hw_addr_random(peer);
if (ifmp && dev->ifindex)
peer->ifindex = ifmp->ifi_index;
@@ -402,7 +418,7 @@ static int netkit_new_link(struct net *s
if (err < 0)
goto err_configure_peer;
- if (mode == NETKIT_L2)
+ if (mode == NETKIT_L2 && !tb[IFLA_ADDRESS])
eth_hw_addr_random(dev);
if (tb[IFLA_IFNAME])
nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);

View File

@ -0,0 +1,90 @@
From 3998d184267dfcff858aaa84d3de17429253629d Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 24 May 2024 18:36:17 +0200
Subject: [PATCH] netkit: Fix pkt_type override upon netkit pass verdict
When running Cilium connectivity test suite with netkit in L2 mode, we
found that compared to tcx a few tests were failing which pushed traffic
into an L7 proxy sitting in host namespace. The problem in particular is
around the invocation of eth_type_trans() in netkit.
In case of tcx, this is run before the tcx ingress is triggered inside
host namespace and thus if the BPF program uses the bpf_skb_change_type()
helper the newly set type is retained. However, in case of netkit, the
late eth_type_trans() invocation overrides the earlier decision from the
BPF program which eventually leads to the test failure.
Instead of eth_type_trans(), split out the relevant parts, meaning, reset
of mac header and call to eth_skb_pkt_type() before the BPF program is run
in order to have the same behavior as with tcx, and refactor a small helper
called eth_skb_pull_mac() which is run in case it's passed up the stack
where the mac header must be pulled. With this all connectivity tests pass.
Fixes: 35dfaad7188c ("netkit, bpf: Add bpf programmable net device")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://lore.kernel.org/r/20240524163619.26001-2-daniel@iogearbox.net
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
drivers/net/netkit.c | 4 +++-
include/linux/etherdevice.h | 8 ++++++++
net/ethernet/eth.c | 4 +---
3 files changed, 12 insertions(+), 4 deletions(-)
--- a/drivers/net/netkit.c
+++ b/drivers/net/netkit.c
@@ -55,6 +55,7 @@ static void netkit_prep_forward(struct s
skb_scrub_packet(skb, xnet);
skb->priority = 0;
nf_skip_egress(skb, true);
+ skb_reset_mac_header(skb);
}
static struct netkit *netkit_priv(const struct net_device *dev)
@@ -78,6 +79,7 @@ static netdev_tx_t netkit_xmit(struct sk
skb_orphan_frags(skb, GFP_ATOMIC)))
goto drop;
netkit_prep_forward(skb, !net_eq(dev_net(dev), dev_net(peer)));
+ eth_skb_pkt_type(skb, peer);
skb->dev = peer;
entry = rcu_dereference(nk->active);
if (entry)
@@ -85,7 +87,7 @@ static netdev_tx_t netkit_xmit(struct sk
switch (ret) {
case NETKIT_NEXT:
case NETKIT_PASS:
- skb->protocol = eth_type_trans(skb, skb->dev);
+ eth_skb_pull_mac(skb);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) {
dev_sw_netstats_tx_add(dev, 1, len);
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -632,6 +632,14 @@ static inline void eth_skb_pkt_type(stru
}
}
+static inline struct ethhdr *eth_skb_pull_mac(struct sk_buff *skb)
+{
+ struct ethhdr *eth = (struct ethhdr *)skb->data;
+
+ skb_pull_inline(skb, ETH_HLEN);
+ return eth;
+}
+
/**
* eth_skb_pad - Pad buffer to mininum number of octets for Ethernet frame
* @skb: Buffer to pad
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -161,9 +161,7 @@ __be16 eth_type_trans(struct sk_buff *sk
skb->dev = dev;
skb_reset_mac_header(skb);
- eth = (struct ethhdr *)skb->data;
- skb_pull_inline(skb, ETH_HLEN);
-
+ eth = eth_skb_pull_mac(skb);
eth_skb_pkt_type(skb, dev);
/*

View File

@ -41,7 +41,7 @@ Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -21988,6 +21988,7 @@ U-BOOT ENVIRONMENT VARIABLES
@@ -21997,6 +21997,7 @@ U-BOOT ENVIRONMENT VARIABLES
M: Rafał Miłecki <rafal@milecki.pl>
S: Maintained
F: Documentation/devicetree/bindings/nvmem/u-boot,env.yaml

View File

@ -164,4 +164,4 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
+
skb_reset_mac_header(skb);
eth = (struct ethhdr *)skb->data;
eth = eth_skb_pull_mac(skb);

View File

@ -36,7 +36,7 @@ Signed-off-by: Daniel Golle <daniel@makrotopia.org>
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -22015,6 +22015,12 @@ F: Documentation/filesystems/ubifs-authe
@@ -22024,6 +22024,12 @@ F: Documentation/filesystems/ubifs-authe
F: Documentation/filesystems/ubifs.rst
F: fs/ubifs/

View File

@ -19,7 +19,7 @@ Signed-off-by: Daniel Golle <daniel@makrotopia.org>
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13356,7 +13356,9 @@ M: Daniel Golle <daniel@makrotopia.org>
@@ -13365,7 +13365,9 @@ M: Daniel Golle <daniel@makrotopia.org>
L: netdev@vger.kernel.org
S: Maintained
F: drivers/net/pcs/pcs-mtk-lynxi.c