1312 lines
34 KiB
Diff
1312 lines
34 KiB
Diff
From 35dfaad7188cdc043fde31709c796f5a692ba2bd Mon Sep 17 00:00:00 2001
|
|
From: Daniel Borkmann <daniel@iogearbox.net>
|
|
Date: Tue, 24 Oct 2023 23:48:58 +0200
|
|
Subject: [PATCH] netkit, bpf: Add bpf programmable net device
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
This work adds a new, minimal BPF-programmable device called "netkit"
|
|
(former PoC code-name "meta") we recently presented at LSF/MM/BPF. The
|
|
core idea is that BPF programs are executed within the drivers xmit routine
|
|
and therefore e.g. in case of containers/Pods moving BPF processing closer
|
|
to the source.
|
|
|
|
One of the goals was that in case of Pod egress traffic, this allows to
|
|
move BPF programs from hostns tcx ingress into the device itself, providing
|
|
earlier drop or forward mechanisms, for example, if the BPF program
|
|
determines that the skb must be sent out of the node, then a redirect to
|
|
the physical device can take place directly without going through per-CPU
|
|
backlog queue. This helps to shift processing for such traffic from softirq
|
|
to process context, leading to better scheduling decisions/performance (see
|
|
measurements in the slides).
|
|
|
|
In this initial version, the netkit device ships as a pair, but we plan to
|
|
extend this further so it can also operate in single device mode. The pair
|
|
comes with a primary and a peer device. Only the primary device, typically
|
|
residing in hostns, can manage BPF programs for itself and its peer. The
|
|
peer device is designated for containers/Pods and cannot attach/detach
|
|
BPF programs. Upon the device creation, the user can set the default policy
|
|
to 'pass' or 'drop' for the case when no BPF program is attached.
|
|
|
|
Additionally, the device can be operated in L3 (default) or L2 mode. The
|
|
management of BPF programs is done via bpf_mprog, so that multi-attach is
|
|
supported right from the beginning with similar API and dependency controls
|
|
as tcx. For details on the latter see commit 053c8e1f235d ("bpf: Add generic
|
|
attach/detach/query API for multi-progs"). tc BPF compatibility is provided,
|
|
so that existing programs can be easily migrated.
|
|
|
|
Going forward, we plan to use netkit devices in Cilium as the main device
|
|
type for connecting Pods. They will be operated in L3 mode in order to
|
|
simplify a Pod's neighbor management and the peer will operate in default
|
|
drop mode, so that no traffic is leaving between the time when a Pod is
|
|
brought up by the CNI plugin and programs attached by the agent.
|
|
Additionally, the programs we attach via tcx on the physical devices are
|
|
using bpf_redirect_peer() for inbound traffic into netkit device, hence the
|
|
latter is also supporting the ndo_get_peer_dev callback. Similarly, we use
|
|
bpf_redirect_neigh() for the way out, pushing from netkit peer to phys device
|
|
directly. Also, BIG TCP is supported on netkit device. For the follow-up
|
|
work in single device mode, we plan to convert Cilium's cilium_host/_net
|
|
devices into a single one.
|
|
|
|
An extensive test suite for checking device operations and the BPF program
|
|
and link management API comes as BPF selftests in this series.
|
|
|
|
Co-developed-by: Nikolay Aleksandrov <razor@blackwall.org>
|
|
Signed-off-by: Nikolay Aleksandrov <razor@blackwall.org>
|
|
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
|
|
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
|
|
Acked-by: Stanislav Fomichev <sdf@google.com>
|
|
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
|
|
Link: https://github.com/borkmann/iproute2/tree/pr/netkit
|
|
Link: http://vger.kernel.org/bpfconf2023_material/tcx_meta_netdev_borkmann.pdf (24ff.)
|
|
Link: https://lore.kernel.org/r/20231024214904.29825-2-daniel@iogearbox.net
|
|
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
|
|
---
|
|
MAINTAINERS | 9 +
|
|
drivers/net/Kconfig | 9 +
|
|
drivers/net/Makefile | 1 +
|
|
drivers/net/netkit.c | 940 +++++++++++++++++++++++++++++++++
|
|
include/net/netkit.h | 38 ++
|
|
include/uapi/linux/bpf.h | 14 +
|
|
include/uapi/linux/if_link.h | 24 +
|
|
kernel/bpf/syscall.c | 30 +-
|
|
tools/include/uapi/linux/bpf.h | 14 +
|
|
9 files changed, 1074 insertions(+), 5 deletions(-)
|
|
create mode 100644 drivers/net/netkit.c
|
|
create mode 100644 include/net/netkit.h
|
|
|
|
--- a/MAINTAINERS
|
|
+++ b/MAINTAINERS
|
|
@@ -3796,6 +3796,15 @@ L: bpf@vger.kernel.org
|
|
S: Odd Fixes
|
|
K: (?:\b|_)bpf(?:\b|_)
|
|
|
|
+BPF [NETKIT] (BPF-programmable network device)
|
|
+M: Daniel Borkmann <daniel@iogearbox.net>
|
|
+M: Nikolay Aleksandrov <razor@blackwall.org>
|
|
+L: bpf@vger.kernel.org
|
|
+L: netdev@vger.kernel.org
|
|
+S: Supported
|
|
+F: drivers/net/netkit.c
|
|
+F: include/net/netkit.h
|
|
+
|
|
BPF [NETWORKING] (struct_ops, reuseport)
|
|
M: Martin KaFai Lau <martin.lau@linux.dev>
|
|
L: bpf@vger.kernel.org
|
|
--- a/drivers/net/Kconfig
|
|
+++ b/drivers/net/Kconfig
|
|
@@ -448,6 +448,15 @@ config NLMON
|
|
diagnostics, etc. This is mostly intended for developers or support
|
|
to debug netlink issues. If unsure, say N.
|
|
|
|
+config NETKIT
|
|
+ bool "BPF-programmable network device"
|
|
+ depends on BPF_SYSCALL
|
|
+ help
|
|
+ The netkit device is a virtual networking device where BPF programs
|
|
+ can be attached to the device(s) transmission routine in order to
|
|
+ implement the driver's internal logic. The device can be configured
|
|
+ to operate in L3 or L2 mode. If unsure, say N.
|
|
+
|
|
config NET_VRF
|
|
tristate "Virtual Routing and Forwarding (Lite)"
|
|
depends on IP_MULTIPLE_TABLES
|
|
--- a/drivers/net/Makefile
|
|
+++ b/drivers/net/Makefile
|
|
@@ -22,6 +22,7 @@ obj-$(CONFIG_MDIO) += mdio.o
|
|
obj-$(CONFIG_NET) += loopback.o
|
|
obj-$(CONFIG_NETDEV_LEGACY_INIT) += Space.o
|
|
obj-$(CONFIG_NETCONSOLE) += netconsole.o
|
|
+obj-$(CONFIG_NETKIT) += netkit.o
|
|
obj-y += phy/
|
|
obj-y += pse-pd/
|
|
obj-y += mdio/
|
|
--- /dev/null
|
|
+++ b/drivers/net/netkit.c
|
|
@@ -0,0 +1,940 @@
|
|
+// SPDX-License-Identifier: GPL-2.0-only
|
|
+/* Copyright (c) 2023 Isovalent */
|
|
+
|
|
+#include <linux/netdevice.h>
|
|
+#include <linux/ethtool.h>
|
|
+#include <linux/etherdevice.h>
|
|
+#include <linux/filter.h>
|
|
+#include <linux/netfilter_netdev.h>
|
|
+#include <linux/bpf_mprog.h>
|
|
+
|
|
+#include <net/netkit.h>
|
|
+#include <net/dst.h>
|
|
+#include <net/tcx.h>
|
|
+
|
|
+#define DRV_NAME "netkit"
|
|
+
|
|
+struct netkit {
|
|
+ /* Needed in fast-path */
|
|
+ struct net_device __rcu *peer;
|
|
+ struct bpf_mprog_entry __rcu *active;
|
|
+ enum netkit_action policy;
|
|
+ struct bpf_mprog_bundle bundle;
|
|
+
|
|
+ /* Needed in slow-path */
|
|
+ enum netkit_mode mode;
|
|
+ bool primary;
|
|
+ u32 headroom;
|
|
+};
|
|
+
|
|
+struct netkit_link {
|
|
+ struct bpf_link link;
|
|
+ struct net_device *dev;
|
|
+ u32 location;
|
|
+};
|
|
+
|
|
+static __always_inline int
|
|
+netkit_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
|
|
+ enum netkit_action ret)
|
|
+{
|
|
+ const struct bpf_mprog_fp *fp;
|
|
+ const struct bpf_prog *prog;
|
|
+
|
|
+ bpf_mprog_foreach_prog(entry, fp, prog) {
|
|
+ bpf_compute_data_pointers(skb);
|
|
+ ret = bpf_prog_run(prog, skb);
|
|
+ if (ret != NETKIT_NEXT)
|
|
+ break;
|
|
+ }
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static void netkit_prep_forward(struct sk_buff *skb, bool xnet)
|
|
+{
|
|
+ skb_scrub_packet(skb, xnet);
|
|
+ skb->priority = 0;
|
|
+ nf_skip_egress(skb, true);
|
|
+}
|
|
+
|
|
+static struct netkit *netkit_priv(const struct net_device *dev)
|
|
+{
|
|
+ return netdev_priv(dev);
|
|
+}
|
|
+
|
|
+static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev)
|
|
+{
|
|
+ struct netkit *nk = netkit_priv(dev);
|
|
+ enum netkit_action ret = READ_ONCE(nk->policy);
|
|
+ netdev_tx_t ret_dev = NET_XMIT_SUCCESS;
|
|
+ const struct bpf_mprog_entry *entry;
|
|
+ struct net_device *peer;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ peer = rcu_dereference(nk->peer);
|
|
+ if (unlikely(!peer || !(peer->flags & IFF_UP) ||
|
|
+ !pskb_may_pull(skb, ETH_HLEN) ||
|
|
+ skb_orphan_frags(skb, GFP_ATOMIC)))
|
|
+ goto drop;
|
|
+ netkit_prep_forward(skb, !net_eq(dev_net(dev), dev_net(peer)));
|
|
+ skb->dev = peer;
|
|
+ entry = rcu_dereference(nk->active);
|
|
+ if (entry)
|
|
+ ret = netkit_run(entry, skb, ret);
|
|
+ switch (ret) {
|
|
+ case NETKIT_NEXT:
|
|
+ case NETKIT_PASS:
|
|
+ skb->protocol = eth_type_trans(skb, skb->dev);
|
|
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
|
|
+ __netif_rx(skb);
|
|
+ break;
|
|
+ case NETKIT_REDIRECT:
|
|
+ skb_do_redirect(skb);
|
|
+ break;
|
|
+ case NETKIT_DROP:
|
|
+ default:
|
|
+drop:
|
|
+ kfree_skb(skb);
|
|
+ dev_core_stats_tx_dropped_inc(dev);
|
|
+ ret_dev = NET_XMIT_DROP;
|
|
+ break;
|
|
+ }
|
|
+ rcu_read_unlock();
|
|
+ return ret_dev;
|
|
+}
|
|
+
|
|
+static int netkit_open(struct net_device *dev)
|
|
+{
|
|
+ struct netkit *nk = netkit_priv(dev);
|
|
+ struct net_device *peer = rtnl_dereference(nk->peer);
|
|
+
|
|
+ if (!peer)
|
|
+ return -ENOTCONN;
|
|
+ if (peer->flags & IFF_UP) {
|
|
+ netif_carrier_on(dev);
|
|
+ netif_carrier_on(peer);
|
|
+ }
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int netkit_close(struct net_device *dev)
|
|
+{
|
|
+ struct netkit *nk = netkit_priv(dev);
|
|
+ struct net_device *peer = rtnl_dereference(nk->peer);
|
|
+
|
|
+ netif_carrier_off(dev);
|
|
+ if (peer)
|
|
+ netif_carrier_off(peer);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int netkit_get_iflink(const struct net_device *dev)
|
|
+{
|
|
+ struct netkit *nk = netkit_priv(dev);
|
|
+ struct net_device *peer;
|
|
+ int iflink = 0;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ peer = rcu_dereference(nk->peer);
|
|
+ if (peer)
|
|
+ iflink = peer->ifindex;
|
|
+ rcu_read_unlock();
|
|
+ return iflink;
|
|
+}
|
|
+
|
|
+static void netkit_set_multicast(struct net_device *dev)
|
|
+{
|
|
+ /* Nothing to do, we receive whatever gets pushed to us! */
|
|
+}
|
|
+
|
|
+static void netkit_set_headroom(struct net_device *dev, int headroom)
|
|
+{
|
|
+ struct netkit *nk = netkit_priv(dev), *nk2;
|
|
+ struct net_device *peer;
|
|
+
|
|
+ if (headroom < 0)
|
|
+ headroom = NET_SKB_PAD;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ peer = rcu_dereference(nk->peer);
|
|
+ if (unlikely(!peer))
|
|
+ goto out;
|
|
+
|
|
+ nk2 = netkit_priv(peer);
|
|
+ nk->headroom = headroom;
|
|
+ headroom = max(nk->headroom, nk2->headroom);
|
|
+
|
|
+ peer->needed_headroom = headroom;
|
|
+ dev->needed_headroom = headroom;
|
|
+out:
|
|
+ rcu_read_unlock();
|
|
+}
|
|
+
|
|
+static struct net_device *netkit_peer_dev(struct net_device *dev)
|
|
+{
|
|
+ return rcu_dereference(netkit_priv(dev)->peer);
|
|
+}
|
|
+
|
|
+static void netkit_uninit(struct net_device *dev);
|
|
+
|
|
+static const struct net_device_ops netkit_netdev_ops = {
|
|
+ .ndo_open = netkit_open,
|
|
+ .ndo_stop = netkit_close,
|
|
+ .ndo_start_xmit = netkit_xmit,
|
|
+ .ndo_set_rx_mode = netkit_set_multicast,
|
|
+ .ndo_set_rx_headroom = netkit_set_headroom,
|
|
+ .ndo_get_iflink = netkit_get_iflink,
|
|
+ .ndo_get_peer_dev = netkit_peer_dev,
|
|
+ .ndo_uninit = netkit_uninit,
|
|
+ .ndo_features_check = passthru_features_check,
|
|
+};
|
|
+
|
|
+static void netkit_get_drvinfo(struct net_device *dev,
|
|
+ struct ethtool_drvinfo *info)
|
|
+{
|
|
+ strscpy(info->driver, DRV_NAME, sizeof(info->driver));
|
|
+}
|
|
+
|
|
+static const struct ethtool_ops netkit_ethtool_ops = {
|
|
+ .get_drvinfo = netkit_get_drvinfo,
|
|
+};
|
|
+
|
|
+static void netkit_setup(struct net_device *dev)
|
|
+{
|
|
+ static const netdev_features_t netkit_features_hw_vlan =
|
|
+ NETIF_F_HW_VLAN_CTAG_TX |
|
|
+ NETIF_F_HW_VLAN_CTAG_RX |
|
|
+ NETIF_F_HW_VLAN_STAG_TX |
|
|
+ NETIF_F_HW_VLAN_STAG_RX;
|
|
+ static const netdev_features_t netkit_features =
|
|
+ netkit_features_hw_vlan |
|
|
+ NETIF_F_SG |
|
|
+ NETIF_F_FRAGLIST |
|
|
+ NETIF_F_HW_CSUM |
|
|
+ NETIF_F_RXCSUM |
|
|
+ NETIF_F_SCTP_CRC |
|
|
+ NETIF_F_HIGHDMA |
|
|
+ NETIF_F_GSO_SOFTWARE |
|
|
+ NETIF_F_GSO_ENCAP_ALL;
|
|
+
|
|
+ ether_setup(dev);
|
|
+ dev->max_mtu = ETH_MAX_MTU;
|
|
+
|
|
+ dev->flags |= IFF_NOARP;
|
|
+ dev->priv_flags &= ~IFF_TX_SKB_SHARING;
|
|
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
|
|
+ dev->priv_flags |= IFF_PHONY_HEADROOM;
|
|
+ dev->priv_flags |= IFF_NO_QUEUE;
|
|
+
|
|
+ dev->ethtool_ops = &netkit_ethtool_ops;
|
|
+ dev->netdev_ops = &netkit_netdev_ops;
|
|
+
|
|
+ dev->features |= netkit_features | NETIF_F_LLTX;
|
|
+ dev->hw_features = netkit_features;
|
|
+ dev->hw_enc_features = netkit_features;
|
|
+ dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
|
|
+ dev->vlan_features = dev->features & ~netkit_features_hw_vlan;
|
|
+
|
|
+ dev->needs_free_netdev = true;
|
|
+
|
|
+ netif_set_tso_max_size(dev, GSO_MAX_SIZE);
|
|
+}
|
|
+
|
|
+static struct net *netkit_get_link_net(const struct net_device *dev)
|
|
+{
|
|
+ struct netkit *nk = netkit_priv(dev);
|
|
+ struct net_device *peer = rtnl_dereference(nk->peer);
|
|
+
|
|
+ return peer ? dev_net(peer) : dev_net(dev);
|
|
+}
|
|
+
|
|
+static int netkit_check_policy(int policy, struct nlattr *tb,
|
|
+ struct netlink_ext_ack *extack)
|
|
+{
|
|
+ switch (policy) {
|
|
+ case NETKIT_PASS:
|
|
+ case NETKIT_DROP:
|
|
+ return 0;
|
|
+ default:
|
|
+ NL_SET_ERR_MSG_ATTR(extack, tb,
|
|
+ "Provided default xmit policy not supported");
|
|
+ return -EINVAL;
|
|
+ }
|
|
+}
|
|
+
|
|
+static int netkit_check_mode(int mode, struct nlattr *tb,
|
|
+ struct netlink_ext_ack *extack)
|
|
+{
|
|
+ switch (mode) {
|
|
+ case NETKIT_L2:
|
|
+ case NETKIT_L3:
|
|
+ return 0;
|
|
+ default:
|
|
+ NL_SET_ERR_MSG_ATTR(extack, tb,
|
|
+ "Provided device mode can only be L2 or L3");
|
|
+ return -EINVAL;
|
|
+ }
|
|
+}
|
|
+
|
|
+static int netkit_validate(struct nlattr *tb[], struct nlattr *data[],
|
|
+ struct netlink_ext_ack *extack)
|
|
+{
|
|
+ struct nlattr *attr = tb[IFLA_ADDRESS];
|
|
+
|
|
+ if (!attr)
|
|
+ return 0;
|
|
+ NL_SET_ERR_MSG_ATTR(extack, attr,
|
|
+ "Setting Ethernet address is not supported");
|
|
+ return -EOPNOTSUPP;
|
|
+}
|
|
+
|
|
+static struct rtnl_link_ops netkit_link_ops;
|
|
+
|
|
+static int netkit_new_link(struct net *src_net, struct net_device *dev,
|
|
+ struct nlattr *tb[], struct nlattr *data[],
|
|
+ struct netlink_ext_ack *extack)
|
|
+{
|
|
+ struct nlattr *peer_tb[IFLA_MAX + 1], **tbp = tb, *attr;
|
|
+ enum netkit_action default_prim = NETKIT_PASS;
|
|
+ enum netkit_action default_peer = NETKIT_PASS;
|
|
+ enum netkit_mode mode = NETKIT_L3;
|
|
+ unsigned char ifname_assign_type;
|
|
+ struct ifinfomsg *ifmp = NULL;
|
|
+ struct net_device *peer;
|
|
+ char ifname[IFNAMSIZ];
|
|
+ struct netkit *nk;
|
|
+ struct net *net;
|
|
+ int err;
|
|
+
|
|
+ if (data) {
|
|
+ if (data[IFLA_NETKIT_MODE]) {
|
|
+ attr = data[IFLA_NETKIT_MODE];
|
|
+ mode = nla_get_u32(attr);
|
|
+ err = netkit_check_mode(mode, attr, extack);
|
|
+ if (err < 0)
|
|
+ return err;
|
|
+ }
|
|
+ if (data[IFLA_NETKIT_PEER_INFO]) {
|
|
+ attr = data[IFLA_NETKIT_PEER_INFO];
|
|
+ ifmp = nla_data(attr);
|
|
+ err = rtnl_nla_parse_ifinfomsg(peer_tb, attr, extack);
|
|
+ if (err < 0)
|
|
+ return err;
|
|
+ err = netkit_validate(peer_tb, NULL, extack);
|
|
+ if (err < 0)
|
|
+ return err;
|
|
+ tbp = peer_tb;
|
|
+ }
|
|
+ if (data[IFLA_NETKIT_POLICY]) {
|
|
+ attr = data[IFLA_NETKIT_POLICY];
|
|
+ default_prim = nla_get_u32(attr);
|
|
+ err = netkit_check_policy(default_prim, attr, extack);
|
|
+ if (err < 0)
|
|
+ return err;
|
|
+ }
|
|
+ if (data[IFLA_NETKIT_PEER_POLICY]) {
|
|
+ attr = data[IFLA_NETKIT_PEER_POLICY];
|
|
+ default_peer = nla_get_u32(attr);
|
|
+ err = netkit_check_policy(default_peer, attr, extack);
|
|
+ if (err < 0)
|
|
+ return err;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (ifmp && tbp[IFLA_IFNAME]) {
|
|
+ nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
|
|
+ ifname_assign_type = NET_NAME_USER;
|
|
+ } else {
|
|
+ strscpy(ifname, "nk%d", IFNAMSIZ);
|
|
+ ifname_assign_type = NET_NAME_ENUM;
|
|
+ }
|
|
+
|
|
+ net = rtnl_link_get_net(src_net, tbp);
|
|
+ if (IS_ERR(net))
|
|
+ return PTR_ERR(net);
|
|
+
|
|
+ peer = rtnl_create_link(net, ifname, ifname_assign_type,
|
|
+ &netkit_link_ops, tbp, extack);
|
|
+ if (IS_ERR(peer)) {
|
|
+ put_net(net);
|
|
+ return PTR_ERR(peer);
|
|
+ }
|
|
+
|
|
+ netif_inherit_tso_max(peer, dev);
|
|
+
|
|
+ if (mode == NETKIT_L2)
|
|
+ eth_hw_addr_random(peer);
|
|
+ if (ifmp && dev->ifindex)
|
|
+ peer->ifindex = ifmp->ifi_index;
|
|
+
|
|
+ nk = netkit_priv(peer);
|
|
+ nk->primary = false;
|
|
+ nk->policy = default_peer;
|
|
+ nk->mode = mode;
|
|
+ bpf_mprog_bundle_init(&nk->bundle);
|
|
+ RCU_INIT_POINTER(nk->active, NULL);
|
|
+ RCU_INIT_POINTER(nk->peer, NULL);
|
|
+
|
|
+ err = register_netdevice(peer);
|
|
+ put_net(net);
|
|
+ if (err < 0)
|
|
+ goto err_register_peer;
|
|
+ netif_carrier_off(peer);
|
|
+ if (mode == NETKIT_L2)
|
|
+ dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL);
|
|
+
|
|
+ err = rtnl_configure_link(peer, NULL, 0, NULL);
|
|
+ if (err < 0)
|
|
+ goto err_configure_peer;
|
|
+
|
|
+ if (mode == NETKIT_L2)
|
|
+ eth_hw_addr_random(dev);
|
|
+ if (tb[IFLA_IFNAME])
|
|
+ nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
|
|
+ else
|
|
+ strscpy(dev->name, "nk%d", IFNAMSIZ);
|
|
+
|
|
+ nk = netkit_priv(dev);
|
|
+ nk->primary = true;
|
|
+ nk->policy = default_prim;
|
|
+ nk->mode = mode;
|
|
+ bpf_mprog_bundle_init(&nk->bundle);
|
|
+ RCU_INIT_POINTER(nk->active, NULL);
|
|
+ RCU_INIT_POINTER(nk->peer, NULL);
|
|
+
|
|
+ err = register_netdevice(dev);
|
|
+ if (err < 0)
|
|
+ goto err_configure_peer;
|
|
+ netif_carrier_off(dev);
|
|
+ if (mode == NETKIT_L2)
|
|
+ dev_change_flags(dev, dev->flags & ~IFF_NOARP, NULL);
|
|
+
|
|
+ rcu_assign_pointer(netkit_priv(dev)->peer, peer);
|
|
+ rcu_assign_pointer(netkit_priv(peer)->peer, dev);
|
|
+ return 0;
|
|
+err_configure_peer:
|
|
+ unregister_netdevice(peer);
|
|
+ return err;
|
|
+err_register_peer:
|
|
+ free_netdev(peer);
|
|
+ return err;
|
|
+}
|
|
+
|
|
+static struct bpf_mprog_entry *netkit_entry_fetch(struct net_device *dev,
|
|
+ bool bundle_fallback)
|
|
+{
|
|
+ struct netkit *nk = netkit_priv(dev);
|
|
+ struct bpf_mprog_entry *entry;
|
|
+
|
|
+ ASSERT_RTNL();
|
|
+ entry = rcu_dereference_rtnl(nk->active);
|
|
+ if (entry)
|
|
+ return entry;
|
|
+ if (bundle_fallback)
|
|
+ return &nk->bundle.a;
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+static void netkit_entry_update(struct net_device *dev,
|
|
+ struct bpf_mprog_entry *entry)
|
|
+{
|
|
+ struct netkit *nk = netkit_priv(dev);
|
|
+
|
|
+ ASSERT_RTNL();
|
|
+ rcu_assign_pointer(nk->active, entry);
|
|
+}
|
|
+
|
|
+static void netkit_entry_sync(void)
|
|
+{
|
|
+ synchronize_rcu();
|
|
+}
|
|
+
|
|
+static struct net_device *netkit_dev_fetch(struct net *net, u32 ifindex, u32 which)
|
|
+{
|
|
+ struct net_device *dev;
|
|
+ struct netkit *nk;
|
|
+
|
|
+ ASSERT_RTNL();
|
|
+
|
|
+ switch (which) {
|
|
+ case BPF_NETKIT_PRIMARY:
|
|
+ case BPF_NETKIT_PEER:
|
|
+ break;
|
|
+ default:
|
|
+ return ERR_PTR(-EINVAL);
|
|
+ }
|
|
+
|
|
+ dev = __dev_get_by_index(net, ifindex);
|
|
+ if (!dev)
|
|
+ return ERR_PTR(-ENODEV);
|
|
+ if (dev->netdev_ops != &netkit_netdev_ops)
|
|
+ return ERR_PTR(-ENXIO);
|
|
+
|
|
+ nk = netkit_priv(dev);
|
|
+ if (!nk->primary)
|
|
+ return ERR_PTR(-EACCES);
|
|
+ if (which == BPF_NETKIT_PEER) {
|
|
+ dev = rcu_dereference_rtnl(nk->peer);
|
|
+ if (!dev)
|
|
+ return ERR_PTR(-ENODEV);
|
|
+ }
|
|
+ return dev;
|
|
+}
|
|
+
|
|
+int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
|
|
+{
|
|
+ struct bpf_mprog_entry *entry, *entry_new;
|
|
+ struct bpf_prog *replace_prog = NULL;
|
|
+ struct net_device *dev;
|
|
+ int ret;
|
|
+
|
|
+ rtnl_lock();
|
|
+ dev = netkit_dev_fetch(current->nsproxy->net_ns, attr->target_ifindex,
|
|
+ attr->attach_type);
|
|
+ if (IS_ERR(dev)) {
|
|
+ ret = PTR_ERR(dev);
|
|
+ goto out;
|
|
+ }
|
|
+ entry = netkit_entry_fetch(dev, true);
|
|
+ if (attr->attach_flags & BPF_F_REPLACE) {
|
|
+ replace_prog = bpf_prog_get_type(attr->replace_bpf_fd,
|
|
+ prog->type);
|
|
+ if (IS_ERR(replace_prog)) {
|
|
+ ret = PTR_ERR(replace_prog);
|
|
+ replace_prog = NULL;
|
|
+ goto out;
|
|
+ }
|
|
+ }
|
|
+ ret = bpf_mprog_attach(entry, &entry_new, prog, NULL, replace_prog,
|
|
+ attr->attach_flags, attr->relative_fd,
|
|
+ attr->expected_revision);
|
|
+ if (!ret) {
|
|
+ if (entry != entry_new) {
|
|
+ netkit_entry_update(dev, entry_new);
|
|
+ netkit_entry_sync();
|
|
+ }
|
|
+ bpf_mprog_commit(entry);
|
|
+ }
|
|
+out:
|
|
+ if (replace_prog)
|
|
+ bpf_prog_put(replace_prog);
|
|
+ rtnl_unlock();
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog)
|
|
+{
|
|
+ struct bpf_mprog_entry *entry, *entry_new;
|
|
+ struct net_device *dev;
|
|
+ int ret;
|
|
+
|
|
+ rtnl_lock();
|
|
+ dev = netkit_dev_fetch(current->nsproxy->net_ns, attr->target_ifindex,
|
|
+ attr->attach_type);
|
|
+ if (IS_ERR(dev)) {
|
|
+ ret = PTR_ERR(dev);
|
|
+ goto out;
|
|
+ }
|
|
+ entry = netkit_entry_fetch(dev, false);
|
|
+ if (!entry) {
|
|
+ ret = -ENOENT;
|
|
+ goto out;
|
|
+ }
|
|
+ ret = bpf_mprog_detach(entry, &entry_new, prog, NULL, attr->attach_flags,
|
|
+ attr->relative_fd, attr->expected_revision);
|
|
+ if (!ret) {
|
|
+ if (!bpf_mprog_total(entry_new))
|
|
+ entry_new = NULL;
|
|
+ netkit_entry_update(dev, entry_new);
|
|
+ netkit_entry_sync();
|
|
+ bpf_mprog_commit(entry);
|
|
+ }
|
|
+out:
|
|
+ rtnl_unlock();
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
|
|
+{
|
|
+ struct net_device *dev;
|
|
+ int ret;
|
|
+
|
|
+ rtnl_lock();
|
|
+ dev = netkit_dev_fetch(current->nsproxy->net_ns,
|
|
+ attr->query.target_ifindex,
|
|
+ attr->query.attach_type);
|
|
+ if (IS_ERR(dev)) {
|
|
+ ret = PTR_ERR(dev);
|
|
+ goto out;
|
|
+ }
|
|
+ ret = bpf_mprog_query(attr, uattr, netkit_entry_fetch(dev, false));
|
|
+out:
|
|
+ rtnl_unlock();
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static struct netkit_link *netkit_link(const struct bpf_link *link)
|
|
+{
|
|
+ return container_of(link, struct netkit_link, link);
|
|
+}
|
|
+
|
|
+static int netkit_link_prog_attach(struct bpf_link *link, u32 flags,
|
|
+ u32 id_or_fd, u64 revision)
|
|
+{
|
|
+ struct netkit_link *nkl = netkit_link(link);
|
|
+ struct bpf_mprog_entry *entry, *entry_new;
|
|
+ struct net_device *dev = nkl->dev;
|
|
+ int ret;
|
|
+
|
|
+ ASSERT_RTNL();
|
|
+ entry = netkit_entry_fetch(dev, true);
|
|
+ ret = bpf_mprog_attach(entry, &entry_new, link->prog, link, NULL, flags,
|
|
+ id_or_fd, revision);
|
|
+ if (!ret) {
|
|
+ if (entry != entry_new) {
|
|
+ netkit_entry_update(dev, entry_new);
|
|
+ netkit_entry_sync();
|
|
+ }
|
|
+ bpf_mprog_commit(entry);
|
|
+ }
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static void netkit_link_release(struct bpf_link *link)
|
|
+{
|
|
+ struct netkit_link *nkl = netkit_link(link);
|
|
+ struct bpf_mprog_entry *entry, *entry_new;
|
|
+ struct net_device *dev;
|
|
+ int ret = 0;
|
|
+
|
|
+ rtnl_lock();
|
|
+ dev = nkl->dev;
|
|
+ if (!dev)
|
|
+ goto out;
|
|
+ entry = netkit_entry_fetch(dev, false);
|
|
+ if (!entry) {
|
|
+ ret = -ENOENT;
|
|
+ goto out;
|
|
+ }
|
|
+ ret = bpf_mprog_detach(entry, &entry_new, link->prog, link, 0, 0, 0);
|
|
+ if (!ret) {
|
|
+ if (!bpf_mprog_total(entry_new))
|
|
+ entry_new = NULL;
|
|
+ netkit_entry_update(dev, entry_new);
|
|
+ netkit_entry_sync();
|
|
+ bpf_mprog_commit(entry);
|
|
+ nkl->dev = NULL;
|
|
+ }
|
|
+out:
|
|
+ WARN_ON_ONCE(ret);
|
|
+ rtnl_unlock();
|
|
+}
|
|
+
|
|
+static int netkit_link_update(struct bpf_link *link, struct bpf_prog *nprog,
|
|
+ struct bpf_prog *oprog)
|
|
+{
|
|
+ struct netkit_link *nkl = netkit_link(link);
|
|
+ struct bpf_mprog_entry *entry, *entry_new;
|
|
+ struct net_device *dev;
|
|
+ int ret = 0;
|
|
+
|
|
+ rtnl_lock();
|
|
+ dev = nkl->dev;
|
|
+ if (!dev) {
|
|
+ ret = -ENOLINK;
|
|
+ goto out;
|
|
+ }
|
|
+ if (oprog && link->prog != oprog) {
|
|
+ ret = -EPERM;
|
|
+ goto out;
|
|
+ }
|
|
+ oprog = link->prog;
|
|
+ if (oprog == nprog) {
|
|
+ bpf_prog_put(nprog);
|
|
+ goto out;
|
|
+ }
|
|
+ entry = netkit_entry_fetch(dev, false);
|
|
+ if (!entry) {
|
|
+ ret = -ENOENT;
|
|
+ goto out;
|
|
+ }
|
|
+ ret = bpf_mprog_attach(entry, &entry_new, nprog, link, oprog,
|
|
+ BPF_F_REPLACE | BPF_F_ID,
|
|
+ link->prog->aux->id, 0);
|
|
+ if (!ret) {
|
|
+ WARN_ON_ONCE(entry != entry_new);
|
|
+ oprog = xchg(&link->prog, nprog);
|
|
+ bpf_prog_put(oprog);
|
|
+ bpf_mprog_commit(entry);
|
|
+ }
|
|
+out:
|
|
+ rtnl_unlock();
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static void netkit_link_dealloc(struct bpf_link *link)
|
|
+{
|
|
+ kfree(netkit_link(link));
|
|
+}
|
|
+
|
|
+static void netkit_link_fdinfo(const struct bpf_link *link, struct seq_file *seq)
|
|
+{
|
|
+ const struct netkit_link *nkl = netkit_link(link);
|
|
+ u32 ifindex = 0;
|
|
+
|
|
+ rtnl_lock();
|
|
+ if (nkl->dev)
|
|
+ ifindex = nkl->dev->ifindex;
|
|
+ rtnl_unlock();
|
|
+
|
|
+ seq_printf(seq, "ifindex:\t%u\n", ifindex);
|
|
+ seq_printf(seq, "attach_type:\t%u (%s)\n",
|
|
+ nkl->location,
|
|
+ nkl->location == BPF_NETKIT_PRIMARY ? "primary" : "peer");
|
|
+}
|
|
+
|
|
+static int netkit_link_fill_info(const struct bpf_link *link,
|
|
+ struct bpf_link_info *info)
|
|
+{
|
|
+ const struct netkit_link *nkl = netkit_link(link);
|
|
+ u32 ifindex = 0;
|
|
+
|
|
+ rtnl_lock();
|
|
+ if (nkl->dev)
|
|
+ ifindex = nkl->dev->ifindex;
|
|
+ rtnl_unlock();
|
|
+
|
|
+ info->netkit.ifindex = ifindex;
|
|
+ info->netkit.attach_type = nkl->location;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int netkit_link_detach(struct bpf_link *link)
|
|
+{
|
|
+ netkit_link_release(link);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static const struct bpf_link_ops netkit_link_lops = {
|
|
+ .release = netkit_link_release,
|
|
+ .detach = netkit_link_detach,
|
|
+ .dealloc = netkit_link_dealloc,
|
|
+ .update_prog = netkit_link_update,
|
|
+ .show_fdinfo = netkit_link_fdinfo,
|
|
+ .fill_link_info = netkit_link_fill_info,
|
|
+};
|
|
+
|
|
+static int netkit_link_init(struct netkit_link *nkl,
|
|
+ struct bpf_link_primer *link_primer,
|
|
+ const union bpf_attr *attr,
|
|
+ struct net_device *dev,
|
|
+ struct bpf_prog *prog)
|
|
+{
|
|
+ bpf_link_init(&nkl->link, BPF_LINK_TYPE_NETKIT,
|
|
+ &netkit_link_lops, prog);
|
|
+ nkl->location = attr->link_create.attach_type;
|
|
+ nkl->dev = dev;
|
|
+ return bpf_link_prime(&nkl->link, link_primer);
|
|
+}
|
|
+
|
|
+int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
|
|
+{
|
|
+ struct bpf_link_primer link_primer;
|
|
+ struct netkit_link *nkl;
|
|
+ struct net_device *dev;
|
|
+ int ret;
|
|
+
|
|
+ rtnl_lock();
|
|
+ dev = netkit_dev_fetch(current->nsproxy->net_ns,
|
|
+ attr->link_create.target_ifindex,
|
|
+ attr->link_create.attach_type);
|
|
+ if (IS_ERR(dev)) {
|
|
+ ret = PTR_ERR(dev);
|
|
+ goto out;
|
|
+ }
|
|
+ nkl = kzalloc(sizeof(*nkl), GFP_KERNEL_ACCOUNT);
|
|
+ if (!nkl) {
|
|
+ ret = -ENOMEM;
|
|
+ goto out;
|
|
+ }
|
|
+ ret = netkit_link_init(nkl, &link_primer, attr, dev, prog);
|
|
+ if (ret) {
|
|
+ kfree(nkl);
|
|
+ goto out;
|
|
+ }
|
|
+ ret = netkit_link_prog_attach(&nkl->link,
|
|
+ attr->link_create.flags,
|
|
+ attr->link_create.netkit.relative_fd,
|
|
+ attr->link_create.netkit.expected_revision);
|
|
+ if (ret) {
|
|
+ nkl->dev = NULL;
|
|
+ bpf_link_cleanup(&link_primer);
|
|
+ goto out;
|
|
+ }
|
|
+ ret = bpf_link_settle(&link_primer);
|
|
+out:
|
|
+ rtnl_unlock();
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static void netkit_release_all(struct net_device *dev)
|
|
+{
|
|
+ struct bpf_mprog_entry *entry;
|
|
+ struct bpf_tuple tuple = {};
|
|
+ struct bpf_mprog_fp *fp;
|
|
+ struct bpf_mprog_cp *cp;
|
|
+
|
|
+ entry = netkit_entry_fetch(dev, false);
|
|
+ if (!entry)
|
|
+ return;
|
|
+ netkit_entry_update(dev, NULL);
|
|
+ netkit_entry_sync();
|
|
+ bpf_mprog_foreach_tuple(entry, fp, cp, tuple) {
|
|
+ if (tuple.link)
|
|
+ netkit_link(tuple.link)->dev = NULL;
|
|
+ else
|
|
+ bpf_prog_put(tuple.prog);
|
|
+ }
|
|
+}
|
|
+
|
|
+static void netkit_uninit(struct net_device *dev)
|
|
+{
|
|
+ netkit_release_all(dev);
|
|
+}
|
|
+
|
|
+static void netkit_del_link(struct net_device *dev, struct list_head *head)
|
|
+{
|
|
+ struct netkit *nk = netkit_priv(dev);
|
|
+ struct net_device *peer = rtnl_dereference(nk->peer);
|
|
+
|
|
+ RCU_INIT_POINTER(nk->peer, NULL);
|
|
+ unregister_netdevice_queue(dev, head);
|
|
+ if (peer) {
|
|
+ nk = netkit_priv(peer);
|
|
+ RCU_INIT_POINTER(nk->peer, NULL);
|
|
+ unregister_netdevice_queue(peer, head);
|
|
+ }
|
|
+}
|
|
+
|
|
+static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
|
|
+ struct nlattr *data[],
|
|
+ struct netlink_ext_ack *extack)
|
|
+{
|
|
+ struct netkit *nk = netkit_priv(dev);
|
|
+ struct net_device *peer = rtnl_dereference(nk->peer);
|
|
+ enum netkit_action policy;
|
|
+ struct nlattr *attr;
|
|
+ int err;
|
|
+
|
|
+ if (!nk->primary) {
|
|
+ NL_SET_ERR_MSG(extack,
|
|
+ "netkit link settings can be changed only through the primary device");
|
|
+ return -EACCES;
|
|
+ }
|
|
+
|
|
+ if (data[IFLA_NETKIT_MODE]) {
|
|
+ NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_MODE],
|
|
+ "netkit link operating mode cannot be changed after device creation");
|
|
+ return -EACCES;
|
|
+ }
|
|
+
|
|
+ if (data[IFLA_NETKIT_POLICY]) {
|
|
+ attr = data[IFLA_NETKIT_POLICY];
|
|
+ policy = nla_get_u32(attr);
|
|
+ err = netkit_check_policy(policy, attr, extack);
|
|
+ if (err)
|
|
+ return err;
|
|
+ WRITE_ONCE(nk->policy, policy);
|
|
+ }
|
|
+
|
|
+ if (data[IFLA_NETKIT_PEER_POLICY]) {
|
|
+ err = -EOPNOTSUPP;
|
|
+ attr = data[IFLA_NETKIT_PEER_POLICY];
|
|
+ policy = nla_get_u32(attr);
|
|
+ if (peer)
|
|
+ err = netkit_check_policy(policy, attr, extack);
|
|
+ if (err)
|
|
+ return err;
|
|
+ nk = netkit_priv(peer);
|
|
+ WRITE_ONCE(nk->policy, policy);
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static size_t netkit_get_size(const struct net_device *dev)
|
|
+{
|
|
+ return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */
|
|
+ nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PEER_POLICY */
|
|
+ nla_total_size(sizeof(u8)) + /* IFLA_NETKIT_PRIMARY */
|
|
+ nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_MODE */
|
|
+ 0;
|
|
+}
|
|
+
|
|
+static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev)
|
|
+{
|
|
+ struct netkit *nk = netkit_priv(dev);
|
|
+ struct net_device *peer = rtnl_dereference(nk->peer);
|
|
+
|
|
+ if (nla_put_u8(skb, IFLA_NETKIT_PRIMARY, nk->primary))
|
|
+ return -EMSGSIZE;
|
|
+ if (nla_put_u32(skb, IFLA_NETKIT_POLICY, nk->policy))
|
|
+ return -EMSGSIZE;
|
|
+ if (nla_put_u32(skb, IFLA_NETKIT_MODE, nk->mode))
|
|
+ return -EMSGSIZE;
|
|
+
|
|
+ if (peer) {
|
|
+ nk = netkit_priv(peer);
|
|
+ if (nla_put_u32(skb, IFLA_NETKIT_PEER_POLICY, nk->policy))
|
|
+ return -EMSGSIZE;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = {
|
|
+ [IFLA_NETKIT_PEER_INFO] = { .len = sizeof(struct ifinfomsg) },
|
|
+ [IFLA_NETKIT_POLICY] = { .type = NLA_U32 },
|
|
+ [IFLA_NETKIT_MODE] = { .type = NLA_U32 },
|
|
+ [IFLA_NETKIT_PEER_POLICY] = { .type = NLA_U32 },
|
|
+ [IFLA_NETKIT_PRIMARY] = { .type = NLA_REJECT,
|
|
+ .reject_message = "Primary attribute is read-only" },
|
|
+};
|
|
+
|
|
+static struct rtnl_link_ops netkit_link_ops = {
|
|
+ .kind = DRV_NAME,
|
|
+ .priv_size = sizeof(struct netkit),
|
|
+ .setup = netkit_setup,
|
|
+ .newlink = netkit_new_link,
|
|
+ .dellink = netkit_del_link,
|
|
+ .changelink = netkit_change_link,
|
|
+ .get_link_net = netkit_get_link_net,
|
|
+ .get_size = netkit_get_size,
|
|
+ .fill_info = netkit_fill_info,
|
|
+ .policy = netkit_policy,
|
|
+ .validate = netkit_validate,
|
|
+ .maxtype = IFLA_NETKIT_MAX,
|
|
+};
|
|
+
|
|
+static __init int netkit_init(void)
|
|
+{
|
|
+ BUILD_BUG_ON((int)NETKIT_NEXT != (int)TCX_NEXT ||
|
|
+ (int)NETKIT_PASS != (int)TCX_PASS ||
|
|
+ (int)NETKIT_DROP != (int)TCX_DROP ||
|
|
+ (int)NETKIT_REDIRECT != (int)TCX_REDIRECT);
|
|
+
|
|
+ return rtnl_link_register(&netkit_link_ops);
|
|
+}
|
|
+
|
|
+static __exit void netkit_exit(void)
|
|
+{
|
|
+ rtnl_link_unregister(&netkit_link_ops);
|
|
+}
|
|
+
|
|
+module_init(netkit_init);
|
|
+module_exit(netkit_exit);
|
|
+
|
|
+MODULE_DESCRIPTION("BPF-programmable network device");
|
|
+MODULE_AUTHOR("Daniel Borkmann <daniel@iogearbox.net>");
|
|
+MODULE_AUTHOR("Nikolay Aleksandrov <razor@blackwall.org>");
|
|
+MODULE_LICENSE("GPL");
|
|
+MODULE_ALIAS_RTNL_LINK(DRV_NAME);
|
|
--- /dev/null
|
|
+++ b/include/net/netkit.h
|
|
@@ -0,0 +1,38 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+/* Copyright (c) 2023 Isovalent */
|
|
+#ifndef __NET_NETKIT_H
|
|
+#define __NET_NETKIT_H
|
|
+
|
|
+#include <linux/bpf.h>
|
|
+
|
|
+#ifdef CONFIG_NETKIT
|
|
+int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog);
|
|
+int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
|
|
+int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog);
|
|
+int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr);
|
|
+#else
|
|
+static inline int netkit_prog_attach(const union bpf_attr *attr,
|
|
+ struct bpf_prog *prog)
|
|
+{
|
|
+ return -EINVAL;
|
|
+}
|
|
+
|
|
+static inline int netkit_link_attach(const union bpf_attr *attr,
|
|
+ struct bpf_prog *prog)
|
|
+{
|
|
+ return -EINVAL;
|
|
+}
|
|
+
|
|
+static inline int netkit_prog_detach(const union bpf_attr *attr,
|
|
+ struct bpf_prog *prog)
|
|
+{
|
|
+ return -EINVAL;
|
|
+}
|
|
+
|
|
+static inline int netkit_prog_query(const union bpf_attr *attr,
|
|
+ union bpf_attr __user *uattr)
|
|
+{
|
|
+ return -EINVAL;
|
|
+}
|
|
+#endif /* CONFIG_NETKIT */
|
|
+#endif /* __NET_NETKIT_H */
|
|
--- a/include/uapi/linux/bpf.h
|
|
+++ b/include/uapi/linux/bpf.h
|
|
@@ -1057,6 +1057,8 @@ enum bpf_attach_type {
|
|
BPF_TCX_INGRESS,
|
|
BPF_TCX_EGRESS,
|
|
BPF_TRACE_UPROBE_MULTI,
|
|
+ BPF_NETKIT_PRIMARY,
|
|
+ BPF_NETKIT_PEER,
|
|
__MAX_BPF_ATTACH_TYPE
|
|
};
|
|
|
|
@@ -1076,6 +1078,7 @@ enum bpf_link_type {
|
|
BPF_LINK_TYPE_NETFILTER = 10,
|
|
BPF_LINK_TYPE_TCX = 11,
|
|
BPF_LINK_TYPE_UPROBE_MULTI = 12,
|
|
+ BPF_LINK_TYPE_NETKIT = 13,
|
|
MAX_BPF_LINK_TYPE,
|
|
};
|
|
|
|
@@ -1661,6 +1664,13 @@ union bpf_attr {
|
|
__u32 flags;
|
|
__u32 pid;
|
|
} uprobe_multi;
|
|
+ struct {
|
|
+ union {
|
|
+ __u32 relative_fd;
|
|
+ __u32 relative_id;
|
|
+ };
|
|
+ __u64 expected_revision;
|
|
+ } netkit;
|
|
};
|
|
} link_create;
|
|
|
|
@@ -6584,6 +6594,10 @@ struct bpf_link_info {
|
|
__u32 ifindex;
|
|
__u32 attach_type;
|
|
} tcx;
|
|
+ struct {
|
|
+ __u32 ifindex;
|
|
+ __u32 attach_type;
|
|
+ } netkit;
|
|
};
|
|
} __attribute__((aligned(8)));
|
|
|
|
--- a/include/uapi/linux/if_link.h
|
|
+++ b/include/uapi/linux/if_link.h
|
|
@@ -756,6 +756,30 @@ struct tunnel_msg {
|
|
__u32 ifindex;
|
|
};
|
|
|
|
+/* netkit section */
|
|
+enum netkit_action {
|
|
+ NETKIT_NEXT = -1,
|
|
+ NETKIT_PASS = 0,
|
|
+ NETKIT_DROP = 2,
|
|
+ NETKIT_REDIRECT = 7,
|
|
+};
|
|
+
|
|
+enum netkit_mode {
|
|
+ NETKIT_L2,
|
|
+ NETKIT_L3,
|
|
+};
|
|
+
|
|
+enum {
|
|
+ IFLA_NETKIT_UNSPEC,
|
|
+ IFLA_NETKIT_PEER_INFO,
|
|
+ IFLA_NETKIT_PRIMARY,
|
|
+ IFLA_NETKIT_POLICY,
|
|
+ IFLA_NETKIT_PEER_POLICY,
|
|
+ IFLA_NETKIT_MODE,
|
|
+ __IFLA_NETKIT_MAX,
|
|
+};
|
|
+#define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1)
|
|
+
|
|
/* VXLAN section */
|
|
|
|
/* include statistics in the dump */
|
|
--- a/kernel/bpf/syscall.c
|
|
+++ b/kernel/bpf/syscall.c
|
|
@@ -35,8 +35,9 @@
|
|
#include <linux/rcupdate_trace.h>
|
|
#include <linux/memcontrol.h>
|
|
#include <linux/trace_events.h>
|
|
-#include <net/netfilter/nf_bpf_link.h>
|
|
|
|
+#include <net/netfilter/nf_bpf_link.h>
|
|
+#include <net/netkit.h>
|
|
#include <net/tcx.h>
|
|
|
|
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
|
|
@@ -3811,6 +3812,8 @@ attach_type_to_prog_type(enum bpf_attach
|
|
return BPF_PROG_TYPE_LSM;
|
|
case BPF_TCX_INGRESS:
|
|
case BPF_TCX_EGRESS:
|
|
+ case BPF_NETKIT_PRIMARY:
|
|
+ case BPF_NETKIT_PEER:
|
|
return BPF_PROG_TYPE_SCHED_CLS;
|
|
default:
|
|
return BPF_PROG_TYPE_UNSPEC;
|
|
@@ -3867,7 +3870,9 @@ static int bpf_prog_attach_check_attach_
|
|
return 0;
|
|
case BPF_PROG_TYPE_SCHED_CLS:
|
|
if (attach_type != BPF_TCX_INGRESS &&
|
|
- attach_type != BPF_TCX_EGRESS)
|
|
+ attach_type != BPF_TCX_EGRESS &&
|
|
+ attach_type != BPF_NETKIT_PRIMARY &&
|
|
+ attach_type != BPF_NETKIT_PEER)
|
|
return -EINVAL;
|
|
return 0;
|
|
default:
|
|
@@ -3950,7 +3955,11 @@ static int bpf_prog_attach(const union b
|
|
ret = cgroup_bpf_prog_attach(attr, ptype, prog);
|
|
break;
|
|
case BPF_PROG_TYPE_SCHED_CLS:
|
|
- ret = tcx_prog_attach(attr, prog);
|
|
+ if (attr->attach_type == BPF_TCX_INGRESS ||
|
|
+ attr->attach_type == BPF_TCX_EGRESS)
|
|
+ ret = tcx_prog_attach(attr, prog);
|
|
+ else
|
|
+ ret = netkit_prog_attach(attr, prog);
|
|
break;
|
|
default:
|
|
ret = -EINVAL;
|
|
@@ -4011,7 +4020,11 @@ static int bpf_prog_detach(const union b
|
|
ret = cgroup_bpf_prog_detach(attr, ptype);
|
|
break;
|
|
case BPF_PROG_TYPE_SCHED_CLS:
|
|
- ret = tcx_prog_detach(attr, prog);
|
|
+ if (attr->attach_type == BPF_TCX_INGRESS ||
|
|
+ attr->attach_type == BPF_TCX_EGRESS)
|
|
+ ret = tcx_prog_detach(attr, prog);
|
|
+ else
|
|
+ ret = netkit_prog_detach(attr, prog);
|
|
break;
|
|
default:
|
|
ret = -EINVAL;
|
|
@@ -4073,6 +4086,9 @@ static int bpf_prog_query(const union bp
|
|
case BPF_TCX_INGRESS:
|
|
case BPF_TCX_EGRESS:
|
|
return tcx_prog_query(attr, uattr);
|
|
+ case BPF_NETKIT_PRIMARY:
|
|
+ case BPF_NETKIT_PEER:
|
|
+ return netkit_prog_query(attr, uattr);
|
|
default:
|
|
return -EINVAL;
|
|
}
|
|
@@ -5054,7 +5070,11 @@ static int link_create(union bpf_attr *a
|
|
ret = bpf_xdp_link_attach(attr, prog);
|
|
break;
|
|
case BPF_PROG_TYPE_SCHED_CLS:
|
|
- ret = tcx_link_attach(attr, prog);
|
|
+ if (attr->link_create.attach_type == BPF_TCX_INGRESS ||
|
|
+ attr->link_create.attach_type == BPF_TCX_EGRESS)
|
|
+ ret = tcx_link_attach(attr, prog);
|
|
+ else
|
|
+ ret = netkit_link_attach(attr, prog);
|
|
break;
|
|
case BPF_PROG_TYPE_NETFILTER:
|
|
ret = bpf_nf_link_attach(attr, prog);
|
|
--- a/tools/include/uapi/linux/bpf.h
|
|
+++ b/tools/include/uapi/linux/bpf.h
|
|
@@ -1057,6 +1057,8 @@ enum bpf_attach_type {
|
|
BPF_TCX_INGRESS,
|
|
BPF_TCX_EGRESS,
|
|
BPF_TRACE_UPROBE_MULTI,
|
|
+ BPF_NETKIT_PRIMARY,
|
|
+ BPF_NETKIT_PEER,
|
|
__MAX_BPF_ATTACH_TYPE
|
|
};
|
|
|
|
@@ -1076,6 +1078,7 @@ enum bpf_link_type {
|
|
BPF_LINK_TYPE_NETFILTER = 10,
|
|
BPF_LINK_TYPE_TCX = 11,
|
|
BPF_LINK_TYPE_UPROBE_MULTI = 12,
|
|
+ BPF_LINK_TYPE_NETKIT = 13,
|
|
MAX_BPF_LINK_TYPE,
|
|
};
|
|
|
|
@@ -1661,6 +1664,13 @@ union bpf_attr {
|
|
__u32 flags;
|
|
__u32 pid;
|
|
} uprobe_multi;
|
|
+ struct {
|
|
+ union {
|
|
+ __u32 relative_fd;
|
|
+ __u32 relative_id;
|
|
+ };
|
|
+ __u64 expected_revision;
|
|
+ } netkit;
|
|
};
|
|
} link_create;
|
|
|
|
@@ -6587,6 +6597,10 @@ struct bpf_link_info {
|
|
__u32 ifindex;
|
|
__u32 attach_type;
|
|
} tcx;
|
|
+ struct {
|
|
+ __u32 ifindex;
|
|
+ __u32 attach_type;
|
|
+ } netkit;
|
|
};
|
|
} __attribute__((aligned(8)));
|
|
|