Skip to content

bpf: New approach for BPF MTU handling and enforcement #179

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Travis CI configuration: builds the latest kernel and runs BPF selftests
# inside a QEMU VM (see travis-ci/vmtest/run_vmtest.sh).
sudo: required
language: bash
dist: bionic
services:
- docker

env:
global:
# Variables shared by all jobs; REPO_ROOT/CI_ROOT locate the vmtest scripts
# checked into this repository.
- PROJECT_NAME='libbpf'
- AUTHOR_EMAIL="$(git log -1 --pretty=\"%aE\")"
- REPO_ROOT="$TRAVIS_BUILD_DIR"
- CI_ROOT="$REPO_ROOT/travis-ci"
- VMTEST_ROOT="$CI_ROOT/vmtest"

addons:
apt:
# Host packages needed to build the kernel/selftests and boot the test VM
# (qemu-kvm for virtualization, zstd for image decompression, libelf/libdw
# for BTF handling, python3-docutils for bpftool man pages).
packages:
- qemu-kvm
- zstd
- binutils-dev
- elfutils
- libcap-dev
- libelf-dev
- libdw-dev
- python3-docutils

jobs:
include:
- stage: Builds & Tests
name: Kernel LATEST + selftests
language: bash
env: KERNEL=LATEST
# travis_terminate 1 aborts the job immediately on a vmtest failure.
script: $CI_ROOT/vmtest/run_vmtest.sh || travis_terminate 1
5 changes: 3 additions & 2 deletions include/linux/netdevice.h
Original file line number Diff line number Diff line change
Expand Up @@ -3866,10 +3866,11 @@ bool is_skb_forwardable(const struct net_device *dev,
const struct sk_buff *skb);

static __always_inline int ____dev_forward_skb(struct net_device *dev,
struct sk_buff *skb)
struct sk_buff *skb,
const bool mtu_check)
{
if (skb_orphan_frags(skb, GFP_ATOMIC) ||
unlikely(!is_skb_forwardable(dev, skb))) {
(mtu_check && unlikely(!is_skb_forwardable(dev, skb)))) {
atomic_long_inc(&dev->rx_dropped);
kfree_skb(skb);
return NET_RX_DROP;
Expand Down
24 changes: 22 additions & 2 deletions include/uapi/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -2216,6 +2216,9 @@ union bpf_attr {
* * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the
* packet is not forwarded or needs assist from full stack
*
* If lookup fails with BPF_FIB_LKUP_RET_FRAG_NEEDED, then the MTU
* was exceeded and result params->mtu contains the MTU.
*
* long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags)
* Description
* Add an entry to, or update a sockhash *map* referencing sockets.
Expand Down Expand Up @@ -3715,6 +3718,18 @@ union bpf_attr {
* never return NULL.
* Return
* A pointer pointing to the kernel percpu variable on this cpu.
*
* int bpf_mtu_lookup(void *ctx, u32 ifindex, u64 flags)
* Description
* Lookup MTU of net device based on ifindex. The Linux kernel
* route table can configure MTUs on a more specific per route
* level, which is not provided by this helper. For route level
* MTU checks use the **bpf_fib_lookup**\ () helper.
*
 *		*ctx* is either **struct xdp_md** for XDP programs or
 *		**struct sk_buff** for tc cls_act programs.
* Return
 *		On success, the MTU size is returned. On error, a negative
 *		value is returned.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
Expand Down Expand Up @@ -3872,6 +3887,7 @@ union bpf_attr {
FN(redirect_neigh), \
FN(bpf_per_cpu_ptr), \
FN(bpf_this_cpu_ptr), \
FN(mtu_lookup), \
/* */

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
Expand Down Expand Up @@ -4844,9 +4860,13 @@ struct bpf_fib_lookup {
__be16 sport;
__be16 dport;

/* total length of packet from network header - used for MTU check */
__u16 tot_len;
union { /* used for MTU check */
/* input to lookup */
__u16 tot_len; /* total length of packet from network hdr */

/* output: MTU value (if requested check_mtu) */
__u16 mtu;
};
/* input: L3 device index for lookup
* output: device index from FIB lookup
*/
Expand Down
24 changes: 21 additions & 3 deletions net/core/dev.c
Original file line number Diff line number Diff line change
Expand Up @@ -2209,7 +2209,7 @@ EXPORT_SYMBOL_GPL(is_skb_forwardable);

int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
int ret = ____dev_forward_skb(dev, skb);
int ret = ____dev_forward_skb(dev, skb, true);

if (likely(!ret)) {
skb->protocol = eth_type_trans(skb, dev);
Expand Down Expand Up @@ -3870,6 +3870,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
case TC_ACT_OK:
case TC_ACT_RECLASSIFY:
*ret = NET_XMIT_SUCCESS;
skb->tc_index = TC_H_MIN(cl_res.classid);
break;
case TC_ACT_SHOT:
Expand All @@ -3885,6 +3886,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
return NULL;
case TC_ACT_REDIRECT:
/* No need to push/pop skb's mac_header here on egress! */
skb_set_redirected(skb, false);
skb_do_redirect(skb);
*ret = NET_XMIT_SUCCESS;
return NULL;
Expand Down Expand Up @@ -4063,9 +4065,10 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
bool mtu_check = false;
bool again = false;
struct Qdisc *q;
int rc = -ENOMEM;
bool again = false;

skb_reset_mac_header(skb);

Expand All @@ -4081,14 +4084,28 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)

qdisc_pkt_len_init(skb);
#ifdef CONFIG_NET_CLS_ACT
mtu_check = skb_is_redirected(skb);
skb->tc_at_ingress = 0;
# ifdef CONFIG_NET_EGRESS
if (static_branch_unlikely(&egress_needed_key)) {
unsigned int len_orig = skb->len;

skb = sch_handle_egress(skb, &rc, dev);
if (!skb)
goto out;
/* BPF-prog ran and could have changed packet size beyond MTU */
if (rc == NET_XMIT_SUCCESS && skb->len > len_orig)
mtu_check = true;
}
# endif
/* MTU-check only happens on "last" net_device in a redirect sequence
* (e.g. above sch_handle_egress can steal SKB and skb_do_redirect it
* either ingress or egress to another device).
*/
if (mtu_check && !is_skb_forwardable(dev, skb)) {
rc = -EMSGSIZE;
goto drop;
}
#endif
/* If device/qdisc don't need skb->dst, release it right now while
* its hot in this cpu cache.
Expand Down Expand Up @@ -4156,7 +4173,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)

rc = -ENETDOWN;
rcu_read_unlock_bh();

drop:
atomic_long_inc(&dev->tx_dropped);
kfree_skb_list(skb);
return rc;
Expand Down Expand Up @@ -4974,6 +4991,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
* redirecting to another netdev
*/
__skb_push(skb, skb->mac_len);
skb_set_redirected(skb, true);
skb_do_redirect(skb);
return NULL;
case TC_ACT_CONSUMED:
Expand Down
88 changes: 79 additions & 9 deletions net/core/filter.c
Original file line number Diff line number Diff line change
Expand Up @@ -2083,13 +2083,21 @@ static const struct bpf_func_proto bpf_csum_level_proto = {

static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
{
return dev_forward_skb(dev, skb);
int ret = ____dev_forward_skb(dev, skb, false);

if (likely(!ret)) {
skb->protocol = eth_type_trans(skb, dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
ret = netif_rx(skb);
}

return ret;
}

static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
struct sk_buff *skb)
{
int ret = ____dev_forward_skb(dev, skb);
int ret = ____dev_forward_skb(dev, skb, false);

if (likely(!ret)) {
skb->dev = dev;
Expand Down Expand Up @@ -3476,8 +3484,7 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,

static u32 __bpf_skb_max_len(const struct sk_buff *skb)
{
return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len :
SKB_MAX_ALLOC;
return IP_MAX_MTU;
}

BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
Expand Down Expand Up @@ -5186,13 +5193,14 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
const struct neighbour *neigh,
const struct net_device *dev)
const struct net_device *dev, u32 mtu)
{
memcpy(params->dmac, neigh->ha, ETH_ALEN);
memcpy(params->smac, dev->dev_addr, ETH_ALEN);
params->h_vlan_TCI = 0;
params->h_vlan_proto = 0;
params->ifindex = dev->ifindex;
params->mtu = mtu;

return 0;
}
Expand Down Expand Up @@ -5276,8 +5284,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,

if (check_mtu) {
mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
if (params->tot_len > mtu)
if (params->tot_len > mtu) {
params->mtu = mtu; /* union with tot_len */
return BPF_FIB_LKUP_RET_FRAG_NEEDED;
}
}

nhc = res.nhc;
Expand Down Expand Up @@ -5310,7 +5320,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
if (!neigh)
return BPF_FIB_LKUP_RET_NO_NEIGH;

return bpf_fib_set_fwd_params(params, neigh, dev);
return bpf_fib_set_fwd_params(params, neigh, dev, mtu);
}
#endif

Expand Down Expand Up @@ -5402,8 +5412,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,

if (check_mtu) {
mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src);
if (params->tot_len > mtu)
if (params->tot_len > mtu) {
params->mtu = mtu; /* union with tot_len */
return BPF_FIB_LKUP_RET_FRAG_NEEDED;
}
}

if (res.nh->fib_nh_lws)
Expand All @@ -5422,7 +5434,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
if (!neigh)
return BPF_FIB_LKUP_RET_NO_NEIGH;

return bpf_fib_set_fwd_params(params, neigh, dev);
return bpf_fib_set_fwd_params(params, neigh, dev, mtu);
}
#endif

Expand Down Expand Up @@ -5491,6 +5503,8 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
dev = dev_get_by_index_rcu(net, params->ifindex);
if (!is_skb_forwardable(dev, skb))
rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;

params->mtu = dev->mtu; /* union with tot_len */
}

return rc;
Expand All @@ -5506,6 +5520,58 @@ static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
.arg4_type = ARG_ANYTHING,
};

/* Common implementation behind the bpf_skb_mtu_lookup/bpf_xdp_mtu_lookup
 * helpers: resolve the net_device identified by @ifindex in @netns and
 * return its configured device-level MTU.
 *
 * NOTE(review): uses the _rcu lookup variant, which takes no reference on
 * the device — presumably relies on the caller (BPF program invocation)
 * running under rcu_read_lock(); confirm for all attach points.
 *
 * Return: MTU in bytes on success; -EINVAL if any @flags bit is set
 * (no flags are defined yet); -ENODEV if @ifindex does not resolve.
 */
static int bpf_mtu_lookup(struct net *netns, u32 ifindex, u64 flags)
{
	struct net_device *dev;

	/* XXX: Do we even need flags?
	 * Flag idea: get ctx dev->mtu for XDP_TX or redir out-same-dev
	 */
	if (flags)
		return -EINVAL;

	dev = dev_get_by_index_rcu(netns, ifindex);
	if (!dev)
		return -ENODEV;

	return dev->mtu;
}

/* TC (cls_act) flavor of the MTU lookup helper: derive the network
 * namespace from the skb's current device and defer to the common
 * bpf_mtu_lookup() implementation.
 */
BPF_CALL_3(bpf_skb_mtu_lookup, struct sk_buff *, skb,
	   u32, ifindex, u64, flags)
{
	return bpf_mtu_lookup(dev_net(skb->dev), ifindex, flags);
}

/* XDP flavor of the MTU lookup helper: derive the network namespace from
 * the receive queue's device and defer to bpf_mtu_lookup().
 */
BPF_CALL_3(bpf_xdp_mtu_lookup, struct xdp_buff *, xdp,
	   u32, ifindex, u64, flags)
{
	struct net *netns = dev_net(xdp->rxq->dev);
	/* XXX: Handle if this runs in devmap prog (then is rxq invalid?) */

	return bpf_mtu_lookup(netns, ifindex, flags);
}

/* Helper prototype for BPF_FUNC_mtu_lookup in tc cls_act programs:
 * GPL-only; args are (skb ctx, ifindex, flags); returns MTU or -errno.
 */
static const struct bpf_func_proto bpf_skb_mtu_lookup_proto = {
	.func		= bpf_skb_mtu_lookup,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};

/* Helper prototype for BPF_FUNC_mtu_lookup in XDP programs:
 * GPL-only; args are (xdp ctx, ifindex, flags); returns MTU or -errno.
 */
static const struct bpf_func_proto bpf_xdp_mtu_lookup_proto = {
	.func		= bpf_xdp_mtu_lookup,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};


#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
{
Expand Down Expand Up @@ -7069,6 +7135,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_socket_uid_proto;
case BPF_FUNC_fib_lookup:
return &bpf_skb_fib_lookup_proto;
case BPF_FUNC_mtu_lookup:
return &bpf_skb_mtu_lookup_proto;
case BPF_FUNC_sk_fullsock:
return &bpf_sk_fullsock_proto;
case BPF_FUNC_sk_storage_get:
Expand Down Expand Up @@ -7138,6 +7206,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_xdp_adjust_tail_proto;
case BPF_FUNC_fib_lookup:
return &bpf_xdp_fib_lookup_proto;
case BPF_FUNC_mtu_lookup:
return &bpf_xdp_mtu_lookup_proto;
#ifdef CONFIG_INET
case BPF_FUNC_sk_lookup_udp:
return &bpf_xdp_sk_lookup_udp_proto;
Expand Down
1 change: 1 addition & 0 deletions net/sched/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,7 @@ config NET_SCH_INGRESS
depends on NET_CLS_ACT
select NET_INGRESS
select NET_EGRESS
select NET_REDIRECT
help
Say Y here if you want to use classifiers for incoming and/or outgoing
packets. This qdisc doesn't do anything else besides running classifiers,
Expand Down
Loading