Skip to content

Commit e58b477

Browse files
committed
Merge branch 'vxlan-support-user-defined-reserved-bits'
Petr Machata says: ==================== vxlan: Support user-defined reserved bits Currently the VXLAN header validation works by vxlan_rcv() going feature by feature, each feature clearing the bits that it consumes. If anything is left unparsed at the end, the packet is rejected. Unfortunately there are machines out there that send VXLAN packets with reserved bits set, even if they are configured to not use the corresponding features. One such report is here[1], and we have heard similar complaints from our customers as well. This patchset adds an attribute that makes it configurable which bits the user wishes to tolerate and which they consider reserved. This was recommended in [1] as well. A knob like that inevitably allows users to set as reserved bits that are in fact required for the features enabled by the netdevice, such as GPE. This is detected, and such configurations are rejected. In patches kernel-patches#1..kernel-patches#7, the reserved bits validation code is gradually moved away from the unparsed approach described above, to one where a given set of valid bits is precomputed and then the packet is validated against that. In patch kernel-patches#8, this precomputed set is made configurable through a new attribute IFLA_VXLAN_RESERVED_BITS. Patches kernel-patches#9 and kernel-patches#10 massage the testsuite a bit, so that patch kernel-patches#11 can introduce a selftest for the resreved bits feature. The corresponding iproute2 support is available in [2]. [1] https://lore.kernel.org/netdev/[email protected]/ [2] https://github.com/pmachata/iproute2/commits/vxlan_reserved_bits/ ==================== Link: https://patch.msgid.link/[email protected] Signed-off-by: Jakub Kicinski <[email protected]>
2 parents 3f330db + d84b5dc commit e58b477

File tree

8 files changed

+497
-62
lines changed

8 files changed

+497
-62
lines changed

drivers/net/vxlan/vxlan_core.c

Lines changed: 99 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -622,9 +622,9 @@ static int vxlan_fdb_append(struct vxlan_fdb *f,
622622
return 1;
623623
}
624624

625-
static bool vxlan_parse_gpe_proto(struct vxlanhdr *hdr, __be16 *protocol)
625+
static bool vxlan_parse_gpe_proto(const struct vxlanhdr *hdr, __be16 *protocol)
626626
{
627-
struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)hdr;
627+
const struct vxlanhdr_gpe *gpe = (const struct vxlanhdr_gpe *)hdr;
628628

629629
/* Need to have Next Protocol set for interfaces in GPE mode. */
630630
if (!gpe->np_applied)
@@ -1554,41 +1554,38 @@ static void vxlan_sock_release(struct vxlan_dev *vxlan)
15541554
#endif
15551555
}
15561556

1557-
static enum skb_drop_reason vxlan_remcsum(struct vxlanhdr *unparsed,
1558-
struct sk_buff *skb,
1559-
u32 vxflags)
1557+
static enum skb_drop_reason vxlan_remcsum(struct sk_buff *skb, u32 vxflags)
15601558
{
1559+
const struct vxlanhdr *vh = vxlan_hdr(skb);
15611560
enum skb_drop_reason reason;
15621561
size_t start, offset;
15631562

1564-
if (!(unparsed->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload)
1565-
goto out;
1563+
if (!(vh->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload)
1564+
return SKB_NOT_DROPPED_YET;
15661565

1567-
start = vxlan_rco_start(unparsed->vx_vni);
1568-
offset = start + vxlan_rco_offset(unparsed->vx_vni);
1566+
start = vxlan_rco_start(vh->vx_vni);
1567+
offset = start + vxlan_rco_offset(vh->vx_vni);
15691568

15701569
reason = pskb_may_pull_reason(skb, offset + sizeof(u16));
15711570
if (reason)
15721571
return reason;
15731572

15741573
skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1), start, offset,
15751574
!!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL));
1576-
out:
1577-
unparsed->vx_flags &= ~VXLAN_HF_RCO;
1578-
unparsed->vx_vni &= VXLAN_VNI_MASK;
1579-
15801575
return SKB_NOT_DROPPED_YET;
15811576
}
15821577

1583-
static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed,
1584-
struct sk_buff *skb, u32 vxflags,
1578+
static void vxlan_parse_gbp_hdr(struct sk_buff *skb, u32 vxflags,
15851579
struct vxlan_metadata *md)
15861580
{
1587-
struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)unparsed;
1581+
const struct vxlanhdr *vh = vxlan_hdr(skb);
1582+
const struct vxlanhdr_gbp *gbp;
15881583
struct metadata_dst *tun_dst;
15891584

1590-
if (!(unparsed->vx_flags & VXLAN_HF_GBP))
1591-
goto out;
1585+
gbp = (const struct vxlanhdr_gbp *)vh;
1586+
1587+
if (!(vh->vx_flags & VXLAN_HF_GBP))
1588+
return;
15921589

15931590
md->gbp = ntohs(gbp->policy_id);
15941591

@@ -1607,8 +1604,6 @@ static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed,
16071604
/* In flow-based mode, GBP is carried in dst_metadata */
16081605
if (!(vxflags & VXLAN_F_COLLECT_METADATA))
16091606
skb->mark = md->gbp;
1610-
out:
1611-
unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS;
16121607
}
16131608

16141609
static enum skb_drop_reason vxlan_set_mac(struct vxlan_dev *vxlan,
@@ -1672,9 +1667,9 @@ static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph,
16721667
static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
16731668
{
16741669
struct vxlan_vni_node *vninode = NULL;
1670+
const struct vxlanhdr *vh;
16751671
struct vxlan_dev *vxlan;
16761672
struct vxlan_sock *vs;
1677-
struct vxlanhdr unparsed;
16781673
struct vxlan_metadata _md;
16791674
struct vxlan_metadata *md = &_md;
16801675
__be16 protocol = htons(ETH_P_TEB);
@@ -1689,38 +1684,49 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
16891684
if (reason)
16901685
goto drop;
16911686

1692-
unparsed = *vxlan_hdr(skb);
1687+
vh = vxlan_hdr(skb);
16931688
/* VNI flag always required to be set */
1694-
if (!(unparsed.vx_flags & VXLAN_HF_VNI)) {
1689+
if (!(vh->vx_flags & VXLAN_HF_VNI)) {
16951690
netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
1696-
ntohl(vxlan_hdr(skb)->vx_flags),
1697-
ntohl(vxlan_hdr(skb)->vx_vni));
1691+
ntohl(vh->vx_flags), ntohl(vh->vx_vni));
16981692
reason = SKB_DROP_REASON_VXLAN_INVALID_HDR;
16991693
/* Return non vxlan pkt */
17001694
goto drop;
17011695
}
1702-
unparsed.vx_flags &= ~VXLAN_HF_VNI;
1703-
unparsed.vx_vni &= ~VXLAN_VNI_MASK;
17041696

17051697
vs = rcu_dereference_sk_user_data(sk);
17061698
if (!vs)
17071699
goto drop;
17081700

1709-
vni = vxlan_vni(vxlan_hdr(skb)->vx_vni);
1701+
vni = vxlan_vni(vh->vx_vni);
17101702

17111703
vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, &vninode);
17121704
if (!vxlan) {
17131705
reason = SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND;
17141706
goto drop;
17151707
}
17161708

1717-
/* For backwards compatibility, only allow reserved fields to be
1718-
* used by VXLAN extensions if explicitly requested.
1719-
*/
1720-
if (vs->flags & VXLAN_F_GPE) {
1721-
if (!vxlan_parse_gpe_proto(&unparsed, &protocol))
1709+
if (vh->vx_flags & vxlan->cfg.reserved_bits.vx_flags ||
1710+
vh->vx_vni & vxlan->cfg.reserved_bits.vx_vni) {
1711+
/* If the header uses bits besides those enabled by the
1712+
* netdevice configuration, treat this as a malformed packet.
1713+
* This behavior diverges from VXLAN RFC (RFC7348) which
1714+
* stipulates that bits in reserved in reserved fields are to be
1715+
* ignored. The approach here maintains compatibility with
1716+
* previous stack code, and also is more robust and provides a
1717+
* little more security in adding extensions to VXLAN.
1718+
*/
1719+
reason = SKB_DROP_REASON_VXLAN_INVALID_HDR;
1720+
DEV_STATS_INC(vxlan->dev, rx_frame_errors);
1721+
DEV_STATS_INC(vxlan->dev, rx_errors);
1722+
vxlan_vnifilter_count(vxlan, vni, vninode,
1723+
VXLAN_VNI_STATS_RX_ERRORS, 0);
1724+
goto drop;
1725+
}
1726+
1727+
if (vxlan->cfg.flags & VXLAN_F_GPE) {
1728+
if (!vxlan_parse_gpe_proto(vh, &protocol))
17221729
goto drop;
1723-
unparsed.vx_flags &= ~VXLAN_GPE_USED_BITS;
17241730
raw_proto = true;
17251731
}
17261732

@@ -1730,8 +1736,8 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
17301736
goto drop;
17311737
}
17321738

1733-
if (vs->flags & VXLAN_F_REMCSUM_RX) {
1734-
reason = vxlan_remcsum(&unparsed, skb, vs->flags);
1739+
if (vxlan->cfg.flags & VXLAN_F_REMCSUM_RX) {
1740+
reason = vxlan_remcsum(skb, vxlan->cfg.flags);
17351741
if (unlikely(reason))
17361742
goto drop;
17371743
}
@@ -1756,25 +1762,12 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
17561762
memset(md, 0, sizeof(*md));
17571763
}
17581764

1759-
if (vs->flags & VXLAN_F_GBP)
1760-
vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md);
1765+
if (vxlan->cfg.flags & VXLAN_F_GBP)
1766+
vxlan_parse_gbp_hdr(skb, vxlan->cfg.flags, md);
17611767
/* Note that GBP and GPE can never be active together. This is
17621768
* ensured in vxlan_dev_configure.
17631769
*/
17641770

1765-
if (unparsed.vx_flags || unparsed.vx_vni) {
1766-
/* If there are any unprocessed flags remaining treat
1767-
* this as a malformed packet. This behavior diverges from
1768-
* VXLAN RFC (RFC7348) which stipulates that bits in reserved
1769-
* in reserved fields are to be ignored. The approach here
1770-
* maintains compatibility with previous stack code, and also
1771-
* is more robust and provides a little more security in
1772-
* adding extensions to VXLAN.
1773-
*/
1774-
reason = SKB_DROP_REASON_VXLAN_INVALID_HDR;
1775-
goto drop;
1776-
}
1777-
17781771
if (!raw_proto) {
17791772
reason = vxlan_set_mac(vxlan, vs, skb, vni);
17801773
if (reason)
@@ -3435,6 +3428,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
34353428
[IFLA_VXLAN_VNIFILTER] = { .type = NLA_U8 },
34363429
[IFLA_VXLAN_LOCALBYPASS] = NLA_POLICY_MAX(NLA_U8, 1),
34373430
[IFLA_VXLAN_LABEL_POLICY] = NLA_POLICY_MAX(NLA_U32, VXLAN_LABEL_MAX),
3431+
[IFLA_VXLAN_RESERVED_BITS] = NLA_POLICY_EXACT_LEN(sizeof(struct vxlanhdr)),
34383432
};
34393433

34403434
static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -4070,6 +4064,10 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
40704064
struct net_device *dev, struct vxlan_config *conf,
40714065
bool changelink, struct netlink_ext_ack *extack)
40724066
{
4067+
struct vxlanhdr used_bits = {
4068+
.vx_flags = VXLAN_HF_VNI,
4069+
.vx_vni = VXLAN_VNI_MASK,
4070+
};
40734071
struct vxlan_dev *vxlan = netdev_priv(dev);
40744072
int err = 0;
40754073

@@ -4296,13 +4294,16 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
42964294
extack);
42974295
if (err)
42984296
return err;
4297+
used_bits.vx_flags |= VXLAN_HF_RCO;
4298+
used_bits.vx_vni |= ~VXLAN_VNI_MASK;
42994299
}
43004300

43014301
if (data[IFLA_VXLAN_GBP]) {
43024302
err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GBP,
43034303
VXLAN_F_GBP, changelink, false, extack);
43044304
if (err)
43054305
return err;
4306+
used_bits.vx_flags |= VXLAN_GBP_USED_BITS;
43064307
}
43074308

43084309
if (data[IFLA_VXLAN_GPE]) {
@@ -4311,6 +4312,46 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
43114312
extack);
43124313
if (err)
43134314
return err;
4315+
4316+
used_bits.vx_flags |= VXLAN_GPE_USED_BITS;
4317+
}
4318+
4319+
if (data[IFLA_VXLAN_RESERVED_BITS]) {
4320+
struct vxlanhdr reserved_bits;
4321+
4322+
if (changelink) {
4323+
NL_SET_ERR_MSG_ATTR(extack,
4324+
data[IFLA_VXLAN_RESERVED_BITS],
4325+
"Cannot change reserved_bits");
4326+
return -EOPNOTSUPP;
4327+
}
4328+
4329+
nla_memcpy(&reserved_bits, data[IFLA_VXLAN_RESERVED_BITS],
4330+
sizeof(reserved_bits));
4331+
if (used_bits.vx_flags & reserved_bits.vx_flags ||
4332+
used_bits.vx_vni & reserved_bits.vx_vni) {
4333+
__be64 ub_be64, rb_be64;
4334+
4335+
memcpy(&ub_be64, &used_bits, sizeof(ub_be64));
4336+
memcpy(&rb_be64, &reserved_bits, sizeof(rb_be64));
4337+
4338+
NL_SET_ERR_MSG_ATTR_FMT(extack,
4339+
data[IFLA_VXLAN_RESERVED_BITS],
4340+
"Used bits %#018llx cannot overlap reserved bits %#018llx",
4341+
be64_to_cpu(ub_be64),
4342+
be64_to_cpu(rb_be64));
4343+
return -EINVAL;
4344+
}
4345+
4346+
conf->reserved_bits = reserved_bits;
4347+
} else {
4348+
/* For backwards compatibility, only allow reserved fields to be
4349+
* used by VXLAN extensions if explicitly requested.
4350+
*/
4351+
conf->reserved_bits = (struct vxlanhdr) {
4352+
.vx_flags = ~used_bits.vx_flags,
4353+
.vx_vni = ~used_bits.vx_vni,
4354+
};
43144355
}
43154356

43164357
if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) {
@@ -4497,6 +4538,8 @@ static size_t vxlan_get_size(const struct net_device *dev)
44974538
nla_total_size(0) + /* IFLA_VXLAN_GPE */
44984539
nla_total_size(0) + /* IFLA_VXLAN_REMCSUM_NOPARTIAL */
44994540
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_VNIFILTER */
4541+
/* IFLA_VXLAN_RESERVED_BITS */
4542+
nla_total_size(sizeof(struct vxlanhdr)) +
45004543
0;
45014544
}
45024545

@@ -4599,6 +4642,11 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
45994642
!!(vxlan->cfg.flags & VXLAN_F_VNIFILTER)))
46004643
goto nla_put_failure;
46014644

4645+
if (nla_put(skb, IFLA_VXLAN_RESERVED_BITS,
4646+
sizeof(vxlan->cfg.reserved_bits),
4647+
&vxlan->cfg.reserved_bits))
4648+
goto nla_put_failure;
4649+
46024650
return 0;
46034651

46044652
nla_put_failure:

include/net/vxlan.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,7 @@ struct vxlan_config {
227227
unsigned int addrmax;
228228
bool no_share;
229229
enum ifla_vxlan_df df;
230+
struct vxlanhdr reserved_bits;
230231
};
231232

232233
enum {

include/uapi/linux/if_link.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1394,6 +1394,7 @@ enum {
13941394
IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */
13951395
IFLA_VXLAN_LOCALBYPASS,
13961396
IFLA_VXLAN_LABEL_POLICY, /* IPv6 flow label policy; ifla_vxlan_label_policy */
1397+
IFLA_VXLAN_RESERVED_BITS,
13971398
__IFLA_VXLAN_MAX
13981399
};
13991400
#define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1)

tools/testing/selftests/net/fdb_notify.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ test_dup_vxlan_self()
4949
{
5050
ip_link_add br up type bridge vlan_filtering 1
5151
ip_link_add vx up type vxlan id 2000 dstport 4789
52-
ip_link_master vx br
52+
ip_link_set_master vx br
5353

5454
do_test_dup add "vxlan" dev vx self dst 192.0.2.1
5555
do_test_dup del "vxlan" dev vx self dst 192.0.2.1
@@ -59,7 +59,7 @@ test_dup_vxlan_master()
5959
{
6060
ip_link_add br up type bridge vlan_filtering 1
6161
ip_link_add vx up type vxlan id 2000 dstport 4789
62-
ip_link_master vx br
62+
ip_link_set_master vx br
6363

6464
do_test_dup add "vxlan master" dev vx master
6565
do_test_dup del "vxlan master" dev vx master
@@ -79,7 +79,7 @@ test_dup_macvlan_master()
7979
ip_link_add br up type bridge vlan_filtering 1
8080
ip_link_add dd up type dummy
8181
ip_link_add mv up link dd type macvlan mode passthru
82-
ip_link_master mv br
82+
ip_link_set_master mv br
8383

8484
do_test_dup add "macvlan master" dev mv self
8585
do_test_dup del "macvlan master" dev mv self

tools/testing/selftests/net/forwarding/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \
105105
vxlan_bridge_1q_port_8472_ipv6.sh \
106106
vxlan_bridge_1q_port_8472.sh \
107107
vxlan_bridge_1q.sh \
108+
vxlan_reserved.sh \
108109
vxlan_symmetric_ipv6.sh \
109110
vxlan_symmetric.sh
110111

tools/testing/selftests/net/forwarding/lib.sh

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -932,13 +932,6 @@ packets_rate()
932932
echo $(((t1 - t0) / interval))
933933
}
934934

935-
mac_get()
936-
{
937-
local if_name=$1
938-
939-
ip -j link show dev $if_name | jq -r '.[]["address"]'
940-
}
941-
942935
ether_addr_to_u64()
943936
{
944937
local addr="$1"

0 commit comments

Comments
 (0)