7575#include <linux/skb_array.h>
7676#include <linux/bpf.h>
7777#include <linux/bpf_trace.h>
78+ #include <linux/mutex.h>
7879
7980#include <linux/uaccess.h>
8081
@@ -121,7 +122,8 @@ do { \
121122#define TUN_VNET_BE 0x40000000
122123
123124#define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
124- IFF_MULTI_QUEUE | IFF_NAPI)
125+ IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
126+
125127#define GOODCOPY_LEN 128
126128
127129#define FLT_EXACT_COUNT 8
@@ -173,6 +175,7 @@ struct tun_file {
173175 unsigned int ifindex ;
174176 };
175177 struct napi_struct napi ;
178+ struct mutex napi_mutex ; /* Protects access to the above napi */
176179 struct list_head next ;
177180 struct tun_struct * detached ;
178181 struct skb_array tx_array ;
@@ -277,6 +280,7 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
277280 netif_napi_add (tun -> dev , & tfile -> napi , tun_napi_poll ,
278281 NAPI_POLL_WEIGHT );
279282 napi_enable (& tfile -> napi );
283+ mutex_init (& tfile -> napi_mutex );
280284 }
281285}
282286
@@ -292,6 +296,11 @@ static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile)
292296 netif_napi_del (& tfile -> napi );
293297}
294298
/* Lockless test of whether this tun device was configured with
 * IFF_NAPI_FRAGS (i.e. writes should be fed to napi_gro_frags()
 * rather than netif_rx/NAPI-queue paths).
 *
 * READ_ONCE() is used because tun->flags can be updated concurrently
 * and this helper is called without holding any lock; we only need a
 * consistent snapshot of the flag bit, not serialization.
 */
static bool tun_napi_frags_enabled(const struct tun_struct *tun)
{
	return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS;
}
295304#ifdef CONFIG_TUN_VNET_CROSS_LE
296305static inline bool tun_legacy_is_little_endian (struct tun_struct * tun )
297306{
@@ -1036,14 +1045,18 @@ static void tun_poll_controller(struct net_device *dev)
10361045 * supports polling, which enables bridge devices in virt setups to
10371046 * still use netconsole
10381047 * If NAPI is enabled, however, we need to schedule polling for all
1039- * queues.
1048+ * queues unless we are using napi_gro_frags(), which we call in
1049+ * process context and not in NAPI context.
10401050 */
10411051 struct tun_struct * tun = netdev_priv (dev );
10421052
10431053 if (tun -> flags & IFF_NAPI ) {
10441054 struct tun_file * tfile ;
10451055 int i ;
10461056
1057+ if (tun_napi_frags_enabled (tun ))
1058+ return ;
1059+
10471060 rcu_read_lock ();
10481061 for (i = 0 ; i < tun -> numqueues ; i ++ ) {
10491062 tfile = rcu_dereference (tun -> tfiles [i ]);
@@ -1266,6 +1279,64 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
12661279 return mask ;
12671280}
12681281
/* Build an skb for the IFF_NAPI_FRAGS write path.
 *
 * The user-supplied iovec is mapped onto the skb with a fixed layout:
 * the first iovec segment becomes the linear (head) area, and every
 * subsequent segment gets its own freshly allocated page fragment of
 * exactly that segment's length.  Note: only the *geometry* is set up
 * here — no user data is copied; the caller is expected to fill the
 * skb afterwards (e.g. via copy/zerocopy from the same iterator).
 *
 * @tfile: per-queue tun file; its embedded NAPI context supplies the
 *         GRO frags skb via napi_get_frags().
 * @len:   total packet length (sum of all iovec segments).
 * @it:    the source iov_iter; only segment sizes are consumed here.
 *
 * Returns the prepared skb, or ERR_PTR():
 *   -ENOMEM  too many segments for MAX_SKB_FRAGS, or an allocation
 *            failed,
 *   -EINVAL  a non-first segment is empty or larger than PAGE_SIZE
 *            (each one must fit in a single page fragment).
 *
 * Runs in process context; napi_get_frags()/napi_alloc_frag() are
 * NAPI/softirq-side APIs, hence the local_bh_disable() bracketing
 * around each call.
 */
static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
					    size_t len,
					    const struct iov_iter *it)
{
	struct sk_buff *skb;
	size_t linear;
	int err;
	int i;

	/* Segment 0 is the linear area, so at most MAX_SKB_FRAGS + 1
	 * segments can be represented.
	 */
	if (it->nr_segs > MAX_SKB_FRAGS + 1)
		return ERR_PTR(-ENOMEM);

	local_bh_disable();
	skb = napi_get_frags(&tfile->napi);
	local_bh_enable();
	if (!skb)
		return ERR_PTR(-ENOMEM);

	/* Grow the head to hold the entire first iovec segment. */
	linear = iov_iter_single_seg_count(it);
	err = __skb_grow(skb, linear);
	if (err)
		goto free;

	/* Account the paged portion up front; the frag pages added
	 * below make up the len - linear remainder.
	 */
	skb->len = len;
	skb->data_len = len - linear;
	skb->truesize += skb->data_len;

	/* One page fragment per remaining iovec segment (frag index is
	 * i - 1 because segment 0 went into the linear area).
	 */
	for (i = 1; i < it->nr_segs; i++) {
		size_t fragsz = it->iov[i].iov_len;
		unsigned long offset;
		struct page *page;
		void *data;

		if (fragsz == 0 || fragsz > PAGE_SIZE) {
			err = -EINVAL;
			goto free;
		}

		local_bh_disable();
		data = napi_alloc_frag(fragsz);
		local_bh_enable();
		if (!data) {
			err = -ENOMEM;
			goto free;
		}

		/* napi_alloc_frag() returns an offset into a (possibly
		 * compound) page; recover the head page and offset so
		 * the fragment can be attached to the skb.
		 */
		page = virt_to_head_page(data);
		offset = data - page_address(page);
		skb_fill_page_desc(skb, i - 1, page, offset, fragsz);
	}

	return skb;
free:
	/* frees skb and all frags allocated with napi_alloc_frag() */
	napi_free_frags(&tfile->napi);
	return ERR_PTR(err);
}
1339+
12691340/* prepad is the amount to reserve at front. len is length after that.
12701341 * linear is a hint as to how much to copy (usually headers). */
12711342static struct sk_buff * tun_alloc_skb (struct tun_file * tfile ,
@@ -1478,6 +1549,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
14781549 int err ;
14791550 u32 rxhash ;
14801551 int skb_xdp = 1 ;
1552+ bool frags = tun_napi_frags_enabled (tun );
14811553
14821554 if (!(tun -> dev -> flags & IFF_UP ))
14831555 return - EIO ;
@@ -1535,7 +1607,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
15351607 zerocopy = true;
15361608 }
15371609
1538- if (tun_can_build_skb (tun , tfile , len , noblock , zerocopy )) {
1610+ if (! frags && tun_can_build_skb (tun , tfile , len , noblock , zerocopy )) {
15391611 /* For the packet that is not easy to be processed
15401612 * (e.g gso or jumbo packet), we will do it at after
15411613 * skb was created with generic XDP routine.
@@ -1556,10 +1628,24 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
15561628 linear = tun16_to_cpu (tun , gso .hdr_len );
15571629 }
15581630
1559- skb = tun_alloc_skb (tfile , align , copylen , linear , noblock );
1631+ if (frags ) {
1632+ mutex_lock (& tfile -> napi_mutex );
1633+ skb = tun_napi_alloc_frags (tfile , copylen , from );
1634+ /* tun_napi_alloc_frags() enforces a layout for the skb.
1635+ * If zerocopy is enabled, then this layout will be
1636+ * overwritten by zerocopy_sg_from_iter().
1637+ */
1638+ zerocopy = false;
1639+ } else {
1640+ skb = tun_alloc_skb (tfile , align , copylen , linear ,
1641+ noblock );
1642+ }
1643+
15601644 if (IS_ERR (skb )) {
15611645 if (PTR_ERR (skb ) != - EAGAIN )
15621646 this_cpu_inc (tun -> pcpu_stats -> rx_dropped );
1647+ if (frags )
1648+ mutex_unlock (& tfile -> napi_mutex );
15631649 return PTR_ERR (skb );
15641650 }
15651651
@@ -1571,13 +1657,23 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
15711657 if (err ) {
15721658 this_cpu_inc (tun -> pcpu_stats -> rx_dropped );
15731659 kfree_skb (skb );
1660+ if (frags ) {
1661+ tfile -> napi .skb = NULL ;
1662+ mutex_unlock (& tfile -> napi_mutex );
1663+ }
1664+
15741665 return - EFAULT ;
15751666 }
15761667 }
15771668
15781669 if (virtio_net_hdr_to_skb (skb , & gso , tun_is_little_endian (tun ))) {
15791670 this_cpu_inc (tun -> pcpu_stats -> rx_frame_errors );
15801671 kfree_skb (skb );
1672+ if (frags ) {
1673+ tfile -> napi .skb = NULL ;
1674+ mutex_unlock (& tfile -> napi_mutex );
1675+ }
1676+
15811677 return - EINVAL ;
15821678 }
15831679
@@ -1603,7 +1699,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
16031699 skb -> dev = tun -> dev ;
16041700 break ;
16051701 case IFF_TAP :
1606- skb -> protocol = eth_type_trans (skb , tun -> dev );
1702+ if (!frags )
1703+ skb -> protocol = eth_type_trans (skb , tun -> dev );
16071704 break ;
16081705 }
16091706
@@ -1638,7 +1735,23 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
16381735
16391736 rxhash = __skb_get_hash_symmetric (skb );
16401737
1641- if (tun -> flags & IFF_NAPI ) {
1738+ if (frags ) {
1739+ /* Exercise flow dissector code path. */
1740+ u32 headlen = eth_get_headlen (skb -> data , skb_headlen (skb ));
1741+
1742+ if (headlen > skb_headlen (skb ) || headlen < ETH_HLEN ) {
1743+ this_cpu_inc (tun -> pcpu_stats -> rx_dropped );
1744+ napi_free_frags (& tfile -> napi );
1745+ mutex_unlock (& tfile -> napi_mutex );
1746+ WARN_ON (1 );
1747+ return - ENOMEM ;
1748+ }
1749+
1750+ local_bh_disable ();
1751+ napi_gro_frags (& tfile -> napi );
1752+ local_bh_enable ();
1753+ mutex_unlock (& tfile -> napi_mutex );
1754+ } else if (tun -> flags & IFF_NAPI ) {
16421755 struct sk_buff_head * queue = & tfile -> sk .sk_write_queue ;
16431756 int queue_len ;
16441757
@@ -2061,6 +2174,15 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
20612174 if (tfile -> detached )
20622175 return - EINVAL ;
20632176
2177+ if ((ifr -> ifr_flags & IFF_NAPI_FRAGS )) {
2178+ if (!capable (CAP_NET_ADMIN ))
2179+ return - EPERM ;
2180+
2181+ if (!(ifr -> ifr_flags & IFF_NAPI ) ||
2182+ (ifr -> ifr_flags & TUN_TYPE_MASK ) != IFF_TAP )
2183+ return - EINVAL ;
2184+ }
2185+
20642186 dev = __dev_get_by_name (net , ifr -> ifr_name );
20652187 if (dev ) {
20662188 if (ifr -> ifr_flags & IFF_TUN_EXCL )
0 commit comments