Skip to content

Commit 8c48eea

Browse files
committed
page_pool: allow caching from safely localized NAPI
Recent patches to mlx5 mentioned a regression when moving from driver local page pool to only using the generic page pool code. Page pool has two recycling paths (1) a direct one, which runs in safe NAPI context (basically consumer context, so producing can be lockless); and (2) via a ptr_ring, which takes a spin lock because the freeing can happen from any CPU; producer and consumer may run concurrently. Since the page pool code was added, Eric introduced a revised version of deferred skb freeing. TCP skbs are now usually returned to the CPU which allocated them, and freed in softirq context. This places the freeing (producing of pages back to the pool) enticingly close to the allocation (consumer). If we can prove that we're freeing in the same softirq context in which the consumer NAPI will run - lockless use of the cache is perfectly fine, no need for the lock. Let drivers link the page pool to a NAPI instance. If the NAPI instance is scheduled on the same CPU on which we're freeing - place the pages in the direct cache. With that and patched bnxt (XDP enabled to engage the page pool, sigh, bnxt really needs page pool work :() I see a 2.6% perf boost with a TCP stream test (app on a different physical core than softirq). The CPU use of relevant functions decreases as expected: page_pool_refill_alloc_cache 1.17% -> 0% _raw_spin_lock 2.41% -> 0.98% Only consider the lockless path to be safe when NAPI is scheduled - in practice this should cover the majority, if not all, of steady-state workloads. It's usually the NAPI kicking in that causes the skb flush. The main case we'll miss out on is when the application runs on the same CPU as NAPI. In that case we don't use the deferred skb free path. Reviewed-by: Tariq Toukan <[email protected]> Acked-by: Jesper Dangaard Brouer <[email protected]> Tested-by: Dragos Tatulea <[email protected]> Signed-off-by: Jakub Kicinski <[email protected]>
1 parent b07a2d9 commit 8c48eea

File tree

7 files changed

+37
-12
lines changed

7 files changed

+37
-12
lines changed

Documentation/networking/page_pool.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ Registration
165165
pp_params.pool_size = DESC_NUM;
166166
pp_params.nid = NUMA_NO_NODE;
167167
pp_params.dev = priv->dev;
168+
pp_params.napi = napi; /* only if locking is tied to NAPI */
168169
pp_params.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
169170
page_pool = page_pool_create(&pp_params);
170171

include/linux/netdevice.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,8 +360,11 @@ struct napi_struct {
360360
unsigned long gro_bitmask;
361361
int (*poll)(struct napi_struct *, int);
362362
#ifdef CONFIG_NETPOLL
363+
/* CPU actively polling if netpoll is configured */
363364
int poll_owner;
364365
#endif
366+
/* CPU on which NAPI has been scheduled for processing */
367+
int list_owner;
365368
struct net_device *dev;
366369
struct gro_list gro_hash[GRO_HASH_BUCKETS];
367370
struct sk_buff *skb;

include/linux/skbuff.h

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3386,6 +3386,18 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
33863386
__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
33873387
}
33883388

3389+
static inline void
3390+
napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
3391+
{
3392+
struct page *page = skb_frag_page(frag);
3393+
3394+
#ifdef CONFIG_PAGE_POOL
3395+
if (recycle && page_pool_return_skb_page(page, napi_safe))
3396+
return;
3397+
#endif
3398+
put_page(page);
3399+
}
3400+
33893401
/**
33903402
* __skb_frag_unref - release a reference on a paged fragment.
33913403
* @frag: the paged fragment
@@ -3396,13 +3408,7 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
33963408
*/
33973409
static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
33983410
{
3399-
struct page *page = skb_frag_page(frag);
3400-
3401-
#ifdef CONFIG_PAGE_POOL
3402-
if (recycle && page_pool_return_skb_page(page))
3403-
return;
3404-
#endif
3405-
put_page(page);
3411+
napi_frag_unref(frag, recycle, false);
34063412
}
34073413

34083414
/**

include/net/page_pool.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ struct page_pool_params {
7777
unsigned int pool_size;
7878
int nid; /* Numa node id to allocate pages from */
7979
struct device *dev; /* device, for DMA pre-mapping purposes */
80+
struct napi_struct *napi; /* Sole consumer of pages, otherwise NULL */
8081
enum dma_data_direction dma_dir; /* DMA mapping direction */
8182
unsigned int max_len; /* max DMA sync memory size */
8283
unsigned int offset; /* DMA addr offset */
@@ -239,7 +240,7 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool)
239240
return pool->p.dma_dir;
240241
}
241242

242-
bool page_pool_return_skb_page(struct page *page);
243+
bool page_pool_return_skb_page(struct page *page, bool napi_safe);
243244

244245
struct page_pool *page_pool_create(const struct page_pool_params *params);
245246

net/core/dev.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4359,6 +4359,7 @@ static inline void ____napi_schedule(struct softnet_data *sd,
43594359
}
43604360

43614361
list_add_tail(&napi->poll_list, &sd->poll_list);
4362+
WRITE_ONCE(napi->list_owner, smp_processor_id());
43624363
/* If not called from net_rx_action()
43634364
* we have to raise NET_RX_SOFTIRQ.
43644365
*/
@@ -6069,6 +6070,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
60696070
list_del_init(&n->poll_list);
60706071
local_irq_restore(flags);
60716072
}
6073+
WRITE_ONCE(n->list_owner, -1);
60726074

60736075
val = READ_ONCE(n->state);
60746076
do {
@@ -6384,6 +6386,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
63846386
#ifdef CONFIG_NETPOLL
63856387
napi->poll_owner = -1;
63866388
#endif
6389+
napi->list_owner = -1;
63876390
set_bit(NAPI_STATE_SCHED, &napi->state);
63886391
set_bit(NAPI_STATE_NPSVC, &napi->state);
63896392
list_add_rcu(&napi->dev_list, &dev->napi_list);

net/core/page_pool.c

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <linux/mm.h> /* for put_page() */
2020
#include <linux/poison.h>
2121
#include <linux/ethtool.h>
22+
#include <linux/netdevice.h>
2223

2324
#include <trace/events/page_pool.h>
2425

@@ -874,9 +875,11 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
874875
}
875876
EXPORT_SYMBOL(page_pool_update_nid);
876877

877-
bool page_pool_return_skb_page(struct page *page)
878+
bool page_pool_return_skb_page(struct page *page, bool napi_safe)
878879
{
880+
struct napi_struct *napi;
879881
struct page_pool *pp;
882+
bool allow_direct;
880883

881884
page = compound_head(page);
882885

@@ -892,12 +895,20 @@ bool page_pool_return_skb_page(struct page *page)
892895

893896
pp = page->pp;
894897

898+
/* Allow direct recycle if we have reasons to believe that we are
899+
* in the same context as the consumer would run, so there's
900+
* no possible race.
901+
*/
902+
napi = pp->p.napi;
903+
allow_direct = napi_safe && napi &&
904+
READ_ONCE(napi->list_owner) == smp_processor_id();
905+
895906
/* Driver set this to memory recycling info. Reset it on recycle.
896907
* This will *not* work for NIC using a split-page memory model.
897908
* The page will be returned to the pool here regardless of the
898909
* 'flipped' fragment being in use or not.
899910
*/
900-
page_pool_put_full_page(pp, page, false);
911+
page_pool_put_full_page(pp, page, allow_direct);
901912

902913
return true;
903914
}

net/core/skbuff.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -843,7 +843,7 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
843843
{
844844
if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
845845
return false;
846-
return page_pool_return_skb_page(virt_to_page(data));
846+
return page_pool_return_skb_page(virt_to_page(data), napi_safe);
847847
}
848848

849849
static void skb_kfree_head(void *head, unsigned int end_offset)
@@ -889,7 +889,7 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
889889
}
890890

891891
for (i = 0; i < shinfo->nr_frags; i++)
892-
__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
892+
napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);
893893

894894
free_head:
895895
if (shinfo->frag_list)

0 commit comments

Comments
 (0)