
Commit 191c1ca

minaNipaLocal authored and NipaLocal committed
net: devmem: Implement TX path
Augment the dmabuf binding to be able to handle TX. In addition to the RX binding setup, we also create the tx_vec needed for the TX path.

Provide an API for sendmsg to be able to send dmabufs bound to this device:

- Provide a new dmabuf_tx_cmsg which includes the dmabuf to send from.
- MSG_ZEROCOPY with the SCM_DEVMEM_DMABUF cmsg indicates a send from the dma-buf.

Devmem is uncopyable, so piggyback off the existing MSG_ZEROCOPY implementation, while disabling the cases where MSG_ZEROCOPY falls back to copying.

We additionally pipe the binding down to the new zerocopy_fill_skb_from_devmem, which fills a TX skb with net_iov netmems instead of the traditional page netmems.

We also special-case skb_frag_dma_map to return the dma-address of these dmabuf net_iovs instead of attempting to map pages.

Based on work by Stanislav Fomichev <[email protected]>. A lot of the meat of the implementation came from devmem TCP RFC v1 [1], which included the TX path, but Stan did all the rebasing on top of netmem/net_iov.

Cc: Stanislav Fomichev <[email protected]>
Signed-off-by: Kaiyuan Zhang <[email protected]>
Signed-off-by: Mina Almasry <[email protected]>
Acked-by: Stanislav Fomichev <[email protected]>
Signed-off-by: NipaLocal <nipa@local>
1 parent 5bb1d18 commit 191c1ca

File tree

12 files changed: +308, -43 lines

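Before the per-file diff, here is a minimal userspace sketch of the sendmsg flow this commit enables. The kernel side below only shows that the sockcm cookie carries a u32 dmabuf_id and that iov_base values are interpreted as byte offsets into the bound dma-buf; the exact dmabuf_tx_cmsg layout and the SOL_SOCKET/SCM_DEVMEM_DMABUF cmsg level/type used here are assumptions for illustration, not taken from this page.

/* Hedged sketch: send bytes [0, 4096) of a dma-buf previously bound to the
 * device for TX, identified by `dmabuf_id`.
 * Assumptions (not shown in this commit page): dmabuf_tx_cmsg carries a
 * single __u32 dmabuf_id and is passed at SOL_SOCKET / SCM_DEVMEM_DMABUF.
 */
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/types.h>

struct dmabuf_tx_cmsg {		/* assumed layout */
	__u32 dmabuf_id;
};

static int send_devmem(int fd, __u32 dmabuf_id)
{
	/* iov_base is a byte offset into the dma-buf, not a pointer. */
	struct iovec iov = { .iov_base = (void *)0, .iov_len = 4096 };
	char control[CMSG_SPACE(sizeof(struct dmabuf_tx_cmsg))] = {};
	struct dmabuf_tx_cmsg ddmabuf = { .dmabuf_id = dmabuf_id };
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = control,
		.msg_controllen = sizeof(control),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_DEVMEM_DMABUF;	/* assumed level/type */
	cmsg->cmsg_len = CMSG_LEN(sizeof(ddmabuf));
	memcpy(CMSG_DATA(cmsg), &ddmabuf, sizeof(ddmabuf));

	/* Devmem TX piggybacks on MSG_ZEROCOPY; the copy fallback is disabled. */
	return sendmsg(fd, &msg, MSG_ZEROCOPY);
}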

include/linux/skbuff.h

Lines changed: 13 additions & 4 deletions
@@ -1713,26 +1713,31 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset)
 extern const struct ubuf_info_ops msg_zerocopy_ubuf_ops;
 
 struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
-                                       struct ubuf_info *uarg);
+                                       struct ubuf_info *uarg, bool devmem);
 
 void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
 
+struct net_devmem_dmabuf_binding;
+
 int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
                             struct sk_buff *skb, struct iov_iter *from,
-                            size_t length);
+                            size_t length,
+                            struct net_devmem_dmabuf_binding *binding);
 
 int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
                                 struct iov_iter *from, size_t length);
 
 static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
                                           struct msghdr *msg, int len)
 {
-        return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len);
+        return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len,
+                                       NULL);
 }
 
 int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
                              struct msghdr *msg, int len,
-                             struct ubuf_info *uarg);
+                             struct ubuf_info *uarg,
+                             struct net_devmem_dmabuf_binding *binding);
 
 /* Internal */
 #define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB)))
@@ -3703,6 +3708,10 @@ static inline dma_addr_t __skb_frag_dma_map(struct device *dev,
                                             size_t offset, size_t size,
                                             enum dma_data_direction dir)
 {
+        if (skb_frag_is_net_iov(frag)) {
+                return netmem_to_net_iov(frag->netmem)->dma_addr + offset +
+                       frag->offset;
+        }
         return dma_map_page(dev, skb_frag_page(frag),
                             skb_frag_off(frag) + offset, size, dir);
 }

include/net/sock.h

Lines changed: 1 addition & 0 deletions
@@ -1814,6 +1814,7 @@ struct sockcm_cookie {
         u32 tsflags;
         u32 ts_opt_id;
         u32 priority;
+        u32 dmabuf_id;
 };
 
 static inline void sockcm_init(struct sockcm_cookie *sockc,

net/core/datagram.c

Lines changed: 46 additions & 2 deletions
@@ -63,6 +63,8 @@
 #include <net/busy_poll.h>
 #include <crypto/hash.h>
 
+#include "devmem.h"
+
 /*
  *      Is a socket 'connection oriented' ?
  */
@@ -692,16 +694,58 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
         return 0;
 }
 
+static int
+zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from,
+                              int length,
+                              struct net_devmem_dmabuf_binding *binding)
+{
+        int i = skb_shinfo(skb)->nr_frags;
+        size_t virt_addr, size, off;
+        struct net_iov *niov;
+
+        /* Devmem filling works by taking an IOVEC from the user where the
+         * iov_addrs are interpreted as an offset in bytes into the dma-buf to
+         * send from. We do not support other iter types.
+         */
+        if (iov_iter_type(from) != ITER_IOVEC)
+                return -EFAULT;
+
+        while (length && iov_iter_count(from)) {
+                if (i == MAX_SKB_FRAGS)
+                        return -EMSGSIZE;
+
+                virt_addr = (size_t)iter_iov_addr(from);
+                niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size);
+                if (!niov)
+                        return -EFAULT;
+
+                size = min_t(size_t, size, length);
+                size = min_t(size_t, size, iter_iov_len(from));
+
+                get_netmem(net_iov_to_netmem(niov));
+                skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off,
+                                       size, PAGE_SIZE);
+                iov_iter_advance(from, size);
+                length -= size;
+                i++;
+        }
+
+        return 0;
+}
+
 int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
                             struct sk_buff *skb, struct iov_iter *from,
-                            size_t length)
+                            size_t length,
+                            struct net_devmem_dmabuf_binding *binding)
 {
         unsigned long orig_size = skb->truesize;
         unsigned long truesize;
         int ret;
 
         if (msg && msg->msg_ubuf && msg->sg_from_iter)
                 ret = msg->sg_from_iter(skb, from, length);
+        else if (unlikely(binding))
+                ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding);
         else
                 ret = zerocopy_fill_skb_from_iter(skb, from, length);
 
@@ -735,7 +779,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
         if (skb_copy_datagram_from_iter(skb, 0, from, copy))
                 return -EFAULT;
 
-        return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U);
+        return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL);
 }
 EXPORT_SYMBOL(zerocopy_sg_from_iter);

net/core/devmem.c

Lines changed: 92 additions & 7 deletions
@@ -17,6 +17,7 @@
 #include <net/netdev_rx_queue.h>
 #include <net/page_pool/helpers.h>
 #include <net/page_pool/memory_provider.h>
+#include <net/sock.h>
 #include <trace/events/page_pool.h>
 
 #include "devmem.h"
@@ -73,8 +74,10 @@ void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding)
         dma_buf_detach(binding->dmabuf, binding->attachment);
         dma_buf_put(binding->dmabuf);
         xa_destroy(&binding->bound_rxqs);
+        kvfree(binding->tx_vec);
         kfree(binding);
 }
+EXPORT_SYMBOL(__net_devmem_dmabuf_binding_free);
 
 struct net_iov *
 net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
@@ -119,6 +122,13 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
         unsigned long xa_idx;
         unsigned int rxq_idx;
 
+        xa_erase(&net_devmem_dmabuf_bindings, binding->id);
+
+        /* Ensure no tx net_devmem_lookup_dmabuf() are in flight after the
+         * erase.
+         */
+        synchronize_net();
+
         if (binding->list.next)
                 list_del(&binding->list);
 
@@ -133,8 +143,6 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
                 WARN_ON(netdev_rx_queue_restart(binding->dev, rxq_idx));
         }
 
-        xa_erase(&net_devmem_dmabuf_bindings, binding->id);
-
         net_devmem_dmabuf_binding_put(binding);
 }
 
@@ -197,8 +205,9 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
 }
 
 struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
-                       struct netlink_ext_ack *extack)
+net_devmem_bind_dmabuf(struct net_device *dev,
+                       enum dma_data_direction direction,
+                       unsigned int dmabuf_fd, struct netlink_ext_ack *extack)
 {
         struct net_devmem_dmabuf_binding *binding;
         static u32 id_alloc_next;
@@ -241,7 +250,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
         }
 
         binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment,
-                                                       DMA_FROM_DEVICE);
+                                                       direction);
         if (IS_ERR(binding->sgt)) {
                 err = PTR_ERR(binding->sgt);
                 NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment");
@@ -252,13 +261,23 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
          * binding can be much more flexible than that. We may be able to
          * allocate MTU sized chunks here. Leave that for future work...
          */
-        binding->chunk_pool =
-                gen_pool_create(PAGE_SHIFT, dev_to_node(&dev->dev));
+        binding->chunk_pool = gen_pool_create(PAGE_SHIFT,
+                                              dev_to_node(&dev->dev));
         if (!binding->chunk_pool) {
                 err = -ENOMEM;
                 goto err_unmap;
         }
 
+        if (direction == DMA_TO_DEVICE) {
+                binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
+                                                 sizeof(struct net_iov *),
+                                                 GFP_KERNEL);
+                if (!binding->tx_vec) {
+                        err = -ENOMEM;
+                        goto err_free_chunks;
+                }
+        }
+
         virtual = 0;
         for_each_sgtable_dma_sg(binding->sgt, sg, sg_idx) {
                 dma_addr_t dma_addr = sg_dma_address(sg);
@@ -300,6 +319,8 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
                         niov->owner = &owner->area;
                         page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov),
                                                       net_devmem_get_dma_addr(niov));
+                        if (direction == DMA_TO_DEVICE)
+                                binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov;
                 }
 
                 virtual += len;
@@ -311,6 +332,8 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
         gen_pool_for_each_chunk(binding->chunk_pool,
                                 net_devmem_dmabuf_free_chunk_owner, NULL);
         gen_pool_destroy(binding->chunk_pool);
+
+        kvfree(binding->tx_vec);
 err_unmap:
         dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt,
                                           DMA_FROM_DEVICE);
@@ -325,6 +348,21 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
         return ERR_PTR(err);
 }
 
+struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id)
+{
+        struct net_devmem_dmabuf_binding *binding;
+
+        rcu_read_lock();
+        binding = xa_load(&net_devmem_dmabuf_bindings, id);
+        if (binding) {
+                if (!net_devmem_dmabuf_binding_get(binding))
+                        binding = NULL;
+        }
+        rcu_read_unlock();
+
+        return binding;
+}
+
 void net_devmem_get_net_iov(struct net_iov *niov)
 {
         net_devmem_dmabuf_binding_get(net_devmem_iov_binding(niov));
@@ -335,6 +373,53 @@ void net_devmem_put_net_iov(struct net_iov *niov)
         net_devmem_dmabuf_binding_put(net_devmem_iov_binding(niov));
 }
 
+struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk,
+                                                         unsigned int dmabuf_id)
+{
+        struct net_devmem_dmabuf_binding *binding;
+        struct dst_entry *dst = __sk_dst_get(sk);
+        int err = 0;
+
+        binding = net_devmem_lookup_dmabuf(dmabuf_id);
+        if (!binding || !binding->tx_vec) {
+                err = -EINVAL;
+                goto out_err;
+        }
+
+        /* The dma-addrs in this binding are only reachable to the corresponding
+         * net_device.
+         */
+        if (!dst || !dst->dev || dst->dev->ifindex != binding->dev->ifindex) {
+                err = -ENODEV;
+                goto out_err;
+        }
+
+        return binding;
+
+out_err:
+        if (binding)
+                net_devmem_dmabuf_binding_put(binding);
+
+        return ERR_PTR(err);
+}
+
+struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding,
+                       size_t virt_addr, size_t *off, size_t *size)
+{
+        size_t idx;
+
+        if (virt_addr >= binding->dmabuf->size)
+                return NULL;
+
+        idx = virt_addr / PAGE_SIZE;
+
+        *off = virt_addr % PAGE_SIZE;
+        *size = PAGE_SIZE - *off;
+
+        return binding->tx_vec[idx];
+}
+
 /*** "Dmabuf devmem memory provider" ***/
 
 int mp_dmabuf_devmem_init(struct page_pool *pool)
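
The actual sendmsg call sites are among the 12 changed files but not shown on this page. As a rough, assumed sketch of how the helpers above fit together on the TX path (function names are from this diff; the call site itself is hypothetical):

/* Assumed call site: resolve the dmabuf_id taken from the SCM_DEVMEM_DMABUF
 * cmsg into a binding, then let the zerocopy path fill the skb with net_iov
 * frags instead of user pages.
 */
static int devmem_tx_fill(struct sock *sk, struct sk_buff *skb,
                          struct msghdr *msg, size_t len, u32 dmabuf_id)
{
        struct net_devmem_dmabuf_binding *binding;
        int err;

        /* Takes a reference via net_devmem_lookup_dmabuf() and checks the
         * socket's egress device against the binding's device.
         */
        binding = net_devmem_get_binding(sk, dmabuf_id);
        if (IS_ERR(binding))
                return PTR_ERR(binding);

        /* iov_base values in msg are byte offsets into the dma-buf; each one
         * is resolved to a net_iov by net_devmem_get_niov_at() inside
         * zerocopy_fill_skb_from_devmem().
         */
        err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len,
                                      binding);

        net_devmem_dmabuf_binding_put(binding);
        return err;
}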
