
Commit 20a6d91

Authored and committed by Alexei Starovoitov
Merge branch 'sockmap/sk_skb program memory acct fixes'
John Fastabend says:

====================
Users of sockmap and skmsg trying to build proxies and other tools have pointed out to me that the error handling can be problematic. If the proxy is under-provisioned and/or the BPF admin does not have the ability to update/modify memory provisions on the sockets, it's possible data may be dropped. For some things we have retries so everything works out OK, but for most things this is likely not great, and things go bad.

The original design dropped memory accounting on the receive socket as early as possible: we did this early in sk_skb handling and then charged the skb to the redirect socket immediately after running the BPF program. But this design caused a fundamental problem: what should we do if we redirect to a socket that has already reached its socket memory limits? For proxy use cases the network admin can tune memory limits, but in general we punted on this problem and told folks to simply make their memory limits high enough to handle their workload. This is not a good answer. When deploying into environments where we expect this to be transparent, it no longer is, because we need to tune params. In fact it is really only viable in cases where we have fine-grained control over the application, for example a proxy redirecting from an ingress socket to an egress socket. The result is I get bug reports, because it is surprising for one, but more importantly it also breaks some use cases. So let's fix it.

This series cleans up the different cases so that in many common modes, such as passing a packet up to the receive socket, we can simply rely on the underlying assumption that the TCP stack has already done the memory accounting. Next, instead of trying to do memory accounting against the socket we plan to redirect into, we keep memory accounting on the receive socket until the skb can be put on the redirect socket. This means that if we do an egress redirect to a socket that is not sock_writeable(), we get -EAGAIN and can requeue the skb on the workqueue and try again. The same scenario plays out for ingress: if the skb cannot be put on the receive queue of the redirect socket, then we simply requeue and retry. In both cases memory is still accounted against the receiving socket.

This also handles head-of-line blocking. With the above scheme the skb is on a queue associated with the socket it will be sent/received on, but the memory accounting is against the receive socket. This means the receive socket can advance to the next skb and avoid head-of-line blocking, at least until its receive memory runs out. This puts a maximum size on the amount of data any socket can enqueue, giving us bounds on the skb lists so they can't grow indefinitely.

Overall I think this is a win. Tested with test_sockmap.

These are fixes, but I tagged the series for bpf-next considering we are at -rc8.

v1->v2: Fix uninitialized/unused variables (kernel test robot)
v2->v3: fix typo in patch 2: err=0 needs to be <0, so use err=-EIO
====================

Signed-off-by: Alexei Starovoitov <[email protected]>
Merge of 2 parents: ebb034b + 0b17ad2
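The core scheme from the cover letter -- keep the memory charge on the receiving socket until the redirect target actually accepts the data, and requeue on -EAGAIN instead of dropping -- can be illustrated with a small model. The following is a minimal userspace sketch, not kernel code: struct sock_model, try_deliver(), and the counters are hypothetical stand-ins for the sk_rmem_alloc/sk_rcvbuf accounting the patches manipulate.

#include <errno.h>
#include <stdio.h>

struct sock_model {
	int rmem_alloc;	/* bytes currently charged to this socket */
	int rcvbuf;	/* receive-buffer limit */
};

/* Try to hand @len bytes to @dst; refuse with -EAGAIN when over limit. */
static int try_deliver(struct sock_model *dst, int len)
{
	if (dst->rmem_alloc + len > dst->rcvbuf)
		return -EAGAIN;
	dst->rmem_alloc += len;
	return 0;
}

int main(void)
{
	struct sock_model rx  = { .rmem_alloc = 256,  .rcvbuf = 4096 };
	struct sock_model dst = { .rmem_alloc = 4000, .rcvbuf = 4096 };
	int len = 256;

	/* The skb's memory stays charged to rx until dst accepts it. */
	if (try_deliver(&dst, len) == -EAGAIN) {
		/* Kernel analogue: requeue on the workqueue and retry;
		 * rx keeps the charge, which bounds how much can queue up. */
		printf("target full, requeued (rx still charged %d)\n",
		       rx.rmem_alloc);
		dst.rmem_alloc = 0;	/* pretend the reader drained dst */
		try_deliver(&dst, len);
	}
	rx.rmem_alloc -= len;	/* charge moves off rx only after delivery */
	printf("delivered, rx now charged %d\n", rx.rmem_alloc);
	return 0;
}

The only point of the model is the order of operations: the receive-side charge is released after, not before, the destination accepts the data, which is what keeps the skb lists bounded.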

File tree: 1 file changed (+45, -38 lines)


net/core/skmsg.c

Lines changed: 45 additions & 38 deletions
@@ -433,10 +433,12 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
 static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
 			       u32 off, u32 len, bool ingress)
 {
-	if (ingress)
-		return sk_psock_skb_ingress(psock, skb);
-	else
+	if (!ingress) {
+		if (!sock_writeable(psock->sk))
+			return -EAGAIN;
 		return skb_send_sock_locked(psock->sk, skb, off, len);
+	}
+	return sk_psock_skb_ingress(psock, skb);
 }

 static void sk_psock_backlog(struct work_struct *work)
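The !sock_writeable() gate above converts a would-be drop into -EAGAIN, which the backlog worker can treat as "leave the skb on the queue and retry later". A minimal stand-alone sketch of that control flow, assuming writable() and the counters as illustrative stand-ins rather than kernel APIs:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for sock_writeable(): can the destination take more data? */
static bool writable(int queued, int limit)
{
	return queued < limit;
}

/* Mirrors the shape of sk_psock_handle_skb() above: egress bails out
 * early with -EAGAIN instead of sending into a full socket; ingress
 * falls through to local delivery. */
static int handle_skb(bool ingress, int queued, int limit)
{
	if (!ingress) {
		if (!writable(queued, limit))
			return -EAGAIN;	/* caller requeues, retries later */
		return 0;		/* kernel: skb_send_sock_locked() */
	}
	return 0;			/* kernel: sk_psock_skb_ingress() */
}

int main(void)
{
	printf("full dest -> %d (EAGAIN=%d)\n",
	       handle_skb(false, 4096, 4096), -EAGAIN);
	printf("drained   -> %d\n", handle_skb(false, 0, 4096));
	return 0;
}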
@@ -682,19 +684,8 @@ EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
 static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
 			    struct sk_buff *skb)
 {
-	int ret;
-
-	skb->sk = psock->sk;
 	bpf_compute_data_end_sk_skb(skb);
-	ret = bpf_prog_run_pin_on_cpu(prog, skb);
-	/* strparser clones the skb before handing it to a upper layer,
-	 * meaning skb_orphan has been called. We NULL sk on the way out
-	 * to ensure we don't trigger a BUG_ON() in skb/sk operations
-	 * later and because we are not charging the memory of this skb
-	 * to any socket yet.
-	 */
-	skb->sk = NULL;
-	return ret;
+	return bpf_prog_run_pin_on_cpu(prog, skb);
 }

 static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
@@ -709,38 +700,35 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
 {
 	struct sk_psock *psock_other;
 	struct sock *sk_other;
-	bool ingress;

 	sk_other = tcp_skb_bpf_redirect_fetch(skb);
+	/* This error is a buggy BPF program, it returned a redirect
+	 * return code, but then didn't set a redirect interface.
+	 */
 	if (unlikely(!sk_other)) {
 		kfree_skb(skb);
 		return;
 	}
 	psock_other = sk_psock(sk_other);
+	/* This error indicates the socket is being torn down or had another
+	 * error that caused the pipe to break. We can't send a packet on
+	 * a socket that is in this state so we drop the skb.
+	 */
 	if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
 	    !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
 		kfree_skb(skb);
 		return;
 	}

-	ingress = tcp_skb_bpf_ingress(skb);
-	if ((!ingress && sock_writeable(sk_other)) ||
-	    (ingress &&
-	     atomic_read(&sk_other->sk_rmem_alloc) <=
-	     sk_other->sk_rcvbuf)) {
-		if (!ingress)
-			skb_set_owner_w(skb, sk_other);
-		skb_queue_tail(&psock_other->ingress_skb, skb);
-		schedule_work(&psock_other->work);
-	} else {
-		kfree_skb(skb);
-	}
+	skb_queue_tail(&psock_other->ingress_skb, skb);
+	schedule_work(&psock_other->work);
 }

-static void sk_psock_tls_verdict_apply(struct sk_buff *skb, int verdict)
+static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict)
 {
 	switch (verdict) {
 	case __SK_REDIRECT:
+		skb_set_owner_r(skb, sk);
 		sk_psock_skb_redirect(skb);
 		break;
 	case __SK_PASS:
@@ -758,11 +746,17 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
 	rcu_read_lock();
 	prog = READ_ONCE(psock->progs.skb_verdict);
 	if (likely(prog)) {
+		/* We skip full set_owner_r here because if we do a SK_PASS
+		 * or SK_DROP we can skip skb memory accounting and use the
+		 * TLS context.
+		 */
+		skb->sk = psock->sk;
 		tcp_skb_bpf_redirect_clear(skb);
 		ret = sk_psock_bpf_run(psock, prog, skb);
 		ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+		skb->sk = NULL;
 	}
-	sk_psock_tls_verdict_apply(skb, ret);
+	sk_psock_tls_verdict_apply(skb, psock->sk, ret);
 	rcu_read_unlock();
 	return ret;
 }
@@ -771,7 +765,9 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
 static void sk_psock_verdict_apply(struct sk_psock *psock,
 				   struct sk_buff *skb, int verdict)
 {
+	struct tcp_skb_cb *tcp;
 	struct sock *sk_other;
+	int err = -EIO;

 	switch (verdict) {
 	case __SK_PASS:
@@ -780,16 +776,24 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
 		    !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
 			goto out_free;
 		}
-		if (atomic_read(&sk_other->sk_rmem_alloc) <=
-		    sk_other->sk_rcvbuf) {
-			struct tcp_skb_cb *tcp = TCP_SKB_CB(skb);

-			tcp->bpf.flags |= BPF_F_INGRESS;
+		tcp = TCP_SKB_CB(skb);
+		tcp->bpf.flags |= BPF_F_INGRESS;
+
+		/* If the queue is empty then we can submit directly
+		 * into the msg queue. If its not empty we have to
+		 * queue work otherwise we may get OOO data. Otherwise,
+		 * if sk_psock_skb_ingress errors will be handled by
+		 * retrying later from workqueue.
+		 */
+		if (skb_queue_empty(&psock->ingress_skb)) {
+			err = sk_psock_skb_ingress(psock, skb);
+		}
+		if (err < 0) {
 			skb_queue_tail(&psock->ingress_skb, skb);
 			schedule_work(&psock->work);
-			break;
 		}
-		goto out_free;
+		break;
 	case __SK_REDIRECT:
 		sk_psock_skb_redirect(skb);
 		break;
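The comment in this hunk carries the ordering argument: ingressing directly while older skbs still sit on ingress_skb would let new data overtake queued data, so direct delivery is only attempted when the backlog is empty, and any failure (err < 0) falls back to queueing for the workqueue. A minimal sketch of that rule, assuming a toy FIFO with hypothetical names (struct fifo, deliver(), pass()):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define QCAP 8

struct fifo {
	int buf[QCAP];
	int head, tail;
};

static bool fifo_empty(const struct fifo *q) { return q->head == q->tail; }
static void fifo_push(struct fifo *q, int v) { q->buf[q->tail++ % QCAP] = v; }

/* Stand-in for sk_psock_skb_ingress(): fails while the receiver is full. */
static int deliver(int v, bool receiver_full)
{
	(void)v;	/* payload not needed in the model */
	return receiver_full ? -EAGAIN : 0;
}

/* Mirrors the __SK_PASS logic above: only try direct delivery when the
 * backlog is empty; anything that can't go direct is queued in order so
 * later data never overtakes earlier data. */
static void pass(struct fifo *q, int v, bool receiver_full)
{
	int err = -EIO;

	if (fifo_empty(q))
		err = deliver(v, receiver_full);
	if (err < 0) {
		fifo_push(q, v);	/* kernel: skb_queue_tail + schedule_work */
		printf("queued %d\n", v);
	} else {
		printf("delivered %d directly\n", v);
	}
}

int main(void)
{
	struct fifo q = { .head = 0, .tail = 0 };

	pass(&q, 1, true);	/* receiver full: queued for retry */
	pass(&q, 2, false);	/* queue non-empty: also queued, keeps order */
	return 0;
}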
@@ -814,9 +818,9 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
 		kfree_skb(skb);
 		goto out;
 	}
+	skb_set_owner_r(skb, sk);
 	prog = READ_ONCE(psock->progs.skb_verdict);
 	if (likely(prog)) {
-		skb_orphan(skb);
 		tcp_skb_bpf_redirect_clear(skb);
 		ret = sk_psock_bpf_run(psock, prog, skb);
 		ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
@@ -839,8 +843,11 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)

 	rcu_read_lock();
 	prog = READ_ONCE(psock->progs.skb_parser);
-	if (likely(prog))
+	if (likely(prog)) {
+		skb->sk = psock->sk;
 		ret = sk_psock_bpf_run(psock, prog, skb);
+		skb->sk = NULL;
+	}
 	rcu_read_unlock();
 	return ret;
 }
