Commit 29f38ca

Author: Martin KaFai Lau (committed)
Merge branch 'Add new args into tcp_congestion_ops' cong_control'
Miao Xu says:

====================
This patchset attempts to add two new arguments into the hookpoint
cong_control in tcp_congestion_ops. The new arguments are inherited
from the caller tcp_cong_control and can be used by any bpf cc prog
that implements its own logic inside this hookpoint.

Please review. Thanks a lot!

Changelog
=====
v2->v3:
- Fixed the broken selftest caused by the new arguments.
- Renamed the selftest file name and bpf prog name.

v1->v2:
- Split the patchset into 3 separate patches.
- Added highlights in the selftest prog.
- Removed the dependency on bpf_tcp_helpers.h.
====================

Signed-off-by: Martin KaFai Lau <[email protected]>
2 parents f8c423d + 96c3490 commit 29f38ca
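In effect, any bpf cc prog implementing cong_control now receives the ACK sequence number and the tcp_input.c FLAG_* bits directly from the caller. A minimal sketch of the widened hook, not part of this commit — the prog name "bpf_sample_cc" and its empty body are hypothetical; the real in-tree example is bpf_cc_cubic.c in the diff below:

	// Minimal sketch, assuming a hypothetical cc named "bpf_sample_cc".
	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	SEC("struct_ops")
	void BPF_PROG(sample_cong_control, struct sock *sk, __u32 ack, int flag,
		      const struct rate_sample *rs)
	{
		/* `ack` and `flag` arrive from tcp_cong_control(), so the prog
		 * can gate cwnd updates on FLAG_* bits without re-deriving them.
		 */
	}

	SEC(".struct_ops")
	struct tcp_congestion_ops sample_cc = {
		/* required ops (.ssthresh, .undo_cwnd) omitted for brevity */
		.cong_control	= (void *)sample_cong_control,
		.name		= "bpf_sample_cc",
	};

	char _license[] SEC("license") = "GPL";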

File tree

8 files changed: +244 −7 lines

include/net/tcp.h

Lines changed: 1 addition & 1 deletion
@@ -1172,7 +1172,7 @@ struct tcp_congestion_ops {
 	/* call when packets are delivered to update cwnd and pacing rate,
 	 * after all the ca_state processing. (optional)
 	 */
-	void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
+	void (*cong_control)(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs);
 
 
 	/* new value of cwnd after loss (required) */

net/ipv4/bpf_tcp_ca.c

Lines changed: 5 additions & 1 deletion
@@ -107,6 +107,9 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
 	case offsetof(struct tcp_sock, snd_cwnd_cnt):
 		end = offsetofend(struct tcp_sock, snd_cwnd_cnt);
 		break;
+	case offsetof(struct tcp_sock, snd_cwnd_stamp):
+		end = offsetofend(struct tcp_sock, snd_cwnd_stamp);
+		break;
 	case offsetof(struct tcp_sock, snd_ssthresh):
 		end = offsetofend(struct tcp_sock, snd_ssthresh);
 		break;

@@ -307,7 +310,8 @@ static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk)
 	return 0;
 }
 
-static void bpf_tcp_ca_cong_control(struct sock *sk, const struct rate_sample *rs)
+static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag,
+				    const struct rate_sample *rs)
 {
 }

net/ipv4/tcp_bbr.c

Lines changed: 1 addition & 1 deletion
@@ -1024,7 +1024,7 @@ static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
 	bbr_update_gains(sk);
 }
 
-__bpf_kfunc static void bbr_main(struct sock *sk, const struct rate_sample *rs)
+__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs)
 {
 	struct bbr *bbr = inet_csk_ca(sk);
 	u32 bw;

net/ipv4/tcp_input.c

Lines changed: 1 addition & 1 deletion
@@ -3541,7 +3541,7 @@ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 
 	if (icsk->icsk_ca_ops->cong_control) {
-		icsk->icsk_ca_ops->cong_control(sk, rs);
+		icsk->icsk_ca_ops->cong_control(sk, ack, flag, rs);
 		return;
 	}

tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c

Lines changed: 24 additions & 0 deletions
@@ -14,6 +14,7 @@
 #include "tcp_ca_incompl_cong_ops.skel.h"
 #include "tcp_ca_unsupp_cong_op.skel.h"
 #include "tcp_ca_kfunc.skel.h"
+#include "bpf_cc_cubic.skel.h"
 
 #ifndef ENOTSUPP
 #define ENOTSUPP 524

@@ -452,6 +453,27 @@ static void test_tcp_ca_kfunc(void)
 	tcp_ca_kfunc__destroy(skel);
 }
 
+static void test_cc_cubic(void)
+{
+	struct bpf_cc_cubic *cc_cubic_skel;
+	struct bpf_link *link;
+
+	cc_cubic_skel = bpf_cc_cubic__open_and_load();
+	if (!ASSERT_OK_PTR(cc_cubic_skel, "bpf_cc_cubic__open_and_load"))
+		return;
+
+	link = bpf_map__attach_struct_ops(cc_cubic_skel->maps.cc_cubic);
+	if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) {
+		bpf_cc_cubic__destroy(cc_cubic_skel);
+		return;
+	}
+
+	do_test("bpf_cc_cubic", NULL);
+
+	bpf_link__destroy(link);
+	bpf_cc_cubic__destroy(cc_cubic_skel);
+}
+
 void test_bpf_tcp_ca(void)
 {
 	if (test__start_subtest("dctcp"))

@@ -482,4 +504,6 @@ void test_bpf_tcp_ca(void)
 		test_link_replace();
 	if (test__start_subtest("tcp_ca_kfunc"))
 		test_tcp_ca_kfunc();
+	if (test__start_subtest("cc_cubic"))
+		test_cc_cubic();
 }
tools/testing/selftests/bpf/progs/bpf_cc_cubic.c

Lines changed: 199 additions & 0 deletions
@@ -0,0 +1,199 @@ (new file; all lines added)
// SPDX-License-Identifier: GPL-2.0-only

/* Highlights:
 * 1. The major difference between this bpf program and tcp_cubic.c
 *    is that this bpf program relies on `cong_control` rather than
 *    `cong_avoid` in the struct tcp_congestion_ops.
 * 2. Logic such as tcp_cwnd_reduction, tcp_cong_avoid, and
 *    tcp_update_pacing_rate is bypassed when `cong_control` is
 *    defined, so that logic is moved into `cong_control` here.
 * 3. WARNING: This bpf program is NOT the same as tcp_cubic.c.
 *    The main purpose is to show use cases of the arguments in
 *    `cong_control`. For simplicity's sake, it reuses tcp cubic's
 *    kernel functions.
 */

#include "vmlinux.h"

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "bpf_tracing_net.h"

#define BPF_STRUCT_OPS(name, args...) \
SEC("struct_ops/"#name) \
BPF_PROG(name, args)

#define USEC_PER_SEC 1000000UL
#define TCP_PACING_SS_RATIO (200)
#define TCP_PACING_CA_RATIO (120)
#define TCP_REORDERING (12)

#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))
#define after(seq2, seq1) before(seq1, seq2)

extern void cubictcp_init(struct sock *sk) __ksym;
extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym;
extern __u32 cubictcp_recalc_ssthresh(struct sock *sk) __ksym;
extern void cubictcp_state(struct sock *sk, __u8 new_state) __ksym;
extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym;
extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym;
extern void cubictcp_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym;

static struct inet_connection_sock *inet_csk(const struct sock *sk)
{
	return (struct inet_connection_sock *)sk;
}

static struct tcp_sock *tcp_sk(const struct sock *sk)
{
	return (struct tcp_sock *)sk;
}

static bool before(__u32 seq1, __u32 seq2)
{
	return (__s32)(seq1 - seq2) < 0;
}

static __u64 div64_u64(__u64 dividend, __u64 divisor)
{
	return dividend / divisor;
}

static void tcp_update_pacing_rate(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	__u64 rate;

	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
	rate = (__u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);

	/* current rate is (cwnd * mss) / srtt
	 * In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
	 * In Congestion Avoidance phase, set it to 120 % the current rate.
	 *
	 * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
	 *	 If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
	 *	 end of slow start and should slow down.
	 */
	if (tp->snd_cwnd < tp->snd_ssthresh / 2)
		rate *= TCP_PACING_SS_RATIO;
	else
		rate *= TCP_PACING_CA_RATIO;

	rate *= max(tp->snd_cwnd, tp->packets_out);

	if (tp->srtt_us)
		rate = div64_u64(rate, (__u64)tp->srtt_us);

	sk->sk_pacing_rate = min(rate, sk->sk_max_pacing_rate);
}

static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
			       int newly_lost, int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int sndcnt = 0;
	__u32 pkts_in_flight = tp->packets_out - (tp->sacked_out + tp->lost_out) + tp->retrans_out;
	int delta = tp->snd_ssthresh - pkts_in_flight;

	if (newly_acked_sacked <= 0 || !tp->prior_cwnd)
		return;

	__u32 prr_delivered = tp->prr_delivered + newly_acked_sacked;

	if (delta < 0) {
		__u64 dividend =
			(__u64)tp->snd_ssthresh * prr_delivered + tp->prior_cwnd - 1;
		sndcnt = (__u32)div64_u64(dividend, (__u64)tp->prior_cwnd) - tp->prr_out;
	} else {
		sndcnt = max(prr_delivered - tp->prr_out, newly_acked_sacked);
		if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost)
			sndcnt++;
		sndcnt = min(delta, sndcnt);
	}
	/* Force a fast retransmit upon entering fast recovery */
	sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
	tp->snd_cwnd = pkts_in_flight + sndcnt;
}

/* Decide whether to run the increase function of congestion control. */
static bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
{
	if (tcp_sk(sk)->reordering > TCP_REORDERING)
		return flag & FLAG_FORWARD_PROGRESS;

	return flag & FLAG_DATA_ACKED;
}

void BPF_STRUCT_OPS(bpf_cubic_init, struct sock *sk)
{
	cubictcp_init(sk);
}

void BPF_STRUCT_OPS(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event)
{
	cubictcp_cwnd_event(sk, event);
}

void BPF_STRUCT_OPS(bpf_cubic_cong_control, struct sock *sk, __u32 ack, int flag,
		    const struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (((1<<TCP_CA_CWR) | (1<<TCP_CA_Recovery)) &
	    (1 << inet_csk(sk)->icsk_ca_state)) {
		/* Reduce cwnd if state mandates */
		tcp_cwnd_reduction(sk, rs->acked_sacked, rs->losses, flag);

		if (!before(tp->snd_una, tp->high_seq)) {
			/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
			if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
			    inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) {
				tp->snd_cwnd = tp->snd_ssthresh;
				tp->snd_cwnd_stamp = tcp_jiffies32;
			}
		}
	} else if (tcp_may_raise_cwnd(sk, flag)) {
		/* Advance cwnd if state allows */
		cubictcp_cong_avoid(sk, ack, rs->acked_sacked);
		tp->snd_cwnd_stamp = tcp_jiffies32;
	}

	tcp_update_pacing_rate(sk);
}

__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk)
{
	return cubictcp_recalc_ssthresh(sk);
}

void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state)
{
	cubictcp_state(sk, new_state);
}

void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk,
		    const struct ack_sample *sample)
{
	cubictcp_acked(sk, sample);
}

__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk)
{
	return tcp_reno_undo_cwnd(sk);
}

SEC(".struct_ops")
struct tcp_congestion_ops cc_cubic = {
	.init		= (void *)bpf_cubic_init,
	.ssthresh	= (void *)bpf_cubic_recalc_ssthresh,
	.cong_control	= (void *)bpf_cubic_cong_control,
	.set_state	= (void *)bpf_cubic_state,
	.undo_cwnd	= (void *)bpf_cubic_undo_cwnd,
	.cwnd_event	= (void *)bpf_cubic_cwnd_event,
	.pkts_acked	= (void *)bpf_cubic_acked,
	.name		= "bpf_cc_cubic",
};

char _license[] SEC("license") = "GPL";
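For completeness: once the struct_ops map above is attached (as test_cc_cubic does via bpf_map__attach_struct_ops), a socket can opt into the cc by name. A user-space sketch under assumptions not in this commit — the helper name is ours, and selecting a non-default cc generally requires CAP_NET_ADMIN:

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <string.h>
	#include <sys/socket.h>

	/* Switch one TCP socket to the cc registered above; "bpf_cc_cubic"
	 * matches the .name field of the struct_ops map.
	 */
	static int use_bpf_cc_cubic(int fd)
	{
		const char name[] = "bpf_cc_cubic";

		return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, strlen(name));
	}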

tools/testing/selftests/bpf/progs/bpf_tracing_net.h

Lines changed: 10 additions & 0 deletions
@@ -80,6 +80,14 @@
 #define TCP_INFINITE_SSTHRESH	0x7fffffff
 #define TCP_PINGPONG_THRESH	3
 
+#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
+#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
+#define FLAG_DATA_SACKED 0x20 /* New SACK. */
+#define FLAG_SND_UNA_ADVANCED \
+	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
+#define FLAG_ACKED (FLAG_DATA_ACKED | FLAG_SYN_ACKED)
+#define FLAG_FORWARD_PROGRESS (FLAG_ACKED | FLAG_DATA_SACKED)
+
 #define fib_nh_dev nh_common.nhc_dev
 #define fib_nh_gw_family nh_common.nhc_gw_family
 #define fib_nh_gw6 nh_common.nhc_gw.ipv6

@@ -119,4 +127,6 @@
 #define tw_v6_daddr __tw_common.skc_v6_daddr
 #define tw_v6_rcv_saddr __tw_common.skc_v6_rcv_saddr
 
+#define tcp_jiffies32 ((__u32)bpf_jiffies64())
+
 #endif

tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c

Lines changed: 3 additions & 3 deletions
@@ -5,7 +5,7 @@
 #include <bpf/bpf_tracing.h>
 
 extern void bbr_init(struct sock *sk) __ksym;
-extern void bbr_main(struct sock *sk, const struct rate_sample *rs) __ksym;
+extern void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) __ksym;
 extern u32 bbr_sndbuf_expand(struct sock *sk) __ksym;
 extern u32 bbr_undo_cwnd(struct sock *sk) __ksym;
 extern void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym;

@@ -42,9 +42,9 @@ void BPF_PROG(in_ack_event, struct sock *sk, u32 flags)
 }
 
 SEC("struct_ops/cong_control")
-void BPF_PROG(cong_control, struct sock *sk, const struct rate_sample *rs)
+void BPF_PROG(cong_control, struct sock *sk, u32 ack, int flag, const struct rate_sample *rs)
 {
-	bbr_main(sk, rs);
+	bbr_main(sk, ack, flag, rs);
 }
 
 SEC("struct_ops/cong_avoid")
