// SPDX-License-Identifier: GPL-2.0-only

/* Highlights:
 * 1. The major difference between this bpf program and tcp_cubic.c
 *    is that this bpf program relies on `cong_control` rather than
 *    `cong_avoid` in the struct tcp_congestion_ops.
 * 2. Logic such as tcp_cwnd_reduction, tcp_cong_avoid, and
 *    tcp_update_pacing_rate is bypassed when `cong_control` is
 *    defined, so that logic is reimplemented here and invoked from
 *    `cong_control`.
 * 3. WARNING: This bpf program is NOT the same as tcp_cubic.c.
 *    The main purpose is to show use cases of the arguments in
 *    `cong_control`. For simplicity's sake, it reuses tcp cubic's
 *    kernel functions.
 */

#include "vmlinux.h"

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "bpf_tracing_net.h"

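/* Convenience wrapper: expands to SEC("struct_ops/<name>") followed by
 * BPF_PROG(), so each callback below becomes its own struct_ops program.
 */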
#define BPF_STRUCT_OPS(name, args...) \
SEC("struct_ops/"#name) \
BPF_PROG(name, args)

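/* Local copies of kernel constants that vmlinux.h cannot provide (BTF does
 * not carry macros). The pacing ratios follow the kernel's defaults: pace at
 * 200% of the current rate in slow start and 120% in congestion avoidance.
 * TCP_REORDERING is the reordering threshold used by tcp_may_raise_cwnd()
 * below.
 */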
#define USEC_PER_SEC 1000000UL
#define TCP_PACING_SS_RATIO (200)
#define TCP_PACING_CA_RATIO (120)
#define TCP_REORDERING (12)

#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))
#define after(seq2, seq1) before(seq1, seq2)

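/* The kernel's built-in CUBIC helpers are exposed to BPF as kfuncs (__ksym),
 * so this program can delegate the actual CUBIC math to the in-kernel
 * implementation instead of reimplementing it.
 */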
extern void cubictcp_init(struct sock *sk) __ksym;
extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym;
extern __u32 cubictcp_recalc_ssthresh(struct sock *sk) __ksym;
extern void cubictcp_state(struct sock *sk, __u8 new_state) __ksym;
extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym;
extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym;
extern void cubictcp_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym;

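/* Small local stand-ins for kernel helpers this program cannot call directly:
 * inet_csk()/tcp_sk() are the usual container casts (struct tcp_sock embeds
 * struct inet_connection_sock, which in turn begins with struct sock, so the
 * cast is layout-safe), before() is the wrapping sequence-number comparison,
 * and div64_u64() reduces to plain division on BPF's 64-bit ISA.
 */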
static struct inet_connection_sock *inet_csk(const struct sock *sk)
{
	return (struct inet_connection_sock *)sk;
}

static struct tcp_sock *tcp_sk(const struct sock *sk)
{
	return (struct tcp_sock *)sk;
}

static bool before(__u32 seq1, __u32 seq2)
{
	return (__s32)(seq1-seq2) < 0;
}

static __u64 div64_u64(__u64 dividend, __u64 divisor)
{
	return dividend / divisor;
}

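/* Mirrors the kernel's tcp_update_pacing_rate(). A fixed-point detail worth
 * noting: tp->srtt_us holds the smoothed RTT in usec left-shifted by 3, so
 * the "<< 3" in the rate seed cancels that shift, and the "/ 100" converts
 * the *_RATIO percentages. The end result is roughly
 *
 *	rate [bytes/sec] = (ratio / 100) * mss * cwnd / srtt_in_seconds
 */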
static void tcp_update_pacing_rate(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	__u64 rate;

	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
	rate = (__u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);

	/* current rate is (cwnd * mss) / srtt
	 * In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
	 * In Congestion Avoidance phase, set it to 120 % the current rate.
	 *
	 * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
	 *	 If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
	 *	 end of slow start and should slow down.
	 */
	if (tp->snd_cwnd < tp->snd_ssthresh / 2)
		rate *= TCP_PACING_SS_RATIO;
	else
		rate *= TCP_PACING_CA_RATIO;

	rate *= max(tp->snd_cwnd, tp->packets_out);

	if (tp->srtt_us)
		rate = div64_u64(rate, (__u64)tp->srtt_us);

	sk->sk_pacing_rate = min(rate, sk->sk_max_pacing_rate);
}

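/* Proportional Rate Reduction (RFC 6937), modeled on the kernel's
 * tcp_cwnd_reduction(): during CWR/Recovery, cwnd is walked down toward
 * ssthresh in proportion to newly delivered data (sndcnt is the budget of
 * new segments allowed out for this ACK) instead of being cut in one step.
 */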
static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
			       int newly_lost, int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int sndcnt = 0;
	__u32 pkts_in_flight = tp->packets_out - (tp->sacked_out + tp->lost_out) + tp->retrans_out;
	int delta = tp->snd_ssthresh - pkts_in_flight;

	if (newly_acked_sacked <= 0 || !tp->prior_cwnd)
		return;

	__u32 prr_delivered = tp->prr_delivered + newly_acked_sacked;

	if (delta < 0) {
		__u64 dividend =
			(__u64)tp->snd_ssthresh * prr_delivered + tp->prior_cwnd - 1;
		sndcnt = (__u32)div64_u64(dividend, (__u64)tp->prior_cwnd) - tp->prr_out;
	} else {
		sndcnt = max(prr_delivered - tp->prr_out, newly_acked_sacked);
		if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost)
			sndcnt++;
		sndcnt = min(delta, sndcnt);
	}
	/* Force a fast retransmit upon entering fast recovery */
	sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
	tp->snd_cwnd = pkts_in_flight + sndcnt;
}

/* Decide whether to run the increase function of congestion control.
 * If reordering is high, grow cwnd on any forward progress (new data acked
 * or sacked); otherwise stay conservative and only grow on in-order delivery.
 */
static bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
{
	if (tcp_sk(sk)->reordering > TCP_REORDERING)
		return flag & FLAG_FORWARD_PROGRESS;

	return flag & FLAG_DATA_ACKED;
}

void BPF_STRUCT_OPS(bpf_cubic_init, struct sock *sk)
{
	cubictcp_init(sk);
}

void BPF_STRUCT_OPS(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event)
{
	cubictcp_cwnd_event(sk, event);
}

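/* cong_control() runs once per incoming ACK and takes over the cwnd
 * bookkeeping the stack would otherwise do itself: apply PRR while in
 * CWR/Recovery (and snap cwnd back to ssthresh once snd_una reaches high_seq
 * in CWR), otherwise let CUBIC's cong_avoid grow cwnd, then refresh the
 * pacing rate in either case.
 */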
void BPF_STRUCT_OPS(bpf_cubic_cong_control, struct sock *sk, __u32 ack, int flag,
		    const struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (((1<<TCP_CA_CWR) | (1<<TCP_CA_Recovery)) &
	    (1 << inet_csk(sk)->icsk_ca_state)) {
		/* Reduce cwnd if state mandates */
		tcp_cwnd_reduction(sk, rs->acked_sacked, rs->losses, flag);

		if (!before(tp->snd_una, tp->high_seq)) {
			/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
			if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
			    inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) {
				tp->snd_cwnd = tp->snd_ssthresh;
				tp->snd_cwnd_stamp = tcp_jiffies32;
			}
		}
	} else if (tcp_may_raise_cwnd(sk, flag)) {
		/* Advance cwnd if state allows */
		cubictcp_cong_avoid(sk, ack, rs->acked_sacked);
		tp->snd_cwnd_stamp = tcp_jiffies32;
	}

	tcp_update_pacing_rate(sk);
}

__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk)
{
	return cubictcp_recalc_ssthresh(sk);
}

void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state)
{
	cubictcp_state(sk, new_state);
}

void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk,
		    const struct ack_sample *sample)
{
	cubictcp_acked(sk, sample);
}

__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk)
{
	return tcp_reno_undo_cwnd(sk);
}

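/* Registering the callbacks: once this struct_ops map is loaded and attached,
 * the congestion control can be selected by the name below, e.g. through
 * setsockopt(TCP_CONGESTION).
 */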
SEC(".struct_ops")
struct tcp_congestion_ops cc_cubic = {
	.init = (void *)bpf_cubic_init,
	.ssthresh = (void *)bpf_cubic_recalc_ssthresh,
	.cong_control = (void *)bpf_cubic_cong_control,
	.set_state = (void *)bpf_cubic_state,
	.undo_cwnd = (void *)bpf_cubic_undo_cwnd,
	.cwnd_event = (void *)bpf_cubic_cwnd_event,
	.pkts_acked = (void *)bpf_cubic_acked,
	.name = "bpf_cc_cubic",
};

char _license[] SEC("license") = "GPL";
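
/* A rough userspace sketch of how this object might be attached, assuming a
 * libbpf skeleton generated as "bpf_cc_cubic.skel.h" (the skeleton name and
 * the socket setup are illustrative, not part of this file):
 *
 *	struct bpf_cc_cubic *skel = bpf_cc_cubic__open_and_load();
 *	struct bpf_link *link = bpf_map__attach_struct_ops(skel->maps.cc_cubic);
 *	// ... create a TCP socket fd, then opt in to the new CC by name:
 *	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "bpf_cc_cubic",
 *		   sizeof("bpf_cc_cubic") - 1);
 */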