
Commit 2da35e4

Merge branch 'tcp-receive-side-improvements'

Eric Dumazet says:

====================
tcp: receive side improvements

We have set tcp_rmem[2] to 15 MB for about 8 years at Google, but had
some issues for high-speed flows on very small RTTs. TCP rx autotuning
has a tendency to overestimate the RTT, and thus tp->rcvq_space.space
and sk->sk_rcvbuf. This makes TCP receive queues much bigger than
necessary, to the point that CPU caches are evicted before the
application can copy the data, on CPUs using DDIO.

This series aims to fix this.

- The first patch adds the tcp_rcvbuf_grow() tracepoint, which was very
  convenient for studying the various issues fixed in this series.
- Seven patches fix receiver autotune issues.
- Two patches fix sender-side issues.
- The final patch increases tcp_rmem[2] so that TCP speed over WAN can
  meet modern needs.

Tested on a 200Gbit NIC, average max throughput of a single flow:

Before: 73593 Mbit.
After: 122514 Mbit.
====================

Link: https://patch.msgid.link/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>

2 parents: bebd7b2 + 572be9b
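As a rough sanity check on why the final patch matters for WAN transfers: a TCP flow can move at most about one receive window per round trip, so the receive buffer cap bounds single-flow throughput. A back-of-the-envelope sketch in C (illustrative numbers, not from the series; the usable window is somewhat smaller than the buffer because of skb overhead):

#include <stdio.h>

/* A receive buffer allows at most ~one buffer's worth of data per RTT.
 * Numbers below are illustrative only.
 */
int main(void)
{
        double rcvbuf_bytes = 32.0 * 1024 * 1024;       /* new tcp_rmem[2] cap */
        double rtt_sec = 0.05;                          /* 50 ms WAN RTT */
        double gbit = rcvbuf_bytes * 8.0 / rtt_sec / 1e9;

        printf("max ~%.1f Gbit/s at %.0f ms RTT\n", gbit, rtt_sec * 1e3);
        return 0;
}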

7 files changed: +134 −66 lines

Documentation/networking/ip-sysctl.rst

Lines changed: 2 additions & 2 deletions

@@ -735,7 +735,7 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max
 	net.core.rmem_max. Calling setsockopt() with SO_RCVBUF disables
 	automatic tuning of that socket's receive buffer size, in which
 	case this value is ignored.
-	Default: between 131072 and 6MB, depending on RAM size.
+	Default: between 131072 and 32MB, depending on RAM size.

 tcp_sack - BOOLEAN
 	Enable select acknowledgments (SACKS).

@@ -1099,7 +1099,7 @@ tcp_limit_output_bytes - INTEGER
 	limits the number of bytes on qdisc or device to reduce artificial
 	RTT/cwnd and reduce bufferbloat.

-	Default: 1048576 (16 * 65536)
+	Default: 4194304 (4 MB)

 tcp_challenge_ack_limit - INTEGER
 	Limits number of Challenge ACK sent per second, as recommended
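As the tcp_rmem text above notes, a socket that calls setsockopt(SO_RCVBUF) opts out of receive autotuning entirely, so it never benefits from the larger cap. A minimal sketch using the standard sockets API:

#include <stdio.h>
#include <sys/socket.h>

/* Pin the receive buffer: this sets SOCK_RCVBUF_LOCK, so autotuning
 * (and hence tcp_rmem[2]) no longer applies to this socket. Linux
 * doubles the requested value to account for bookkeeping overhead,
 * and caps it at net.core.rmem_max.
 */
static int pin_rcvbuf(int fd, int bytes)
{
        if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &bytes, sizeof(bytes)) < 0) {
                perror("setsockopt(SO_RCVBUF)");
                return -1;
        }
        return 0;
}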

include/linux/tcp.h

Lines changed: 1 addition & 1 deletion

@@ -340,7 +340,7 @@ struct tcp_sock {
 	} rcv_rtt_est;
 /* Receiver queue space */
 	struct {
-		u32	space;
+		int	space;
 		u32	seq;
 		u64	time;
 	} rcvq_space;

include/trace/events/tcp.h

Lines changed: 73 additions & 0 deletions

@@ -213,6 +213,79 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust,
 	TP_ARGS(sk)
 );

+TRACE_EVENT(tcp_rcvbuf_grow,
+
+	TP_PROTO(struct sock *sk, int time),
+
+	TP_ARGS(sk, time),
+
+	TP_STRUCT__entry(
+		__field(int, time)
+		__field(__u32, rtt_us)
+		__field(__u32, copied)
+		__field(__u32, inq)
+		__field(__u32, space)
+		__field(__u32, ooo_space)
+		__field(__u32, rcvbuf)
+		__field(__u8, scaling_ratio)
+		__field(__u16, sport)
+		__field(__u16, dport)
+		__field(__u16, family)
+		__array(__u8, saddr, 4)
+		__array(__u8, daddr, 4)
+		__array(__u8, saddr_v6, 16)
+		__array(__u8, daddr_v6, 16)
+		__field(const void *, skaddr)
+		__field(__u64, sock_cookie)
+	),
+
+	TP_fast_assign(
+		struct inet_sock *inet = inet_sk(sk);
+		struct tcp_sock *tp = tcp_sk(sk);
+		__be32 *p32;
+
+		__entry->time = time;
+		__entry->rtt_us = tp->rcv_rtt_est.rtt_us >> 3;
+		__entry->copied = tp->copied_seq - tp->rcvq_space.seq;
+		__entry->inq = tp->rcv_nxt - tp->copied_seq;
+		__entry->space = tp->rcvq_space.space;
+		__entry->ooo_space = RB_EMPTY_ROOT(&tp->out_of_order_queue) ? 0 :
+				     TCP_SKB_CB(tp->ooo_last_skb)->end_seq -
+				     tp->rcv_nxt;
+
+		__entry->rcvbuf = sk->sk_rcvbuf;
+		__entry->scaling_ratio = tp->scaling_ratio;
+		__entry->sport = ntohs(inet->inet_sport);
+		__entry->dport = ntohs(inet->inet_dport);
+		__entry->family = sk->sk_family;
+
+		p32 = (__be32 *) __entry->saddr;
+		*p32 = inet->inet_saddr;
+
+		p32 = (__be32 *) __entry->daddr;
+		*p32 = inet->inet_daddr;
+
+		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+			       sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
+
+		__entry->skaddr = sk;
+		__entry->sock_cookie = sock_gen_cookie(sk);
+	),
+
+	TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u rcvbuf=%u "
+		  "family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 "
+		  "saddrv6=%pI6c daddrv6=%pI6c skaddr=%p sock_cookie=%llx",
+		  __entry->time, __entry->rtt_us, __entry->copied,
+		  __entry->inq, __entry->space, __entry->ooo_space,
+		  __entry->scaling_ratio, __entry->rcvbuf,
+		  show_family_name(__entry->family),
+		  __entry->sport, __entry->dport,
+		  __entry->saddr, __entry->daddr,
+		  __entry->saddr_v6, __entry->daddr_v6,
+		  __entry->skaddr,
+		  __entry->sock_cookie)
+);
+
 TRACE_EVENT(tcp_retransmit_synack,

	TP_PROTO(const struct sock *sk, const struct request_sock *req),
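One straightforward way to consume the new event is through tracefs. A sketch, assuming tracefs is mounted at /sys/kernel/tracing, root privileges, and a kernel carrying this patch:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Enable tcp_rcvbuf_grow and stream formatted events to stdout. */
int main(void)
{
        char buf[4096];
        ssize_t n;
        int fd;

        fd = open("/sys/kernel/tracing/events/tcp/tcp_rcvbuf_grow/enable",
                  O_WRONLY);
        if (fd < 0 || write(fd, "1", 1) != 1) {
                perror("enable tracepoint");
                return 1;
        }
        close(fd);

        fd = open("/sys/kernel/tracing/trace_pipe", O_RDONLY);
        if (fd < 0) {
                perror("trace_pipe");
                return 1;
        }
        while ((n = read(fd, buf, sizeof(buf))) > 0) {
                if (write(STDOUT_FILENO, buf, n) < 0)
                        break;
        }
        close(fd);
        return 0;
}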

net/ipv4/tcp.c

Lines changed: 1 addition & 1 deletion

@@ -5231,7 +5231,7 @@ void __init tcp_init(void)
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
 	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
 	max_wshare = min(4UL*1024*1024, limit);
-	max_rshare = min(6UL*1024*1024, limit);
+	max_rshare = min(32UL*1024*1024, limit);

 	init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
 	init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;

net/ipv4/tcp_input.c

Lines changed: 53 additions & 57 deletions

@@ -664,10 +664,12 @@ EXPORT_IPV6_MOD(tcp_initialize_rcv_mss);
  */
 static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
 {
-	u32 new_sample = tp->rcv_rtt_est.rtt_us;
-	long m = sample;
+	u32 new_sample, old_sample = tp->rcv_rtt_est.rtt_us;
+	long m = sample << 3;

-	if (new_sample != 0) {
+	if (old_sample == 0 || m < old_sample) {
+		new_sample = m;
+	} else {
 		/* If we sample in larger samples in the non-timestamp
 		 * case, we could grossly overestimate the RTT especially
 		 * with chatty applications or bulk transfer apps which

@@ -678,17 +680,12 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
 		 * else with timestamps disabled convergence takes too
 		 * long.
 		 */
-		if (!win_dep) {
-			m -= (new_sample >> 3);
-			new_sample += m;
-		} else {
-			m <<= 3;
-			if (m < new_sample)
-				new_sample = m;
-		}
-	} else {
-		/* No previous measure. */
-		new_sample = m << 3;
+		if (win_dep)
+			return;
+		/* Do not use this sample if receive queue is not empty. */
+		if (tp->rcv_nxt != tp->copied_seq)
+			return;
+		new_sample = old_sample - (old_sample >> 3) + sample;
 	}

 	tp->rcv_rtt_est.rtt_us = new_sample;
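The rewritten filter keeps tp->rcv_rtt_est.rtt_us scaled by 8: a sample below the current estimate replaces it outright, win_dep samples may only lower it, and other (timestamp) samples feed a divide-free 7/8 EWMA that is applied only while the receive queue is empty. A standalone sketch of the fixed-point update, with illustrative values:

#include <stdio.h>
#include <stdint.h>

/* rtt_us stores 8x the estimate, so
 *   new = old - old/8 + sample
 * is the fixed-point form of est' = (7 * est + sample) / 8,
 * computed without a division.
 */
static uint32_t ewma_update(uint32_t stored, uint32_t sample_us)
{
        return stored - (stored >> 3) + sample_us;
}

int main(void)
{
        uint32_t stored = 1000u << 3;   /* current estimate: 1000 us */

        for (int i = 0; i < 5; i++) {
                stored = ewma_update(stored, 500);      /* 500 us samples */
                printf("estimate ~%u us\n", stored >> 3);
        }
        return 0;
}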
@@ -712,7 +709,7 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
 	tp->rcv_rtt_est.time = tp->tcp_mstamp;
 }

-static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp)
+static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp, u32 min_delta)
 {
 	u32 delta, delta_us;

@@ -722,7 +719,7 @@ static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp)

 	if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
 		if (!delta)
-			delta = 1;
+			delta = min_delta;
 		delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
 		return delta_us;
 	}
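TCP timestamps tick at TCP_TS_HZ (1000 Hz in current kernels), so delta == 0 only means the RTT was below one 1 ms tick. With the new min_delta argument, the receiver path (next hunk) passes 0 and, via its delta > 0 check, discards such samples rather than rounding them up to a full 1000 us; the sender RTT path further down keeps the old rounding with min_delta = 1. A compressed model of the changed arithmetic (constants mirror the kernel's, to my understanding):

#define TCP_TS_HZ       1000
#define USEC_PER_SEC    1000000

/* delta is in timestamp ticks; returns microseconds. */
static int rtt_tsopt_us(unsigned int delta, unsigned int min_delta)
{
        if (!delta)
                delta = min_delta;      /* 0: caller will drop the sample */
        return delta * (USEC_PER_SEC / TCP_TS_HZ);
}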
@@ -740,22 +737,47 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,

 	if (TCP_SKB_CB(skb)->end_seq -
 	    TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
-		s32 delta = tcp_rtt_tsopt_us(tp);
+		s32 delta = tcp_rtt_tsopt_us(tp, 0);

-		if (delta >= 0)
+		if (delta > 0)
 			tcp_rcv_rtt_update(tp, delta, 0);
 	}
 }

+static void tcp_rcvbuf_grow(struct sock *sk)
+{
+	const struct net *net = sock_net(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	int rcvwin, rcvbuf, cap;
+
+	if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
+	    (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+		return;
+
+	/* slow start: allow the sender to double its rate. */
+	rcvwin = tp->rcvq_space.space << 1;
+
+	if (!RB_EMPTY_ROOT(&tp->out_of_order_queue))
+		rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt;
+
+	cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
+
+	rcvbuf = min_t(u32, tcp_space_from_win(sk, rcvwin), cap);
+	if (rcvbuf > sk->sk_rcvbuf) {
+		WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+		/* Make the window clamp follow along. */
+		WRITE_ONCE(tp->window_clamp,
+			   tcp_win_from_space(sk, rcvbuf));
+	}
+}
 /*
  * This function should be called every time data is copied to user space.
  * It calculates the appropriate TCP receive buffer space.
  */
 void tcp_rcv_space_adjust(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 copied;
-	int time;
+	int time, inq, copied;

 	trace_tcp_rcv_space_adjust(sk);

@@ -766,45 +788,18 @@ void tcp_rcv_space_adjust(struct sock *sk)

 	/* Number of bytes copied to user in last RTT */
 	copied = tp->copied_seq - tp->rcvq_space.seq;
+	/* Number of bytes in receive queue. */
+	inq = tp->rcv_nxt - tp->copied_seq;
+	copied -= inq;
 	if (copied <= tp->rcvq_space.space)
 		goto new_measure;

-	/* A bit of theory :
-	 * copied = bytes received in previous RTT, our base window
-	 * To cope with packet losses, we need a 2x factor
-	 * To cope with slow start, and sender growing its cwin by 100 %
-	 * every RTT, we need a 4x factor, because the ACK we are sending
-	 * now is for the next RTT, not the current one :
-	 * <prev RTT . ><current RTT .. ><next RTT .... >
-	 */
-
-	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
-	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-		u64 rcvwin, grow;
-		int rcvbuf;
-
-		/* minimal window to cope with packet losses, assuming
-		 * steady state. Add some cushion because of small variations.
-		 */
-		rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
+	trace_tcp_rcvbuf_grow(sk, time);

-		/* Accommodate for sender rate increase (eg. slow start) */
-		grow = rcvwin * (copied - tp->rcvq_space.space);
-		do_div(grow, tp->rcvq_space.space);
-		rcvwin += (grow << 1);
-
-		rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
-			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
-		if (rcvbuf > sk->sk_rcvbuf) {
-			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
-
-			/* Make the window clamp follow along. */
-			WRITE_ONCE(tp->window_clamp,
-				   tcp_win_from_space(sk, rcvbuf));
-		}
-	}
 	tp->rcvq_space.space = copied;

+	tcp_rcvbuf_grow(sk);
+
 new_measure:
 	tp->rcvq_space.seq = tp->copied_seq;
 	tp->rcvq_space.time = tp->tcp_mstamp;
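Condensed, the new sizing policy reads: target twice the bytes the application consumed during the last measured RTT (so a slow-starting sender can double its rate), plus whatever currently sits in the out-of-order queue, converted from window bytes to buffer bytes and capped by tcp_rmem[2]. A userspace model of that arithmetic, assuming the window ≈ buffer * scaling_ratio / 256 convention behind tcp_win_from_space()/tcp_space_from_win(); names and values are illustrative:

#include <stdio.h>

/* Model of tcp_rcvbuf_grow() sizing; not kernel code. */
static unsigned long long rcvbuf_target(unsigned long long copied,
                                        unsigned long long ooo_bytes,
                                        unsigned int scaling_ratio, /* 1..255 */
                                        unsigned long long rmem2)
{
        unsigned long long rcvwin = (copied << 1) + ooo_bytes;
        /* inverse of win = buf * ratio / 256, i.e. tcp_space_from_win() */
        unsigned long long rcvbuf = rcvwin * 256 / scaling_ratio;

        return rcvbuf < rmem2 ? rcvbuf : rmem2;
}

int main(void)
{
        /* 1 MB consumed last RTT, empty ooo queue, ratio 128 (50%). */
        printf("%llu\n", rcvbuf_target(1ULL << 20, 0, 128, 32ULL << 20));
        return 0;
}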
@@ -3226,7 +3221,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
 	 */
 	if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp &&
 	    tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED)
-		seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp);
+		seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp, 1);

 	rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
 	if (seq_rtt_us < 0)
@@ -5173,6 +5168,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 		skb_condense(skb);
 		skb_set_owner_r(skb, sk);
 	}
+	tcp_rcvbuf_grow(sk);
 }

 static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
@@ -6873,6 +6869,9 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		if (!tp->srtt_us)
 			tcp_synack_rtt_meas(sk, req);

+		if (tp->rx_opt.tstamp_ok)
+			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
+
 		if (req) {
 			tcp_rcv_synrecv_state_fastopen(sk);
 		} else {

@@ -6898,9 +6897,6 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
 		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);

-		if (tp->rx_opt.tstamp_ok)
-			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
-
 		if (!inet_csk(sk)->icsk_ca_ops->cong_control)
 			tcp_update_pacing_rate(sk);
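These last two hunks move the advmss timestamp adjustment earlier in tcp_rcv_state_process(), ahead of the req/non-req branch, so both paths see it; presumably advmss then already reflects the option overhead when receive-side sizing first uses it. The per-segment effect, sketched (the 12-byte constant matches the kernel's TCPOLEN_TSTAMP_ALIGNED):

/* With timestamps negotiated, every segment carries a 12-byte
 * (aligned) timestamp option, shrinking the usable payload.
 */
#define TCPOLEN_TSTAMP_ALIGNED  12

static unsigned int payload_mss(unsigned int advmss, int tstamp_ok)
{
        return tstamp_ok ? advmss - TCPOLEN_TSTAMP_ALIGNED : advmss;
}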

net/ipv4/tcp_ipv4.c

Lines changed: 2 additions & 2 deletions

@@ -3495,8 +3495,8 @@ static int __net_init tcp_sk_init(struct net *net)
 	 * which are too large can cause TCP streams to be bursty.
 	 */
 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
-	/* Default TSQ limit of 16 TSO segments */
-	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
+	/* Default TSQ limit of 4 MB */
+	net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;

 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;

net/ipv4/tcp_output.c

Lines changed: 2 additions & 3 deletions

@@ -2619,9 +2619,8 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 	limit = max_t(unsigned long,
 		      2 * skb->truesize,
 		      READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift));
-	if (sk->sk_pacing_status == SK_PACING_NONE)
-		limit = min_t(unsigned long, limit,
-			      READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
+	limit = min_t(unsigned long, limit,
+		      READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
 	limit <<= factor;

 	if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
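Together with the tcp_ipv4.c hunk above, which raises the default tcp_limit_output_bytes from 1 MiB (16 * 65536) to 4 MiB (4 << 20), this change makes the TSQ cap apply to paced flows too, not only SK_PACING_NONE sockets. A userspace model of the resulting limit computation (illustrative names):

/* Model of tcp_small_queue_check()'s limit after this change. */
static unsigned long tsq_limit(unsigned long truesize,
                               unsigned long pacing_rate,   /* bytes/sec */
                               unsigned int pacing_shift,   /* default 10: ~1 ms */
                               unsigned long limit_output_bytes,
                               int factor)
{
        unsigned long limit = pacing_rate >> pacing_shift;

        if (limit < 2 * truesize)
                limit = 2 * truesize;
        if (limit > limit_output_bytes)         /* now applied unconditionally */
                limit = limit_output_bytes;
        return limit << factor;
}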
