// SPDX-License-Identifier: GPL-2.0
#include <linux/tcp.h>
#include <net/tcp.h>
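
/* The net.ipv4.tcp_recovery sysctl: the TCP_RACK_LOSS_DETECTION bit
 * enables RACK-based loss detection, on by default.
 */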
int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION;
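
/* Mark a packet lost unconditionally. If it had been retransmitted,
 * clear its TCPCB_SACKED_RETRANS accounting so the lost retransmit is
 * counted (LINUX_MIB_TCPLOSTRETRANSMIT) and can be retransmitted again.
 */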
static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_skb_mark_lost_uncond_verify(tp, skb);
	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
		/* Account for retransmits that are lost again */
		TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
		tp->retrans_out -= tcp_skb_pcount(skb);
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
			      tcp_skb_pcount(skb));
	}
}
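
/* Return true if the packet stamped (t1, seq1) was sent after the one
 * stamped (t2, seq2). Send timestamps have microsecond resolution and
 * can collide (e.g. within a TSO burst), so the higher end sequence
 * breaks the tie.
 */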
static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
{
	return t1 > t2 || (t1 == t2 && after(seq1, seq2));
}

/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
 *
 * Marks a packet lost, if some packet sent later has been (s)acked.
 * The underlying idea is similar to the traditional dupthresh and FACK
 * but they look at different metrics:
 *
 * dupthresh: 3 OOO packets delivered (packet count)
 * FACK: sequence delta to highest sacked sequence (sequence space)
 * RACK: sent time delta to the latest delivered packet (time domain)
 *
 * The advantage of RACK is it applies to both original and retransmitted
 * packets and therefore is robust against tail losses. Another advantage
 * is being more resilient to reordering by simply allowing some
 * "settling delay", instead of tweaking the dupthresh.
 *
 * When tcp_rack_detect_loss() detects some packets are lost and we
 * are not already in the CA_Recovery state, either tcp_rack_reo_timeout()
 * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will
 * make us enter the CA_Recovery state.
 */
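/* A worked example with hypothetical numbers: suppose rack.rtt_us is
 * 40000 and reo_wnd is 10000 (min_rtt/4). A packet sent before the most
 * recently (s)acked one with elapsed = 55000 us has remaining =
 * 40000 + 10000 - 55000 < 0, so it is marked lost immediately; one with
 * elapsed = 45000 us has 5000 us remaining and arms the reordering
 * timer instead.
 */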
static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	u32 reo_wnd;

	*reo_timeout = 0;
	/* To be more reordering resilient, allow min_rtt/4 settling delay
	 * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
	 * RTT because reordering is often a path property and less related
	 * to queuing or delayed ACKs.
	 */
	reo_wnd = 1000;
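	/* Widen the window to min_rtt/4 only if reordering has been
	 * observed on this connection (rack.reord) or nothing is marked
	 * lost yet; once losses are outstanding without any observed
	 * reordering, keep the minimum 1000us window to detect further
	 * losses quickly.
	 */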
	if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
		reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);

	tcp_for_write_queue(skb, sk) {
		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);

		if (skb == tcp_send_head(sk))
			break;

		/* Skip ones already (s)acked */
		if (!after(scb->end_seq, tp->snd_una) ||
		    scb->sacked & TCPCB_SACKED_ACKED)
			continue;

		if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
					tp->rack.end_seq, scb->end_seq)) {
			/* Step 3 in draft-cheng-tcpm-rack-00.txt:
			 * A packet is lost if its elapsed time is beyond
			 * the recent RTT plus the reordering window.
			 */
			u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp,
							 skb->skb_mstamp);
			s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;

			if (remaining < 0) {
				tcp_rack_mark_skb_lost(sk, skb);
				continue;
			}

			/* Skip ones marked lost but not yet retransmitted */
			if ((scb->sacked & TCPCB_LOST) &&
			    !(scb->sacked & TCPCB_SACKED_RETRANS))
				continue;

			/* Record maximum wait time (+1 to avoid 0) */
			*reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);

		} else if (!(scb->sacked & TCPCB_RETRANS)) {
			/* Original data are sent sequentially so stop early
			 * b/c the rest are all sent after rack_sent
			 */
			break;
		}
	}
}
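
/* Called from ACK processing: if the latest (s)acked packet advanced the
 * RACK state (rack.advanced), re-run loss detection and arm the
 * reordering timer for packets still within their deadlines.
 */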
void tcp_rack_mark_lost(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 timeout;

	if (!tp->rack.advanced)
		return;

	/* Reset the advanced flag to avoid unnecessary queue scanning */
	tp->rack.advanced = 0;
	tcp_rack_detect_loss(sk, &timeout);
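	/* Some packets are still within their deadlines: wait for them
	 * with a reordering timer, padded with TCP_TIMEOUT_MIN to absorb
	 * timer granularity.
	 */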
	if (timeout) {
		timeout = usecs_to_jiffies(timeout) + TCP_TIMEOUT_MIN;
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
					  timeout, inet_csk(sk)->icsk_rto);
	}
}

/* Record the most recently (re)sent time among the (s)acked packets.
 * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
 * draft-cheng-tcpm-rack-00.txt
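 *
 * Called for each newly (s)acked packet, so tp->rack.{mstamp, end_seq}
 * always refer to the most recently sent packet known to be delivered.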
 */
void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
		      u64 xmit_time)
{
	u32 rtt_us;

	if (tp->rack.mstamp &&
	    !tcp_rack_sent_after(xmit_time, tp->rack.mstamp,
				 end_seq, tp->rack.end_seq))
		return;

	rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time);
	if (sacked & TCPCB_RETRANS) {
		/* If the sacked packet was retransmitted, it's ambiguous
		 * whether the retransmission or the original (or the prior
		 * retransmission) was sacked.
		 *
		 * If the original is lost, there is no ambiguity. Otherwise
		 * we assume the original can be delayed up to aRTT + min_rtt.
		 * The aRTT term is bounded by the fast recovery or timeout,
		 * so it's at least one RTT (i.e., retransmission is at least
		 * an RTT later).
		 */
		if (rtt_us < tcp_min_rtt(tp))
			return;
	}
	tp->rack.rtt_us = rtt_us;
	tp->rack.mstamp = xmit_time;
	tp->rack.end_seq = end_seq;
	tp->rack.advanced = 1;
}

/* We have waited long enough to accommodate reordering. Mark the expired
 * packets lost and retransmit them.
 */
void tcp_rack_reo_timeout(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 timeout, prior_inflight;

	prior_inflight = tcp_packets_in_flight(tp);
	tcp_rack_detect_loss(sk, &timeout);
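	/* Newly marked losses shrink packets_in_flight: enter recovery
	 * (unless already in it) and retransmit what was just marked lost.
	 */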
	if (prior_inflight != tcp_packets_in_flight(tp)) {
		if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
			tcp_enter_recovery(sk, false);
			if (!inet_csk(sk)->icsk_ca_ops->cong_control)
				tcp_cwnd_reduction(sk, 1, 0);
		}
		tcp_xmit_retransmit_queue(sk);
	}
	if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
		tcp_rearm_rto(sk);
}