]>
Commit | Line | Data |
---|---|---|
699fafaf LB |
1 | /* |
2 | * TCP NV: TCP with Congestion Avoidance | |
3 | * | |
4 | * TCP-NV is a successor of TCP-Vegas that has been developed to | |
5 | * deal with the issues that occur in modern networks. | |
6 | * Like TCP-Vegas, TCP-NV supports true congestion avoidance, | |
7 | * the ability to detect congestion before packet losses occur. | |
8 | * When congestion (queue buildup) starts to occur, TCP-NV | |
9 | * predicts what the cwnd size should be for the current | |
10 | * throughput and it reduces the cwnd proportionally to | |
11 | * the difference between the current cwnd and the predicted cwnd. | |
12 | * | |
13 | * NV is only recommeneded for traffic within a data center, and when | |
14 | * all the flows are NV (at least those within the data center). This | |
15 | * is due to the inherent unfairness between flows using losses to | |
16 | * detect congestion (congestion control) and those that use queue | |
17 | * buildup to detect congestion (congestion avoidance). | |
18 | * | |
19 | * Note: High NIC coalescence values may lower the performance of NV | |
20 | * due to the increased noise in RTT values. In particular, we have | |
21 | * seen issues with rx-frames values greater than 8. | |
22 | * | |
23 | * TODO: | |
24 | * 1) Add mechanism to deal with reverse congestion. | |
25 | */ | |
26 | ||
27 | #include <linux/mm.h> | |
28 | #include <linux/module.h> | |
29 | #include <linux/math64.h> | |
30 | #include <net/tcp.h> | |
31 | #include <linux/inet_diag.h> | |
32 | ||
33 | /* TCP NV parameters | |
34 | * | |
35 | * nv_pad Max number of queued packets allowed in network | |
36 | * nv_pad_buffer Do not grow cwnd if this closed to nv_pad | |
37 | * nv_reset_period How often (in) seconds)to reset min_rtt | |
38 | * nv_min_cwnd Don't decrease cwnd below this if there are no losses | |
39 | * nv_cong_dec_mult Decrease cwnd by X% (30%) of congestion when detected | |
40 | * nv_ssthresh_factor On congestion set ssthresh to this * <desired cwnd> / 8 | |
41 | * nv_rtt_factor RTT averaging factor | |
42 | * nv_loss_dec_factor Decrease cwnd by this (50%) when losses occur | |
43 | * nv_dec_eval_min_calls Wait this many RTT measurements before dec cwnd | |
44 | * nv_inc_eval_min_calls Wait this many RTT measurements before inc cwnd | |
45 | * nv_ssthresh_eval_min_calls Wait this many RTT measurements before stopping | |
46 | * slow-start due to congestion | |
47 | * nv_stop_rtt_cnt Only grow cwnd for this many RTTs after non-congestion | |
48 | * nv_rtt_min_cnt Wait these many RTTs before making congesion decision | |
49 | * nv_cwnd_growth_rate_neg | |
50 | * nv_cwnd_growth_rate_pos | |
51 | * How quickly to double growth rate (not rate) of cwnd when not | |
52 | * congested. One value (nv_cwnd_growth_rate_neg) for when | |
53 | * rate < 1 pkt/RTT (after losses). The other (nv_cwnd_growth_rate_pos) | |
54 | * otherwise. | |
55 | */ | |
56 | ||
57 | static int nv_pad __read_mostly = 10; | |
58 | static int nv_pad_buffer __read_mostly = 2; | |
59 | static int nv_reset_period __read_mostly = 5; /* in seconds */ | |
60 | static int nv_min_cwnd __read_mostly = 2; | |
61 | static int nv_cong_dec_mult __read_mostly = 30 * 128 / 100; /* = 30% */ | |
62 | static int nv_ssthresh_factor __read_mostly = 8; /* = 1 */ | |
63 | static int nv_rtt_factor __read_mostly = 128; /* = 1/2*old + 1/2*new */ | |
64 | static int nv_loss_dec_factor __read_mostly = 512; /* => 50% */ | |
65 | static int nv_cwnd_growth_rate_neg __read_mostly = 8; | |
66 | static int nv_cwnd_growth_rate_pos __read_mostly; /* 0 => fixed like Reno */ | |
67 | static int nv_dec_eval_min_calls __read_mostly = 60; | |
68 | static int nv_inc_eval_min_calls __read_mostly = 20; | |
69 | static int nv_ssthresh_eval_min_calls __read_mostly = 30; | |
70 | static int nv_stop_rtt_cnt __read_mostly = 10; | |
71 | static int nv_rtt_min_cnt __read_mostly = 2; | |
72 | ||
73 | module_param(nv_pad, int, 0644); | |
74 | MODULE_PARM_DESC(nv_pad, "max queued packets allowed in network"); | |
75 | module_param(nv_reset_period, int, 0644); | |
76 | MODULE_PARM_DESC(nv_reset_period, "nv_min_rtt reset period (secs)"); | |
77 | module_param(nv_min_cwnd, int, 0644); | |
78 | MODULE_PARM_DESC(nv_min_cwnd, "NV will not decrease cwnd below this value" | |
79 | " without losses"); | |
80 | ||
81 | /* TCP NV Parameters */ | |
82 | struct tcpnv { | |
83 | unsigned long nv_min_rtt_reset_jiffies; /* when to switch to | |
84 | * nv_min_rtt_new */ | |
85 | s8 cwnd_growth_factor; /* Current cwnd growth factor, | |
86 | * < 0 => less than 1 packet/RTT */ | |
87 | u8 available8; | |
88 | u16 available16; | |
699fafaf LB |
89 | u8 nv_allow_cwnd_growth:1, /* whether cwnd can grow */ |
90 | nv_reset:1, /* whether to reset values */ | |
91 | nv_catchup:1; /* whether we are growing because | |
92 | * of temporary cwnd decrease */ | |
93 | u8 nv_eval_call_cnt; /* call count since last eval */ | |
94 | u8 nv_min_cwnd; /* nv won't make a ca decision if cwnd is | |
95 | * smaller than this. It may grow to handle | |
96 | * TSO, LRO and interrupt coalescence because | |
97 | * with these a small cwnd cannot saturate | |
98 | * the link. Note that this is different from | |
99 | * the file local nv_min_cwnd */ | |
100 | u8 nv_rtt_cnt; /* RTTs without making ca decision */; | |
101 | u32 nv_last_rtt; /* last rtt */ | |
102 | u32 nv_min_rtt; /* active min rtt. Used to determine slope */ | |
103 | u32 nv_min_rtt_new; /* min rtt for future use */ | |
104 | u32 nv_rtt_max_rate; /* max rate seen during current RTT */ | |
105 | u32 nv_rtt_start_seq; /* current RTT ends when packet arrives | |
106 | * acking beyond nv_rtt_start_seq */ | |
107 | u32 nv_last_snd_una; /* Previous value of tp->snd_una. It is | |
108 | * used to determine bytes acked since last | |
109 | * call to bictcp_acked */ | |
110 | u32 nv_no_cong_cnt; /* Consecutive no congestion decisions */ | |
111 | }; | |
112 | ||
113 | #define NV_INIT_RTT U32_MAX | |
114 | #define NV_MIN_CWND 4 | |
115 | #define NV_MIN_CWND_GROW 2 | |
116 | #define NV_TSO_CWND_BOUND 80 | |
117 | ||
118 | static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk) | |
119 | { | |
120 | struct tcp_sock *tp = tcp_sk(sk); | |
121 | ||
122 | ca->nv_reset = 0; | |
699fafaf LB |
123 | ca->nv_no_cong_cnt = 0; |
124 | ca->nv_rtt_cnt = 0; | |
125 | ca->nv_last_rtt = 0; | |
126 | ca->nv_rtt_max_rate = 0; | |
127 | ca->nv_rtt_start_seq = tp->snd_una; | |
128 | ca->nv_eval_call_cnt = 0; | |
129 | ca->nv_last_snd_una = tp->snd_una; | |
130 | } | |
131 | ||
132 | static void tcpnv_init(struct sock *sk) | |
133 | { | |
134 | struct tcpnv *ca = inet_csk_ca(sk); | |
135 | ||
136 | tcpnv_reset(ca, sk); | |
137 | ||
138 | ca->nv_allow_cwnd_growth = 1; | |
139 | ca->nv_min_rtt_reset_jiffies = jiffies + 2 * HZ; | |
140 | ca->nv_min_rtt = NV_INIT_RTT; | |
141 | ca->nv_min_rtt_new = NV_INIT_RTT; | |
142 | ca->nv_min_cwnd = NV_MIN_CWND; | |
143 | ca->nv_catchup = 0; | |
144 | ca->cwnd_growth_factor = 0; | |
145 | } | |
146 | ||
147 | static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked) | |
148 | { | |
149 | struct tcp_sock *tp = tcp_sk(sk); | |
150 | struct tcpnv *ca = inet_csk_ca(sk); | |
151 | u32 cnt; | |
152 | ||
153 | if (!tcp_is_cwnd_limited(sk)) | |
154 | return; | |
155 | ||
156 | /* Only grow cwnd if NV has not detected congestion */ | |
157 | if (!ca->nv_allow_cwnd_growth) | |
158 | return; | |
159 | ||
160 | if (tcp_in_slow_start(tp)) { | |
161 | acked = tcp_slow_start(tp, acked); | |
162 | if (!acked) | |
163 | return; | |
164 | } | |
165 | ||
166 | if (ca->cwnd_growth_factor < 0) { | |
167 | cnt = tp->snd_cwnd << -ca->cwnd_growth_factor; | |
168 | tcp_cong_avoid_ai(tp, cnt, acked); | |
169 | } else { | |
170 | cnt = max(4U, tp->snd_cwnd >> ca->cwnd_growth_factor); | |
171 | tcp_cong_avoid_ai(tp, cnt, acked); | |
172 | } | |
173 | } | |
174 | ||
175 | static u32 tcpnv_recalc_ssthresh(struct sock *sk) | |
176 | { | |
177 | const struct tcp_sock *tp = tcp_sk(sk); | |
699fafaf | 178 | |
699fafaf LB |
179 | return max((tp->snd_cwnd * nv_loss_dec_factor) >> 10, 2U); |
180 | } | |
181 | ||
699fafaf LB |
182 | static void tcpnv_state(struct sock *sk, u8 new_state) |
183 | { | |
184 | struct tcpnv *ca = inet_csk_ca(sk); | |
185 | ||
186 | if (new_state == TCP_CA_Open && ca->nv_reset) { | |
187 | tcpnv_reset(ca, sk); | |
188 | } else if (new_state == TCP_CA_Loss || new_state == TCP_CA_CWR || | |
189 | new_state == TCP_CA_Recovery) { | |
190 | ca->nv_reset = 1; | |
191 | ca->nv_allow_cwnd_growth = 0; | |
192 | if (new_state == TCP_CA_Loss) { | |
193 | /* Reset cwnd growth factor to Reno value */ | |
194 | if (ca->cwnd_growth_factor > 0) | |
195 | ca->cwnd_growth_factor = 0; | |
196 | /* Decrease growth rate if allowed */ | |
197 | if (nv_cwnd_growth_rate_neg > 0 && | |
198 | ca->cwnd_growth_factor > -8) | |
199 | ca->cwnd_growth_factor--; | |
200 | } | |
201 | } | |
202 | } | |
203 | ||
204 | /* Do congestion avoidance calculations for TCP-NV | |
205 | */ | |
206 | static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) | |
207 | { | |
208 | const struct inet_connection_sock *icsk = inet_csk(sk); | |
209 | struct tcp_sock *tp = tcp_sk(sk); | |
210 | struct tcpnv *ca = inet_csk_ca(sk); | |
211 | unsigned long now = jiffies; | |
212 | s64 rate64 = 0; | |
213 | u32 rate, max_win, cwnd_by_slope; | |
214 | u32 avg_rtt; | |
215 | u32 bytes_acked = 0; | |
216 | ||
217 | /* Some calls are for duplicates without timetamps */ | |
218 | if (sample->rtt_us < 0) | |
219 | return; | |
220 | ||
221 | /* If not in TCP_CA_Open or TCP_CA_Disorder states, skip. */ | |
222 | if (icsk->icsk_ca_state != TCP_CA_Open && | |
223 | icsk->icsk_ca_state != TCP_CA_Disorder) | |
224 | return; | |
225 | ||
226 | /* Stop cwnd growth if we were in catch up mode */ | |
227 | if (ca->nv_catchup && tp->snd_cwnd >= nv_min_cwnd) { | |
228 | ca->nv_catchup = 0; | |
229 | ca->nv_allow_cwnd_growth = 0; | |
230 | } | |
231 | ||
232 | bytes_acked = tp->snd_una - ca->nv_last_snd_una; | |
233 | ca->nv_last_snd_una = tp->snd_una; | |
234 | ||
235 | if (sample->in_flight == 0) | |
236 | return; | |
237 | ||
238 | /* Calculate moving average of RTT */ | |
239 | if (nv_rtt_factor > 0) { | |
240 | if (ca->nv_last_rtt > 0) { | |
241 | avg_rtt = (((u64)sample->rtt_us) * nv_rtt_factor + | |
242 | ((u64)ca->nv_last_rtt) | |
243 | * (256 - nv_rtt_factor)) >> 8; | |
244 | } else { | |
245 | avg_rtt = sample->rtt_us; | |
246 | ca->nv_min_rtt = avg_rtt << 1; | |
247 | } | |
248 | ca->nv_last_rtt = avg_rtt; | |
249 | } else { | |
250 | avg_rtt = sample->rtt_us; | |
251 | } | |
252 | ||
253 | /* rate in 100's bits per second */ | |
254 | rate64 = ((u64)sample->in_flight) * 8000000; | |
255 | rate = (u32)div64_u64(rate64, (u64)(avg_rtt * 100)); | |
256 | ||
257 | /* Remember the maximum rate seen during this RTT | |
258 | * Note: It may be more than one RTT. This function should be | |
259 | * called at least nv_dec_eval_min_calls times. | |
260 | */ | |
261 | if (ca->nv_rtt_max_rate < rate) | |
262 | ca->nv_rtt_max_rate = rate; | |
263 | ||
264 | /* We have valid information, increment counter */ | |
265 | if (ca->nv_eval_call_cnt < 255) | |
266 | ca->nv_eval_call_cnt++; | |
267 | ||
268 | /* update min rtt if necessary */ | |
269 | if (avg_rtt < ca->nv_min_rtt) | |
270 | ca->nv_min_rtt = avg_rtt; | |
271 | ||
272 | /* update future min_rtt if necessary */ | |
273 | if (avg_rtt < ca->nv_min_rtt_new) | |
274 | ca->nv_min_rtt_new = avg_rtt; | |
275 | ||
276 | /* nv_min_rtt is updated with the minimum (possibley averaged) rtt | |
277 | * seen in the last sysctl_tcp_nv_reset_period seconds (i.e. a | |
278 | * warm reset). This new nv_min_rtt will be continued to be updated | |
279 | * and be used for another sysctl_tcp_nv_reset_period seconds, | |
280 | * when it will be updated again. | |
281 | * In practice we introduce some randomness, so the actual period used | |
282 | * is chosen randomly from the range: | |
283 | * [sysctl_tcp_nv_reset_period*3/4, sysctl_tcp_nv_reset_period*5/4) | |
284 | */ | |
285 | if (time_after_eq(now, ca->nv_min_rtt_reset_jiffies)) { | |
286 | unsigned char rand; | |
287 | ||
288 | ca->nv_min_rtt = ca->nv_min_rtt_new; | |
289 | ca->nv_min_rtt_new = NV_INIT_RTT; | |
290 | get_random_bytes(&rand, 1); | |
291 | ca->nv_min_rtt_reset_jiffies = | |
292 | now + ((nv_reset_period * (384 + rand) * HZ) >> 9); | |
293 | /* Every so often we decrease ca->nv_min_cwnd in case previous | |
294 | * value is no longer accurate. | |
295 | */ | |
296 | ca->nv_min_cwnd = max(ca->nv_min_cwnd / 2, NV_MIN_CWND); | |
297 | } | |
298 | ||
299 | /* Once per RTT check if we need to do congestion avoidance */ | |
300 | if (before(ca->nv_rtt_start_seq, tp->snd_una)) { | |
301 | ca->nv_rtt_start_seq = tp->snd_nxt; | |
302 | if (ca->nv_rtt_cnt < 0xff) | |
303 | /* Increase counter for RTTs without CA decision */ | |
304 | ca->nv_rtt_cnt++; | |
305 | ||
306 | /* If this function is only called once within an RTT | |
307 | * the cwnd is probably too small (in some cases due to | |
308 | * tso, lro or interrupt coalescence), so we increase | |
309 | * ca->nv_min_cwnd. | |
310 | */ | |
311 | if (ca->nv_eval_call_cnt == 1 && | |
312 | bytes_acked >= (ca->nv_min_cwnd - 1) * tp->mss_cache && | |
313 | ca->nv_min_cwnd < (NV_TSO_CWND_BOUND + 1)) { | |
314 | ca->nv_min_cwnd = min(ca->nv_min_cwnd | |
315 | + NV_MIN_CWND_GROW, | |
316 | NV_TSO_CWND_BOUND + 1); | |
317 | ca->nv_rtt_start_seq = tp->snd_nxt + | |
318 | ca->nv_min_cwnd * tp->mss_cache; | |
319 | ca->nv_eval_call_cnt = 0; | |
320 | ca->nv_allow_cwnd_growth = 1; | |
321 | return; | |
322 | } | |
323 | ||
324 | /* Find the ideal cwnd for current rate from slope | |
325 | * slope = 80000.0 * mss / nv_min_rtt | |
326 | * cwnd_by_slope = nv_rtt_max_rate / slope | |
327 | */ | |
328 | cwnd_by_slope = (u32) | |
329 | div64_u64(((u64)ca->nv_rtt_max_rate) * ca->nv_min_rtt, | |
330 | (u64)(80000 * tp->mss_cache)); | |
331 | max_win = cwnd_by_slope + nv_pad; | |
332 | ||
333 | /* If cwnd > max_win, decrease cwnd | |
334 | * if cwnd < max_win, grow cwnd | |
335 | * else leave the same | |
336 | */ | |
337 | if (tp->snd_cwnd > max_win) { | |
338 | /* there is congestion, check that it is ok | |
339 | * to make a CA decision | |
340 | * 1. We should have at least nv_dec_eval_min_calls | |
341 | * data points before making a CA decision | |
342 | * 2. We only make a congesion decision after | |
343 | * nv_rtt_min_cnt RTTs | |
344 | */ | |
345 | if (ca->nv_rtt_cnt < nv_rtt_min_cnt) { | |
346 | return; | |
347 | } else if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) { | |
348 | if (ca->nv_eval_call_cnt < | |
349 | nv_ssthresh_eval_min_calls) | |
350 | return; | |
351 | /* otherwise we will decrease cwnd */ | |
352 | } else if (ca->nv_eval_call_cnt < | |
353 | nv_dec_eval_min_calls) { | |
354 | if (ca->nv_allow_cwnd_growth && | |
355 | ca->nv_rtt_cnt > nv_stop_rtt_cnt) | |
356 | ca->nv_allow_cwnd_growth = 0; | |
357 | return; | |
358 | } | |
359 | ||
360 | /* We have enough data to determine we are congested */ | |
361 | ca->nv_allow_cwnd_growth = 0; | |
362 | tp->snd_ssthresh = | |
363 | (nv_ssthresh_factor * max_win) >> 3; | |
364 | if (tp->snd_cwnd - max_win > 2) { | |
365 | /* gap > 2, we do exponential cwnd decrease */ | |
366 | int dec; | |
367 | ||
368 | dec = max(2U, ((tp->snd_cwnd - max_win) * | |
369 | nv_cong_dec_mult) >> 7); | |
370 | tp->snd_cwnd -= dec; | |
371 | } else if (nv_cong_dec_mult > 0) { | |
372 | tp->snd_cwnd = max_win; | |
373 | } | |
374 | if (ca->cwnd_growth_factor > 0) | |
375 | ca->cwnd_growth_factor = 0; | |
376 | ca->nv_no_cong_cnt = 0; | |
377 | } else if (tp->snd_cwnd <= max_win - nv_pad_buffer) { | |
378 | /* There is no congestion, grow cwnd if allowed*/ | |
379 | if (ca->nv_eval_call_cnt < nv_inc_eval_min_calls) | |
380 | return; | |
381 | ||
382 | ca->nv_allow_cwnd_growth = 1; | |
383 | ca->nv_no_cong_cnt++; | |
384 | if (ca->cwnd_growth_factor < 0 && | |
385 | nv_cwnd_growth_rate_neg > 0 && | |
386 | ca->nv_no_cong_cnt > nv_cwnd_growth_rate_neg) { | |
387 | ca->cwnd_growth_factor++; | |
388 | ca->nv_no_cong_cnt = 0; | |
389 | } else if (ca->cwnd_growth_factor >= 0 && | |
390 | nv_cwnd_growth_rate_pos > 0 && | |
391 | ca->nv_no_cong_cnt > | |
392 | nv_cwnd_growth_rate_pos) { | |
393 | ca->cwnd_growth_factor++; | |
394 | ca->nv_no_cong_cnt = 0; | |
395 | } | |
396 | } else { | |
397 | /* cwnd is in-between, so do nothing */ | |
398 | return; | |
399 | } | |
400 | ||
401 | /* update state */ | |
402 | ca->nv_eval_call_cnt = 0; | |
403 | ca->nv_rtt_cnt = 0; | |
404 | ca->nv_rtt_max_rate = 0; | |
405 | ||
406 | /* Don't want to make cwnd < nv_min_cwnd | |
407 | * (it wasn't before, if it is now is because nv | |
408 | * decreased it). | |
409 | */ | |
410 | if (tp->snd_cwnd < nv_min_cwnd) | |
411 | tp->snd_cwnd = nv_min_cwnd; | |
412 | } | |
413 | } | |
414 | ||
415 | /* Extract info for Tcp socket info provided via netlink */ | |
c718c6d6 | 416 | static size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr, |
417 | union tcp_cc_info *info) | |
699fafaf LB |
418 | { |
419 | const struct tcpnv *ca = inet_csk_ca(sk); | |
420 | ||
421 | if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { | |
422 | info->vegas.tcpv_enabled = 1; | |
423 | info->vegas.tcpv_rttcnt = ca->nv_rtt_cnt; | |
424 | info->vegas.tcpv_rtt = ca->nv_last_rtt; | |
425 | info->vegas.tcpv_minrtt = ca->nv_min_rtt; | |
426 | ||
427 | *attr = INET_DIAG_VEGASINFO; | |
428 | return sizeof(struct tcpvegas_info); | |
429 | } | |
430 | return 0; | |
431 | } | |
699fafaf LB |
432 | |
433 | static struct tcp_congestion_ops tcpnv __read_mostly = { | |
434 | .init = tcpnv_init, | |
435 | .ssthresh = tcpnv_recalc_ssthresh, | |
436 | .cong_avoid = tcpnv_cong_avoid, | |
437 | .set_state = tcpnv_state, | |
f1722a1b | 438 | .undo_cwnd = tcp_reno_undo_cwnd, |
699fafaf LB |
439 | .pkts_acked = tcpnv_acked, |
440 | .get_info = tcpnv_get_info, | |
441 | ||
442 | .owner = THIS_MODULE, | |
443 | .name = "nv", | |
444 | }; | |
445 | ||
446 | static int __init tcpnv_register(void) | |
447 | { | |
448 | BUILD_BUG_ON(sizeof(struct tcpnv) > ICSK_CA_PRIV_SIZE); | |
449 | ||
450 | return tcp_register_congestion_control(&tcpnv); | |
451 | } | |
452 | ||
453 | static void __exit tcpnv_unregister(void) | |
454 | { | |
455 | tcp_unregister_congestion_control(&tcpnv); | |
456 | } | |
457 | ||
458 | module_init(tcpnv_register); | |
459 | module_exit(tcpnv_unregister); | |
460 | ||
461 | MODULE_AUTHOR("Lawrence Brakmo"); | |
462 | MODULE_LICENSE("GPL"); | |
463 | MODULE_DESCRIPTION("TCP NV"); | |
464 | MODULE_VERSION("1.0"); |