]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | /* | |
3 | * This file is open source software, licensed to you under the terms | |
4 | * of the Apache License, Version 2.0 (the "License"). See the NOTICE file | |
5 | * distributed with this work for additional information regarding copyright | |
6 | * ownership. You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You may obtain a copy of the License at | |
9 | * | |
10 | * http://www.apache.org/licenses/LICENSE-2.0 | |
11 | * | |
12 | * Unless required by applicable law or agreed to in writing, | |
13 | * software distributed under the License is distributed on an | |
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
15 | * KIND, either express or implied. See the License for the | |
16 | * specific language governing permissions and limitations | |
17 | * under the License. | |
18 | */ | |
19 | /* | |
20 | * Copyright (C) 2014 Cloudius Systems, Ltd. | |
21 | */ | |
22 | ||
23 | #include "align.h" | |
24 | #include "TCP.h" | |
25 | #include "IP.h" | |
26 | #include "DPDKStack.h" | |
27 | ||
28 | #include "common/dout.h" | |
11fdf7f2 | 29 | #include "include/ceph_assert.h" |
7c673cae FG |
30 | |
31 | #define dout_subsys ceph_subsys_dpdk | |
32 | #undef dout_prefix | |
33 | #define dout_prefix *_dout << "tcp " | |
34 | ||
35 | void tcp_option::parse(uint8_t* beg, uint8_t* end) | |
36 | { | |
37 | while (beg < end) { | |
38 | auto kind = option_kind(*beg); | |
39 | if (kind != option_kind::nop && kind != option_kind::eol) { | |
40 | // Make sure there is enough room for this option | |
41 | auto len = *(beg + 1); | |
42 | if (beg + len > end) { | |
43 | return; | |
44 | } | |
45 | } | |
46 | switch (kind) { | |
47 | case option_kind::mss: | |
48 | _mss_received = true; | |
49 | _remote_mss = ntoh(reinterpret_cast<mss*>(beg)->mss); | |
50 | beg += option_len::mss; | |
51 | break; | |
52 | case option_kind::win_scale: | |
53 | _win_scale_received = true; | |
54 | _remote_win_scale = reinterpret_cast<win_scale*>(beg)->shift; | |
55 | // We can turn on win_scale option, 7 is Linux's default win scale size | |
56 | _local_win_scale = 7; | |
57 | beg += option_len::win_scale; | |
58 | break; | |
59 | case option_kind::sack: | |
60 | _sack_received = true; | |
61 | beg += option_len::sack; | |
62 | break; | |
63 | case option_kind::nop: | |
64 | beg += option_len::nop; | |
65 | break; | |
66 | case option_kind::eol: | |
67 | return; | |
68 | default: | |
69 | // Ignore options we do not understand | |
70 | auto len = *(beg + 1); | |
71 | beg += len; | |
72 | // Prevent infinite loop | |
73 | if (len == 0) { | |
74 | return; | |
75 | } | |
76 | break; | |
77 | } | |
78 | } | |
79 | } | |
80 | ||
81 | uint8_t tcp_option::fill(tcp_hdr* th, uint8_t options_size) | |
82 | { | |
83 | auto hdr = reinterpret_cast<uint8_t*>(th); | |
84 | auto off = hdr + sizeof(tcp_hdr); | |
85 | uint8_t size = 0; | |
86 | bool syn_on = th->f_syn; | |
87 | bool ack_on = th->f_ack; | |
88 | ||
89 | if (syn_on) { | |
90 | if (_mss_received || !ack_on) { | |
91 | auto mss = new (off) tcp_option::mss; | |
92 | mss->mss = _local_mss; | |
93 | off += mss->len; | |
94 | size += mss->len; | |
95 | *mss = mss->hton(); | |
96 | } | |
97 | if (_win_scale_received || !ack_on) { | |
98 | auto win_scale = new (off) tcp_option::win_scale; | |
99 | win_scale->shift = _local_win_scale; | |
100 | off += win_scale->len; | |
101 | size += win_scale->len; | |
102 | } | |
103 | } | |
104 | if (size > 0) { | |
105 | // Insert NOP option | |
106 | auto size_max = align_up(uint8_t(size + 1), tcp_option::align); | |
107 | while (size < size_max - uint8_t(option_len::eol)) { | |
108 | new (off) tcp_option::nop; | |
109 | off += option_len::nop; | |
110 | size += option_len::nop; | |
111 | } | |
112 | new (off) tcp_option::eol; | |
113 | size += option_len::eol; | |
114 | } | |
11fdf7f2 | 115 | ceph_assert(size == options_size); |
7c673cae FG |
116 | |
117 | return size; | |
118 | } | |
119 | ||
120 | uint8_t tcp_option::get_size(bool syn_on, bool ack_on) | |
121 | { | |
122 | uint8_t size = 0; | |
123 | if (syn_on) { | |
124 | if (_mss_received || !ack_on) { | |
125 | size += option_len::mss; | |
126 | } | |
127 | if (_win_scale_received || !ack_on) { | |
128 | size += option_len::win_scale; | |
129 | } | |
130 | } | |
131 | if (size > 0) { | |
132 | size += option_len::eol; | |
133 | // Insert NOP option to align on 32-bit | |
134 | size = align_up(size, tcp_option::align); | |
135 | } | |
136 | return size; | |
137 | } | |
138 | ||
139 | ipv4_tcp::ipv4_tcp(ipv4& inet, EventCenter *c) | |
140 | : _inet_l4(inet), _tcp(std::unique_ptr<tcp<ipv4_traits>>(new tcp<ipv4_traits>(inet.cct, _inet_l4, c))) | |
141 | { } | |
142 | ||
143 | ipv4_tcp::~ipv4_tcp() { } | |
144 | ||
145 | void ipv4_tcp::received(Packet p, ipv4_address from, ipv4_address to) | |
146 | { | |
147 | _tcp->received(std::move(p), from, to); | |
148 | } | |
149 | ||
150 | bool ipv4_tcp::forward(forward_hash& out_hash_data, Packet& p, size_t off) | |
151 | { | |
152 | return _tcp->forward(out_hash_data, p, off); | |
153 | } | |
154 | ||
155 | int tcpv4_listen(tcp<ipv4_traits>& tcpv4, uint16_t port, const SocketOptions &opts, | |
11fdf7f2 | 156 | int type, ServerSocket *sock) |
7c673cae | 157 | { |
11fdf7f2 | 158 | auto p = new DPDKServerSocketImpl<tcp<ipv4_traits>>(tcpv4, port, opts, type); |
7c673cae FG |
159 | int r = p->listen(); |
160 | if (r < 0) { | |
161 | delete p; | |
162 | return r; | |
163 | } | |
164 | *sock = ServerSocket(std::unique_ptr<ServerSocketImpl>(p)); | |
165 | return 0; | |
166 | } | |
167 | ||
168 | int tcpv4_connect(tcp<ipv4_traits>& tcpv4, const entity_addr_t &addr, | |
169 | ConnectedSocket *sock) | |
170 | { | |
171 | auto conn = tcpv4.connect(addr); | |
172 | *sock = ConnectedSocket(std::unique_ptr<ConnectedSocketImpl>( | |
173 | new NativeConnectedSocketImpl<tcp<ipv4_traits>>(std::move(conn)))); | |
174 | return 0; | |
175 | } | |
176 | ||
177 | template <typename InetTraits> | |
178 | void tcp<InetTraits>::respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip) | |
179 | { | |
180 | ldout(cct, 20) << __func__ << " tcp header rst=" << bool(rth->f_rst) << " fin=" << bool(rth->f_fin) | |
181 | << " syn=" << bool(rth->f_syn) << dendl; | |
182 | if (rth->f_rst) { | |
183 | return; | |
184 | } | |
185 | Packet p; | |
186 | auto th = p.prepend_header<tcp_hdr>(); | |
187 | th->src_port = rth->dst_port; | |
188 | th->dst_port = rth->src_port; | |
189 | if (rth->f_ack) { | |
190 | th->seq = rth->ack; | |
191 | } | |
192 | // If this RST packet is in response to a SYN packet. We ACK the ISN. | |
193 | if (rth->f_syn) { | |
194 | th->ack = rth->seq + 1; | |
195 | th->f_ack = true; | |
196 | } | |
197 | th->f_rst = true; | |
198 | th->data_offset = sizeof(*th) / 4; | |
199 | th->checksum = 0; | |
200 | *th = th->hton(); | |
201 | ||
202 | checksummer csum; | |
203 | offload_info oi; | |
204 | InetTraits::tcp_pseudo_header_checksum(csum, local_ip, foreign_ip, sizeof(*th)); | |
205 | if (get_hw_features().tx_csum_l4_offload) { | |
206 | th->checksum = ~csum.get(); | |
207 | oi.needs_csum = true; | |
208 | } else { | |
209 | csum.sum(p); | |
210 | th->checksum = csum.get(); | |
211 | oi.needs_csum = false; | |
212 | } | |
213 | ||
214 | oi.protocol = ip_protocol_num::tcp; | |
215 | oi.tcp_hdr_len = sizeof(tcp_hdr); | |
216 | p.set_offload_info(oi); | |
217 | ||
218 | send_packet_without_tcb(local_ip, foreign_ip, std::move(p)); | |
219 | } | |
220 | ||
221 | #undef dout_prefix | |
222 | #define dout_prefix _prefix(_dout) | |
223 | template<typename InetTraits> | |
224 | ostream& tcp<InetTraits>::tcb::_prefix(std::ostream *_dout) { | |
225 | return *_dout << "tcp " << _local_ip << ":" << _local_port << " -> " << _foreign_ip << ":" << _foreign_port | |
226 | << " tcb(" << this << " fd=" << fd << " s=" << _state << ")."; | |
227 | } | |
228 | ||
229 | template<typename InetTraits> | |
230 | void tcp<InetTraits>::tcb::input_handle_listen_state(tcp_hdr* th, Packet p) | |
231 | { | |
232 | auto opt_len = th->data_offset * 4 - sizeof(tcp_hdr); | |
233 | auto opt_start = reinterpret_cast<uint8_t*>(p.get_header(0, th->data_offset * 4)) + sizeof(tcp_hdr); | |
234 | auto opt_end = opt_start + opt_len; | |
235 | p.trim_front(th->data_offset * 4); | |
236 | tcp_sequence seg_seq = th->seq; | |
237 | ||
238 | // Set RCV.NXT to SEG.SEQ+1, IRS is set to SEG.SEQ | |
239 | _rcv.next = seg_seq + 1; | |
240 | _rcv.initial = seg_seq; | |
241 | ||
242 | // ISS should be selected and a SYN segment sent of the form: | |
243 | // <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK> | |
244 | // SND.NXT is set to ISS+1 and SND.UNA to ISS | |
245 | // NOTE: In previous code, _snd.next is set to ISS + 1 only when SYN is | |
246 | // ACKed. Now, we set _snd.next to ISS + 1 here, so in output_one(): we | |
247 | // have | |
248 | // th->seq = syn_on ? _snd.initial : _snd.next | |
249 | // to make sure retransmitted SYN has correct SEQ number. | |
250 | do_setup_isn(); | |
251 | ||
252 | _rcv.urgent = _rcv.next; | |
253 | ||
254 | ldout(_tcp.cct, 10) << __func__ << " listen: LISTEN -> SYN_RECEIVED" << dendl; | |
255 | init_from_options(th, opt_start, opt_end); | |
256 | do_syn_received(); | |
257 | } | |
258 | ||
259 | template <typename InetTraits> | |
260 | void tcp<InetTraits>::tcb::input_handle_syn_sent_state(tcp_hdr* th, Packet p) | |
261 | { | |
262 | auto opt_len = th->data_offset * 4 - sizeof(tcp_hdr); | |
263 | auto opt_start = reinterpret_cast<uint8_t*>(p.get_header(0, th->data_offset * 4)) + sizeof(tcp_hdr); | |
264 | auto opt_end = opt_start + opt_len; | |
265 | p.trim_front(th->data_offset * 4); | |
266 | tcp_sequence seg_seq = th->seq; | |
267 | auto seg_ack = th->ack; | |
268 | ||
269 | ldout(_tcp.cct, 20) << __func__ << " tcp header seq " << seg_seq.raw << " ack " << seg_ack.raw | |
270 | << " fin=" << bool(th->f_fin) << " syn=" << bool(th->f_syn) << dendl; | |
271 | ||
272 | bool acceptable = false; | |
273 | // 3.1 first check the ACK bit | |
274 | if (th->f_ack) { | |
275 | // If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless the | |
276 | // RST bit is set, if so drop the segment and return) | |
277 | if (seg_ack <= _snd.initial || seg_ack > _snd.next) { | |
278 | return respond_with_reset(th); | |
279 | } | |
280 | ||
281 | // If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable. | |
282 | acceptable = _snd.unacknowledged <= seg_ack && seg_ack <= _snd.next; | |
283 | } | |
284 | ||
285 | // 3.2 second check the RST bit | |
286 | if (th->f_rst) { | |
287 | // If the ACK was acceptable then signal the user "error: connection | |
288 | // reset", drop the segment, enter CLOSED state, delete TCB, and | |
289 | // return. Otherwise (no ACK) drop the segment and return. | |
290 | if (acceptable) { | |
291 | return do_reset(); | |
292 | } else { | |
293 | return; | |
294 | } | |
295 | } | |
296 | ||
297 | // 3.3 third check the security and precedence | |
298 | // NOTE: Ignored for now | |
299 | ||
300 | // 3.4 fourth check the SYN bit | |
301 | if (th->f_syn) { | |
302 | // RCV.NXT is set to SEG.SEQ+1, IRS is set to SEG.SEQ. SND.UNA should | |
303 | // be advanced to equal SEG.ACK (if there is an ACK), and any segments | |
304 | // on the retransmission queue which are thereby acknowledged should be | |
305 | // removed. | |
306 | _rcv.next = seg_seq + 1; | |
307 | _rcv.initial = seg_seq; | |
308 | if (th->f_ack) { | |
309 | // TODO: clean retransmission queue | |
310 | _snd.unacknowledged = seg_ack; | |
311 | } | |
312 | if (_snd.unacknowledged > _snd.initial) { | |
313 | // If SND.UNA > ISS (our SYN has been ACKed), change the connection | |
314 | // state to ESTABLISHED, form an ACK segment | |
315 | // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK> | |
316 | ldout(_tcp.cct, 20) << __func__ << " syn: SYN_SENT -> ESTABLISHED" << dendl; | |
317 | init_from_options(th, opt_start, opt_end); | |
318 | do_established(); | |
319 | output(); | |
320 | } else { | |
321 | // Otherwise enter SYN_RECEIVED, form a SYN,ACK segment | |
322 | // <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK> | |
323 | ldout(_tcp.cct, 20) << __func__ << " syn: SYN_SENT -> SYN_RECEIVED" << dendl; | |
324 | do_syn_received(); | |
325 | } | |
326 | } | |
327 | ||
328 | // 3.5 fifth, if neither of the SYN or RST bits is set then drop the | |
329 | // segment and return. | |
330 | return; | |
331 | } | |
332 | ||
333 | template <typename InetTraits> | |
334 | void tcp<InetTraits>::tcb::input_handle_other_state(tcp_hdr* th, Packet p) | |
335 | { | |
336 | p.trim_front(th->data_offset * 4); | |
337 | bool do_output = false; | |
338 | bool do_output_data = false; | |
339 | tcp_sequence seg_seq = th->seq; | |
340 | auto seg_ack = th->ack; | |
341 | auto seg_len = p.len(); | |
342 | ldout(_tcp.cct, 20) << __func__ << " tcp header seq " << seg_seq.raw << " ack " << seg_ack.raw | |
343 | << " snd next " << _snd.next.raw << " unack " << _snd.unacknowledged.raw | |
344 | << " rcv next " << _rcv.next.raw << " len " << seg_len | |
345 | << " fin=" << bool(th->f_fin) << " syn=" << bool(th->f_syn) << dendl; | |
346 | ||
347 | // 4.1 first check sequence number | |
348 | if (!segment_acceptable(seg_seq, seg_len)) { | |
349 | //<SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK> | |
350 | return output(); | |
351 | } | |
352 | ||
353 | // In the following it is assumed that the segment is the idealized | |
354 | // segment that begins at RCV.NXT and does not exceed the window. | |
355 | if (seg_seq < _rcv.next) { | |
356 | // ignore already acknowledged data | |
357 | auto dup = std::min(uint32_t(_rcv.next - seg_seq), seg_len); | |
358 | ldout(_tcp.cct, 10) << __func__ << " dup segment len " << dup << dendl; | |
359 | p.trim_front(dup); | |
360 | seg_len -= dup; | |
361 | seg_seq += dup; | |
362 | } | |
363 | // FIXME: We should trim data outside the right edge of the receive window as well | |
364 | ||
365 | if (seg_seq != _rcv.next) { | |
366 | ldout(_tcp.cct, 10) << __func__ << " out of order, expect " << _rcv.next.raw | |
367 | << " actual " << seg_seq.raw | |
368 | << " out of order size " << _rcv.out_of_order.map.size() | |
369 | << dendl; | |
370 | insert_out_of_order(seg_seq, std::move(p)); | |
371 | // A TCP receiver SHOULD send an immediate duplicate ACK | |
372 | // when an out-of-order segment arrives. | |
373 | return output(); | |
374 | } | |
375 | ||
376 | // 4.2 second check the RST bit | |
377 | if (th->f_rst) { | |
378 | if (in_state(SYN_RECEIVED)) { | |
379 | // If this connection was initiated with a passive OPEN (i.e., | |
380 | // came from the LISTEN state), then return this connection to | |
381 | // LISTEN state and return. The user need not be informed. If | |
382 | // this connection was initiated with an active OPEN (i.e., came | |
383 | // from SYN_SENT state) then the connection was refused, signal | |
384 | // the user "connection refused". In either case, all segments | |
385 | // on the retransmission queue should be removed. And in the | |
386 | // active OPEN case, enter the CLOSED state and delete the TCB, | |
387 | // and return. | |
388 | errno = -ECONNREFUSED; | |
389 | return do_reset(); | |
390 | } | |
391 | if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_2 | CLOSE_WAIT)) { | |
392 | // If the RST bit is set then, any outstanding RECEIVEs and SEND | |
393 | // should receive "reset" responses. All segment queues should be | |
394 | // flushed. Users should also receive an unsolicited general | |
395 | // "connection reset" signal. Enter the CLOSED state, delete the | |
396 | // TCB, and return. | |
397 | return do_reset(); | |
398 | } | |
399 | if (in_state(CLOSING | LAST_ACK | TIME_WAIT)) { | |
400 | // If the RST bit is set then, enter the CLOSED state, delete the | |
401 | // TCB, and return. | |
402 | return do_closed(); | |
403 | } | |
404 | } | |
405 | ||
406 | // 4.3 third check security and precedence | |
407 | // NOTE: Ignored for now | |
408 | ||
409 | // 4.4 fourth, check the SYN bit | |
410 | if (th->f_syn) { | |
411 | // SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2 | |
412 | // CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT | |
413 | ||
414 | // If the SYN is in the window it is an error, send a reset, any | |
415 | // outstanding RECEIVEs and SEND should receive "reset" responses, | |
416 | // all segment queues should be flushed, the user should also | |
417 | // receive an unsolicited general "connection reset" signal, enter | |
418 | // the CLOSED state, delete the TCB, and return. | |
419 | respond_with_reset(th); | |
420 | return do_reset(); | |
421 | ||
422 | // If the SYN is not in the window this step would not be reached | |
423 | // and an ack would have been sent in the first step (sequence | |
424 | // number check). | |
425 | } | |
426 | ||
427 | // 4.5 fifth check the ACK field | |
428 | if (!th->f_ack) { | |
429 | // if the ACK bit is off drop the segment and return | |
430 | return; | |
431 | } else { | |
432 | // SYN_RECEIVED STATE | |
433 | if (in_state(SYN_RECEIVED)) { | |
434 | // If SND.UNA =< SEG.ACK =< SND.NXT then enter ESTABLISHED state | |
435 | // and continue processing. | |
436 | if (_snd.unacknowledged <= seg_ack && seg_ack <= _snd.next) { | |
437 | ldout(_tcp.cct, 20) << __func__ << " SYN_RECEIVED -> ESTABLISHED" << dendl; | |
438 | do_established(); | |
439 | if (_tcp.push_listen_queue(_local_port, this)) { | |
440 | ldout(_tcp.cct, 20) << __func__ << " successfully accepting socket" << dendl; | |
441 | } else { | |
442 | ldout(_tcp.cct, 5) << __func__ << " not exist listener or full queue, reset" << dendl; | |
443 | return respond_with_reset(th); | |
444 | } | |
445 | } else { | |
446 | // <SEQ=SEG.ACK><CTL=RST> | |
447 | return respond_with_reset(th); | |
448 | } | |
449 | } | |
450 | auto update_window = [this, th, seg_seq, seg_ack] { | |
451 | ldout(_tcp.cct, 20) << __func__ << " window update seg_seq=" << seg_seq | |
452 | << " seg_ack=" << seg_ack << " old window=" << th->window | |
453 | << " new window=" << int(_snd.window_scale) << dendl; | |
454 | _snd.window = th->window << _snd.window_scale; | |
455 | _snd.wl1 = seg_seq; | |
456 | _snd.wl2 = seg_ack; | |
457 | if (_snd.window == 0) { | |
458 | _persist_time_out = _rto; | |
459 | start_persist_timer(); | |
460 | } else { | |
461 | stop_persist_timer(); | |
462 | } | |
463 | }; | |
464 | // ESTABLISHED STATE or | |
465 | // CLOSE_WAIT STATE: Do the same processing as for the ESTABLISHED state. | |
466 | if (in_state(ESTABLISHED | CLOSE_WAIT)) { | |
467 | // If SND.UNA < SEG.ACK =< SND.NXT then, set SND.UNA <- SEG.ACK. | |
468 | if (_snd.unacknowledged < seg_ack && seg_ack <= _snd.next) { | |
469 | // Remote ACKed data we sent | |
470 | auto acked_bytes = data_segment_acked(seg_ack); | |
471 | ||
472 | // If SND.UNA < SEG.ACK =< SND.NXT, the send window should be updated. | |
473 | if (_snd.wl1 < seg_seq || (_snd.wl1 == seg_seq && _snd.wl2 <= seg_ack)) { | |
474 | update_window(); | |
475 | } | |
476 | ||
477 | // some data is acked, try send more data | |
478 | do_output_data = true; | |
479 | ||
480 | auto set_retransmit_timer = [this] { | |
481 | if (_snd.data.empty()) { | |
482 | // All outstanding segments are acked, turn off the timer. | |
483 | stop_retransmit_timer(); | |
484 | // Signal the waiter of this event | |
485 | signal_all_data_acked(); | |
486 | } else { | |
487 | // Restart the timer becasue new data is acked. | |
488 | start_retransmit_timer(); | |
489 | } | |
490 | }; | |
491 | ||
492 | if (_snd.dupacks >= 3) { | |
493 | // We are in fast retransmit / fast recovery phase | |
494 | uint32_t smss = _snd.mss; | |
495 | if (seg_ack > _snd.recover) { | |
496 | ldout(_tcp.cct, 20) << __func__ << " ack: full_ack" << dendl; | |
497 | // Set cwnd to min (ssthresh, max(FlightSize, SMSS) + SMSS) | |
498 | _snd.cwnd = std::min(_snd.ssthresh, std::max(flight_size(), smss) + smss); | |
499 | // Exit the fast recovery procedure | |
500 | exit_fast_recovery(); | |
501 | set_retransmit_timer(); | |
502 | } else { | |
503 | ldout(_tcp.cct, 20) << __func__ << " ack: partial_ack" << dendl; | |
504 | // Retransmit the first unacknowledged segment | |
505 | fast_retransmit(); | |
506 | // Deflate the congestion window by the amount of new data | |
507 | // acknowledged by the Cumulative Acknowledgment field | |
508 | _snd.cwnd -= acked_bytes; | |
509 | // If the partial ACK acknowledges at least one SMSS of new | |
510 | // data, then add back SMSS bytes to the congestion window | |
511 | if (acked_bytes >= smss) { | |
512 | _snd.cwnd += smss; | |
513 | } | |
514 | // Send a new segment if permitted by the new value of | |
515 | // cwnd. Do not exit the fast recovery procedure For | |
516 | // the first partial ACK that arrives during fast | |
517 | // recovery, also reset the retransmit timer. | |
518 | if (++_snd.partial_ack == 1) { | |
519 | start_retransmit_timer(); | |
520 | } | |
521 | } | |
522 | } else { | |
523 | // RFC5681: The fast retransmit algorithm uses the arrival | |
524 | // of 3 duplicate ACKs (as defined in section 2, without | |
525 | // any intervening ACKs which move SND.UNA) as an | |
526 | // indication that a segment has been lost. | |
527 | // | |
528 | // So, here we reset dupacks to zero becasue this ACK moves | |
529 | // SND.UNA. | |
530 | exit_fast_recovery(); | |
531 | set_retransmit_timer(); | |
532 | } | |
533 | } else if (!_snd.data.empty() && seg_len == 0 && | |
534 | th->f_fin == 0 && th->f_syn == 0 && | |
535 | th->ack == _snd.unacknowledged && | |
536 | uint32_t(th->window << _snd.window_scale) == _snd.window) { | |
537 | // Note: | |
538 | // RFC793 states: | |
539 | // If the ACK is a duplicate (SEG.ACK < SND.UNA), it can be ignored | |
540 | // RFC5681 states: | |
541 | // The TCP sender SHOULD use the "fast retransmit" algorithm to detect | |
542 | // and repair loss, based on incoming duplicate ACKs. | |
543 | // Here, We follow RFC5681. | |
544 | _snd.dupacks++; | |
545 | uint32_t smss = _snd.mss; | |
546 | // 3 duplicated ACKs trigger a fast retransmit | |
547 | if (_snd.dupacks == 1 || _snd.dupacks == 2) { | |
548 | // RFC5681 Step 3.1 | |
549 | // Send cwnd + 2 * smss per RFC3042 | |
550 | do_output_data = true; | |
551 | } else if (_snd.dupacks == 3) { | |
552 | // RFC6582 Step 3.2 | |
553 | if (seg_ack - 1 > _snd.recover) { | |
554 | _snd.recover = _snd.next - 1; | |
555 | // RFC5681 Step 3.2 | |
556 | _snd.ssthresh = std::max((flight_size() - _snd.limited_transfer) / 2, 2 * smss); | |
557 | fast_retransmit(); | |
558 | } else { | |
559 | // Do not enter fast retransmit and do not reset ssthresh | |
560 | } | |
561 | // RFC5681 Step 3.3 | |
562 | _snd.cwnd = _snd.ssthresh + 3 * smss; | |
563 | } else if (_snd.dupacks > 3) { | |
564 | // RFC5681 Step 3.4 | |
565 | _snd.cwnd += smss; | |
566 | // RFC5681 Step 3.5 | |
567 | do_output_data = true; | |
568 | } | |
569 | } else if (seg_ack > _snd.next) { | |
570 | // If the ACK acks something not yet sent (SEG.ACK > SND.NXT) | |
571 | // then send an ACK, drop the segment, and return | |
572 | return output(); | |
573 | } else if (_snd.window == 0 && th->window > 0) { | |
574 | update_window(); | |
575 | do_output_data = true; | |
576 | } | |
577 | } | |
578 | // FIN_WAIT_1 STATE | |
579 | if (in_state(FIN_WAIT_1)) { | |
580 | // In addition to the processing for the ESTABLISHED state, if | |
581 | // our FIN is now acknowledged then enter FIN-WAIT-2 and continue | |
582 | // processing in that state. | |
583 | if (seg_ack == _snd.next + 1) { | |
584 | ldout(_tcp.cct, 20) << __func__ << " ack: FIN_WAIT_1 -> FIN_WAIT_2" << dendl; | |
585 | _state = FIN_WAIT_2; | |
586 | do_local_fin_acked(); | |
587 | } | |
588 | } | |
589 | // FIN_WAIT_2 STATE | |
590 | if (in_state(FIN_WAIT_2)) { | |
591 | // In addition to the processing for the ESTABLISHED state, if | |
592 | // the retransmission queue is empty, the user’s CLOSE can be | |
593 | // acknowledged ("ok") but do not delete the TCB. | |
594 | // TODO | |
595 | } | |
596 | // CLOSING STATE | |
597 | if (in_state(CLOSING)) { | |
598 | if (seg_ack == _snd.next + 1) { | |
599 | ldout(_tcp.cct, 20) << __func__ << " ack: CLOSING -> TIME_WAIT" << dendl; | |
600 | do_local_fin_acked(); | |
601 | return do_time_wait(); | |
602 | } else { | |
603 | return; | |
604 | } | |
605 | } | |
606 | // LAST_ACK STATE | |
607 | if (in_state(LAST_ACK)) { | |
608 | if (seg_ack == _snd.next + 1) { | |
609 | ldout(_tcp.cct, 20) << __func__ << " ack: LAST_ACK -> CLOSED" << dendl; | |
610 | do_local_fin_acked(); | |
611 | return do_closed(); | |
612 | } | |
613 | } | |
614 | // TIME_WAIT STATE | |
615 | if (in_state(TIME_WAIT)) { | |
616 | // The only thing that can arrive in this state is a | |
617 | // retransmission of the remote FIN. Acknowledge it, and restart | |
618 | // the 2 MSL timeout. | |
619 | // TODO | |
620 | } | |
621 | } | |
622 | ||
623 | // 4.6 sixth, check the URG bit | |
624 | if (th->f_urg) { | |
625 | // TODO | |
626 | } | |
627 | ||
628 | // 4.7 seventh, process the segment text | |
629 | if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_2)) { | |
630 | if (p.len()) { | |
631 | // Once the TCP takes responsibility for the data it advances | |
632 | // RCV.NXT over the data accepted, and adjusts RCV.WND as | |
633 | // apporopriate to the current buffer availability. The total of | |
634 | // RCV.NXT and RCV.WND should not be reduced. | |
635 | _rcv.data.push_back(std::move(p)); | |
636 | _rcv.next += seg_len; | |
637 | auto merged = merge_out_of_order(); | |
638 | signal_data_received(); | |
639 | // Send an acknowledgment of the form: | |
640 | // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK> | |
641 | // This acknowledgment should be piggybacked on a segment being | |
642 | // transmitted if possible without incurring undue delay. | |
643 | if (merged) { | |
644 | // TCP receiver SHOULD send an immediate ACK when the | |
645 | // incoming segment fills in all or part of a gap in the | |
646 | // sequence space. | |
647 | do_output = true; | |
648 | } else { | |
649 | do_output = should_send_ack(seg_len); | |
650 | } | |
651 | ldout(_tcp.cct, 20) << __func__ << " merged=" << merged << " do_output=" << do_output << dendl; | |
652 | } | |
653 | } else if (in_state(CLOSE_WAIT | CLOSING | LAST_ACK | TIME_WAIT)) { | |
654 | // This should not occur, since a FIN has been received from the | |
655 | // remote side. Ignore the segment text. | |
656 | return; | |
657 | } | |
658 | ||
659 | // 4.8 eighth, check the FIN bit | |
660 | if (th->f_fin) { | |
661 | if (in_state(CLOSED | LISTEN | SYN_SENT)) { | |
662 | // Do not process the FIN if the state is CLOSED, LISTEN or SYN-SENT | |
663 | // since the SEG.SEQ cannot be validated; drop the segment and return. | |
664 | return; | |
665 | } | |
666 | auto fin_seq = seg_seq + seg_len; | |
667 | if (fin_seq == _rcv.next) { | |
668 | _rcv.next = fin_seq + 1; | |
669 | ||
670 | // If this <FIN> packet contains data as well, we can ACK both data | |
671 | // and <FIN> in a single packet, so canncel the previous ACK. | |
672 | clear_delayed_ack(); | |
673 | do_output = false; | |
674 | // Send ACK for the FIN! | |
675 | output(); | |
676 | signal_data_received(); | |
677 | _errno = 0; | |
678 | ||
679 | if (in_state(SYN_RECEIVED | ESTABLISHED)) { | |
680 | ldout(_tcp.cct, 20) << __func__ << " fin: SYN_RECEIVED or ESTABLISHED -> CLOSE_WAIT" << dendl; | |
681 | _state = CLOSE_WAIT; | |
682 | // EOF | |
683 | } | |
684 | if (in_state(FIN_WAIT_1)) { | |
685 | // If our FIN has been ACKed (perhaps in this segment), then | |
686 | // enter TIME-WAIT, start the time-wait timer, turn off the other | |
687 | // timers; otherwise enter the CLOSING state. | |
688 | // Note: If our FIN has been ACKed, we should be in FIN_WAIT_2 | |
689 | // not FIN_WAIT_1 if we reach here. | |
690 | ldout(_tcp.cct, 20) << __func__ << " fin: FIN_WAIT_1 -> CLOSING" << dendl; | |
691 | _state = CLOSING; | |
692 | } | |
693 | if (in_state(FIN_WAIT_2)) { | |
694 | ldout(_tcp.cct, 20) << __func__ << " fin: FIN_WAIT_2 -> TIME_WAIT" << dendl; | |
695 | return do_time_wait(); | |
696 | } | |
697 | } | |
698 | } | |
699 | if (do_output || (do_output_data && can_send())) { | |
700 | // Since we will do output, we can canncel scheduled delayed ACK. | |
701 | clear_delayed_ack(); | |
702 | output(); | |
703 | } | |
704 | } | |
705 | ||
706 | template <typename InetTraits> | |
707 | void tcp<InetTraits>::tcb::connect() | |
708 | { | |
709 | ldout(_tcp.cct, 20) << __func__ << dendl; | |
710 | // An initial send sequence number (ISS) is selected. A SYN segment of the | |
711 | // form <SEQ=ISS><CTL=SYN> is sent. Set SND.UNA to ISS, SND.NXT to ISS+1, | |
712 | // enter SYN-SENT state, and return. | |
713 | do_setup_isn(); | |
714 | ||
715 | // Local receive window scale factor | |
716 | _rcv.window_scale = _option._local_win_scale = 7; | |
717 | // Maximum segment size local can receive | |
718 | _rcv.mss = _option._local_mss = local_mss(); | |
719 | // Linux's default window size | |
720 | _rcv.window = 29200 << _rcv.window_scale; | |
721 | ||
722 | do_syn_sent(); | |
723 | } | |
724 | ||
725 | template <typename InetTraits> | |
726 | void tcp<InetTraits>::tcb::close_final_cleanup() | |
727 | { | |
728 | if (_snd._all_data_acked_fd >= 0) { | |
729 | center->delete_file_event(_snd._all_data_acked_fd, EVENT_READABLE); | |
730 | _tcp.manager.close(_snd._all_data_acked_fd); | |
731 | _snd._all_data_acked_fd = -1; | |
732 | } | |
733 | ||
734 | _snd.closed = true; | |
735 | signal_data_received(); | |
736 | ldout(_tcp.cct, 20) << __func__ << " unsent_len=" << _snd.unsent_len << dendl; | |
737 | if (in_state(CLOSE_WAIT)) { | |
738 | ldout(_tcp.cct, 20) << __func__ << " CLOSE_WAIT -> LAST_ACK" << dendl; | |
739 | _state = LAST_ACK; | |
740 | } else if (in_state(ESTABLISHED)) { | |
741 | ldout(_tcp.cct, 20) << __func__ << " ESTABLISHED -> FIN_WAIT_1" << dendl; | |
742 | _state = FIN_WAIT_1; | |
743 | } | |
744 | // Send <FIN> to remote | |
745 | // Note: we call output_one to make sure a packet with FIN actually | |
746 | // sent out. If we only call output() and _packetq is not empty, | |
747 | // tcp::tcb::get_packet(), packet with FIN will not be generated. | |
748 | output_one(); | |
749 | output(); | |
750 | center->delete_file_event(fd, EVENT_READABLE|EVENT_WRITABLE); | |
751 | } | |
752 | ||
753 | template <typename InetTraits> | |
754 | void tcp<InetTraits>::tcb::retransmit() | |
755 | { | |
756 | auto output_update_rto = [this] { | |
757 | output(); | |
758 | // According to RFC6298, Update RTO <- RTO * 2 to perform binary exponential back-off | |
759 | this->_rto = std::min(this->_rto * 2, this->_rto_max); | |
760 | start_retransmit_timer(); | |
761 | }; | |
762 | ||
763 | // Retransmit SYN | |
764 | if (syn_needs_on()) { | |
765 | if (_snd.syn_retransmit++ < _max_nr_retransmit) { | |
766 | output_update_rto(); | |
767 | } else { | |
768 | _errno = -ECONNABORTED; | |
769 | ldout(_tcp.cct, 5) << __func__ << " syn retransmit exceed max " | |
770 | << _max_nr_retransmit << dendl; | |
771 | _errno = -ETIMEDOUT; | |
772 | cleanup(); | |
773 | return; | |
774 | } | |
775 | } | |
776 | ||
777 | // Retransmit FIN | |
778 | if (fin_needs_on()) { | |
779 | if (_snd.fin_retransmit++ < _max_nr_retransmit) { | |
780 | output_update_rto(); | |
781 | } else { | |
782 | ldout(_tcp.cct, 5) << __func__ << " fin retransmit exceed max " | |
783 | << _max_nr_retransmit << dendl; | |
784 | _errno = -ETIMEDOUT; | |
785 | cleanup(); | |
786 | return; | |
787 | } | |
788 | } | |
789 | ||
790 | // Retransmit Data | |
791 | if (_snd.data.empty()) { | |
792 | return; | |
793 | } | |
794 | ||
795 | // If there are unacked data, retransmit the earliest segment | |
796 | auto& unacked_seg = _snd.data.front(); | |
797 | ||
798 | // According to RFC5681 | |
799 | // Update ssthresh only for the first retransmit | |
800 | uint32_t smss = _snd.mss; | |
801 | if (unacked_seg.nr_transmits == 0) { | |
802 | _snd.ssthresh = std::max(flight_size() / 2, 2 * smss); | |
803 | } | |
804 | // RFC6582 Step 4 | |
805 | _snd.recover = _snd.next - 1; | |
806 | // Start the slow start process | |
807 | _snd.cwnd = smss; | |
808 | // End fast recovery | |
809 | exit_fast_recovery(); | |
810 | ||
811 | ldout(_tcp.cct, 20) << __func__ << " unack data size " << _snd.data.size() | |
812 | << " nr=" << unacked_seg.nr_transmits << dendl; | |
813 | if (unacked_seg.nr_transmits < _max_nr_retransmit) { | |
814 | unacked_seg.nr_transmits++; | |
815 | } else { | |
816 | // Delete connection when max num of retransmission is reached | |
817 | ldout(_tcp.cct, 5) << __func__ << " seg retransmit exceed max " | |
818 | << _max_nr_retransmit << dendl; | |
819 | _errno = -ETIMEDOUT; | |
820 | cleanup(); | |
821 | return; | |
822 | } | |
823 | retransmit_one(); | |
824 | ||
825 | output_update_rto(); | |
826 | } | |
827 | ||
828 | template <typename InetTraits> | |
829 | void tcp<InetTraits>::tcb::persist() { | |
830 | ldout(_tcp.cct, 20) << __func__ << " persist timer fired" << dendl; | |
831 | // Send 1 byte packet to probe peer's window size | |
832 | _snd.window_probe = true; | |
833 | output_one(); | |
834 | _snd.window_probe = false; | |
835 | ||
836 | output(); | |
837 | // Perform binary exponential back-off per RFC1122 | |
838 | _persist_time_out = std::min(_persist_time_out * 2, _rto_max); | |
839 | start_persist_timer(); | |
840 | } |