]> git.proxmox.com Git - ceph.git/blob - ceph/src/msg/async/dpdk/TCP.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / msg / async / dpdk / TCP.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 /*
3 * This file is open source software, licensed to you under the terms
4 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
5 * distributed with this work for additional information regarding copyright
6 * ownership. You may not use this file except in compliance with the License.
7 *
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19 /*
20 * Copyright (C) 2014 Cloudius Systems, Ltd.
21 */
22
23 #include "align.h"
24 #include "TCP.h"
25 #include "IP.h"
26 #include "DPDKStack.h"
27
28 #include "common/dout.h"
29 #include "include/ceph_assert.h"
30
31 #define dout_subsys ceph_subsys_dpdk
32 #undef dout_prefix
33 #define dout_prefix *_dout << "tcp "
34
35 void tcp_option::parse(uint8_t* beg, uint8_t* end)
36 {
37 while (beg < end) {
38 auto kind = option_kind(*beg);
39 if (kind != option_kind::nop && kind != option_kind::eol) {
40 // Make sure there is enough room for this option
41 auto len = *(beg + 1);
42 if (beg + len > end) {
43 return;
44 }
45 }
46 switch (kind) {
47 case option_kind::mss:
48 _mss_received = true;
49 _remote_mss = ntoh(reinterpret_cast<mss*>(beg)->mss);
50 beg += option_len::mss;
51 break;
52 case option_kind::win_scale:
53 _win_scale_received = true;
54 _remote_win_scale = reinterpret_cast<win_scale*>(beg)->shift;
55 // We can turn on win_scale option, 7 is Linux's default win scale size
56 _local_win_scale = 7;
57 beg += option_len::win_scale;
58 break;
59 case option_kind::sack:
60 _sack_received = true;
61 beg += option_len::sack;
62 break;
63 case option_kind::nop:
64 beg += option_len::nop;
65 break;
66 case option_kind::eol:
67 return;
68 default:
69 // Ignore options we do not understand
70 auto len = *(beg + 1);
71 beg += len;
72 // Prevent infinite loop
73 if (len == 0) {
74 return;
75 }
76 break;
77 }
78 }
79 }
80
81 uint8_t tcp_option::fill(tcp_hdr* th, uint8_t options_size)
82 {
83 auto hdr = reinterpret_cast<uint8_t*>(th);
84 auto off = hdr + sizeof(tcp_hdr);
85 uint8_t size = 0;
86 bool syn_on = th->f_syn;
87 bool ack_on = th->f_ack;
88
89 if (syn_on) {
90 if (_mss_received || !ack_on) {
91 auto mss = new (off) tcp_option::mss;
92 mss->mss = _local_mss;
93 off += mss->len;
94 size += mss->len;
95 *mss = mss->hton();
96 }
97 if (_win_scale_received || !ack_on) {
98 auto win_scale = new (off) tcp_option::win_scale;
99 win_scale->shift = _local_win_scale;
100 off += win_scale->len;
101 size += win_scale->len;
102 }
103 }
104 if (size > 0) {
105 // Insert NOP option
106 auto size_max = align_up(uint8_t(size + 1), tcp_option::align);
107 while (size < size_max - uint8_t(option_len::eol)) {
108 new (off) tcp_option::nop;
109 off += option_len::nop;
110 size += option_len::nop;
111 }
112 new (off) tcp_option::eol;
113 size += option_len::eol;
114 }
115 ceph_assert(size == options_size);
116
117 return size;
118 }
119
120 uint8_t tcp_option::get_size(bool syn_on, bool ack_on)
121 {
122 uint8_t size = 0;
123 if (syn_on) {
124 if (_mss_received || !ack_on) {
125 size += option_len::mss;
126 }
127 if (_win_scale_received || !ack_on) {
128 size += option_len::win_scale;
129 }
130 }
131 if (size > 0) {
132 size += option_len::eol;
133 // Insert NOP option to align on 32-bit
134 size = align_up(size, tcp_option::align);
135 }
136 return size;
137 }
138
139 ipv4_tcp::ipv4_tcp(ipv4& inet, EventCenter *c)
140 : _inet_l4(inet), _tcp(std::unique_ptr<tcp<ipv4_traits>>(new tcp<ipv4_traits>(inet.cct, _inet_l4, c)))
141 { }
142
143 ipv4_tcp::~ipv4_tcp() { }
144
145 void ipv4_tcp::received(Packet p, ipv4_address from, ipv4_address to)
146 {
147 _tcp->received(std::move(p), from, to);
148 }
149
150 bool ipv4_tcp::forward(forward_hash& out_hash_data, Packet& p, size_t off)
151 {
152 return _tcp->forward(out_hash_data, p, off);
153 }
154
155 int tcpv4_listen(tcp<ipv4_traits>& tcpv4, uint16_t port, const SocketOptions &opts,
156 int type, unsigned addr_slot, ServerSocket *sock)
157 {
158 auto p = new DPDKServerSocketImpl<tcp<ipv4_traits>>(tcpv4, port, opts,
159 type, addr_slot);
160 int r = p->listen();
161 if (r < 0) {
162 delete p;
163 return r;
164 }
165 *sock = ServerSocket(std::unique_ptr<ServerSocketImpl>(p));
166 return 0;
167 }
168
169 int tcpv4_connect(tcp<ipv4_traits>& tcpv4, const entity_addr_t &addr,
170 ConnectedSocket *sock)
171 {
172 auto conn = tcpv4.connect(addr);
173 *sock = ConnectedSocket(std::unique_ptr<ConnectedSocketImpl>(
174 new NativeConnectedSocketImpl<tcp<ipv4_traits>>(std::move(conn))));
175 return 0;
176 }
177
178 template <typename InetTraits>
179 void tcp<InetTraits>::respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip)
180 {
181 ldout(cct, 20) << __func__ << " tcp header rst=" << bool(rth->f_rst) << " fin=" << bool(rth->f_fin)
182 << " syn=" << bool(rth->f_syn) << dendl;
183 if (rth->f_rst) {
184 return;
185 }
186 Packet p;
187 auto th = p.prepend_header<tcp_hdr>();
188 th->src_port = rth->dst_port;
189 th->dst_port = rth->src_port;
190 if (rth->f_ack) {
191 th->seq = rth->ack;
192 }
193 // If this RST packet is in response to a SYN packet. We ACK the ISN.
194 if (rth->f_syn) {
195 th->ack = rth->seq + 1;
196 th->f_ack = true;
197 }
198 th->f_rst = true;
199 th->data_offset = sizeof(*th) / 4;
200 th->checksum = 0;
201 *th = th->hton();
202
203 checksummer csum;
204 offload_info oi;
205 InetTraits::tcp_pseudo_header_checksum(csum, local_ip, foreign_ip, sizeof(*th));
206 if (get_hw_features().tx_csum_l4_offload) {
207 th->checksum = ~csum.get();
208 oi.needs_csum = true;
209 } else {
210 csum.sum(p);
211 th->checksum = csum.get();
212 oi.needs_csum = false;
213 }
214
215 oi.protocol = ip_protocol_num::tcp;
216 oi.tcp_hdr_len = sizeof(tcp_hdr);
217 p.set_offload_info(oi);
218
219 send_packet_without_tcb(local_ip, foreign_ip, std::move(p));
220 }
221
222 #undef dout_prefix
223 #define dout_prefix _prefix(_dout)
224 template<typename InetTraits>
225 std::ostream& tcp<InetTraits>::tcb::_prefix(std::ostream *_dout) {
226 return *_dout << "tcp " << _local_ip << ":" << _local_port << " -> " << _foreign_ip << ":" << _foreign_port
227 << " tcb(" << this << " fd=" << fd << " s=" << _state << ").";
228 }
229
230 template<typename InetTraits>
231 void tcp<InetTraits>::tcb::input_handle_listen_state(tcp_hdr* th, Packet p)
232 {
233 auto opt_len = th->data_offset * 4 - sizeof(tcp_hdr);
234 auto opt_start = reinterpret_cast<uint8_t*>(p.get_header(0, th->data_offset * 4)) + sizeof(tcp_hdr);
235 auto opt_end = opt_start + opt_len;
236 p.trim_front(th->data_offset * 4);
237 tcp_sequence seg_seq = th->seq;
238
239 // Set RCV.NXT to SEG.SEQ+1, IRS is set to SEG.SEQ
240 _rcv.next = seg_seq + 1;
241 _rcv.initial = seg_seq;
242
243 // ISS should be selected and a SYN segment sent of the form:
244 // <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK>
245 // SND.NXT is set to ISS+1 and SND.UNA to ISS
246 // NOTE: In previous code, _snd.next is set to ISS + 1 only when SYN is
247 // ACKed. Now, we set _snd.next to ISS + 1 here, so in output_one(): we
248 // have
249 // th->seq = syn_on ? _snd.initial : _snd.next
250 // to make sure retransmitted SYN has correct SEQ number.
251 do_setup_isn();
252
253 _rcv.urgent = _rcv.next;
254
255 ldout(_tcp.cct, 10) << __func__ << " listen: LISTEN -> SYN_RECEIVED" << dendl;
256 init_from_options(th, opt_start, opt_end);
257 do_syn_received();
258 }
259
260 template <typename InetTraits>
261 void tcp<InetTraits>::tcb::input_handle_syn_sent_state(tcp_hdr* th, Packet p)
262 {
263 auto opt_len = th->data_offset * 4 - sizeof(tcp_hdr);
264 auto opt_start = reinterpret_cast<uint8_t*>(p.get_header(0, th->data_offset * 4)) + sizeof(tcp_hdr);
265 auto opt_end = opt_start + opt_len;
266 p.trim_front(th->data_offset * 4);
267 tcp_sequence seg_seq = th->seq;
268 auto seg_ack = th->ack;
269
270 ldout(_tcp.cct, 20) << __func__ << " tcp header seq " << seg_seq.raw << " ack " << seg_ack.raw
271 << " fin=" << bool(th->f_fin) << " syn=" << bool(th->f_syn) << dendl;
272
273 bool acceptable = false;
274 // 3.1 first check the ACK bit
275 if (th->f_ack) {
276 // If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless the
277 // RST bit is set, if so drop the segment and return)
278 if (seg_ack <= _snd.initial || seg_ack > _snd.next) {
279 return respond_with_reset(th);
280 }
281
282 // If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable.
283 acceptable = _snd.unacknowledged <= seg_ack && seg_ack <= _snd.next;
284 }
285
286 // 3.2 second check the RST bit
287 if (th->f_rst) {
288 // If the ACK was acceptable then signal the user "error: connection
289 // reset", drop the segment, enter CLOSED state, delete TCB, and
290 // return. Otherwise (no ACK) drop the segment and return.
291 if (acceptable) {
292 return do_reset();
293 } else {
294 return;
295 }
296 }
297
298 // 3.3 third check the security and precedence
299 // NOTE: Ignored for now
300
301 // 3.4 fourth check the SYN bit
302 if (th->f_syn) {
303 // RCV.NXT is set to SEG.SEQ+1, IRS is set to SEG.SEQ. SND.UNA should
304 // be advanced to equal SEG.ACK (if there is an ACK), and any segments
305 // on the retransmission queue which are thereby acknowledged should be
306 // removed.
307 _rcv.next = seg_seq + 1;
308 _rcv.initial = seg_seq;
309 if (th->f_ack) {
310 // TODO: clean retransmission queue
311 _snd.unacknowledged = seg_ack;
312 }
313 if (_snd.unacknowledged > _snd.initial) {
314 // If SND.UNA > ISS (our SYN has been ACKed), change the connection
315 // state to ESTABLISHED, form an ACK segment
316 // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
317 ldout(_tcp.cct, 20) << __func__ << " syn: SYN_SENT -> ESTABLISHED" << dendl;
318 init_from_options(th, opt_start, opt_end);
319 do_established();
320 output();
321 } else {
322 // Otherwise enter SYN_RECEIVED, form a SYN,ACK segment
323 // <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK>
324 ldout(_tcp.cct, 20) << __func__ << " syn: SYN_SENT -> SYN_RECEIVED" << dendl;
325 do_syn_received();
326 }
327 }
328
329 // 3.5 fifth, if neither of the SYN or RST bits is set then drop the
330 // segment and return.
331 return;
332 }
333
334 template <typename InetTraits>
335 void tcp<InetTraits>::tcb::input_handle_other_state(tcp_hdr* th, Packet p)
336 {
337 p.trim_front(th->data_offset * 4);
338 bool do_output = false;
339 bool do_output_data = false;
340 tcp_sequence seg_seq = th->seq;
341 auto seg_ack = th->ack;
342 auto seg_len = p.len();
343 ldout(_tcp.cct, 20) << __func__ << " tcp header seq " << seg_seq.raw << " ack " << seg_ack.raw
344 << " snd next " << _snd.next.raw << " unack " << _snd.unacknowledged.raw
345 << " rcv next " << _rcv.next.raw << " len " << seg_len
346 << " fin=" << bool(th->f_fin) << " syn=" << bool(th->f_syn) << dendl;
347
348 // 4.1 first check sequence number
349 if (!segment_acceptable(seg_seq, seg_len)) {
350 //<SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
351 return output();
352 }
353
354 // In the following it is assumed that the segment is the idealized
355 // segment that begins at RCV.NXT and does not exceed the window.
356 if (seg_seq < _rcv.next) {
357 // ignore already acknowledged data
358 auto dup = std::min(uint32_t(_rcv.next - seg_seq), seg_len);
359 ldout(_tcp.cct, 10) << __func__ << " dup segment len " << dup << dendl;
360 p.trim_front(dup);
361 seg_len -= dup;
362 seg_seq += dup;
363 }
364 // FIXME: We should trim data outside the right edge of the receive window as well
365
366 if (seg_seq != _rcv.next) {
367 ldout(_tcp.cct, 10) << __func__ << " out of order, expect " << _rcv.next.raw
368 << " actual " << seg_seq.raw
369 << " out of order size " << _rcv.out_of_order.map.size()
370 << dendl;
371 insert_out_of_order(seg_seq, std::move(p));
372 // A TCP receiver SHOULD send an immediate duplicate ACK
373 // when an out-of-order segment arrives.
374 return output();
375 }
376
377 // 4.2 second check the RST bit
378 if (th->f_rst) {
379 if (in_state(SYN_RECEIVED)) {
380 // If this connection was initiated with a passive OPEN (i.e.,
381 // came from the LISTEN state), then return this connection to
382 // LISTEN state and return. The user need not be informed. If
383 // this connection was initiated with an active OPEN (i.e., came
384 // from SYN_SENT state) then the connection was refused, signal
385 // the user "connection refused". In either case, all segments
386 // on the retransmission queue should be removed. And in the
387 // active OPEN case, enter the CLOSED state and delete the TCB,
388 // and return.
389 errno = -ECONNREFUSED;
390 return do_reset();
391 }
392 if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_2 | CLOSE_WAIT)) {
393 // If the RST bit is set then, any outstanding RECEIVEs and SEND
394 // should receive "reset" responses. All segment queues should be
395 // flushed. Users should also receive an unsolicited general
396 // "connection reset" signal. Enter the CLOSED state, delete the
397 // TCB, and return.
398 return do_reset();
399 }
400 if (in_state(CLOSING | LAST_ACK | TIME_WAIT)) {
401 // If the RST bit is set then, enter the CLOSED state, delete the
402 // TCB, and return.
403 return do_closed();
404 }
405 }
406
407 // 4.3 third check security and precedence
408 // NOTE: Ignored for now
409
410 // 4.4 fourth, check the SYN bit
411 if (th->f_syn) {
412 // SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2
413 // CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT
414
415 // If the SYN is in the window it is an error, send a reset, any
416 // outstanding RECEIVEs and SEND should receive "reset" responses,
417 // all segment queues should be flushed, the user should also
418 // receive an unsolicited general "connection reset" signal, enter
419 // the CLOSED state, delete the TCB, and return.
420 respond_with_reset(th);
421 return do_reset();
422
423 // If the SYN is not in the window this step would not be reached
424 // and an ack would have been sent in the first step (sequence
425 // number check).
426 }
427
428 // 4.5 fifth check the ACK field
429 if (!th->f_ack) {
430 // if the ACK bit is off drop the segment and return
431 return;
432 } else {
433 // SYN_RECEIVED STATE
434 if (in_state(SYN_RECEIVED)) {
435 // If SND.UNA =< SEG.ACK =< SND.NXT then enter ESTABLISHED state
436 // and continue processing.
437 if (_snd.unacknowledged <= seg_ack && seg_ack <= _snd.next) {
438 ldout(_tcp.cct, 20) << __func__ << " SYN_RECEIVED -> ESTABLISHED" << dendl;
439 do_established();
440 if (_tcp.push_listen_queue(_local_port, this)) {
441 ldout(_tcp.cct, 20) << __func__ << " successfully accepting socket" << dendl;
442 } else {
443 ldout(_tcp.cct, 5) << __func__ << " not exist listener or full queue, reset" << dendl;
444 return respond_with_reset(th);
445 }
446 } else {
447 // <SEQ=SEG.ACK><CTL=RST>
448 return respond_with_reset(th);
449 }
450 }
451 auto update_window = [this, th, seg_seq, seg_ack] {
452 ldout(_tcp.cct, 20) << __func__ << " window update seg_seq=" << seg_seq
453 << " seg_ack=" << seg_ack << " old window=" << th->window
454 << " new window=" << int(_snd.window_scale) << dendl;
455 _snd.window = th->window << _snd.window_scale;
456 _snd.wl1 = seg_seq;
457 _snd.wl2 = seg_ack;
458 if (_snd.window == 0) {
459 _persist_time_out = _rto;
460 start_persist_timer();
461 } else {
462 stop_persist_timer();
463 }
464 };
465 // ESTABLISHED STATE or
466 // CLOSE_WAIT STATE: Do the same processing as for the ESTABLISHED state.
467 if (in_state(ESTABLISHED | CLOSE_WAIT)) {
468 // If SND.UNA < SEG.ACK =< SND.NXT then, set SND.UNA <- SEG.ACK.
469 if (_snd.unacknowledged < seg_ack && seg_ack <= _snd.next) {
470 // Remote ACKed data we sent
471 auto acked_bytes = data_segment_acked(seg_ack);
472
473 // If SND.UNA < SEG.ACK =< SND.NXT, the send window should be updated.
474 if (_snd.wl1 < seg_seq || (_snd.wl1 == seg_seq && _snd.wl2 <= seg_ack)) {
475 update_window();
476 }
477
478 // some data is acked, try send more data
479 do_output_data = true;
480
481 auto set_retransmit_timer = [this] {
482 if (_snd.data.empty()) {
483 // All outstanding segments are acked, turn off the timer.
484 stop_retransmit_timer();
485 // Signal the waiter of this event
486 signal_all_data_acked();
487 } else {
488 // Restart the timer becasue new data is acked.
489 start_retransmit_timer();
490 }
491 };
492
493 if (_snd.dupacks >= 3) {
494 // We are in fast retransmit / fast recovery phase
495 uint32_t smss = _snd.mss;
496 if (seg_ack > _snd.recover) {
497 ldout(_tcp.cct, 20) << __func__ << " ack: full_ack" << dendl;
498 // Set cwnd to min (ssthresh, max(FlightSize, SMSS) + SMSS)
499 _snd.cwnd = std::min(_snd.ssthresh, std::max(flight_size(), smss) + smss);
500 // Exit the fast recovery procedure
501 exit_fast_recovery();
502 set_retransmit_timer();
503 } else {
504 ldout(_tcp.cct, 20) << __func__ << " ack: partial_ack" << dendl;
505 // Retransmit the first unacknowledged segment
506 fast_retransmit();
507 // Deflate the congestion window by the amount of new data
508 // acknowledged by the Cumulative Acknowledgment field
509 _snd.cwnd -= acked_bytes;
510 // If the partial ACK acknowledges at least one SMSS of new
511 // data, then add back SMSS bytes to the congestion window
512 if (acked_bytes >= smss) {
513 _snd.cwnd += smss;
514 }
515 // Send a new segment if permitted by the new value of
516 // cwnd. Do not exit the fast recovery procedure For
517 // the first partial ACK that arrives during fast
518 // recovery, also reset the retransmit timer.
519 if (++_snd.partial_ack == 1) {
520 start_retransmit_timer();
521 }
522 }
523 } else {
524 // RFC5681: The fast retransmit algorithm uses the arrival
525 // of 3 duplicate ACKs (as defined in section 2, without
526 // any intervening ACKs which move SND.UNA) as an
527 // indication that a segment has been lost.
528 //
529 // So, here we reset dupacks to zero becasue this ACK moves
530 // SND.UNA.
531 exit_fast_recovery();
532 set_retransmit_timer();
533 }
534 } else if (!_snd.data.empty() && seg_len == 0 &&
535 th->f_fin == 0 && th->f_syn == 0 &&
536 th->ack == _snd.unacknowledged &&
537 uint32_t(th->window << _snd.window_scale) == _snd.window) {
538 // Note:
539 // RFC793 states:
540 // If the ACK is a duplicate (SEG.ACK < SND.UNA), it can be ignored
541 // RFC5681 states:
542 // The TCP sender SHOULD use the "fast retransmit" algorithm to detect
543 // and repair loss, based on incoming duplicate ACKs.
544 // Here, We follow RFC5681.
545 _snd.dupacks++;
546 uint32_t smss = _snd.mss;
547 // 3 duplicated ACKs trigger a fast retransmit
548 if (_snd.dupacks == 1 || _snd.dupacks == 2) {
549 // RFC5681 Step 3.1
550 // Send cwnd + 2 * smss per RFC3042
551 do_output_data = true;
552 } else if (_snd.dupacks == 3) {
553 // RFC6582 Step 3.2
554 if (seg_ack - 1 > _snd.recover) {
555 _snd.recover = _snd.next - 1;
556 // RFC5681 Step 3.2
557 _snd.ssthresh = std::max((flight_size() - _snd.limited_transfer) / 2, 2 * smss);
558 fast_retransmit();
559 } else {
560 // Do not enter fast retransmit and do not reset ssthresh
561 }
562 // RFC5681 Step 3.3
563 _snd.cwnd = _snd.ssthresh + 3 * smss;
564 } else if (_snd.dupacks > 3) {
565 // RFC5681 Step 3.4
566 _snd.cwnd += smss;
567 // RFC5681 Step 3.5
568 do_output_data = true;
569 }
570 } else if (seg_ack > _snd.next) {
571 // If the ACK acks something not yet sent (SEG.ACK > SND.NXT)
572 // then send an ACK, drop the segment, and return
573 return output();
574 } else if (_snd.window == 0 && th->window > 0) {
575 update_window();
576 do_output_data = true;
577 }
578 }
579 // FIN_WAIT_1 STATE
580 if (in_state(FIN_WAIT_1)) {
581 // In addition to the processing for the ESTABLISHED state, if
582 // our FIN is now acknowledged then enter FIN-WAIT-2 and continue
583 // processing in that state.
584 if (seg_ack == _snd.next + 1) {
585 ldout(_tcp.cct, 20) << __func__ << " ack: FIN_WAIT_1 -> FIN_WAIT_2" << dendl;
586 _state = FIN_WAIT_2;
587 do_local_fin_acked();
588 }
589 }
590 // FIN_WAIT_2 STATE
591 if (in_state(FIN_WAIT_2)) {
592 // In addition to the processing for the ESTABLISHED state, if
593 // the retransmission queue is empty, the user’s CLOSE can be
594 // acknowledged ("ok") but do not delete the TCB.
595 // TODO
596 }
597 // CLOSING STATE
598 if (in_state(CLOSING)) {
599 if (seg_ack == _snd.next + 1) {
600 ldout(_tcp.cct, 20) << __func__ << " ack: CLOSING -> TIME_WAIT" << dendl;
601 do_local_fin_acked();
602 return do_time_wait();
603 } else {
604 return;
605 }
606 }
607 // LAST_ACK STATE
608 if (in_state(LAST_ACK)) {
609 if (seg_ack == _snd.next + 1) {
610 ldout(_tcp.cct, 20) << __func__ << " ack: LAST_ACK -> CLOSED" << dendl;
611 do_local_fin_acked();
612 return do_closed();
613 }
614 }
615 // TIME_WAIT STATE
616 if (in_state(TIME_WAIT)) {
617 // The only thing that can arrive in this state is a
618 // retransmission of the remote FIN. Acknowledge it, and restart
619 // the 2 MSL timeout.
620 // TODO
621 }
622 }
623
624 // 4.6 sixth, check the URG bit
625 if (th->f_urg) {
626 // TODO
627 }
628
629 // 4.7 seventh, process the segment text
630 if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_2)) {
631 if (p.len()) {
632 // Once the TCP takes responsibility for the data it advances
633 // RCV.NXT over the data accepted, and adjusts RCV.WND as
634 // apporopriate to the current buffer availability. The total of
635 // RCV.NXT and RCV.WND should not be reduced.
636 _rcv.data.push_back(std::move(p));
637 _rcv.next += seg_len;
638 auto merged = merge_out_of_order();
639 signal_data_received();
640 // Send an acknowledgment of the form:
641 // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
642 // This acknowledgment should be piggybacked on a segment being
643 // transmitted if possible without incurring undue delay.
644 if (merged) {
645 // TCP receiver SHOULD send an immediate ACK when the
646 // incoming segment fills in all or part of a gap in the
647 // sequence space.
648 do_output = true;
649 } else {
650 do_output = should_send_ack(seg_len);
651 }
652 ldout(_tcp.cct, 20) << __func__ << " merged=" << merged << " do_output=" << do_output << dendl;
653 }
654 } else if (in_state(CLOSE_WAIT | CLOSING | LAST_ACK | TIME_WAIT)) {
655 // This should not occur, since a FIN has been received from the
656 // remote side. Ignore the segment text.
657 return;
658 }
659
660 // 4.8 eighth, check the FIN bit
661 if (th->f_fin) {
662 if (in_state(CLOSED | LISTEN | SYN_SENT)) {
663 // Do not process the FIN if the state is CLOSED, LISTEN or SYN-SENT
664 // since the SEG.SEQ cannot be validated; drop the segment and return.
665 return;
666 }
667 auto fin_seq = seg_seq + seg_len;
668 if (fin_seq == _rcv.next) {
669 _rcv.next = fin_seq + 1;
670
671 // If this <FIN> packet contains data as well, we can ACK both data
672 // and <FIN> in a single packet, so canncel the previous ACK.
673 clear_delayed_ack();
674 do_output = false;
675 // Send ACK for the FIN!
676 output();
677 signal_data_received();
678 _errno = 0;
679
680 if (in_state(SYN_RECEIVED | ESTABLISHED)) {
681 ldout(_tcp.cct, 20) << __func__ << " fin: SYN_RECEIVED or ESTABLISHED -> CLOSE_WAIT" << dendl;
682 _state = CLOSE_WAIT;
683 // EOF
684 }
685 if (in_state(FIN_WAIT_1)) {
686 // If our FIN has been ACKed (perhaps in this segment), then
687 // enter TIME-WAIT, start the time-wait timer, turn off the other
688 // timers; otherwise enter the CLOSING state.
689 // Note: If our FIN has been ACKed, we should be in FIN_WAIT_2
690 // not FIN_WAIT_1 if we reach here.
691 ldout(_tcp.cct, 20) << __func__ << " fin: FIN_WAIT_1 -> CLOSING" << dendl;
692 _state = CLOSING;
693 }
694 if (in_state(FIN_WAIT_2)) {
695 ldout(_tcp.cct, 20) << __func__ << " fin: FIN_WAIT_2 -> TIME_WAIT" << dendl;
696 return do_time_wait();
697 }
698 }
699 }
700 if (do_output || (do_output_data && can_send())) {
701 // Since we will do output, we can canncel scheduled delayed ACK.
702 clear_delayed_ack();
703 output();
704 }
705 }
706
707 template <typename InetTraits>
708 void tcp<InetTraits>::tcb::connect()
709 {
710 ldout(_tcp.cct, 20) << __func__ << dendl;
711 // An initial send sequence number (ISS) is selected. A SYN segment of the
712 // form <SEQ=ISS><CTL=SYN> is sent. Set SND.UNA to ISS, SND.NXT to ISS+1,
713 // enter SYN-SENT state, and return.
714 do_setup_isn();
715
716 // Local receive window scale factor
717 _rcv.window_scale = _option._local_win_scale = 7;
718 // Maximum segment size local can receive
719 _rcv.mss = _option._local_mss = local_mss();
720 // Linux's default window size
721 _rcv.window = 29200 << _rcv.window_scale;
722
723 do_syn_sent();
724 }
725
726 template <typename InetTraits>
727 void tcp<InetTraits>::tcb::close_final_cleanup()
728 {
729 if (_snd._all_data_acked_fd >= 0) {
730 center->delete_file_event(_snd._all_data_acked_fd, EVENT_READABLE);
731 _tcp.manager.close(_snd._all_data_acked_fd);
732 _snd._all_data_acked_fd = -1;
733 }
734
735 _snd.closed = true;
736 signal_data_received();
737 ldout(_tcp.cct, 20) << __func__ << " unsent_len=" << _snd.unsent_len << dendl;
738 if (in_state(CLOSE_WAIT)) {
739 ldout(_tcp.cct, 20) << __func__ << " CLOSE_WAIT -> LAST_ACK" << dendl;
740 _state = LAST_ACK;
741 } else if (in_state(ESTABLISHED)) {
742 ldout(_tcp.cct, 20) << __func__ << " ESTABLISHED -> FIN_WAIT_1" << dendl;
743 _state = FIN_WAIT_1;
744 }
745 // Send <FIN> to remote
746 // Note: we call output_one to make sure a packet with FIN actually
747 // sent out. If we only call output() and _packetq is not empty,
748 // tcp::tcb::get_packet(), packet with FIN will not be generated.
749 output_one();
750 output();
751 center->delete_file_event(fd, EVENT_READABLE|EVENT_WRITABLE);
752 }
753
754 template <typename InetTraits>
755 void tcp<InetTraits>::tcb::retransmit()
756 {
757 auto output_update_rto = [this] {
758 output();
759 // According to RFC6298, Update RTO <- RTO * 2 to perform binary exponential back-off
760 this->_rto = std::min(this->_rto * 2, this->_rto_max);
761 start_retransmit_timer();
762 };
763
764 // Retransmit SYN
765 if (syn_needs_on()) {
766 if (_snd.syn_retransmit++ < _max_nr_retransmit) {
767 output_update_rto();
768 } else {
769 _errno = -ECONNABORTED;
770 ldout(_tcp.cct, 5) << __func__ << " syn retransmit exceed max "
771 << _max_nr_retransmit << dendl;
772 _errno = -ETIMEDOUT;
773 cleanup();
774 return;
775 }
776 }
777
778 // Retransmit FIN
779 if (fin_needs_on()) {
780 if (_snd.fin_retransmit++ < _max_nr_retransmit) {
781 output_update_rto();
782 } else {
783 ldout(_tcp.cct, 5) << __func__ << " fin retransmit exceed max "
784 << _max_nr_retransmit << dendl;
785 _errno = -ETIMEDOUT;
786 cleanup();
787 return;
788 }
789 }
790
791 // Retransmit Data
792 if (_snd.data.empty()) {
793 return;
794 }
795
796 // If there are unacked data, retransmit the earliest segment
797 auto& unacked_seg = _snd.data.front();
798
799 // According to RFC5681
800 // Update ssthresh only for the first retransmit
801 uint32_t smss = _snd.mss;
802 if (unacked_seg.nr_transmits == 0) {
803 _snd.ssthresh = std::max(flight_size() / 2, 2 * smss);
804 }
805 // RFC6582 Step 4
806 _snd.recover = _snd.next - 1;
807 // Start the slow start process
808 _snd.cwnd = smss;
809 // End fast recovery
810 exit_fast_recovery();
811
812 ldout(_tcp.cct, 20) << __func__ << " unack data size " << _snd.data.size()
813 << " nr=" << unacked_seg.nr_transmits << dendl;
814 if (unacked_seg.nr_transmits < _max_nr_retransmit) {
815 unacked_seg.nr_transmits++;
816 } else {
817 // Delete connection when max num of retransmission is reached
818 ldout(_tcp.cct, 5) << __func__ << " seg retransmit exceed max "
819 << _max_nr_retransmit << dendl;
820 _errno = -ETIMEDOUT;
821 cleanup();
822 return;
823 }
824 retransmit_one();
825
826 output_update_rto();
827 }
828
829 template <typename InetTraits>
830 void tcp<InetTraits>::tcb::persist() {
831 ldout(_tcp.cct, 20) << __func__ << " persist timer fired" << dendl;
832 // Send 1 byte packet to probe peer's window size
833 _snd.window_probe = true;
834 output_one();
835 _snd.window_probe = false;
836
837 output();
838 // Perform binary exponential back-off per RFC1122
839 _persist_time_out = std::min(_persist_time_out * 2, _rto_max);
840 start_persist_timer();
841 }