1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
3 * This file is open source software, licensed to you under the terms
4 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
5 * distributed with this work for additional information regarding copyright
6 * ownership. You may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
20 * Copyright (C) 2014 Cloudius Systems, Ltd.
26 #include "DPDKStack.h"
28 #include "common/dout.h"
29 #include "include/ceph_assert.h"
31 #define dout_subsys ceph_subsys_dpdk
33 #define dout_prefix *_dout << "tcp "
// Parses the TCP options stored in the byte range [beg, end).
//
// What the visible code records per option kind:
//   mss       - the peer's MSS, converted with ntoh(), into _remote_mss
//   win_scale - sets _win_scale_received and stores the peer's shift in
//               _remote_win_scale
//   sack      - sets _sack_received
//   nop       - nothing; the option is stepped over
// For each of these `beg` is advanced by the matching option_len constant.
// For a kind that is not recognised the length byte at beg[1] is read
// (see "Ignore options we do not understand" below).
35 void tcp_option::parse(uint8_t* beg
, uint8_t* end
)
38 auto kind
= option_kind(*beg
);
39 if (kind
!= option_kind::nop
&& kind
!= option_kind::eol
) {
40 // Make sure there is enough room for this option
// NOTE(review): beg[1] is dereferenced before any visible check that
// beg + 1 < end -- confirm at least two bytes remain at this point.
41 auto len
= *(beg
+ 1);
42 if (beg
+ len
> end
) {
47 case option_kind::mss
:
49 _remote_mss
= ntoh(reinterpret_cast<mss
*>(beg
)->mss
);
50 beg
+= option_len::mss
;
52 case option_kind::win_scale
:
53 _win_scale_received
= true;
54 _remote_win_scale
= reinterpret_cast<win_scale
*>(beg
)->shift
;
55 // We can turn on win_scale option, 7 is Linux's default win scale size
57 beg
+= option_len::win_scale
;
59 case option_kind::sack
:
60 _sack_received
= true;
61 beg
+= option_len::sack
;
63 case option_kind::nop
:
64 beg
+= option_len::nop
;
66 case option_kind::eol
:
69 // Ignore options we do not understand
70 auto len
= *(beg
+ 1);
72 // Prevent infinite loop
// Writes the local TCP options into the memory that immediately follows
// the fixed header `th` (options are placement-constructed starting at
// th + sizeof(tcp_hdr)).
//
//   th            header being built; its f_syn / f_ack flags are read
//   options_size  option size the caller expects; asserted to equal the
//                 number of bytes accounted for in `size`
//
// An MSS option carrying _local_mss is written when the peer sent an MSS
// option or the segment has no ACK flag; a window-scale option carrying
// _local_win_scale is written under the analogous condition. NOP options
// are then appended until only the EOL byte is missing from the aligned
// size, and a final EOL option is written.
// NOTE(review): presumably the accumulated `size` is the return value --
// confirm.
81 uint8_t tcp_option::fill(tcp_hdr
* th
, uint8_t options_size
)
83 auto hdr
= reinterpret_cast<uint8_t*>(th
);
84 auto off
= hdr
+ sizeof(tcp_hdr
);
86 bool syn_on
= th
->f_syn
;
87 bool ack_on
= th
->f_ack
;
90 if (_mss_received
|| !ack_on
) {
91 auto mss
= new (off
) tcp_option::mss
;
92 mss
->mss
= _local_mss
;
97 if (_win_scale_received
|| !ack_on
) {
98 auto win_scale
= new (off
) tcp_option::win_scale
;
99 win_scale
->shift
= _local_win_scale
;
100 off
+= win_scale
->len
;
101 size
+= win_scale
->len
;
// Pad with NOPs up to the aligned size, leaving room for the trailing EOL.
106 auto size_max
= align_up(uint8_t(size
+ 1), tcp_option::align
);
107 while (size
< size_max
- uint8_t(option_len::eol
)) {
108 new (off
) tcp_option::nop
;
109 off
+= option_len::nop
;
110 size
+= option_len::nop
;
112 new (off
) tcp_option::eol
;
113 size
+= option_len::eol
;
// The size written here must agree with what the caller reserved.
115 ceph_assert(size
== options_size
);
// Computes how many bytes of TCP options go with a segment.
//
// Adds option_len::mss when the peer sent an MSS option or `ack_on` is
// false, adds option_len::win_scale under the analogous window-scale
// condition, adds option_len::eol, and rounds the total up to
// tcp_option::align.
// NOTE(review): how `syn_on` gates these additions is not visible here --
// confirm.
120 uint8_t tcp_option::get_size(bool syn_on
, bool ack_on
)
124 if (_mss_received
|| !ack_on
) {
125 size
+= option_len::mss
;
127 if (_win_scale_received
|| !ack_on
) {
128 size
+= option_len::win_scale
;
132 size
+= option_len::eol
;
133 // Insert NOP option to align on 32-bit
134 size
= align_up(size
, tcp_option::align
);
// Constructs the IPv4 TCP layer: initialises _inet_l4 from `inet` and
// creates the tcp<ipv4_traits> instance, passing it inet.cct, _inet_l4 and
// the event center `c`.
139 ipv4_tcp::ipv4_tcp(ipv4
& inet
, EventCenter
*c
)
140 : _inet_l4(inet
), _tcp(std::unique_ptr
<tcp
<ipv4_traits
>>(new tcp
<ipv4_traits
>(inet
.cct
, _inet_l4
, c
)))
// Destructor with an empty body; no explicit cleanup is performed here.
143 ipv4_tcp::~ipv4_tcp() { }
// Hands an incoming packet, together with its source (`from`) and
// destination (`to`) addresses, to the tcp instance. The packet is moved
// into the callee.
145 void ipv4_tcp::received(Packet p
, ipv4_address from
, ipv4_address to
)
147 _tcp
->received(std::move(p
), from
, to
);
// Delegates to tcp::forward() with the same arguments and returns its
// result unchanged.
150 bool ipv4_tcp::forward(forward_hash
& out_hash_data
, Packet
& p
, size_t off
)
152 return _tcp
->forward(out_hash_data
, p
, off
);
// Creates a DPDKServerSocketImpl for `port` on the given TCP instance and
// stores it in *sock, wrapped in a ServerSocket that owns it through a
// std::unique_ptr<ServerSocketImpl>.
// NOTE(review): `p` is a raw `new` until it is wrapped in the unique_ptr
// below -- confirm every early-return path in between releases it.
155 int tcpv4_listen(tcp
<ipv4_traits
>& tcpv4
, uint16_t port
, const SocketOptions
&opts
,
156 int type
, unsigned addr_slot
, ServerSocket
*sock
)
158 auto p
= new DPDKServerSocketImpl
<tcp
<ipv4_traits
>>(tcpv4
, port
, opts
,
165 *sock
= ServerSocket(std::unique_ptr
<ServerSocketImpl
>(p
));
// Starts a connection to `addr` through tcpv4.connect() and stores the
// result in *sock as a ConnectedSocket backed by a
// NativeConnectedSocketImpl that takes the connection by move.
169 int tcpv4_connect(tcp
<ipv4_traits
>& tcpv4
, const entity_addr_t
&addr
,
170 ConnectedSocket
*sock
)
172 auto conn
= tcpv4
.connect(addr
);
173 *sock
= ConnectedSocket(std::unique_ptr
<ConnectedSocketImpl
>(
174 new NativeConnectedSocketImpl
<tcp
<ipv4_traits
>>(std::move(conn
))));
// Builds a reply to the received header `rth` and sends it without an
// associated TCB (via send_packet_without_tcb).
//
//   rth         header of the segment being answered
//   local_ip    address used as the source of the reply
//   foreign_ip  address the reply is sent to
//
// The reply swaps the source/destination ports of `rth`, acknowledges
// rth->seq + 1, and sets data_offset to the size of a bare tcp_hdr in
// 32-bit words. The checksum is seeded with the pseudo-header checksum;
// when the hardware offers L4 TX checksum offload the complemented value
// is stored and the packet is flagged needs_csum, otherwise the plain
// value is stored and needs_csum is cleared.
178 template <typename InetTraits
>
179 void tcp
<InetTraits
>::respond_with_reset(tcp_hdr
* rth
, ipaddr local_ip
, ipaddr foreign_ip
)
181 ldout(cct
, 20) << __func__
<< " tcp header rst=" << bool(rth
->f_rst
) << " fin=" << bool(rth
->f_fin
)
182 << " syn=" << bool(rth
->f_syn
) << dendl
;
187 auto th
= p
.prepend_header
<tcp_hdr
>();
188 th
->src_port
= rth
->dst_port
;
189 th
->dst_port
= rth
->src_port
;
193 // If this RST packet is in response to a SYN packet, we ACK the ISN.
195 th
->ack
= rth
->seq
+ 1;
// Header length expressed in 32-bit words; no options are attached.
199 th
->data_offset
= sizeof(*th
) / 4;
205 InetTraits::tcp_pseudo_header_checksum(csum
, local_ip
, foreign_ip
, sizeof(*th
));
206 if (get_hw_features().tx_csum_l4_offload
) {
207 th
->checksum
= ~csum
.get();
208 oi
.needs_csum
= true;
211 th
->checksum
= csum
.get();
212 oi
.needs_csum
= false;
215 oi
.protocol
= ip_protocol_num::tcp
;
216 oi
.tcp_hdr_len
= sizeof(tcp_hdr
);
217 p
.set_offload_info(oi
);
219 send_packet_without_tcb(local_ip
, foreign_ip
, std::move(p
));
223 #define dout_prefix _prefix(_dout)
// Log-line prefix used by the dout_prefix macro for tcb messages.
// Writes "tcp <local ip>:<local port> -> <foreign ip>:<foreign port>
// tcb(<this> fd=<fd> s=<state>)." to *_dout and returns that stream.
224 template<typename InetTraits
>
225 std::ostream
& tcp
<InetTraits
>::tcb::_prefix(std::ostream
*_dout
) {
226 return *_dout
<< "tcp " << _local_ip
<< ":" << _local_port
<< " -> " << _foreign_ip
<< ":" << _foreign_port
227 << " tcb(" << this << " fd=" << fd
<< " s=" << _state
<< ").";
// Handles a segment that arrives while this tcb is in the LISTEN state.
//
//   th  TCP header of the segment
//   p   the packet; the TCP header (data_offset * 4 bytes) is trimmed off
//
// Locates the option bytes that follow the fixed header, records the
// peer's initial sequence number (RCV.NXT = SEG.SEQ + 1, IRS = SEG.SEQ),
// sets the receive urgent pointer to RCV.NXT, logs the
// LISTEN -> SYN_RECEIVED transition and applies the received options via
// init_from_options().
230 template<typename InetTraits
>
231 void tcp
<InetTraits
>::tcb::input_handle_listen_state(tcp_hdr
* th
, Packet p
)
// Option area = everything between the fixed header and data_offset * 4.
233 auto opt_len
= th
->data_offset
* 4 - sizeof(tcp_hdr
);
234 auto opt_start
= reinterpret_cast<uint8_t*>(p
.get_header(0, th
->data_offset
* 4)) + sizeof(tcp_hdr
);
235 auto opt_end
= opt_start
+ opt_len
;
236 p
.trim_front(th
->data_offset
* 4);
237 tcp_sequence seg_seq
= th
->seq
;
239 // Set RCV.NXT to SEG.SEQ+1, IRS is set to SEG.SEQ
240 _rcv
.next
= seg_seq
+ 1;
241 _rcv
.initial
= seg_seq
;
243 // ISS should be selected and a SYN segment sent of the form:
244 // <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK>
245 // SND.NXT is set to ISS+1 and SND.UNA to ISS
246 // NOTE: In previous code, _snd.next is set to ISS + 1 only when SYN is
247 // ACKed. Now, we set _snd.next to ISS + 1 here, so in output_one(): we
249 // th->seq = syn_on ? _snd.initial : _snd.next
250 // to make sure retransmitted SYN has correct SEQ number.
253 _rcv
.urgent
= _rcv
.next
;
255 ldout(_tcp
.cct
, 10) << __func__
<< " listen: LISTEN -> SYN_RECEIVED" << dendl
;
256 init_from_options(th
, opt_start
, opt_end
);
// Handles a segment that arrives while this tcb is in the SYN_SENT state,
// following the numbered steps (3.1 - 3.5) quoted in the inline comments.
//
//   th  TCP header of the segment
//   p   the packet; the TCP header (data_offset * 4 bytes) is trimmed off
//
// Visible behaviour: an ACK number outside (ISS, SND.NXT] is answered with
// respond_with_reset(); when the SYN is processed, RCV.NXT / IRS are taken
// from SEG.SEQ and SND.UNA is set to SEG.ACK. If SND.UNA is then beyond
// ISS the SYN_SENT -> ESTABLISHED transition is logged and the received
// options are applied via init_from_options(); otherwise the
// SYN_SENT -> SYN_RECEIVED transition is logged.
260 template <typename InetTraits
>
261 void tcp
<InetTraits
>::tcb::input_handle_syn_sent_state(tcp_hdr
* th
, Packet p
)
// Option area = everything between the fixed header and data_offset * 4.
263 auto opt_len
= th
->data_offset
* 4 - sizeof(tcp_hdr
);
264 auto opt_start
= reinterpret_cast<uint8_t*>(p
.get_header(0, th
->data_offset
* 4)) + sizeof(tcp_hdr
);
265 auto opt_end
= opt_start
+ opt_len
;
266 p
.trim_front(th
->data_offset
* 4);
267 tcp_sequence seg_seq
= th
->seq
;
268 auto seg_ack
= th
->ack
;
270 ldout(_tcp
.cct
, 20) << __func__
<< " tcp header seq " << seg_seq
.raw
<< " ack " << seg_ack
.raw
271 << " fin=" << bool(th
->f_fin
) << " syn=" << bool(th
->f_syn
) << dendl
;
273 bool acceptable
= false;
274 // 3.1 first check the ACK bit
276 // If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless the
277 // RST bit is set, if so drop the segment and return)
278 if (seg_ack
<= _snd
.initial
|| seg_ack
> _snd
.next
) {
279 return respond_with_reset(th
);
282 // If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable.
283 acceptable
= _snd
.unacknowledged
<= seg_ack
&& seg_ack
<= _snd
.next
;
286 // 3.2 second check the RST bit
288 // If the ACK was acceptable then signal the user "error: connection
289 // reset", drop the segment, enter CLOSED state, delete TCB, and
290 // return. Otherwise (no ACK) drop the segment and return.
298 // 3.3 third check the security and precedence
299 // NOTE: Ignored for now
301 // 3.4 fourth check the SYN bit
303 // RCV.NXT is set to SEG.SEQ+1, IRS is set to SEG.SEQ. SND.UNA should
304 // be advanced to equal SEG.ACK (if there is an ACK), and any segments
305 // on the retransmission queue which are thereby acknowledged should be
307 _rcv
.next
= seg_seq
+ 1;
308 _rcv
.initial
= seg_seq
;
310 // TODO: clean retransmission queue
311 _snd
.unacknowledged
= seg_ack
;
313 if (_snd
.unacknowledged
> _snd
.initial
) {
314 // If SND.UNA > ISS (our SYN has been ACKed), change the connection
315 // state to ESTABLISHED, form an ACK segment
316 // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
317 ldout(_tcp
.cct
, 20) << __func__
<< " syn: SYN_SENT -> ESTABLISHED" << dendl
;
318 init_from_options(th
, opt_start
, opt_end
);
322 // Otherwise enter SYN_RECEIVED, form a SYN,ACK segment
323 // <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK>
324 ldout(_tcp
.cct
, 20) << __func__
<< " syn: SYN_SENT -> SYN_RECEIVED" << dendl
;
329 // 3.5 fifth, if neither of the SYN or RST bits is set then drop the
330 // segment and return.
// Handles a segment that arrives in any state not covered by the LISTEN /
// SYN_SENT handlers, following the numbered steps (4.1 - 4.8) quoted in
// the inline comments: sequence check, RST, security, SYN, ACK, URG,
// segment text and FIN.
//
//   th  TCP header of the segment
//   p   the packet; the TCP header (data_offset * 4 bytes) is trimmed off
//       first, so p.len() is the payload length (seg_len)
//
// Two local flags collect what has to be transmitted at the end:
//   do_output       set from should_send_ack() once in-order data has
//                   been queued
//   do_output_data  set when ACK processing may allow more data to be sent
//                   (it is additionally gated by can_send() at the end)
334 template <typename InetTraits
>
335 void tcp
<InetTraits
>::tcb::input_handle_other_state(tcp_hdr
* th
, Packet p
)
337 p
.trim_front(th
->data_offset
* 4);
338 bool do_output
= false;
339 bool do_output_data
= false;
340 tcp_sequence seg_seq
= th
->seq
;
341 auto seg_ack
= th
->ack
;
342 auto seg_len
= p
.len();
343 ldout(_tcp
.cct
, 20) << __func__
<< " tcp header seq " << seg_seq
.raw
<< " ack " << seg_ack
.raw
344 << " snd next " << _snd
.next
.raw
<< " unack " << _snd
.unacknowledged
.raw
345 << " rcv next " << _rcv
.next
.raw
<< " len " << seg_len
346 << " fin=" << bool(th
->f_fin
) << " syn=" << bool(th
->f_syn
) << dendl
;
348 // 4.1 first check sequence number
349 if (!segment_acceptable(seg_seq
, seg_len
)) {
350 //<SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
354 // In the following it is assumed that the segment is the idealized
355 // segment that begins at RCV.NXT and does not exceed the window.
356 if (seg_seq
< _rcv
.next
) {
357 // ignore already acknowledged data
358 auto dup
= std::min(uint32_t(_rcv
.next
- seg_seq
), seg_len
);
359 ldout(_tcp
.cct
, 10) << __func__
<< " dup segment len " << dup
<< dendl
;
364 // FIXME: We should trim data outside the right edge of the receive window as well
366 if (seg_seq
!= _rcv
.next
) {
367 ldout(_tcp
.cct
, 10) << __func__
<< " out of order, expect " << _rcv
.next
.raw
368 << " actual " << seg_seq
.raw
369 << " out of order size " << _rcv
.out_of_order
.map
.size()
371 insert_out_of_order(seg_seq
, std::move(p
));
372 // A TCP receiver SHOULD send an immediate duplicate ACK
373 // when an out-of-order segment arrives.
377 // 4.2 second check the RST bit
379 if (in_state(SYN_RECEIVED
)) {
380 // If this connection was initiated with a passive OPEN (i.e.,
381 // came from the LISTEN state), then return this connection to
382 // LISTEN state and return. The user need not be informed. If
383 // this connection was initiated with an active OPEN (i.e., came
384 // from SYN_SENT state) then the connection was refused, signal
385 // the user "connection refused". In either case, all segments
386 // on the retransmission queue should be removed. And in the
387 // active OPEN case, enter the CLOSED state and delete the TCB,
// NOTE(review): this assigns the global `errno`, whereas retransmit()
// assigns the member `_errno` -- confirm which one is intended here.
389 errno
= -ECONNREFUSED
;
392 if (in_state(ESTABLISHED
| FIN_WAIT_1
| FIN_WAIT_2
| CLOSE_WAIT
)) {
393 // If the RST bit is set then, any outstanding RECEIVEs and SEND
394 // should receive "reset" responses. All segment queues should be
395 // flushed. Users should also receive an unsolicited general
396 // "connection reset" signal. Enter the CLOSED state, delete the
400 if (in_state(CLOSING
| LAST_ACK
| TIME_WAIT
)) {
401 // If the RST bit is set then, enter the CLOSED state, delete the
407 // 4.3 third check security and precedence
408 // NOTE: Ignored for now
410 // 4.4 fourth, check the SYN bit
412 // SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2
413 // CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT
415 // If the SYN is in the window it is an error, send a reset, any
416 // outstanding RECEIVEs and SEND should receive "reset" responses,
417 // all segment queues should be flushed, the user should also
418 // receive an unsolicited general "connection reset" signal, enter
419 // the CLOSED state, delete the TCB, and return.
420 respond_with_reset(th
);
423 // If the SYN is not in the window this step would not be reached
424 // and an ack would have been sent in the first step (sequence
428 // 4.5 fifth check the ACK field
430 // if the ACK bit is off drop the segment and return
433 // SYN_RECEIVED STATE
434 if (in_state(SYN_RECEIVED
)) {
435 // If SND.UNA =< SEG.ACK =< SND.NXT then enter ESTABLISHED state
436 // and continue processing.
437 if (_snd
.unacknowledged
<= seg_ack
&& seg_ack
<= _snd
.next
) {
438 ldout(_tcp
.cct
, 20) << __func__
<< " SYN_RECEIVED -> ESTABLISHED" << dendl
;
// The connection is handed to the listener's queue; if that fails the
// peer is reset.
440 if (_tcp
.push_listen_queue(_local_port
, this)) {
441 ldout(_tcp
.cct
, 20) << __func__
<< " successfully accepting socket" << dendl
;
443 ldout(_tcp
.cct
, 5) << __func__
<< " not exist listener or full queue, reset" << dendl
;
444 return respond_with_reset(th
);
447 // <SEQ=SEG.ACK><CTL=RST>
448 return respond_with_reset(th
);
// Helper: takes the peer's advertised window (scaled by the send window
// scale) as the new send window, then starts or stops the persist timer
// depending on whether that window is zero.
451 auto update_window
= [this, th
, seg_seq
, seg_ack
] {
452 ldout(_tcp
.cct
, 20) << __func__
<< " window update seg_seq=" << seg_seq
453 << " seg_ack=" << seg_ack
<< " old window=" << th
->window
454 << " new window=" << int(_snd
.window_scale
) << dendl
;
455 _snd
.window
= th
->window
<< _snd
.window_scale
;
458 if (_snd
.window
== 0) {
459 _persist_time_out
= _rto
;
460 start_persist_timer();
462 stop_persist_timer();
465 // ESTABLISHED STATE or
466 // CLOSE_WAIT STATE: Do the same processing as for the ESTABLISHED state.
467 if (in_state(ESTABLISHED
| CLOSE_WAIT
)) {
468 // If SND.UNA < SEG.ACK =< SND.NXT then, set SND.UNA <- SEG.ACK.
469 if (_snd
.unacknowledged
< seg_ack
&& seg_ack
<= _snd
.next
) {
470 // Remote ACKed data we sent
471 auto acked_bytes
= data_segment_acked(seg_ack
);
473 // If SND.UNA < SEG.ACK =< SND.NXT, the send window should be updated.
474 if (_snd
.wl1
< seg_seq
|| (_snd
.wl1
== seg_seq
&& _snd
.wl2
<= seg_ack
)) {
478 // some data is acked, try send more data
479 do_output_data
= true;
// Helper: stops the retransmit timer (and signals waiters) when nothing
// is left unacknowledged, otherwise restarts it.
481 auto set_retransmit_timer
= [this] {
482 if (_snd
.data
.empty()) {
483 // All outstanding segments are acked, turn off the timer.
484 stop_retransmit_timer();
485 // Signal the waiter of this event
486 signal_all_data_acked();
488 // Restart the timer because new data is acked.
489 start_retransmit_timer();
493 if (_snd
.dupacks
>= 3) {
494 // We are in fast retransmit / fast recovery phase
495 uint32_t smss
= _snd
.mss
;
496 if (seg_ack
> _snd
.recover
) {
497 ldout(_tcp
.cct
, 20) << __func__
<< " ack: full_ack" << dendl
;
498 // Set cwnd to min (ssthresh, max(FlightSize, SMSS) + SMSS)
499 _snd
.cwnd
= std::min(_snd
.ssthresh
, std::max(flight_size(), smss
) + smss
);
500 // Exit the fast recovery procedure
501 exit_fast_recovery();
502 set_retransmit_timer();
504 ldout(_tcp
.cct
, 20) << __func__
<< " ack: partial_ack" << dendl
;
505 // Retransmit the first unacknowledged segment
507 // Deflate the congestion window by the amount of new data
508 // acknowledged by the Cumulative Acknowledgment field
509 _snd
.cwnd
-= acked_bytes
;
510 // If the partial ACK acknowledges at least one SMSS of new
511 // data, then add back SMSS bytes to the congestion window
512 if (acked_bytes
>= smss
) {
515 // Send a new segment if permitted by the new value of
516 // cwnd. Do not exit the fast recovery procedure. For
517 // the first partial ACK that arrives during fast
518 // recovery, also reset the retransmit timer.
519 if (++_snd
.partial_ack
== 1) {
520 start_retransmit_timer();
524 // RFC5681: The fast retransmit algorithm uses the arrival
525 // of 3 duplicate ACKs (as defined in section 2, without
526 // any intervening ACKs which move SND.UNA) as an
527 // indication that a segment has been lost.
529 // So, here we reset dupacks to zero because this ACK moves
531 exit_fast_recovery();
532 set_retransmit_timer();
// Duplicate-ACK test: data is outstanding, the segment carries no payload,
// FIN or SYN, acknowledges exactly SND.UNA and advertises an unchanged
// window.
534 } else if (!_snd
.data
.empty() && seg_len
== 0 &&
535 th
->f_fin
== 0 && th
->f_syn
== 0 &&
536 th
->ack
== _snd
.unacknowledged
&&
537 uint32_t(th
->window
<< _snd
.window_scale
) == _snd
.window
) {
540 // If the ACK is a duplicate (SEG.ACK < SND.UNA), it can be ignored
542 // The TCP sender SHOULD use the "fast retransmit" algorithm to detect
543 // and repair loss, based on incoming duplicate ACKs.
544 // Here, We follow RFC5681.
546 uint32_t smss
= _snd
.mss
;
547 // 3 duplicated ACKs trigger a fast retransmit
548 if (_snd
.dupacks
== 1 || _snd
.dupacks
== 2) {
550 // Send cwnd + 2 * smss per RFC3042
551 do_output_data
= true;
552 } else if (_snd
.dupacks
== 3) {
554 if (seg_ack
- 1 > _snd
.recover
) {
555 _snd
.recover
= _snd
.next
- 1;
557 _snd
.ssthresh
= std::max((flight_size() - _snd
.limited_transfer
) / 2, 2 * smss
);
560 // Do not enter fast retransmit and do not reset ssthresh
563 _snd
.cwnd
= _snd
.ssthresh
+ 3 * smss
;
564 } else if (_snd
.dupacks
> 3) {
568 do_output_data
= true;
570 } else if (seg_ack
> _snd
.next
) {
571 // If the ACK acks something not yet sent (SEG.ACK > SND.NXT)
572 // then send an ACK, drop the segment, and return
574 } else if (_snd
.window
== 0 && th
->window
> 0) {
576 do_output_data
= true;
580 if (in_state(FIN_WAIT_1
)) {
581 // In addition to the processing for the ESTABLISHED state, if
582 // our FIN is now acknowledged then enter FIN-WAIT-2 and continue
583 // processing in that state.
// NOTE(review): the FIN is treated as acknowledged at _snd.next + 1, which
// presumably means _snd.next is not advanced when the FIN is sent --
// confirm. The same comparison is used for CLOSING and LAST_ACK below.
584 if (seg_ack
== _snd
.next
+ 1) {
585 ldout(_tcp
.cct
, 20) << __func__
<< " ack: FIN_WAIT_1 -> FIN_WAIT_2" << dendl
;
587 do_local_fin_acked();
591 if (in_state(FIN_WAIT_2
)) {
592 // In addition to the processing for the ESTABLISHED state, if
593 // the retransmission queue is empty, the user’s CLOSE can be
594 // acknowledged ("ok") but do not delete the TCB.
598 if (in_state(CLOSING
)) {
599 if (seg_ack
== _snd
.next
+ 1) {
600 ldout(_tcp
.cct
, 20) << __func__
<< " ack: CLOSING -> TIME_WAIT" << dendl
;
601 do_local_fin_acked();
602 return do_time_wait();
608 if (in_state(LAST_ACK
)) {
609 if (seg_ack
== _snd
.next
+ 1) {
610 ldout(_tcp
.cct
, 20) << __func__
<< " ack: LAST_ACK -> CLOSED" << dendl
;
611 do_local_fin_acked();
616 if (in_state(TIME_WAIT
)) {
617 // The only thing that can arrive in this state is a
618 // retransmission of the remote FIN. Acknowledge it, and restart
619 // the 2 MSL timeout.
624 // 4.6 sixth, check the URG bit
629 // 4.7 seventh, process the segment text
630 if (in_state(ESTABLISHED
| FIN_WAIT_1
| FIN_WAIT_2
)) {
632 // Once the TCP takes responsibility for the data it advances
633 // RCV.NXT over the data accepted, and adjusts RCV.WND as
634 // appropriate to the current buffer availability. The total of
635 // RCV.NXT and RCV.WND should not be reduced.
636 _rcv
.data
.push_back(std::move(p
));
637 _rcv
.next
+= seg_len
;
638 auto merged
= merge_out_of_order();
639 signal_data_received();
640 // Send an acknowledgment of the form:
641 // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
642 // This acknowledgment should be piggybacked on a segment being
643 // transmitted if possible without incurring undue delay.
645 // TCP receiver SHOULD send an immediate ACK when the
646 // incoming segment fills in all or part of a gap in the
650 do_output
= should_send_ack(seg_len
);
652 ldout(_tcp
.cct
, 20) << __func__
<< " merged=" << merged
<< " do_output=" << do_output
<< dendl
;
654 } else if (in_state(CLOSE_WAIT
| CLOSING
| LAST_ACK
| TIME_WAIT
)) {
655 // This should not occur, since a FIN has been received from the
656 // remote side. Ignore the segment text.
660 // 4.8 eighth, check the FIN bit
662 if (in_state(CLOSED
| LISTEN
| SYN_SENT
)) {
663 // Do not process the FIN if the state is CLOSED, LISTEN or SYN-SENT
664 // since the SEG.SEQ cannot be validated; drop the segment and return.
// The FIN occupies the sequence number right after the segment's payload;
// it is only consumed when that is exactly what we expect next.
667 auto fin_seq
= seg_seq
+ seg_len
;
668 if (fin_seq
== _rcv
.next
) {
669 _rcv
.next
= fin_seq
+ 1;
671 // If this <FIN> packet contains data as well, we can ACK both data
672 // and <FIN> in a single packet, so cancel the previous ACK.
675 // Send ACK for the FIN!
677 signal_data_received();
680 if (in_state(SYN_RECEIVED
| ESTABLISHED
)) {
681 ldout(_tcp
.cct
, 20) << __func__
<< " fin: SYN_RECEIVED or ESTABLISHED -> CLOSE_WAIT" << dendl
;
685 if (in_state(FIN_WAIT_1
)) {
686 // If our FIN has been ACKed (perhaps in this segment), then
687 // enter TIME-WAIT, start the time-wait timer, turn off the other
688 // timers; otherwise enter the CLOSING state.
689 // Note: If our FIN has been ACKed, we should be in FIN_WAIT_2
690 // not FIN_WAIT_1 if we reach here.
691 ldout(_tcp
.cct
, 20) << __func__
<< " fin: FIN_WAIT_1 -> CLOSING" << dendl
;
694 if (in_state(FIN_WAIT_2
)) {
695 ldout(_tcp
.cct
, 20) << __func__
<< " fin: FIN_WAIT_2 -> TIME_WAIT" << dendl
;
696 return do_time_wait();
700 if (do_output
|| (do_output_data
&& can_send())) {
701 // Since we will do output, we can cancel scheduled delayed ACK.
// Prepares this tcb for an active open (see the quoted description below).
// The visible code fixes the local receive parameters that are advertised
// to the peer: window scale 7, MSS taken from local_mss(), and a receive
// window of 29200 shifted left by that window scale.
707 template <typename InetTraits
>
708 void tcp
<InetTraits
>::tcb::connect()
710 ldout(_tcp
.cct
, 20) << __func__
<< dendl
;
711 // An initial send sequence number (ISS) is selected. A SYN segment of the
712 // form <SEQ=ISS><CTL=SYN> is sent. Set SND.UNA to ISS, SND.NXT to ISS+1,
713 // enter SYN-SENT state, and return.
716 // Local receive window scale factor
717 _rcv
.window_scale
= _option
._local_win_scale
= 7;
718 // Maximum segment size local can receive
719 _rcv
.mss
= _option
._local_mss
= local_mss();
720 // Linux's default window size
721 _rcv
.window
= 29200 << _rcv
.window_scale
;
// Final stage of closing the local side of the connection.
//
// Visible steps: if an "all data acked" notification fd is open, its
// readable event is removed, the fd is closed through the manager and the
// member is reset to -1; readers are woken via signal_data_received(); the
// CLOSE_WAIT -> LAST_ACK or ESTABLISHED -> FIN_WAIT_1 transition is
// logged; finally the readable/writable events registered for `fd` are
// removed from the event center.
726 template <typename InetTraits
>
727 void tcp
<InetTraits
>::tcb::close_final_cleanup()
729 if (_snd
._all_data_acked_fd
>= 0) {
730 center
->delete_file_event(_snd
._all_data_acked_fd
, EVENT_READABLE
);
731 _tcp
.manager
.close(_snd
._all_data_acked_fd
);
// -1 marks the fd as released so this branch is not taken again.
732 _snd
._all_data_acked_fd
= -1;
736 signal_data_received();
737 ldout(_tcp
.cct
, 20) << __func__
<< " unsent_len=" << _snd
.unsent_len
<< dendl
;
738 if (in_state(CLOSE_WAIT
)) {
739 ldout(_tcp
.cct
, 20) << __func__
<< " CLOSE_WAIT -> LAST_ACK" << dendl
;
741 } else if (in_state(ESTABLISHED
)) {
742 ldout(_tcp
.cct
, 20) << __func__
<< " ESTABLISHED -> FIN_WAIT_1" << dendl
;
745 // Send <FIN> to remote
746 // Note: we call output_one to make sure a packet with FIN actually
747 // sent out. If we only call output() and _packetq is not empty,
748 // tcp::tcb::get_packet(), packet with FIN will not be generated.
751 center
->delete_file_event(fd
, EVENT_READABLE
|EVENT_WRITABLE
);
// Retransmission handling for SYN, FIN and data segments
// (presumably invoked when the retransmit timer fires -- TODO confirm).
//
// Visible behaviour:
//  * output_update_rto doubles _rto (capped at _rto_max) and restarts the
//    retransmit timer.
//  * A pending SYN or FIN is retried while its retransmit counter is below
//    _max_nr_retransmit; once the limit is exceeded this is logged, and
//    for the SYN case _errno is set to -ECONNABORTED.
//  * For data, the oldest unacknowledged segment is examined: on its first
//    retransmit ssthresh becomes max(FlightSize / 2, 2 * SMSS); recover is
//    set to SND.NXT - 1 and fast recovery is exited; the segment's
//    nr_transmits is incremented while below _max_nr_retransmit, otherwise
//    the overflow is logged.
754 template <typename InetTraits
>
755 void tcp
<InetTraits
>::tcb::retransmit()
757 auto output_update_rto
= [this] {
759 // According to RFC6298, Update RTO <- RTO * 2 to perform binary exponential back-off
760 this->_rto
= std::min(this->_rto
* 2, this->_rto_max
);
761 start_retransmit_timer();
765 if (syn_needs_on()) {
766 if (_snd
.syn_retransmit
++ < _max_nr_retransmit
) {
769 _errno
= -ECONNABORTED
;
770 ldout(_tcp
.cct
, 5) << __func__
<< " syn retransmit exceed max "
771 << _max_nr_retransmit
<< dendl
;
779 if (fin_needs_on()) {
780 if (_snd
.fin_retransmit
++ < _max_nr_retransmit
) {
783 ldout(_tcp
.cct
, 5) << __func__
<< " fin retransmit exceed max "
784 << _max_nr_retransmit
<< dendl
;
792 if (_snd
.data
.empty()) {
796 // If there are unacked data, retransmit the earliest segment
797 auto& unacked_seg
= _snd
.data
.front();
799 // According to RFC5681
800 // Update ssthresh only for the first retransmit
801 uint32_t smss
= _snd
.mss
;
802 if (unacked_seg
.nr_transmits
== 0) {
803 _snd
.ssthresh
= std::max(flight_size() / 2, 2 * smss
);
806 _snd
.recover
= _snd
.next
- 1;
807 // Start the slow start process
810 exit_fast_recovery();
812 ldout(_tcp
.cct
, 20) << __func__
<< " unack data size " << _snd
.data
.size()
813 << " nr=" << unacked_seg
.nr_transmits
<< dendl
;
814 if (unacked_seg
.nr_transmits
< _max_nr_retransmit
) {
815 unacked_seg
.nr_transmits
++;
817 // Delete connection when max num of retransmission is reached
818 ldout(_tcp
.cct
, 5) << __func__
<< " seg retransmit exceed max "
819 << _max_nr_retransmit
<< dendl
;
// Persist-timer handler: probes the peer's window.
//
// Sets _snd.window_probe for the duration of the probe and clears it
// again, then doubles _persist_time_out (capped at _rto_max) and re-arms
// the persist timer.
// NOTE(review): the statement that actually sends the probe sits between
// the two window_probe assignments -- confirm it is present.
829 template <typename InetTraits
>
830 void tcp
<InetTraits
>::tcb::persist() {
831 ldout(_tcp
.cct
, 20) << __func__
<< " persist timer fired" << dendl
;
832 // Send 1 byte packet to probe peer's window size
833 _snd
.window_probe
= true;
835 _snd
.window_probe
= false;
838 // Perform binary exponential back-off per RFC1122
839 _persist_time_out
= std::min(_persist_time_out
* 2, _rto_max
);
840 start_persist_timer();