2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
37 * request_sock handling and moved
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
40 * Added new listen semantics.
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after year
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
53 #define pr_fmt(fmt) "TCP: " fmt
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
66 #include <net/net_namespace.h>
68 #include <net/inet_hashtables.h>
70 #include <net/transp_v6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
/* NOTE(review): this extraction carries stray original line numbers and
 * mid-statement line breaks throughout; all tokens below are preserved
 * verbatim.  Global sysctl knob (presumably net.ipv4.tcp_low_latency —
 * TODO confirm against sysctl table); __read_mostly places it with other
 * rarely-written globals for cache friendliness.
 */
88 int sysctl_tcp_low_latency __read_mostly
;
90 #ifdef CONFIG_TCP_MD5SIG
/* Forward declaration of the TCP-MD5 (RFC 2385) header-digest helper,
 * defined later in this file.  Computes the MD5 signature into md5_hash
 * from key, the address pair and the TCP header.
 * NOTE(review): the matching #endif is not visible in this fragment.
 */
91 static int tcp_v4_md5_hash_hdr(char *md5_hash
, const struct tcp_md5sig_key
*key
,
92 __be32 daddr
, __be32 saddr
, const struct tcphdr
*th
);
/* Global TCP socket hash tables (established/listening lookup state),
 * exported for use by other kernel modules via EXPORT_SYMBOL.
 */
95 struct inet_hashinfo tcp_hashinfo
;
96 EXPORT_SYMBOL(tcp_hashinfo
);
/* Derive the initial TCP sequence number for an IPv4 connection from the
 * skb's IP addresses and TCP ports via secure_tcp_seq().
 * NOTE(review): fragment is incomplete — the braces and the saddr/dest
 * arguments (original lines 99, 101-102, 104) are missing from this
 * extraction; remaining tokens kept verbatim.
 */
98 static u32
tcp_v4_init_seq(const struct sk_buff
*skb
)
100 return secure_tcp_seq(ip_hdr(skb
)->daddr
,
103 tcp_hdr(skb
)->source
);
/* Compute the per-connection timestamp offset from the netns and the
 * skb's IPv4 address pair via secure_tcp_ts_off().
 * NOTE(review): function braces (original lines 107/109) are missing
 * from this extraction; tokens kept verbatim.
 */
106 static u32
tcp_v4_init_ts_off(const struct net
*net
, const struct sk_buff
*skb
)
108 return secure_tcp_ts_off(net
, ip_hdr(skb
)->daddr
, ip_hdr(skb
)->saddr
);
111 int tcp_twsk_unique(struct sock
*sk
, struct sock
*sktw
, void *twp
)
113 const struct tcp_timewait_sock
*tcptw
= tcp_twsk(sktw
);
114 struct tcp_sock
*tp
= tcp_sk(sk
);
116 /* With PAWS, it is safe from the viewpoint
117 of data integrity. Even without PAWS it is safe provided sequence
118 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
120 Actually, the idea is close to VJ's one, only timestamp cache is
121 held not per host, but per port pair and TW bucket is used as state
124 If TW bucket has been already destroyed we fall back to VJ's scheme
125 and use initial timestamp retrieved from peer table.
127 if (tcptw
->tw_ts_recent_stamp
&&
128 (!twp
|| (sock_net(sk
)->ipv4
.sysctl_tcp_tw_reuse
&&
129 get_seconds() - tcptw
->tw_ts_recent_stamp
> 1))) {
130 tp
->write_seq
= tcptw
->tw_snd_nxt
+ 65535 + 2;
131 if (tp
->write_seq
== 0)
133 tp
->rx_opt
.ts_recent
= tcptw
->tw_ts_recent
;
134 tp
->rx_opt
.ts_recent_stamp
= tcptw
->tw_ts_recent_stamp
;
141 EXPORT_SYMBOL_GPL(tcp_twsk_unique
);
143 /* This will initiate an outgoing connection. */
144 int tcp_v4_connect(struct sock
*sk
, struct sockaddr
*uaddr
, int addr_len
)
146 struct sockaddr_in
*usin
= (struct sockaddr_in
*)uaddr
;
147 struct inet_sock
*inet
= inet_sk(sk
);
148 struct tcp_sock
*tp
= tcp_sk(sk
);
149 __be16 orig_sport
, orig_dport
;
150 __be32 daddr
, nexthop
;
154 struct ip_options_rcu
*inet_opt
;
155 struct inet_timewait_death_row
*tcp_death_row
= &sock_net(sk
)->ipv4
.tcp_death_row
;
157 if (addr_len
< sizeof(struct sockaddr_in
))
160 if (usin
->sin_family
!= AF_INET
)
161 return -EAFNOSUPPORT
;
163 nexthop
= daddr
= usin
->sin_addr
.s_addr
;
164 inet_opt
= rcu_dereference_protected(inet
->inet_opt
,
165 lockdep_sock_is_held(sk
));
166 if (inet_opt
&& inet_opt
->opt
.srr
) {
169 nexthop
= inet_opt
->opt
.faddr
;
172 orig_sport
= inet
->inet_sport
;
173 orig_dport
= usin
->sin_port
;
174 fl4
= &inet
->cork
.fl
.u
.ip4
;
175 rt
= ip_route_connect(fl4
, nexthop
, inet
->inet_saddr
,
176 RT_CONN_FLAGS(sk
), sk
->sk_bound_dev_if
,
178 orig_sport
, orig_dport
, sk
);
181 if (err
== -ENETUNREACH
)
182 IP_INC_STATS(sock_net(sk
), IPSTATS_MIB_OUTNOROUTES
);
186 if (rt
->rt_flags
& (RTCF_MULTICAST
| RTCF_BROADCAST
)) {
191 if (!inet_opt
|| !inet_opt
->opt
.srr
)
194 if (!inet
->inet_saddr
)
195 inet
->inet_saddr
= fl4
->saddr
;
196 sk_rcv_saddr_set(sk
, inet
->inet_saddr
);
198 if (tp
->rx_opt
.ts_recent_stamp
&& inet
->inet_daddr
!= daddr
) {
199 /* Reset inherited state */
200 tp
->rx_opt
.ts_recent
= 0;
201 tp
->rx_opt
.ts_recent_stamp
= 0;
202 if (likely(!tp
->repair
))
206 inet
->inet_dport
= usin
->sin_port
;
207 sk_daddr_set(sk
, daddr
);
209 inet_csk(sk
)->icsk_ext_hdr_len
= 0;
211 inet_csk(sk
)->icsk_ext_hdr_len
= inet_opt
->opt
.optlen
;
213 tp
->rx_opt
.mss_clamp
= TCP_MSS_DEFAULT
;
215 /* Socket identity is still unknown (sport may be zero).
216 * However we set state to SYN-SENT and not releasing socket
217 * lock select source port, enter ourselves into the hash tables and
218 * complete initialization after this.
220 tcp_set_state(sk
, TCP_SYN_SENT
);
221 err
= inet_hash_connect(tcp_death_row
, sk
);
227 rt
= ip_route_newports(fl4
, rt
, orig_sport
, orig_dport
,
228 inet
->inet_sport
, inet
->inet_dport
, sk
);
234 /* OK, now commit destination to socket. */
235 sk
->sk_gso_type
= SKB_GSO_TCPV4
;
236 sk_setup_caps(sk
, &rt
->dst
);
239 if (likely(!tp
->repair
)) {
241 tp
->write_seq
= secure_tcp_seq(inet
->inet_saddr
,
245 tp
->tsoffset
= secure_tcp_ts_off(sock_net(sk
),
250 inet
->inet_id
= tp
->write_seq
^ jiffies
;
252 if (tcp_fastopen_defer_connect(sk
, &err
))
257 err
= tcp_connect(sk
);
266 * This unhashes the socket and releases the local port,
269 tcp_set_state(sk
, TCP_CLOSE
);
271 sk
->sk_route_caps
= 0;
272 inet
->inet_dport
= 0;
275 EXPORT_SYMBOL(tcp_v4_connect
);
/* NOTE(review): incomplete fragment — braces and several interior lines
 * (per the embedded original numbering) are missing; tokens verbatim.
 * Visible logic: bail out for LISTEN/CLOSE states, update the cached
 * path MTU via inet_csk_update_pmtu(), record a soft EMSGSIZE error when
 * the ICMP-reported MTU is below the route MTU with DF set, and — when
 * PMTU discovery is enabled and the cached MSS cookie exceeds the new
 * MTU — resync the MSS and fast-retransmit.
 */
278 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
279 * It can be called through tcp_release_cb() if socket was owned by user
280 * at the time tcp_v4_err() was called to handle ICMP message.
282 void tcp_v4_mtu_reduced(struct sock
*sk
)
284 struct inet_sock
*inet
= inet_sk(sk
);
285 struct dst_entry
*dst
;
288 if ((1 << sk
->sk_state
) & (TCPF_LISTEN
| TCPF_CLOSE
))
290 mtu
= tcp_sk(sk
)->mtu_info
;
291 dst
= inet_csk_update_pmtu(sk
, mtu
);
295 /* Something is about to be wrong... Remember soft error
296 * for the case, if this connection will not able to recover.
298 if (mtu
< dst_mtu(dst
) && ip_dont_fragment(sk
, dst
))
299 sk
->sk_err_soft
= EMSGSIZE
;
303 if (inet
->pmtudisc
!= IP_PMTUDISC_DONT
&&
304 ip_sk_accept_pmtu(sk
) &&
305 inet_csk(sk
)->icsk_pmtu_cookie
> mtu
) {
306 tcp_sync_mss(sk
, mtu
);
308 /* Resend the TCP packet because it's
309 * clear that the old packet has been
310 * dropped. This is the new "fast" path mtu
313 tcp_simple_retransmit(sk
);
314 } /* else let the usual retransmit timer handle it */
316 EXPORT_SYMBOL(tcp_v4_mtu_reduced
);
/* Handle an ICMP redirect for this socket: revalidate the cached route
 * with __sk_dst_check() and invoke the dst's redirect op.
 * NOTE(review): incomplete fragment — braces and the NULL-check on dst
 * (original lines 319, 321-322, 324) are missing; tokens verbatim.
 */
318 static void do_redirect(struct sk_buff
*skb
, struct sock
*sk
)
320 struct dst_entry
*dst
= __sk_dst_check(sk
, 0);
323 dst
->ops
->redirect(dst
, sk
, skb
);
/* NOTE(review): incomplete fragment — braces and several lines (per the
 * embedded original numbering, e.g. the use of the 'abort' flag) are
 * missing; tokens verbatim.  Visible logic: if the ICMP-quoted sequence
 * number does not match the request's initial SYN sequence, count an
 * out-of-window ICMP; otherwise drop the request socket silently and
 * bump the listener's drop statistics.
 */
327 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
328 void tcp_req_err(struct sock
*sk
, u32 seq
, bool abort
)
330 struct request_sock
*req
= inet_reqsk(sk
);
331 struct net
*net
= sock_net(sk
);
333 /* ICMPs are not backlogged, hence we cannot get
334 * an established socket here.
336 if (seq
!= tcp_rsk(req
)->snt_isn
) {
337 __NET_INC_STATS(net
, LINUX_MIB_OUTOFWINDOWICMPS
);
340 * Still in SYN_RECV, just remove it silently.
341 * There is no good way to pass the error to the newly
342 * created socket, and POSIX does not want network
343 * errors returned from accept().
345 inet_csk_reqsk_queue_drop(req
->rsk_listener
, req
);
346 tcp_listendrop(req
->rsk_listener
);
350 EXPORT_SYMBOL(tcp_req_err
);
353 * This routine is called by the ICMP module when it gets some
354 * sort of error condition. If err < 0 then the socket should
355 * be closed and the error returned to the user. If err > 0
356 * it's just the icmp type << 8 | icmp code. After adjustment
357 * header points to the first 8 bytes of the tcp header. We need
358 * to find the appropriate port.
360 * The locking strategy used here is very "optimistic". When
361 * someone else accesses the socket the ICMP is just dropped
362 * and for some paths there is no check at all.
363 * A more general error queue to queue errors for later handling
364 * is probably better.
368 void tcp_v4_err(struct sk_buff
*icmp_skb
, u32 info
)
370 const struct iphdr
*iph
= (const struct iphdr
*)icmp_skb
->data
;
371 struct tcphdr
*th
= (struct tcphdr
*)(icmp_skb
->data
+ (iph
->ihl
<< 2));
372 struct inet_connection_sock
*icsk
;
374 struct inet_sock
*inet
;
375 const int type
= icmp_hdr(icmp_skb
)->type
;
376 const int code
= icmp_hdr(icmp_skb
)->code
;
379 struct request_sock
*fastopen
;
384 struct net
*net
= dev_net(icmp_skb
->dev
);
386 sk
= __inet_lookup_established(net
, &tcp_hashinfo
, iph
->daddr
,
387 th
->dest
, iph
->saddr
, ntohs(th
->source
),
390 __ICMP_INC_STATS(net
, ICMP_MIB_INERRORS
);
393 if (sk
->sk_state
== TCP_TIME_WAIT
) {
394 inet_twsk_put(inet_twsk(sk
));
397 seq
= ntohl(th
->seq
);
398 if (sk
->sk_state
== TCP_NEW_SYN_RECV
)
399 return tcp_req_err(sk
, seq
,
400 type
== ICMP_PARAMETERPROB
||
401 type
== ICMP_TIME_EXCEEDED
||
402 (type
== ICMP_DEST_UNREACH
&&
403 (code
== ICMP_NET_UNREACH
||
404 code
== ICMP_HOST_UNREACH
)));
407 /* If too many ICMPs get dropped on busy
408 * servers this needs to be solved differently.
409 * We do take care of PMTU discovery (RFC1191) special case :
410 * we can receive locally generated ICMP messages while socket is held.
412 if (sock_owned_by_user(sk
)) {
413 if (!(type
== ICMP_DEST_UNREACH
&& code
== ICMP_FRAG_NEEDED
))
414 __NET_INC_STATS(net
, LINUX_MIB_LOCKDROPPEDICMPS
);
416 if (sk
->sk_state
== TCP_CLOSE
)
419 if (unlikely(iph
->ttl
< inet_sk(sk
)->min_ttl
)) {
420 __NET_INC_STATS(net
, LINUX_MIB_TCPMINTTLDROP
);
426 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
427 fastopen
= tp
->fastopen_rsk
;
428 snd_una
= fastopen
? tcp_rsk(fastopen
)->snt_isn
: tp
->snd_una
;
429 if (sk
->sk_state
!= TCP_LISTEN
&&
430 !between(seq
, snd_una
, tp
->snd_nxt
)) {
431 __NET_INC_STATS(net
, LINUX_MIB_OUTOFWINDOWICMPS
);
437 if (!sock_owned_by_user(sk
))
438 do_redirect(icmp_skb
, sk
);
440 case ICMP_SOURCE_QUENCH
:
441 /* Just silently ignore these. */
443 case ICMP_PARAMETERPROB
:
446 case ICMP_DEST_UNREACH
:
447 if (code
> NR_ICMP_UNREACH
)
450 if (code
== ICMP_FRAG_NEEDED
) { /* PMTU discovery (RFC1191) */
451 /* We are not interested in TCP_LISTEN and open_requests
452 * (SYN-ACKs send out by Linux are always <576bytes so
453 * they should go through unfragmented).
455 if (sk
->sk_state
== TCP_LISTEN
)
459 if (!sock_owned_by_user(sk
)) {
460 tcp_v4_mtu_reduced(sk
);
462 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED
, &sk
->sk_tsq_flags
))
468 err
= icmp_err_convert
[code
].errno
;
469 /* check if icmp_skb allows revert of backoff
470 * (see draft-zimmermann-tcp-lcd) */
471 if (code
!= ICMP_NET_UNREACH
&& code
!= ICMP_HOST_UNREACH
)
473 if (seq
!= tp
->snd_una
|| !icsk
->icsk_retransmits
||
474 !icsk
->icsk_backoff
|| fastopen
)
477 if (sock_owned_by_user(sk
))
480 icsk
->icsk_backoff
--;
481 icsk
->icsk_rto
= tp
->srtt_us
? __tcp_set_rto(tp
) :
483 icsk
->icsk_rto
= inet_csk_rto_backoff(icsk
, TCP_RTO_MAX
);
485 skb
= tcp_write_queue_head(sk
);
488 tcp_mstamp_refresh(tp
);
489 delta_us
= (u32
)(tp
->tcp_mstamp
- skb
->skb_mstamp
);
490 remaining
= icsk
->icsk_rto
-
491 usecs_to_jiffies(delta_us
);
494 inet_csk_reset_xmit_timer(sk
, ICSK_TIME_RETRANS
,
495 remaining
, TCP_RTO_MAX
);
497 /* RTO revert clocked out retransmission.
498 * Will retransmit now */
499 tcp_retransmit_timer(sk
);
503 case ICMP_TIME_EXCEEDED
:
510 switch (sk
->sk_state
) {
513 /* Only in fast or simultaneous open. If a fast open socket is
514 * is already accepted it is treated as a connected one below.
516 if (fastopen
&& !fastopen
->sk
)
519 if (!sock_owned_by_user(sk
)) {
522 sk
->sk_error_report(sk
);
526 sk
->sk_err_soft
= err
;
531 /* If we've already connected we will keep trying
532 * until we time out, or the user gives up.
534 * rfc1122 4.2.3.9 allows to consider as hard errors
535 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
536 * but it is obsoleted by pmtu discovery).
538 * Note, that in modern internet, where routing is unreliable
539 * and in each dark corner broken firewalls sit, sending random
540 * errors ordered by their masters even this two messages finally lose
541 * their original sense (even Linux sends invalid PORT_UNREACHs)
543 * Now we are in compliance with RFCs.
548 if (!sock_owned_by_user(sk
) && inet
->recverr
) {
550 sk
->sk_error_report(sk
);
551 } else { /* Only an error on timeout */
552 sk
->sk_err_soft
= err
;
/* Fill in the TCP checksum field for an outgoing IPv4 segment.
 * CHECKSUM_PARTIAL path: store the folded pseudo-header sum and set
 * csum_start/csum_offset so the NIC (or software fallback) finishes it.
 * NOTE(review): incomplete fragment — braces and the tail of the
 * software-checksum else-branch (original lines 570-575) are missing;
 * tokens verbatim.
 */
560 void __tcp_v4_send_check(struct sk_buff
*skb
, __be32 saddr
, __be32 daddr
)
562 struct tcphdr
*th
= tcp_hdr(skb
);
564 if (skb
->ip_summed
== CHECKSUM_PARTIAL
) {
565 th
->check
= ~tcp_v4_check(skb
->len
, saddr
, daddr
, 0);
566 skb
->csum_start
= skb_transport_header(skb
) - skb
->head
;
567 skb
->csum_offset
= offsetof(struct tcphdr
, check
);
569 th
->check
= tcp_v4_check(skb
->len
, saddr
, daddr
,
/* Socket-level wrapper: checksum an outgoing skb using the socket's
 * bound source/destination addresses via __tcp_v4_send_check().
 * NOTE(review): function braces are missing in this extraction; tokens
 * verbatim.
 */
576 /* This routine computes an IPv4 TCP checksum. */
577 void tcp_v4_send_check(struct sock
*sk
, struct sk_buff
*skb
)
579 const struct inet_sock
*inet
= inet_sk(sk
);
581 __tcp_v4_send_check(skb
, inet
->inet_saddr
, inet
->inet_daddr
);
583 EXPORT_SYMBOL(tcp_v4_send_check
);
586 * This routine will send an RST to the other tcp.
588 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
590 * Answer: if a packet caused RST, it is not for a socket
591 * existing in our system, if it is matched to a socket,
592 * it is just duplicate segment or bug in other side's TCP.
593 * So that we build reply only basing on parameters
594 * arrived with segment.
595 * Exception: precedence violation. We do not implement it in any case.
598 static void tcp_v4_send_reset(const struct sock
*sk
, struct sk_buff
*skb
)
600 const struct tcphdr
*th
= tcp_hdr(skb
);
603 #ifdef CONFIG_TCP_MD5SIG
604 __be32 opt
[(TCPOLEN_MD5SIG_ALIGNED
>> 2)];
607 struct ip_reply_arg arg
;
608 #ifdef CONFIG_TCP_MD5SIG
609 struct tcp_md5sig_key
*key
= NULL
;
610 const __u8
*hash_location
= NULL
;
611 unsigned char newhash
[16];
613 struct sock
*sk1
= NULL
;
617 /* Never send a reset in response to a reset. */
621 /* If sk not NULL, it means we did a successful lookup and incoming
622 * route had to be correct. prequeue might have dropped our dst.
624 if (!sk
&& skb_rtable(skb
)->rt_type
!= RTN_LOCAL
)
627 /* Swap the send and the receive. */
628 memset(&rep
, 0, sizeof(rep
));
629 rep
.th
.dest
= th
->source
;
630 rep
.th
.source
= th
->dest
;
631 rep
.th
.doff
= sizeof(struct tcphdr
) / 4;
635 rep
.th
.seq
= th
->ack_seq
;
638 rep
.th
.ack_seq
= htonl(ntohl(th
->seq
) + th
->syn
+ th
->fin
+
639 skb
->len
- (th
->doff
<< 2));
642 memset(&arg
, 0, sizeof(arg
));
643 arg
.iov
[0].iov_base
= (unsigned char *)&rep
;
644 arg
.iov
[0].iov_len
= sizeof(rep
.th
);
646 net
= sk
? sock_net(sk
) : dev_net(skb_dst(skb
)->dev
);
647 #ifdef CONFIG_TCP_MD5SIG
649 hash_location
= tcp_parse_md5sig_option(th
);
650 if (sk
&& sk_fullsock(sk
)) {
651 key
= tcp_md5_do_lookup(sk
, (union tcp_md5_addr
*)
652 &ip_hdr(skb
)->saddr
, AF_INET
);
653 } else if (hash_location
) {
655 * active side is lost. Try to find listening socket through
656 * source port, and then find md5 key through listening socket.
657 * we are not loose security here:
658 * Incoming packet is checked with md5 hash with finding key,
659 * no RST generated if md5 hash doesn't match.
661 sk1
= __inet_lookup_listener(net
, &tcp_hashinfo
, NULL
, 0,
663 th
->source
, ip_hdr(skb
)->daddr
,
664 ntohs(th
->source
), inet_iif(skb
));
665 /* don't send rst if it can't find key */
669 key
= tcp_md5_do_lookup(sk1
, (union tcp_md5_addr
*)
670 &ip_hdr(skb
)->saddr
, AF_INET
);
675 genhash
= tcp_v4_md5_hash_skb(newhash
, key
, NULL
, skb
);
676 if (genhash
|| memcmp(hash_location
, newhash
, 16) != 0)
682 rep
.opt
[0] = htonl((TCPOPT_NOP
<< 24) |
684 (TCPOPT_MD5SIG
<< 8) |
686 /* Update length and the length the header thinks exists */
687 arg
.iov
[0].iov_len
+= TCPOLEN_MD5SIG_ALIGNED
;
688 rep
.th
.doff
= arg
.iov
[0].iov_len
/ 4;
690 tcp_v4_md5_hash_hdr((__u8
*) &rep
.opt
[1],
691 key
, ip_hdr(skb
)->saddr
,
692 ip_hdr(skb
)->daddr
, &rep
.th
);
695 arg
.csum
= csum_tcpudp_nofold(ip_hdr(skb
)->daddr
,
696 ip_hdr(skb
)->saddr
, /* XXX */
697 arg
.iov
[0].iov_len
, IPPROTO_TCP
, 0);
698 arg
.csumoffset
= offsetof(struct tcphdr
, check
) / 2;
699 arg
.flags
= (sk
&& inet_sk_transparent(sk
)) ? IP_REPLY_ARG_NOSRCCHECK
: 0;
701 /* When socket is gone, all binding information is lost.
702 * routing might fail in this case. No choice here, if we choose to force
703 * input interface, we will misroute in case of asymmetric route.
706 arg
.bound_dev_if
= sk
->sk_bound_dev_if
;
708 BUILD_BUG_ON(offsetof(struct sock
, sk_bound_dev_if
) !=
709 offsetof(struct inet_timewait_sock
, tw_bound_dev_if
));
711 arg
.tos
= ip_hdr(skb
)->tos
;
712 arg
.uid
= sock_net_uid(net
, sk
&& sk_fullsock(sk
) ? sk
: NULL
);
714 ip_send_unicast_reply(*this_cpu_ptr(net
->ipv4
.tcp_sk
),
715 skb
, &TCP_SKB_CB(skb
)->header
.h4
.opt
,
716 ip_hdr(skb
)->saddr
, ip_hdr(skb
)->daddr
,
717 &arg
, arg
.iov
[0].iov_len
);
719 __TCP_INC_STATS(net
, TCP_MIB_OUTSEGS
);
720 __TCP_INC_STATS(net
, TCP_MIB_OUTRSTS
);
723 #ifdef CONFIG_TCP_MD5SIG
729 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
730 outside socket context is ugly, certainly. What can I do?
733 static void tcp_v4_send_ack(const struct sock
*sk
,
734 struct sk_buff
*skb
, u32 seq
, u32 ack
,
735 u32 win
, u32 tsval
, u32 tsecr
, int oif
,
736 struct tcp_md5sig_key
*key
,
737 int reply_flags
, u8 tos
)
739 const struct tcphdr
*th
= tcp_hdr(skb
);
742 __be32 opt
[(TCPOLEN_TSTAMP_ALIGNED
>> 2)
743 #ifdef CONFIG_TCP_MD5SIG
744 + (TCPOLEN_MD5SIG_ALIGNED
>> 2)
748 struct net
*net
= sock_net(sk
);
749 struct ip_reply_arg arg
;
751 memset(&rep
.th
, 0, sizeof(struct tcphdr
));
752 memset(&arg
, 0, sizeof(arg
));
754 arg
.iov
[0].iov_base
= (unsigned char *)&rep
;
755 arg
.iov
[0].iov_len
= sizeof(rep
.th
);
757 rep
.opt
[0] = htonl((TCPOPT_NOP
<< 24) | (TCPOPT_NOP
<< 16) |
758 (TCPOPT_TIMESTAMP
<< 8) |
760 rep
.opt
[1] = htonl(tsval
);
761 rep
.opt
[2] = htonl(tsecr
);
762 arg
.iov
[0].iov_len
+= TCPOLEN_TSTAMP_ALIGNED
;
765 /* Swap the send and the receive. */
766 rep
.th
.dest
= th
->source
;
767 rep
.th
.source
= th
->dest
;
768 rep
.th
.doff
= arg
.iov
[0].iov_len
/ 4;
769 rep
.th
.seq
= htonl(seq
);
770 rep
.th
.ack_seq
= htonl(ack
);
772 rep
.th
.window
= htons(win
);
774 #ifdef CONFIG_TCP_MD5SIG
776 int offset
= (tsecr
) ? 3 : 0;
778 rep
.opt
[offset
++] = htonl((TCPOPT_NOP
<< 24) |
780 (TCPOPT_MD5SIG
<< 8) |
782 arg
.iov
[0].iov_len
+= TCPOLEN_MD5SIG_ALIGNED
;
783 rep
.th
.doff
= arg
.iov
[0].iov_len
/4;
785 tcp_v4_md5_hash_hdr((__u8
*) &rep
.opt
[offset
],
786 key
, ip_hdr(skb
)->saddr
,
787 ip_hdr(skb
)->daddr
, &rep
.th
);
790 arg
.flags
= reply_flags
;
791 arg
.csum
= csum_tcpudp_nofold(ip_hdr(skb
)->daddr
,
792 ip_hdr(skb
)->saddr
, /* XXX */
793 arg
.iov
[0].iov_len
, IPPROTO_TCP
, 0);
794 arg
.csumoffset
= offsetof(struct tcphdr
, check
) / 2;
796 arg
.bound_dev_if
= oif
;
798 arg
.uid
= sock_net_uid(net
, sk_fullsock(sk
) ? sk
: NULL
);
800 ip_send_unicast_reply(*this_cpu_ptr(net
->ipv4
.tcp_sk
),
801 skb
, &TCP_SKB_CB(skb
)->header
.h4
.opt
,
802 ip_hdr(skb
)->saddr
, ip_hdr(skb
)->daddr
,
803 &arg
, arg
.iov
[0].iov_len
);
805 __TCP_INC_STATS(net
, TCP_MIB_OUTSEGS
);
/* ACK a segment that matched a TIME-WAIT socket: replies with the
 * timewait bucket's snd_nxt/rcv_nxt, its scaled receive window, and its
 * timestamp offset, forwarding the MD5 key and transparent-proxy flag.
 * NOTE(review): incomplete fragment — braces and the trailing arguments
 * of the tcp_v4_send_ack() call (original lines past 821) are missing;
 * tokens verbatim.
 */
809 static void tcp_v4_timewait_ack(struct sock
*sk
, struct sk_buff
*skb
)
811 struct inet_timewait_sock
*tw
= inet_twsk(sk
);
812 struct tcp_timewait_sock
*tcptw
= tcp_twsk(sk
);
814 tcp_v4_send_ack(sk
, skb
,
815 tcptw
->tw_snd_nxt
, tcptw
->tw_rcv_nxt
,
816 tcptw
->tw_rcv_wnd
>> tw
->tw_rcv_wscale
,
817 tcp_time_stamp_raw() + tcptw
->tw_ts_offset
,
820 tcp_twsk_md5_key(tcptw
),
821 tw
->tw_transparent
? IP_REPLY_ARG_NOSRCCHECK
: 0,
828 static void tcp_v4_reqsk_send_ack(const struct sock
*sk
, struct sk_buff
*skb
,
829 struct request_sock
*req
)
831 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
832 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
834 u32 seq
= (sk
->sk_state
== TCP_LISTEN
) ? tcp_rsk(req
)->snt_isn
+ 1 :
838 * The window field (SEG.WND) of every outgoing segment, with the
839 * exception of <SYN> segments, MUST be right-shifted by
840 * Rcv.Wind.Shift bits:
842 tcp_v4_send_ack(sk
, skb
, seq
,
843 tcp_rsk(req
)->rcv_nxt
,
844 req
->rsk_rcv_wnd
>> inet_rsk(req
)->rcv_wscale
,
845 tcp_time_stamp_raw() + tcp_rsk(req
)->ts_off
,
848 tcp_md5_do_lookup(sk
, (union tcp_md5_addr
*)&ip_hdr(skb
)->daddr
,
850 inet_rsk(req
)->no_srccheck
? IP_REPLY_ARG_NOSRCCHECK
: 0,
855 * Send a SYN-ACK after having received a SYN.
856 * This still operates on a request_sock only, not on a big
859 static int tcp_v4_send_synack(const struct sock
*sk
, struct dst_entry
*dst
,
861 struct request_sock
*req
,
862 struct tcp_fastopen_cookie
*foc
,
863 enum tcp_synack_type synack_type
)
865 const struct inet_request_sock
*ireq
= inet_rsk(req
);
870 /* First, grab a route. */
871 if (!dst
&& (dst
= inet_csk_route_req(sk
, &fl4
, req
)) == NULL
)
874 skb
= tcp_make_synack(sk
, dst
, req
, foc
, synack_type
);
877 __tcp_v4_send_check(skb
, ireq
->ir_loc_addr
, ireq
->ir_rmt_addr
);
879 err
= ip_build_and_send_pkt(skb
, sk
, ireq
->ir_loc_addr
,
882 err
= net_xmit_eval(err
);
/* Free the IP options saved on the request socket when it is destroyed.
 * NOTE(review): function braces are missing in this extraction; tokens
 * verbatim.
 */
889 * IPv4 request_sock destructor.
891 static void tcp_v4_reqsk_destructor(struct request_sock
*req
)
893 kfree(inet_rsk(req
)->opt
);
896 #ifdef CONFIG_TCP_MD5SIG
898 * RFC2385 MD5 checksumming requires a mapping of
899 * IP address->MD5 Key.
900 * We need to maintain these in the sk structure.
903 /* Find the Key structure for an address. */
904 struct tcp_md5sig_key
*tcp_md5_do_lookup(const struct sock
*sk
,
905 const union tcp_md5_addr
*addr
,
908 const struct tcp_sock
*tp
= tcp_sk(sk
);
909 struct tcp_md5sig_key
*key
;
910 const struct tcp_md5sig_info
*md5sig
;
912 struct tcp_md5sig_key
*best_match
= NULL
;
915 /* caller either holds rcu_read_lock() or socket lock */
916 md5sig
= rcu_dereference_check(tp
->md5sig_info
,
917 lockdep_sock_is_held(sk
));
921 hlist_for_each_entry_rcu(key
, &md5sig
->head
, node
) {
922 if (key
->family
!= family
)
925 if (family
== AF_INET
) {
926 mask
= inet_make_mask(key
->prefixlen
);
927 match
= (key
->addr
.a4
.s_addr
& mask
) ==
928 (addr
->a4
.s_addr
& mask
);
929 #if IS_ENABLED(CONFIG_IPV6)
930 } else if (family
== AF_INET6
) {
931 match
= ipv6_prefix_equal(&key
->addr
.a6
, &addr
->a6
,
938 if (match
&& (!best_match
||
939 key
->prefixlen
> best_match
->prefixlen
))
944 EXPORT_SYMBOL(tcp_md5_do_lookup
);
946 static struct tcp_md5sig_key
*tcp_md5_do_lookup_exact(const struct sock
*sk
,
947 const union tcp_md5_addr
*addr
,
948 int family
, u8 prefixlen
)
950 const struct tcp_sock
*tp
= tcp_sk(sk
);
951 struct tcp_md5sig_key
*key
;
952 unsigned int size
= sizeof(struct in_addr
);
953 const struct tcp_md5sig_info
*md5sig
;
955 /* caller either holds rcu_read_lock() or socket lock */
956 md5sig
= rcu_dereference_check(tp
->md5sig_info
,
957 lockdep_sock_is_held(sk
));
960 #if IS_ENABLED(CONFIG_IPV6)
961 if (family
== AF_INET6
)
962 size
= sizeof(struct in6_addr
);
964 hlist_for_each_entry_rcu(key
, &md5sig
->head
, node
) {
965 if (key
->family
!= family
)
967 if (!memcmp(&key
->addr
, addr
, size
) &&
968 key
->prefixlen
== prefixlen
)
/* Look up the MD5 key for a peer socket: takes addr_sk's destination
 * address and delegates to tcp_md5_do_lookup() with AF_INET.
 * NOTE(review): function braces are missing in this extraction; tokens
 * verbatim.
 */
974 struct tcp_md5sig_key
*tcp_v4_md5_lookup(const struct sock
*sk
,
975 const struct sock
*addr_sk
)
977 const union tcp_md5_addr
*addr
;
979 addr
= (const union tcp_md5_addr
*)&addr_sk
->sk_daddr
;
980 return tcp_md5_do_lookup(sk
, addr
, AF_INET
);
982 EXPORT_SYMBOL(tcp_v4_md5_lookup
);
984 /* This can be called on a newly created socket, from other files */
985 int tcp_md5_do_add(struct sock
*sk
, const union tcp_md5_addr
*addr
,
986 int family
, u8 prefixlen
, const u8
*newkey
, u8 newkeylen
,
989 /* Add Key to the list */
990 struct tcp_md5sig_key
*key
;
991 struct tcp_sock
*tp
= tcp_sk(sk
);
992 struct tcp_md5sig_info
*md5sig
;
994 key
= tcp_md5_do_lookup_exact(sk
, addr
, family
, prefixlen
);
996 /* Pre-existing entry - just update that one. */
997 memcpy(key
->key
, newkey
, newkeylen
);
998 key
->keylen
= newkeylen
;
1002 md5sig
= rcu_dereference_protected(tp
->md5sig_info
,
1003 lockdep_sock_is_held(sk
));
1005 md5sig
= kmalloc(sizeof(*md5sig
), gfp
);
1009 sk_nocaps_add(sk
, NETIF_F_GSO_MASK
);
1010 INIT_HLIST_HEAD(&md5sig
->head
);
1011 rcu_assign_pointer(tp
->md5sig_info
, md5sig
);
1014 key
= sock_kmalloc(sk
, sizeof(*key
), gfp
);
1017 if (!tcp_alloc_md5sig_pool()) {
1018 sock_kfree_s(sk
, key
, sizeof(*key
));
1022 memcpy(key
->key
, newkey
, newkeylen
);
1023 key
->keylen
= newkeylen
;
1024 key
->family
= family
;
1025 key
->prefixlen
= prefixlen
;
1026 memcpy(&key
->addr
, addr
,
1027 (family
== AF_INET6
) ? sizeof(struct in6_addr
) :
1028 sizeof(struct in_addr
));
1029 hlist_add_head_rcu(&key
->node
, &md5sig
->head
);
1032 EXPORT_SYMBOL(tcp_md5_do_add
);
/* Remove an MD5 key matching (addr, family, prefixlen) exactly: unlink
 * it RCU-safely, credit the socket's option-memory accounting, and free
 * it after a grace period via kfree_rcu().
 * NOTE(review): incomplete fragment — braces, the prefixlen parameter
 * line and the NULL-key early return (per the embedded numbering) are
 * missing; tokens verbatim.
 */
1034 int tcp_md5_do_del(struct sock
*sk
, const union tcp_md5_addr
*addr
, int family
,
1037 struct tcp_md5sig_key
*key
;
1039 key
= tcp_md5_do_lookup_exact(sk
, addr
, family
, prefixlen
);
1042 hlist_del_rcu(&key
->node
);
1043 atomic_sub(sizeof(*key
), &sk
->sk_omem_alloc
);
1044 kfree_rcu(key
, rcu
);
1047 EXPORT_SYMBOL(tcp_md5_do_del
);
/* Tear down every MD5 key on the socket: walk the list with the _safe
 * iterator (entries are deleted during traversal), unlink each key,
 * adjust option-memory accounting, and free via kfree_rcu().
 * NOTE(review): braces and the closing of the loop are missing in this
 * extraction; tokens verbatim.
 */
1049 static void tcp_clear_md5_list(struct sock
*sk
)
1051 struct tcp_sock
*tp
= tcp_sk(sk
);
1052 struct tcp_md5sig_key
*key
;
1053 struct hlist_node
*n
;
1054 struct tcp_md5sig_info
*md5sig
;
1056 md5sig
= rcu_dereference_protected(tp
->md5sig_info
, 1);
1058 hlist_for_each_entry_safe(key
, n
, &md5sig
->head
, node
) {
1059 hlist_del_rcu(&key
->node
);
1060 atomic_sub(sizeof(*key
), &sk
->sk_omem_alloc
);
1061 kfree_rcu(key
, rcu
);
1065 static int tcp_v4_parse_md5_keys(struct sock
*sk
, int optname
,
1066 char __user
*optval
, int optlen
)
1068 struct tcp_md5sig cmd
;
1069 struct sockaddr_in
*sin
= (struct sockaddr_in
*)&cmd
.tcpm_addr
;
1072 if (optlen
< sizeof(cmd
))
1075 if (copy_from_user(&cmd
, optval
, sizeof(cmd
)))
1078 if (sin
->sin_family
!= AF_INET
)
1081 if (optname
== TCP_MD5SIG_EXT
&&
1082 cmd
.tcpm_flags
& TCP_MD5SIG_FLAG_PREFIX
) {
1083 prefixlen
= cmd
.tcpm_prefixlen
;
1088 if (!cmd
.tcpm_keylen
)
1089 return tcp_md5_do_del(sk
, (union tcp_md5_addr
*)&sin
->sin_addr
.s_addr
,
1090 AF_INET
, prefixlen
);
1092 if (cmd
.tcpm_keylen
> TCP_MD5SIG_MAXKEYLEN
)
1095 return tcp_md5_do_add(sk
, (union tcp_md5_addr
*)&sin
->sin_addr
.s_addr
,
1096 AF_INET
, prefixlen
, cmd
.tcpm_key
, cmd
.tcpm_keylen
,
1100 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool
*hp
,
1101 __be32 daddr
, __be32 saddr
,
1102 const struct tcphdr
*th
, int nbytes
)
1104 struct tcp4_pseudohdr
*bp
;
1105 struct scatterlist sg
;
1112 bp
->protocol
= IPPROTO_TCP
;
1113 bp
->len
= cpu_to_be16(nbytes
);
1115 _th
= (struct tcphdr
*)(bp
+ 1);
1116 memcpy(_th
, th
, sizeof(*th
));
1119 sg_init_one(&sg
, bp
, sizeof(*bp
) + sizeof(*th
));
1120 ahash_request_set_crypt(hp
->md5_req
, &sg
, NULL
,
1121 sizeof(*bp
) + sizeof(*th
));
1122 return crypto_ahash_update(hp
->md5_req
);
1125 static int tcp_v4_md5_hash_hdr(char *md5_hash
, const struct tcp_md5sig_key
*key
,
1126 __be32 daddr
, __be32 saddr
, const struct tcphdr
*th
)
1128 struct tcp_md5sig_pool
*hp
;
1129 struct ahash_request
*req
;
1131 hp
= tcp_get_md5sig_pool();
1133 goto clear_hash_noput
;
1136 if (crypto_ahash_init(req
))
1138 if (tcp_v4_md5_hash_headers(hp
, daddr
, saddr
, th
, th
->doff
<< 2))
1140 if (tcp_md5_hash_key(hp
, key
))
1142 ahash_request_set_crypt(req
, NULL
, md5_hash
, 0);
1143 if (crypto_ahash_final(req
))
1146 tcp_put_md5sig_pool();
1150 tcp_put_md5sig_pool();
1152 memset(md5_hash
, 0, 16);
1156 int tcp_v4_md5_hash_skb(char *md5_hash
, const struct tcp_md5sig_key
*key
,
1157 const struct sock
*sk
,
1158 const struct sk_buff
*skb
)
1160 struct tcp_md5sig_pool
*hp
;
1161 struct ahash_request
*req
;
1162 const struct tcphdr
*th
= tcp_hdr(skb
);
1163 __be32 saddr
, daddr
;
1165 if (sk
) { /* valid for establish/request sockets */
1166 saddr
= sk
->sk_rcv_saddr
;
1167 daddr
= sk
->sk_daddr
;
1169 const struct iphdr
*iph
= ip_hdr(skb
);
1174 hp
= tcp_get_md5sig_pool();
1176 goto clear_hash_noput
;
1179 if (crypto_ahash_init(req
))
1182 if (tcp_v4_md5_hash_headers(hp
, daddr
, saddr
, th
, skb
->len
))
1184 if (tcp_md5_hash_skb_data(hp
, skb
, th
->doff
<< 2))
1186 if (tcp_md5_hash_key(hp
, key
))
1188 ahash_request_set_crypt(req
, NULL
, md5_hash
, 0);
1189 if (crypto_ahash_final(req
))
1192 tcp_put_md5sig_pool();
1196 tcp_put_md5sig_pool();
1198 memset(md5_hash
, 0, 16);
1201 EXPORT_SYMBOL(tcp_v4_md5_hash_skb
);
1205 /* Called with rcu_read_lock() */
1206 static bool tcp_v4_inbound_md5_hash(const struct sock
*sk
,
1207 const struct sk_buff
*skb
)
1209 #ifdef CONFIG_TCP_MD5SIG
1211 * This gets called for each TCP segment that arrives
1212 * so we want to be efficient.
1213 * We have 3 drop cases:
1214 * o No MD5 hash and one expected.
1215 * o MD5 hash and we're not expecting one.
1216 * o MD5 hash and its wrong.
1218 const __u8
*hash_location
= NULL
;
1219 struct tcp_md5sig_key
*hash_expected
;
1220 const struct iphdr
*iph
= ip_hdr(skb
);
1221 const struct tcphdr
*th
= tcp_hdr(skb
);
1223 unsigned char newhash
[16];
1225 hash_expected
= tcp_md5_do_lookup(sk
, (union tcp_md5_addr
*)&iph
->saddr
,
1227 hash_location
= tcp_parse_md5sig_option(th
);
1229 /* We've parsed the options - do we have a hash? */
1230 if (!hash_expected
&& !hash_location
)
1233 if (hash_expected
&& !hash_location
) {
1234 NET_INC_STATS(sock_net(sk
), LINUX_MIB_TCPMD5NOTFOUND
);
1238 if (!hash_expected
&& hash_location
) {
1239 NET_INC_STATS(sock_net(sk
), LINUX_MIB_TCPMD5UNEXPECTED
);
1243 /* Okay, so this is hash_expected and hash_location -
1244 * so we need to calculate the checksum.
1246 genhash
= tcp_v4_md5_hash_skb(newhash
,
1250 if (genhash
|| memcmp(hash_location
, newhash
, 16) != 0) {
1251 NET_INC_STATS(sock_net(sk
), LINUX_MIB_TCPMD5FAILURE
);
1252 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1253 &iph
->saddr
, ntohs(th
->source
),
1254 &iph
->daddr
, ntohs(th
->dest
),
1255 genhash
? " tcp_v4_calc_md5_hash failed"
/* Initialize an IPv4 request socket from an incoming SYN: record the
 * address pair (our rcv_saddr = packet daddr, peer daddr = packet saddr)
 * and stash any IP options from the skb.
 * NOTE(review): function braces are missing in this extraction; tokens
 * verbatim.
 */
1264 static void tcp_v4_init_req(struct request_sock
*req
,
1265 const struct sock
*sk_listener
,
1266 struct sk_buff
*skb
)
1268 struct inet_request_sock
*ireq
= inet_rsk(req
);
1270 sk_rcv_saddr_set(req_to_sk(req
), ip_hdr(skb
)->daddr
);
1271 sk_daddr_set(req_to_sk(req
), ip_hdr(skb
)->saddr
);
1272 ireq
->opt
= tcp_v4_save_options(skb
);
/* Route an incoming connection request via inet_csk_route_req().
 * NOTE(review): incomplete fragment — the 'struct flowi *fl' parameter
 * line (original 1276) and the braces are missing, which is why 'fl'
 * appears unbound below; tokens verbatim.
 */
1275 static struct dst_entry
*tcp_v4_route_req(const struct sock
*sk
,
1277 const struct request_sock
*req
)
1279 return inet_csk_route_req(sk
, &fl
->u
.ip4
, req
);
1282 struct request_sock_ops tcp_request_sock_ops __read_mostly
= {
1284 .obj_size
= sizeof(struct tcp_request_sock
),
1285 .rtx_syn_ack
= tcp_rtx_synack
,
1286 .send_ack
= tcp_v4_reqsk_send_ack
,
1287 .destructor
= tcp_v4_reqsk_destructor
,
1288 .send_reset
= tcp_v4_send_reset
,
1289 .syn_ack_timeout
= tcp_syn_ack_timeout
,
1292 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops
= {
1293 .mss_clamp
= TCP_MSS_DEFAULT
,
1294 #ifdef CONFIG_TCP_MD5SIG
1295 .req_md5_lookup
= tcp_v4_md5_lookup
,
1296 .calc_md5_hash
= tcp_v4_md5_hash_skb
,
1298 .init_req
= tcp_v4_init_req
,
1299 #ifdef CONFIG_SYN_COOKIES
1300 .cookie_init_seq
= cookie_v4_init_sequence
,
1302 .route_req
= tcp_v4_route_req
,
1303 .init_seq
= tcp_v4_init_seq
,
1304 .init_ts_off
= tcp_v4_init_ts_off
,
1305 .send_synack
= tcp_v4_send_synack
,
1308 int tcp_v4_conn_request(struct sock
*sk
, struct sk_buff
*skb
)
1310 /* Never answer to SYNs send to broadcast or multicast */
1311 if (skb_rtable(skb
)->rt_flags
& (RTCF_BROADCAST
| RTCF_MULTICAST
))
1314 return tcp_conn_request(&tcp_request_sock_ops
,
1315 &tcp_request_sock_ipv4_ops
, sk
, skb
);
1321 EXPORT_SYMBOL(tcp_v4_conn_request
);
1325 * The three way handshake has completed - we got a valid synack -
1326 * now create the new socket.
1328 struct sock
*tcp_v4_syn_recv_sock(const struct sock
*sk
, struct sk_buff
*skb
,
1329 struct request_sock
*req
,
1330 struct dst_entry
*dst
,
1331 struct request_sock
*req_unhash
,
1334 struct inet_request_sock
*ireq
;
1335 struct inet_sock
*newinet
;
1336 struct tcp_sock
*newtp
;
1338 #ifdef CONFIG_TCP_MD5SIG
1339 struct tcp_md5sig_key
*key
;
1341 struct ip_options_rcu
*inet_opt
;
1343 if (sk_acceptq_is_full(sk
))
1346 newsk
= tcp_create_openreq_child(sk
, req
, skb
);
1350 newsk
->sk_gso_type
= SKB_GSO_TCPV4
;
1351 inet_sk_rx_dst_set(newsk
, skb
);
1353 newtp
= tcp_sk(newsk
);
1354 newinet
= inet_sk(newsk
);
1355 ireq
= inet_rsk(req
);
1356 sk_daddr_set(newsk
, ireq
->ir_rmt_addr
);
1357 sk_rcv_saddr_set(newsk
, ireq
->ir_loc_addr
);
1358 newsk
->sk_bound_dev_if
= ireq
->ir_iif
;
1359 newinet
->inet_saddr
= ireq
->ir_loc_addr
;
1360 inet_opt
= ireq
->opt
;
1361 rcu_assign_pointer(newinet
->inet_opt
, inet_opt
);
1363 newinet
->mc_index
= inet_iif(skb
);
1364 newinet
->mc_ttl
= ip_hdr(skb
)->ttl
;
1365 newinet
->rcv_tos
= ip_hdr(skb
)->tos
;
1366 inet_csk(newsk
)->icsk_ext_hdr_len
= 0;
1368 inet_csk(newsk
)->icsk_ext_hdr_len
= inet_opt
->opt
.optlen
;
1369 newinet
->inet_id
= newtp
->write_seq
^ jiffies
;
1372 dst
= inet_csk_route_child_sock(sk
, newsk
, req
);
1376 /* syncookie case : see end of cookie_v4_check() */
1378 sk_setup_caps(newsk
, dst
);
1380 tcp_ca_openreq_child(newsk
, dst
);
1382 tcp_sync_mss(newsk
, dst_mtu(dst
));
1383 newtp
->advmss
= tcp_mss_clamp(tcp_sk(sk
), dst_metric_advmss(dst
));
1385 tcp_initialize_rcv_mss(newsk
);
1387 #ifdef CONFIG_TCP_MD5SIG
1388 /* Copy over the MD5 key from the original socket */
1389 key
= tcp_md5_do_lookup(sk
, (union tcp_md5_addr
*)&newinet
->inet_daddr
,
1393 * We're using one, so create a matching key
1394 * on the newsk structure. If we fail to get
1395 * memory, then we end up not copying the key
1398 tcp_md5_do_add(newsk
, (union tcp_md5_addr
*)&newinet
->inet_daddr
,
1399 AF_INET
, 32, key
->key
, key
->keylen
, GFP_ATOMIC
);
1400 sk_nocaps_add(newsk
, NETIF_F_GSO_MASK
);
1404 if (__inet_inherit_port(sk
, newsk
) < 0)
1406 *own_req
= inet_ehash_nolisten(newsk
, req_to_sk(req_unhash
));
1408 tcp_move_syn(newtp
, req
);
1413 NET_INC_STATS(sock_net(sk
), LINUX_MIB_LISTENOVERFLOWS
);
1420 inet_csk_prepare_forced_close(newsk
);
1424 EXPORT_SYMBOL(tcp_v4_syn_recv_sock
);
/* With syncookies enabled, a non-SYN segment hitting a listener may carry
 * a valid cookie; cookie_v4_check() then returns the child socket.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}
1437 /* The socket must have it's spinlock held when we get
1438 * here, unless it is a TCP_LISTEN socket.
1440 * We have a potential double-lock case here, so even when
1441 * doing backlog processing we use the BH locking scheme.
1442 * This is because we cannot sleep with the original spinlock
1445 int tcp_v4_do_rcv(struct sock
*sk
, struct sk_buff
*skb
)
1449 if (sk
->sk_state
== TCP_ESTABLISHED
) { /* Fast path */
1450 struct dst_entry
*dst
= sk
->sk_rx_dst
;
1452 sock_rps_save_rxhash(sk
, skb
);
1453 sk_mark_napi_id(sk
, skb
);
1455 if (inet_sk(sk
)->rx_dst_ifindex
!= skb
->skb_iif
||
1456 !dst
->ops
->check(dst
, 0)) {
1458 sk
->sk_rx_dst
= NULL
;
1461 tcp_rcv_established(sk
, skb
, tcp_hdr(skb
), skb
->len
);
1465 if (tcp_checksum_complete(skb
))
1468 if (sk
->sk_state
== TCP_LISTEN
) {
1469 struct sock
*nsk
= tcp_v4_cookie_check(sk
, skb
);
1474 if (tcp_child_process(sk
, nsk
, skb
)) {
1481 sock_rps_save_rxhash(sk
, skb
);
1483 if (tcp_rcv_state_process(sk
, skb
)) {
1490 tcp_v4_send_reset(rsk
, skb
);
1493 /* Be careful here. If this function gets more complicated and
1494 * gcc suffers from register pressure on the x86, sk (in %ebx)
1495 * might be destroyed here. This current version compiles correctly,
1496 * but you have been warned.
1501 TCP_INC_STATS(sock_net(sk
), TCP_MIB_CSUMERRORS
);
1502 TCP_INC_STATS(sock_net(sk
), TCP_MIB_INERRS
);
1505 EXPORT_SYMBOL(tcp_v4_do_rcv
);
1507 int tcp_v4_early_demux(struct sk_buff
*skb
)
1509 const struct iphdr
*iph
;
1510 const struct tcphdr
*th
;
1513 if (skb
->pkt_type
!= PACKET_HOST
)
1516 if (!pskb_may_pull(skb
, skb_transport_offset(skb
) + sizeof(struct tcphdr
)))
1522 if (th
->doff
< sizeof(struct tcphdr
) / 4)
1525 sk
= __inet_lookup_established(dev_net(skb
->dev
), &tcp_hashinfo
,
1526 iph
->saddr
, th
->source
,
1527 iph
->daddr
, ntohs(th
->dest
),
1531 skb
->destructor
= sock_edemux
;
1532 if (sk_fullsock(sk
)) {
1533 struct dst_entry
*dst
= READ_ONCE(sk
->sk_rx_dst
);
1536 dst
= dst_check(dst
, 0);
1538 inet_sk(sk
)->rx_dst_ifindex
== skb
->skb_iif
)
1539 skb_dst_set_noref(skb
, dst
);
1545 /* Packet is added to VJ-style prequeue for processing in process
1546 * context, if a reader task is waiting. Apparently, this exciting
1547 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1548 * failed somewhere. Latency? Burstiness? Well, at least now we will
1549 * see, why it failed. 8)8) --ANK
1552 bool tcp_prequeue(struct sock
*sk
, struct sk_buff
*skb
)
1554 struct tcp_sock
*tp
= tcp_sk(sk
);
1556 if (sysctl_tcp_low_latency
|| !tp
->ucopy
.task
)
1559 if (skb
->len
<= tcp_hdrlen(skb
) &&
1560 skb_queue_len(&tp
->ucopy
.prequeue
) == 0)
1563 /* Before escaping RCU protected region, we need to take care of skb
1564 * dst. Prequeue is only enabled for established sockets.
1565 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
1566 * Instead of doing full sk_rx_dst validity here, let's perform
1567 * an optimistic check.
1569 if (likely(sk
->sk_rx_dst
))
1572 skb_dst_force_safe(skb
);
1574 __skb_queue_tail(&tp
->ucopy
.prequeue
, skb
);
1575 tp
->ucopy
.memory
+= skb
->truesize
;
1576 if (skb_queue_len(&tp
->ucopy
.prequeue
) >= 32 ||
1577 tp
->ucopy
.memory
+ atomic_read(&sk
->sk_rmem_alloc
) > sk
->sk_rcvbuf
) {
1578 struct sk_buff
*skb1
;
1580 BUG_ON(sock_owned_by_user(sk
));
1581 __NET_ADD_STATS(sock_net(sk
), LINUX_MIB_TCPPREQUEUEDROPPED
,
1582 skb_queue_len(&tp
->ucopy
.prequeue
));
1584 while ((skb1
= __skb_dequeue(&tp
->ucopy
.prequeue
)) != NULL
)
1585 sk_backlog_rcv(sk
, skb1
);
1587 tp
->ucopy
.memory
= 0;
1588 } else if (skb_queue_len(&tp
->ucopy
.prequeue
) == 1) {
1589 wake_up_interruptible_sync_poll(sk_sleep(sk
),
1590 POLLIN
| POLLRDNORM
| POLLRDBAND
);
1591 if (!inet_csk_ack_scheduled(sk
))
1592 inet_csk_reset_xmit_timer(sk
, ICSK_TIME_DACK
,
1593 (3 * tcp_rto_min(sk
)) / 4,
1598 EXPORT_SYMBOL(tcp_prequeue
);
1600 bool tcp_add_backlog(struct sock
*sk
, struct sk_buff
*skb
)
1602 u32 limit
= sk
->sk_rcvbuf
+ sk
->sk_sndbuf
;
1604 /* Only socket owner can try to collapse/prune rx queues
1605 * to reduce memory overhead, so add a little headroom here.
1606 * Few sockets backlog are possibly concurrently non empty.
1610 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1611 * we can fix skb->truesize to its real value to avoid future drops.
1612 * This is valid because skb is not yet charged to the socket.
1613 * It has been noticed pure SACK packets were sometimes dropped
1614 * (if cooked by drivers without copybreak feature).
1618 if (unlikely(sk_add_backlog(sk
, skb
, limit
))) {
1620 __NET_INC_STATS(sock_net(sk
), LINUX_MIB_TCPBACKLOGDROP
);
1625 EXPORT_SYMBOL(tcp_add_backlog
);
1627 int tcp_filter(struct sock
*sk
, struct sk_buff
*skb
)
1629 struct tcphdr
*th
= (struct tcphdr
*)skb
->data
;
1630 unsigned int eaten
= skb
->len
;
1633 err
= sk_filter_trim_cap(sk
, skb
, th
->doff
* 4);
1636 TCP_SKB_CB(skb
)->end_seq
-= eaten
;
1640 EXPORT_SYMBOL(tcp_filter
);
1646 int tcp_v4_rcv(struct sk_buff
*skb
)
1648 struct net
*net
= dev_net(skb
->dev
);
1649 const struct iphdr
*iph
;
1650 const struct tcphdr
*th
;
1655 if (skb
->pkt_type
!= PACKET_HOST
)
1658 /* Count it even if it's bad */
1659 __TCP_INC_STATS(net
, TCP_MIB_INSEGS
);
1661 if (!pskb_may_pull(skb
, sizeof(struct tcphdr
)))
1664 th
= (const struct tcphdr
*)skb
->data
;
1666 if (unlikely(th
->doff
< sizeof(struct tcphdr
) / 4))
1668 if (!pskb_may_pull(skb
, th
->doff
* 4))
1671 /* An explanation is required here, I think.
1672 * Packet length and doff are validated by header prediction,
1673 * provided case of th->doff==0 is eliminated.
1674 * So, we defer the checks. */
1676 if (skb_checksum_init(skb
, IPPROTO_TCP
, inet_compute_pseudo
))
1679 th
= (const struct tcphdr
*)skb
->data
;
1681 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1682 * barrier() makes sure compiler wont play fool^Waliasing games.
1684 memmove(&TCP_SKB_CB(skb
)->header
.h4
, IPCB(skb
),
1685 sizeof(struct inet_skb_parm
));
1688 TCP_SKB_CB(skb
)->seq
= ntohl(th
->seq
);
1689 TCP_SKB_CB(skb
)->end_seq
= (TCP_SKB_CB(skb
)->seq
+ th
->syn
+ th
->fin
+
1690 skb
->len
- th
->doff
* 4);
1691 TCP_SKB_CB(skb
)->ack_seq
= ntohl(th
->ack_seq
);
1692 TCP_SKB_CB(skb
)->tcp_flags
= tcp_flag_byte(th
);
1693 TCP_SKB_CB(skb
)->tcp_tw_isn
= 0;
1694 TCP_SKB_CB(skb
)->ip_dsfield
= ipv4_get_dsfield(iph
);
1695 TCP_SKB_CB(skb
)->sacked
= 0;
1698 sk
= __inet_lookup_skb(&tcp_hashinfo
, skb
, __tcp_hdrlen(th
), th
->source
,
1699 th
->dest
, &refcounted
);
1704 if (sk
->sk_state
== TCP_TIME_WAIT
)
1707 if (sk
->sk_state
== TCP_NEW_SYN_RECV
) {
1708 struct request_sock
*req
= inet_reqsk(sk
);
1711 sk
= req
->rsk_listener
;
1712 if (unlikely(tcp_v4_inbound_md5_hash(sk
, skb
))) {
1713 sk_drops_add(sk
, skb
);
1717 if (unlikely(sk
->sk_state
!= TCP_LISTEN
)) {
1718 inet_csk_reqsk_queue_drop_and_put(sk
, req
);
1721 /* We own a reference on the listener, increase it again
1722 * as we might lose it too soon.
1727 if (!tcp_filter(sk
, skb
))
1728 nsk
= tcp_check_req(sk
, skb
, req
, false);
1731 goto discard_and_relse
;
1735 } else if (tcp_child_process(sk
, nsk
, skb
)) {
1736 tcp_v4_send_reset(nsk
, skb
);
1737 goto discard_and_relse
;
1743 if (unlikely(iph
->ttl
< inet_sk(sk
)->min_ttl
)) {
1744 __NET_INC_STATS(net
, LINUX_MIB_TCPMINTTLDROP
);
1745 goto discard_and_relse
;
1748 if (!xfrm4_policy_check(sk
, XFRM_POLICY_IN
, skb
))
1749 goto discard_and_relse
;
1751 if (tcp_v4_inbound_md5_hash(sk
, skb
))
1752 goto discard_and_relse
;
1756 if (tcp_filter(sk
, skb
))
1757 goto discard_and_relse
;
1758 th
= (const struct tcphdr
*)skb
->data
;
1763 if (sk
->sk_state
== TCP_LISTEN
) {
1764 ret
= tcp_v4_do_rcv(sk
, skb
);
1765 goto put_and_return
;
1768 sk_incoming_cpu_update(sk
);
1770 bh_lock_sock_nested(sk
);
1771 tcp_segs_in(tcp_sk(sk
), skb
);
1773 if (!sock_owned_by_user(sk
)) {
1774 if (!tcp_prequeue(sk
, skb
))
1775 ret
= tcp_v4_do_rcv(sk
, skb
);
1776 } else if (tcp_add_backlog(sk
, skb
)) {
1777 goto discard_and_relse
;
1788 if (!xfrm4_policy_check(NULL
, XFRM_POLICY_IN
, skb
))
1791 if (tcp_checksum_complete(skb
)) {
1793 __TCP_INC_STATS(net
, TCP_MIB_CSUMERRORS
);
1795 __TCP_INC_STATS(net
, TCP_MIB_INERRS
);
1797 tcp_v4_send_reset(NULL
, skb
);
1801 /* Discard frame. */
1806 sk_drops_add(sk
, skb
);
1812 if (!xfrm4_policy_check(NULL
, XFRM_POLICY_IN
, skb
)) {
1813 inet_twsk_put(inet_twsk(sk
));
1817 if (tcp_checksum_complete(skb
)) {
1818 inet_twsk_put(inet_twsk(sk
));
1821 switch (tcp_timewait_state_process(inet_twsk(sk
), skb
, th
)) {
1823 struct sock
*sk2
= inet_lookup_listener(dev_net(skb
->dev
),
1826 iph
->saddr
, th
->source
,
1827 iph
->daddr
, th
->dest
,
1830 inet_twsk_deschedule_put(inet_twsk(sk
));
1835 /* Fall through to ACK */
1838 tcp_v4_timewait_ack(sk
, skb
);
1841 tcp_v4_send_reset(sk
, skb
);
1842 inet_twsk_deschedule_put(inet_twsk(sk
));
1844 case TCP_TW_SUCCESS
:;
1849 static struct timewait_sock_ops tcp_timewait_sock_ops
= {
1850 .twsk_obj_size
= sizeof(struct tcp_timewait_sock
),
1851 .twsk_unique
= tcp_twsk_unique
,
1852 .twsk_destructor
= tcp_twsk_destructor
,
1855 void inet_sk_rx_dst_set(struct sock
*sk
, const struct sk_buff
*skb
)
1857 struct dst_entry
*dst
= skb_dst(skb
);
1859 if (dst
&& dst_hold_safe(dst
)) {
1860 sk
->sk_rx_dst
= dst
;
1861 inet_sk(sk
)->rx_dst_ifindex
= skb
->skb_iif
;
1864 EXPORT_SYMBOL(inet_sk_rx_dst_set
);
1866 const struct inet_connection_sock_af_ops ipv4_specific
= {
1867 .queue_xmit
= ip_queue_xmit
,
1868 .send_check
= tcp_v4_send_check
,
1869 .rebuild_header
= inet_sk_rebuild_header
,
1870 .sk_rx_dst_set
= inet_sk_rx_dst_set
,
1871 .conn_request
= tcp_v4_conn_request
,
1872 .syn_recv_sock
= tcp_v4_syn_recv_sock
,
1873 .net_header_len
= sizeof(struct iphdr
),
1874 .setsockopt
= ip_setsockopt
,
1875 .getsockopt
= ip_getsockopt
,
1876 .addr2sockaddr
= inet_csk_addr2sockaddr
,
1877 .sockaddr_len
= sizeof(struct sockaddr_in
),
1878 #ifdef CONFIG_COMPAT
1879 .compat_setsockopt
= compat_ip_setsockopt
,
1880 .compat_getsockopt
= compat_ip_getsockopt
,
1882 .mtu_reduced
= tcp_v4_mtu_reduced
,
1884 EXPORT_SYMBOL(ipv4_specific
);
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif
1894 /* NOTE: A lot of things set to zero explicitly by call to
1895 * sk_alloc() so need not be done here.
1897 static int tcp_v4_init_sock(struct sock
*sk
)
1899 struct inet_connection_sock
*icsk
= inet_csk(sk
);
1903 icsk
->icsk_af_ops
= &ipv4_specific
;
1905 #ifdef CONFIG_TCP_MD5SIG
1906 tcp_sk(sk
)->af_specific
= &tcp_sock_ipv4_specific
;
1912 void tcp_v4_destroy_sock(struct sock
*sk
)
1914 struct tcp_sock
*tp
= tcp_sk(sk
);
1916 tcp_clear_xmit_timers(sk
);
1918 tcp_cleanup_congestion_control(sk
);
1920 tcp_cleanup_ulp(sk
);
1922 /* Cleanup up the write buffer. */
1923 tcp_write_queue_purge(sk
);
1925 /* Check if we want to disable active TFO */
1926 tcp_fastopen_active_disable_ofo_check(sk
);
1928 /* Cleans up our, hopefully empty, out_of_order_queue. */
1929 skb_rbtree_purge(&tp
->out_of_order_queue
);
1931 #ifdef CONFIG_TCP_MD5SIG
1932 /* Clean up the MD5 key list, if any */
1933 if (tp
->md5sig_info
) {
1934 tcp_clear_md5_list(sk
);
1935 kfree_rcu(tp
->md5sig_info
, rcu
);
1936 tp
->md5sig_info
= NULL
;
1940 /* Clean prequeue, it must be empty really */
1941 __skb_queue_purge(&tp
->ucopy
.prequeue
);
1943 /* Clean up a referenced TCP bind bucket. */
1944 if (inet_csk(sk
)->icsk_bind_hash
)
1947 BUG_ON(tp
->fastopen_rsk
);
1949 /* If socket is aborted during connect operation */
1950 tcp_free_fastopen_req(tp
);
1951 tcp_saved_syn_free(tp
);
1953 sk_sockets_allocated_dec(sk
);
1955 EXPORT_SYMBOL(tcp_v4_destroy_sock
);
1957 #ifdef CONFIG_PROC_FS
1958 /* Proc filesystem TCP sock list dumping. */
1961 * Get next listener socket follow cur. If cur is NULL, get first socket
1962 * starting from bucket given in st->bucket; when st->bucket is zero the
1963 * very first socket in the hash table is returned.
1965 static void *listening_get_next(struct seq_file
*seq
, void *cur
)
1967 struct tcp_iter_state
*st
= seq
->private;
1968 struct net
*net
= seq_file_net(seq
);
1969 struct inet_listen_hashbucket
*ilb
;
1970 struct sock
*sk
= cur
;
1974 ilb
= &tcp_hashinfo
.listening_hash
[st
->bucket
];
1975 spin_lock(&ilb
->lock
);
1976 sk
= sk_head(&ilb
->head
);
1980 ilb
= &tcp_hashinfo
.listening_hash
[st
->bucket
];
1986 sk_for_each_from(sk
) {
1987 if (!net_eq(sock_net(sk
), net
))
1989 if (sk
->sk_family
== st
->family
)
1992 spin_unlock(&ilb
->lock
);
1994 if (++st
->bucket
< INET_LHTABLE_SIZE
)
1999 static void *listening_get_idx(struct seq_file
*seq
, loff_t
*pos
)
2001 struct tcp_iter_state
*st
= seq
->private;
2006 rc
= listening_get_next(seq
, NULL
);
2008 while (rc
&& *pos
) {
2009 rc
= listening_get_next(seq
, rc
);
2015 static inline bool empty_bucket(const struct tcp_iter_state
*st
)
2017 return hlist_nulls_empty(&tcp_hashinfo
.ehash
[st
->bucket
].chain
);
2021 * Get first established socket starting from bucket given in st->bucket.
2022 * If st->bucket is zero, the very first socket in the hash is returned.
2024 static void *established_get_first(struct seq_file
*seq
)
2026 struct tcp_iter_state
*st
= seq
->private;
2027 struct net
*net
= seq_file_net(seq
);
2031 for (; st
->bucket
<= tcp_hashinfo
.ehash_mask
; ++st
->bucket
) {
2033 struct hlist_nulls_node
*node
;
2034 spinlock_t
*lock
= inet_ehash_lockp(&tcp_hashinfo
, st
->bucket
);
2036 /* Lockless fast path for the common case of empty buckets */
2037 if (empty_bucket(st
))
2041 sk_nulls_for_each(sk
, node
, &tcp_hashinfo
.ehash
[st
->bucket
].chain
) {
2042 if (sk
->sk_family
!= st
->family
||
2043 !net_eq(sock_net(sk
), net
)) {
2049 spin_unlock_bh(lock
);
2055 static void *established_get_next(struct seq_file
*seq
, void *cur
)
2057 struct sock
*sk
= cur
;
2058 struct hlist_nulls_node
*node
;
2059 struct tcp_iter_state
*st
= seq
->private;
2060 struct net
*net
= seq_file_net(seq
);
2065 sk
= sk_nulls_next(sk
);
2067 sk_nulls_for_each_from(sk
, node
) {
2068 if (sk
->sk_family
== st
->family
&& net_eq(sock_net(sk
), net
))
2072 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo
, st
->bucket
));
2074 return established_get_first(seq
);
2077 static void *established_get_idx(struct seq_file
*seq
, loff_t pos
)
2079 struct tcp_iter_state
*st
= seq
->private;
2083 rc
= established_get_first(seq
);
2086 rc
= established_get_next(seq
, rc
);
2092 static void *tcp_get_idx(struct seq_file
*seq
, loff_t pos
)
2095 struct tcp_iter_state
*st
= seq
->private;
2097 st
->state
= TCP_SEQ_STATE_LISTENING
;
2098 rc
= listening_get_idx(seq
, &pos
);
2101 st
->state
= TCP_SEQ_STATE_ESTABLISHED
;
2102 rc
= established_get_idx(seq
, pos
);
2108 static void *tcp_seek_last_pos(struct seq_file
*seq
)
2110 struct tcp_iter_state
*st
= seq
->private;
2111 int offset
= st
->offset
;
2112 int orig_num
= st
->num
;
2115 switch (st
->state
) {
2116 case TCP_SEQ_STATE_LISTENING
:
2117 if (st
->bucket
>= INET_LHTABLE_SIZE
)
2119 st
->state
= TCP_SEQ_STATE_LISTENING
;
2120 rc
= listening_get_next(seq
, NULL
);
2121 while (offset
-- && rc
)
2122 rc
= listening_get_next(seq
, rc
);
2126 st
->state
= TCP_SEQ_STATE_ESTABLISHED
;
2128 case TCP_SEQ_STATE_ESTABLISHED
:
2129 if (st
->bucket
> tcp_hashinfo
.ehash_mask
)
2131 rc
= established_get_first(seq
);
2132 while (offset
-- && rc
)
2133 rc
= established_get_next(seq
, rc
);
2141 static void *tcp_seq_start(struct seq_file
*seq
, loff_t
*pos
)
2143 struct tcp_iter_state
*st
= seq
->private;
2146 if (*pos
&& *pos
== st
->last_pos
) {
2147 rc
= tcp_seek_last_pos(seq
);
2152 st
->state
= TCP_SEQ_STATE_LISTENING
;
2156 rc
= *pos
? tcp_get_idx(seq
, *pos
- 1) : SEQ_START_TOKEN
;
2159 st
->last_pos
= *pos
;
2163 static void *tcp_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
2165 struct tcp_iter_state
*st
= seq
->private;
2168 if (v
== SEQ_START_TOKEN
) {
2169 rc
= tcp_get_idx(seq
, 0);
2173 switch (st
->state
) {
2174 case TCP_SEQ_STATE_LISTENING
:
2175 rc
= listening_get_next(seq
, v
);
2177 st
->state
= TCP_SEQ_STATE_ESTABLISHED
;
2180 rc
= established_get_first(seq
);
2183 case TCP_SEQ_STATE_ESTABLISHED
:
2184 rc
= established_get_next(seq
, v
);
2189 st
->last_pos
= *pos
;
2193 static void tcp_seq_stop(struct seq_file
*seq
, void *v
)
2195 struct tcp_iter_state
*st
= seq
->private;
2197 switch (st
->state
) {
2198 case TCP_SEQ_STATE_LISTENING
:
2199 if (v
!= SEQ_START_TOKEN
)
2200 spin_unlock(&tcp_hashinfo
.listening_hash
[st
->bucket
].lock
);
2202 case TCP_SEQ_STATE_ESTABLISHED
:
2204 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo
, st
->bucket
));
2209 int tcp_seq_open(struct inode
*inode
, struct file
*file
)
2211 struct tcp_seq_afinfo
*afinfo
= PDE_DATA(inode
);
2212 struct tcp_iter_state
*s
;
2215 err
= seq_open_net(inode
, file
, &afinfo
->seq_ops
,
2216 sizeof(struct tcp_iter_state
));
2220 s
= ((struct seq_file
*)file
->private_data
)->private;
2221 s
->family
= afinfo
->family
;
2225 EXPORT_SYMBOL(tcp_seq_open
);
2227 int tcp_proc_register(struct net
*net
, struct tcp_seq_afinfo
*afinfo
)
2230 struct proc_dir_entry
*p
;
2232 afinfo
->seq_ops
.start
= tcp_seq_start
;
2233 afinfo
->seq_ops
.next
= tcp_seq_next
;
2234 afinfo
->seq_ops
.stop
= tcp_seq_stop
;
2236 p
= proc_create_data(afinfo
->name
, S_IRUGO
, net
->proc_net
,
2237 afinfo
->seq_fops
, afinfo
);
2242 EXPORT_SYMBOL(tcp_proc_register
);
2244 void tcp_proc_unregister(struct net
*net
, struct tcp_seq_afinfo
*afinfo
)
2246 remove_proc_entry(afinfo
->name
, net
->proc_net
);
2248 EXPORT_SYMBOL(tcp_proc_unregister
);
2250 static void get_openreq4(const struct request_sock
*req
,
2251 struct seq_file
*f
, int i
)
2253 const struct inet_request_sock
*ireq
= inet_rsk(req
);
2254 long delta
= req
->rsk_timer
.expires
- jiffies
;
2256 seq_printf(f
, "%4d: %08X:%04X %08X:%04X"
2257 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2262 ntohs(ireq
->ir_rmt_port
),
2264 0, 0, /* could print option size, but that is af dependent. */
2265 1, /* timers active (only the expire timer) */
2266 jiffies_delta_to_clock_t(delta
),
2268 from_kuid_munged(seq_user_ns(f
),
2269 sock_i_uid(req
->rsk_listener
)),
2270 0, /* non standard timer */
2271 0, /* open_requests have no inode */
2276 static void get_tcp4_sock(struct sock
*sk
, struct seq_file
*f
, int i
)
2279 unsigned long timer_expires
;
2280 const struct tcp_sock
*tp
= tcp_sk(sk
);
2281 const struct inet_connection_sock
*icsk
= inet_csk(sk
);
2282 const struct inet_sock
*inet
= inet_sk(sk
);
2283 const struct fastopen_queue
*fastopenq
= &icsk
->icsk_accept_queue
.fastopenq
;
2284 __be32 dest
= inet
->inet_daddr
;
2285 __be32 src
= inet
->inet_rcv_saddr
;
2286 __u16 destp
= ntohs(inet
->inet_dport
);
2287 __u16 srcp
= ntohs(inet
->inet_sport
);
2291 if (icsk
->icsk_pending
== ICSK_TIME_RETRANS
||
2292 icsk
->icsk_pending
== ICSK_TIME_REO_TIMEOUT
||
2293 icsk
->icsk_pending
== ICSK_TIME_LOSS_PROBE
) {
2295 timer_expires
= icsk
->icsk_timeout
;
2296 } else if (icsk
->icsk_pending
== ICSK_TIME_PROBE0
) {
2298 timer_expires
= icsk
->icsk_timeout
;
2299 } else if (timer_pending(&sk
->sk_timer
)) {
2301 timer_expires
= sk
->sk_timer
.expires
;
2304 timer_expires
= jiffies
;
2307 state
= sk_state_load(sk
);
2308 if (state
== TCP_LISTEN
)
2309 rx_queue
= sk
->sk_ack_backlog
;
2311 /* Because we don't lock the socket,
2312 * we might find a transient negative value.
2314 rx_queue
= max_t(int, tp
->rcv_nxt
- tp
->copied_seq
, 0);
2316 seq_printf(f
, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2317 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2318 i
, src
, srcp
, dest
, destp
, state
,
2319 tp
->write_seq
- tp
->snd_una
,
2322 jiffies_delta_to_clock_t(timer_expires
- jiffies
),
2323 icsk
->icsk_retransmits
,
2324 from_kuid_munged(seq_user_ns(f
), sock_i_uid(sk
)),
2325 icsk
->icsk_probes_out
,
2327 refcount_read(&sk
->sk_refcnt
), sk
,
2328 jiffies_to_clock_t(icsk
->icsk_rto
),
2329 jiffies_to_clock_t(icsk
->icsk_ack
.ato
),
2330 (icsk
->icsk_ack
.quick
<< 1) | icsk
->icsk_ack
.pingpong
,
2332 state
== TCP_LISTEN
?
2333 fastopenq
->max_qlen
:
2334 (tcp_in_initial_slowstart(tp
) ? -1 : tp
->snd_ssthresh
));
2337 static void get_timewait4_sock(const struct inet_timewait_sock
*tw
,
2338 struct seq_file
*f
, int i
)
2340 long delta
= tw
->tw_timer
.expires
- jiffies
;
2344 dest
= tw
->tw_daddr
;
2345 src
= tw
->tw_rcv_saddr
;
2346 destp
= ntohs(tw
->tw_dport
);
2347 srcp
= ntohs(tw
->tw_sport
);
2349 seq_printf(f
, "%4d: %08X:%04X %08X:%04X"
2350 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2351 i
, src
, srcp
, dest
, destp
, tw
->tw_substate
, 0, 0,
2352 3, jiffies_delta_to_clock_t(delta
), 0, 0, 0, 0,
2353 refcount_read(&tw
->tw_refcnt
), tw
);
2358 static int tcp4_seq_show(struct seq_file
*seq
, void *v
)
2360 struct tcp_iter_state
*st
;
2361 struct sock
*sk
= v
;
2363 seq_setwidth(seq
, TMPSZ
- 1);
2364 if (v
== SEQ_START_TOKEN
) {
2365 seq_puts(seq
, " sl local_address rem_address st tx_queue "
2366 "rx_queue tr tm->when retrnsmt uid timeout "
2372 if (sk
->sk_state
== TCP_TIME_WAIT
)
2373 get_timewait4_sock(v
, seq
, st
->num
);
2374 else if (sk
->sk_state
== TCP_NEW_SYN_RECV
)
2375 get_openreq4(v
, seq
, st
->num
);
2377 get_tcp4_sock(v
, seq
, st
->num
);
2383 static const struct file_operations tcp_afinfo_seq_fops
= {
2384 .owner
= THIS_MODULE
,
2385 .open
= tcp_seq_open
,
2387 .llseek
= seq_lseek
,
2388 .release
= seq_release_net
2391 static struct tcp_seq_afinfo tcp4_seq_afinfo
= {
2394 .seq_fops
= &tcp_afinfo_seq_fops
,
2396 .show
= tcp4_seq_show
,
2400 static int __net_init
tcp4_proc_init_net(struct net
*net
)
2402 return tcp_proc_register(net
, &tcp4_seq_afinfo
);
2405 static void __net_exit
tcp4_proc_exit_net(struct net
*net
)
2407 tcp_proc_unregister(net
, &tcp4_seq_afinfo
);
2410 static struct pernet_operations tcp4_net_ops
= {
2411 .init
= tcp4_proc_init_net
,
2412 .exit
= tcp4_proc_exit_net
,
2415 int __init
tcp4_proc_init(void)
2417 return register_pernet_subsys(&tcp4_net_ops
);
2420 void tcp4_proc_exit(void)
2422 unregister_pernet_subsys(&tcp4_net_ops
);
2424 #endif /* CONFIG_PROC_FS */
2426 struct proto tcp_prot
= {
2428 .owner
= THIS_MODULE
,
2430 .connect
= tcp_v4_connect
,
2431 .disconnect
= tcp_disconnect
,
2432 .accept
= inet_csk_accept
,
2434 .init
= tcp_v4_init_sock
,
2435 .destroy
= tcp_v4_destroy_sock
,
2436 .shutdown
= tcp_shutdown
,
2437 .setsockopt
= tcp_setsockopt
,
2438 .getsockopt
= tcp_getsockopt
,
2439 .keepalive
= tcp_set_keepalive
,
2440 .recvmsg
= tcp_recvmsg
,
2441 .sendmsg
= tcp_sendmsg
,
2442 .sendpage
= tcp_sendpage
,
2443 .backlog_rcv
= tcp_v4_do_rcv
,
2444 .release_cb
= tcp_release_cb
,
2446 .unhash
= inet_unhash
,
2447 .get_port
= inet_csk_get_port
,
2448 .enter_memory_pressure
= tcp_enter_memory_pressure
,
2449 .leave_memory_pressure
= tcp_leave_memory_pressure
,
2450 .stream_memory_free
= tcp_stream_memory_free
,
2451 .sockets_allocated
= &tcp_sockets_allocated
,
2452 .orphan_count
= &tcp_orphan_count
,
2453 .memory_allocated
= &tcp_memory_allocated
,
2454 .memory_pressure
= &tcp_memory_pressure
,
2455 .sysctl_mem
= sysctl_tcp_mem
,
2456 .sysctl_wmem
= sysctl_tcp_wmem
,
2457 .sysctl_rmem
= sysctl_tcp_rmem
,
2458 .max_header
= MAX_TCP_HEADER
,
2459 .obj_size
= sizeof(struct tcp_sock
),
2460 .slab_flags
= SLAB_TYPESAFE_BY_RCU
,
2461 .twsk_prot
= &tcp_timewait_sock_ops
,
2462 .rsk_prot
= &tcp_request_sock_ops
,
2463 .h
.hashinfo
= &tcp_hashinfo
,
2464 .no_autobind
= true,
2465 #ifdef CONFIG_COMPAT
2466 .compat_setsockopt
= compat_tcp_setsockopt
,
2467 .compat_getsockopt
= compat_tcp_getsockopt
,
2469 .diag_destroy
= tcp_abort
,
2471 EXPORT_SYMBOL(tcp_prot
);
2473 static void __net_exit
tcp_sk_exit(struct net
*net
)
2477 for_each_possible_cpu(cpu
)
2478 inet_ctl_sock_destroy(*per_cpu_ptr(net
->ipv4
.tcp_sk
, cpu
));
2479 free_percpu(net
->ipv4
.tcp_sk
);
2482 static int __net_init
tcp_sk_init(struct net
*net
)
2486 net
->ipv4
.tcp_sk
= alloc_percpu(struct sock
*);
2487 if (!net
->ipv4
.tcp_sk
)
2490 for_each_possible_cpu(cpu
) {
2493 res
= inet_ctl_sock_create(&sk
, PF_INET
, SOCK_RAW
,
2497 sock_set_flag(sk
, SOCK_USE_WRITE_QUEUE
);
2498 *per_cpu_ptr(net
->ipv4
.tcp_sk
, cpu
) = sk
;
2501 net
->ipv4
.sysctl_tcp_ecn
= 2;
2502 net
->ipv4
.sysctl_tcp_ecn_fallback
= 1;
2504 net
->ipv4
.sysctl_tcp_base_mss
= TCP_BASE_MSS
;
2505 net
->ipv4
.sysctl_tcp_probe_threshold
= TCP_PROBE_THRESHOLD
;
2506 net
->ipv4
.sysctl_tcp_probe_interval
= TCP_PROBE_INTERVAL
;
2508 net
->ipv4
.sysctl_tcp_keepalive_time
= TCP_KEEPALIVE_TIME
;
2509 net
->ipv4
.sysctl_tcp_keepalive_probes
= TCP_KEEPALIVE_PROBES
;
2510 net
->ipv4
.sysctl_tcp_keepalive_intvl
= TCP_KEEPALIVE_INTVL
;
2512 net
->ipv4
.sysctl_tcp_syn_retries
= TCP_SYN_RETRIES
;
2513 net
->ipv4
.sysctl_tcp_synack_retries
= TCP_SYNACK_RETRIES
;
2514 net
->ipv4
.sysctl_tcp_syncookies
= 1;
2515 net
->ipv4
.sysctl_tcp_reordering
= TCP_FASTRETRANS_THRESH
;
2516 net
->ipv4
.sysctl_tcp_retries1
= TCP_RETR1
;
2517 net
->ipv4
.sysctl_tcp_retries2
= TCP_RETR2
;
2518 net
->ipv4
.sysctl_tcp_orphan_retries
= 0;
2519 net
->ipv4
.sysctl_tcp_fin_timeout
= TCP_FIN_TIMEOUT
;
2520 net
->ipv4
.sysctl_tcp_notsent_lowat
= UINT_MAX
;
2521 net
->ipv4
.sysctl_tcp_tw_reuse
= 0;
2523 cnt
= tcp_hashinfo
.ehash_mask
+ 1;
2524 net
->ipv4
.tcp_death_row
.sysctl_max_tw_buckets
= (cnt
+ 1) / 2;
2525 net
->ipv4
.tcp_death_row
.hashinfo
= &tcp_hashinfo
;
2527 net
->ipv4
.sysctl_max_syn_backlog
= max(128, cnt
/ 256);
2528 net
->ipv4
.sysctl_tcp_sack
= 1;
2529 net
->ipv4
.sysctl_tcp_window_scaling
= 1;
2530 net
->ipv4
.sysctl_tcp_timestamps
= 1;
2539 static void __net_exit
tcp_sk_exit_batch(struct list_head
*net_exit_list
)
2541 inet_twsk_purge(&tcp_hashinfo
, AF_INET
);
2544 static struct pernet_operations __net_initdata tcp_sk_ops
= {
2545 .init
= tcp_sk_init
,
2546 .exit
= tcp_sk_exit
,
2547 .exit_batch
= tcp_sk_exit_batch
,
2550 void __init
tcp_v4_init(void)
2552 if (register_pernet_subsys(&tcp_sk_ops
))
2553 panic("Failed to create the TCP control socket.\n");