2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
37 * request_sock handling and moved
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
40 * Added new listen semantics.
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after year
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
53 #define pr_fmt(fmt) "TCP: " fmt
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
66 #include <net/net_namespace.h>
68 #include <net/inet_hashtables.h>
70 #include <net/transp_v6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
75 #include <net/netdma.h>
76 #include <net/secure_seq.h>
77 #include <net/tcp_memcontrol.h>
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
88 int sysctl_tcp_tw_reuse __read_mostly
;
89 int sysctl_tcp_low_latency __read_mostly
;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency
);
93 #ifdef CONFIG_TCP_MD5SIG
94 static int tcp_v4_md5_hash_hdr(char *md5_hash
, const struct tcp_md5sig_key
*key
,
95 __be32 daddr
, __be32 saddr
, const struct tcphdr
*th
);
98 struct inet_hashinfo tcp_hashinfo
;
99 EXPORT_SYMBOL(tcp_hashinfo
);
101 static inline __u32
tcp_v4_init_sequence(const struct sk_buff
*skb
)
103 return secure_tcp_sequence_number(ip_hdr(skb
)->daddr
,
106 tcp_hdr(skb
)->source
);
109 int tcp_twsk_unique(struct sock
*sk
, struct sock
*sktw
, void *twp
)
111 const struct tcp_timewait_sock
*tcptw
= tcp_twsk(sktw
);
112 struct tcp_sock
*tp
= tcp_sk(sk
);
114 /* With PAWS, it is safe from the viewpoint
115 of data integrity. Even without PAWS it is safe provided sequence
116 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
118 Actually, the idea is close to VJ's one, only timestamp cache is
119 held not per host, but per port pair and TW bucket is used as state
122 If TW bucket has been already destroyed we fall back to VJ's scheme
123 and use initial timestamp retrieved from peer table.
125 if (tcptw
->tw_ts_recent_stamp
&&
126 (twp
== NULL
|| (sysctl_tcp_tw_reuse
&&
127 get_seconds() - tcptw
->tw_ts_recent_stamp
> 1))) {
128 tp
->write_seq
= tcptw
->tw_snd_nxt
+ 65535 + 2;
129 if (tp
->write_seq
== 0)
131 tp
->rx_opt
.ts_recent
= tcptw
->tw_ts_recent
;
132 tp
->rx_opt
.ts_recent_stamp
= tcptw
->tw_ts_recent_stamp
;
139 EXPORT_SYMBOL_GPL(tcp_twsk_unique
);
141 static int tcp_repair_connect(struct sock
*sk
)
143 tcp_connect_init(sk
);
144 tcp_finish_connect(sk
, NULL
);
149 /* This will initiate an outgoing connection. */
150 int tcp_v4_connect(struct sock
*sk
, struct sockaddr
*uaddr
, int addr_len
)
152 struct sockaddr_in
*usin
= (struct sockaddr_in
*)uaddr
;
153 struct inet_sock
*inet
= inet_sk(sk
);
154 struct tcp_sock
*tp
= tcp_sk(sk
);
155 __be16 orig_sport
, orig_dport
;
156 __be32 daddr
, nexthop
;
160 struct ip_options_rcu
*inet_opt
;
162 if (addr_len
< sizeof(struct sockaddr_in
))
165 if (usin
->sin_family
!= AF_INET
)
166 return -EAFNOSUPPORT
;
168 nexthop
= daddr
= usin
->sin_addr
.s_addr
;
169 inet_opt
= rcu_dereference_protected(inet
->inet_opt
,
170 sock_owned_by_user(sk
));
171 if (inet_opt
&& inet_opt
->opt
.srr
) {
174 nexthop
= inet_opt
->opt
.faddr
;
177 orig_sport
= inet
->inet_sport
;
178 orig_dport
= usin
->sin_port
;
179 fl4
= &inet
->cork
.fl
.u
.ip4
;
180 rt
= ip_route_connect(fl4
, nexthop
, inet
->inet_saddr
,
181 RT_CONN_FLAGS(sk
), sk
->sk_bound_dev_if
,
183 orig_sport
, orig_dport
, sk
, true);
186 if (err
== -ENETUNREACH
)
187 IP_INC_STATS_BH(sock_net(sk
), IPSTATS_MIB_OUTNOROUTES
);
191 if (rt
->rt_flags
& (RTCF_MULTICAST
| RTCF_BROADCAST
)) {
196 if (!inet_opt
|| !inet_opt
->opt
.srr
)
199 if (!inet
->inet_saddr
)
200 inet
->inet_saddr
= fl4
->saddr
;
201 inet
->inet_rcv_saddr
= inet
->inet_saddr
;
203 if (tp
->rx_opt
.ts_recent_stamp
&& inet
->inet_daddr
!= daddr
) {
204 /* Reset inherited state */
205 tp
->rx_opt
.ts_recent
= 0;
206 tp
->rx_opt
.ts_recent_stamp
= 0;
207 if (likely(!tp
->repair
))
211 if (tcp_death_row
.sysctl_tw_recycle
&&
212 !tp
->rx_opt
.ts_recent_stamp
&& fl4
->daddr
== daddr
) {
213 struct inet_peer
*peer
= rt_get_peer(rt
, fl4
->daddr
);
215 * VJ's idea. We save last timestamp seen from
216 * the destination in peer table, when entering state
217 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
218 * when trying new connection.
221 inet_peer_refcheck(peer
);
222 if ((u32
)get_seconds() - peer
->tcp_ts_stamp
<= TCP_PAWS_MSL
) {
223 tp
->rx_opt
.ts_recent_stamp
= peer
->tcp_ts_stamp
;
224 tp
->rx_opt
.ts_recent
= peer
->tcp_ts
;
229 inet
->inet_dport
= usin
->sin_port
;
230 inet
->inet_daddr
= daddr
;
232 inet_csk(sk
)->icsk_ext_hdr_len
= 0;
234 inet_csk(sk
)->icsk_ext_hdr_len
= inet_opt
->opt
.optlen
;
236 tp
->rx_opt
.mss_clamp
= TCP_MSS_DEFAULT
;
238 /* Socket identity is still unknown (sport may be zero).
239 * However we set state to SYN-SENT and not releasing socket
240 * lock select source port, enter ourselves into the hash tables and
241 * complete initialization after this.
243 tcp_set_state(sk
, TCP_SYN_SENT
);
244 err
= inet_hash_connect(&tcp_death_row
, sk
);
248 rt
= ip_route_newports(fl4
, rt
, orig_sport
, orig_dport
,
249 inet
->inet_sport
, inet
->inet_dport
, sk
);
255 /* OK, now commit destination to socket. */
256 sk
->sk_gso_type
= SKB_GSO_TCPV4
;
257 sk_setup_caps(sk
, &rt
->dst
);
259 if (!tp
->write_seq
&& likely(!tp
->repair
))
260 tp
->write_seq
= secure_tcp_sequence_number(inet
->inet_saddr
,
265 inet
->inet_id
= tp
->write_seq
^ jiffies
;
267 if (likely(!tp
->repair
))
268 err
= tcp_connect(sk
);
270 err
= tcp_repair_connect(sk
);
280 * This unhashes the socket and releases the local port,
283 tcp_set_state(sk
, TCP_CLOSE
);
285 sk
->sk_route_caps
= 0;
286 inet
->inet_dport
= 0;
289 EXPORT_SYMBOL(tcp_v4_connect
);
292 * This routine does path mtu discovery as defined in RFC1191.
294 static void do_pmtu_discovery(struct sock
*sk
, const struct iphdr
*iph
, u32 mtu
)
296 struct dst_entry
*dst
;
297 struct inet_sock
*inet
= inet_sk(sk
);
299 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
300 * send out by Linux are always <576bytes so they should go through
303 if (sk
->sk_state
== TCP_LISTEN
)
306 /* We don't check in the destentry if pmtu discovery is forbidden
307 * on this route. We just assume that no packet_to_big packets
308 * are send back when pmtu discovery is not active.
309 * There is a small race when the user changes this flag in the
310 * route, but I think that's acceptable.
312 if ((dst
= __sk_dst_check(sk
, 0)) == NULL
)
315 dst
->ops
->update_pmtu(dst
, mtu
);
317 /* Something is about to be wrong... Remember soft error
318 * for the case, if this connection will not able to recover.
320 if (mtu
< dst_mtu(dst
) && ip_dont_fragment(sk
, dst
))
321 sk
->sk_err_soft
= EMSGSIZE
;
325 if (inet
->pmtudisc
!= IP_PMTUDISC_DONT
&&
326 inet_csk(sk
)->icsk_pmtu_cookie
> mtu
) {
327 tcp_sync_mss(sk
, mtu
);
329 /* Resend the TCP packet because it's
330 * clear that the old packet has been
331 * dropped. This is the new "fast" path mtu
334 tcp_simple_retransmit(sk
);
335 } /* else let the usual retransmit timer handle it */
339 * This routine is called by the ICMP module when it gets some
340 * sort of error condition. If err < 0 then the socket should
341 * be closed and the error returned to the user. If err > 0
342 * it's just the icmp type << 8 | icmp code. After adjustment
343 * header points to the first 8 bytes of the tcp header. We need
344 * to find the appropriate port.
346 * The locking strategy used here is very "optimistic". When
347 * someone else accesses the socket the ICMP is just dropped
348 * and for some paths there is no check at all.
349 * A more general error queue to queue errors for later handling
350 * is probably better.
354 void tcp_v4_err(struct sk_buff
*icmp_skb
, u32 info
)
356 const struct iphdr
*iph
= (const struct iphdr
*)icmp_skb
->data
;
357 struct tcphdr
*th
= (struct tcphdr
*)(icmp_skb
->data
+ (iph
->ihl
<< 2));
358 struct inet_connection_sock
*icsk
;
360 struct inet_sock
*inet
;
361 const int type
= icmp_hdr(icmp_skb
)->type
;
362 const int code
= icmp_hdr(icmp_skb
)->code
;
368 struct net
*net
= dev_net(icmp_skb
->dev
);
370 if (icmp_skb
->len
< (iph
->ihl
<< 2) + 8) {
371 ICMP_INC_STATS_BH(net
, ICMP_MIB_INERRORS
);
375 sk
= inet_lookup(net
, &tcp_hashinfo
, iph
->daddr
, th
->dest
,
376 iph
->saddr
, th
->source
, inet_iif(icmp_skb
));
378 ICMP_INC_STATS_BH(net
, ICMP_MIB_INERRORS
);
381 if (sk
->sk_state
== TCP_TIME_WAIT
) {
382 inet_twsk_put(inet_twsk(sk
));
387 /* If too many ICMPs get dropped on busy
388 * servers this needs to be solved differently.
390 if (sock_owned_by_user(sk
))
391 NET_INC_STATS_BH(net
, LINUX_MIB_LOCKDROPPEDICMPS
);
393 if (sk
->sk_state
== TCP_CLOSE
)
396 if (unlikely(iph
->ttl
< inet_sk(sk
)->min_ttl
)) {
397 NET_INC_STATS_BH(net
, LINUX_MIB_TCPMINTTLDROP
);
403 seq
= ntohl(th
->seq
);
404 if (sk
->sk_state
!= TCP_LISTEN
&&
405 !between(seq
, tp
->snd_una
, tp
->snd_nxt
)) {
406 NET_INC_STATS_BH(net
, LINUX_MIB_OUTOFWINDOWICMPS
);
411 case ICMP_SOURCE_QUENCH
:
412 /* Just silently ignore these. */
414 case ICMP_PARAMETERPROB
:
417 case ICMP_DEST_UNREACH
:
418 if (code
> NR_ICMP_UNREACH
)
421 if (code
== ICMP_FRAG_NEEDED
) { /* PMTU discovery (RFC1191) */
422 if (!sock_owned_by_user(sk
))
423 do_pmtu_discovery(sk
, iph
, info
);
427 err
= icmp_err_convert
[code
].errno
;
428 /* check if icmp_skb allows revert of backoff
429 * (see draft-zimmermann-tcp-lcd) */
430 if (code
!= ICMP_NET_UNREACH
&& code
!= ICMP_HOST_UNREACH
)
432 if (seq
!= tp
->snd_una
|| !icsk
->icsk_retransmits
||
436 if (sock_owned_by_user(sk
))
439 icsk
->icsk_backoff
--;
440 inet_csk(sk
)->icsk_rto
= (tp
->srtt
? __tcp_set_rto(tp
) :
441 TCP_TIMEOUT_INIT
) << icsk
->icsk_backoff
;
444 skb
= tcp_write_queue_head(sk
);
447 remaining
= icsk
->icsk_rto
- min(icsk
->icsk_rto
,
448 tcp_time_stamp
- TCP_SKB_CB(skb
)->when
);
451 inet_csk_reset_xmit_timer(sk
, ICSK_TIME_RETRANS
,
452 remaining
, TCP_RTO_MAX
);
454 /* RTO revert clocked out retransmission.
455 * Will retransmit now */
456 tcp_retransmit_timer(sk
);
460 case ICMP_TIME_EXCEEDED
:
467 switch (sk
->sk_state
) {
468 struct request_sock
*req
, **prev
;
470 if (sock_owned_by_user(sk
))
473 req
= inet_csk_search_req(sk
, &prev
, th
->dest
,
474 iph
->daddr
, iph
->saddr
);
478 /* ICMPs are not backlogged, hence we cannot get
479 an established socket here.
483 if (seq
!= tcp_rsk(req
)->snt_isn
) {
484 NET_INC_STATS_BH(net
, LINUX_MIB_OUTOFWINDOWICMPS
);
489 * Still in SYN_RECV, just remove it silently.
490 * There is no good way to pass the error to the newly
491 * created socket, and POSIX does not want network
492 * errors returned from accept().
494 inet_csk_reqsk_queue_drop(sk
, req
, prev
);
498 case TCP_SYN_RECV
: /* Cannot happen.
499 It can f.e. if SYNs crossed.
501 if (!sock_owned_by_user(sk
)) {
504 sk
->sk_error_report(sk
);
508 sk
->sk_err_soft
= err
;
513 /* If we've already connected we will keep trying
514 * until we time out, or the user gives up.
516 * rfc1122 4.2.3.9 allows to consider as hard errors
517 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
518 * but it is obsoleted by pmtu discovery).
520 * Note, that in modern internet, where routing is unreliable
521 * and in each dark corner broken firewalls sit, sending random
522 * errors ordered by their masters even this two messages finally lose
523 * their original sense (even Linux sends invalid PORT_UNREACHs)
525 * Now we are in compliance with RFCs.
530 if (!sock_owned_by_user(sk
) && inet
->recverr
) {
532 sk
->sk_error_report(sk
);
533 } else { /* Only an error on timeout */
534 sk
->sk_err_soft
= err
;
542 static void __tcp_v4_send_check(struct sk_buff
*skb
,
543 __be32 saddr
, __be32 daddr
)
545 struct tcphdr
*th
= tcp_hdr(skb
);
547 if (skb
->ip_summed
== CHECKSUM_PARTIAL
) {
548 th
->check
= ~tcp_v4_check(skb
->len
, saddr
, daddr
, 0);
549 skb
->csum_start
= skb_transport_header(skb
) - skb
->head
;
550 skb
->csum_offset
= offsetof(struct tcphdr
, check
);
552 th
->check
= tcp_v4_check(skb
->len
, saddr
, daddr
,
559 /* This routine computes an IPv4 TCP checksum. */
560 void tcp_v4_send_check(struct sock
*sk
, struct sk_buff
*skb
)
562 const struct inet_sock
*inet
= inet_sk(sk
);
564 __tcp_v4_send_check(skb
, inet
->inet_saddr
, inet
->inet_daddr
);
566 EXPORT_SYMBOL(tcp_v4_send_check
);
568 int tcp_v4_gso_send_check(struct sk_buff
*skb
)
570 const struct iphdr
*iph
;
573 if (!pskb_may_pull(skb
, sizeof(*th
)))
580 skb
->ip_summed
= CHECKSUM_PARTIAL
;
581 __tcp_v4_send_check(skb
, iph
->saddr
, iph
->daddr
);
586 * This routine will send an RST to the other tcp.
588 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
590 * Answer: if a packet caused RST, it is not for a socket
591 * existing in our system, if it is matched to a socket,
592 * it is just duplicate segment or bug in other side's TCP.
593 * So that we build reply only basing on parameters
594 * arrived with segment.
595 * Exception: precedence violation. We do not implement it in any case.
598 static void tcp_v4_send_reset(struct sock
*sk
, struct sk_buff
*skb
)
600 const struct tcphdr
*th
= tcp_hdr(skb
);
603 #ifdef CONFIG_TCP_MD5SIG
604 __be32 opt
[(TCPOLEN_MD5SIG_ALIGNED
>> 2)];
607 struct ip_reply_arg arg
;
608 #ifdef CONFIG_TCP_MD5SIG
609 struct tcp_md5sig_key
*key
;
610 const __u8
*hash_location
= NULL
;
611 unsigned char newhash
[16];
613 struct sock
*sk1
= NULL
;
617 /* Never send a reset in response to a reset. */
621 if (skb_rtable(skb
)->rt_type
!= RTN_LOCAL
)
624 /* Swap the send and the receive. */
625 memset(&rep
, 0, sizeof(rep
));
626 rep
.th
.dest
= th
->source
;
627 rep
.th
.source
= th
->dest
;
628 rep
.th
.doff
= sizeof(struct tcphdr
) / 4;
632 rep
.th
.seq
= th
->ack_seq
;
635 rep
.th
.ack_seq
= htonl(ntohl(th
->seq
) + th
->syn
+ th
->fin
+
636 skb
->len
- (th
->doff
<< 2));
639 memset(&arg
, 0, sizeof(arg
));
640 arg
.iov
[0].iov_base
= (unsigned char *)&rep
;
641 arg
.iov
[0].iov_len
= sizeof(rep
.th
);
643 #ifdef CONFIG_TCP_MD5SIG
644 hash_location
= tcp_parse_md5sig_option(th
);
645 if (!sk
&& hash_location
) {
647 * active side is lost. Try to find listening socket through
648 * source port, and then find md5 key through listening socket.
649 * we are not loose security here:
650 * Incoming packet is checked with md5 hash with finding key,
651 * no RST generated if md5 hash doesn't match.
653 sk1
= __inet_lookup_listener(dev_net(skb_dst(skb
)->dev
),
654 &tcp_hashinfo
, ip_hdr(skb
)->daddr
,
655 ntohs(th
->source
), inet_iif(skb
));
656 /* don't send rst if it can't find key */
660 key
= tcp_md5_do_lookup(sk1
, (union tcp_md5_addr
*)
661 &ip_hdr(skb
)->saddr
, AF_INET
);
665 genhash
= tcp_v4_md5_hash_skb(newhash
, key
, NULL
, NULL
, skb
);
666 if (genhash
|| memcmp(hash_location
, newhash
, 16) != 0)
669 key
= sk
? tcp_md5_do_lookup(sk
, (union tcp_md5_addr
*)
675 rep
.opt
[0] = htonl((TCPOPT_NOP
<< 24) |
677 (TCPOPT_MD5SIG
<< 8) |
679 /* Update length and the length the header thinks exists */
680 arg
.iov
[0].iov_len
+= TCPOLEN_MD5SIG_ALIGNED
;
681 rep
.th
.doff
= arg
.iov
[0].iov_len
/ 4;
683 tcp_v4_md5_hash_hdr((__u8
*) &rep
.opt
[1],
684 key
, ip_hdr(skb
)->saddr
,
685 ip_hdr(skb
)->daddr
, &rep
.th
);
688 arg
.csum
= csum_tcpudp_nofold(ip_hdr(skb
)->daddr
,
689 ip_hdr(skb
)->saddr
, /* XXX */
690 arg
.iov
[0].iov_len
, IPPROTO_TCP
, 0);
691 arg
.csumoffset
= offsetof(struct tcphdr
, check
) / 2;
692 arg
.flags
= (sk
&& inet_sk(sk
)->transparent
) ? IP_REPLY_ARG_NOSRCCHECK
: 0;
693 /* When socket is gone, all binding information is lost.
694 * routing might fail in this case. using iif for oif to
695 * make sure we can deliver it
697 arg
.bound_dev_if
= sk
? sk
->sk_bound_dev_if
: inet_iif(skb
);
699 net
= dev_net(skb_dst(skb
)->dev
);
700 arg
.tos
= ip_hdr(skb
)->tos
;
701 ip_send_reply(net
->ipv4
.tcp_sock
, skb
, ip_hdr(skb
)->saddr
,
702 &arg
, arg
.iov
[0].iov_len
);
704 TCP_INC_STATS_BH(net
, TCP_MIB_OUTSEGS
);
705 TCP_INC_STATS_BH(net
, TCP_MIB_OUTRSTS
);
707 #ifdef CONFIG_TCP_MD5SIG
716 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
717 outside socket context is ugly, certainly. What can I do?
720 static void tcp_v4_send_ack(struct sk_buff
*skb
, u32 seq
, u32 ack
,
721 u32 win
, u32 ts
, int oif
,
722 struct tcp_md5sig_key
*key
,
723 int reply_flags
, u8 tos
)
725 const struct tcphdr
*th
= tcp_hdr(skb
);
728 __be32 opt
[(TCPOLEN_TSTAMP_ALIGNED
>> 2)
729 #ifdef CONFIG_TCP_MD5SIG
730 + (TCPOLEN_MD5SIG_ALIGNED
>> 2)
734 struct ip_reply_arg arg
;
735 struct net
*net
= dev_net(skb_dst(skb
)->dev
);
737 memset(&rep
.th
, 0, sizeof(struct tcphdr
));
738 memset(&arg
, 0, sizeof(arg
));
740 arg
.iov
[0].iov_base
= (unsigned char *)&rep
;
741 arg
.iov
[0].iov_len
= sizeof(rep
.th
);
743 rep
.opt
[0] = htonl((TCPOPT_NOP
<< 24) | (TCPOPT_NOP
<< 16) |
744 (TCPOPT_TIMESTAMP
<< 8) |
746 rep
.opt
[1] = htonl(tcp_time_stamp
);
747 rep
.opt
[2] = htonl(ts
);
748 arg
.iov
[0].iov_len
+= TCPOLEN_TSTAMP_ALIGNED
;
751 /* Swap the send and the receive. */
752 rep
.th
.dest
= th
->source
;
753 rep
.th
.source
= th
->dest
;
754 rep
.th
.doff
= arg
.iov
[0].iov_len
/ 4;
755 rep
.th
.seq
= htonl(seq
);
756 rep
.th
.ack_seq
= htonl(ack
);
758 rep
.th
.window
= htons(win
);
760 #ifdef CONFIG_TCP_MD5SIG
762 int offset
= (ts
) ? 3 : 0;
764 rep
.opt
[offset
++] = htonl((TCPOPT_NOP
<< 24) |
766 (TCPOPT_MD5SIG
<< 8) |
768 arg
.iov
[0].iov_len
+= TCPOLEN_MD5SIG_ALIGNED
;
769 rep
.th
.doff
= arg
.iov
[0].iov_len
/4;
771 tcp_v4_md5_hash_hdr((__u8
*) &rep
.opt
[offset
],
772 key
, ip_hdr(skb
)->saddr
,
773 ip_hdr(skb
)->daddr
, &rep
.th
);
776 arg
.flags
= reply_flags
;
777 arg
.csum
= csum_tcpudp_nofold(ip_hdr(skb
)->daddr
,
778 ip_hdr(skb
)->saddr
, /* XXX */
779 arg
.iov
[0].iov_len
, IPPROTO_TCP
, 0);
780 arg
.csumoffset
= offsetof(struct tcphdr
, check
) / 2;
782 arg
.bound_dev_if
= oif
;
784 ip_send_reply(net
->ipv4
.tcp_sock
, skb
, ip_hdr(skb
)->saddr
,
785 &arg
, arg
.iov
[0].iov_len
);
787 TCP_INC_STATS_BH(net
, TCP_MIB_OUTSEGS
);
790 static void tcp_v4_timewait_ack(struct sock
*sk
, struct sk_buff
*skb
)
792 struct inet_timewait_sock
*tw
= inet_twsk(sk
);
793 struct tcp_timewait_sock
*tcptw
= tcp_twsk(sk
);
795 tcp_v4_send_ack(skb
, tcptw
->tw_snd_nxt
, tcptw
->tw_rcv_nxt
,
796 tcptw
->tw_rcv_wnd
>> tw
->tw_rcv_wscale
,
799 tcp_twsk_md5_key(tcptw
),
800 tw
->tw_transparent
? IP_REPLY_ARG_NOSRCCHECK
: 0,
807 static void tcp_v4_reqsk_send_ack(struct sock
*sk
, struct sk_buff
*skb
,
808 struct request_sock
*req
)
810 tcp_v4_send_ack(skb
, tcp_rsk(req
)->snt_isn
+ 1,
811 tcp_rsk(req
)->rcv_isn
+ 1, req
->rcv_wnd
,
814 tcp_md5_do_lookup(sk
, (union tcp_md5_addr
*)&ip_hdr(skb
)->daddr
,
816 inet_rsk(req
)->no_srccheck
? IP_REPLY_ARG_NOSRCCHECK
: 0,
821 * Send a SYN-ACK after having received a SYN.
822 * This still operates on a request_sock only, not on a big
825 static int tcp_v4_send_synack(struct sock
*sk
, struct dst_entry
*dst
,
826 struct request_sock
*req
,
827 struct request_values
*rvp
,
830 const struct inet_request_sock
*ireq
= inet_rsk(req
);
833 struct sk_buff
* skb
;
835 /* First, grab a route. */
836 if (!dst
&& (dst
= inet_csk_route_req(sk
, &fl4
, req
)) == NULL
)
839 skb
= tcp_make_synack(sk
, dst
, req
, rvp
);
842 __tcp_v4_send_check(skb
, ireq
->loc_addr
, ireq
->rmt_addr
);
844 skb_set_queue_mapping(skb
, queue_mapping
);
845 err
= ip_build_and_send_pkt(skb
, sk
, ireq
->loc_addr
,
848 err
= net_xmit_eval(err
);
854 static int tcp_v4_rtx_synack(struct sock
*sk
, struct request_sock
*req
,
855 struct request_values
*rvp
)
857 TCP_INC_STATS_BH(sock_net(sk
), TCP_MIB_RETRANSSEGS
);
858 return tcp_v4_send_synack(sk
, NULL
, req
, rvp
, 0);
862 * IPv4 request_sock destructor.
864 static void tcp_v4_reqsk_destructor(struct request_sock
*req
)
866 kfree(inet_rsk(req
)->opt
);
870 * Return true if a syncookie should be sent
872 bool tcp_syn_flood_action(struct sock
*sk
,
873 const struct sk_buff
*skb
,
876 const char *msg
= "Dropping request";
877 bool want_cookie
= false;
878 struct listen_sock
*lopt
;
882 #ifdef CONFIG_SYN_COOKIES
883 if (sysctl_tcp_syncookies
) {
884 msg
= "Sending cookies";
886 NET_INC_STATS_BH(sock_net(sk
), LINUX_MIB_TCPREQQFULLDOCOOKIES
);
889 NET_INC_STATS_BH(sock_net(sk
), LINUX_MIB_TCPREQQFULLDROP
);
891 lopt
= inet_csk(sk
)->icsk_accept_queue
.listen_opt
;
892 if (!lopt
->synflood_warned
) {
893 lopt
->synflood_warned
= 1;
894 pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
895 proto
, ntohs(tcp_hdr(skb
)->dest
), msg
);
899 EXPORT_SYMBOL(tcp_syn_flood_action
);
902 * Save and compile IPv4 options into the request_sock if needed.
904 static struct ip_options_rcu
*tcp_v4_save_options(struct sock
*sk
,
907 const struct ip_options
*opt
= &(IPCB(skb
)->opt
);
908 struct ip_options_rcu
*dopt
= NULL
;
910 if (opt
&& opt
->optlen
) {
911 int opt_size
= sizeof(*dopt
) + opt
->optlen
;
913 dopt
= kmalloc(opt_size
, GFP_ATOMIC
);
915 if (ip_options_echo(&dopt
->opt
, skb
)) {
924 #ifdef CONFIG_TCP_MD5SIG
926 * RFC2385 MD5 checksumming requires a mapping of
927 * IP address->MD5 Key.
928 * We need to maintain these in the sk structure.
931 /* Find the Key structure for an address. */
932 struct tcp_md5sig_key
*tcp_md5_do_lookup(struct sock
*sk
,
933 const union tcp_md5_addr
*addr
,
936 struct tcp_sock
*tp
= tcp_sk(sk
);
937 struct tcp_md5sig_key
*key
;
938 struct hlist_node
*pos
;
939 unsigned int size
= sizeof(struct in_addr
);
940 struct tcp_md5sig_info
*md5sig
;
942 /* caller either holds rcu_read_lock() or socket lock */
943 md5sig
= rcu_dereference_check(tp
->md5sig_info
,
944 sock_owned_by_user(sk
) ||
945 lockdep_is_held(&sk
->sk_lock
.slock
));
948 #if IS_ENABLED(CONFIG_IPV6)
949 if (family
== AF_INET6
)
950 size
= sizeof(struct in6_addr
);
952 hlist_for_each_entry_rcu(key
, pos
, &md5sig
->head
, node
) {
953 if (key
->family
!= family
)
955 if (!memcmp(&key
->addr
, addr
, size
))
960 EXPORT_SYMBOL(tcp_md5_do_lookup
);
962 struct tcp_md5sig_key
*tcp_v4_md5_lookup(struct sock
*sk
,
963 struct sock
*addr_sk
)
965 union tcp_md5_addr
*addr
;
967 addr
= (union tcp_md5_addr
*)&inet_sk(addr_sk
)->inet_daddr
;
968 return tcp_md5_do_lookup(sk
, addr
, AF_INET
);
970 EXPORT_SYMBOL(tcp_v4_md5_lookup
);
972 static struct tcp_md5sig_key
*tcp_v4_reqsk_md5_lookup(struct sock
*sk
,
973 struct request_sock
*req
)
975 union tcp_md5_addr
*addr
;
977 addr
= (union tcp_md5_addr
*)&inet_rsk(req
)->rmt_addr
;
978 return tcp_md5_do_lookup(sk
, addr
, AF_INET
);
981 /* This can be called on a newly created socket, from other files */
982 int tcp_md5_do_add(struct sock
*sk
, const union tcp_md5_addr
*addr
,
983 int family
, const u8
*newkey
, u8 newkeylen
, gfp_t gfp
)
985 /* Add Key to the list */
986 struct tcp_md5sig_key
*key
;
987 struct tcp_sock
*tp
= tcp_sk(sk
);
988 struct tcp_md5sig_info
*md5sig
;
990 key
= tcp_md5_do_lookup(sk
, (union tcp_md5_addr
*)&addr
, AF_INET
);
992 /* Pre-existing entry - just update that one. */
993 memcpy(key
->key
, newkey
, newkeylen
);
994 key
->keylen
= newkeylen
;
998 md5sig
= rcu_dereference_protected(tp
->md5sig_info
,
999 sock_owned_by_user(sk
));
1001 md5sig
= kmalloc(sizeof(*md5sig
), gfp
);
1005 sk_nocaps_add(sk
, NETIF_F_GSO_MASK
);
1006 INIT_HLIST_HEAD(&md5sig
->head
);
1007 rcu_assign_pointer(tp
->md5sig_info
, md5sig
);
1010 key
= sock_kmalloc(sk
, sizeof(*key
), gfp
);
1013 if (hlist_empty(&md5sig
->head
) && !tcp_alloc_md5sig_pool(sk
)) {
1014 sock_kfree_s(sk
, key
, sizeof(*key
));
1018 memcpy(key
->key
, newkey
, newkeylen
);
1019 key
->keylen
= newkeylen
;
1020 key
->family
= family
;
1021 memcpy(&key
->addr
, addr
,
1022 (family
== AF_INET6
) ? sizeof(struct in6_addr
) :
1023 sizeof(struct in_addr
));
1024 hlist_add_head_rcu(&key
->node
, &md5sig
->head
);
1027 EXPORT_SYMBOL(tcp_md5_do_add
);
1029 int tcp_md5_do_del(struct sock
*sk
, const union tcp_md5_addr
*addr
, int family
)
1031 struct tcp_sock
*tp
= tcp_sk(sk
);
1032 struct tcp_md5sig_key
*key
;
1033 struct tcp_md5sig_info
*md5sig
;
1035 key
= tcp_md5_do_lookup(sk
, (union tcp_md5_addr
*)&addr
, AF_INET
);
1038 hlist_del_rcu(&key
->node
);
1039 atomic_sub(sizeof(*key
), &sk
->sk_omem_alloc
);
1040 kfree_rcu(key
, rcu
);
1041 md5sig
= rcu_dereference_protected(tp
->md5sig_info
,
1042 sock_owned_by_user(sk
));
1043 if (hlist_empty(&md5sig
->head
))
1044 tcp_free_md5sig_pool();
1047 EXPORT_SYMBOL(tcp_md5_do_del
);
1049 void tcp_clear_md5_list(struct sock
*sk
)
1051 struct tcp_sock
*tp
= tcp_sk(sk
);
1052 struct tcp_md5sig_key
*key
;
1053 struct hlist_node
*pos
, *n
;
1054 struct tcp_md5sig_info
*md5sig
;
1056 md5sig
= rcu_dereference_protected(tp
->md5sig_info
, 1);
1058 if (!hlist_empty(&md5sig
->head
))
1059 tcp_free_md5sig_pool();
1060 hlist_for_each_entry_safe(key
, pos
, n
, &md5sig
->head
, node
) {
1061 hlist_del_rcu(&key
->node
);
1062 atomic_sub(sizeof(*key
), &sk
->sk_omem_alloc
);
1063 kfree_rcu(key
, rcu
);
1067 static int tcp_v4_parse_md5_keys(struct sock
*sk
, char __user
*optval
,
1070 struct tcp_md5sig cmd
;
1071 struct sockaddr_in
*sin
= (struct sockaddr_in
*)&cmd
.tcpm_addr
;
1073 if (optlen
< sizeof(cmd
))
1076 if (copy_from_user(&cmd
, optval
, sizeof(cmd
)))
1079 if (sin
->sin_family
!= AF_INET
)
1082 if (!cmd
.tcpm_key
|| !cmd
.tcpm_keylen
)
1083 return tcp_md5_do_del(sk
, (union tcp_md5_addr
*)&sin
->sin_addr
.s_addr
,
1086 if (cmd
.tcpm_keylen
> TCP_MD5SIG_MAXKEYLEN
)
1089 return tcp_md5_do_add(sk
, (union tcp_md5_addr
*)&sin
->sin_addr
.s_addr
,
1090 AF_INET
, cmd
.tcpm_key
, cmd
.tcpm_keylen
,
1094 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool
*hp
,
1095 __be32 daddr
, __be32 saddr
, int nbytes
)
1097 struct tcp4_pseudohdr
*bp
;
1098 struct scatterlist sg
;
1100 bp
= &hp
->md5_blk
.ip4
;
1103 * 1. the TCP pseudo-header (in the order: source IP address,
1104 * destination IP address, zero-padded protocol number, and
1110 bp
->protocol
= IPPROTO_TCP
;
1111 bp
->len
= cpu_to_be16(nbytes
);
1113 sg_init_one(&sg
, bp
, sizeof(*bp
));
1114 return crypto_hash_update(&hp
->md5_desc
, &sg
, sizeof(*bp
));
1117 static int tcp_v4_md5_hash_hdr(char *md5_hash
, const struct tcp_md5sig_key
*key
,
1118 __be32 daddr
, __be32 saddr
, const struct tcphdr
*th
)
1120 struct tcp_md5sig_pool
*hp
;
1121 struct hash_desc
*desc
;
1123 hp
= tcp_get_md5sig_pool();
1125 goto clear_hash_noput
;
1126 desc
= &hp
->md5_desc
;
1128 if (crypto_hash_init(desc
))
1130 if (tcp_v4_md5_hash_pseudoheader(hp
, daddr
, saddr
, th
->doff
<< 2))
1132 if (tcp_md5_hash_header(hp
, th
))
1134 if (tcp_md5_hash_key(hp
, key
))
1136 if (crypto_hash_final(desc
, md5_hash
))
1139 tcp_put_md5sig_pool();
1143 tcp_put_md5sig_pool();
1145 memset(md5_hash
, 0, 16);
1149 int tcp_v4_md5_hash_skb(char *md5_hash
, struct tcp_md5sig_key
*key
,
1150 const struct sock
*sk
, const struct request_sock
*req
,
1151 const struct sk_buff
*skb
)
1153 struct tcp_md5sig_pool
*hp
;
1154 struct hash_desc
*desc
;
1155 const struct tcphdr
*th
= tcp_hdr(skb
);
1156 __be32 saddr
, daddr
;
1159 saddr
= inet_sk(sk
)->inet_saddr
;
1160 daddr
= inet_sk(sk
)->inet_daddr
;
1162 saddr
= inet_rsk(req
)->loc_addr
;
1163 daddr
= inet_rsk(req
)->rmt_addr
;
1165 const struct iphdr
*iph
= ip_hdr(skb
);
1170 hp
= tcp_get_md5sig_pool();
1172 goto clear_hash_noput
;
1173 desc
= &hp
->md5_desc
;
1175 if (crypto_hash_init(desc
))
1178 if (tcp_v4_md5_hash_pseudoheader(hp
, daddr
, saddr
, skb
->len
))
1180 if (tcp_md5_hash_header(hp
, th
))
1182 if (tcp_md5_hash_skb_data(hp
, skb
, th
->doff
<< 2))
1184 if (tcp_md5_hash_key(hp
, key
))
1186 if (crypto_hash_final(desc
, md5_hash
))
1189 tcp_put_md5sig_pool();
1193 tcp_put_md5sig_pool();
1195 memset(md5_hash
, 0, 16);
1198 EXPORT_SYMBOL(tcp_v4_md5_hash_skb
);
1200 static bool tcp_v4_inbound_md5_hash(struct sock
*sk
, const struct sk_buff
*skb
)
1203 * This gets called for each TCP segment that arrives
1204 * so we want to be efficient.
1205 * We have 3 drop cases:
1206 * o No MD5 hash and one expected.
1207 * o MD5 hash and we're not expecting one.
1208 * o MD5 hash and its wrong.
1210 const __u8
*hash_location
= NULL
;
1211 struct tcp_md5sig_key
*hash_expected
;
1212 const struct iphdr
*iph
= ip_hdr(skb
);
1213 const struct tcphdr
*th
= tcp_hdr(skb
);
1215 unsigned char newhash
[16];
1217 hash_expected
= tcp_md5_do_lookup(sk
, (union tcp_md5_addr
*)&iph
->saddr
,
1219 hash_location
= tcp_parse_md5sig_option(th
);
1221 /* We've parsed the options - do we have a hash? */
1222 if (!hash_expected
&& !hash_location
)
1225 if (hash_expected
&& !hash_location
) {
1226 NET_INC_STATS_BH(sock_net(sk
), LINUX_MIB_TCPMD5NOTFOUND
);
1230 if (!hash_expected
&& hash_location
) {
1231 NET_INC_STATS_BH(sock_net(sk
), LINUX_MIB_TCPMD5UNEXPECTED
);
1235 /* Okay, so this is hash_expected and hash_location -
1236 * so we need to calculate the checksum.
1238 genhash
= tcp_v4_md5_hash_skb(newhash
,
1242 if (genhash
|| memcmp(hash_location
, newhash
, 16) != 0) {
1243 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1244 &iph
->saddr
, ntohs(th
->source
),
1245 &iph
->daddr
, ntohs(th
->dest
),
1246 genhash
? " tcp_v4_calc_md5_hash failed"
1255 struct request_sock_ops tcp_request_sock_ops __read_mostly
= {
1257 .obj_size
= sizeof(struct tcp_request_sock
),
1258 .rtx_syn_ack
= tcp_v4_rtx_synack
,
1259 .send_ack
= tcp_v4_reqsk_send_ack
,
1260 .destructor
= tcp_v4_reqsk_destructor
,
1261 .send_reset
= tcp_v4_send_reset
,
1262 .syn_ack_timeout
= tcp_syn_ack_timeout
,
1265 #ifdef CONFIG_TCP_MD5SIG
1266 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops
= {
1267 .md5_lookup
= tcp_v4_reqsk_md5_lookup
,
1268 .calc_md5_hash
= tcp_v4_md5_hash_skb
,
1272 int tcp_v4_conn_request(struct sock
*sk
, struct sk_buff
*skb
)
1274 struct tcp_extend_values tmp_ext
;
1275 struct tcp_options_received tmp_opt
;
1276 const u8
*hash_location
;
1277 struct request_sock
*req
;
1278 struct inet_request_sock
*ireq
;
1279 struct tcp_sock
*tp
= tcp_sk(sk
);
1280 struct dst_entry
*dst
= NULL
;
1281 __be32 saddr
= ip_hdr(skb
)->saddr
;
1282 __be32 daddr
= ip_hdr(skb
)->daddr
;
1283 __u32 isn
= TCP_SKB_CB(skb
)->when
;
1284 bool want_cookie
= false;
1286 /* Never answer to SYNs send to broadcast or multicast */
1287 if (skb_rtable(skb
)->rt_flags
& (RTCF_BROADCAST
| RTCF_MULTICAST
))
1290 /* TW buckets are converted to open requests without
1291 * limitations, they conserve resources and peer is
1292 * evidently real one.
1294 if (inet_csk_reqsk_queue_is_full(sk
) && !isn
) {
1295 want_cookie
= tcp_syn_flood_action(sk
, skb
, "TCP");
1300 /* Accept backlog is full. If we have already queued enough
1301 * of warm entries in syn queue, drop request. It is better than
1302 * clogging syn queue with openreqs with exponentially increasing
1305 if (sk_acceptq_is_full(sk
) && inet_csk_reqsk_queue_young(sk
) > 1)
1308 req
= inet_reqsk_alloc(&tcp_request_sock_ops
);
1312 #ifdef CONFIG_TCP_MD5SIG
1313 tcp_rsk(req
)->af_specific
= &tcp_request_sock_ipv4_ops
;
1316 tcp_clear_options(&tmp_opt
);
1317 tmp_opt
.mss_clamp
= TCP_MSS_DEFAULT
;
1318 tmp_opt
.user_mss
= tp
->rx_opt
.user_mss
;
1319 tcp_parse_options(skb
, &tmp_opt
, &hash_location
, 0);
1321 if (tmp_opt
.cookie_plus
> 0 &&
1322 tmp_opt
.saw_tstamp
&&
1323 !tp
->rx_opt
.cookie_out_never
&&
1324 (sysctl_tcp_cookie_size
> 0 ||
1325 (tp
->cookie_values
!= NULL
&&
1326 tp
->cookie_values
->cookie_desired
> 0))) {
1328 u32
*mess
= &tmp_ext
.cookie_bakery
[COOKIE_DIGEST_WORDS
];
1329 int l
= tmp_opt
.cookie_plus
- TCPOLEN_COOKIE_BASE
;
1331 if (tcp_cookie_generator(&tmp_ext
.cookie_bakery
[0]) != 0)
1332 goto drop_and_release
;
1334 /* Secret recipe starts with IP addresses */
1335 *mess
++ ^= (__force u32
)daddr
;
1336 *mess
++ ^= (__force u32
)saddr
;
1338 /* plus variable length Initiator Cookie */
1341 *c
++ ^= *hash_location
++;
1343 want_cookie
= false; /* not our kind of cookie */
1344 tmp_ext
.cookie_out_never
= 0; /* false */
1345 tmp_ext
.cookie_plus
= tmp_opt
.cookie_plus
;
1346 } else if (!tp
->rx_opt
.cookie_in_always
) {
1347 /* redundant indications, but ensure initialization. */
1348 tmp_ext
.cookie_out_never
= 1; /* true */
1349 tmp_ext
.cookie_plus
= 0;
1351 goto drop_and_release
;
1353 tmp_ext
.cookie_in_always
= tp
->rx_opt
.cookie_in_always
;
1355 if (want_cookie
&& !tmp_opt
.saw_tstamp
)
1356 tcp_clear_options(&tmp_opt
);
1358 tmp_opt
.tstamp_ok
= tmp_opt
.saw_tstamp
;
1359 tcp_openreq_init(req
, &tmp_opt
, skb
);
1361 ireq
= inet_rsk(req
);
1362 ireq
->loc_addr
= daddr
;
1363 ireq
->rmt_addr
= saddr
;
1364 ireq
->no_srccheck
= inet_sk(sk
)->transparent
;
1365 ireq
->opt
= tcp_v4_save_options(sk
, skb
);
1367 if (security_inet_conn_request(sk
, skb
, req
))
1370 if (!want_cookie
|| tmp_opt
.tstamp_ok
)
1371 TCP_ECN_create_request(req
, skb
);
1374 isn
= cookie_v4_init_sequence(sk
, skb
, &req
->mss
);
1375 req
->cookie_ts
= tmp_opt
.tstamp_ok
;
1377 struct inet_peer
*peer
= NULL
;
1380 /* VJ's idea. We save last timestamp seen
1381 * from the destination in peer table, when entering
1382 * state TIME-WAIT, and check against it before
1383 * accepting new connection request.
1385 * If "isn" is not zero, this request hit alive
1386 * timewait bucket, so that all the necessary checks
1387 * are made in the function processing timewait state.
1389 if (tmp_opt
.saw_tstamp
&&
1390 tcp_death_row
.sysctl_tw_recycle
&&
1391 (dst
= inet_csk_route_req(sk
, &fl4
, req
)) != NULL
&&
1392 fl4
.daddr
== saddr
&&
1393 (peer
= rt_get_peer((struct rtable
*)dst
, fl4
.daddr
)) != NULL
) {
1394 inet_peer_refcheck(peer
);
1395 if ((u32
)get_seconds() - peer
->tcp_ts_stamp
< TCP_PAWS_MSL
&&
1396 (s32
)(peer
->tcp_ts
- req
->ts_recent
) >
1398 NET_INC_STATS_BH(sock_net(sk
), LINUX_MIB_PAWSPASSIVEREJECTED
);
1399 goto drop_and_release
;
1402 /* Kill the following clause, if you dislike this way. */
1403 else if (!sysctl_tcp_syncookies
&&
1404 (sysctl_max_syn_backlog
- inet_csk_reqsk_queue_len(sk
) <
1405 (sysctl_max_syn_backlog
>> 2)) &&
1406 (!peer
|| !peer
->tcp_ts_stamp
) &&
1407 (!dst
|| !dst_metric(dst
, RTAX_RTT
))) {
1408 /* Without syncookies last quarter of
1409 * backlog is filled with destinations,
1410 * proven to be alive.
1411 * It means that we continue to communicate
1412 * to destinations, already remembered
1413 * to the moment of synflood.
1415 LIMIT_NETDEBUG(KERN_DEBUG
pr_fmt("drop open request from %pI4/%u\n"),
1416 &saddr
, ntohs(tcp_hdr(skb
)->source
));
1417 goto drop_and_release
;
1420 isn
= tcp_v4_init_sequence(skb
);
1422 tcp_rsk(req
)->snt_isn
= isn
;
1423 tcp_rsk(req
)->snt_synack
= tcp_time_stamp
;
1425 if (tcp_v4_send_synack(sk
, dst
, req
,
1426 (struct request_values
*)&tmp_ext
,
1427 skb_get_queue_mapping(skb
)) ||
1431 inet_csk_reqsk_queue_hash_add(sk
, req
, TCP_TIMEOUT_INIT
);
1441 EXPORT_SYMBOL(tcp_v4_conn_request
);
1445 * The three way handshake has completed - we got a valid synack -
1446 * now create the new socket.
1448 struct sock
*tcp_v4_syn_recv_sock(struct sock
*sk
, struct sk_buff
*skb
,
1449 struct request_sock
*req
,
1450 struct dst_entry
*dst
)
1452 struct inet_request_sock
*ireq
;
1453 struct inet_sock
*newinet
;
1454 struct tcp_sock
*newtp
;
1456 #ifdef CONFIG_TCP_MD5SIG
1457 struct tcp_md5sig_key
*key
;
1459 struct ip_options_rcu
*inet_opt
;
1461 if (sk_acceptq_is_full(sk
))
1464 newsk
= tcp_create_openreq_child(sk
, req
, skb
);
1468 newsk
->sk_gso_type
= SKB_GSO_TCPV4
;
1470 newtp
= tcp_sk(newsk
);
1471 newinet
= inet_sk(newsk
);
1472 ireq
= inet_rsk(req
);
1473 newinet
->inet_daddr
= ireq
->rmt_addr
;
1474 newinet
->inet_rcv_saddr
= ireq
->loc_addr
;
1475 newinet
->inet_saddr
= ireq
->loc_addr
;
1476 inet_opt
= ireq
->opt
;
1477 rcu_assign_pointer(newinet
->inet_opt
, inet_opt
);
1479 newinet
->mc_index
= inet_iif(skb
);
1480 newinet
->mc_ttl
= ip_hdr(skb
)->ttl
;
1481 newinet
->rcv_tos
= ip_hdr(skb
)->tos
;
1482 inet_csk(newsk
)->icsk_ext_hdr_len
= 0;
1484 inet_csk(newsk
)->icsk_ext_hdr_len
= inet_opt
->opt
.optlen
;
1485 newinet
->inet_id
= newtp
->write_seq
^ jiffies
;
1488 dst
= inet_csk_route_child_sock(sk
, newsk
, req
);
1492 /* syncookie case : see end of cookie_v4_check() */
1494 sk_setup_caps(newsk
, dst
);
1496 tcp_mtup_init(newsk
);
1497 tcp_sync_mss(newsk
, dst_mtu(dst
));
1498 newtp
->advmss
= dst_metric_advmss(dst
);
1499 if (tcp_sk(sk
)->rx_opt
.user_mss
&&
1500 tcp_sk(sk
)->rx_opt
.user_mss
< newtp
->advmss
)
1501 newtp
->advmss
= tcp_sk(sk
)->rx_opt
.user_mss
;
1503 tcp_initialize_rcv_mss(newsk
);
1504 if (tcp_rsk(req
)->snt_synack
)
1505 tcp_valid_rtt_meas(newsk
,
1506 tcp_time_stamp
- tcp_rsk(req
)->snt_synack
);
1507 newtp
->total_retrans
= req
->retrans
;
1509 #ifdef CONFIG_TCP_MD5SIG
1510 /* Copy over the MD5 key from the original socket */
1511 key
= tcp_md5_do_lookup(sk
, (union tcp_md5_addr
*)&newinet
->inet_daddr
,
1515 * We're using one, so create a matching key
1516 * on the newsk structure. If we fail to get
1517 * memory, then we end up not copying the key
1520 tcp_md5_do_add(newsk
, (union tcp_md5_addr
*)&newinet
->inet_daddr
,
1521 AF_INET
, key
->key
, key
->keylen
, GFP_ATOMIC
);
1522 sk_nocaps_add(newsk
, NETIF_F_GSO_MASK
);
1526 if (__inet_inherit_port(sk
, newsk
) < 0)
1528 __inet_hash_nolisten(newsk
, NULL
);
1533 NET_INC_STATS_BH(sock_net(sk
), LINUX_MIB_LISTENOVERFLOWS
);
1537 NET_INC_STATS_BH(sock_net(sk
), LINUX_MIB_LISTENDROPS
);
1540 tcp_clear_xmit_timers(newsk
);
1541 tcp_cleanup_congestion_control(newsk
);
1542 bh_unlock_sock(newsk
);
1546 EXPORT_SYMBOL(tcp_v4_syn_recv_sock
);
/*
 * Resolve which socket should handle a segment arriving on a listener:
 * first look for a matching pending connection request in the SYN queue
 * (then let tcp_check_req() validate the ACK), otherwise look for an
 * already-established socket on the same 4-tuple; a TIME_WAIT hit is
 * dropped, and as a last resort SYN cookies are checked if enabled.
 * NOTE(review): extracted fragment — several interior lines (returns,
 * braces) are missing from this view; code text left exactly as found.
 */
1548 static struct sock
*tcp_v4_hnd_req(struct sock
*sk
, struct sk_buff
*skb
)
1550 struct tcphdr
*th
= tcp_hdr(skb
);
1551 const struct iphdr
*iph
= ip_hdr(skb
);
1553 struct request_sock
**prev
;
1554 /* Find possible connection requests. */
1555 struct request_sock
*req
= inet_csk_search_req(sk
, &prev
, th
->source
,
1556 iph
->saddr
, iph
->daddr
);
/* Request found: validate the handshake completion for it. */
1558 return tcp_check_req(sk
, skb
, req
, prev
);
/* No request: maybe an established socket owns this 4-tuple. */
1560 nsk
= inet_lookup_established(sock_net(sk
), &tcp_hashinfo
, iph
->saddr
,
1561 th
->source
, iph
->daddr
, th
->dest
, inet_iif(skb
));
1564 if (nsk
->sk_state
!= TCP_TIME_WAIT
) {
/* TIME_WAIT sockets only borrow a reference; release it. */
1568 inet_twsk_put(inet_twsk(nsk
));
1572 #ifdef CONFIG_SYN_COOKIES
/* Last chance: the ACK may complete a SYN-cookie handshake. */
1574 sk
= cookie_v4_check(sk
, skb
, &(IPCB(skb
)->opt
));
/*
 * Validate/prepare the TCP checksum of an incoming skb.  If the NIC
 * supplied a full checksum (CHECKSUM_COMPLETE) verify it against the
 * pseudo-header and mark the skb CHECKSUM_UNNECESSARY on success.
 * Otherwise seed skb->csum with the pseudo-header sum; short packets
 * (<= 76 bytes) are fully verified immediately, longer ones are
 * deferred to later checksum completion.
 * NOTE(review): extracted fragment — some interior lines elided.
 */
1579 static __sum16
tcp_v4_checksum_init(struct sk_buff
*skb
)
1581 const struct iphdr
*iph
= ip_hdr(skb
);
1583 if (skb
->ip_summed
== CHECKSUM_COMPLETE
) {
1584 if (!tcp_v4_check(skb
->len
, iph
->saddr
,
1585 iph
->daddr
, skb
->csum
)) {
1586 skb
->ip_summed
= CHECKSUM_UNNECESSARY
;
/* Hardware did not verify: precompute the pseudo-header sum. */
1591 skb
->csum
= csum_tcpudp_nofold(iph
->saddr
, iph
->daddr
,
1592 skb
->len
, IPPROTO_TCP
, 0);
/* Small packet: cheap to verify in full right now. */
1594 if (skb
->len
<= 76) {
1595 return __skb_checksum_complete(skb
);
1601 /* The socket must have its spinlock held when we get
1604 * here.  We have a potential double-lock case, so even when
1605 * doing backlog processing we use the BH locking scheme.
1606 * This is because we cannot sleep with the original spinlock
 * held.
 *
 * Main per-socket receive dispatch for IPv4 TCP: MD5 sanity check,
 * ESTABLISHED fast path, listener request handling via
 * tcp_v4_hnd_req()/tcp_child_process(), and the generic state machine
 * (tcp_rcv_state_process) for everything else; failures send a reset.
 * NOTE(review): extracted fragment — labels and returns elided here.
 */
1609 int tcp_v4_do_rcv(struct sock
*sk
, struct sk_buff
*skb
)
1612 #ifdef CONFIG_TCP_MD5SIG
1614 * We really want to reject the packet as early as possible
1616 * o We're expecting an MD5'd packet and this is no MD5 tcp option
1617 * o There is an MD5 option and we're not expecting one
1619 if (tcp_v4_inbound_md5_hash(sk
, skb
))
1623 if (sk
->sk_state
== TCP_ESTABLISHED
) { /* Fast path */
1624 sock_rps_save_rxhash(sk
, skb
);
1625 if (tcp_rcv_established(sk
, skb
, tcp_hdr(skb
), skb
->len
)) {
/* Header shorter than advertised, or bad checksum -> drop. */
1632 if (skb
->len
< tcp_hdrlen(skb
) || tcp_checksum_complete(skb
))
1635 if (sk
->sk_state
== TCP_LISTEN
) {
1636 struct sock
*nsk
= tcp_v4_hnd_req(sk
, skb
);
/* Handshake completed on a child socket: process it there. */
1641 sock_rps_save_rxhash(nsk
, skb
);
1642 if (tcp_child_process(sk
, nsk
, skb
)) {
1649 sock_rps_save_rxhash(sk
, skb
);
1651 if (tcp_rcv_state_process(sk
, skb
, tcp_hdr(skb
), skb
->len
)) {
/* Processing rejected the segment: answer with a RST. */
1658 tcp_v4_send_reset(rsk
, skb
);
1661 /* Be careful here. If this function gets more complicated and
1662 * gcc suffers from register pressure on the x86, sk (in %ebx)
1663 * might be destroyed here. This current version compiles correctly,
1664 * but you have been warned.
1669 TCP_INC_STATS_BH(sock_net(sk
), TCP_MIB_INERRS
);
1672 EXPORT_SYMBOL(tcp_v4_do_rcv
);
1678 int tcp_v4_rcv(struct sk_buff
*skb
)
1680 const struct iphdr
*iph
;
1681 const struct tcphdr
*th
;
1684 struct net
*net
= dev_net(skb
->dev
);
1686 if (skb
->pkt_type
!= PACKET_HOST
)
1689 /* Count it even if it's bad */
1690 TCP_INC_STATS_BH(net
, TCP_MIB_INSEGS
);
1692 if (!pskb_may_pull(skb
, sizeof(struct tcphdr
)))
1697 if (th
->doff
< sizeof(struct tcphdr
) / 4)
1699 if (!pskb_may_pull(skb
, th
->doff
* 4))
1702 /* An explanation is required here, I think.
1703 * Packet length and doff are validated by header prediction,
1704 * provided case of th->doff==0 is eliminated.
1705 * So, we defer the checks. */
1706 if (!skb_csum_unnecessary(skb
) && tcp_v4_checksum_init(skb
))
1711 TCP_SKB_CB(skb
)->seq
= ntohl(th
->seq
);
1712 TCP_SKB_CB(skb
)->end_seq
= (TCP_SKB_CB(skb
)->seq
+ th
->syn
+ th
->fin
+
1713 skb
->len
- th
->doff
* 4);
1714 TCP_SKB_CB(skb
)->ack_seq
= ntohl(th
->ack_seq
);
1715 TCP_SKB_CB(skb
)->when
= 0;
1716 TCP_SKB_CB(skb
)->ip_dsfield
= ipv4_get_dsfield(iph
);
1717 TCP_SKB_CB(skb
)->sacked
= 0;
1719 sk
= __inet_lookup_skb(&tcp_hashinfo
, skb
, th
->source
, th
->dest
);
1724 if (sk
->sk_state
== TCP_TIME_WAIT
)
1727 if (unlikely(iph
->ttl
< inet_sk(sk
)->min_ttl
)) {
1728 NET_INC_STATS_BH(net
, LINUX_MIB_TCPMINTTLDROP
);
1729 goto discard_and_relse
;
1732 if (!xfrm4_policy_check(sk
, XFRM_POLICY_IN
, skb
))
1733 goto discard_and_relse
;
1736 if (sk_filter(sk
, skb
))
1737 goto discard_and_relse
;
1741 bh_lock_sock_nested(sk
);
1743 if (!sock_owned_by_user(sk
)) {
1744 #ifdef CONFIG_NET_DMA
1745 struct tcp_sock
*tp
= tcp_sk(sk
);
1746 if (!tp
->ucopy
.dma_chan
&& tp
->ucopy
.pinned_list
)
1747 tp
->ucopy
.dma_chan
= net_dma_find_channel();
1748 if (tp
->ucopy
.dma_chan
)
1749 ret
= tcp_v4_do_rcv(sk
, skb
);
1753 if (!tcp_prequeue(sk
, skb
))
1754 ret
= tcp_v4_do_rcv(sk
, skb
);
1756 } else if (unlikely(sk_add_backlog(sk
, skb
,
1757 sk
->sk_rcvbuf
+ sk
->sk_sndbuf
))) {
1759 NET_INC_STATS_BH(net
, LINUX_MIB_TCPBACKLOGDROP
);
1760 goto discard_and_relse
;
1769 if (!xfrm4_policy_check(NULL
, XFRM_POLICY_IN
, skb
))
1772 if (skb
->len
< (th
->doff
<< 2) || tcp_checksum_complete(skb
)) {
1774 TCP_INC_STATS_BH(net
, TCP_MIB_INERRS
);
1776 tcp_v4_send_reset(NULL
, skb
);
1780 /* Discard frame. */
1789 if (!xfrm4_policy_check(NULL
, XFRM_POLICY_IN
, skb
)) {
1790 inet_twsk_put(inet_twsk(sk
));
1794 if (skb
->len
< (th
->doff
<< 2) || tcp_checksum_complete(skb
)) {
1795 TCP_INC_STATS_BH(net
, TCP_MIB_INERRS
);
1796 inet_twsk_put(inet_twsk(sk
));
1799 switch (tcp_timewait_state_process(inet_twsk(sk
), skb
, th
)) {
1801 struct sock
*sk2
= inet_lookup_listener(dev_net(skb
->dev
),
1803 iph
->daddr
, th
->dest
,
1806 inet_twsk_deschedule(inet_twsk(sk
), &tcp_death_row
);
1807 inet_twsk_put(inet_twsk(sk
));
1811 /* Fall through to ACK */
1814 tcp_v4_timewait_ack(sk
, skb
);
1818 case TCP_TW_SUCCESS
:;
/*
 * Return the inet_peer entry for this connection's remote address.
 * If the cached route's destination no longer matches inet_daddr a
 * fresh peer is looked up directly; otherwise the peer is bound to
 * the route and *release_it tells the caller whether it must drop
 * the reference when done.
 * NOTE(review): extracted fragment — the leading "if" condition and
 * several returns are elided from this view; code left as-is.
 */
1823 struct inet_peer
*tcp_v4_get_peer(struct sock
*sk
, bool *release_it
)
1825 struct rtable
*rt
= (struct rtable
*) __sk_dst_get(sk
);
1826 struct inet_sock
*inet
= inet_sk(sk
);
1827 struct inet_peer
*peer
;
1830 inet
->cork
.fl
.u
.ip4
.daddr
!= inet
->inet_daddr
) {
/* Route destination diverged from the socket: direct lookup. */
1831 peer
= inet_getpeer_v4(inet
->inet_daddr
, 1);
/* Route still matches: attach the peer to the route entry. */
1835 rt_bind_peer(rt
, inet
->inet_daddr
, 1);
/* Peer is owned by the route; caller must not release it. */
1837 *release_it
= false;
1842 EXPORT_SYMBOL(tcp_v4_get_peer
);
/*
 * Look up (with create=1) the inet_peer for a TIME_WAIT socket's
 * remote address.  Used as the .twsk_getpeer callback.
 * NOTE(review): extracted fragment — braces elided from this view.
 */
1844 void *tcp_v4_tw_get_peer(struct sock
*sk
)
1846 const struct inet_timewait_sock
*tw
= inet_twsk(sk
);
1848 return inet_getpeer_v4(tw
->tw_daddr
, 1);
1850 EXPORT_SYMBOL(tcp_v4_tw_get_peer
);
/*
 * TIME_WAIT socket operations for TCP: object size, uniqueness check
 * when reusing a 4-tuple, destructor, and peer lookup.
 * NOTE(review): extracted fragment — closing brace elided.
 */
1852 static struct timewait_sock_ops tcp_timewait_sock_ops
= {
1853 .twsk_obj_size
= sizeof(struct tcp_timewait_sock
),
1854 .twsk_unique
= tcp_twsk_unique
,
1855 .twsk_destructor
= tcp_twsk_destructor
,
1856 .twsk_getpeer
= tcp_v4_tw_get_peer
,
/*
 * Address-family-specific connection-socket operations for TCP over
 * IPv4: transmit, checksum, header rebuild, connection request and
 * child-socket creation, peer lookup, sockopt plumbing and bind
 * conflict resolution.  Installed via icsk->icsk_af_ops.
 * NOTE(review): extracted fragment — #endif/closing brace elided.
 */
1859 const struct inet_connection_sock_af_ops ipv4_specific
= {
1860 .queue_xmit
= ip_queue_xmit
,
1861 .send_check
= tcp_v4_send_check
,
1862 .rebuild_header
= inet_sk_rebuild_header
,
1863 .conn_request
= tcp_v4_conn_request
,
1864 .syn_recv_sock
= tcp_v4_syn_recv_sock
,
1865 .get_peer
= tcp_v4_get_peer
,
1866 .net_header_len
= sizeof(struct iphdr
),
1867 .setsockopt
= ip_setsockopt
,
1868 .getsockopt
= ip_getsockopt
,
1869 .addr2sockaddr
= inet_csk_addr2sockaddr
,
1870 .sockaddr_len
= sizeof(struct sockaddr_in
),
1871 .bind_conflict
= inet_csk_bind_conflict
,
1872 #ifdef CONFIG_COMPAT
1873 .compat_setsockopt
= compat_ip_setsockopt
,
1874 .compat_getsockopt
= compat_ip_getsockopt
,
1877 EXPORT_SYMBOL(ipv4_specific
);
1879 #ifdef CONFIG_TCP_MD5SIG
/*
 * TCP-MD5 operations for full IPv4 sockets: key lookup, hash
 * computation and parsing of TCP_MD5SIG setsockopt keys.
 * NOTE(review): extracted fragment — closing brace/#endif elided.
 */
1880 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific
= {
1881 .md5_lookup
= tcp_v4_md5_lookup
,
1882 .calc_md5_hash
= tcp_v4_md5_hash_skb
,
1883 .md5_parse
= tcp_v4_parse_md5_keys
,
1887 /* NOTE: A lot of things set to zero explicitly by call to
1888 * sk_alloc() so need not be done here.
 *
 * Per-socket initialization for an IPv4 TCP socket: install the
 * IPv4-specific af_ops and (if configured) the MD5 signature ops.
 * NOTE(review): extracted fragment — interior lines elided.
 */
1890 static int tcp_v4_init_sock(struct sock
*sk
)
1892 struct inet_connection_sock
*icsk
= inet_csk(sk
);
1896 icsk
->icsk_af_ops
= &ipv4_specific
;
1898 #ifdef CONFIG_TCP_MD5SIG
1899 tcp_sk(sk
)->af_specific
= &tcp_sock_ipv4_specific
;
/*
 * Tear down an IPv4 TCP socket: stop timers, release congestion
 * control state, purge write/out-of-order/prequeue queues, free MD5
 * keys, release the bind bucket, drop the cached sendmsg page and
 * cookie-transaction state, then update socket accounting.
 * NOTE(review): extracted fragment — some interior lines elided.
 */
1905 void tcp_v4_destroy_sock(struct sock
*sk
)
1907 struct tcp_sock
*tp
= tcp_sk(sk
);
1909 tcp_clear_xmit_timers(sk
);
1911 tcp_cleanup_congestion_control(sk
);
1913 /* Clean up the write buffer. */
1914 tcp_write_queue_purge(sk
);
1916 /* Cleans up our, hopefully empty, out_of_order_queue. */
1917 __skb_queue_purge(&tp
->out_of_order_queue
);
1919 #ifdef CONFIG_TCP_MD5SIG
1920 /* Clean up the MD5 key list, if any */
1921 if (tp
->md5sig_info
) {
1922 tcp_clear_md5_list(sk
);
1923 kfree_rcu(tp
->md5sig_info
, rcu
);
1924 tp
->md5sig_info
= NULL
;
1928 #ifdef CONFIG_NET_DMA
1929 /* Cleans up our sk_async_wait_queue */
1930 __skb_queue_purge(&sk
->sk_async_wait_queue
);
1933 /* Clean prequeue, it must be empty really */
1934 __skb_queue_purge(&tp
->ucopy
.prequeue
);
1936 /* Clean up a referenced TCP bind bucket. */
1937 if (inet_csk(sk
)->icsk_bind_hash
)
1941 * If sendmsg cached page exists, toss it.
1943 if (sk
->sk_sndmsg_page
) {
1944 __free_page(sk
->sk_sndmsg_page
);
1945 sk
->sk_sndmsg_page
= NULL
;
1948 /* TCP Cookie Transactions */
1949 if (tp
->cookie_values
!= NULL
) {
1950 kref_put(&tp
->cookie_values
->kref
,
1951 tcp_cookie_values_release
);
1952 tp
->cookie_values
= NULL
;
1955 sk_sockets_allocated_dec(sk
);
1956 sock_release_memcg(sk
);
1958 EXPORT_SYMBOL(tcp_v4_destroy_sock
);
1960 #ifdef CONFIG_PROC_FS
1961 /* Proc filesystem TCP sock list dumping. */
1963 static inline struct inet_timewait_sock
*tw_head(struct hlist_nulls_head
*head
)
1965 return hlist_nulls_empty(head
) ? NULL
:
1966 list_entry(head
->first
, struct inet_timewait_sock
, tw_node
);
1969 static inline struct inet_timewait_sock
*tw_next(struct inet_timewait_sock
*tw
)
1971 return !is_a_nulls(tw
->tw_node
.next
) ?
1972 hlist_nulls_entry(tw
->tw_node
.next
, typeof(*tw
), tw_node
) : NULL
;
1976 * Get next listener socket follow cur. If cur is NULL, get first socket
1977 * starting from bucket given in st->bucket; when st->bucket is zero the
1978 * very first socket in the hash table is returned.
1980 static void *listening_get_next(struct seq_file
*seq
, void *cur
)
1982 struct inet_connection_sock
*icsk
;
1983 struct hlist_nulls_node
*node
;
1984 struct sock
*sk
= cur
;
1985 struct inet_listen_hashbucket
*ilb
;
1986 struct tcp_iter_state
*st
= seq
->private;
1987 struct net
*net
= seq_file_net(seq
);
1990 ilb
= &tcp_hashinfo
.listening_hash
[st
->bucket
];
1991 spin_lock_bh(&ilb
->lock
);
1992 sk
= sk_nulls_head(&ilb
->head
);
1996 ilb
= &tcp_hashinfo
.listening_hash
[st
->bucket
];
2000 if (st
->state
== TCP_SEQ_STATE_OPENREQ
) {
2001 struct request_sock
*req
= cur
;
2003 icsk
= inet_csk(st
->syn_wait_sk
);
2007 if (req
->rsk_ops
->family
== st
->family
) {
2013 if (++st
->sbucket
>= icsk
->icsk_accept_queue
.listen_opt
->nr_table_entries
)
2016 req
= icsk
->icsk_accept_queue
.listen_opt
->syn_table
[st
->sbucket
];
2018 sk
= sk_nulls_next(st
->syn_wait_sk
);
2019 st
->state
= TCP_SEQ_STATE_LISTENING
;
2020 read_unlock_bh(&icsk
->icsk_accept_queue
.syn_wait_lock
);
2022 icsk
= inet_csk(sk
);
2023 read_lock_bh(&icsk
->icsk_accept_queue
.syn_wait_lock
);
2024 if (reqsk_queue_len(&icsk
->icsk_accept_queue
))
2026 read_unlock_bh(&icsk
->icsk_accept_queue
.syn_wait_lock
);
2027 sk
= sk_nulls_next(sk
);
2030 sk_nulls_for_each_from(sk
, node
) {
2031 if (!net_eq(sock_net(sk
), net
))
2033 if (sk
->sk_family
== st
->family
) {
2037 icsk
= inet_csk(sk
);
2038 read_lock_bh(&icsk
->icsk_accept_queue
.syn_wait_lock
);
2039 if (reqsk_queue_len(&icsk
->icsk_accept_queue
)) {
2041 st
->uid
= sock_i_uid(sk
);
2042 st
->syn_wait_sk
= sk
;
2043 st
->state
= TCP_SEQ_STATE_OPENREQ
;
2047 read_unlock_bh(&icsk
->icsk_accept_queue
.syn_wait_lock
);
2049 spin_unlock_bh(&ilb
->lock
);
2051 if (++st
->bucket
< INET_LHTABLE_SIZE
) {
2052 ilb
= &tcp_hashinfo
.listening_hash
[st
->bucket
];
2053 spin_lock_bh(&ilb
->lock
);
2054 sk
= sk_nulls_head(&ilb
->head
);
2062 static void *listening_get_idx(struct seq_file
*seq
, loff_t
*pos
)
2064 struct tcp_iter_state
*st
= seq
->private;
2069 rc
= listening_get_next(seq
, NULL
);
2071 while (rc
&& *pos
) {
2072 rc
= listening_get_next(seq
, rc
);
2078 static inline bool empty_bucket(struct tcp_iter_state
*st
)
2080 return hlist_nulls_empty(&tcp_hashinfo
.ehash
[st
->bucket
].chain
) &&
2081 hlist_nulls_empty(&tcp_hashinfo
.ehash
[st
->bucket
].twchain
);
2085 * Get first established socket starting from bucket given in st->bucket.
2086 * If st->bucket is zero, the very first socket in the hash is returned.
2088 static void *established_get_first(struct seq_file
*seq
)
2090 struct tcp_iter_state
*st
= seq
->private;
2091 struct net
*net
= seq_file_net(seq
);
2095 for (; st
->bucket
<= tcp_hashinfo
.ehash_mask
; ++st
->bucket
) {
2097 struct hlist_nulls_node
*node
;
2098 struct inet_timewait_sock
*tw
;
2099 spinlock_t
*lock
= inet_ehash_lockp(&tcp_hashinfo
, st
->bucket
);
2101 /* Lockless fast path for the common case of empty buckets */
2102 if (empty_bucket(st
))
2106 sk_nulls_for_each(sk
, node
, &tcp_hashinfo
.ehash
[st
->bucket
].chain
) {
2107 if (sk
->sk_family
!= st
->family
||
2108 !net_eq(sock_net(sk
), net
)) {
2114 st
->state
= TCP_SEQ_STATE_TIME_WAIT
;
2115 inet_twsk_for_each(tw
, node
,
2116 &tcp_hashinfo
.ehash
[st
->bucket
].twchain
) {
2117 if (tw
->tw_family
!= st
->family
||
2118 !net_eq(twsk_net(tw
), net
)) {
2124 spin_unlock_bh(lock
);
2125 st
->state
= TCP_SEQ_STATE_ESTABLISHED
;
2131 static void *established_get_next(struct seq_file
*seq
, void *cur
)
2133 struct sock
*sk
= cur
;
2134 struct inet_timewait_sock
*tw
;
2135 struct hlist_nulls_node
*node
;
2136 struct tcp_iter_state
*st
= seq
->private;
2137 struct net
*net
= seq_file_net(seq
);
2142 if (st
->state
== TCP_SEQ_STATE_TIME_WAIT
) {
2146 while (tw
&& (tw
->tw_family
!= st
->family
|| !net_eq(twsk_net(tw
), net
))) {
2153 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo
, st
->bucket
));
2154 st
->state
= TCP_SEQ_STATE_ESTABLISHED
;
2156 /* Look for next non empty bucket */
2158 while (++st
->bucket
<= tcp_hashinfo
.ehash_mask
&&
2161 if (st
->bucket
> tcp_hashinfo
.ehash_mask
)
2164 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo
, st
->bucket
));
2165 sk
= sk_nulls_head(&tcp_hashinfo
.ehash
[st
->bucket
].chain
);
2167 sk
= sk_nulls_next(sk
);
2169 sk_nulls_for_each_from(sk
, node
) {
2170 if (sk
->sk_family
== st
->family
&& net_eq(sock_net(sk
), net
))
2174 st
->state
= TCP_SEQ_STATE_TIME_WAIT
;
2175 tw
= tw_head(&tcp_hashinfo
.ehash
[st
->bucket
].twchain
);
2183 static void *established_get_idx(struct seq_file
*seq
, loff_t pos
)
2185 struct tcp_iter_state
*st
= seq
->private;
2189 rc
= established_get_first(seq
);
2192 rc
= established_get_next(seq
, rc
);
2198 static void *tcp_get_idx(struct seq_file
*seq
, loff_t pos
)
2201 struct tcp_iter_state
*st
= seq
->private;
2203 st
->state
= TCP_SEQ_STATE_LISTENING
;
2204 rc
= listening_get_idx(seq
, &pos
);
2207 st
->state
= TCP_SEQ_STATE_ESTABLISHED
;
2208 rc
= established_get_idx(seq
, pos
);
2214 static void *tcp_seek_last_pos(struct seq_file
*seq
)
2216 struct tcp_iter_state
*st
= seq
->private;
2217 int offset
= st
->offset
;
2218 int orig_num
= st
->num
;
2221 switch (st
->state
) {
2222 case TCP_SEQ_STATE_OPENREQ
:
2223 case TCP_SEQ_STATE_LISTENING
:
2224 if (st
->bucket
>= INET_LHTABLE_SIZE
)
2226 st
->state
= TCP_SEQ_STATE_LISTENING
;
2227 rc
= listening_get_next(seq
, NULL
);
2228 while (offset
-- && rc
)
2229 rc
= listening_get_next(seq
, rc
);
2234 case TCP_SEQ_STATE_ESTABLISHED
:
2235 case TCP_SEQ_STATE_TIME_WAIT
:
2236 st
->state
= TCP_SEQ_STATE_ESTABLISHED
;
2237 if (st
->bucket
> tcp_hashinfo
.ehash_mask
)
2239 rc
= established_get_first(seq
);
2240 while (offset
-- && rc
)
2241 rc
= established_get_next(seq
, rc
);
2249 static void *tcp_seq_start(struct seq_file
*seq
, loff_t
*pos
)
2251 struct tcp_iter_state
*st
= seq
->private;
2254 if (*pos
&& *pos
== st
->last_pos
) {
2255 rc
= tcp_seek_last_pos(seq
);
2260 st
->state
= TCP_SEQ_STATE_LISTENING
;
2264 rc
= *pos
? tcp_get_idx(seq
, *pos
- 1) : SEQ_START_TOKEN
;
2267 st
->last_pos
= *pos
;
2271 static void *tcp_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
2273 struct tcp_iter_state
*st
= seq
->private;
2276 if (v
== SEQ_START_TOKEN
) {
2277 rc
= tcp_get_idx(seq
, 0);
2281 switch (st
->state
) {
2282 case TCP_SEQ_STATE_OPENREQ
:
2283 case TCP_SEQ_STATE_LISTENING
:
2284 rc
= listening_get_next(seq
, v
);
2286 st
->state
= TCP_SEQ_STATE_ESTABLISHED
;
2289 rc
= established_get_first(seq
);
2292 case TCP_SEQ_STATE_ESTABLISHED
:
2293 case TCP_SEQ_STATE_TIME_WAIT
:
2294 rc
= established_get_next(seq
, v
);
2299 st
->last_pos
= *pos
;
2303 static void tcp_seq_stop(struct seq_file
*seq
, void *v
)
2305 struct tcp_iter_state
*st
= seq
->private;
2307 switch (st
->state
) {
2308 case TCP_SEQ_STATE_OPENREQ
:
2310 struct inet_connection_sock
*icsk
= inet_csk(st
->syn_wait_sk
);
2311 read_unlock_bh(&icsk
->icsk_accept_queue
.syn_wait_lock
);
2313 case TCP_SEQ_STATE_LISTENING
:
2314 if (v
!= SEQ_START_TOKEN
)
2315 spin_unlock_bh(&tcp_hashinfo
.listening_hash
[st
->bucket
].lock
);
2317 case TCP_SEQ_STATE_TIME_WAIT
:
2318 case TCP_SEQ_STATE_ESTABLISHED
:
2320 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo
, st
->bucket
));
2325 int tcp_seq_open(struct inode
*inode
, struct file
*file
)
2327 struct tcp_seq_afinfo
*afinfo
= PDE(inode
)->data
;
2328 struct tcp_iter_state
*s
;
2331 err
= seq_open_net(inode
, file
, &afinfo
->seq_ops
,
2332 sizeof(struct tcp_iter_state
));
2336 s
= ((struct seq_file
*)file
->private_data
)->private;
2337 s
->family
= afinfo
->family
;
2341 EXPORT_SYMBOL(tcp_seq_open
);
2343 int tcp_proc_register(struct net
*net
, struct tcp_seq_afinfo
*afinfo
)
2346 struct proc_dir_entry
*p
;
2348 afinfo
->seq_ops
.start
= tcp_seq_start
;
2349 afinfo
->seq_ops
.next
= tcp_seq_next
;
2350 afinfo
->seq_ops
.stop
= tcp_seq_stop
;
2352 p
= proc_create_data(afinfo
->name
, S_IRUGO
, net
->proc_net
,
2353 afinfo
->seq_fops
, afinfo
);
2358 EXPORT_SYMBOL(tcp_proc_register
);
2360 void tcp_proc_unregister(struct net
*net
, struct tcp_seq_afinfo
*afinfo
)
2362 proc_net_remove(net
, afinfo
->name
);
2364 EXPORT_SYMBOL(tcp_proc_unregister
);
2366 static void get_openreq4(const struct sock
*sk
, const struct request_sock
*req
,
2367 struct seq_file
*f
, int i
, int uid
, int *len
)
2369 const struct inet_request_sock
*ireq
= inet_rsk(req
);
2370 int ttd
= req
->expires
- jiffies
;
2372 seq_printf(f
, "%4d: %08X:%04X %08X:%04X"
2373 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2376 ntohs(inet_sk(sk
)->inet_sport
),
2378 ntohs(ireq
->rmt_port
),
2380 0, 0, /* could print option size, but that is af dependent. */
2381 1, /* timers active (only the expire timer) */
2382 jiffies_to_clock_t(ttd
),
2385 0, /* non standard timer */
2386 0, /* open_requests have no inode */
2387 atomic_read(&sk
->sk_refcnt
),
2392 static void get_tcp4_sock(struct sock
*sk
, struct seq_file
*f
, int i
, int *len
)
2395 unsigned long timer_expires
;
2396 const struct tcp_sock
*tp
= tcp_sk(sk
);
2397 const struct inet_connection_sock
*icsk
= inet_csk(sk
);
2398 const struct inet_sock
*inet
= inet_sk(sk
);
2399 __be32 dest
= inet
->inet_daddr
;
2400 __be32 src
= inet
->inet_rcv_saddr
;
2401 __u16 destp
= ntohs(inet
->inet_dport
);
2402 __u16 srcp
= ntohs(inet
->inet_sport
);
2405 if (icsk
->icsk_pending
== ICSK_TIME_RETRANS
) {
2407 timer_expires
= icsk
->icsk_timeout
;
2408 } else if (icsk
->icsk_pending
== ICSK_TIME_PROBE0
) {
2410 timer_expires
= icsk
->icsk_timeout
;
2411 } else if (timer_pending(&sk
->sk_timer
)) {
2413 timer_expires
= sk
->sk_timer
.expires
;
2416 timer_expires
= jiffies
;
2419 if (sk
->sk_state
== TCP_LISTEN
)
2420 rx_queue
= sk
->sk_ack_backlog
;
2423 * because we dont lock socket, we might find a transient negative value
2425 rx_queue
= max_t(int, tp
->rcv_nxt
- tp
->copied_seq
, 0);
2427 seq_printf(f
, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2428 "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2429 i
, src
, srcp
, dest
, destp
, sk
->sk_state
,
2430 tp
->write_seq
- tp
->snd_una
,
2433 jiffies_to_clock_t(timer_expires
- jiffies
),
2434 icsk
->icsk_retransmits
,
2436 icsk
->icsk_probes_out
,
2438 atomic_read(&sk
->sk_refcnt
), sk
,
2439 jiffies_to_clock_t(icsk
->icsk_rto
),
2440 jiffies_to_clock_t(icsk
->icsk_ack
.ato
),
2441 (icsk
->icsk_ack
.quick
<< 1) | icsk
->icsk_ack
.pingpong
,
2443 tcp_in_initial_slowstart(tp
) ? -1 : tp
->snd_ssthresh
,
2447 static void get_timewait4_sock(const struct inet_timewait_sock
*tw
,
2448 struct seq_file
*f
, int i
, int *len
)
2452 int ttd
= tw
->tw_ttd
- jiffies
;
2457 dest
= tw
->tw_daddr
;
2458 src
= tw
->tw_rcv_saddr
;
2459 destp
= ntohs(tw
->tw_dport
);
2460 srcp
= ntohs(tw
->tw_sport
);
2462 seq_printf(f
, "%4d: %08X:%04X %08X:%04X"
2463 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2464 i
, src
, srcp
, dest
, destp
, tw
->tw_substate
, 0, 0,
2465 3, jiffies_to_clock_t(ttd
), 0, 0, 0, 0,
2466 atomic_read(&tw
->tw_refcnt
), tw
, len
);
2471 static int tcp4_seq_show(struct seq_file
*seq
, void *v
)
2473 struct tcp_iter_state
*st
;
2476 if (v
== SEQ_START_TOKEN
) {
2477 seq_printf(seq
, "%-*s\n", TMPSZ
- 1,
2478 " sl local_address rem_address st tx_queue "
2479 "rx_queue tr tm->when retrnsmt uid timeout "
2485 switch (st
->state
) {
2486 case TCP_SEQ_STATE_LISTENING
:
2487 case TCP_SEQ_STATE_ESTABLISHED
:
2488 get_tcp4_sock(v
, seq
, st
->num
, &len
);
2490 case TCP_SEQ_STATE_OPENREQ
:
2491 get_openreq4(st
->syn_wait_sk
, v
, seq
, st
->num
, st
->uid
, &len
);
2493 case TCP_SEQ_STATE_TIME_WAIT
:
2494 get_timewait4_sock(v
, seq
, st
->num
, &len
);
2497 seq_printf(seq
, "%*s\n", TMPSZ
- 1 - len
, "");
2502 static const struct file_operations tcp_afinfo_seq_fops
= {
2503 .owner
= THIS_MODULE
,
2504 .open
= tcp_seq_open
,
2506 .llseek
= seq_lseek
,
2507 .release
= seq_release_net
2510 static struct tcp_seq_afinfo tcp4_seq_afinfo
= {
2513 .seq_fops
= &tcp_afinfo_seq_fops
,
2515 .show
= tcp4_seq_show
,
2519 static int __net_init
tcp4_proc_init_net(struct net
*net
)
2521 return tcp_proc_register(net
, &tcp4_seq_afinfo
);
2524 static void __net_exit
tcp4_proc_exit_net(struct net
*net
)
2526 tcp_proc_unregister(net
, &tcp4_seq_afinfo
);
2529 static struct pernet_operations tcp4_net_ops
= {
2530 .init
= tcp4_proc_init_net
,
2531 .exit
= tcp4_proc_exit_net
,
2534 int __init
tcp4_proc_init(void)
2536 return register_pernet_subsys(&tcp4_net_ops
);
2539 void tcp4_proc_exit(void)
2541 unregister_pernet_subsys(&tcp4_net_ops
);
2543 #endif /* CONFIG_PROC_FS */
/*
 * GRO receive hook for IPv4 TCP: if the NIC provided a complete
 * checksum, verify it against the pseudo-header and mark the skb
 * CHECKSUM_UNNECESSARY; otherwise flag the skb so GRO flushes it.
 * Then hand off to the generic tcp_gro_receive() aggregation.
 * NOTE(review): extracted fragment — switch cases/braces elided.
 */
2545 struct sk_buff
**tcp4_gro_receive(struct sk_buff
**head
, struct sk_buff
*skb
)
2547 const struct iphdr
*iph
= skb_gro_network_header(skb
);
2549 switch (skb
->ip_summed
) {
2550 case CHECKSUM_COMPLETE
:
2551 if (!tcp_v4_check(skb_gro_len(skb
), iph
->saddr
, iph
->daddr
,
2553 skb
->ip_summed
= CHECKSUM_UNNECESSARY
;
/* Cannot verify cheaply: force this packet out of GRO. */
2559 NAPI_GRO_CB(skb
)->flush
= 1;
2563 return tcp_gro_receive(head
, skb
);
/*
 * GRO complete hook for IPv4 TCP: recompute the partial checksum over
 * the merged segment's pseudo-header, mark the skb as TCPv4 GSO so it
 * can be resegmented, and finish via the generic tcp_gro_complete().
 * NOTE(review): extracted fragment — braces elided from this view.
 */
2566 int tcp4_gro_complete(struct sk_buff
*skb
)
2568 const struct iphdr
*iph
= ip_hdr(skb
);
2569 struct tcphdr
*th
= tcp_hdr(skb
);
2571 th
->check
= ~tcp_v4_check(skb
->len
- skb_transport_offset(skb
),
2572 iph
->saddr
, iph
->daddr
, 0);
2573 skb_shinfo(skb
)->gso_type
= SKB_GSO_TCPV4
;
2575 return tcp_gro_complete(skb
);
/*
 * The IPv4 TCP protocol descriptor registered with the socket layer:
 * wires the TCP entry points (connect/accept/sendmsg/recvmsg/...),
 * memory-pressure accounting knobs, slab sizing, and the timewait and
 * request-sock operation tables defined above.
 * NOTE(review): extracted fragment — some fields (e.g. .name, .close,
 * .hash) and the closing brace are elided from this view.
 */
2578 struct proto tcp_prot
= {
2580 .owner
= THIS_MODULE
,
2582 .connect
= tcp_v4_connect
,
2583 .disconnect
= tcp_disconnect
,
2584 .accept
= inet_csk_accept
,
2586 .init
= tcp_v4_init_sock
,
2587 .destroy
= tcp_v4_destroy_sock
,
2588 .shutdown
= tcp_shutdown
,
2589 .setsockopt
= tcp_setsockopt
,
2590 .getsockopt
= tcp_getsockopt
,
2591 .recvmsg
= tcp_recvmsg
,
2592 .sendmsg
= tcp_sendmsg
,
2593 .sendpage
= tcp_sendpage
,
2594 .backlog_rcv
= tcp_v4_do_rcv
,
2596 .unhash
= inet_unhash
,
2597 .get_port
= inet_csk_get_port
,
2598 .enter_memory_pressure
= tcp_enter_memory_pressure
,
2599 .sockets_allocated
= &tcp_sockets_allocated
,
2600 .orphan_count
= &tcp_orphan_count
,
2601 .memory_allocated
= &tcp_memory_allocated
,
2602 .memory_pressure
= &tcp_memory_pressure
,
2603 .sysctl_wmem
= sysctl_tcp_wmem
,
2604 .sysctl_rmem
= sysctl_tcp_rmem
,
2605 .max_header
= MAX_TCP_HEADER
,
2606 .obj_size
= sizeof(struct tcp_sock
),
2607 .slab_flags
= SLAB_DESTROY_BY_RCU
,
2608 .twsk_prot
= &tcp_timewait_sock_ops
,
2609 .rsk_prot
= &tcp_request_sock_ops
,
2610 .h
.hashinfo
= &tcp_hashinfo
,
2611 .no_autobind
= true,
2612 #ifdef CONFIG_COMPAT
2613 .compat_setsockopt
= compat_tcp_setsockopt
,
2614 .compat_getsockopt
= compat_tcp_getsockopt
,
2616 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
2617 .init_cgroup
= tcp_init_cgroup
,
2618 .destroy_cgroup
= tcp_destroy_cgroup
,
2619 .proto_cgroup
= tcp_proto_cgroup
,
2622 EXPORT_SYMBOL(tcp_prot
);
/*
 * Per-network-namespace init: create the raw control socket used to
 * send resets/acks on behalf of this namespace.
 * NOTE(review): extracted fragment — braces elided from this view.
 */
2624 static int __net_init
tcp_sk_init(struct net
*net
)
2626 return inet_ctl_sock_create(&net
->ipv4
.tcp_sock
,
2627 PF_INET
, SOCK_RAW
, IPPROTO_TCP
, net
);
/*
 * Per-network-namespace teardown: destroy the namespace's TCP control
 * socket created in tcp_sk_init().
 * NOTE(review): extracted fragment — braces elided from this view.
 */
2630 static void __net_exit
tcp_sk_exit(struct net
*net
)
2632 inet_ctl_sock_destroy(net
->ipv4
.tcp_sock
);
/*
 * Batched namespace exit: purge all IPv4 TIME_WAIT sockets belonging
 * to the dying namespaces in one pass over the hash table.
 * NOTE(review): extracted fragment — braces elided from this view.
 */
2635 static void __net_exit
tcp_sk_exit_batch(struct list_head
*net_exit_list
)
2637 inet_twsk_purge(&tcp_hashinfo
, &tcp_death_row
, AF_INET
);
/*
 * Pernet registration glue tying the namespace init/exit/batch-exit
 * handlers above into the network-namespace lifecycle.
 * NOTE(review): extracted fragment — closing brace elided.
 */
2640 static struct pernet_operations __net_initdata tcp_sk_ops
= {
2641 .init
= tcp_sk_init
,
2642 .exit
= tcp_sk_exit
,
2643 .exit_batch
= tcp_sk_exit_batch
,
/*
 * Boot-time IPv4 TCP initialization: set up the global connection
 * hash table and register the pernet subsystem; failure to create the
 * control socket is fatal.
 * NOTE(review): extracted fragment — braces elided from this view.
 */
2646 void __init
tcp_v4_init(void)
2648 inet_hashinfo_init(&tcp_hashinfo
);
2649 if (register_pernet_subsys(&tcp_sk_ops
))
2650 panic("Failed to create the TCP control socket.\n");