1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * IPv4 specific functions
9 *
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24 /*
25 * Changes:
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an
34 * ACK bit.
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
37 * request_sock handling and moved
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
40 * Added new listen semantics.
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after year
45 * coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
51 */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
87
88 int sysctl_tcp_low_latency __read_mostly;
89
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92 __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
97
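/* The initial sequence number and timestamp offset below are derived from the
 * 4-tuple of the incoming SYN plus a boot-time secret (an RFC 6528-style
 * scheme). Note that saddr/daddr and the ports appear swapped relative to the
 * new socket, because the skb is the peer's segment.
 */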
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99 {
100 return secure_tcp_seq(ip_hdr(skb)->daddr,
101 ip_hdr(skb)->saddr,
102 tcp_hdr(skb)->dest,
103 tcp_hdr(skb)->source);
104 }
105
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
110
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 struct tcp_sock *tp = tcp_sk(sk);
115
116 /* With PAWS, it is safe from the viewpoint
117 of data integrity. Even without PAWS it is safe provided sequence
118 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
119
120 Actually, the idea is close to VJ's, only the timestamp cache is
121 held not per host but per port pair, and the TW bucket is used as the
122 state holder.
123 
124 If the TW bucket has already been destroyed we fall back to VJ's scheme
125 and use the initial timestamp retrieved from the peer table.
126 */
127 if (tcptw->tw_ts_recent_stamp &&
128 (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
129 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
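		/* Start the new connection's sequence space just past anything
		 * the old incarnation could have sent (snd_nxt plus a maximal
		 * unscaled window), so stray duplicates from the old connection
		 * cannot land inside the new one.
		 */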
130 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131 if (tp->write_seq == 0)
132 tp->write_seq = 1;
133 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
134 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
135 sock_hold(sktw);
136 return 1;
137 }
138
139 return 0;
140 }
141 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
142
143 /* This will initiate an outgoing connection. */
144 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
145 {
146 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
147 struct inet_sock *inet = inet_sk(sk);
148 struct tcp_sock *tp = tcp_sk(sk);
149 __be16 orig_sport, orig_dport;
150 __be32 daddr, nexthop;
151 struct flowi4 *fl4;
152 struct rtable *rt;
153 int err;
154 struct ip_options_rcu *inet_opt;
155 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
156
157 if (addr_len < sizeof(struct sockaddr_in))
158 return -EINVAL;
159
160 if (usin->sin_family != AF_INET)
161 return -EAFNOSUPPORT;
162
163 nexthop = daddr = usin->sin_addr.s_addr;
164 inet_opt = rcu_dereference_protected(inet->inet_opt,
165 lockdep_sock_is_held(sk));
166 if (inet_opt && inet_opt->opt.srr) {
167 if (!daddr)
168 return -EINVAL;
169 nexthop = inet_opt->opt.faddr;
170 }
171
172 orig_sport = inet->inet_sport;
173 orig_dport = usin->sin_port;
174 fl4 = &inet->cork.fl.u.ip4;
175 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
176 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
177 IPPROTO_TCP,
178 orig_sport, orig_dport, sk);
179 if (IS_ERR(rt)) {
180 err = PTR_ERR(rt);
181 if (err == -ENETUNREACH)
182 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
183 return err;
184 }
185
186 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
187 ip_rt_put(rt);
188 return -ENETUNREACH;
189 }
190
191 if (!inet_opt || !inet_opt->opt.srr)
192 daddr = fl4->daddr;
193
194 if (!inet->inet_saddr)
195 inet->inet_saddr = fl4->saddr;
196 sk_rcv_saddr_set(sk, inet->inet_saddr);
197
198 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
199 /* Reset inherited state */
200 tp->rx_opt.ts_recent = 0;
201 tp->rx_opt.ts_recent_stamp = 0;
202 if (likely(!tp->repair))
203 tp->write_seq = 0;
204 }
205
206 inet->inet_dport = usin->sin_port;
207 sk_daddr_set(sk, daddr);
208
209 inet_csk(sk)->icsk_ext_hdr_len = 0;
210 if (inet_opt)
211 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
212
213 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
214
215 /* Socket identity is still unknown (sport may be zero).
216 * However we set state to SYN-SENT and, without releasing the socket
217 * lock, select a source port, enter ourselves into the hash tables and
218 * complete initialization afterwards.
219 */
220 tcp_set_state(sk, TCP_SYN_SENT);
221 err = inet_hash_connect(tcp_death_row, sk);
222 if (err)
223 goto failure;
224
225 sk_set_txhash(sk);
226
227 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
228 inet->inet_sport, inet->inet_dport, sk);
229 if (IS_ERR(rt)) {
230 err = PTR_ERR(rt);
231 rt = NULL;
232 goto failure;
233 }
234 /* OK, now commit destination to socket. */
235 sk->sk_gso_type = SKB_GSO_TCPV4;
236 sk_setup_caps(sk, &rt->dst);
237 rt = NULL;
238
239 if (likely(!tp->repair)) {
240 if (!tp->write_seq)
241 tp->write_seq = secure_tcp_seq(inet->inet_saddr,
242 inet->inet_daddr,
243 inet->inet_sport,
244 usin->sin_port);
245 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
246 inet->inet_saddr,
247 inet->inet_daddr);
248 }
249
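	/* Seed the IP ID counter from the initial sequence number and jiffies,
	 * so consecutive connections to the same peer do not start from the
	 * same IP identification value.
	 */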
250 inet->inet_id = tp->write_seq ^ jiffies;
251
252 if (tcp_fastopen_defer_connect(sk, &err))
253 return err;
254 if (err)
255 goto failure;
256
257 err = tcp_connect(sk);
258
259 if (err)
260 goto failure;
261
262 return 0;
263
264 failure:
265 /*
266 * This unhashes the socket and releases the local port,
267 * if necessary.
268 */
269 tcp_set_state(sk, TCP_CLOSE);
270 ip_rt_put(rt);
271 sk->sk_route_caps = 0;
272 inet->inet_dport = 0;
273 return err;
274 }
275 EXPORT_SYMBOL(tcp_v4_connect);
276
277 /*
278 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
279 * It can be called through tcp_release_cb() if socket was owned by user
280 * at the time tcp_v4_err() was called to handle ICMP message.
281 */
282 void tcp_v4_mtu_reduced(struct sock *sk)
283 {
284 struct inet_sock *inet = inet_sk(sk);
285 struct dst_entry *dst;
286 u32 mtu;
287
288 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
289 return;
290 mtu = tcp_sk(sk)->mtu_info;
291 dst = inet_csk_update_pmtu(sk, mtu);
292 if (!dst)
293 return;
294
295 /* Something is about to go wrong... Remember the soft error
296 * in case this connection is not able to recover.
297 */
298 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
299 sk->sk_err_soft = EMSGSIZE;
300
301 mtu = dst_mtu(dst);
302
303 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
304 ip_sk_accept_pmtu(sk) &&
305 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
306 tcp_sync_mss(sk, mtu);
307
308 /* Resend the TCP packet because it's
309 * clear that the old packet has been
310 * dropped. This is the new "fast" path mtu
311 * discovery.
312 */
313 tcp_simple_retransmit(sk);
314 } /* else let the usual retransmit timer handle it */
315 }
316 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
317
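/* Hand an ICMP redirect to the route's redirect handler so the socket's
 * cached dst can be updated, provided the socket still holds a valid dst.
 */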
318 static void do_redirect(struct sk_buff *skb, struct sock *sk)
319 {
320 struct dst_entry *dst = __sk_dst_check(sk, 0);
321
322 if (dst)
323 dst->ops->redirect(dst, sk, skb);
324 }
325
326
327 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
328 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
329 {
330 struct request_sock *req = inet_reqsk(sk);
331 struct net *net = sock_net(sk);
332
333 /* ICMPs are not backlogged, hence we cannot get
334 * an established socket here.
335 */
336 if (seq != tcp_rsk(req)->snt_isn) {
337 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
338 } else if (abort) {
339 /*
340 * Still in SYN_RECV, just remove it silently.
341 * There is no good way to pass the error to the newly
342 * created socket, and POSIX does not want network
343 * errors returned from accept().
344 */
345 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
346 tcp_listendrop(req->rsk_listener);
347 }
348 reqsk_put(req);
349 }
350 EXPORT_SYMBOL(tcp_req_err);
351
352 /*
353 * This routine is called by the ICMP module when it gets some
354 * sort of error condition. If err < 0 then the socket should
355 * be closed and the error returned to the user. If err > 0
356 * it's just the icmp type << 8 | icmp code. After adjustment
357 * header points to the first 8 bytes of the tcp header. We need
358 * to find the appropriate port.
359 *
360 * The locking strategy used here is very "optimistic". When
361 * someone else accesses the socket the ICMP is just dropped
362 * and for some paths there is no check at all.
363 * A more general error queue to queue errors for later handling
364 * is probably better.
365 *
366 */
367
368 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
369 {
370 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
371 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
372 struct inet_connection_sock *icsk;
373 struct tcp_sock *tp;
374 struct inet_sock *inet;
375 const int type = icmp_hdr(icmp_skb)->type;
376 const int code = icmp_hdr(icmp_skb)->code;
377 struct sock *sk;
378 struct sk_buff *skb;
379 struct request_sock *fastopen;
380 u32 seq, snd_una;
381 s32 remaining;
382 u32 delta_us;
383 int err;
384 struct net *net = dev_net(icmp_skb->dev);
385
386 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
387 th->dest, iph->saddr, ntohs(th->source),
388 inet_iif(icmp_skb));
389 if (!sk) {
390 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
391 return;
392 }
393 if (sk->sk_state == TCP_TIME_WAIT) {
394 inet_twsk_put(inet_twsk(sk));
395 return;
396 }
397 seq = ntohl(th->seq);
398 if (sk->sk_state == TCP_NEW_SYN_RECV)
399 return tcp_req_err(sk, seq,
400 type == ICMP_PARAMETERPROB ||
401 type == ICMP_TIME_EXCEEDED ||
402 (type == ICMP_DEST_UNREACH &&
403 (code == ICMP_NET_UNREACH ||
404 code == ICMP_HOST_UNREACH)));
405
406 bh_lock_sock(sk);
407 /* If too many ICMPs get dropped on busy
408 * servers this needs to be solved differently.
409 * We do take care of PMTU discovery (RFC1191) special case :
410 * we can receive locally generated ICMP messages while socket is held.
411 */
412 if (sock_owned_by_user(sk)) {
413 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
414 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
415 }
416 if (sk->sk_state == TCP_CLOSE)
417 goto out;
418
419 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
420 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
421 goto out;
422 }
423
424 icsk = inet_csk(sk);
425 tp = tcp_sk(sk);
426 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
427 fastopen = tp->fastopen_rsk;
428 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
429 if (sk->sk_state != TCP_LISTEN &&
430 !between(seq, snd_una, tp->snd_nxt)) {
431 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
432 goto out;
433 }
434
435 switch (type) {
436 case ICMP_REDIRECT:
437 if (!sock_owned_by_user(sk))
438 do_redirect(icmp_skb, sk);
439 goto out;
440 case ICMP_SOURCE_QUENCH:
441 /* Just silently ignore these. */
442 goto out;
443 case ICMP_PARAMETERPROB:
444 err = EPROTO;
445 break;
446 case ICMP_DEST_UNREACH:
447 if (code > NR_ICMP_UNREACH)
448 goto out;
449
450 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
451 /* We are not interested in TCP_LISTEN and open_requests
452 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
453 * they should go through unfragmented).
454 */
455 if (sk->sk_state == TCP_LISTEN)
456 goto out;
457
458 tp->mtu_info = info;
459 if (!sock_owned_by_user(sk)) {
460 tcp_v4_mtu_reduced(sk);
461 } else {
462 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
463 sock_hold(sk);
464 }
465 goto out;
466 }
467
468 err = icmp_err_convert[code].errno;
469 /* check if icmp_skb allows revert of backoff
470 * (see draft-zimmermann-tcp-lcd) */
471 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
472 break;
473 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
474 !icsk->icsk_backoff || fastopen)
475 break;
476
477 if (sock_owned_by_user(sk))
478 break;
479
480 icsk->icsk_backoff--;
481 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
482 TCP_TIMEOUT_INIT;
483 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
484
485 skb = tcp_write_queue_head(sk);
486 BUG_ON(!skb);
487
488 tcp_mstamp_refresh(tp);
489 delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
490 remaining = icsk->icsk_rto -
491 usecs_to_jiffies(delta_us);
492
493 if (remaining > 0) {
494 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
495 remaining, TCP_RTO_MAX);
496 } else {
497 /* RTO revert clocked out retransmission.
498 * Will retransmit now */
499 tcp_retransmit_timer(sk);
500 }
501
502 break;
503 case ICMP_TIME_EXCEEDED:
504 err = EHOSTUNREACH;
505 break;
506 default:
507 goto out;
508 }
509
510 switch (sk->sk_state) {
511 case TCP_SYN_SENT:
512 case TCP_SYN_RECV:
513 /* Only in fast or simultaneous open. If a fast open socket
514 * is already accepted it is treated as a connected one below.
515 */
516 if (fastopen && !fastopen->sk)
517 break;
518
519 if (!sock_owned_by_user(sk)) {
520 sk->sk_err = err;
521
522 sk->sk_error_report(sk);
523
524 tcp_done(sk);
525 } else {
526 sk->sk_err_soft = err;
527 }
528 goto out;
529 }
530
531 /* If we've already connected we will keep trying
532 * until we time out, or the user gives up.
533 *
534 * RFC 1122 4.2.3.9 allows us to consider as hard errors
535 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
536 * but it is obsoleted by PMTU discovery).
537 *
538 * Note that in the modern internet, where routing is unreliable
539 * and broken firewalls sit in every dark corner sending random
540 * errors ordered by their masters, even these two messages finally
541 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
542 *
543 * Now we are in compliance with RFCs.
544 * --ANK (980905)
545 */
546
547 inet = inet_sk(sk);
548 if (!sock_owned_by_user(sk) && inet->recverr) {
549 sk->sk_err = err;
550 sk->sk_error_report(sk);
551 } else { /* Only an error on timeout */
552 sk->sk_err_soft = err;
553 }
554
555 out:
556 bh_unlock_sock(sk);
557 sock_put(sk);
558 }
559
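/* Fill in the TCP checksum. With CHECKSUM_PARTIAL only the pseudo-header sum
 * is stored and the hardware/GSO path completes it later; otherwise compute
 * the full checksum over the header plus whatever is accumulated in skb->csum.
 */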
560 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
561 {
562 struct tcphdr *th = tcp_hdr(skb);
563
564 if (skb->ip_summed == CHECKSUM_PARTIAL) {
565 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
566 skb->csum_start = skb_transport_header(skb) - skb->head;
567 skb->csum_offset = offsetof(struct tcphdr, check);
568 } else {
569 th->check = tcp_v4_check(skb->len, saddr, daddr,
570 csum_partial(th,
571 th->doff << 2,
572 skb->csum));
573 }
574 }
575
576 /* This routine computes an IPv4 TCP checksum. */
577 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
578 {
579 const struct inet_sock *inet = inet_sk(sk);
580
581 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
582 }
583 EXPORT_SYMBOL(tcp_v4_send_check);
584
585 /*
586 * This routine will send an RST to the other tcp.
587 *
588 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
589 * for the reset?
590 * Answer: if a packet caused the RST, it is not for a socket
591 * existing in our system; if it is matched to a socket,
592 * it is just a duplicate segment or a bug in the other side's TCP.
593 * So we build the reply based only on the parameters
594 * that arrived with the segment.
595 * Exception: precedence violation. We do not implement it in any case.
596 */
597
598 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
599 {
600 const struct tcphdr *th = tcp_hdr(skb);
601 struct {
602 struct tcphdr th;
603 #ifdef CONFIG_TCP_MD5SIG
604 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
605 #endif
606 } rep;
607 struct ip_reply_arg arg;
608 #ifdef CONFIG_TCP_MD5SIG
609 struct tcp_md5sig_key *key = NULL;
610 const __u8 *hash_location = NULL;
611 unsigned char newhash[16];
612 int genhash;
613 struct sock *sk1 = NULL;
614 #endif
615 struct net *net;
616
617 /* Never send a reset in response to a reset. */
618 if (th->rst)
619 return;
620
621 /* If sk is not NULL, it means we did a successful lookup and the incoming
622 * route had to be correct. prequeue might have dropped our dst.
623 */
624 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
625 return;
626
627 /* Swap the send and the receive. */
628 memset(&rep, 0, sizeof(rep));
629 rep.th.dest = th->source;
630 rep.th.source = th->dest;
631 rep.th.doff = sizeof(struct tcphdr) / 4;
632 rep.th.rst = 1;
633
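	/* RFC 793 reset generation: if the offending segment carried an ACK,
	 * the RST takes its sequence number from that ACK value; otherwise the
	 * RST acks everything the segment occupied (SEQ + SYN + FIN + data).
	 */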
634 if (th->ack) {
635 rep.th.seq = th->ack_seq;
636 } else {
637 rep.th.ack = 1;
638 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
639 skb->len - (th->doff << 2));
640 }
641
642 memset(&arg, 0, sizeof(arg));
643 arg.iov[0].iov_base = (unsigned char *)&rep;
644 arg.iov[0].iov_len = sizeof(rep.th);
645
646 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
647 #ifdef CONFIG_TCP_MD5SIG
648 rcu_read_lock();
649 hash_location = tcp_parse_md5sig_option(th);
650 if (sk && sk_fullsock(sk)) {
651 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
652 &ip_hdr(skb)->saddr, AF_INET);
653 } else if (hash_location) {
654 /*
655 * The active side is lost. Try to find the listening socket through
656 * the source port, and then find the MD5 key through the listening socket.
657 * We do not lose security here:
658 * the incoming packet is checked against the MD5 hash of the found key,
659 * and no RST is generated if the MD5 hash doesn't match.
660 */
661 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
662 ip_hdr(skb)->saddr,
663 th->source, ip_hdr(skb)->daddr,
664 ntohs(th->source), inet_iif(skb));
665 /* don't send an RST if we can't find a key */
666 if (!sk1)
667 goto out;
668
669 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
670 &ip_hdr(skb)->saddr, AF_INET);
671 if (!key)
672 goto out;
673
674
675 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
676 if (genhash || memcmp(hash_location, newhash, 16) != 0)
677 goto out;
678
679 }
680
681 if (key) {
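		/* MD5 signature option: kind 19, length 18 (16-byte digest),
		 * preceded by two NOPs so the option block stays 32-bit
		 * aligned; the digest is written into rep.opt[1..4] below.
		 */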
682 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
683 (TCPOPT_NOP << 16) |
684 (TCPOPT_MD5SIG << 8) |
685 TCPOLEN_MD5SIG);
686 /* Update length and the length the header thinks exists */
687 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
688 rep.th.doff = arg.iov[0].iov_len / 4;
689
690 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
691 key, ip_hdr(skb)->saddr,
692 ip_hdr(skb)->daddr, &rep.th);
693 }
694 #endif
695 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
696 ip_hdr(skb)->saddr, /* XXX */
697 arg.iov[0].iov_len, IPPROTO_TCP, 0);
698 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
699 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
700
701 /* When the socket is gone, all binding information is lost and
702 * routing might fail in this case. No choice here: if we choose to force
703 * the input interface, we will misroute in case of an asymmetric route.
704 */
705 if (sk)
706 arg.bound_dev_if = sk->sk_bound_dev_if;
707
708 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
709 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
710
711 arg.tos = ip_hdr(skb)->tos;
712 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
713 local_bh_disable();
714 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
715 skb, &TCP_SKB_CB(skb)->header.h4.opt,
716 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
717 &arg, arg.iov[0].iov_len);
718
719 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
720 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
721 local_bh_enable();
722
723 #ifdef CONFIG_TCP_MD5SIG
724 out:
725 rcu_read_unlock();
726 #endif
727 }
728
729 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
730 outside socket context, is certainly ugly. What can I do?
731 */
732
733 static void tcp_v4_send_ack(const struct sock *sk,
734 struct sk_buff *skb, u32 seq, u32 ack,
735 u32 win, u32 tsval, u32 tsecr, int oif,
736 struct tcp_md5sig_key *key,
737 int reply_flags, u8 tos)
738 {
739 const struct tcphdr *th = tcp_hdr(skb);
740 struct {
741 struct tcphdr th;
742 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
743 #ifdef CONFIG_TCP_MD5SIG
744 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
745 #endif
746 ];
747 } rep;
748 struct net *net = sock_net(sk);
749 struct ip_reply_arg arg;
750
751 memset(&rep.th, 0, sizeof(struct tcphdr));
752 memset(&arg, 0, sizeof(arg));
753
754 arg.iov[0].iov_base = (unsigned char *)&rep;
755 arg.iov[0].iov_len = sizeof(rep.th);
756 if (tsecr) {
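		/* Timestamp option: two NOPs for alignment, then kind 8,
		 * length 10, followed by TSval and TSecr - 12 bytes in all
		 * (TCPOLEN_TSTAMP_ALIGNED).
		 */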
757 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
758 (TCPOPT_TIMESTAMP << 8) |
759 TCPOLEN_TIMESTAMP);
760 rep.opt[1] = htonl(tsval);
761 rep.opt[2] = htonl(tsecr);
762 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
763 }
764
765 /* Swap the send and the receive. */
766 rep.th.dest = th->source;
767 rep.th.source = th->dest;
768 rep.th.doff = arg.iov[0].iov_len / 4;
769 rep.th.seq = htonl(seq);
770 rep.th.ack_seq = htonl(ack);
771 rep.th.ack = 1;
772 rep.th.window = htons(win);
773
774 #ifdef CONFIG_TCP_MD5SIG
775 if (key) {
776 int offset = (tsecr) ? 3 : 0;
777
778 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
779 (TCPOPT_NOP << 16) |
780 (TCPOPT_MD5SIG << 8) |
781 TCPOLEN_MD5SIG);
782 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
783 rep.th.doff = arg.iov[0].iov_len/4;
784
785 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
786 key, ip_hdr(skb)->saddr,
787 ip_hdr(skb)->daddr, &rep.th);
788 }
789 #endif
790 arg.flags = reply_flags;
791 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
792 ip_hdr(skb)->saddr, /* XXX */
793 arg.iov[0].iov_len, IPPROTO_TCP, 0);
794 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
795 if (oif)
796 arg.bound_dev_if = oif;
797 arg.tos = tos;
798 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
799 local_bh_disable();
800 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
801 skb, &TCP_SKB_CB(skb)->header.h4.opt,
802 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
803 &arg, arg.iov[0].iov_len);
804
805 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
806 local_bh_enable();
807 }
808
809 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
810 {
811 struct inet_timewait_sock *tw = inet_twsk(sk);
812 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
813
814 tcp_v4_send_ack(sk, skb,
815 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
816 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
817 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
818 tcptw->tw_ts_recent,
819 tw->tw_bound_dev_if,
820 tcp_twsk_md5_key(tcptw),
821 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
822 tw->tw_tos
823 );
824
825 inet_twsk_put(tw);
826 }
827
828 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
829 struct request_sock *req)
830 {
831 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
832 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
833 */
834 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
835 tcp_sk(sk)->snd_nxt;
836
837 /* RFC 7323 2.3
838 * The window field (SEG.WND) of every outgoing segment, with the
839 * exception of <SYN> segments, MUST be right-shifted by
840 * Rcv.Wind.Shift bits:
841 */
842 tcp_v4_send_ack(sk, skb, seq,
843 tcp_rsk(req)->rcv_nxt,
844 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
845 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
846 req->ts_recent,
847 0,
848 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
849 AF_INET),
850 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
851 ip_hdr(skb)->tos);
852 }
853
854 /*
855 * Send a SYN-ACK after having received a SYN.
856 * This still operates on a request_sock only, not on a big
857 * socket.
858 */
859 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
860 struct flowi *fl,
861 struct request_sock *req,
862 struct tcp_fastopen_cookie *foc,
863 enum tcp_synack_type synack_type)
864 {
865 const struct inet_request_sock *ireq = inet_rsk(req);
866 struct flowi4 fl4;
867 int err = -1;
868 struct sk_buff *skb;
869
870 /* First, grab a route. */
871 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
872 return -1;
873
874 skb = tcp_make_synack(sk, dst, req, foc, synack_type);
875
876 if (skb) {
877 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
878
879 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
880 ireq->ir_rmt_addr,
881 rcu_dereference(ireq->ireq_opt));
882 err = net_xmit_eval(err);
883 }
884
885 return err;
886 }
887
888 /*
889 * IPv4 request_sock destructor.
890 */
891 static void tcp_v4_reqsk_destructor(struct request_sock *req)
892 {
893 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
894 }
895
896 #ifdef CONFIG_TCP_MD5SIG
897 /*
898 * RFC2385 MD5 checksumming requires a mapping of
899 * IP address->MD5 Key.
900 * We need to maintain these in the sk structure.
901 */
902
903 /* Find the Key structure for an address. */
904 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
905 const union tcp_md5_addr *addr,
906 int family)
907 {
908 const struct tcp_sock *tp = tcp_sk(sk);
909 struct tcp_md5sig_key *key;
910 const struct tcp_md5sig_info *md5sig;
911 __be32 mask;
912 struct tcp_md5sig_key *best_match = NULL;
913 bool match;
914
915 /* caller either holds rcu_read_lock() or socket lock */
916 md5sig = rcu_dereference_check(tp->md5sig_info,
917 lockdep_sock_is_held(sk));
918 if (!md5sig)
919 return NULL;
920
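	/* Walk all configured keys and keep the most specific match
	 * (longest prefix) for this address and family.
	 */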
921 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
922 if (key->family != family)
923 continue;
924
925 if (family == AF_INET) {
926 mask = inet_make_mask(key->prefixlen);
927 match = (key->addr.a4.s_addr & mask) ==
928 (addr->a4.s_addr & mask);
929 #if IS_ENABLED(CONFIG_IPV6)
930 } else if (family == AF_INET6) {
931 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
932 key->prefixlen);
933 #endif
934 } else {
935 match = false;
936 }
937
938 if (match && (!best_match ||
939 key->prefixlen > best_match->prefixlen))
940 best_match = key;
941 }
942 return best_match;
943 }
944 EXPORT_SYMBOL(tcp_md5_do_lookup);
945
946 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
947 const union tcp_md5_addr *addr,
948 int family, u8 prefixlen)
949 {
950 const struct tcp_sock *tp = tcp_sk(sk);
951 struct tcp_md5sig_key *key;
952 unsigned int size = sizeof(struct in_addr);
953 const struct tcp_md5sig_info *md5sig;
954
955 /* caller either holds rcu_read_lock() or socket lock */
956 md5sig = rcu_dereference_check(tp->md5sig_info,
957 lockdep_sock_is_held(sk));
958 if (!md5sig)
959 return NULL;
960 #if IS_ENABLED(CONFIG_IPV6)
961 if (family == AF_INET6)
962 size = sizeof(struct in6_addr);
963 #endif
964 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
965 if (key->family != family)
966 continue;
967 if (!memcmp(&key->addr, addr, size) &&
968 key->prefixlen == prefixlen)
969 return key;
970 }
971 return NULL;
972 }
973
974 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
975 const struct sock *addr_sk)
976 {
977 const union tcp_md5_addr *addr;
978
979 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
980 return tcp_md5_do_lookup(sk, addr, AF_INET);
981 }
982 EXPORT_SYMBOL(tcp_v4_md5_lookup);
983
984 /* This can be called on a newly created socket, from other files */
985 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
986 int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
987 gfp_t gfp)
988 {
989 /* Add Key to the list */
990 struct tcp_md5sig_key *key;
991 struct tcp_sock *tp = tcp_sk(sk);
992 struct tcp_md5sig_info *md5sig;
993
994 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
995 if (key) {
996 /* Pre-existing entry - just update that one. */
997 memcpy(key->key, newkey, newkeylen);
998 key->keylen = newkeylen;
999 return 0;
1000 }
1001
1002 md5sig = rcu_dereference_protected(tp->md5sig_info,
1003 lockdep_sock_is_held(sk));
1004 if (!md5sig) {
1005 md5sig = kmalloc(sizeof(*md5sig), gfp);
1006 if (!md5sig)
1007 return -ENOMEM;
1008
1009 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1010 INIT_HLIST_HEAD(&md5sig->head);
1011 rcu_assign_pointer(tp->md5sig_info, md5sig);
1012 }
1013
1014 key = sock_kmalloc(sk, sizeof(*key), gfp);
1015 if (!key)
1016 return -ENOMEM;
1017 if (!tcp_alloc_md5sig_pool()) {
1018 sock_kfree_s(sk, key, sizeof(*key));
1019 return -ENOMEM;
1020 }
1021
1022 memcpy(key->key, newkey, newkeylen);
1023 key->keylen = newkeylen;
1024 key->family = family;
1025 key->prefixlen = prefixlen;
1026 memcpy(&key->addr, addr,
1027 (family == AF_INET6) ? sizeof(struct in6_addr) :
1028 sizeof(struct in_addr));
1029 hlist_add_head_rcu(&key->node, &md5sig->head);
1030 return 0;
1031 }
1032 EXPORT_SYMBOL(tcp_md5_do_add);
1033
1034 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1035 u8 prefixlen)
1036 {
1037 struct tcp_md5sig_key *key;
1038
1039 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1040 if (!key)
1041 return -ENOENT;
1042 hlist_del_rcu(&key->node);
1043 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1044 kfree_rcu(key, rcu);
1045 return 0;
1046 }
1047 EXPORT_SYMBOL(tcp_md5_do_del);
1048
1049 static void tcp_clear_md5_list(struct sock *sk)
1050 {
1051 struct tcp_sock *tp = tcp_sk(sk);
1052 struct tcp_md5sig_key *key;
1053 struct hlist_node *n;
1054 struct tcp_md5sig_info *md5sig;
1055
1056 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1057
1058 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1059 hlist_del_rcu(&key->node);
1060 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1061 kfree_rcu(key, rcu);
1062 }
1063 }
1064
1065 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1066 char __user *optval, int optlen)
1067 {
1068 struct tcp_md5sig cmd;
1069 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1070 u8 prefixlen = 32;
1071
1072 if (optlen < sizeof(cmd))
1073 return -EINVAL;
1074
1075 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1076 return -EFAULT;
1077
1078 if (sin->sin_family != AF_INET)
1079 return -EINVAL;
1080
1081 if (optname == TCP_MD5SIG_EXT &&
1082 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1083 prefixlen = cmd.tcpm_prefixlen;
1084 if (prefixlen > 32)
1085 return -EINVAL;
1086 }
1087
1088 if (!cmd.tcpm_keylen)
1089 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1090 AF_INET, prefixlen);
1091
1092 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1093 return -EINVAL;
1094
1095 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1096 AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1097 GFP_KERNEL);
1098 }
1099
1100 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1101 __be32 daddr, __be32 saddr,
1102 const struct tcphdr *th, int nbytes)
1103 {
1104 struct tcp4_pseudohdr *bp;
1105 struct scatterlist sg;
1106 struct tcphdr *_th;
1107
1108 bp = hp->scratch;
1109 bp->saddr = saddr;
1110 bp->daddr = daddr;
1111 bp->pad = 0;
1112 bp->protocol = IPPROTO_TCP;
1113 bp->len = cpu_to_be16(nbytes);
1114
1115 _th = (struct tcphdr *)(bp + 1);
1116 memcpy(_th, th, sizeof(*th));
1117 _th->check = 0;
1118
1119 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1120 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1121 sizeof(*bp) + sizeof(*th));
1122 return crypto_ahash_update(hp->md5_req);
1123 }
1124
1125 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1126 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1127 {
1128 struct tcp_md5sig_pool *hp;
1129 struct ahash_request *req;
1130
1131 hp = tcp_get_md5sig_pool();
1132 if (!hp)
1133 goto clear_hash_noput;
1134 req = hp->md5_req;
1135
1136 if (crypto_ahash_init(req))
1137 goto clear_hash;
1138 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1139 goto clear_hash;
1140 if (tcp_md5_hash_key(hp, key))
1141 goto clear_hash;
1142 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1143 if (crypto_ahash_final(req))
1144 goto clear_hash;
1145
1146 tcp_put_md5sig_pool();
1147 return 0;
1148
1149 clear_hash:
1150 tcp_put_md5sig_pool();
1151 clear_hash_noput:
1152 memset(md5_hash, 0, 16);
1153 return 1;
1154 }
1155
1156 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1157 const struct sock *sk,
1158 const struct sk_buff *skb)
1159 {
1160 struct tcp_md5sig_pool *hp;
1161 struct ahash_request *req;
1162 const struct tcphdr *th = tcp_hdr(skb);
1163 __be32 saddr, daddr;
1164
1165 if (sk) { /* valid for establish/request sockets */
1166 saddr = sk->sk_rcv_saddr;
1167 daddr = sk->sk_daddr;
1168 } else {
1169 const struct iphdr *iph = ip_hdr(skb);
1170 saddr = iph->saddr;
1171 daddr = iph->daddr;
1172 }
1173
1174 hp = tcp_get_md5sig_pool();
1175 if (!hp)
1176 goto clear_hash_noput;
1177 req = hp->md5_req;
1178
1179 if (crypto_ahash_init(req))
1180 goto clear_hash;
1181
1182 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1183 goto clear_hash;
1184 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1185 goto clear_hash;
1186 if (tcp_md5_hash_key(hp, key))
1187 goto clear_hash;
1188 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1189 if (crypto_ahash_final(req))
1190 goto clear_hash;
1191
1192 tcp_put_md5sig_pool();
1193 return 0;
1194
1195 clear_hash:
1196 tcp_put_md5sig_pool();
1197 clear_hash_noput:
1198 memset(md5_hash, 0, 16);
1199 return 1;
1200 }
1201 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1202
1203 #endif
1204
1205 /* Called with rcu_read_lock() */
1206 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1207 const struct sk_buff *skb)
1208 {
1209 #ifdef CONFIG_TCP_MD5SIG
1210 /*
1211 * This gets called for each TCP segment that arrives
1212 * so we want to be efficient.
1213 * We have 3 drop cases:
1214 * o No MD5 hash and one expected.
1215 * o MD5 hash and we're not expecting one.
1216 * o MD5 hash and it's wrong.
1217 */
1218 const __u8 *hash_location = NULL;
1219 struct tcp_md5sig_key *hash_expected;
1220 const struct iphdr *iph = ip_hdr(skb);
1221 const struct tcphdr *th = tcp_hdr(skb);
1222 int genhash;
1223 unsigned char newhash[16];
1224
1225 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1226 AF_INET);
1227 hash_location = tcp_parse_md5sig_option(th);
1228
1229 /* We've parsed the options - do we have a hash? */
1230 if (!hash_expected && !hash_location)
1231 return false;
1232
1233 if (hash_expected && !hash_location) {
1234 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1235 return true;
1236 }
1237
1238 if (!hash_expected && hash_location) {
1239 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1240 return true;
1241 }
1242
1243 /* Okay, so this is hash_expected and hash_location -
1244 * so we need to calculate the checksum.
1245 */
1246 genhash = tcp_v4_md5_hash_skb(newhash,
1247 hash_expected,
1248 NULL, skb);
1249
1250 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1251 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1252 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1253 &iph->saddr, ntohs(th->source),
1254 &iph->daddr, ntohs(th->dest),
1255 genhash ? " tcp_v4_calc_md5_hash failed"
1256 : "");
1257 return true;
1258 }
1259 return false;
1260 #endif
1261 return false;
1262 }
1263
1264 static void tcp_v4_init_req(struct request_sock *req,
1265 const struct sock *sk_listener,
1266 struct sk_buff *skb)
1267 {
1268 struct inet_request_sock *ireq = inet_rsk(req);
1269
1270 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1271 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1272 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(skb));
1273 }
1274
1275 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1276 struct flowi *fl,
1277 const struct request_sock *req)
1278 {
1279 return inet_csk_route_req(sk, &fl->u.ip4, req);
1280 }
1281
1282 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1283 .family = PF_INET,
1284 .obj_size = sizeof(struct tcp_request_sock),
1285 .rtx_syn_ack = tcp_rtx_synack,
1286 .send_ack = tcp_v4_reqsk_send_ack,
1287 .destructor = tcp_v4_reqsk_destructor,
1288 .send_reset = tcp_v4_send_reset,
1289 .syn_ack_timeout = tcp_syn_ack_timeout,
1290 };
1291
1292 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1293 .mss_clamp = TCP_MSS_DEFAULT,
1294 #ifdef CONFIG_TCP_MD5SIG
1295 .req_md5_lookup = tcp_v4_md5_lookup,
1296 .calc_md5_hash = tcp_v4_md5_hash_skb,
1297 #endif
1298 .init_req = tcp_v4_init_req,
1299 #ifdef CONFIG_SYN_COOKIES
1300 .cookie_init_seq = cookie_v4_init_sequence,
1301 #endif
1302 .route_req = tcp_v4_route_req,
1303 .init_seq = tcp_v4_init_seq,
1304 .init_ts_off = tcp_v4_init_ts_off,
1305 .send_synack = tcp_v4_send_synack,
1306 };
1307
1308 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1309 {
1310 /* Never answer SYNs sent to broadcast or multicast */
1311 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1312 goto drop;
1313
1314 return tcp_conn_request(&tcp_request_sock_ops,
1315 &tcp_request_sock_ipv4_ops, sk, skb);
1316
1317 drop:
1318 tcp_listendrop(sk);
1319 return 0;
1320 }
1321 EXPORT_SYMBOL(tcp_v4_conn_request);
1322
1323
1324 /*
1325 * The three way handshake has completed - we got a valid synack -
1326 * now create the new socket.
1327 */
1328 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1329 struct request_sock *req,
1330 struct dst_entry *dst,
1331 struct request_sock *req_unhash,
1332 bool *own_req)
1333 {
1334 struct inet_request_sock *ireq;
1335 struct inet_sock *newinet;
1336 struct tcp_sock *newtp;
1337 struct sock *newsk;
1338 #ifdef CONFIG_TCP_MD5SIG
1339 struct tcp_md5sig_key *key;
1340 #endif
1341 struct ip_options_rcu *inet_opt;
1342
1343 if (sk_acceptq_is_full(sk))
1344 goto exit_overflow;
1345
1346 newsk = tcp_create_openreq_child(sk, req, skb);
1347 if (!newsk)
1348 goto exit_nonewsk;
1349
1350 newsk->sk_gso_type = SKB_GSO_TCPV4;
1351 inet_sk_rx_dst_set(newsk, skb);
1352
1353 newtp = tcp_sk(newsk);
1354 newinet = inet_sk(newsk);
1355 ireq = inet_rsk(req);
1356 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1357 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1358 newsk->sk_bound_dev_if = ireq->ir_iif;
1359 newinet->inet_saddr = ireq->ir_loc_addr;
1360 inet_opt = rcu_dereference(ireq->ireq_opt);
1361 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1362 newinet->mc_index = inet_iif(skb);
1363 newinet->mc_ttl = ip_hdr(skb)->ttl;
1364 newinet->rcv_tos = ip_hdr(skb)->tos;
1365 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1366 if (inet_opt)
1367 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1368 newinet->inet_id = newtp->write_seq ^ jiffies;
1369
1370 if (!dst) {
1371 dst = inet_csk_route_child_sock(sk, newsk, req);
1372 if (!dst)
1373 goto put_and_exit;
1374 } else {
1375 /* syncookie case : see end of cookie_v4_check() */
1376 }
1377 sk_setup_caps(newsk, dst);
1378
1379 tcp_ca_openreq_child(newsk, dst);
1380
1381 tcp_sync_mss(newsk, dst_mtu(dst));
1382 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1383
1384 tcp_initialize_rcv_mss(newsk);
1385
1386 #ifdef CONFIG_TCP_MD5SIG
1387 /* Copy over the MD5 key from the original socket */
1388 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1389 AF_INET);
1390 if (key) {
1391 /*
1392 * We're using one, so create a matching key
1393 * on the newsk structure. If we fail to get
1394 * memory, then we end up not copying the key
1395 * across. Shucks.
1396 */
1397 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1398 AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1399 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1400 }
1401 #endif
1402
1403 if (__inet_inherit_port(sk, newsk) < 0)
1404 goto put_and_exit;
1405 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1406 if (likely(*own_req)) {
1407 tcp_move_syn(newtp, req);
1408 ireq->ireq_opt = NULL;
1409 } else {
1410 newinet->inet_opt = NULL;
1411 }
1412 return newsk;
1413
1414 exit_overflow:
1415 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1416 exit_nonewsk:
1417 dst_release(dst);
1418 exit:
1419 tcp_listendrop(sk);
1420 return NULL;
1421 put_and_exit:
1422 newinet->inet_opt = NULL;
1423 inet_csk_prepare_forced_close(newsk);
1424 tcp_done(newsk);
1425 goto exit;
1426 }
1427 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1428
1429 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1430 {
1431 #ifdef CONFIG_SYN_COOKIES
1432 const struct tcphdr *th = tcp_hdr(skb);
1433
1434 if (!th->syn)
1435 sk = cookie_v4_check(sk, skb);
1436 #endif
1437 return sk;
1438 }
1439
1440 /* The socket must have its spinlock held when we get
1441 * here, unless it is a TCP_LISTEN socket.
1442 *
1443 * We have a potential double-lock case here, so even when
1444 * doing backlog processing we use the BH locking scheme.
1445 * This is because we cannot sleep with the original spinlock
1446 * held.
1447 */
1448 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1449 {
1450 struct sock *rsk;
1451
1452 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1453 struct dst_entry *dst = sk->sk_rx_dst;
1454
1455 sock_rps_save_rxhash(sk, skb);
1456 sk_mark_napi_id(sk, skb);
1457 if (dst) {
1458 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1459 !dst->ops->check(dst, 0)) {
1460 dst_release(dst);
1461 sk->sk_rx_dst = NULL;
1462 }
1463 }
1464 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1465 return 0;
1466 }
1467
1468 if (tcp_checksum_complete(skb))
1469 goto csum_err;
1470
1471 if (sk->sk_state == TCP_LISTEN) {
1472 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1473
1474 if (!nsk)
1475 goto discard;
1476 if (nsk != sk) {
1477 if (tcp_child_process(sk, nsk, skb)) {
1478 rsk = nsk;
1479 goto reset;
1480 }
1481 return 0;
1482 }
1483 } else
1484 sock_rps_save_rxhash(sk, skb);
1485
1486 if (tcp_rcv_state_process(sk, skb)) {
1487 rsk = sk;
1488 goto reset;
1489 }
1490 return 0;
1491
1492 reset:
1493 tcp_v4_send_reset(rsk, skb);
1494 discard:
1495 kfree_skb(skb);
1496 /* Be careful here. If this function gets more complicated and
1497 * gcc suffers from register pressure on the x86, sk (in %ebx)
1498 * might be destroyed here. This current version compiles correctly,
1499 * but you have been warned.
1500 */
1501 return 0;
1502
1503 csum_err:
1504 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1505 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1506 goto discard;
1507 }
1508 EXPORT_SYMBOL(tcp_v4_do_rcv);
1509
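/* Early demux: look up an established socket before the routing decision so
 * its cached rx dst can be reused (when still valid for this interface),
 * sparing a full route lookup for established flows.
 */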
1510 int tcp_v4_early_demux(struct sk_buff *skb)
1511 {
1512 const struct iphdr *iph;
1513 const struct tcphdr *th;
1514 struct sock *sk;
1515
1516 if (skb->pkt_type != PACKET_HOST)
1517 return 0;
1518
1519 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1520 return 0;
1521
1522 iph = ip_hdr(skb);
1523 th = tcp_hdr(skb);
1524
1525 if (th->doff < sizeof(struct tcphdr) / 4)
1526 return 0;
1527
1528 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1529 iph->saddr, th->source,
1530 iph->daddr, ntohs(th->dest),
1531 skb->skb_iif);
1532 if (sk) {
1533 skb->sk = sk;
1534 skb->destructor = sock_edemux;
1535 if (sk_fullsock(sk)) {
1536 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1537
1538 if (dst)
1539 dst = dst_check(dst, 0);
1540 if (dst &&
1541 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1542 skb_dst_set_noref(skb, dst);
1543 }
1544 }
1545 return 0;
1546 }
1547
1548 /* Packet is added to VJ-style prequeue for processing in process
1549 * context, if a reader task is waiting. Apparently, this exciting
1550 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1551 * failed somewhere. Latency? Burstiness? Well, at least now we will
1552 * see why it failed. 8)8) --ANK
1553 *
1554 */
1555 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1556 {
1557 struct tcp_sock *tp = tcp_sk(sk);
1558
1559 if (sysctl_tcp_low_latency || !tp->ucopy.task)
1560 return false;
1561
1562 if (skb->len <= tcp_hdrlen(skb) &&
1563 skb_queue_len(&tp->ucopy.prequeue) == 0)
1564 return false;
1565
1566 /* Before escaping RCU protected region, we need to take care of skb
1567 * dst. Prequeue is only enabled for established sockets.
1568 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
1569 * Instead of doing full sk_rx_dst validity here, let's perform
1570 * an optimistic check.
1571 */
1572 if (likely(sk->sk_rx_dst))
1573 skb_dst_drop(skb);
1574 else
1575 skb_dst_force_safe(skb);
1576
1577 __skb_queue_tail(&tp->ucopy.prequeue, skb);
1578 tp->ucopy.memory += skb->truesize;
1579 if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1580 tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
1581 struct sk_buff *skb1;
1582
1583 BUG_ON(sock_owned_by_user(sk));
1584 __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1585 skb_queue_len(&tp->ucopy.prequeue));
1586
1587 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1588 sk_backlog_rcv(sk, skb1);
1589
1590 tp->ucopy.memory = 0;
1591 } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1592 wake_up_interruptible_sync_poll(sk_sleep(sk),
1593 POLLIN | POLLRDNORM | POLLRDBAND);
1594 if (!inet_csk_ack_scheduled(sk))
1595 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1596 (3 * tcp_rto_min(sk)) / 4,
1597 TCP_RTO_MAX);
1598 }
1599 return true;
1600 }
1601 EXPORT_SYMBOL(tcp_prequeue);
1602
1603 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1604 {
1605 u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1606
1607 /* Only socket owner can try to collapse/prune rx queues
1608 * to reduce memory overhead, so add a little headroom here.
1609 * Only a few socket backlogs are likely to be non-empty concurrently.
1610 */
1611 limit += 64*1024;
1612
1613 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1614 * we can fix skb->truesize to its real value to avoid future drops.
1615 * This is valid because skb is not yet charged to the socket.
1616 * It has been noticed that pure SACK packets were sometimes dropped
1617 * (if cooked by drivers without the copybreak feature).
1618 */
1619 skb_condense(skb);
1620
1621 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1622 bh_unlock_sock(sk);
1623 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1624 return true;
1625 }
1626 return false;
1627 }
1628 EXPORT_SYMBOL(tcp_add_backlog);
1629
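/* Run the socket's BPF filter; it may trim the skb down to the TCP header,
 * so shrink end_seq by however many payload bytes were removed.
 */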
1630 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1631 {
1632 struct tcphdr *th = (struct tcphdr *)skb->data;
1633 unsigned int eaten = skb->len;
1634 int err;
1635
1636 err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1637 if (!err) {
1638 eaten -= skb->len;
1639 TCP_SKB_CB(skb)->end_seq -= eaten;
1640 }
1641 return err;
1642 }
1643 EXPORT_SYMBOL(tcp_filter);
1644
1645 /*
1646 * From tcp_input.c
1647 */
1648
1649 int tcp_v4_rcv(struct sk_buff *skb)
1650 {
1651 struct net *net = dev_net(skb->dev);
1652 const struct iphdr *iph;
1653 const struct tcphdr *th;
1654 bool refcounted;
1655 struct sock *sk;
1656 int ret;
1657
1658 if (skb->pkt_type != PACKET_HOST)
1659 goto discard_it;
1660
1661 /* Count it even if it's bad */
1662 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1663
1664 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1665 goto discard_it;
1666
1667 th = (const struct tcphdr *)skb->data;
1668
1669 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1670 goto bad_packet;
1671 if (!pskb_may_pull(skb, th->doff * 4))
1672 goto discard_it;
1673
1674 /* An explanation is required here, I think.
1675 * Packet length and doff are validated by header prediction,
1676 * provided the case of th->doff==0 is eliminated.
1677 * So, we defer the checks. */
1678
1679 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1680 goto csum_error;
1681
1682 th = (const struct tcphdr *)skb->data;
1683 iph = ip_hdr(skb);
1684 /* This is tricky: we move the IPCB to its correct location inside TCP_SKB_CB().
1685 * barrier() makes sure the compiler won't play fool^Waliasing games.
1686 */
1687 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1688 sizeof(struct inet_skb_parm));
1689 barrier();
1690
1691 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1692 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1693 skb->len - th->doff * 4);
1694 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1695 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1696 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1697 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1698 TCP_SKB_CB(skb)->sacked = 0;
1699
1700 lookup:
1701 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1702 th->dest, &refcounted);
1703 if (!sk)
1704 goto no_tcp_socket;
1705
1706 process:
1707 if (sk->sk_state == TCP_TIME_WAIT)
1708 goto do_time_wait;
1709
1710 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1711 struct request_sock *req = inet_reqsk(sk);
1712 struct sock *nsk;
1713
1714 sk = req->rsk_listener;
1715 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1716 sk_drops_add(sk, skb);
1717 reqsk_put(req);
1718 goto discard_it;
1719 }
1720 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1721 inet_csk_reqsk_queue_drop_and_put(sk, req);
1722 goto lookup;
1723 }
1724 /* We own a reference on the listener, increase it again
1725 * as we might lose it too soon.
1726 */
1727 sock_hold(sk);
1728 refcounted = true;
1729 nsk = NULL;
1730 if (!tcp_filter(sk, skb))
1731 nsk = tcp_check_req(sk, skb, req, false);
1732 if (!nsk) {
1733 reqsk_put(req);
1734 goto discard_and_relse;
1735 }
1736 if (nsk == sk) {
1737 reqsk_put(req);
1738 } else if (tcp_child_process(sk, nsk, skb)) {
1739 tcp_v4_send_reset(nsk, skb);
1740 goto discard_and_relse;
1741 } else {
1742 sock_put(sk);
1743 return 0;
1744 }
1745 }
1746 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1747 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1748 goto discard_and_relse;
1749 }
1750
1751 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1752 goto discard_and_relse;
1753
1754 if (tcp_v4_inbound_md5_hash(sk, skb))
1755 goto discard_and_relse;
1756
1757 nf_reset(skb);
1758
1759 if (tcp_filter(sk, skb))
1760 goto discard_and_relse;
1761 th = (const struct tcphdr *)skb->data;
1762 iph = ip_hdr(skb);
1763
1764 skb->dev = NULL;
1765
1766 if (sk->sk_state == TCP_LISTEN) {
1767 ret = tcp_v4_do_rcv(sk, skb);
1768 goto put_and_return;
1769 }
1770
1771 sk_incoming_cpu_update(sk);
1772
1773 bh_lock_sock_nested(sk);
1774 tcp_segs_in(tcp_sk(sk), skb);
1775 ret = 0;
1776 if (!sock_owned_by_user(sk)) {
1777 if (!tcp_prequeue(sk, skb))
1778 ret = tcp_v4_do_rcv(sk, skb);
1779 } else if (tcp_add_backlog(sk, skb)) {
1780 goto discard_and_relse;
1781 }
1782 bh_unlock_sock(sk);
1783
1784 put_and_return:
1785 if (refcounted)
1786 sock_put(sk);
1787
1788 return ret;
1789
1790 no_tcp_socket:
1791 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1792 goto discard_it;
1793
1794 if (tcp_checksum_complete(skb)) {
1795 csum_error:
1796 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1797 bad_packet:
1798 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1799 } else {
1800 tcp_v4_send_reset(NULL, skb);
1801 }
1802
1803 discard_it:
1804 /* Discard frame. */
1805 kfree_skb(skb);
1806 return 0;
1807
1808 discard_and_relse:
1809 sk_drops_add(sk, skb);
1810 if (refcounted)
1811 sock_put(sk);
1812 goto discard_it;
1813
1814 do_time_wait:
1815 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1816 inet_twsk_put(inet_twsk(sk));
1817 goto discard_it;
1818 }
1819
1820 if (tcp_checksum_complete(skb)) {
1821 inet_twsk_put(inet_twsk(sk));
1822 goto csum_error;
1823 }
1824 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1825 case TCP_TW_SYN: {
1826 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1827 &tcp_hashinfo, skb,
1828 __tcp_hdrlen(th),
1829 iph->saddr, th->source,
1830 iph->daddr, th->dest,
1831 inet_iif(skb));
1832 if (sk2) {
1833 inet_twsk_deschedule_put(inet_twsk(sk));
1834 sk = sk2;
1835 refcounted = false;
1836 goto process;
1837 }
1838 /* Fall through to ACK */
1839 }
1840 case TCP_TW_ACK:
1841 tcp_v4_timewait_ack(sk, skb);
1842 break;
1843 case TCP_TW_RST:
1844 tcp_v4_send_reset(sk, skb);
1845 inet_twsk_deschedule_put(inet_twsk(sk));
1846 goto discard_it;
1847 case TCP_TW_SUCCESS:;
1848 }
1849 goto discard_it;
1850 }
1851
1852 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1853 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1854 .twsk_unique = tcp_twsk_unique,
1855 .twsk_destructor= tcp_twsk_destructor,
1856 };
1857
1858 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1859 {
1860 struct dst_entry *dst = skb_dst(skb);
1861
1862 if (dst && dst_hold_safe(dst)) {
1863 sk->sk_rx_dst = dst;
1864 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1865 }
1866 }
1867 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1868
1869 const struct inet_connection_sock_af_ops ipv4_specific = {
1870 .queue_xmit = ip_queue_xmit,
1871 .send_check = tcp_v4_send_check,
1872 .rebuild_header = inet_sk_rebuild_header,
1873 .sk_rx_dst_set = inet_sk_rx_dst_set,
1874 .conn_request = tcp_v4_conn_request,
1875 .syn_recv_sock = tcp_v4_syn_recv_sock,
1876 .net_header_len = sizeof(struct iphdr),
1877 .setsockopt = ip_setsockopt,
1878 .getsockopt = ip_getsockopt,
1879 .addr2sockaddr = inet_csk_addr2sockaddr,
1880 .sockaddr_len = sizeof(struct sockaddr_in),
1881 #ifdef CONFIG_COMPAT
1882 .compat_setsockopt = compat_ip_setsockopt,
1883 .compat_getsockopt = compat_ip_getsockopt,
1884 #endif
1885 .mtu_reduced = tcp_v4_mtu_reduced,
1886 };
1887 EXPORT_SYMBOL(ipv4_specific);
1888
1889 #ifdef CONFIG_TCP_MD5SIG
1890 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1891 .md5_lookup = tcp_v4_md5_lookup,
1892 .calc_md5_hash = tcp_v4_md5_hash_skb,
1893 .md5_parse = tcp_v4_parse_md5_keys,
1894 };
1895 #endif
1896
1897 /* NOTE: A lot of things are set to zero explicitly by the call to
1898 * sk_alloc(), so they need not be done here.
1899 */
1900 static int tcp_v4_init_sock(struct sock *sk)
1901 {
1902 struct inet_connection_sock *icsk = inet_csk(sk);
1903
1904 tcp_init_sock(sk);
1905
1906 icsk->icsk_af_ops = &ipv4_specific;
1907
1908 #ifdef CONFIG_TCP_MD5SIG
1909 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1910 #endif
1911
1912 return 0;
1913 }
1914
1915 void tcp_v4_destroy_sock(struct sock *sk)
1916 {
1917 struct tcp_sock *tp = tcp_sk(sk);
1918
1919 tcp_clear_xmit_timers(sk);
1920
1921 tcp_cleanup_congestion_control(sk);
1922
1923 tcp_cleanup_ulp(sk);
1924
1925 /* Clean up the write buffer. */
1926 tcp_write_queue_purge(sk);
1927
1928 /* Check if we want to disable active TFO */
1929 tcp_fastopen_active_disable_ofo_check(sk);
1930
1931 /* Cleans up our, hopefully empty, out_of_order_queue. */
1932 skb_rbtree_purge(&tp->out_of_order_queue);
1933
1934 #ifdef CONFIG_TCP_MD5SIG
1935 /* Clean up the MD5 key list, if any */
1936 if (tp->md5sig_info) {
1937 tcp_clear_md5_list(sk);
1938 kfree_rcu(tp->md5sig_info, rcu);
1939 tp->md5sig_info = NULL;
1940 }
1941 #endif
1942
1943 /* Clean prequeue, it must be empty really */
1944 __skb_queue_purge(&tp->ucopy.prequeue);
1945
1946 /* Clean up a referenced TCP bind bucket. */
1947 if (inet_csk(sk)->icsk_bind_hash)
1948 inet_put_port(sk);
1949
1950 BUG_ON(tp->fastopen_rsk);
1951
1952 /* In case the socket was aborted during a connect operation */
1953 tcp_free_fastopen_req(tp);
1954 tcp_saved_syn_free(tp);
1955
1956 sk_sockets_allocated_dec(sk);
1957 }
1958 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1959
1960 #ifdef CONFIG_PROC_FS
1961 /* Proc filesystem TCP sock list dumping. */
1962
1963 /*
1964 * Get the next listener socket following cur. If cur is NULL, get the first socket
1965 * starting from bucket given in st->bucket; when st->bucket is zero the
1966 * very first socket in the hash table is returned.
1967 */
1968 static void *listening_get_next(struct seq_file *seq, void *cur)
1969 {
1970 struct tcp_iter_state *st = seq->private;
1971 struct net *net = seq_file_net(seq);
1972 struct inet_listen_hashbucket *ilb;
1973 struct sock *sk = cur;
1974
1975 if (!sk) {
1976 get_head:
1977 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1978 spin_lock(&ilb->lock);
1979 sk = sk_head(&ilb->head);
1980 st->offset = 0;
1981 goto get_sk;
1982 }
1983 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1984 ++st->num;
1985 ++st->offset;
1986
1987 sk = sk_next(sk);
1988 get_sk:
1989 sk_for_each_from(sk) {
1990 if (!net_eq(sock_net(sk), net))
1991 continue;
1992 if (sk->sk_family == st->family)
1993 return sk;
1994 }
1995 spin_unlock(&ilb->lock);
1996 st->offset = 0;
1997 if (++st->bucket < INET_LHTABLE_SIZE)
1998 goto get_head;
1999 return NULL;
2000 }
2001
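/*
 * Return the listening socket at logical position *pos, walking the
 * listening hash from bucket 0.  *pos is decremented for each socket
 * skipped, so on a NULL return it tells the caller how far the walk
 * fell short.
 */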
2002 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2003 {
2004 struct tcp_iter_state *st = seq->private;
2005 void *rc;
2006
2007 st->bucket = 0;
2008 st->offset = 0;
2009 rc = listening_get_next(seq, NULL);
2010
2011 while (rc && *pos) {
2012 rc = listening_get_next(seq, rc);
2013 --*pos;
2014 }
2015 return rc;
2016 }
2017
2018 static inline bool empty_bucket(const struct tcp_iter_state *st)
2019 {
2020 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2021 }
2022
2023 /*
2024 * Get first established socket starting from bucket given in st->bucket.
2025 * If st->bucket is zero, the very first socket in the hash is returned.
2026 */
2027 static void *established_get_first(struct seq_file *seq)
2028 {
2029 struct tcp_iter_state *st = seq->private;
2030 struct net *net = seq_file_net(seq);
2031 void *rc = NULL;
2032
2033 st->offset = 0;
2034 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2035 struct sock *sk;
2036 struct hlist_nulls_node *node;
2037 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2038
2039 /* Lockless fast path for the common case of empty buckets */
2040 if (empty_bucket(st))
2041 continue;
2042
2043 spin_lock_bh(lock);
2044 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2045 if (sk->sk_family != st->family ||
2046 !net_eq(sock_net(sk), net)) {
2047 continue;
2048 }
2049 rc = sk;
2050 goto out;
2051 }
2052 spin_unlock_bh(lock);
2053 }
2054 out:
2055 return rc;
2056 }
2057
2058 static void *established_get_next(struct seq_file *seq, void *cur)
2059 {
2060 struct sock *sk = cur;
2061 struct hlist_nulls_node *node;
2062 struct tcp_iter_state *st = seq->private;
2063 struct net *net = seq_file_net(seq);
2064
2065 ++st->num;
2066 ++st->offset;
2067
2068 sk = sk_nulls_next(sk);
2069
2070 sk_nulls_for_each_from(sk, node) {
2071 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2072 return sk;
2073 }
2074
2075 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2076 ++st->bucket;
2077 return established_get_first(seq);
2078 }
2079
2080 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2081 {
2082 struct tcp_iter_state *st = seq->private;
2083 void *rc;
2084
2085 st->bucket = 0;
2086 rc = established_get_first(seq);
2087
2088 while (rc && pos) {
2089 rc = established_get_next(seq, rc);
2090 --pos;
2091 }
2092 return rc;
2093 }
2094
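/*
 * Position the iterator at logical index 'pos': the listening hash is
 * consumed first, and whatever remains of 'pos' is carried over into the
 * established hash (which also holds TIME_WAIT and NEW_SYN_RECV entries).
 */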
2095 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2096 {
2097 void *rc;
2098 struct tcp_iter_state *st = seq->private;
2099
2100 st->state = TCP_SEQ_STATE_LISTENING;
2101 rc = listening_get_idx(seq, &pos);
2102
2103 if (!rc) {
2104 st->state = TCP_SEQ_STATE_ESTABLISHED;
2105 rc = established_get_idx(seq, pos);
2106 }
2107
2108 return rc;
2109 }
2110
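/*
 * Try to resume from the bucket/offset recorded on the previous read of
 * the seq_file, so a large /proc/net/tcp dump does not restart the whole
 * hash walk for every read() chunk.  Returns NULL (forcing a full
 * re-positioning) if the saved position is no longer reachable.
 */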
2111 static void *tcp_seek_last_pos(struct seq_file *seq)
2112 {
2113 struct tcp_iter_state *st = seq->private;
2114 int offset = st->offset;
2115 int orig_num = st->num;
2116 void *rc = NULL;
2117
2118 switch (st->state) {
2119 case TCP_SEQ_STATE_LISTENING:
2120 if (st->bucket >= INET_LHTABLE_SIZE)
2121 break;
2122 st->state = TCP_SEQ_STATE_LISTENING;
2123 rc = listening_get_next(seq, NULL);
2124 while (offset-- && rc)
2125 rc = listening_get_next(seq, rc);
2126 if (rc)
2127 break;
2128 st->bucket = 0;
2129 st->state = TCP_SEQ_STATE_ESTABLISHED;
2130 /* Fallthrough */
2131 case TCP_SEQ_STATE_ESTABLISHED:
2132 if (st->bucket > tcp_hashinfo.ehash_mask)
2133 break;
2134 rc = established_get_first(seq);
2135 while (offset-- && rc)
2136 rc = established_get_next(seq, rc);
2137 }
2138
2139 st->num = orig_num;
2140
2141 return rc;
2142 }
2143
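/*
 * seq_file ->start(): resume from last_pos when possible, otherwise
 * reposition from scratch.  SEQ_START_TOKEN makes ->show() emit the
 * header line.
 */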
2144 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2145 {
2146 struct tcp_iter_state *st = seq->private;
2147 void *rc;
2148
2149 if (*pos && *pos == st->last_pos) {
2150 rc = tcp_seek_last_pos(seq);
2151 if (rc)
2152 goto out;
2153 }
2154
2155 st->state = TCP_SEQ_STATE_LISTENING;
2156 st->num = 0;
2157 st->bucket = 0;
2158 st->offset = 0;
2159 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2160
2161 out:
2162 st->last_pos = *pos;
2163 return rc;
2164 }
2165
2166 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2167 {
2168 struct tcp_iter_state *st = seq->private;
2169 void *rc = NULL;
2170
2171 if (v == SEQ_START_TOKEN) {
2172 rc = tcp_get_idx(seq, 0);
2173 goto out;
2174 }
2175
2176 switch (st->state) {
2177 case TCP_SEQ_STATE_LISTENING:
2178 rc = listening_get_next(seq, v);
2179 if (!rc) {
2180 st->state = TCP_SEQ_STATE_ESTABLISHED;
2181 st->bucket = 0;
2182 st->offset = 0;
2183 rc = established_get_first(seq);
2184 }
2185 break;
2186 case TCP_SEQ_STATE_ESTABLISHED:
2187 rc = established_get_next(seq, v);
2188 break;
2189 }
2190 out:
2191 ++*pos;
2192 st->last_pos = *pos;
2193 return rc;
2194 }
2195
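/* seq_file ->stop(): drop whichever bucket lock is still held, if any. */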
2196 static void tcp_seq_stop(struct seq_file *seq, void *v)
2197 {
2198 struct tcp_iter_state *st = seq->private;
2199
2200 switch (st->state) {
2201 case TCP_SEQ_STATE_LISTENING:
2202 if (v != SEQ_START_TOKEN)
2203 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2204 break;
2205 case TCP_SEQ_STATE_ESTABLISHED:
2206 if (v)
2207 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2208 break;
2209 }
2210 }
2211
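/*
 * Shared ->open() for the per-family /proc entries: seq_open_net()
 * allocates the iterator state, and the address family to dump is taken
 * from the afinfo attached to the proc entry.
 */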
2212 int tcp_seq_open(struct inode *inode, struct file *file)
2213 {
2214 struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2215 struct tcp_iter_state *s;
2216 int err;
2217
2218 err = seq_open_net(inode, file, &afinfo->seq_ops,
2219 sizeof(struct tcp_iter_state));
2220 if (err < 0)
2221 return err;
2222
2223 s = ((struct seq_file *)file->private_data)->private;
2224 s->family = afinfo->family;
2225 s->last_pos = 0;
2226 return 0;
2227 }
2228 EXPORT_SYMBOL(tcp_seq_open);
2229
2230 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2231 {
2232 int rc = 0;
2233 struct proc_dir_entry *p;
2234
2235 afinfo->seq_ops.start = tcp_seq_start;
2236 afinfo->seq_ops.next = tcp_seq_next;
2237 afinfo->seq_ops.stop = tcp_seq_stop;
2238
2239 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2240 afinfo->seq_fops, afinfo);
2241 if (!p)
2242 rc = -ENOMEM;
2243 return rc;
2244 }
2245 EXPORT_SYMBOL(tcp_proc_register);
2246
2247 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2248 {
2249 remove_proc_entry(afinfo->name, net->proc_net);
2250 }
2251 EXPORT_SYMBOL(tcp_proc_unregister);
2252
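/*
 * Dump one SYN_RECV request socket in /proc/net/tcp format.  Several
 * columns are fixed: a request sock has no inode, no socket options and
 * only the SYN-ACK retransmit timer.
 */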
2253 static void get_openreq4(const struct request_sock *req,
2254 struct seq_file *f, int i)
2255 {
2256 const struct inet_request_sock *ireq = inet_rsk(req);
2257 long delta = req->rsk_timer.expires - jiffies;
2258
2259 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2260 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2261 i,
2262 ireq->ir_loc_addr,
2263 ireq->ir_num,
2264 ireq->ir_rmt_addr,
2265 ntohs(ireq->ir_rmt_port),
2266 TCP_SYN_RECV,
2267 0, 0, /* could print option size, but that is af dependent. */
2268 1, /* timers active (only the expire timer) */
2269 jiffies_delta_to_clock_t(delta),
2270 req->num_timeout,
2271 from_kuid_munged(seq_user_ns(f),
2272 sock_i_uid(req->rsk_listener)),
2273 0, /* non-standard timer */
2274 0, /* open_requests have no inode */
2275 0,
2276 req);
2277 }
2278
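/*
 * Dump one full socket.  timer_active encodes which timer is pending
 * (1 = retransmit/loss probe, 2 = another timer such as keepalive,
 * 4 = zero-window probe; 3 is reserved for TIME_WAIT sockets, see
 * get_timewait4_sock()).  For listeners the rx_queue column shows the
 * accept backlog rather than unread bytes.
 */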
2279 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2280 {
2281 int timer_active;
2282 unsigned long timer_expires;
2283 const struct tcp_sock *tp = tcp_sk(sk);
2284 const struct inet_connection_sock *icsk = inet_csk(sk);
2285 const struct inet_sock *inet = inet_sk(sk);
2286 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2287 __be32 dest = inet->inet_daddr;
2288 __be32 src = inet->inet_rcv_saddr;
2289 __u16 destp = ntohs(inet->inet_dport);
2290 __u16 srcp = ntohs(inet->inet_sport);
2291 int rx_queue;
2292 int state;
2293
2294 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2295 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2296 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2297 timer_active = 1;
2298 timer_expires = icsk->icsk_timeout;
2299 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2300 timer_active = 4;
2301 timer_expires = icsk->icsk_timeout;
2302 } else if (timer_pending(&sk->sk_timer)) {
2303 timer_active = 2;
2304 timer_expires = sk->sk_timer.expires;
2305 } else {
2306 timer_active = 0;
2307 timer_expires = jiffies;
2308 }
2309
2310 state = sk_state_load(sk);
2311 if (state == TCP_LISTEN)
2312 rx_queue = sk->sk_ack_backlog;
2313 else
2314 /* Because we don't lock the socket,
2315 * we might find a transient negative value.
2316 */
2317 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2318
2319 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2320 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2321 i, src, srcp, dest, destp, state,
2322 tp->write_seq - tp->snd_una,
2323 rx_queue,
2324 timer_active,
2325 jiffies_delta_to_clock_t(timer_expires - jiffies),
2326 icsk->icsk_retransmits,
2327 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2328 icsk->icsk_probes_out,
2329 sock_i_ino(sk),
2330 refcount_read(&sk->sk_refcnt), sk,
2331 jiffies_to_clock_t(icsk->icsk_rto),
2332 jiffies_to_clock_t(icsk->icsk_ack.ato),
2333 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2334 tp->snd_cwnd,
2335 state == TCP_LISTEN ?
2336 fastopenq->max_qlen :
2337 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2338 }
2339
2340 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2341 struct seq_file *f, int i)
2342 {
2343 long delta = tw->tw_timer.expires - jiffies;
2344 __be32 dest, src;
2345 __u16 destp, srcp;
2346
2347 dest = tw->tw_daddr;
2348 src = tw->tw_rcv_saddr;
2349 destp = ntohs(tw->tw_dport);
2350 srcp = ntohs(tw->tw_sport);
2351
2352 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2353 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2354 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2355 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2356 refcount_read(&tw->tw_refcnt), tw);
2357 }
2358
2359 #define TMPSZ 150
2360
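/*
 * One output line per entry, dispatched on sk_state: TIME_WAIT and
 * NEW_SYN_RECV entries come out of the same ehash walk as full sockets.
 * An illustrative line (values made up; widths and the %pK pointer will
 * differ) for 127.0.0.1:22 in LISTEN (state 0A) looks like:
 *
 *   0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000 0 0 12345 1 0000000000000000 100 0 0 10 0
 *
 * Addresses are raw __be32 values printed in hex (so little-endian hosts
 * show 127.0.0.1 as 0100007F); ports are hex in host order.
 */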
2361 static int tcp4_seq_show(struct seq_file *seq, void *v)
2362 {
2363 struct tcp_iter_state *st;
2364 struct sock *sk = v;
2365
2366 seq_setwidth(seq, TMPSZ - 1);
2367 if (v == SEQ_START_TOKEN) {
2368 seq_puts(seq, " sl local_address rem_address st tx_queue "
2369 "rx_queue tr tm->when retrnsmt uid timeout "
2370 "inode");
2371 goto out;
2372 }
2373 st = seq->private;
2374
2375 if (sk->sk_state == TCP_TIME_WAIT)
2376 get_timewait4_sock(v, seq, st->num);
2377 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2378 get_openreq4(v, seq, st->num);
2379 else
2380 get_tcp4_sock(v, seq, st->num);
2381 out:
2382 seq_pad(seq, '\n');
2383 return 0;
2384 }
2385
2386 static const struct file_operations tcp_afinfo_seq_fops = {
2387 .owner = THIS_MODULE,
2388 .open = tcp_seq_open,
2389 .read = seq_read,
2390 .llseek = seq_lseek,
2391 .release = seq_release_net
2392 };
2393
2394 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2395 .name = "tcp",
2396 .family = AF_INET,
2397 .seq_fops = &tcp_afinfo_seq_fops,
2398 .seq_ops = {
2399 .show = tcp4_seq_show,
2400 },
2401 };
2402
2403 static int __net_init tcp4_proc_init_net(struct net *net)
2404 {
2405 return tcp_proc_register(net, &tcp4_seq_afinfo);
2406 }
2407
2408 static void __net_exit tcp4_proc_exit_net(struct net *net)
2409 {
2410 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2411 }
2412
2413 static struct pernet_operations tcp4_net_ops = {
2414 .init = tcp4_proc_init_net,
2415 .exit = tcp4_proc_exit_net,
2416 };
2417
2418 int __init tcp4_proc_init(void)
2419 {
2420 return register_pernet_subsys(&tcp4_net_ops);
2421 }
2422
2423 void tcp4_proc_exit(void)
2424 {
2425 unregister_pernet_subsys(&tcp4_net_ops);
2426 }
2427 #endif /* CONFIG_PROC_FS */
2428
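/*
 * The protocol method table for SOCK_STREAM/IPPROTO_TCP sockets,
 * registered from inet_init() via proto_register() and the inetsw table.
 */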
2429 struct proto tcp_prot = {
2430 .name = "TCP",
2431 .owner = THIS_MODULE,
2432 .close = tcp_close,
2433 .connect = tcp_v4_connect,
2434 .disconnect = tcp_disconnect,
2435 .accept = inet_csk_accept,
2436 .ioctl = tcp_ioctl,
2437 .init = tcp_v4_init_sock,
2438 .destroy = tcp_v4_destroy_sock,
2439 .shutdown = tcp_shutdown,
2440 .setsockopt = tcp_setsockopt,
2441 .getsockopt = tcp_getsockopt,
2442 .keepalive = tcp_set_keepalive,
2443 .recvmsg = tcp_recvmsg,
2444 .sendmsg = tcp_sendmsg,
2445 .sendpage = tcp_sendpage,
2446 .backlog_rcv = tcp_v4_do_rcv,
2447 .release_cb = tcp_release_cb,
2448 .hash = inet_hash,
2449 .unhash = inet_unhash,
2450 .get_port = inet_csk_get_port,
2451 .enter_memory_pressure = tcp_enter_memory_pressure,
2452 .leave_memory_pressure = tcp_leave_memory_pressure,
2453 .stream_memory_free = tcp_stream_memory_free,
2454 .sockets_allocated = &tcp_sockets_allocated,
2455 .orphan_count = &tcp_orphan_count,
2456 .memory_allocated = &tcp_memory_allocated,
2457 .memory_pressure = &tcp_memory_pressure,
2458 .sysctl_mem = sysctl_tcp_mem,
2459 .sysctl_wmem = sysctl_tcp_wmem,
2460 .sysctl_rmem = sysctl_tcp_rmem,
2461 .max_header = MAX_TCP_HEADER,
2462 .obj_size = sizeof(struct tcp_sock),
2463 .slab_flags = SLAB_TYPESAFE_BY_RCU,
2464 .twsk_prot = &tcp_timewait_sock_ops,
2465 .rsk_prot = &tcp_request_sock_ops,
2466 .h.hashinfo = &tcp_hashinfo,
2467 .no_autobind = true,
2468 #ifdef CONFIG_COMPAT
2469 .compat_setsockopt = compat_tcp_setsockopt,
2470 .compat_getsockopt = compat_tcp_getsockopt,
2471 #endif
2472 .diag_destroy = tcp_abort,
2473 };
2474 EXPORT_SYMBOL(tcp_prot);
2475
2476 static void __net_exit tcp_sk_exit(struct net *net)
2477 {
2478 int cpu;
2479
2480 for_each_possible_cpu(cpu)
2481 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2482 free_percpu(net->ipv4.tcp_sk);
2483 }
2484
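/*
 * Per-netns setup: one kernel control socket per possible CPU (used to
 * send RSTs and ACKs on behalf of no socket) plus the per-namespace
 * sysctl defaults and tcp_death_row wiring.
 */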
2485 static int __net_init tcp_sk_init(struct net *net)
2486 {
2487 int res, cpu, cnt;
2488
2489 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2490 if (!net->ipv4.tcp_sk)
2491 return -ENOMEM;
2492
2493 for_each_possible_cpu(cpu) {
2494 struct sock *sk;
2495
2496 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2497 IPPROTO_TCP, net);
2498 if (res)
2499 goto fail;
2500 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2501 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2502 }
2503
2504 net->ipv4.sysctl_tcp_ecn = 2;
2505 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2506
2507 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2508 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2509 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2510
2511 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2512 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2513 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2514
2515 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2516 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2517 net->ipv4.sysctl_tcp_syncookies = 1;
2518 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2519 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2520 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2521 net->ipv4.sysctl_tcp_orphan_retries = 0;
2522 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2523 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2524 net->ipv4.sysctl_tcp_tw_reuse = 0;
2525
2526 cnt = tcp_hashinfo.ehash_mask + 1;
2527 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2528 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2529
2530 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2531 net->ipv4.sysctl_tcp_sack = 1;
2532 net->ipv4.sysctl_tcp_window_scaling = 1;
2533 net->ipv4.sysctl_tcp_timestamps = 1;
2534
2535 return 0;
2536 fail:
2537 tcp_sk_exit(net);
2538
2539 return res;
2540 }
2541
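/* Purge TIME_WAIT sockets belonging to the namespaces being torn down. */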
2542 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2543 {
2544 inet_twsk_purge(&tcp_hashinfo, AF_INET);
2545 }
2546
2547 static struct pernet_operations __net_initdata tcp_sk_ops = {
2548 .init = tcp_sk_init,
2549 .exit = tcp_sk_exit,
2550 .exit_batch = tcp_sk_exit_batch,
2551 };
2552
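/* Boot-time registration; TCP is not optional, so failure is fatal. */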
2553 void __init tcp_v4_init(void)
2554 {
2555 if (register_pernet_subsys(&tcp_sk_ops))
2556 panic("Failed to create the TCP control socket.\n");
2557 }