/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *	See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *	David S. Miller	:	New socket lookup architecture.
 *				This code is dedicated to John Dyson.
 *	David S. Miller :	Change semantics of established hash,
 *				half is devoted to TIME_WAIT sockets
 *				and the rest go in the other half.
 *	Andi Kleen :		Add support for syncookies and fixed
 *				some bugs: ip options weren't passed to
 *				the TCP layer, missed a check for an
 *				ACK bit.
 *	Andi Kleen :		Implemented fast path mtu discovery.
 *				Fixed many serious bugs in the
 *				request_sock handling and moved
 *				most of it into the af independent code.
 *				Added tail drop and some other bugfixes.
 *				Added new listen semantics.
 *	Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:	ip_dynaddr bits
 *	Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov :	Transparent proxy revived after year coma.
 *	Andi Kleen	:	Fix new listen.
 *	Andi Kleen	:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option,
 *	Alexey Kuznetsov		which allows both IPv4 and IPv6
 *					sockets to bind a single port at the
 *					same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's: only the timestamp cache is
	   held not per host, but per port pair, and the TW bucket is used
	   as the state holder.

	   If the TW bucket has already been destroyed we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (sysctl_tcp_tw_reuse &&
		      get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

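/*
 * Illustration (editor's note, not part of the original file): the reuse
 * test above can only succeed when the TIME-WAIT bucket carries a recent
 * timestamp and either the caller passed no twp or the administrator
 * enabled reuse, e.g. via
 *
 *	# echo 1 > /proc/sys/net/ipv4/tcp_tw_reuse
 *
 * and the stored timestamp is more than one second old. The new
 * connection then starts at tw_snd_nxt + 65535 + 2, placing its sequence
 * space safely beyond anything the old incarnation sent.
 */
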
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the
	 * hash tables and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);

	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

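/*
 * Example (editor's sketch, not part of the original file): the code
 * above is what a plain userspace connect() drives. A minimal,
 * hypothetical caller:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * The checks at the top of tcp_v4_connect() reject anything shorter
 * than a sockaddr_in with -EINVAL and any non-AF_INET family with
 * -EAFNOSUPPORT.
 */
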
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined
 * in RFC1191. It can be called through tcp_release_cb() if the socket
 * was owned by the user at the time tcp_v4_err() was called to handle
 * the ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;

	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to go wrong... Remember the soft error
	 * in case this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

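/*
 * Note (editor's sketch): tcp_v4_mtu_reduced() honours the per-socket
 * PMTU discovery mode checked above. A hypothetical userspace caller
 * that opts out, so the icsk_pmtu_cookie comparison is skipped:
 *
 *	int val = IP_PMTUDISC_DONT;
 *
 *	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 */
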
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	__u32 seq, snd_una;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb));
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV)
		return tcp_req_err(sk, seq,
				   type == ICMP_PARAMETERPROB ||
				   type == ICMP_TIME_EXCEEDED ||
				   (type == ICMP_DEST_UNREACH &&
				    (code == ICMP_NET_UNREACH ||
				     code == ICMP_HOST_UNREACH)));

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case:
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto -
			    min(icsk->icsk_rto,
				tcp_time_stamp - tcp_skb_timestamp(skb));

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows us to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors as ordered by their masters, even these two messages
	 * finally lose their original sense (even Linux sends invalid
	 * PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else { /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

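/*
 * Background (editor's note): tcp_v4_check() folds the standard IPv4
 * pseudo-header into the checksum. For reference, the layout it covers
 * is:
 *
 *	struct {
 *		__be32	saddr;
 *		__be32	daddr;
 *		__u8	zero;		(always 0)
 *		__u8	protocol;	(IPPROTO_TCP)
 *		__be16	len;		(TCP header + payload length)
 *	};
 *
 * In the CHECKSUM_PARTIAL branch of __tcp_v4_send_check() only this
 * pseudo-header sum is computed in software; the device fills in the
 * rest of the checksum at transmit time.
 */
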
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for a reset?
 *	Answer: if a packet caused the RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP.
 *	So we build the reply based only on the parameters that arrived
 *	with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = sizeof(struct tcphdr) / 4;
	rep.th.rst = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket. We do not lose security here:
		 * the incoming packet is checked against the md5 hash of the
		 * found key, and no RST is generated if the hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to
	 * force input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside of socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(struct net *net,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = arg.iov[0].iov_len / 4;
	rep.th.seq = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack = 1;
	rep.th.window = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sock_net(sk), skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
						 tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	tcp_v4_send_ack(sock_net(sk), skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 * Send a SYN-ACK after having received a SYN.
 * This still operates on a request_sock only, not on a big
 * socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 * IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address. */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;

	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}

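/*
 * Example (editor's sketch, not part of the original file):
 * tcp_v4_parse_md5_keys() is reached through the TCP_MD5SIG socket
 * option. A minimal, hypothetical userspace caller:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key for that peer, matching the
 * tcp_md5_do_del() branch above.
 */
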
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "");
		return true;
	}
	return false;
#endif
	return false;
}

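/*
 * Summary (editor's note) of the drop decisions made above:
 *
 *	expected key	segment hash	result
 *	------------	------------	------
 *	no		no		accept (return false)
 *	yes		no		drop, TCPMD5NOTFOUND
 *	no		yes		drop, TCPMD5UNEXPECTED
 *	yes		yes		recompute; drop on mismatch
 */
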
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	ireq->opt = tcp_v4_save_options(skb);
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req,
					  bool *strict)
{
	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);

	if (strict) {
		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
			*strict = true;
		else
			*strict = false;
	}

	return dst;
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		= PF_INET,
	.obj_size	= sizeof(struct tcp_request_sock),
	.rtx_syn_ack	= tcp_rtx_synack,
	.send_ack	= tcp_v4_reqsk_send_ack,
	.destructor	= tcp_v4_reqsk_destructor,
	.send_reset	= tcp_v4_send_reset,
	.syn_ack_timeout = tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	= TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
#endif
	.init_req	= tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq = cookie_v4_init_sequence,
#endif
	.route_req	= tcp_v4_route_req,
	.init_seq	= tcp_v4_init_sequence,
	.send_synack	= tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp = tcp_sk(newsk);
	newinet = inet_sk(newsk);
	ireq = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr = ireq->ir_loc_addr;
	inet_opt = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt = NULL;
	newinet->mc_index = inet_iif(skb);
	newinet->mc_ttl = ip_hdr(skb)->ttl;
	newinet->rcv_tos = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (*own_req)
		tcp_move_syn(newtp, req);

	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

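/*
 * Note (editor's sketch): cookie_v4_check() above only validates the
 * returning ACK of a syncookie handshake; whether cookies are emitted
 * at all is governed at runtime by a sysctl, e.g.:
 *
 *	# echo 1 > /proc/sys/net/ipv4/tcp_syncookies
 */
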
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb);
			sk_mark_napi_id(nsk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

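/*
 * Summary (editor's note): tcp_v4_do_rcv() dispatches on socket state.
 * ESTABLISHED takes the header-prediction fast path through
 * tcp_rcv_established(), LISTEN funnels through the cookie check and
 * possibly tcp_child_process(), and every other state falls through to
 * the generic tcp_rcv_state_process() slow path.
 */
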
void tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}

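/*
 * Note (editor's sketch): early demux saves the main receive path a
 * second socket lookup for established flows. On kernels of this
 * vintage it can be toggled with:
 *
 *	# echo 0 > /proc/sys/net/ipv4/ip_early_demux
 */
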
/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see why it failed. 8)8)				--ANK
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sysctl_tcp_low_latency || !tp->ucopy.task)
		return false;

	if (skb->len <= tcp_hdrlen(skb) &&
	    skb_queue_len(&tp->ucopy.prequeue) == 0)
		return false;

	/* Before escaping RCU protected region, we need to take care of skb
	 * dst. Prequeue is only enabled for established sockets.
	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
	 * Instead of doing full sk_rx_dst validity here, let's perform
	 * an optimistic check.
	 */
	if (likely(sk->sk_rx_dst))
		skb_dst_drop(skb);
	else
		skb_dst_force_safe(skb);

	__skb_queue_tail(&tp->ucopy.prequeue, skb);
	tp->ucopy.memory += skb->truesize;
	if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
	    tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
		struct sk_buff *skb1;

		BUG_ON(sock_owned_by_user(sk));
		__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
				skb_queue_len(&tp->ucopy.prequeue));

		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
			sk_backlog_rcv(sk, skb1);

		tp->ucopy.memory = 0;
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		wake_up_interruptible_sync_poll(sk_sleep(sk),
						POLLIN | POLLRDNORM | POLLRDBAND);
		if (!inet_csk_ack_scheduled(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  (3 * tcp_rto_min(sk)) / 4,
						  TCP_RTO_MAX);
	}
	return true;
}
EXPORT_SYMBOL(tcp_prequeue);

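/*
 * Note (editor's sketch): prequeueing trades latency for cheaper
 * processing in the reader's context. Installations preferring the
 * direct path can disable it, which makes the sysctl_tcp_low_latency
 * test above return early:
 *
 *	# echo 1 > /proc/sys/net/ipv4/tcp_low_latency
 */
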
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;

	/* Only the socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Only a few sockets' backlogs are likely to be non-empty
	 * concurrently.
	 */
	limit += 64*1024;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	if (!skb->data_len)
		skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);

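/*
 * Worked example (editor's note): with the common defaults of roughly
 * 87380 bytes of sk_rcvbuf and 16384 bytes of sk_sndbuf, the limit
 * above comes to 87380 + 16384 + 65536 = 169300 bytes of truesize
 * before tcp_add_backlog() starts dropping.
 */
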
/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	/* This is tricky: we move IPCB to its correct location into
	 * TCP_SKB_CB(). barrier() makes sure the compiler won't play
	 * fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked = 0;

lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = tcp_check_req(sk, skb, req, false);
		if (!nsk) {
			reqsk_put(req);
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else if (tcp_add_backlog(sk, skb)) {
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			refcounted = false;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor = tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things are set to zero explicitly by the call to
 * sk_alloc(), so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean the prequeue; it really must be empty. */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_saved_syn_free(tp);

	local_bh_disable();
	sk_sockets_allocated_dec(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get the next listener socket following cur. If cur is NULL, get the
 * first socket starting from the bucket given in st->bucket; when
 * st->bucket is zero the very first socket in the hash table is returned.
 */
1884 static void *listening_get_next(struct seq_file *seq, void *cur)
1885 {
1886 struct tcp_iter_state *st = seq->private;
1887 struct net *net = seq_file_net(seq);
1888 struct inet_listen_hashbucket *ilb;
1889 struct sock *sk = cur;
1890
1891 if (!sk) {
1892 get_head:
1893 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1894 spin_lock(&ilb->lock);
1895 sk = sk_head(&ilb->head);
1896 st->offset = 0;
1897 goto get_sk;
1898 }
1899 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1900 ++st->num;
1901 ++st->offset;
1902
1903 sk = sk_next(sk);
1904 get_sk:
1905 sk_for_each_from(sk) {
1906 if (!net_eq(sock_net(sk), net))
1907 continue;
1908 if (sk->sk_family == st->family)
1909 return sk;
1910 }
1911 spin_unlock(&ilb->lock);
1912 st->offset = 0;
1913 if (++st->bucket < INET_LHTABLE_SIZE)
1914 goto get_head;
1915 return NULL;
1916 }
1917
1918 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1919 {
1920 struct tcp_iter_state *st = seq->private;
1921 void *rc;
1922
1923 st->bucket = 0;
1924 st->offset = 0;
1925 rc = listening_get_next(seq, NULL);
1926
1927 while (rc && *pos) {
1928 rc = listening_get_next(seq, rc);
1929 --*pos;
1930 }
1931 return rc;
1932 }
1933
1934 static inline bool empty_bucket(const struct tcp_iter_state *st)
1935 {
1936 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1937 }
1938
1939 /*
1940 * Get first established socket starting from bucket given in st->bucket.
1941 * If st->bucket is zero, the very first socket in the hash is returned.
1942 */
1943 static void *established_get_first(struct seq_file *seq)
1944 {
1945 struct tcp_iter_state *st = seq->private;
1946 struct net *net = seq_file_net(seq);
1947 void *rc = NULL;
1948
1949 st->offset = 0;
1950 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1951 struct sock *sk;
1952 struct hlist_nulls_node *node;
1953 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1954
1955 /* Lockless fast path for the common case of empty buckets */
1956 if (empty_bucket(st))
1957 continue;
1958
1959 spin_lock_bh(lock);
1960 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1961 if (sk->sk_family != st->family ||
1962 !net_eq(sock_net(sk), net)) {
1963 continue;
1964 }
1965 rc = sk;
1966 goto out;
1967 }
1968 spin_unlock_bh(lock);
1969 }
1970 out:
1971 return rc;
1972 }
1973
1974 static void *established_get_next(struct seq_file *seq, void *cur)
1975 {
1976 struct sock *sk = cur;
1977 struct hlist_nulls_node *node;
1978 struct tcp_iter_state *st = seq->private;
1979 struct net *net = seq_file_net(seq);
1980
1981 ++st->num;
1982 ++st->offset;
1983
1984 sk = sk_nulls_next(sk);
1985
1986 sk_nulls_for_each_from(sk, node) {
1987 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1988 return sk;
1989 }
1990
1991 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1992 ++st->bucket;
1993 return established_get_first(seq);
1994 }
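/*
 * Sketch of the per-bucket lock hand-off performed by
 * established_get_next() above: the iterator holds only the lock of the
 * bucket it is currently walking, drops it once the chain is exhausted,
 * and takes the next bucket's lock before touching that chain.  pthread
 * mutexes stand in for the kernel spinlocks; demo_* names are
 * hypothetical.
 */
#include <pthread.h>
#include <stddef.h>

#define DEMO_BUCKETS 32

struct demo_node { struct demo_node *next; };

struct demo_table {
	struct demo_node *head[DEMO_BUCKETS];
	pthread_mutex_t lock[DEMO_BUCKETS];
};

/*
 * Caller holds t->lock[*bucket].  On return, the lock of the bucket
 * containing the returned node is held, or no lock at all when the
 * table is exhausted (NULL).
 */
static struct demo_node *demo_locked_next(struct demo_table *t,
					  unsigned int *bucket,
					  struct demo_node *node)
{
	node = node->next;
	while (!node) {
		pthread_mutex_unlock(&t->lock[*bucket]);
		if (++*bucket >= DEMO_BUCKETS)
			return NULL;	/* done: no lock held */
		pthread_mutex_lock(&t->lock[*bucket]);
		node = t->head[*bucket];
	}
	return node;
}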
1995
1996 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1997 {
1998 struct tcp_iter_state *st = seq->private;
1999 void *rc;
2000
2001 st->bucket = 0;
2002 rc = established_get_first(seq);
2003
2004 while (rc && pos) {
2005 rc = established_get_next(seq, rc);
2006 --pos;
2007 }
2008 return rc;
2009 }
2010
2011 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2012 {
2013 void *rc;
2014 struct tcp_iter_state *st = seq->private;
2015
2016 st->state = TCP_SEQ_STATE_LISTENING;
2017 rc = listening_get_idx(seq, &pos);
2018
2019 if (!rc) {
2020 st->state = TCP_SEQ_STATE_ESTABLISHED;
2021 rc = established_get_idx(seq, pos);
2022 }
2023
2024 return rc;
2025 }
2026
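/*
 * Resume a dump at the (state, bucket, offset) recorded by the previous
 * pass, so a /proc/net/tcp read split across many read(2) calls does
 * not have to rescan every bucket from the start each time.
 */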
2027 static void *tcp_seek_last_pos(struct seq_file *seq)
2028 {
2029 struct tcp_iter_state *st = seq->private;
2030 int offset = st->offset;
2031 int orig_num = st->num;
2032 void *rc = NULL;
2033
2034 switch (st->state) {
2035 case TCP_SEQ_STATE_LISTENING:
2036 if (st->bucket >= INET_LHTABLE_SIZE)
2037 break;
2038 st->state = TCP_SEQ_STATE_LISTENING;
2039 rc = listening_get_next(seq, NULL);
2040 while (offset-- && rc)
2041 rc = listening_get_next(seq, rc);
2042 if (rc)
2043 break;
2044 st->bucket = 0;
2045 st->state = TCP_SEQ_STATE_ESTABLISHED;
2046 /* Fallthrough */
2047 case TCP_SEQ_STATE_ESTABLISHED:
2048 if (st->bucket > tcp_hashinfo.ehash_mask)
2049 break;
2050 rc = established_get_first(seq);
2051 while (offset-- && rc)
2052 rc = established_get_next(seq, rc);
2053 }
2054
2055 st->num = orig_num;
2056
2057 return rc;
2058 }
2059
2060 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2061 {
2062 struct tcp_iter_state *st = seq->private;
2063 void *rc;
2064
2065 if (*pos && *pos == st->last_pos) {
2066 rc = tcp_seek_last_pos(seq);
2067 if (rc)
2068 goto out;
2069 }
2070
2071 st->state = TCP_SEQ_STATE_LISTENING;
2072 st->num = 0;
2073 st->bucket = 0;
2074 st->offset = 0;
2075 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2076
2077 out:
2078 st->last_pos = *pos;
2079 return rc;
2080 }
2081
2082 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2083 {
2084 struct tcp_iter_state *st = seq->private;
2085 void *rc = NULL;
2086
2087 if (v == SEQ_START_TOKEN) {
2088 rc = tcp_get_idx(seq, 0);
2089 goto out;
2090 }
2091
2092 switch (st->state) {
2093 case TCP_SEQ_STATE_LISTENING:
2094 rc = listening_get_next(seq, v);
2095 if (!rc) {
2096 st->state = TCP_SEQ_STATE_ESTABLISHED;
2097 st->bucket = 0;
2098 st->offset = 0;
2099 rc = established_get_first(seq);
2100 }
2101 break;
2102 case TCP_SEQ_STATE_ESTABLISHED:
2103 rc = established_get_next(seq, v);
2104 break;
2105 }
2106 out:
2107 ++*pos;
2108 st->last_pos = *pos;
2109 return rc;
2110 }
2111
2112 static void tcp_seq_stop(struct seq_file *seq, void *v)
2113 {
2114 struct tcp_iter_state *st = seq->private;
2115
2116 switch (st->state) {
2117 case TCP_SEQ_STATE_LISTENING:
2118 if (v != SEQ_START_TOKEN)
2119 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2120 break;
2121 case TCP_SEQ_STATE_ESTABLISHED:
2122 if (v)
2123 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2124 break;
2125 }
2126 }
2127
2128 int tcp_seq_open(struct inode *inode, struct file *file)
2129 {
2130 struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2131 struct tcp_iter_state *s;
2132 int err;
2133
2134 err = seq_open_net(inode, file, &afinfo->seq_ops,
2135 sizeof(struct tcp_iter_state));
2136 if (err < 0)
2137 return err;
2138
2139 s = ((struct seq_file *)file->private_data)->private;
2140 s->family = afinfo->family;
2141 s->last_pos = 0;
2142 return 0;
2143 }
2144 EXPORT_SYMBOL(tcp_seq_open);
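/*
 * Minimal seq_file sketch with hypothetical demo_* names, not part of
 * this file: the tcp code above wires full start/next/stop iterators
 * because /proc/net/tcp can be arbitrarily long, but a small,
 * fixed-size file only needs single_open() plus a show() callback.
 * API as of this kernel generation (file_operations based).
 */
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
	seq_puts(m, "hello from seq_file\n");
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	return single_open(file, demo_show, NULL);
}

static const struct file_operations demo_fops = {
	.owner		= THIS_MODULE,
	.open		= demo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init demo_init(void)
{
	return proc_create("demo_seq", S_IRUGO, NULL, &demo_fops) ? 0 : -ENOMEM;
}

static void __exit demo_exit(void)
{
	remove_proc_entry("demo_seq", NULL);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");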
2145
2146 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2147 {
2148 int rc = 0;
2149 struct proc_dir_entry *p;
2150
2151 afinfo->seq_ops.start = tcp_seq_start;
2152 afinfo->seq_ops.next = tcp_seq_next;
2153 afinfo->seq_ops.stop = tcp_seq_stop;
2154
2155 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2156 afinfo->seq_fops, afinfo);
2157 if (!p)
2158 rc = -ENOMEM;
2159 return rc;
2160 }
2161 EXPORT_SYMBOL(tcp_proc_register);
2162
2163 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2164 {
2165 remove_proc_entry(afinfo->name, net->proc_net);
2166 }
2167 EXPORT_SYMBOL(tcp_proc_unregister);
2168
2169 static void get_openreq4(const struct request_sock *req,
2170 struct seq_file *f, int i)
2171 {
2172 const struct inet_request_sock *ireq = inet_rsk(req);
2173 long delta = req->rsk_timer.expires - jiffies;
2174
2175 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2176 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2177 i,
2178 ireq->ir_loc_addr,
2179 ireq->ir_num,
2180 ireq->ir_rmt_addr,
2181 ntohs(ireq->ir_rmt_port),
2182 TCP_SYN_RECV,
2183 0, 0, /* could print option size, but that is af dependent. */
2184 1, /* timers active (only the expire timer) */
2185 jiffies_delta_to_clock_t(delta),
2186 req->num_timeout,
2187 from_kuid_munged(seq_user_ns(f),
2188 sock_i_uid(req->rsk_listener)),
2189 0, /* non standard timer */
2190 0, /* open_requests have no inode */
2191 0,
2192 req);
2193 }
2194
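/*
 * The "tr" column emitted below encodes which timer is pending:
 * 1 retransmit/loss-probe, 2 keepalive (sk_timer), 3 TIME_WAIT,
 * 4 zero-window probe, 0 none; "tm->when" is the time remaining,
 * in clock ticks.
 */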
2195 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2196 {
2197 int timer_active;
2198 unsigned long timer_expires;
2199 const struct tcp_sock *tp = tcp_sk(sk);
2200 const struct inet_connection_sock *icsk = inet_csk(sk);
2201 const struct inet_sock *inet = inet_sk(sk);
2202 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2203 __be32 dest = inet->inet_daddr;
2204 __be32 src = inet->inet_rcv_saddr;
2205 __u16 destp = ntohs(inet->inet_dport);
2206 __u16 srcp = ntohs(inet->inet_sport);
2207 int rx_queue;
2208 int state;
2209
2210 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2211 icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2212 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2213 timer_active = 1;
2214 timer_expires = icsk->icsk_timeout;
2215 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2216 timer_active = 4;
2217 timer_expires = icsk->icsk_timeout;
2218 } else if (timer_pending(&sk->sk_timer)) {
2219 timer_active = 2;
2220 timer_expires = sk->sk_timer.expires;
2221 } else {
2222 timer_active = 0;
2223 timer_expires = jiffies;
2224 }
2225
2226 state = sk_state_load(sk);
2227 if (state == TCP_LISTEN)
2228 rx_queue = sk->sk_ack_backlog;
2229 else
2230 /* Because we don't lock the socket,
2231 * we might find a transient negative value.
2232 */
2233 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2234
2235 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2236 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2237 i, src, srcp, dest, destp, state,
2238 tp->write_seq - tp->snd_una,
2239 rx_queue,
2240 timer_active,
2241 jiffies_delta_to_clock_t(timer_expires - jiffies),
2242 icsk->icsk_retransmits,
2243 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2244 icsk->icsk_probes_out,
2245 sock_i_ino(sk),
2246 atomic_read(&sk->sk_refcnt), sk,
2247 jiffies_to_clock_t(icsk->icsk_rto),
2248 jiffies_to_clock_t(icsk->icsk_ack.ato),
2249 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2250 tp->snd_cwnd,
2251 state == TCP_LISTEN ?
2252 fastopenq->max_qlen :
2253 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2254 }
2255
2256 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2257 struct seq_file *f, int i)
2258 {
2259 long delta = tw->tw_timer.expires - jiffies;
2260 __be32 dest, src;
2261 __u16 destp, srcp;
2262
2263 dest = tw->tw_daddr;
2264 src = tw->tw_rcv_saddr;
2265 destp = ntohs(tw->tw_dport);
2266 srcp = ntohs(tw->tw_sport);
2267
2268 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2269 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2270 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2271 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2272 atomic_read(&tw->tw_refcnt), tw);
2273 }
2274
2275 #define TMPSZ 150
2276
2277 static int tcp4_seq_show(struct seq_file *seq, void *v)
2278 {
2279 struct tcp_iter_state *st;
2280 struct sock *sk = v;
2281
2282 seq_setwidth(seq, TMPSZ - 1);
2283 if (v == SEQ_START_TOKEN) {
2284 seq_puts(seq, " sl local_address rem_address st tx_queue "
2285 "rx_queue tr tm->when retrnsmt uid timeout "
2286 "inode");
2287 goto out;
2288 }
2289 st = seq->private;
2290
2291 if (sk->sk_state == TCP_TIME_WAIT)
2292 get_timewait4_sock(v, seq, st->num);
2293 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2294 get_openreq4(v, seq, st->num);
2295 else
2296 get_tcp4_sock(v, seq, st->num);
2297 out:
2298 seq_pad(seq, '\n');
2299 return 0;
2300 }
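/*
 * User-space sketch, not part of this file: parsing the rows emitted
 * above.  The %08X address fields are the raw __be32 values printed as
 * host integers, so reading them back with %X and storing the result in
 * an in_addr on the same machine restores the network-byte-order
 * address (127.0.0.1 shows up as 0100007F on little-endian hosts).
 * Ports and the state byte are plain hex in host order.
 */
#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return 1;
	if (!fgets(line, sizeof(line), f)) {	/* skip the header row */
		fclose(f);
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		unsigned int laddr, lport, raddr, rport, state;
		struct in_addr a;

		if (sscanf(line, "%*d: %8X:%4X %8X:%4X %2X",
			   &laddr, &lport, &raddr, &rport, &state) != 5)
			continue;
		a.s_addr = laddr;	/* round-trips to network byte order */
		printf("%s:%u st %02X\n", inet_ntoa(a), lport, state);
	}
	fclose(f);
	return 0;
}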
2301
2302 static const struct file_operations tcp_afinfo_seq_fops = {
2303 .owner = THIS_MODULE,
2304 .open = tcp_seq_open,
2305 .read = seq_read,
2306 .llseek = seq_lseek,
2307 .release = seq_release_net
2308 };
2309
2310 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2311 .name = "tcp",
2312 .family = AF_INET,
2313 .seq_fops = &tcp_afinfo_seq_fops,
2314 .seq_ops = {
2315 .show = tcp4_seq_show,
2316 },
2317 };
2318
2319 static int __net_init tcp4_proc_init_net(struct net *net)
2320 {
2321 return tcp_proc_register(net, &tcp4_seq_afinfo);
2322 }
2323
2324 static void __net_exit tcp4_proc_exit_net(struct net *net)
2325 {
2326 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2327 }
2328
2329 static struct pernet_operations tcp4_net_ops = {
2330 .init = tcp4_proc_init_net,
2331 .exit = tcp4_proc_exit_net,
2332 };
2333
2334 int __init tcp4_proc_init(void)
2335 {
2336 return register_pernet_subsys(&tcp4_net_ops);
2337 }
2338
2339 void tcp4_proc_exit(void)
2340 {
2341 unregister_pernet_subsys(&tcp4_net_ops);
2342 }
2343 #endif /* CONFIG_PROC_FS */
2344
2345 struct proto tcp_prot = {
2346 .name = "TCP",
2347 .owner = THIS_MODULE,
2348 .close = tcp_close,
2349 .connect = tcp_v4_connect,
2350 .disconnect = tcp_disconnect,
2351 .accept = inet_csk_accept,
2352 .ioctl = tcp_ioctl,
2353 .init = tcp_v4_init_sock,
2354 .destroy = tcp_v4_destroy_sock,
2355 .shutdown = tcp_shutdown,
2356 .setsockopt = tcp_setsockopt,
2357 .getsockopt = tcp_getsockopt,
2358 .recvmsg = tcp_recvmsg,
2359 .sendmsg = tcp_sendmsg,
2360 .sendpage = tcp_sendpage,
2361 .backlog_rcv = tcp_v4_do_rcv,
2362 .release_cb = tcp_release_cb,
2363 .hash = inet_hash,
2364 .unhash = inet_unhash,
2365 .get_port = inet_csk_get_port,
2366 .enter_memory_pressure = tcp_enter_memory_pressure,
2367 .stream_memory_free = tcp_stream_memory_free,
2368 .sockets_allocated = &tcp_sockets_allocated,
2369 .orphan_count = &tcp_orphan_count,
2370 .memory_allocated = &tcp_memory_allocated,
2371 .memory_pressure = &tcp_memory_pressure,
2372 .sysctl_mem = sysctl_tcp_mem,
2373 .sysctl_wmem = sysctl_tcp_wmem,
2374 .sysctl_rmem = sysctl_tcp_rmem,
2375 .max_header = MAX_TCP_HEADER,
2376 .obj_size = sizeof(struct tcp_sock),
2377 .slab_flags = SLAB_DESTROY_BY_RCU,
2378 .twsk_prot = &tcp_timewait_sock_ops,
2379 .rsk_prot = &tcp_request_sock_ops,
2380 .h.hashinfo = &tcp_hashinfo,
2381 .no_autobind = true,
2382 #ifdef CONFIG_COMPAT
2383 .compat_setsockopt = compat_tcp_setsockopt,
2384 .compat_getsockopt = compat_tcp_getsockopt,
2385 #endif
2386 .diag_destroy = tcp_abort,
2387 };
2388 EXPORT_SYMBOL(tcp_prot);
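/*
 * Schematic sketch (hypothetical demo_* names) of how a struct proto
 * such as tcp_prot is hooked into the socket layer.  The real wiring
 * for TCP lives in net/ipv4/af_inet.c; this only shows the shape of the
 * two registration calls involved.
 */
#include <linux/init.h>
#include <net/protocol.h>
#include <net/sock.h>

static struct proto demo_prot;			/* filled in like tcp_prot */
static const struct proto_ops demo_stream_ops;	/* like inet_stream_ops */

static struct inet_protosw demo_protosw = {
	.type		= SOCK_STREAM,
	.protocol	= IPPROTO_TCP,
	.prot		= &demo_prot,
	.ops		= &demo_stream_ops,
	.flags		= INET_PROTOSW_PERMANENT,
};

static int __init demo_register(void)
{
	int err;

	/* Create the slab caches etc. for this protocol... */
	err = proto_register(&demo_prot, 1);
	if (err)
		return err;
	/* ...then let socket(AF_INET, SOCK_STREAM, IPPROTO_TCP) find it. */
	inet_register_protosw(&demo_protosw);
	return 0;
}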
2389
2390 static void __net_exit tcp_sk_exit(struct net *net)
2391 {
2392 int cpu;
2393
2394 for_each_possible_cpu(cpu)
2395 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2396 free_percpu(net->ipv4.tcp_sk);
2397 }
2398
2399 static int __net_init tcp_sk_init(struct net *net)
2400 {
2401 int res, cpu;
2402
2403 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2404 if (!net->ipv4.tcp_sk)
2405 return -ENOMEM;
2406
2407 for_each_possible_cpu(cpu) {
2408 struct sock *sk;
2409
2410 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2411 IPPROTO_TCP, net);
2412 if (res)
2413 goto fail;
2414 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2415 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2416 }
2417
2418 net->ipv4.sysctl_tcp_ecn = 2;
2419 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2420
2421 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2422 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2423 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2424
2425 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2426 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2427 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2428
2429 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2430 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2431 net->ipv4.sysctl_tcp_syncookies = 1;
2432 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2433 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2434 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2435 net->ipv4.sysctl_tcp_orphan_retries = 0;
2436 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2437 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2438
2439 return 0;
2440 fail:
2441 tcp_sk_exit(net);
2442
2443 return res;
2444 }
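/*
 * The defaults set above surface per network namespace under
 * /proc/sys/net/ipv4/.  A user-space sketch (not part of this file)
 * reading one of them back:
 */
#include <stdio.h>

int main(void)
{
	char buf[32];
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_syn_retries", "r");

	if (!f)
		return 1;
	/* Prints TCP_SYN_RETRIES unless an admin has overridden it. */
	if (fgets(buf, sizeof(buf), f))
		printf("tcp_syn_retries = %s", buf);
	fclose(f);
	return 0;
}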
2445
2446 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2447 {
2448 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2449 }
2450
2451 static struct pernet_operations __net_initdata tcp_sk_ops = {
2452 .init = tcp_sk_init,
2453 .exit = tcp_sk_exit,
2454 .exit_batch = tcp_sk_exit_batch,
2455 };
2456
2457 void __init tcp_v4_init(void)
2458 {
2459 inet_hashinfo_init(&tcp_hashinfo);
2460 if (register_pernet_subsys(&tcp_sk_ops))
2461 panic("Failed to create the TCP control socket.\n");
2462 }
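/*
 * Minimal pernet_operations sketch (hypothetical demo_* names): the
 * same pattern is used twice above, by tcp4_net_ops and tcp_sk_ops.
 * init() runs once for every network namespace as it is created (and at
 * boot for init_net); exit() runs as each namespace is torn down.
 */
#include <linux/printk.h>
#include <net/net_namespace.h>

static int __net_init demo_net_init(struct net *net)
{
	pr_info("netns %p created\n", net);
	return 0;
}

static void __net_exit demo_net_exit(struct net *net)
{
	pr_info("netns %p going away\n", net);
}

static struct pernet_operations demo_net_ops = {
	.init = demo_net_init,
	.exit = demo_net_exit,
};

/* Registered from boot/module init: register_pernet_subsys(&demo_net_ops); */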