net/ipv4/tcp_ipv4.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * IPv4 specific functions
9 *
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24 /*
25 * Changes:
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an
34 * ACK bit.
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
37 * request_sock handling and moved
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
40 * Added new listen semantics.
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after year
45 * coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
51 */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
87
88 #include <trace/events/tcp.h>
89
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92 __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
97
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99 {
100 return secure_tcp_seq(ip_hdr(skb)->daddr,
101 ip_hdr(skb)->saddr,
102 tcp_hdr(skb)->dest,
103 tcp_hdr(skb)->source);
104 }
105
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
110
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113 const struct inet_timewait_sock *tw = inet_twsk(sktw);
114 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115 struct tcp_sock *tp = tcp_sk(sk);
116 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
117
118 if (reuse == 2) {
119 /* Still does not detect *everything* that goes through
120 * lo, since we require a loopback src or dst address
121 * or direct binding to 'lo' interface.
122 */
123 bool loopback = false;
124 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
125 loopback = true;
126 #if IS_ENABLED(CONFIG_IPV6)
127 if (tw->tw_family == AF_INET6) {
128 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
129 (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
130 (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
131 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
132 (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
133 (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
134 loopback = true;
135 } else
136 #endif
137 {
138 if (ipv4_is_loopback(tw->tw_daddr) ||
139 ipv4_is_loopback(tw->tw_rcv_saddr))
140 loopback = true;
141 }
142 if (!loopback)
143 reuse = 0;
144 }
145
146 /* With PAWS, it is safe from the viewpoint
147 of data integrity. Even without PAWS it is safe provided sequence
148 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
149
150 Actually, the idea is close to VJ's one, only the timestamp cache is
151 held not per host but per port pair, and the TW bucket is used as the
152 state holder.
153 
154 If the TW bucket has already been destroyed, we fall back to VJ's scheme
155 and use the initial timestamp retrieved from the peer table.
156 */
157 if (tcptw->tw_ts_recent_stamp &&
158 (!twp || (reuse && time_after32(ktime_get_seconds(),
159 tcptw->tw_ts_recent_stamp)))) {
160 /* In case of repair and re-using TIME-WAIT sockets we still
161 * want to be sure that it is safe as above but honor the
162 * sequence numbers and time stamps set as part of the repair
163 * process.
164 *
165 * Without this check re-using a TIME-WAIT socket with TCP
166 * repair would accumulate a -1 on the repair assigned
167 * sequence number. The first time it is reused the sequence
168 * is -1, the second time -2, etc. This fixes that issue
169 * without appearing to create any others.
170 */
171 if (likely(!tp->repair)) {
172 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
173 if (tp->write_seq == 0)
174 tp->write_seq = 1;
175 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
176 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
177 }
178 sock_hold(sktw);
179 return 1;
180 }
181
182 return 0;
183 }
184 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
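/* Editor's note (not part of the original file): the sysctl_tcp_tw_reuse
 * value read in tcp_twsk_unique() above is net.ipv4.tcp_tw_reuse.  To the
 * best of my knowledge its settings are: 0 = never reuse TIME-WAIT sockets
 * for new outgoing connections, 1 = reuse when the timestamp check below
 * says it is safe, 2 = reuse only for loopback traffic, which is the case
 * handled by the explicit loopback checks at the top of the function.
 */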
185
186 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
187 int addr_len)
188 {
189 /* This check is replicated from tcp_v4_connect() and intended to
190 * prevent BPF program called below from accessing bytes that are out
191 * of the bound specified by user in addr_len.
192 */
193 if (addr_len < sizeof(struct sockaddr_in))
194 return -EINVAL;
195
196 sock_owned_by_me(sk);
197
198 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
199 }
200
201 /* This will initiate an outgoing connection. */
202 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
203 {
204 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
205 struct inet_sock *inet = inet_sk(sk);
206 struct tcp_sock *tp = tcp_sk(sk);
207 __be16 orig_sport, orig_dport;
208 __be32 daddr, nexthop;
209 struct flowi4 *fl4;
210 struct rtable *rt;
211 int err;
212 struct ip_options_rcu *inet_opt;
213 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
214
215 if (addr_len < sizeof(struct sockaddr_in))
216 return -EINVAL;
217
218 if (usin->sin_family != AF_INET)
219 return -EAFNOSUPPORT;
220
221 nexthop = daddr = usin->sin_addr.s_addr;
222 inet_opt = rcu_dereference_protected(inet->inet_opt,
223 lockdep_sock_is_held(sk));
224 if (inet_opt && inet_opt->opt.srr) {
225 if (!daddr)
226 return -EINVAL;
227 nexthop = inet_opt->opt.faddr;
228 }
229
230 orig_sport = inet->inet_sport;
231 orig_dport = usin->sin_port;
232 fl4 = &inet->cork.fl.u.ip4;
233 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
234 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
235 IPPROTO_TCP,
236 orig_sport, orig_dport, sk);
237 if (IS_ERR(rt)) {
238 err = PTR_ERR(rt);
239 if (err == -ENETUNREACH)
240 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
241 return err;
242 }
243
244 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
245 ip_rt_put(rt);
246 return -ENETUNREACH;
247 }
248
249 if (!inet_opt || !inet_opt->opt.srr)
250 daddr = fl4->daddr;
251
252 if (!inet->inet_saddr)
253 inet->inet_saddr = fl4->saddr;
254 sk_rcv_saddr_set(sk, inet->inet_saddr);
255
256 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
257 /* Reset inherited state */
258 tp->rx_opt.ts_recent = 0;
259 tp->rx_opt.ts_recent_stamp = 0;
260 if (likely(!tp->repair))
261 tp->write_seq = 0;
262 }
263
264 inet->inet_dport = usin->sin_port;
265 sk_daddr_set(sk, daddr);
266
267 inet_csk(sk)->icsk_ext_hdr_len = 0;
268 if (inet_opt)
269 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
270
271 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
272
273 /* Socket identity is still unknown (sport may be zero).
274 * However we set the state to SYN-SENT and, without releasing the socket
275 * lock, select a source port, enter ourselves into the hash tables and
276 * complete initialization after this.
277 */
278 tcp_set_state(sk, TCP_SYN_SENT);
279 err = inet_hash_connect(tcp_death_row, sk);
280 if (err)
281 goto failure;
282
283 sk_set_txhash(sk);
284
285 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
286 inet->inet_sport, inet->inet_dport, sk);
287 if (IS_ERR(rt)) {
288 err = PTR_ERR(rt);
289 rt = NULL;
290 goto failure;
291 }
292 /* OK, now commit destination to socket. */
293 sk->sk_gso_type = SKB_GSO_TCPV4;
294 sk_setup_caps(sk, &rt->dst);
295 rt = NULL;
296
297 if (likely(!tp->repair)) {
298 if (!tp->write_seq)
299 tp->write_seq = secure_tcp_seq(inet->inet_saddr,
300 inet->inet_daddr,
301 inet->inet_sport,
302 usin->sin_port);
303 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
304 inet->inet_saddr,
305 inet->inet_daddr);
306 }
307
308 inet->inet_id = tp->write_seq ^ jiffies;
309
310 if (tcp_fastopen_defer_connect(sk, &err))
311 return err;
312 if (err)
313 goto failure;
314
315 err = tcp_connect(sk);
316
317 if (err)
318 goto failure;
319
320 return 0;
321
322 failure:
323 /*
324 * This unhashes the socket and releases the local port,
325 * if necessary.
326 */
327 tcp_set_state(sk, TCP_CLOSE);
328 ip_rt_put(rt);
329 sk->sk_route_caps = 0;
330 inet->inet_dport = 0;
331 return err;
332 }
333 EXPORT_SYMBOL(tcp_v4_connect);
334
335 /*
336 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
337 * It can be called through tcp_release_cb() if socket was owned by user
338 * at the time tcp_v4_err() was called to handle ICMP message.
339 */
340 void tcp_v4_mtu_reduced(struct sock *sk)
341 {
342 struct inet_sock *inet = inet_sk(sk);
343 struct dst_entry *dst;
344 u32 mtu;
345
346 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
347 return;
348 mtu = tcp_sk(sk)->mtu_info;
349 dst = inet_csk_update_pmtu(sk, mtu);
350 if (!dst)
351 return;
352
353 /* Something is about to go wrong... Remember the soft error
354 * in case this connection is not able to recover.
355 */
356 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
357 sk->sk_err_soft = EMSGSIZE;
358
359 mtu = dst_mtu(dst);
360
361 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
362 ip_sk_accept_pmtu(sk) &&
363 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
364 tcp_sync_mss(sk, mtu);
365
366 /* Resend the TCP packet because it's
367 * clear that the old packet has been
368 * dropped. This is the new "fast" path mtu
369 * discovery.
370 */
371 tcp_simple_retransmit(sk);
372 } /* else let the usual retransmit timer handle it */
373 }
374 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
375
376 static void do_redirect(struct sk_buff *skb, struct sock *sk)
377 {
378 struct dst_entry *dst = __sk_dst_check(sk, 0);
379
380 if (dst)
381 dst->ops->redirect(dst, sk, skb);
382 }
383
384
385 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
386 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
387 {
388 struct request_sock *req = inet_reqsk(sk);
389 struct net *net = sock_net(sk);
390
391 /* ICMPs are not backlogged, hence we cannot get
392 * an established socket here.
393 */
394 if (seq != tcp_rsk(req)->snt_isn) {
395 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
396 } else if (abort) {
397 /*
398 * Still in SYN_RECV, just remove it silently.
399 * There is no good way to pass the error to the newly
400 * created socket, and POSIX does not want network
401 * errors returned from accept().
402 */
403 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
404 tcp_listendrop(req->rsk_listener);
405 }
406 reqsk_put(req);
407 }
408 EXPORT_SYMBOL(tcp_req_err);
409
410 /*
411 * This routine is called by the ICMP module when it gets some
412 * sort of error condition. If err < 0 then the socket should
413 * be closed and the error returned to the user. If err > 0
414 * it's just the icmp type << 8 | icmp code. After adjustment, the
415 * header points to the first 8 bytes of the tcp header. We need
416 * to find the appropriate port.
417 *
418 * The locking strategy used here is very "optimistic". When
419 * someone else accesses the socket the ICMP is just dropped
420 * and for some paths there is no check at all.
421 * A more general error queue to queue errors for later handling
422 * is probably better.
423 *
424 */
425
426 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
427 {
428 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
429 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
430 struct inet_connection_sock *icsk;
431 struct tcp_sock *tp;
432 struct inet_sock *inet;
433 const int type = icmp_hdr(icmp_skb)->type;
434 const int code = icmp_hdr(icmp_skb)->code;
435 struct sock *sk;
436 struct sk_buff *skb;
437 struct request_sock *fastopen;
438 u32 seq, snd_una;
439 s32 remaining;
440 u32 delta_us;
441 int err;
442 struct net *net = dev_net(icmp_skb->dev);
443
444 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
445 th->dest, iph->saddr, ntohs(th->source),
446 inet_iif(icmp_skb), 0);
447 if (!sk) {
448 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
449 return;
450 }
451 if (sk->sk_state == TCP_TIME_WAIT) {
452 inet_twsk_put(inet_twsk(sk));
453 return;
454 }
455 seq = ntohl(th->seq);
456 if (sk->sk_state == TCP_NEW_SYN_RECV)
457 return tcp_req_err(sk, seq,
458 type == ICMP_PARAMETERPROB ||
459 type == ICMP_TIME_EXCEEDED ||
460 (type == ICMP_DEST_UNREACH &&
461 (code == ICMP_NET_UNREACH ||
462 code == ICMP_HOST_UNREACH)));
463
464 bh_lock_sock(sk);
465 /* If too many ICMPs get dropped on busy
466 * servers this needs to be solved differently.
467 * We do take care of the PMTU discovery (RFC1191) special case:
468 * we can receive locally generated ICMP messages while socket is held.
469 */
470 if (sock_owned_by_user(sk)) {
471 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
472 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
473 }
474 if (sk->sk_state == TCP_CLOSE)
475 goto out;
476
477 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
478 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
479 goto out;
480 }
481
482 icsk = inet_csk(sk);
483 tp = tcp_sk(sk);
484 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
485 fastopen = tp->fastopen_rsk;
486 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
487 if (sk->sk_state != TCP_LISTEN &&
488 !between(seq, snd_una, tp->snd_nxt)) {
489 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
490 goto out;
491 }
492
493 switch (type) {
494 case ICMP_REDIRECT:
495 if (!sock_owned_by_user(sk))
496 do_redirect(icmp_skb, sk);
497 goto out;
498 case ICMP_SOURCE_QUENCH:
499 /* Just silently ignore these. */
500 goto out;
501 case ICMP_PARAMETERPROB:
502 err = EPROTO;
503 break;
504 case ICMP_DEST_UNREACH:
505 if (code > NR_ICMP_UNREACH)
506 goto out;
507
508 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
509 /* We are not interested in TCP_LISTEN and open_requests
510 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
511 * they should go through unfragmented).
512 */
513 if (sk->sk_state == TCP_LISTEN)
514 goto out;
515
516 tp->mtu_info = info;
517 if (!sock_owned_by_user(sk)) {
518 tcp_v4_mtu_reduced(sk);
519 } else {
520 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
521 sock_hold(sk);
522 }
523 goto out;
524 }
525
526 err = icmp_err_convert[code].errno;
527 /* check if icmp_skb allows revert of backoff
528 * (see draft-zimmermann-tcp-lcd) */
529 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
530 break;
531 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
532 !icsk->icsk_backoff || fastopen)
533 break;
534
535 if (sock_owned_by_user(sk))
536 break;
537
538 icsk->icsk_backoff--;
539 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
540 TCP_TIMEOUT_INIT;
541 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
542
543 skb = tcp_rtx_queue_head(sk);
544 BUG_ON(!skb);
545
546 tcp_mstamp_refresh(tp);
547 delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
548 remaining = icsk->icsk_rto -
549 usecs_to_jiffies(delta_us);
550
551 if (remaining > 0) {
552 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
553 remaining, TCP_RTO_MAX);
554 } else {
555 /* RTO revert clocked out retransmission.
556 * Will retransmit now */
557 tcp_retransmit_timer(sk);
558 }
559
560 break;
561 case ICMP_TIME_EXCEEDED:
562 err = EHOSTUNREACH;
563 break;
564 default:
565 goto out;
566 }
567
568 switch (sk->sk_state) {
569 case TCP_SYN_SENT:
570 case TCP_SYN_RECV:
571 /* Only in fast or simultaneous open. If a fast open socket is
572 * already accepted it is treated as a connected one below.
573 */
574 if (fastopen && !fastopen->sk)
575 break;
576
577 if (!sock_owned_by_user(sk)) {
578 sk->sk_err = err;
579
580 sk->sk_error_report(sk);
581
582 tcp_done(sk);
583 } else {
584 sk->sk_err_soft = err;
585 }
586 goto out;
587 }
588
589 /* If we've already connected we will keep trying
590 * until we time out, or the user gives up.
591 *
592 * rfc1122 4.2.3.9 allows us to consider as hard errors
593 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
594 * but it is obsoleted by pmtu discovery).
595 *
596 * Note that in the modern internet, where routing is unreliable
597 * and in each dark corner broken firewalls sit, sending random
598 * errors ordered by their masters, even these two messages finally lose
599 * their original sense (even Linux sends invalid PORT_UNREACHs)
600 *
601 * Now we are in compliance with RFCs.
602 * --ANK (980905)
603 */
604
605 inet = inet_sk(sk);
606 if (!sock_owned_by_user(sk) && inet->recverr) {
607 sk->sk_err = err;
608 sk->sk_error_report(sk);
609 } else { /* Only an error on timeout */
610 sk->sk_err_soft = err;
611 }
612
613 out:
614 bh_unlock_sock(sk);
615 sock_put(sk);
616 }
617
618 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
619 {
620 struct tcphdr *th = tcp_hdr(skb);
621
622 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
623 skb->csum_start = skb_transport_header(skb) - skb->head;
624 skb->csum_offset = offsetof(struct tcphdr, check);
625 }
626
627 /* This routine computes an IPv4 TCP checksum. */
628 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
629 {
630 const struct inet_sock *inet = inet_sk(sk);
631
632 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
633 }
634 EXPORT_SYMBOL(tcp_v4_send_check);
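/* Editor's sketch (illustrative, userspace-style, not kernel code): the
 * IPv4 pseudo-header sum that ~tcp_v4_check() seeds into th->check above
 * for checksum offload.  The device (or the software fallback) later adds
 * the TCP header and payload starting at csum_start and folds the result
 * into the 16-bit field at csum_offset.  Inputs are assumed to be in host
 * byte order to keep the arithmetic readable.
 */
#include <stdint.h>

static uint16_t tcp_pseudo_hdr_sum(uint32_t saddr, uint32_t daddr,
				   uint16_t tcp_len)
{
	uint32_t sum = 0;

	sum += (saddr >> 16) + (saddr & 0xffff);	/* source address */
	sum += (daddr >> 16) + (daddr & 0xffff);	/* destination address */
	sum += 6;					/* IPPROTO_TCP */
	sum += tcp_len;					/* TCP header + payload length */

	while (sum >> 16)				/* fold carries */
		sum = (sum & 0xffff) + (sum >> 16);

	return (uint16_t)sum;				/* still to be complemented */
}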
635
636 /*
637 * This routine will send an RST to the other tcp.
638 *
639 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
640 * for the reset?
641 * Answer: if a packet caused the RST, it is not for a socket
642 * existing in our system; if it is matched to a socket,
643 * it is just a duplicate segment or a bug in the other side's TCP.
644 * So we build the reply based only on the parameters
645 * that arrived with the segment.
646 * Exception: precedence violation. We do not implement it in any case.
647 */
648
649 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
650 {
651 const struct tcphdr *th = tcp_hdr(skb);
652 struct {
653 struct tcphdr th;
654 #ifdef CONFIG_TCP_MD5SIG
655 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
656 #endif
657 } rep;
658 struct ip_reply_arg arg;
659 #ifdef CONFIG_TCP_MD5SIG
660 struct tcp_md5sig_key *key = NULL;
661 const __u8 *hash_location = NULL;
662 unsigned char newhash[16];
663 int genhash;
664 struct sock *sk1 = NULL;
665 #endif
666 struct net *net;
667 struct sock *ctl_sk;
668
669 /* Never send a reset in response to a reset. */
670 if (th->rst)
671 return;
672
673 /* If sk is not NULL, it means we did a successful lookup and the incoming
674 * route had to be correct. prequeue might have dropped our dst.
675 */
676 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
677 return;
678
679 /* Swap the send and the receive. */
680 memset(&rep, 0, sizeof(rep));
681 rep.th.dest = th->source;
682 rep.th.source = th->dest;
683 rep.th.doff = sizeof(struct tcphdr) / 4;
684 rep.th.rst = 1;
685
686 if (th->ack) {
687 rep.th.seq = th->ack_seq;
688 } else {
689 rep.th.ack = 1;
690 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
691 skb->len - (th->doff << 2));
692 }
693
694 memset(&arg, 0, sizeof(arg));
695 arg.iov[0].iov_base = (unsigned char *)&rep;
696 arg.iov[0].iov_len = sizeof(rep.th);
697
698 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
699 #ifdef CONFIG_TCP_MD5SIG
700 rcu_read_lock();
701 hash_location = tcp_parse_md5sig_option(th);
702 if (sk && sk_fullsock(sk)) {
703 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
704 &ip_hdr(skb)->saddr, AF_INET);
705 } else if (hash_location) {
706 /*
707 * The active side is lost. Try to find the listening socket through the
708 * source port, and then find the md5 key through the listening socket.
709 * We do not loosen security here:
710 * the incoming packet is checked with the md5 hash of the found key, and
711 * no RST is generated if the md5 hash doesn't match.
712 */
713 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
714 ip_hdr(skb)->saddr,
715 th->source, ip_hdr(skb)->daddr,
716 ntohs(th->source), inet_iif(skb),
717 tcp_v4_sdif(skb));
718 /* don't send rst if it can't find key */
719 if (!sk1)
720 goto out;
721
722 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
723 &ip_hdr(skb)->saddr, AF_INET);
724 if (!key)
725 goto out;
726
727
728 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
729 if (genhash || memcmp(hash_location, newhash, 16) != 0)
730 goto out;
731
732 }
733
734 if (key) {
735 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
736 (TCPOPT_NOP << 16) |
737 (TCPOPT_MD5SIG << 8) |
738 TCPOLEN_MD5SIG);
739 /* Update length and the length the header thinks exists */
740 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
741 rep.th.doff = arg.iov[0].iov_len / 4;
742
743 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
744 key, ip_hdr(skb)->saddr,
745 ip_hdr(skb)->daddr, &rep.th);
746 }
747 #endif
748 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
749 ip_hdr(skb)->saddr, /* XXX */
750 arg.iov[0].iov_len, IPPROTO_TCP, 0);
751 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
752 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
753
754 /* When the socket is gone, all binding information is lost.
755 * Routing might fail in this case. No choice here: if we choose to force the
756 * input interface, we will misroute in case of an asymmetric route.
757 */
758 if (sk) {
759 arg.bound_dev_if = sk->sk_bound_dev_if;
760 if (sk_fullsock(sk))
761 trace_tcp_send_reset(sk, skb);
762 }
763
764 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
765 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
766
767 arg.tos = ip_hdr(skb)->tos;
768 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
769 local_bh_disable();
770 ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
771 if (sk)
772 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
773 inet_twsk(sk)->tw_mark : sk->sk_mark;
774 ip_send_unicast_reply(ctl_sk,
775 skb, &TCP_SKB_CB(skb)->header.h4.opt,
776 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
777 &arg, arg.iov[0].iov_len);
778
779 ctl_sk->sk_mark = 0;
780 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
781 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
782 local_bh_enable();
783
784 #ifdef CONFIG_TCP_MD5SIG
785 out:
786 rcu_read_unlock();
787 #endif
788 }
789
790 /* The code below, sending ACKs in SYN-RECV and TIME-WAIT states
791 outside socket context, is ugly, certainly. What can I do?
792 */
793
794 static void tcp_v4_send_ack(const struct sock *sk,
795 struct sk_buff *skb, u32 seq, u32 ack,
796 u32 win, u32 tsval, u32 tsecr, int oif,
797 struct tcp_md5sig_key *key,
798 int reply_flags, u8 tos)
799 {
800 const struct tcphdr *th = tcp_hdr(skb);
801 struct {
802 struct tcphdr th;
803 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
804 #ifdef CONFIG_TCP_MD5SIG
805 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
806 #endif
807 ];
808 } rep;
809 struct net *net = sock_net(sk);
810 struct ip_reply_arg arg;
811 struct sock *ctl_sk;
812
813 memset(&rep.th, 0, sizeof(struct tcphdr));
814 memset(&arg, 0, sizeof(arg));
815
816 arg.iov[0].iov_base = (unsigned char *)&rep;
817 arg.iov[0].iov_len = sizeof(rep.th);
818 if (tsecr) {
819 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
820 (TCPOPT_TIMESTAMP << 8) |
821 TCPOLEN_TIMESTAMP);
822 rep.opt[1] = htonl(tsval);
823 rep.opt[2] = htonl(tsecr);
824 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
825 }
826
827 /* Swap the send and the receive. */
828 rep.th.dest = th->source;
829 rep.th.source = th->dest;
830 rep.th.doff = arg.iov[0].iov_len / 4;
831 rep.th.seq = htonl(seq);
832 rep.th.ack_seq = htonl(ack);
833 rep.th.ack = 1;
834 rep.th.window = htons(win);
835
836 #ifdef CONFIG_TCP_MD5SIG
837 if (key) {
838 int offset = (tsecr) ? 3 : 0;
839
840 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
841 (TCPOPT_NOP << 16) |
842 (TCPOPT_MD5SIG << 8) |
843 TCPOLEN_MD5SIG);
844 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
845 rep.th.doff = arg.iov[0].iov_len/4;
846
847 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
848 key, ip_hdr(skb)->saddr,
849 ip_hdr(skb)->daddr, &rep.th);
850 }
851 #endif
852 arg.flags = reply_flags;
853 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
854 ip_hdr(skb)->saddr, /* XXX */
855 arg.iov[0].iov_len, IPPROTO_TCP, 0);
856 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
857 if (oif)
858 arg.bound_dev_if = oif;
859 arg.tos = tos;
860 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
861 local_bh_disable();
862 ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
863 if (sk)
864 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
865 inet_twsk(sk)->tw_mark : sk->sk_mark;
866 ip_send_unicast_reply(ctl_sk,
867 skb, &TCP_SKB_CB(skb)->header.h4.opt,
868 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
869 &arg, arg.iov[0].iov_len);
870
871 ctl_sk->sk_mark = 0;
872 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
873 local_bh_enable();
874 }
875
876 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
877 {
878 struct inet_timewait_sock *tw = inet_twsk(sk);
879 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
880
881 tcp_v4_send_ack(sk, skb,
882 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
883 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
884 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
885 tcptw->tw_ts_recent,
886 tw->tw_bound_dev_if,
887 tcp_twsk_md5_key(tcptw),
888 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
889 tw->tw_tos
890 );
891
892 inet_twsk_put(tw);
893 }
894
895 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
896 struct request_sock *req)
897 {
898 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
899 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
900 */
901 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
902 tcp_sk(sk)->snd_nxt;
903
904 /* RFC 7323 2.3
905 * The window field (SEG.WND) of every outgoing segment, with the
906 * exception of <SYN> segments, MUST be right-shifted by
907 * Rcv.Wind.Shift bits:
908 */
909 tcp_v4_send_ack(sk, skb, seq,
910 tcp_rsk(req)->rcv_nxt,
911 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
912 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
913 req->ts_recent,
914 0,
915 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
916 AF_INET),
917 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
918 ip_hdr(skb)->tos);
919 }
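/* Editor's worked example for the RFC 7323 rule quoted above (numbers are
 * assumed for illustration): with req->rsk_rcv_wnd == 262144 and
 * rcv_wscale == 3, the advertised SEG.WND is 262144 >> 3 == 32768; the
 * peer multiplies it back by 2^3 to recover the real 256 KB window.
 */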
920
921 /*
922 * Send a SYN-ACK after having received a SYN.
923 * This still operates on a request_sock only, not on a big
924 * socket.
925 */
926 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
927 struct flowi *fl,
928 struct request_sock *req,
929 struct tcp_fastopen_cookie *foc,
930 enum tcp_synack_type synack_type)
931 {
932 const struct inet_request_sock *ireq = inet_rsk(req);
933 struct flowi4 fl4;
934 int err = -1;
935 struct sk_buff *skb;
936
937 /* First, grab a route. */
938 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
939 return -1;
940
941 skb = tcp_make_synack(sk, dst, req, foc, synack_type);
942
943 if (skb) {
944 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
945
946 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
947 ireq->ir_rmt_addr,
948 ireq_opt_deref(ireq));
949 err = net_xmit_eval(err);
950 }
951
952 return err;
953 }
954
955 /*
956 * IPv4 request_sock destructor.
957 */
958 static void tcp_v4_reqsk_destructor(struct request_sock *req)
959 {
960 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
961 }
962
963 #ifdef CONFIG_TCP_MD5SIG
964 /*
965 * RFC2385 MD5 checksumming requires a mapping of
966 * IP address->MD5 Key.
967 * We need to maintain these in the sk structure.
968 */
969
970 /* Find the Key structure for an address. */
971 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
972 const union tcp_md5_addr *addr,
973 int family)
974 {
975 const struct tcp_sock *tp = tcp_sk(sk);
976 struct tcp_md5sig_key *key;
977 const struct tcp_md5sig_info *md5sig;
978 __be32 mask;
979 struct tcp_md5sig_key *best_match = NULL;
980 bool match;
981
982 /* caller either holds rcu_read_lock() or socket lock */
983 md5sig = rcu_dereference_check(tp->md5sig_info,
984 lockdep_sock_is_held(sk));
985 if (!md5sig)
986 return NULL;
987
988 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
989 if (key->family != family)
990 continue;
991
992 if (family == AF_INET) {
993 mask = inet_make_mask(key->prefixlen);
994 match = (key->addr.a4.s_addr & mask) ==
995 (addr->a4.s_addr & mask);
996 #if IS_ENABLED(CONFIG_IPV6)
997 } else if (family == AF_INET6) {
998 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
999 key->prefixlen);
1000 #endif
1001 } else {
1002 match = false;
1003 }
1004
1005 if (match && (!best_match ||
1006 key->prefixlen > best_match->prefixlen))
1007 best_match = key;
1008 }
1009 return best_match;
1010 }
1011 EXPORT_SYMBOL(tcp_md5_do_lookup);
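/* Editor's sketch (illustrative, userspace-style, not kernel code): the
 * longest-prefix-match idea used by tcp_md5_do_lookup() above, reduced to
 * plain IPv4 masks in host byte order.  Names are made up for the example.
 */
#include <stdint.h>

static uint32_t v4_prefix_mask(unsigned int prefixlen)
{
	/* prefixlen 0 must yield an all-zero mask, like inet_make_mask() */
	return prefixlen ? ~0u << (32 - prefixlen) : 0;
}

static int v4_prefix_match(uint32_t key_addr, uint32_t addr,
			   unsigned int prefixlen)
{
	uint32_t mask = v4_prefix_mask(prefixlen);

	/* the best match is then the matching key with the longest prefix */
	return (key_addr & mask) == (addr & mask);
}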
1012
1013 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1014 const union tcp_md5_addr *addr,
1015 int family, u8 prefixlen)
1016 {
1017 const struct tcp_sock *tp = tcp_sk(sk);
1018 struct tcp_md5sig_key *key;
1019 unsigned int size = sizeof(struct in_addr);
1020 const struct tcp_md5sig_info *md5sig;
1021
1022 /* caller either holds rcu_read_lock() or socket lock */
1023 md5sig = rcu_dereference_check(tp->md5sig_info,
1024 lockdep_sock_is_held(sk));
1025 if (!md5sig)
1026 return NULL;
1027 #if IS_ENABLED(CONFIG_IPV6)
1028 if (family == AF_INET6)
1029 size = sizeof(struct in6_addr);
1030 #endif
1031 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1032 if (key->family != family)
1033 continue;
1034 if (!memcmp(&key->addr, addr, size) &&
1035 key->prefixlen == prefixlen)
1036 return key;
1037 }
1038 return NULL;
1039 }
1040
1041 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1042 const struct sock *addr_sk)
1043 {
1044 const union tcp_md5_addr *addr;
1045
1046 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1047 return tcp_md5_do_lookup(sk, addr, AF_INET);
1048 }
1049 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1050
1051 /* This can be called on a newly created socket, from other files */
1052 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1053 int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1054 gfp_t gfp)
1055 {
1056 /* Add Key to the list */
1057 struct tcp_md5sig_key *key;
1058 struct tcp_sock *tp = tcp_sk(sk);
1059 struct tcp_md5sig_info *md5sig;
1060
1061 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1062 if (key) {
1063 /* Pre-existing entry - just update that one. */
1064 memcpy(key->key, newkey, newkeylen);
1065 key->keylen = newkeylen;
1066 return 0;
1067 }
1068
1069 md5sig = rcu_dereference_protected(tp->md5sig_info,
1070 lockdep_sock_is_held(sk));
1071 if (!md5sig) {
1072 md5sig = kmalloc(sizeof(*md5sig), gfp);
1073 if (!md5sig)
1074 return -ENOMEM;
1075
1076 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1077 INIT_HLIST_HEAD(&md5sig->head);
1078 rcu_assign_pointer(tp->md5sig_info, md5sig);
1079 }
1080
1081 key = sock_kmalloc(sk, sizeof(*key), gfp);
1082 if (!key)
1083 return -ENOMEM;
1084 if (!tcp_alloc_md5sig_pool()) {
1085 sock_kfree_s(sk, key, sizeof(*key));
1086 return -ENOMEM;
1087 }
1088
1089 memcpy(key->key, newkey, newkeylen);
1090 key->keylen = newkeylen;
1091 key->family = family;
1092 key->prefixlen = prefixlen;
1093 memcpy(&key->addr, addr,
1094 (family == AF_INET6) ? sizeof(struct in6_addr) :
1095 sizeof(struct in_addr));
1096 hlist_add_head_rcu(&key->node, &md5sig->head);
1097 return 0;
1098 }
1099 EXPORT_SYMBOL(tcp_md5_do_add);
1100
1101 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1102 u8 prefixlen)
1103 {
1104 struct tcp_md5sig_key *key;
1105
1106 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1107 if (!key)
1108 return -ENOENT;
1109 hlist_del_rcu(&key->node);
1110 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1111 kfree_rcu(key, rcu);
1112 return 0;
1113 }
1114 EXPORT_SYMBOL(tcp_md5_do_del);
1115
1116 static void tcp_clear_md5_list(struct sock *sk)
1117 {
1118 struct tcp_sock *tp = tcp_sk(sk);
1119 struct tcp_md5sig_key *key;
1120 struct hlist_node *n;
1121 struct tcp_md5sig_info *md5sig;
1122
1123 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1124
1125 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1126 hlist_del_rcu(&key->node);
1127 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1128 kfree_rcu(key, rcu);
1129 }
1130 }
1131
1132 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1133 char __user *optval, int optlen)
1134 {
1135 struct tcp_md5sig cmd;
1136 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1137 u8 prefixlen = 32;
1138
1139 if (optlen < sizeof(cmd))
1140 return -EINVAL;
1141
1142 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1143 return -EFAULT;
1144
1145 if (sin->sin_family != AF_INET)
1146 return -EINVAL;
1147
1148 if (optname == TCP_MD5SIG_EXT &&
1149 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1150 prefixlen = cmd.tcpm_prefixlen;
1151 if (prefixlen > 32)
1152 return -EINVAL;
1153 }
1154
1155 if (!cmd.tcpm_keylen)
1156 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1157 AF_INET, prefixlen);
1158
1159 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1160 return -EINVAL;
1161
1162 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1163 AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1164 GFP_KERNEL);
1165 }
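/* Editor's usage sketch (userspace, not part of this file): installing an
 * MD5 key with the TCP_MD5SIG socket option that tcp_v4_parse_md5_keys()
 * above parses.  Assumes struct tcp_md5sig and TCP_MD5SIG are available
 * from the system's UAPI headers; error handling is omitted for brevity.
 */
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>		/* struct tcp_md5sig, TCP_MD5SIG */

static int set_tcp_md5_key(int fd, const struct sockaddr_in *peer,
			   const void *key, int keylen)
{
	struct tcp_md5sig md5;

	memset(&md5, 0, sizeof(md5));
	memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
	md5.tcpm_keylen = keylen;
	memcpy(md5.tcpm_key, key, keylen);	/* keylen <= TCP_MD5SIG_MAXKEYLEN */

	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}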
1166
1167 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1168 __be32 daddr, __be32 saddr,
1169 const struct tcphdr *th, int nbytes)
1170 {
1171 struct tcp4_pseudohdr *bp;
1172 struct scatterlist sg;
1173 struct tcphdr *_th;
1174
1175 bp = hp->scratch;
1176 bp->saddr = saddr;
1177 bp->daddr = daddr;
1178 bp->pad = 0;
1179 bp->protocol = IPPROTO_TCP;
1180 bp->len = cpu_to_be16(nbytes);
1181
1182 _th = (struct tcphdr *)(bp + 1);
1183 memcpy(_th, th, sizeof(*th));
1184 _th->check = 0;
1185
1186 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1187 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1188 sizeof(*bp) + sizeof(*th));
1189 return crypto_ahash_update(hp->md5_req);
1190 }
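/* Editor's note (not part of the original file): per RFC 2385 the digest is
 * computed, in order, over (1) the IPv4 pseudo-header {saddr, daddr, 0,
 * IPPROTO_TCP, segment length} built above, (2) the fixed TCP header with
 * its checksum field zeroed (options are excluded), (3) the TCP payload and
 * (4) the connection key - which is exactly the tcp_v4_md5_hash_headers() /
 * tcp_md5_hash_skb_data() / tcp_md5_hash_key() call order used in
 * tcp_v4_md5_hash_skb() below.
 */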
1191
1192 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1193 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1194 {
1195 struct tcp_md5sig_pool *hp;
1196 struct ahash_request *req;
1197
1198 hp = tcp_get_md5sig_pool();
1199 if (!hp)
1200 goto clear_hash_noput;
1201 req = hp->md5_req;
1202
1203 if (crypto_ahash_init(req))
1204 goto clear_hash;
1205 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1206 goto clear_hash;
1207 if (tcp_md5_hash_key(hp, key))
1208 goto clear_hash;
1209 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1210 if (crypto_ahash_final(req))
1211 goto clear_hash;
1212
1213 tcp_put_md5sig_pool();
1214 return 0;
1215
1216 clear_hash:
1217 tcp_put_md5sig_pool();
1218 clear_hash_noput:
1219 memset(md5_hash, 0, 16);
1220 return 1;
1221 }
1222
1223 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1224 const struct sock *sk,
1225 const struct sk_buff *skb)
1226 {
1227 struct tcp_md5sig_pool *hp;
1228 struct ahash_request *req;
1229 const struct tcphdr *th = tcp_hdr(skb);
1230 __be32 saddr, daddr;
1231
1232 if (sk) { /* valid for establish/request sockets */
1233 saddr = sk->sk_rcv_saddr;
1234 daddr = sk->sk_daddr;
1235 } else {
1236 const struct iphdr *iph = ip_hdr(skb);
1237 saddr = iph->saddr;
1238 daddr = iph->daddr;
1239 }
1240
1241 hp = tcp_get_md5sig_pool();
1242 if (!hp)
1243 goto clear_hash_noput;
1244 req = hp->md5_req;
1245
1246 if (crypto_ahash_init(req))
1247 goto clear_hash;
1248
1249 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1250 goto clear_hash;
1251 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1252 goto clear_hash;
1253 if (tcp_md5_hash_key(hp, key))
1254 goto clear_hash;
1255 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1256 if (crypto_ahash_final(req))
1257 goto clear_hash;
1258
1259 tcp_put_md5sig_pool();
1260 return 0;
1261
1262 clear_hash:
1263 tcp_put_md5sig_pool();
1264 clear_hash_noput:
1265 memset(md5_hash, 0, 16);
1266 return 1;
1267 }
1268 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1269
1270 #endif
1271
1272 /* Called with rcu_read_lock() */
1273 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1274 const struct sk_buff *skb)
1275 {
1276 #ifdef CONFIG_TCP_MD5SIG
1277 /*
1278 * This gets called for each TCP segment that arrives
1279 * so we want to be efficient.
1280 * We have 3 drop cases:
1281 * o No MD5 hash and one expected.
1282 * o MD5 hash and we're not expecting one.
1283 * o MD5 hash and it's wrong.
1284 */
1285 const __u8 *hash_location = NULL;
1286 struct tcp_md5sig_key *hash_expected;
1287 const struct iphdr *iph = ip_hdr(skb);
1288 const struct tcphdr *th = tcp_hdr(skb);
1289 int genhash;
1290 unsigned char newhash[16];
1291
1292 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1293 AF_INET);
1294 hash_location = tcp_parse_md5sig_option(th);
1295
1296 /* We've parsed the options - do we have a hash? */
1297 if (!hash_expected && !hash_location)
1298 return false;
1299
1300 if (hash_expected && !hash_location) {
1301 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1302 return true;
1303 }
1304
1305 if (!hash_expected && hash_location) {
1306 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1307 return true;
1308 }
1309
1310 /* Okay, so this is hash_expected and hash_location -
1311 * so we need to calculate the checksum.
1312 */
1313 genhash = tcp_v4_md5_hash_skb(newhash,
1314 hash_expected,
1315 NULL, skb);
1316
1317 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1318 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1319 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1320 &iph->saddr, ntohs(th->source),
1321 &iph->daddr, ntohs(th->dest),
1322 genhash ? " tcp_v4_calc_md5_hash failed"
1323 : "");
1324 return true;
1325 }
1326 return false;
1327 #endif
1328 return false;
1329 }
1330
1331 static void tcp_v4_init_req(struct request_sock *req,
1332 const struct sock *sk_listener,
1333 struct sk_buff *skb)
1334 {
1335 struct inet_request_sock *ireq = inet_rsk(req);
1336 struct net *net = sock_net(sk_listener);
1337
1338 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1339 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1340 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1341 }
1342
1343 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1344 struct flowi *fl,
1345 const struct request_sock *req)
1346 {
1347 return inet_csk_route_req(sk, &fl->u.ip4, req);
1348 }
1349
1350 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1351 .family = PF_INET,
1352 .obj_size = sizeof(struct tcp_request_sock),
1353 .rtx_syn_ack = tcp_rtx_synack,
1354 .send_ack = tcp_v4_reqsk_send_ack,
1355 .destructor = tcp_v4_reqsk_destructor,
1356 .send_reset = tcp_v4_send_reset,
1357 .syn_ack_timeout = tcp_syn_ack_timeout,
1358 };
1359
1360 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1361 .mss_clamp = TCP_MSS_DEFAULT,
1362 #ifdef CONFIG_TCP_MD5SIG
1363 .req_md5_lookup = tcp_v4_md5_lookup,
1364 .calc_md5_hash = tcp_v4_md5_hash_skb,
1365 #endif
1366 .init_req = tcp_v4_init_req,
1367 #ifdef CONFIG_SYN_COOKIES
1368 .cookie_init_seq = cookie_v4_init_sequence,
1369 #endif
1370 .route_req = tcp_v4_route_req,
1371 .init_seq = tcp_v4_init_seq,
1372 .init_ts_off = tcp_v4_init_ts_off,
1373 .send_synack = tcp_v4_send_synack,
1374 };
1375
1376 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1377 {
1378 /* Never answer SYNs sent to broadcast or multicast */
1379 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1380 goto drop;
1381
1382 return tcp_conn_request(&tcp_request_sock_ops,
1383 &tcp_request_sock_ipv4_ops, sk, skb);
1384
1385 drop:
1386 tcp_listendrop(sk);
1387 return 0;
1388 }
1389 EXPORT_SYMBOL(tcp_v4_conn_request);
1390
1391
1392 /*
1393 * The three way handshake has completed - we got a valid synack -
1394 * now create the new socket.
1395 */
1396 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1397 struct request_sock *req,
1398 struct dst_entry *dst,
1399 struct request_sock *req_unhash,
1400 bool *own_req)
1401 {
1402 struct inet_request_sock *ireq;
1403 struct inet_sock *newinet;
1404 struct tcp_sock *newtp;
1405 struct sock *newsk;
1406 #ifdef CONFIG_TCP_MD5SIG
1407 struct tcp_md5sig_key *key;
1408 #endif
1409 struct ip_options_rcu *inet_opt;
1410
1411 if (sk_acceptq_is_full(sk))
1412 goto exit_overflow;
1413
1414 newsk = tcp_create_openreq_child(sk, req, skb);
1415 if (!newsk)
1416 goto exit_nonewsk;
1417
1418 newsk->sk_gso_type = SKB_GSO_TCPV4;
1419 inet_sk_rx_dst_set(newsk, skb);
1420
1421 newtp = tcp_sk(newsk);
1422 newinet = inet_sk(newsk);
1423 ireq = inet_rsk(req);
1424 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1425 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1426 newsk->sk_bound_dev_if = ireq->ir_iif;
1427 newinet->inet_saddr = ireq->ir_loc_addr;
1428 inet_opt = rcu_dereference(ireq->ireq_opt);
1429 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1430 newinet->mc_index = inet_iif(skb);
1431 newinet->mc_ttl = ip_hdr(skb)->ttl;
1432 newinet->rcv_tos = ip_hdr(skb)->tos;
1433 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1434 if (inet_opt)
1435 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1436 newinet->inet_id = newtp->write_seq ^ jiffies;
1437
1438 if (!dst) {
1439 dst = inet_csk_route_child_sock(sk, newsk, req);
1440 if (!dst)
1441 goto put_and_exit;
1442 } else {
1443 /* syncookie case : see end of cookie_v4_check() */
1444 }
1445 sk_setup_caps(newsk, dst);
1446
1447 tcp_ca_openreq_child(newsk, dst);
1448
1449 tcp_sync_mss(newsk, dst_mtu(dst));
1450 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1451
1452 tcp_initialize_rcv_mss(newsk);
1453
1454 #ifdef CONFIG_TCP_MD5SIG
1455 /* Copy over the MD5 key from the original socket */
1456 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1457 AF_INET);
1458 if (key) {
1459 /*
1460 * We're using one, so create a matching key
1461 * on the newsk structure. If we fail to get
1462 * memory, then we end up not copying the key
1463 * across. Shucks.
1464 */
1465 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1466 AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1467 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1468 }
1469 #endif
1470
1471 if (__inet_inherit_port(sk, newsk) < 0)
1472 goto put_and_exit;
1473 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1474 if (likely(*own_req)) {
1475 tcp_move_syn(newtp, req);
1476 ireq->ireq_opt = NULL;
1477 } else {
1478 newinet->inet_opt = NULL;
1479 }
1480 return newsk;
1481
1482 exit_overflow:
1483 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1484 exit_nonewsk:
1485 dst_release(dst);
1486 exit:
1487 tcp_listendrop(sk);
1488 return NULL;
1489 put_and_exit:
1490 newinet->inet_opt = NULL;
1491 inet_csk_prepare_forced_close(newsk);
1492 tcp_done(newsk);
1493 goto exit;
1494 }
1495 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1496
1497 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1498 {
1499 #ifdef CONFIG_SYN_COOKIES
1500 const struct tcphdr *th = tcp_hdr(skb);
1501
1502 if (!th->syn)
1503 sk = cookie_v4_check(sk, skb);
1504 #endif
1505 return sk;
1506 }
1507
1508 /* The socket must have its spinlock held when we get
1509 * here, unless it is a TCP_LISTEN socket.
1510 *
1511 * We have a potential double-lock case here, so even when
1512 * doing backlog processing we use the BH locking scheme.
1513 * This is because we cannot sleep with the original spinlock
1514 * held.
1515 */
1516 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1517 {
1518 struct sock *rsk;
1519
1520 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1521 struct dst_entry *dst = sk->sk_rx_dst;
1522
1523 sock_rps_save_rxhash(sk, skb);
1524 sk_mark_napi_id(sk, skb);
1525 if (dst) {
1526 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1527 !dst->ops->check(dst, 0)) {
1528 dst_release(dst);
1529 sk->sk_rx_dst = NULL;
1530 }
1531 }
1532 tcp_rcv_established(sk, skb);
1533 return 0;
1534 }
1535
1536 if (tcp_checksum_complete(skb))
1537 goto csum_err;
1538
1539 if (sk->sk_state == TCP_LISTEN) {
1540 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1541
1542 if (!nsk)
1543 goto discard;
1544 if (nsk != sk) {
1545 if (tcp_child_process(sk, nsk, skb)) {
1546 rsk = nsk;
1547 goto reset;
1548 }
1549 return 0;
1550 }
1551 } else
1552 sock_rps_save_rxhash(sk, skb);
1553
1554 if (tcp_rcv_state_process(sk, skb)) {
1555 rsk = sk;
1556 goto reset;
1557 }
1558 return 0;
1559
1560 reset:
1561 tcp_v4_send_reset(rsk, skb);
1562 discard:
1563 kfree_skb(skb);
1564 /* Be careful here. If this function gets more complicated and
1565 * gcc suffers from register pressure on the x86, sk (in %ebx)
1566 * might be destroyed here. This current version compiles correctly,
1567 * but you have been warned.
1568 */
1569 return 0;
1570
1571 csum_err:
1572 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1573 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1574 goto discard;
1575 }
1576 EXPORT_SYMBOL(tcp_v4_do_rcv);
1577
1578 int tcp_v4_early_demux(struct sk_buff *skb)
1579 {
1580 const struct iphdr *iph;
1581 const struct tcphdr *th;
1582 struct sock *sk;
1583
1584 if (skb->pkt_type != PACKET_HOST)
1585 return 0;
1586
1587 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1588 return 0;
1589
1590 iph = ip_hdr(skb);
1591 th = tcp_hdr(skb);
1592
1593 if (th->doff < sizeof(struct tcphdr) / 4)
1594 return 0;
1595
1596 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1597 iph->saddr, th->source,
1598 iph->daddr, ntohs(th->dest),
1599 skb->skb_iif, inet_sdif(skb));
1600 if (sk) {
1601 skb->sk = sk;
1602 skb->destructor = sock_edemux;
1603 if (sk_fullsock(sk)) {
1604 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1605
1606 if (dst)
1607 dst = dst_check(dst, 0);
1608 if (dst &&
1609 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1610 skb_dst_set_noref(skb, dst);
1611 }
1612 }
1613 return 0;
1614 }
1615
1616 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1617 {
1618 u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1619
1620 /* Only the socket owner can try to collapse/prune rx queues
1621 * to reduce memory overhead, so add a little headroom here.
1622 * Few socket backlogs are likely to be non-empty concurrently.
1623 */
1624 limit += 64*1024;
1625
1626 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1627 * we can fix skb->truesize to its real value to avoid future drops.
1628 * This is valid because skb is not yet charged to the socket.
1629 * It has been noticed that pure SACK packets were sometimes dropped
1630 * (if cooked by drivers without copybreak feature).
1631 */
1632 skb_condense(skb);
1633
1634 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1635 bh_unlock_sock(sk);
1636 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1637 return true;
1638 }
1639 return false;
1640 }
1641 EXPORT_SYMBOL(tcp_add_backlog);
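/* Editor's worked example for the backlog limit above (assumed, typical
 * values): with sk_rcvbuf == 131072 and sk_sndbuf == 87040 the limit is
 * 131072 + 87040 + 65536 = 283648 bytes of skb truesize before
 * sk_add_backlog() starts dropping and TCPBacklogDrop is incremented.
 */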
1642
1643 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1644 {
1645 struct tcphdr *th = (struct tcphdr *)skb->data;
1646 unsigned int eaten = skb->len;
1647 int err;
1648
1649 err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1650 if (!err) {
1651 eaten -= skb->len;
1652 TCP_SKB_CB(skb)->end_seq -= eaten;
1653 }
1654 return err;
1655 }
1656 EXPORT_SYMBOL(tcp_filter);
1657
1658 static void tcp_v4_restore_cb(struct sk_buff *skb)
1659 {
1660 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1661 sizeof(struct inet_skb_parm));
1662 }
1663
1664 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1665 const struct tcphdr *th)
1666 {
1667 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1668 * barrier() makes sure the compiler won't play fool^Waliasing games.
1669 */
1670 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1671 sizeof(struct inet_skb_parm));
1672 barrier();
1673
1674 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1675 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1676 skb->len - th->doff * 4);
1677 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1678 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1679 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1680 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1681 TCP_SKB_CB(skb)->sacked = 0;
1682 TCP_SKB_CB(skb)->has_rxtstamp =
1683 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1684 }
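/* Editor's sketch (illustrative, userspace-style, not kernel code): the
 * sequence-space accounting done by tcp_v4_fill_cb() above.  SYN and FIN
 * each consume one unit of sequence space on top of the payload bytes, so
 * end_seq = seq + syn + fin + (segment length - header length).
 */
#include <stdint.h>

static uint32_t tcp_end_seq(uint32_t seq, int syn, int fin,
			    uint32_t skb_len, uint32_t doff)
{
	/* doff counts 32-bit words, exactly like th->doff above */
	return seq + syn + fin + (skb_len - doff * 4);
}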
1685
1686 /*
1687 * From tcp_input.c
1688 */
1689
1690 int tcp_v4_rcv(struct sk_buff *skb)
1691 {
1692 struct net *net = dev_net(skb->dev);
1693 int sdif = inet_sdif(skb);
1694 const struct iphdr *iph;
1695 const struct tcphdr *th;
1696 bool refcounted;
1697 struct sock *sk;
1698 int ret;
1699
1700 if (skb->pkt_type != PACKET_HOST)
1701 goto discard_it;
1702
1703 /* Count it even if it's bad */
1704 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1705
1706 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1707 goto discard_it;
1708
1709 th = (const struct tcphdr *)skb->data;
1710
1711 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1712 goto bad_packet;
1713 if (!pskb_may_pull(skb, th->doff * 4))
1714 goto discard_it;
1715
1716 /* An explanation is required here, I think.
1717 * Packet length and doff are validated by header prediction,
1718 * provided the case of th->doff==0 is eliminated.
1719 * So, we defer the checks. */
1720
1721 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1722 goto csum_error;
1723
1724 th = (const struct tcphdr *)skb->data;
1725 iph = ip_hdr(skb);
1726 lookup:
1727 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1728 th->dest, sdif, &refcounted);
1729 if (!sk)
1730 goto no_tcp_socket;
1731
1732 process:
1733 if (sk->sk_state == TCP_TIME_WAIT)
1734 goto do_time_wait;
1735
1736 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1737 struct request_sock *req = inet_reqsk(sk);
1738 bool req_stolen = false;
1739 struct sock *nsk;
1740
1741 sk = req->rsk_listener;
1742 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1743 sk_drops_add(sk, skb);
1744 reqsk_put(req);
1745 goto discard_it;
1746 }
1747 if (tcp_checksum_complete(skb)) {
1748 reqsk_put(req);
1749 goto csum_error;
1750 }
1751 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1752 inet_csk_reqsk_queue_drop_and_put(sk, req);
1753 goto lookup;
1754 }
1755 /* We own a reference on the listener, increase it again
1756 * as we might lose it too soon.
1757 */
1758 sock_hold(sk);
1759 refcounted = true;
1760 nsk = NULL;
1761 if (!tcp_filter(sk, skb)) {
1762 th = (const struct tcphdr *)skb->data;
1763 iph = ip_hdr(skb);
1764 tcp_v4_fill_cb(skb, iph, th);
1765 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1766 }
1767 if (!nsk) {
1768 reqsk_put(req);
1769 if (req_stolen) {
1770 /* Another cpu got exclusive access to req
1771 * and created a full blown socket.
1772 * Try to feed this packet to this socket
1773 * instead of discarding it.
1774 */
1775 tcp_v4_restore_cb(skb);
1776 sock_put(sk);
1777 goto lookup;
1778 }
1779 goto discard_and_relse;
1780 }
1781 if (nsk == sk) {
1782 reqsk_put(req);
1783 tcp_v4_restore_cb(skb);
1784 } else if (tcp_child_process(sk, nsk, skb)) {
1785 tcp_v4_send_reset(nsk, skb);
1786 goto discard_and_relse;
1787 } else {
1788 sock_put(sk);
1789 return 0;
1790 }
1791 }
1792 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1793 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1794 goto discard_and_relse;
1795 }
1796
1797 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1798 goto discard_and_relse;
1799
1800 if (tcp_v4_inbound_md5_hash(sk, skb))
1801 goto discard_and_relse;
1802
1803 nf_reset(skb);
1804
1805 if (tcp_filter(sk, skb))
1806 goto discard_and_relse;
1807 th = (const struct tcphdr *)skb->data;
1808 iph = ip_hdr(skb);
1809 tcp_v4_fill_cb(skb, iph, th);
1810
1811 skb->dev = NULL;
1812
1813 if (sk->sk_state == TCP_LISTEN) {
1814 ret = tcp_v4_do_rcv(sk, skb);
1815 goto put_and_return;
1816 }
1817
1818 sk_incoming_cpu_update(sk);
1819
1820 bh_lock_sock_nested(sk);
1821 tcp_segs_in(tcp_sk(sk), skb);
1822 ret = 0;
1823 if (!sock_owned_by_user(sk)) {
1824 ret = tcp_v4_do_rcv(sk, skb);
1825 } else if (tcp_add_backlog(sk, skb)) {
1826 goto discard_and_relse;
1827 }
1828 bh_unlock_sock(sk);
1829
1830 put_and_return:
1831 if (refcounted)
1832 sock_put(sk);
1833
1834 return ret;
1835
1836 no_tcp_socket:
1837 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1838 goto discard_it;
1839
1840 tcp_v4_fill_cb(skb, iph, th);
1841
1842 if (tcp_checksum_complete(skb)) {
1843 csum_error:
1844 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1845 bad_packet:
1846 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1847 } else {
1848 tcp_v4_send_reset(NULL, skb);
1849 }
1850
1851 discard_it:
1852 /* Discard frame. */
1853 kfree_skb(skb);
1854 return 0;
1855
1856 discard_and_relse:
1857 sk_drops_add(sk, skb);
1858 if (refcounted)
1859 sock_put(sk);
1860 goto discard_it;
1861
1862 do_time_wait:
1863 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1864 inet_twsk_put(inet_twsk(sk));
1865 goto discard_it;
1866 }
1867
1868 tcp_v4_fill_cb(skb, iph, th);
1869
1870 if (tcp_checksum_complete(skb)) {
1871 inet_twsk_put(inet_twsk(sk));
1872 goto csum_error;
1873 }
1874 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1875 case TCP_TW_SYN: {
1876 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1877 &tcp_hashinfo, skb,
1878 __tcp_hdrlen(th),
1879 iph->saddr, th->source,
1880 iph->daddr, th->dest,
1881 inet_iif(skb),
1882 sdif);
1883 if (sk2) {
1884 inet_twsk_deschedule_put(inet_twsk(sk));
1885 sk = sk2;
1886 tcp_v4_restore_cb(skb);
1887 refcounted = false;
1888 goto process;
1889 }
1890 }
1891 /* to ACK */
1892 /* fall through */
1893 case TCP_TW_ACK:
1894 tcp_v4_timewait_ack(sk, skb);
1895 break;
1896 case TCP_TW_RST:
1897 tcp_v4_send_reset(sk, skb);
1898 inet_twsk_deschedule_put(inet_twsk(sk));
1899 goto discard_it;
1900 case TCP_TW_SUCCESS:;
1901 }
1902 goto discard_it;
1903 }
1904
1905 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1906 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1907 .twsk_unique = tcp_twsk_unique,
1908 .twsk_destructor= tcp_twsk_destructor,
1909 };
1910
1911 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1912 {
1913 struct dst_entry *dst = skb_dst(skb);
1914
1915 if (dst && dst_hold_safe(dst)) {
1916 sk->sk_rx_dst = dst;
1917 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1918 }
1919 }
1920 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1921
1922 const struct inet_connection_sock_af_ops ipv4_specific = {
1923 .queue_xmit = ip_queue_xmit,
1924 .send_check = tcp_v4_send_check,
1925 .rebuild_header = inet_sk_rebuild_header,
1926 .sk_rx_dst_set = inet_sk_rx_dst_set,
1927 .conn_request = tcp_v4_conn_request,
1928 .syn_recv_sock = tcp_v4_syn_recv_sock,
1929 .net_header_len = sizeof(struct iphdr),
1930 .setsockopt = ip_setsockopt,
1931 .getsockopt = ip_getsockopt,
1932 .addr2sockaddr = inet_csk_addr2sockaddr,
1933 .sockaddr_len = sizeof(struct sockaddr_in),
1934 #ifdef CONFIG_COMPAT
1935 .compat_setsockopt = compat_ip_setsockopt,
1936 .compat_getsockopt = compat_ip_getsockopt,
1937 #endif
1938 .mtu_reduced = tcp_v4_mtu_reduced,
1939 };
1940 EXPORT_SYMBOL(ipv4_specific);
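/*
 * Illustrative sketch (not compiled): TCP core code does not call the IPv4
 * routines above directly; it dispatches through icsk->icsk_af_ops, which
 * tcp_v4_init_sock() below points at ipv4_specific.  toy_send_check() is a
 * made-up example of such a caller.
 */
#if 0
static void toy_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_connection_sock_af_ops *ops = inet_csk(sk)->icsk_af_ops;

	ops->send_check(sk, skb);	/* tcp_v4_send_check() for IPv4 sockets */
}
#endif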
1941
1942 #ifdef CONFIG_TCP_MD5SIG
1943 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1944 .md5_lookup = tcp_v4_md5_lookup,
1945 .calc_md5_hash = tcp_v4_md5_hash_skb,
1946 .md5_parse = tcp_v4_parse_md5_keys,
1947 };
1948 #endif
1949
1950 /* NOTE: A lot of fields are already zeroed by the call to
1951 * sk_alloc(), so they need not be initialized here.
1952 */
1953 static int tcp_v4_init_sock(struct sock *sk)
1954 {
1955 struct inet_connection_sock *icsk = inet_csk(sk);
1956
1957 tcp_init_sock(sk);
1958
1959 icsk->icsk_af_ops = &ipv4_specific;
1960
1961 #ifdef CONFIG_TCP_MD5SIG
1962 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1963 #endif
1964
1965 return 0;
1966 }
1967
1968 void tcp_v4_destroy_sock(struct sock *sk)
1969 {
1970 struct tcp_sock *tp = tcp_sk(sk);
1971
1972 trace_tcp_destroy_sock(sk);
1973
1974 tcp_clear_xmit_timers(sk);
1975
1976 tcp_cleanup_congestion_control(sk);
1977
1978 tcp_cleanup_ulp(sk);
1979
1980 /* Clean up the write buffer. */
1981 tcp_write_queue_purge(sk);
1982
1983 /* Check if we want to disable active TFO */
1984 tcp_fastopen_active_disable_ofo_check(sk);
1985
1986 /* Clean up our (hopefully empty) out_of_order_queue. */
1987 skb_rbtree_purge(&tp->out_of_order_queue);
1988
1989 #ifdef CONFIG_TCP_MD5SIG
1990 /* Clean up the MD5 key list, if any */
1991 if (tp->md5sig_info) {
1992 tcp_clear_md5_list(sk);
1993 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
1994 tp->md5sig_info = NULL;
1995 }
1996 #endif
1997
1998 /* Clean up a referenced TCP bind bucket. */
1999 if (inet_csk(sk)->icsk_bind_hash)
2000 inet_put_port(sk);
2001
2002 BUG_ON(tp->fastopen_rsk);
2003
2004 /* If socket is aborted during connect operation */
2005 tcp_free_fastopen_req(tp);
2006 tcp_fastopen_destroy_cipher(sk);
2007 tcp_saved_syn_free(tp);
2008
2009 sk_sockets_allocated_dec(sk);
2010 }
2011 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2012
2013 #ifdef CONFIG_PROC_FS
2014 /* Proc filesystem TCP sock list dumping. */
2015
2016 /*
2017 * Get the next listener socket following cur. If cur is NULL, get the first
2018 * socket starting from the bucket given in st->bucket; when st->bucket is
2019 * zero, the very first socket in the hash table is returned.
2020 */
2021 static void *listening_get_next(struct seq_file *seq, void *cur)
2022 {
2023 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2024 struct tcp_iter_state *st = seq->private;
2025 struct net *net = seq_file_net(seq);
2026 struct inet_listen_hashbucket *ilb;
2027 struct sock *sk = cur;
2028
2029 if (!sk) {
2030 get_head:
2031 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2032 spin_lock(&ilb->lock);
2033 sk = sk_head(&ilb->head);
2034 st->offset = 0;
2035 goto get_sk;
2036 }
2037 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2038 ++st->num;
2039 ++st->offset;
2040
2041 sk = sk_next(sk);
2042 get_sk:
2043 sk_for_each_from(sk) {
2044 if (!net_eq(sock_net(sk), net))
2045 continue;
2046 if (sk->sk_family == afinfo->family)
2047 return sk;
2048 }
2049 spin_unlock(&ilb->lock);
2050 st->offset = 0;
2051 if (++st->bucket < INET_LHTABLE_SIZE)
2052 goto get_head;
2053 return NULL;
2054 }
2055
2056 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2057 {
2058 struct tcp_iter_state *st = seq->private;
2059 void *rc;
2060
2061 st->bucket = 0;
2062 st->offset = 0;
2063 rc = listening_get_next(seq, NULL);
2064
2065 while (rc && *pos) {
2066 rc = listening_get_next(seq, rc);
2067 --*pos;
2068 }
2069 return rc;
2070 }
2071
2072 static inline bool empty_bucket(const struct tcp_iter_state *st)
2073 {
2074 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2075 }
2076
2077 /*
2078 * Get first established socket starting from bucket given in st->bucket.
2079 * If st->bucket is zero, the very first socket in the hash is returned.
2080 */
2081 static void *established_get_first(struct seq_file *seq)
2082 {
2083 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2084 struct tcp_iter_state *st = seq->private;
2085 struct net *net = seq_file_net(seq);
2086 void *rc = NULL;
2087
2088 st->offset = 0;
2089 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2090 struct sock *sk;
2091 struct hlist_nulls_node *node;
2092 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2093
2094 /* Lockless fast path for the common case of empty buckets */
2095 if (empty_bucket(st))
2096 continue;
2097
2098 spin_lock_bh(lock);
2099 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2100 if (sk->sk_family != afinfo->family ||
2101 !net_eq(sock_net(sk), net)) {
2102 continue;
2103 }
2104 rc = sk;
2105 goto out;
2106 }
2107 spin_unlock_bh(lock);
2108 }
2109 out:
2110 return rc;
2111 }
2112
2113 static void *established_get_next(struct seq_file *seq, void *cur)
2114 {
2115 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2116 struct sock *sk = cur;
2117 struct hlist_nulls_node *node;
2118 struct tcp_iter_state *st = seq->private;
2119 struct net *net = seq_file_net(seq);
2120
2121 ++st->num;
2122 ++st->offset;
2123
2124 sk = sk_nulls_next(sk);
2125
2126 sk_nulls_for_each_from(sk, node) {
2127 if (sk->sk_family == afinfo->family &&
2128 net_eq(sock_net(sk), net))
2129 return sk;
2130 }
2131
2132 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2133 ++st->bucket;
2134 return established_get_first(seq);
2135 }
2136
2137 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2138 {
2139 struct tcp_iter_state *st = seq->private;
2140 void *rc;
2141
2142 st->bucket = 0;
2143 rc = established_get_first(seq);
2144
2145 while (rc && pos) {
2146 rc = established_get_next(seq, rc);
2147 --pos;
2148 }
2149 return rc;
2150 }
2151
2152 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2153 {
2154 void *rc;
2155 struct tcp_iter_state *st = seq->private;
2156
2157 st->state = TCP_SEQ_STATE_LISTENING;
2158 rc = listening_get_idx(seq, &pos);
2159
2160 if (!rc) {
2161 st->state = TCP_SEQ_STATE_ESTABLISHED;
2162 rc = established_get_idx(seq, pos);
2163 }
2164
2165 return rc;
2166 }
2167
2168 static void *tcp_seek_last_pos(struct seq_file *seq)
2169 {
2170 struct tcp_iter_state *st = seq->private;
2171 int offset = st->offset;
2172 int orig_num = st->num;
2173 void *rc = NULL;
2174
2175 switch (st->state) {
2176 case TCP_SEQ_STATE_LISTENING:
2177 if (st->bucket >= INET_LHTABLE_SIZE)
2178 break;
2179 st->state = TCP_SEQ_STATE_LISTENING;
2180 rc = listening_get_next(seq, NULL);
2181 while (offset-- && rc)
2182 rc = listening_get_next(seq, rc);
2183 if (rc)
2184 break;
2185 st->bucket = 0;
2186 st->state = TCP_SEQ_STATE_ESTABLISHED;
2187 /* Fallthrough */
2188 case TCP_SEQ_STATE_ESTABLISHED:
2189 if (st->bucket > tcp_hashinfo.ehash_mask)
2190 break;
2191 rc = established_get_first(seq);
2192 while (offset-- && rc)
2193 rc = established_get_next(seq, rc);
2194 }
2195
2196 st->num = orig_num;
2197
2198 return rc;
2199 }
2200
2201 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2202 {
2203 struct tcp_iter_state *st = seq->private;
2204 void *rc;
2205
2206 if (*pos && *pos == st->last_pos) {
2207 rc = tcp_seek_last_pos(seq);
2208 if (rc)
2209 goto out;
2210 }
2211
2212 st->state = TCP_SEQ_STATE_LISTENING;
2213 st->num = 0;
2214 st->bucket = 0;
2215 st->offset = 0;
2216 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2217
2218 out:
2219 st->last_pos = *pos;
2220 return rc;
2221 }
2222 EXPORT_SYMBOL(tcp_seq_start);
2223
2224 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2225 {
2226 struct tcp_iter_state *st = seq->private;
2227 void *rc = NULL;
2228
2229 if (v == SEQ_START_TOKEN) {
2230 rc = tcp_get_idx(seq, 0);
2231 goto out;
2232 }
2233
2234 switch (st->state) {
2235 case TCP_SEQ_STATE_LISTENING:
2236 rc = listening_get_next(seq, v);
2237 if (!rc) {
2238 st->state = TCP_SEQ_STATE_ESTABLISHED;
2239 st->bucket = 0;
2240 st->offset = 0;
2241 rc = established_get_first(seq);
2242 }
2243 break;
2244 case TCP_SEQ_STATE_ESTABLISHED:
2245 rc = established_get_next(seq, v);
2246 break;
2247 }
2248 out:
2249 ++*pos;
2250 st->last_pos = *pos;
2251 return rc;
2252 }
2253 EXPORT_SYMBOL(tcp_seq_next);
2254
2255 void tcp_seq_stop(struct seq_file *seq, void *v)
2256 {
2257 struct tcp_iter_state *st = seq->private;
2258
2259 switch (st->state) {
2260 case TCP_SEQ_STATE_LISTENING:
2261 if (v != SEQ_START_TOKEN)
2262 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2263 break;
2264 case TCP_SEQ_STATE_ESTABLISHED:
2265 if (v)
2266 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2267 break;
2268 }
2269 }
2270 EXPORT_SYMBOL(tcp_seq_stop);
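/*
 * Illustrative sketch (not compiled): a loose simplification of how the
 * seq_file core in fs/seq_file.c drives the start/next/stop hooks above for
 * a read of /proc/net/tcp.  toy_seq_walk() is a made-up name and omits the
 * buffering and error handling of the real core.
 */
#if 0
static void toy_seq_walk(struct seq_file *seq, loff_t pos)
{
	void *v = tcp_seq_start(seq, &pos);

	while (v && seq->op->show(seq, v) == 0)	/* SEQ_START_TOKEN prints the header */
		v = tcp_seq_next(seq, v, &pos);

	tcp_seq_stop(seq, v);
}
#endif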
2271
2272 static void get_openreq4(const struct request_sock *req,
2273 struct seq_file *f, int i)
2274 {
2275 const struct inet_request_sock *ireq = inet_rsk(req);
2276 long delta = req->rsk_timer.expires - jiffies;
2277
2278 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2279 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2280 i,
2281 ireq->ir_loc_addr,
2282 ireq->ir_num,
2283 ireq->ir_rmt_addr,
2284 ntohs(ireq->ir_rmt_port),
2285 TCP_SYN_RECV,
2286 0, 0, /* could print option size, but that is af dependent. */
2287 1, /* timers active (only the expire timer) */
2288 jiffies_delta_to_clock_t(delta),
2289 req->num_timeout,
2290 from_kuid_munged(seq_user_ns(f),
2291 sock_i_uid(req->rsk_listener)),
2292 0, /* non standard timer */
2293 0, /* open_requests have no inode */
2294 0,
2295 req);
2296 }
2297
2298 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2299 {
2300 int timer_active;
2301 unsigned long timer_expires;
2302 const struct tcp_sock *tp = tcp_sk(sk);
2303 const struct inet_connection_sock *icsk = inet_csk(sk);
2304 const struct inet_sock *inet = inet_sk(sk);
2305 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2306 __be32 dest = inet->inet_daddr;
2307 __be32 src = inet->inet_rcv_saddr;
2308 __u16 destp = ntohs(inet->inet_dport);
2309 __u16 srcp = ntohs(inet->inet_sport);
2310 int rx_queue;
2311 int state;
2312
2313 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2314 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2315 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2316 timer_active = 1;
2317 timer_expires = icsk->icsk_timeout;
2318 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2319 timer_active = 4;
2320 timer_expires = icsk->icsk_timeout;
2321 } else if (timer_pending(&sk->sk_timer)) {
2322 timer_active = 2;
2323 timer_expires = sk->sk_timer.expires;
2324 } else {
2325 timer_active = 0;
2326 timer_expires = jiffies;
2327 }
2328
2329 state = inet_sk_state_load(sk);
2330 if (state == TCP_LISTEN)
2331 rx_queue = sk->sk_ack_backlog;
2332 else
2333 /* Because we don't lock the socket,
2334 * we might find a transient negative value.
2335 */
2336 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2337
2338 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2339 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2340 i, src, srcp, dest, destp, state,
2341 tp->write_seq - tp->snd_una,
2342 rx_queue,
2343 timer_active,
2344 jiffies_delta_to_clock_t(timer_expires - jiffies),
2345 icsk->icsk_retransmits,
2346 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2347 icsk->icsk_probes_out,
2348 sock_i_ino(sk),
2349 refcount_read(&sk->sk_refcnt), sk,
2350 jiffies_to_clock_t(icsk->icsk_rto),
2351 jiffies_to_clock_t(icsk->icsk_ack.ato),
2352 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2353 tp->snd_cwnd,
2354 state == TCP_LISTEN ?
2355 fastopenq->max_qlen :
2356 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2357 }
2358
2359 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2360 struct seq_file *f, int i)
2361 {
2362 long delta = tw->tw_timer.expires - jiffies;
2363 __be32 dest, src;
2364 __u16 destp, srcp;
2365
2366 dest = tw->tw_daddr;
2367 src = tw->tw_rcv_saddr;
2368 destp = ntohs(tw->tw_dport);
2369 srcp = ntohs(tw->tw_sport);
2370
2371 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2372 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2373 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2374 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2375 refcount_read(&tw->tw_refcnt), tw);
2376 }
2377
2378 #define TMPSZ 150
2379
2380 static int tcp4_seq_show(struct seq_file *seq, void *v)
2381 {
2382 struct tcp_iter_state *st;
2383 struct sock *sk = v;
2384
2385 seq_setwidth(seq, TMPSZ - 1);
2386 if (v == SEQ_START_TOKEN) {
2387 seq_puts(seq, " sl local_address rem_address st tx_queue "
2388 "rx_queue tr tm->when retrnsmt uid timeout "
2389 "inode");
2390 goto out;
2391 }
2392 st = seq->private;
2393
2394 if (sk->sk_state == TCP_TIME_WAIT)
2395 get_timewait4_sock(v, seq, st->num);
2396 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2397 get_openreq4(v, seq, st->num);
2398 else
2399 get_tcp4_sock(v, seq, st->num);
2400 out:
2401 seq_pad(seq, '\n');
2402 return 0;
2403 }
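/*
 * Illustrative userspace sketch (not part of this file): parsing the format
 * that tcp4_seq_show()/get_tcp4_sock() emit to /proc/net/tcp.  Addresses and
 * ports are hexadecimal; the address is the raw __be32 printed with %08X
 * above, so it appears byte-swapped on little-endian hosts.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned int laddr, lport, raddr, rport, state;
	char line[512];
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return 1;
	fgets(line, sizeof(line), f);		/* skip the header line */
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, " %*d: %8X:%4X %8X:%4X %2X",
			   &laddr, &lport, &raddr, &rport, &state) == 5)
			printf("%08X:%04X -> %08X:%04X st=%02X\n",
			       laddr, lport, raddr, rport, state);
	}
	fclose(f);
	return 0;
}
#endif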
2404
2405 static const struct seq_operations tcp4_seq_ops = {
2406 .show = tcp4_seq_show,
2407 .start = tcp_seq_start,
2408 .next = tcp_seq_next,
2409 .stop = tcp_seq_stop,
2410 };
2411
2412 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2413 .family = AF_INET,
2414 };
2415
2416 static int __net_init tcp4_proc_init_net(struct net *net)
2417 {
2418 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2419 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2420 return -ENOMEM;
2421 return 0;
2422 }
2423
2424 static void __net_exit tcp4_proc_exit_net(struct net *net)
2425 {
2426 remove_proc_entry("tcp", net->proc_net);
2427 }
2428
2429 static struct pernet_operations tcp4_net_ops = {
2430 .init = tcp4_proc_init_net,
2431 .exit = tcp4_proc_exit_net,
2432 };
2433
2434 int __init tcp4_proc_init(void)
2435 {
2436 return register_pernet_subsys(&tcp4_net_ops);
2437 }
2438
2439 void tcp4_proc_exit(void)
2440 {
2441 unregister_pernet_subsys(&tcp4_net_ops);
2442 }
2443 #endif /* CONFIG_PROC_FS */
2444
2445 struct proto tcp_prot = {
2446 .name = "TCP",
2447 .owner = THIS_MODULE,
2448 .close = tcp_close,
2449 .pre_connect = tcp_v4_pre_connect,
2450 .connect = tcp_v4_connect,
2451 .disconnect = tcp_disconnect,
2452 .accept = inet_csk_accept,
2453 .ioctl = tcp_ioctl,
2454 .init = tcp_v4_init_sock,
2455 .destroy = tcp_v4_destroy_sock,
2456 .shutdown = tcp_shutdown,
2457 .setsockopt = tcp_setsockopt,
2458 .getsockopt = tcp_getsockopt,
2459 .keepalive = tcp_set_keepalive,
2460 .recvmsg = tcp_recvmsg,
2461 .sendmsg = tcp_sendmsg,
2462 .sendpage = tcp_sendpage,
2463 .backlog_rcv = tcp_v4_do_rcv,
2464 .release_cb = tcp_release_cb,
2465 .hash = inet_hash,
2466 .unhash = inet_unhash,
2467 .get_port = inet_csk_get_port,
2468 .enter_memory_pressure = tcp_enter_memory_pressure,
2469 .leave_memory_pressure = tcp_leave_memory_pressure,
2470 .stream_memory_free = tcp_stream_memory_free,
2471 .sockets_allocated = &tcp_sockets_allocated,
2472 .orphan_count = &tcp_orphan_count,
2473 .memory_allocated = &tcp_memory_allocated,
2474 .memory_pressure = &tcp_memory_pressure,
2475 .sysctl_mem = sysctl_tcp_mem,
2476 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2477 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2478 .max_header = MAX_TCP_HEADER,
2479 .obj_size = sizeof(struct tcp_sock),
2480 .slab_flags = SLAB_TYPESAFE_BY_RCU,
2481 .twsk_prot = &tcp_timewait_sock_ops,
2482 .rsk_prot = &tcp_request_sock_ops,
2483 .h.hashinfo = &tcp_hashinfo,
2484 .no_autobind = true,
2485 #ifdef CONFIG_COMPAT
2486 .compat_setsockopt = compat_tcp_setsockopt,
2487 .compat_getsockopt = compat_tcp_getsockopt,
2488 #endif
2489 .diag_destroy = tcp_abort,
2490 };
2491 EXPORT_SYMBOL(tcp_prot);
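/*
 * Illustrative sketch (not compiled): socket-layer code reaches the methods
 * in tcp_prot through sk->sk_prot, so for an IPv4 TCP socket a sendmsg()
 * system call ends up in tcp_sendmsg().  toy_proto_sendmsg() is a made-up
 * example of that indirection.
 */
#if 0
static int toy_proto_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
	return sk->sk_prot->sendmsg(sk, msg, size);	/* tcp_sendmsg() here */
}
#endif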
2492
2493 static void __net_exit tcp_sk_exit(struct net *net)
2494 {
2495 int cpu;
2496
2497 module_put(net->ipv4.tcp_congestion_control->owner);
2498
2499 for_each_possible_cpu(cpu)
2500 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2501 free_percpu(net->ipv4.tcp_sk);
2502 }
2503
2504 static int __net_init tcp_sk_init(struct net *net)
2505 {
2506 int res, cpu, cnt;
2507
2508 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2509 if (!net->ipv4.tcp_sk)
2510 return -ENOMEM;
2511
2512 for_each_possible_cpu(cpu) {
2513 struct sock *sk;
2514
2515 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2516 IPPROTO_TCP, net);
2517 if (res)
2518 goto fail;
2519 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2520
2521 /* Please enforce IP_DF and IPID==0 for RST and
2522 * ACK sent in SYN-RECV and TIME-WAIT state.
2523 */
2524 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2525
2526 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2527 }
2528
2529 net->ipv4.sysctl_tcp_ecn = 2;
2530 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2531
2532 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2533 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2534 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2535
2536 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2537 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2538 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2539
2540 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2541 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2542 net->ipv4.sysctl_tcp_syncookies = 1;
2543 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2544 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2545 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2546 net->ipv4.sysctl_tcp_orphan_retries = 0;
2547 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2548 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2549 net->ipv4.sysctl_tcp_tw_reuse = 2;
2550
2551 cnt = tcp_hashinfo.ehash_mask + 1;
2552 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2553 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2554
2555 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
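/* Worked example (illustrative): with a 65536-bucket established hash,
 * cnt is 65536, so max_tw_buckets defaults to 32768 and max_syn_backlog
 * to max(128, 256) = 256.
 */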
2556 net->ipv4.sysctl_tcp_sack = 1;
2557 net->ipv4.sysctl_tcp_window_scaling = 1;
2558 net->ipv4.sysctl_tcp_timestamps = 1;
2559 net->ipv4.sysctl_tcp_early_retrans = 3;
2560 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2561 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
2562 net->ipv4.sysctl_tcp_retrans_collapse = 1;
2563 net->ipv4.sysctl_tcp_max_reordering = 300;
2564 net->ipv4.sysctl_tcp_dsack = 1;
2565 net->ipv4.sysctl_tcp_app_win = 31;
2566 net->ipv4.sysctl_tcp_adv_win_scale = 1;
2567 net->ipv4.sysctl_tcp_frto = 2;
2568 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2569 /* This limits the percentage of the congestion window which we
2570 * will allow a single TSO frame to consume. Building TSO frames
2571 * which are too large can cause TCP streams to be bursty.
2572 */
2573 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
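/* Worked example (illustrative): with the default divisor of 3 and a
 * congestion window of 30 full-sized segments, a single TSO frame is
 * capped at roughly 30 / 3 = 10 segments.
 */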
2574 /* Default TSQ limit of four TSO segments */
2575 net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2576 /* rfc5961 challenge ack rate limiting */
2577 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2578 net->ipv4.sysctl_tcp_min_tso_segs = 2;
2579 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2580 net->ipv4.sysctl_tcp_autocorking = 1;
2581 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2582 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2583 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2584 if (net != &init_net) {
2585 memcpy(net->ipv4.sysctl_tcp_rmem,
2586 init_net.ipv4.sysctl_tcp_rmem,
2587 sizeof(init_net.ipv4.sysctl_tcp_rmem));
2588 memcpy(net->ipv4.sysctl_tcp_wmem,
2589 init_net.ipv4.sysctl_tcp_wmem,
2590 sizeof(init_net.ipv4.sysctl_tcp_wmem));
2591 }
2592 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2593 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2594 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2595 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2596 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2597 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2598
2599 /* Reno is always built in */
2600 if (!net_eq(net, &init_net) &&
2601 try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2602 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2603 else
2604 net->ipv4.tcp_congestion_control = &tcp_reno;
2605
2606 return 0;
2607 fail:
2608 tcp_sk_exit(net);
2609
2610 return res;
2611 }
2612
2613 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2614 {
2615 struct net *net;
2616
2617 inet_twsk_purge(&tcp_hashinfo, AF_INET);
2618
2619 list_for_each_entry(net, net_exit_list, exit_list)
2620 tcp_fastopen_ctx_destroy(net);
2621 }
2622
2623 static struct pernet_operations __net_initdata tcp_sk_ops = {
2624 .init = tcp_sk_init,
2625 .exit = tcp_sk_exit,
2626 .exit_batch = tcp_sk_exit_batch,
2627 };
2628
2629 void __init tcp_v4_init(void)
2630 {
2631 if (register_pernet_subsys(&tcp_sk_ops))
2632 panic("Failed to create the TCP control socket.\n");
2633 }