/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *	See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}
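
/*
 * Background sketch (informational comment; the exact construction lives
 * in net/core/secure_seq.c, not here): the two helpers above follow the
 * RFC 6528 scheme, where the initial sequence number is roughly
 *
 *	ISN = M + F(saddr, daddr, sport, dport, secret)
 *
 * with M a fine-grained clock and F a keyed pseudo-random function, so
 * ISNs are unpredictable across connections but still monotonic within
 * one four-tuple.  The timestamp offset is randomized per destination
 * the same way so TCP timestamps do not expose one global clock.
 */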
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
		      get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
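
/*
 * Usage sketch (illustrative; the sysctl name is the standard mapping of
 * the per-netns knob read above): TIME-WAIT reuse for outgoing
 * connections is gated by
 *
 *	# sysctl -w net.ipv4.tcp_tw_reuse=1
 *
 * Only connecting callers (twp != NULL) consult it; the one-second guard
 * keeps PAWS consistent when timestamps are in use.
 */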
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
						       inet->inet_daddr,
						       inet->inet_sport,
						       usin->sin_port);
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = tp->write_seq ^ jiffies;

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
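
/*
 * Caller's-eye view (illustrative user-space fragment, not part of this
 * file; address and port are made up): tcp_v4_connect() is what
 * ultimately runs when an application does
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * i.e. route lookup, source port selection via inet_hash_connect(),
 * ISN/timestamp-offset generation, and the first SYN via tcp_connect().
 */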
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = tcp_sk(sk)->mtu_info;
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
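
/*
 * Worked example (numbers are illustrative): suppose the path MTU drops
 * from 1500 to 1400.  The ICMP handler stashes 1400 in tp->mtu_info;
 * this function then shrinks the MSS roughly to
 *
 *	mss = 1400 - sizeof(struct iphdr) - sizeof(struct tcphdr)
 *	    = 1400 - 20 - 20 = 1360		(before option deductions)
 *
 * via tcp_sync_mss(), and tcp_simple_retransmit() resends the segments
 * that were too big, without waiting for an RTO.
 */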
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	s32 remaining;
	u32 delta_us;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV)
		return tcp_req_err(sk, seq,
				   type == ICMP_PARAMETERPROB ||
				   type == ICMP_TIME_EXCEEDED ||
				   (type == ICMP_DEST_UNREACH &&
				    (code == ICMP_NET_UNREACH ||
				     code == ICMP_HOST_UNREACH)));

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576 bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_rtx_queue_head(sk);
		if (WARN_ON_ONCE(!skb))
			break;

		tcp_mstamp_refresh(tp);
		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
		remaining = icsk->icsk_rto -
			    usecs_to_jiffies(delta_us);

		if (remaining > 0) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters, even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
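
/*
 * Background sketch (informational; the real helpers live in
 * include/net/tcp.h and the arch checksum code): tcp_v4_check() folds
 * the classic RFC 793 pseudo-header into the sum, conceptually
 *
 *	sum   = csum(saddr) + csum(daddr)
 *	      + htons(IPPROTO_TCP) + htons(len)
 *	      + csum(tcp header + payload)
 *	check = ~fold_to_16bit(sum)
 *
 * In the CHECKSUM_PARTIAL branch above only the pseudo-header part is
 * pre-computed; the NIC finishes the 16-bit one's-complement sum at
 * csum_start/csum_offset.
 */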
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So we build the reply based only on parameters that
 *		arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * We do not lose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb),
					     tcp_v4_sdif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto out;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq_opt_deref(ireq));
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;

		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && (!best_match ||
			      key->prefixlen > best_match->prefixlen))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;

	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
		   gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);
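
/*
 * Usage sketch (illustrative user-space fragment; TCP_MD5SIG is the
 * documented socket option, the address and key here are made up):
 * keys managed by tcp_md5_do_add()/tcp_md5_do_del() are installed per
 * peer with
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key (the tcp_md5_do_del() path in
 * tcp_v4_parse_md5_keys() below).
 */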
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 char __user *optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	u8 prefixlen = 32;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET, prefixlen);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}

static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
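
/*
 * Background sketch (informational; see RFC 2385 section 2.0 for the
 * normative list): the digest computed above covers, in order,
 *
 *	1. the TCP pseudo-header (saddr, daddr, zero, IPPROTO_TCP, len),
 *	2. the TCP header with its checksum field zeroed (options excluded),
 *	3. the segment payload,
 *	4. the connection key itself,
 *
 * which is exactly the tcp_v4_md5_hash_headers() /
 * tcp_md5_hash_skb_data() / tcp_md5_hash_key() sequence.
 */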
#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "");
		return true;
	}
	return false;
#else
	return false;
#endif
}

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req)
{
	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
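
/*
 * Flow sketch (assumption: CONFIG_SYN_COOKIES is enabled, as most
 * distributions do): when the listen queue overflows, tcp_conn_request()
 * falls back to cookie_v4_init_sequence(), encoding connection state in
 * the SYN-ACK sequence number instead of allocating a request_sock.
 * The knob is per network namespace:
 *
 *	# sysctl -w net.ipv4.tcp_syncookies=1
 *
 * (this file defaults it to 1 in tcp_sk_init() below).
 */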
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb, tcp_hdr(skb));
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

int tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}

bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;

	/* Only the socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Only a few sockets' backlogs are likely to be concurrently
	 * non-empty.
	 */
	limit += 64*1024;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
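
/*
 * Worked example (buffer sizes are illustrative, not the actual defaults
 * on any particular system): with sk_rcvbuf = 87380 and
 * sk_sndbuf = 16384 the drop threshold becomes
 *
 *	limit = 87380 + 16384 + 64*1024 = 169300 bytes of truesize
 *
 * queued while the owner holds the socket; anything beyond that is
 * counted as TCPBacklogDrop in /proc/net/netstat.
 */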
int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;
	unsigned int eaten = skb->len;
	int err;

	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
	if (!err) {
		eaten -= skb->len;
		TCP_SKB_CB(skb)->end_seq -= eaten;
	}
	return err;
}
EXPORT_SYMBOL(tcp_filter);

static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}

static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler won't play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}

/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	int sdif = inet_sdif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false);
		}
		if (!nsk) {
			reqsk_put(req);
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else if (tcp_add_backlog(sk, skb)) {
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		/* fall through */
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
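
/*
 * Receive-path summary (informational): tcp_v4_rcv() above is the
 * protocol handler for IPPROTO_TCP.  After header sanity checks it
 * looks the segment up in the established/listener hash tables and
 * dispatches on socket state: TCP_NEW_SYN_RECV goes through
 * tcp_check_req(), TIME-WAIT through tcp_timewait_state_process(), and
 * everything else through tcp_v4_do_rcv() - either directly or via the
 * backlog when the socket is owned by user context.
 */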
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
	.md5_parse		= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get next listener socket following cur.  If cur is NULL, get first socket
 * starting from bucket given in st->bucket; when st->bucket is zero the
 * very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_next(sk);
get_sk:
	sk_for_each_from(sk) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family)
			return sk;
	}
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}

int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family		= afinfo->family;
	s->last_pos		= 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);

static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}

static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
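
/*
 * Output sketch (illustrative line; addresses and counters are made
 * up): each entry tcp4_seq_show() emits in /proc/net/tcp looks like
 *
 *   0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 ...
 *
 * i.e. hex local/remote address:port, state (0A == TCP_LISTEN), then
 * the queue, timer, retransmit, uid, timeout and inode columns named
 * in the header printed above.
 */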
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);

static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	module_put(net->ipv4.tcp_congestion_control->owner);

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 0;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of four TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init	   = tcp_sk_init,
       .exit	   = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}