1 /*
2 * IPv6 output functions
3 * Linux INET6 implementation
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * Based on linux/net/ipv4/ip_output.c
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * Changes:
16 * A.N.Kuznetsov : arithmetics in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
20 * etc.
21 *
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
26 * for datagram xmit
27 */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61
62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
63 {
64 struct dst_entry *dst = skb_dst(skb);
65 struct net_device *dev = dst->dev;
66 const struct in6_addr *nexthop;
67 struct neighbour *neigh;
68 int ret;
69
70 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72
73 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74 ((mroute6_socket(net, skb) &&
75 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77 &ipv6_hdr(skb)->saddr))) {
78 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79
80 /* Do not check for IFF_ALLMULTI; multicast routing
81 * is not supported in any case.
82 */
83 if (newskb)
84 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85 net, sk, newskb, NULL, newskb->dev,
86 dev_loopback_xmit);
87
88 if (ipv6_hdr(skb)->hop_limit == 0) {
89 IP6_INC_STATS(net, idev,
90 IPSTATS_MIB_OUTDISCARDS);
91 kfree_skb(skb);
92 return 0;
93 }
94 }
95
96 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
97
98 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
99 IPV6_ADDR_SCOPE_NODELOCAL &&
100 !(dev->flags & IFF_LOOPBACK)) {
101 kfree_skb(skb);
102 return 0;
103 }
104 }
105
106 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
107 int res = lwtunnel_xmit(skb);
108
109 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
110 return res;
111 }
112
113 rcu_read_lock_bh();
114 nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
115 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
116 if (unlikely(!neigh))
117 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
118 if (!IS_ERR(neigh)) {
119 sock_confirm_neigh(skb, neigh);
120 ret = neigh_output(neigh, skb);
121 rcu_read_unlock_bh();
122 return ret;
123 }
124 rcu_read_unlock_bh();
125
126 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
127 kfree_skb(skb);
128 return -EINVAL;
129 }
130
131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
132 {
133 int ret;
134
135 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
136 if (ret) {
137 kfree_skb(skb);
138 return ret;
139 }
140
141 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
142 /* Policy lookup after SNAT yielded a new policy */
143 if (skb_dst(skb)->xfrm) {
144 IPCB(skb)->flags |= IPSKB_REROUTED;
145 return dst_output(net, sk, skb);
146 }
147 #endif
148
149 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
150 dst_allfrag(skb_dst(skb)) ||
151 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
152 return ip6_fragment(net, sk, skb, ip6_finish_output2);
153 else
154 return ip6_finish_output2(net, sk, skb);
155 }
156
157 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
158 {
159 struct net_device *dev = skb_dst(skb)->dev;
160 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161
162 skb->protocol = htons(ETH_P_IPV6);
163 skb->dev = dev;
164
165 if (unlikely(idev->cnf.disable_ipv6)) {
166 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
167 kfree_skb(skb);
168 return 0;
169 }
170
171 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
172 net, sk, skb, NULL, dev,
173 ip6_finish_output,
174 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
175 }
176
177 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
178 {
179 if (!np->autoflowlabel_set)
180 return ip6_default_np_autolabel(net);
181 else
182 return np->autoflowlabel;
183 }
184
185 /*
186 * xmit an sk_buff (used by TCP, SCTP and DCCP)
187 * Note : socket lock is not held for SYNACK packets, but the socket
188 * might be modified by calls to skb_set_owner_w() and ipv6_local_error(),
189 * which use proper atomic operations or spinlocks.
190 */
191 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
192 __u32 mark, struct ipv6_txoptions *opt, int tclass)
193 {
194 struct net *net = sock_net(sk);
195 const struct ipv6_pinfo *np = inet6_sk(sk);
196 struct in6_addr *first_hop = &fl6->daddr;
197 struct dst_entry *dst = skb_dst(skb);
198 unsigned int head_room;
199 struct ipv6hdr *hdr;
200 u8 proto = fl6->flowi6_proto;
201 int seg_len = skb->len;
202 int hlimit = -1;
203 u32 mtu;
204
205 head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
206 if (opt)
207 head_room += opt->opt_nflen + opt->opt_flen;
208
209 if (unlikely(skb_headroom(skb) < head_room)) {
210 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
211 if (!skb2) {
212 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
213 IPSTATS_MIB_OUTDISCARDS);
214 kfree_skb(skb);
215 return -ENOBUFS;
216 }
217 if (skb->sk)
218 skb_set_owner_w(skb2, skb->sk);
219 consume_skb(skb);
220 skb = skb2;
221 }
222
223 if (opt) {
224 seg_len += opt->opt_nflen + opt->opt_flen;
225
226 if (opt->opt_flen)
227 ipv6_push_frag_opts(skb, opt, &proto);
228
229 if (opt->opt_nflen)
230 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
231 &fl6->saddr);
232 }
233
234 skb_push(skb, sizeof(struct ipv6hdr));
235 skb_reset_network_header(skb);
236 hdr = ipv6_hdr(skb);
237
238 /*
239 * Fill in the IPv6 header
240 */
241 if (np)
242 hlimit = np->hop_limit;
243 if (hlimit < 0)
244 hlimit = ip6_dst_hoplimit(dst);
245
246 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
247 ip6_autoflowlabel(net, np), fl6));
248
249 hdr->payload_len = htons(seg_len);
250 hdr->nexthdr = proto;
251 hdr->hop_limit = hlimit;
252
253 hdr->saddr = fl6->saddr;
254 hdr->daddr = *first_hop;
255
256 skb->protocol = htons(ETH_P_IPV6);
257 skb->priority = sk->sk_priority;
258 skb->mark = mark;
259
260 mtu = dst_mtu(dst);
261 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
262 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
263 IPSTATS_MIB_OUT, skb->len);
264
265 /* if egress device is enslaved to an L3 master device pass the
266 * skb to its handler for processing
267 */
268 skb = l3mdev_ip6_out((struct sock *)sk, skb);
269 if (unlikely(!skb))
270 return 0;
271
272 /* hooks should never assume socket lock is held.
273 * we promote our socket to non const
274 */
275 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
276 net, (struct sock *)sk, skb, NULL, dst->dev,
277 dst_output);
278 }
279
280 skb->dev = dst->dev;
281 /* ipv6_local_error() does not require socket lock,
282 * we promote our socket to non const
283 */
284 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
285
286 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
287 kfree_skb(skb);
288 return -EMSGSIZE;
289 }
290 EXPORT_SYMBOL(ip6_xmit);
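
/*
 * Illustrative helper, not part of the original file: the headroom
 * rule ip6_xmit() applies, factored out as a sketch.  Assumes @opt
 * follows the same ipv6_txoptions layout used above.  E.g. on a
 * typical Ethernet device a packet with no extension headers needs
 * sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev) = 40 + 16 = 56
 * bytes of headroom; skb_realloc_headroom() is only taken when the
 * skb was built with less than that.
 */
static inline unsigned int
ip6_xmit_head_room_sketch(const struct dst_entry *dst,
			  const struct ipv6_txoptions *opt)
{
	/* base IPv6 header plus the device's link-layer reservation */
	unsigned int head_room = sizeof(struct ipv6hdr) +
				 LL_RESERVED_SPACE(dst->dev);

	/* extension headers travel in the headroom too */
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;
	return head_room;
}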
291
292 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
293 {
294 struct ip6_ra_chain *ra;
295 struct sock *last = NULL;
296
297 read_lock(&ip6_ra_lock);
298 for (ra = ip6_ra_chain; ra; ra = ra->next) {
299 struct sock *sk = ra->sk;
300 if (sk && ra->sel == sel &&
301 (!sk->sk_bound_dev_if ||
302 sk->sk_bound_dev_if == skb->dev->ifindex)) {
303 if (last) {
304 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
305 if (skb2)
306 rawv6_rcv(last, skb2);
307 }
308 last = sk;
309 }
310 }
311
312 if (last) {
313 rawv6_rcv(last, skb);
314 read_unlock(&ip6_ra_lock);
315 return 1;
316 }
317 read_unlock(&ip6_ra_lock);
318 return 0;
319 }
320
321 static int ip6_forward_proxy_check(struct sk_buff *skb)
322 {
323 struct ipv6hdr *hdr = ipv6_hdr(skb);
324 u8 nexthdr = hdr->nexthdr;
325 __be16 frag_off;
326 int offset;
327
328 if (ipv6_ext_hdr(nexthdr)) {
329 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
330 if (offset < 0)
331 return 0;
332 } else
333 offset = sizeof(struct ipv6hdr);
334
335 if (nexthdr == IPPROTO_ICMPV6) {
336 struct icmp6hdr *icmp6;
337
338 if (!pskb_may_pull(skb, (skb_network_header(skb) +
339 offset + 1 - skb->data)))
340 return 0;
341
342 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
343
344 switch (icmp6->icmp6_type) {
345 case NDISC_ROUTER_SOLICITATION:
346 case NDISC_ROUTER_ADVERTISEMENT:
347 case NDISC_NEIGHBOUR_SOLICITATION:
348 case NDISC_NEIGHBOUR_ADVERTISEMENT:
349 case NDISC_REDIRECT:
350 /* For a reaction involving a unicast neighbour
351 * discovery message destined to the proxied address,
352 * pass it to the input function.
353 */
354 return 1;
355 default:
356 break;
357 }
358 }
359
360 /*
361 * The proxying router can't forward traffic sent to a link-local
362 * address, so signal the sender and discard the packet. This
363 * behavior is clarified by the MIPv6 specification.
364 */
365 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
366 dst_link_failure(skb);
367 return -1;
368 }
369
370 return 0;
371 }
372
373 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
374 struct sk_buff *skb)
375 {
376 struct dst_entry *dst = skb_dst(skb);
377
378 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
379 __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
380
381 skb->tstamp = 0;
382 return dst_output(net, sk, skb);
383 }
384
385 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
386 {
387 unsigned int mtu;
388 struct inet6_dev *idev;
389
390 if (dst_metric_locked(dst, RTAX_MTU)) {
391 mtu = dst_metric_raw(dst, RTAX_MTU);
392 if (mtu)
393 return mtu;
394 }
395
396 mtu = IPV6_MIN_MTU;
397 rcu_read_lock();
398 idev = __in6_dev_get(dst->dev);
399 if (idev)
400 mtu = idev->cnf.mtu6;
401 rcu_read_unlock();
402
403 return mtu;
404 }
405
406 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
407 {
408 if (skb->len <= mtu)
409 return false;
410
411 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
412 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
413 return true;
414
415 if (skb->ignore_df)
416 return false;
417
418 if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
419 return false;
420
421 return true;
422 }
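
/*
 * Worked example (editorial illustration): forwarding a 1400-byte skb
 * with mtu == 1280 makes ip6_pkt_too_big() return true, and
 * ip6_forward() answers with ICMPV6_PKT_TOOBIG, unless either the skb
 * is marked ignore_df with no recorded frag_max_size above the MTU
 * (the conntrack defrag case), or it is a GSO skb whose resulting
 * segments all fit the MTU according to skb_gso_validate_mtu().
 */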
423
424 int ip6_forward(struct sk_buff *skb)
425 {
426 struct dst_entry *dst = skb_dst(skb);
427 struct ipv6hdr *hdr = ipv6_hdr(skb);
428 struct inet6_skb_parm *opt = IP6CB(skb);
429 struct net *net = dev_net(dst->dev);
430 u32 mtu;
431
432 if (net->ipv6.devconf_all->forwarding == 0)
433 goto error;
434
435 if (skb->pkt_type != PACKET_HOST)
436 goto drop;
437
438 if (unlikely(skb->sk))
439 goto drop;
440
441 if (skb_warn_if_lro(skb))
442 goto drop;
443
444 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
445 __IP6_INC_STATS(net, ip6_dst_idev(dst),
446 IPSTATS_MIB_INDISCARDS);
447 goto drop;
448 }
449
450 skb_forward_csum(skb);
451
452 /*
453 * We do NOT make any processing on RA packets;
454 * we push them to user level AS IS, without any
455 * warranty that the application will be able
456 * to interpret them. The reason is that we
457 * cannot make anything clever here.
458 *
459 * We are not an end node, so if the packet contains
460 * AH/ESP we cannot do anything.
461 * Defragmentation would also be a mistake; RA packets
462 * cannot be fragmented, because there is no warranty
463 * that different fragments will go along one path. --ANK
464 */
465 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
466 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
467 return 0;
468 }
469
470 /*
471 * check and decrement ttl
472 */
473 if (hdr->hop_limit <= 1) {
474 /* Force OUTPUT device used as source address */
475 skb->dev = dst->dev;
476 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
477 __IP6_INC_STATS(net, ip6_dst_idev(dst),
478 IPSTATS_MIB_INHDRERRORS);
479
480 kfree_skb(skb);
481 return -ETIMEDOUT;
482 }
483
484 /* XXX: idev->cnf.proxy_ndp? */
485 if (net->ipv6.devconf_all->proxy_ndp &&
486 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
487 int proxied = ip6_forward_proxy_check(skb);
488 if (proxied > 0)
489 return ip6_input(skb);
490 else if (proxied < 0) {
491 __IP6_INC_STATS(net, ip6_dst_idev(dst),
492 IPSTATS_MIB_INDISCARDS);
493 goto drop;
494 }
495 }
496
497 if (!xfrm6_route_forward(skb)) {
498 __IP6_INC_STATS(net, ip6_dst_idev(dst),
499 IPSTATS_MIB_INDISCARDS);
500 goto drop;
501 }
502 dst = skb_dst(skb);
503
504 /* IPv6 specs say nothing about it, but it is clear that we cannot
505 * send redirects to source routed frames.
506 * We don't send redirects to frames decapsulated from IPsec.
507 */
508 if (IP6CB(skb)->iif == dst->dev->ifindex &&
509 opt->srcrt == 0 && !skb_sec_path(skb)) {
510 struct in6_addr *target = NULL;
511 struct inet_peer *peer;
512 struct rt6_info *rt;
513
514 /*
515 * incoming and outgoing devices are the same;
516 * send a redirect.
517 */
518
519 rt = (struct rt6_info *) dst;
520 if (rt->rt6i_flags & RTF_GATEWAY)
521 target = &rt->rt6i_gateway;
522 else
523 target = &hdr->daddr;
524
525 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
526
527 /* Limit redirects both by destination (here)
528 and by source (inside ndisc_send_redirect)
529 */
530 if (inet_peer_xrlim_allow(peer, 1*HZ))
531 ndisc_send_redirect(skb, target);
532 if (peer)
533 inet_putpeer(peer);
534 } else {
535 int addrtype = ipv6_addr_type(&hdr->saddr);
536
537 /* This check is security critical. */
538 if (addrtype == IPV6_ADDR_ANY ||
539 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
540 goto error;
541 if (addrtype & IPV6_ADDR_LINKLOCAL) {
542 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
543 ICMPV6_NOT_NEIGHBOUR, 0);
544 goto error;
545 }
546 }
547
548 mtu = ip6_dst_mtu_forward(dst);
549 if (mtu < IPV6_MIN_MTU)
550 mtu = IPV6_MIN_MTU;
551
552 if (ip6_pkt_too_big(skb, mtu)) {
553 /* Again, force OUTPUT device used as source address */
554 skb->dev = dst->dev;
555 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
556 __IP6_INC_STATS(net, ip6_dst_idev(dst),
557 IPSTATS_MIB_INTOOBIGERRORS);
558 __IP6_INC_STATS(net, ip6_dst_idev(dst),
559 IPSTATS_MIB_FRAGFAILS);
560 kfree_skb(skb);
561 return -EMSGSIZE;
562 }
563
564 if (skb_cow(skb, dst->dev->hard_header_len)) {
565 __IP6_INC_STATS(net, ip6_dst_idev(dst),
566 IPSTATS_MIB_OUTDISCARDS);
567 goto drop;
568 }
569
570 hdr = ipv6_hdr(skb);
571
572 /* Mangling hops number delayed to point after skb COW */
573
574 hdr->hop_limit--;
575
576 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
577 net, NULL, skb, skb->dev, dst->dev,
578 ip6_forward_finish);
579
580 error:
581 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
582 drop:
583 kfree_skb(skb);
584 return -EINVAL;
585 }
586
587 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
588 {
589 to->pkt_type = from->pkt_type;
590 to->priority = from->priority;
591 to->protocol = from->protocol;
592 skb_dst_drop(to);
593 skb_dst_set(to, dst_clone(skb_dst(from)));
594 to->dev = from->dev;
595 to->mark = from->mark;
596
597 skb_copy_hash(to, from);
598
599 #ifdef CONFIG_NET_SCHED
600 to->tc_index = from->tc_index;
601 #endif
602 nf_copy(to, from);
603 skb_copy_secmark(to, from);
604 }
605
606 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
607 int (*output)(struct net *, struct sock *, struct sk_buff *))
608 {
609 struct sk_buff *frag;
610 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
611 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
612 inet6_sk(skb->sk) : NULL;
613 struct ipv6hdr *tmp_hdr;
614 struct frag_hdr *fh;
615 unsigned int mtu, hlen, left, len, nexthdr_offset;
616 int hroom, troom;
617 __be32 frag_id;
618 int ptr, offset = 0, err = 0;
619 u8 *prevhdr, nexthdr = 0;
620
621 err = ip6_find_1stfragopt(skb, &prevhdr);
622 if (err < 0)
623 goto fail;
624 hlen = err;
625 nexthdr = *prevhdr;
626 nexthdr_offset = prevhdr - skb_network_header(skb);
627
628 mtu = ip6_skb_dst_mtu(skb);
629
630 /* We must not fragment if the socket is set to force MTU discovery
631 * or if the skb is not generated by a local socket.
632 */
633 if (unlikely(!skb->ignore_df && skb->len > mtu))
634 goto fail_toobig;
635
636 if (IP6CB(skb)->frag_max_size) {
637 if (IP6CB(skb)->frag_max_size > mtu)
638 goto fail_toobig;
639
640 /* don't send fragments larger than what we received */
641 mtu = IP6CB(skb)->frag_max_size;
642 if (mtu < IPV6_MIN_MTU)
643 mtu = IPV6_MIN_MTU;
644 }
645
646 if (np && np->frag_size < mtu) {
647 if (np->frag_size)
648 mtu = np->frag_size;
649 }
650 if (mtu < hlen + sizeof(struct frag_hdr) + 8)
651 goto fail_toobig;
652 mtu -= hlen + sizeof(struct frag_hdr);
653
654 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
655 &ipv6_hdr(skb)->saddr);
656
657 if (skb->ip_summed == CHECKSUM_PARTIAL &&
658 (err = skb_checksum_help(skb)))
659 goto fail;
660
661 prevhdr = skb_network_header(skb) + nexthdr_offset;
662 hroom = LL_RESERVED_SPACE(rt->dst.dev);
663 if (skb_has_frag_list(skb)) {
664 unsigned int first_len = skb_pagelen(skb);
665 struct sk_buff *frag2;
666
667 if (first_len - hlen > mtu ||
668 ((first_len - hlen) & 7) ||
669 skb_cloned(skb) ||
670 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
671 goto slow_path;
672
673 skb_walk_frags(skb, frag) {
674 /* Correct geometry. */
675 if (frag->len > mtu ||
676 ((frag->len & 7) && frag->next) ||
677 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
678 goto slow_path_clean;
679
680 /* Partially cloned skb? */
681 if (skb_shared(frag))
682 goto slow_path_clean;
683
684 BUG_ON(frag->sk);
685 if (skb->sk) {
686 frag->sk = skb->sk;
687 frag->destructor = sock_wfree;
688 }
689 skb->truesize -= frag->truesize;
690 }
691
692 err = 0;
693 offset = 0;
694 /* BUILD HEADER */
695
696 *prevhdr = NEXTHDR_FRAGMENT;
697 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
698 if (!tmp_hdr) {
699 err = -ENOMEM;
700 goto fail;
701 }
702 frag = skb_shinfo(skb)->frag_list;
703 skb_frag_list_init(skb);
704
705 __skb_pull(skb, hlen);
706 fh = __skb_push(skb, sizeof(struct frag_hdr));
707 __skb_push(skb, hlen);
708 skb_reset_network_header(skb);
709 memcpy(skb_network_header(skb), tmp_hdr, hlen);
710
711 fh->nexthdr = nexthdr;
712 fh->reserved = 0;
713 fh->frag_off = htons(IP6_MF);
714 fh->identification = frag_id;
715
716 first_len = skb_pagelen(skb);
717 skb->data_len = first_len - skb_headlen(skb);
718 skb->len = first_len;
719 ipv6_hdr(skb)->payload_len = htons(first_len -
720 sizeof(struct ipv6hdr));
721
722 for (;;) {
723 /* Prepare header of the next frame,
724 * before the previous one goes down. */
725 if (frag) {
726 frag->ip_summed = CHECKSUM_NONE;
727 skb_reset_transport_header(frag);
728 fh = __skb_push(frag, sizeof(struct frag_hdr));
729 __skb_push(frag, hlen);
730 skb_reset_network_header(frag);
731 memcpy(skb_network_header(frag), tmp_hdr,
732 hlen);
733 offset += skb->len - hlen - sizeof(struct frag_hdr);
734 fh->nexthdr = nexthdr;
735 fh->reserved = 0;
736 fh->frag_off = htons(offset);
737 if (frag->next)
738 fh->frag_off |= htons(IP6_MF);
739 fh->identification = frag_id;
740 ipv6_hdr(frag)->payload_len =
741 htons(frag->len -
742 sizeof(struct ipv6hdr));
743 ip6_copy_metadata(frag, skb);
744 }
745
746 err = output(net, sk, skb);
747 if (!err)
748 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
749 IPSTATS_MIB_FRAGCREATES);
750
751 if (err || !frag)
752 break;
753
754 skb = frag;
755 frag = skb->next;
756 skb->next = NULL;
757 }
758
759 kfree(tmp_hdr);
760
761 if (err == 0) {
762 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
763 IPSTATS_MIB_FRAGOKS);
764 return 0;
765 }
766
767 kfree_skb_list(frag);
768
769 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
770 IPSTATS_MIB_FRAGFAILS);
771 return err;
772
773 slow_path_clean:
774 skb_walk_frags(skb, frag2) {
775 if (frag2 == frag)
776 break;
777 frag2->sk = NULL;
778 frag2->destructor = NULL;
779 skb->truesize += frag2->truesize;
780 }
781 }
782
783 slow_path:
784 left = skb->len - hlen; /* Space per frame */
785 ptr = hlen; /* Where to start from */
786
787 /*
788 * Fragment the datagram.
789 */
790
791 troom = rt->dst.dev->needed_tailroom;
792
793 /*
794 * Keep copying data until we run out.
795 */
796 while (left > 0) {
797 u8 *fragnexthdr_offset;
798
799 len = left;
800 /* IF: it doesn't fit, use 'mtu' - the data space left */
801 if (len > mtu)
802 len = mtu;
803 /* IF: we are not sending up to and including the packet end,
804 * then align the next start on an eight byte boundary */
805 if (len < left) {
806 len &= ~7;
807 }
808
809 /* Allocate buffer */
810 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
811 hroom + troom, GFP_ATOMIC);
812 if (!frag) {
813 err = -ENOMEM;
814 goto fail;
815 }
816
817 /*
818 * Set up data on packet
819 */
820
821 ip6_copy_metadata(frag, skb);
822 skb_reserve(frag, hroom);
823 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
824 skb_reset_network_header(frag);
825 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
826 frag->transport_header = (frag->network_header + hlen +
827 sizeof(struct frag_hdr));
828
829 /*
830 * Charge the memory for the fragment to any owner
831 * it might possess
832 */
833 if (skb->sk)
834 skb_set_owner_w(frag, skb->sk);
835
836 /*
837 * Copy the packet header into the new buffer.
838 */
839 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
840
841 fragnexthdr_offset = skb_network_header(frag);
842 fragnexthdr_offset += prevhdr - skb_network_header(skb);
843 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
844
845 /*
846 * Build fragment header.
847 */
848 fh->nexthdr = nexthdr;
849 fh->reserved = 0;
850 fh->identification = frag_id;
851
852 /*
853 * Copy a block of the IP datagram.
854 */
855 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
856 len));
857 left -= len;
858
859 fh->frag_off = htons(offset);
860 if (left > 0)
861 fh->frag_off |= htons(IP6_MF);
862 ipv6_hdr(frag)->payload_len = htons(frag->len -
863 sizeof(struct ipv6hdr));
864
865 ptr += len;
866 offset += len;
867
868 /*
869 * Put this fragment into the sending queue.
870 */
871 err = output(net, sk, frag);
872 if (err)
873 goto fail;
874
875 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
876 IPSTATS_MIB_FRAGCREATES);
877 }
878 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
879 IPSTATS_MIB_FRAGOKS);
880 consume_skb(skb);
881 return err;
882
883 fail_toobig:
884 if (skb->sk && dst_allfrag(skb_dst(skb)))
885 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
886
887 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
888 err = -EMSGSIZE;
889
890 fail:
891 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
892 IPSTATS_MIB_FRAGFAILS);
893 kfree_skb(skb);
894 return err;
895 }
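
/*
 * Worked example (editorial illustration) for the slow path above:
 * with 3000 bytes of fragmentable data left and an effective
 * per-fragment budget of mtu == 1448 (already reduced by hlen and the
 * fragment header, and a multiple of 8), the loop emits fragments
 * carrying 1448, 1448 and 104 bytes at offsets 0, 1448 and 2896.  The
 * first two set IP6_MF in frag_off, the last one does not, and all
 * three share frag_id so the receiver can reassemble them.
 */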
896
897 static inline int ip6_rt_check(const struct rt6key *rt_key,
898 const struct in6_addr *fl_addr,
899 const struct in6_addr *addr_cache)
900 {
901 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
902 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
903 }
904
905 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
906 struct dst_entry *dst,
907 const struct flowi6 *fl6)
908 {
909 struct ipv6_pinfo *np = inet6_sk(sk);
910 struct rt6_info *rt;
911
912 if (!dst)
913 goto out;
914
915 if (dst->ops->family != AF_INET6) {
916 dst_release(dst);
917 return NULL;
918 }
919
920 rt = (struct rt6_info *)dst;
921 /* Yes, checking route validity in the non-connected
922 * case is not very simple. Take into account
923 * that we do not support routing by source, TOS,
924 * or MSG_DONTROUTE --ANK (980726)
925 *
926 * 1. ip6_rt_check(): If the route was a host route,
927 * check that the cached destination is current.
928 * If it is a network route, we still may
929 * check its validity using a saved pointer
930 * to the last used address: daddr_cache.
931 * We do not want to save the whole address now
932 * (because the main consumer of this service
933 * is TCP, which does not have this problem),
934 * so the last trick works only on connected
935 * sockets.
936 * 2. oif also should be the same.
937 */
938 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
939 #ifdef CONFIG_IPV6_SUBTREES
940 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
941 #endif
942 (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
943 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
944 dst_release(dst);
945 dst = NULL;
946 }
947
948 out:
949 return dst;
950 }
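
/*
 * Illustration, not part of the original file: ip6_rt_check() above
 * returns zero ("still usable") when either the cached route is a
 * host route (plen == 128) whose rt6i_dst equals the flow's daddr, or
 * the saved daddr_cache pointer still matches the flow's daddr --
 * which is why the daddr_cache trick only helps connected sockets.
 * A minimal sketch of reading its return value:
 */
static inline bool
ip6_cached_route_ok_sketch(const struct rt6_info *rt,
			   const struct flowi6 *fl6,
			   const struct in6_addr *daddr_cache)
{
	/* ip6_rt_check() returns non-zero when the cache must be dropped */
	return ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, daddr_cache) == 0;
}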
951
952 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
953 struct dst_entry **dst, struct flowi6 *fl6)
954 {
955 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
956 struct neighbour *n;
957 struct rt6_info *rt;
958 #endif
959 int err;
960 int flags = 0;
961
962 /* The correct way to handle this would be to do
963 * ip6_route_get_saddr, and then ip6_route_output; however,
964 * the route-specific preferred source forces the
965 * ip6_route_output call _before_ ip6_route_get_saddr.
966 *
967 * In source specific routing (no src=any default route),
968 * ip6_route_output will fail given a src=any saddr, which is
969 * why we try it again later.
970 */
971 if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
972 struct rt6_info *rt;
973 bool had_dst = *dst != NULL;
974
975 if (!had_dst)
976 *dst = ip6_route_output(net, sk, fl6);
977 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
978 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
979 sk ? inet6_sk(sk)->srcprefs : 0,
980 &fl6->saddr);
981 if (err)
982 goto out_err_release;
983
984 /* If we had an erroneous initial result, pretend it
985 * never existed and let the SA-enabled version take
986 * over.
987 */
988 if (!had_dst && (*dst)->error) {
989 dst_release(*dst);
990 *dst = NULL;
991 }
992
993 if (fl6->flowi6_oif)
994 flags |= RT6_LOOKUP_F_IFACE;
995 }
996
997 if (!*dst)
998 *dst = ip6_route_output_flags(net, sk, fl6, flags);
999
1000 err = (*dst)->error;
1001 if (err)
1002 goto out_err_release;
1003
1004 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1005 /*
1006 * Here, if the dst entry we've looked up
1007 * has a neighbour entry that is in the INCOMPLETE
1008 * state and the src address from the flow is
1009 * marked as OPTIMISTIC, we release the found
1010 * dst entry and replace it with the
1011 * dst entry of the nexthop router
1012 */
1013 rt = (struct rt6_info *) *dst;
1014 rcu_read_lock_bh();
1015 n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1016 rt6_nexthop(rt, &fl6->daddr));
1017 err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1018 rcu_read_unlock_bh();
1019
1020 if (err) {
1021 struct inet6_ifaddr *ifp;
1022 struct flowi6 fl_gw6;
1023 int redirect;
1024
1025 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1026 (*dst)->dev, 1);
1027
1028 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1029 if (ifp)
1030 in6_ifa_put(ifp);
1031
1032 if (redirect) {
1033 /*
1034 * We need to get the dst entry for the
1035 * default router instead
1036 */
1037 dst_release(*dst);
1038 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1039 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1040 *dst = ip6_route_output(net, sk, &fl_gw6);
1041 err = (*dst)->error;
1042 if (err)
1043 goto out_err_release;
1044 }
1045 }
1046 #endif
1047 if (ipv6_addr_v4mapped(&fl6->saddr) &&
1048 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1049 err = -EAFNOSUPPORT;
1050 goto out_err_release;
1051 }
1052
1053 return 0;
1054
1055 out_err_release:
1056 dst_release(*dst);
1057 *dst = NULL;
1058
1059 if (err == -ENETUNREACH)
1060 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1061 return err;
1062 }
1063
1064 /**
1065 * ip6_dst_lookup - perform route lookup on flow
1066 * @sk: socket which provides route info
1067 * @dst: pointer to dst_entry * for result
1068 * @fl6: flow to lookup
1069 *
1070 * This function performs a route lookup on the given flow.
1071 *
1072 * It returns zero on success, or a standard errno code on error.
1073 */
1074 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1075 struct flowi6 *fl6)
1076 {
1077 *dst = NULL;
1078 return ip6_dst_lookup_tail(net, sk, dst, fl6);
1079 }
1080 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1081
1082 /**
1083 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1084 * @sk: socket which provides route info
1085 * @fl6: flow to lookup
1086 * @final_dst: final destination address for ipsec lookup
1087 *
1088 * This function performs a route lookup on the given flow.
1089 *
1090 * It returns a valid dst pointer on success, or a pointer encoded
1091 * error code.
1092 */
1093 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1094 const struct in6_addr *final_dst)
1095 {
1096 struct dst_entry *dst = NULL;
1097 int err;
1098
1099 err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1100 if (err)
1101 return ERR_PTR(err);
1102 if (final_dst)
1103 fl6->daddr = *final_dst;
1104
1105 return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1106 }
1107 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1108
1109 /**
1110 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1111 * @sk: socket which provides the dst cache and route info
1112 * @fl6: flow to lookup
1113 * @final_dst: final destination address for ipsec lookup
1114 *
1115 * This function performs a route lookup on the given flow with the
1116 * possibility of using the cached route in the socket if it is valid.
1117 * It will take the socket dst lock when operating on the dst cache.
1118 * As a result, this function can only be used in process context.
1119 *
1120 * It returns a valid dst pointer on success, or a pointer encoded
1121 * error code.
1122 */
1123 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1124 const struct in6_addr *final_dst)
1125 {
1126 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1127
1128 dst = ip6_sk_dst_check(sk, dst, fl6);
1129 if (!dst)
1130 dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1131
1132 return dst;
1133 }
1134 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
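
/*
 * Usage sketch, not part of the original file: a typical connected
 * datagram sender pairs the helper above with a pre-filled flowi6.
 * Passing a NULL final destination skips the daddr rewrite that the
 * ipsec path needs.  Error handling is trimmed to the ERR_PTR check
 * the caller must perform.
 */
static inline struct dst_entry *
ip6_sender_route_sketch(struct sock *sk, struct flowi6 *fl6)
{
	/* reuses the socket's cached dst when ip6_sk_dst_check() accepts
	 * it, otherwise falls back to a fresh lookup plus xfrm routing
	 */
	return ip6_sk_dst_lookup_flow(sk, fl6, NULL);
}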
1135
1136 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1137 gfp_t gfp)
1138 {
1139 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1140 }
1141
1142 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1143 gfp_t gfp)
1144 {
1145 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1146 }
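
/*
 * Sizing note (editorial illustration) for the two dup helpers above:
 * per RFC 8200, the hdrlen field of an extension header counts 8-octet
 * units excluding the first 8 octets, so the full on-the-wire length
 * is (src->hdrlen + 1) * 8.  E.g. hdrlen == 0 means an 8-byte header
 * and hdrlen == 2 means 24 bytes, which is exactly what kmemdup()
 * copies.
 */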
1147
1148 static void ip6_append_data_mtu(unsigned int *mtu,
1149 int *maxfraglen,
1150 unsigned int fragheaderlen,
1151 struct sk_buff *skb,
1152 struct rt6_info *rt,
1153 unsigned int orig_mtu)
1154 {
1155 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1156 if (!skb) {
1157 /* first fragment, reserve header_len */
1158 *mtu = orig_mtu - rt->dst.header_len;
1159
1160 } else {
1161 /*
1162 * this fragment is not the first; the header
1163 * space is regarded as data space.
1164 */
1165 *mtu = orig_mtu;
1166 }
1167 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1168 + fragheaderlen - sizeof(struct frag_hdr);
1169 }
1170 }
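
/*
 * Worked example (editorial illustration) for the maxfraglen formula
 * above: with mtu == 1500 and fragheaderlen == 40 (a bare IPv6
 * header), ((1500 - 40) & ~7) + 40 - 8 == 1488.  A fragment of that
 * length carries 1448 bytes past the IPv6 header -- a multiple of 8,
 * as required for non-final fragments -- and still fits the MTU once
 * the 8-byte fragment header is pushed in front of the payload.
 */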
1171
1172 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1173 struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1174 struct rt6_info *rt, struct flowi6 *fl6)
1175 {
1176 struct ipv6_pinfo *np = inet6_sk(sk);
1177 unsigned int mtu;
1178 struct ipv6_txoptions *opt = ipc6->opt;
1179
1180 /*
1181 * setup for corking
1182 */
1183 if (opt) {
1184 if (WARN_ON(v6_cork->opt))
1185 return -EINVAL;
1186
1187 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1188 if (unlikely(!v6_cork->opt))
1189 return -ENOBUFS;
1190
1191 v6_cork->opt->tot_len = sizeof(*opt);
1192 v6_cork->opt->opt_flen = opt->opt_flen;
1193 v6_cork->opt->opt_nflen = opt->opt_nflen;
1194
1195 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1196 sk->sk_allocation);
1197 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1198 return -ENOBUFS;
1199
1200 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1201 sk->sk_allocation);
1202 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1203 return -ENOBUFS;
1204
1205 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1206 sk->sk_allocation);
1207 if (opt->hopopt && !v6_cork->opt->hopopt)
1208 return -ENOBUFS;
1209
1210 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1211 sk->sk_allocation);
1212 if (opt->srcrt && !v6_cork->opt->srcrt)
1213 return -ENOBUFS;
1214
1215 /* need source address above miyazawa */
1216 }
1217 dst_hold(&rt->dst);
1218 cork->base.dst = &rt->dst;
1219 cork->fl.u.ip6 = *fl6;
1220 v6_cork->hop_limit = ipc6->hlimit;
1221 v6_cork->tclass = ipc6->tclass;
1222 if (rt->dst.flags & DST_XFRM_TUNNEL)
1223 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1224 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1225 else
1226 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1227 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path);
1228 if (np->frag_size < mtu) {
1229 if (np->frag_size)
1230 mtu = np->frag_size;
1231 }
1232 if (mtu < IPV6_MIN_MTU)
1233 return -EINVAL;
1234 cork->base.fragsize = mtu;
1235 if (dst_allfrag(rt->dst.path))
1236 cork->base.flags |= IPCORK_ALLFRAG;
1237 cork->base.length = 0;
1238
1239 return 0;
1240 }
1241
1242 static int __ip6_append_data(struct sock *sk,
1243 struct flowi6 *fl6,
1244 struct sk_buff_head *queue,
1245 struct inet_cork *cork,
1246 struct inet6_cork *v6_cork,
1247 struct page_frag *pfrag,
1248 int getfrag(void *from, char *to, int offset,
1249 int len, int odd, struct sk_buff *skb),
1250 void *from, int length, int transhdrlen,
1251 unsigned int flags, struct ipcm6_cookie *ipc6,
1252 const struct sockcm_cookie *sockc)
1253 {
1254 struct sk_buff *skb, *skb_prev = NULL;
1255 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1256 int exthdrlen = 0;
1257 int dst_exthdrlen = 0;
1258 int hh_len;
1259 int copy;
1260 int err;
1261 int offset = 0;
1262 __u8 tx_flags = 0;
1263 u32 tskey = 0;
1264 struct rt6_info *rt = (struct rt6_info *)cork->dst;
1265 struct ipv6_txoptions *opt = v6_cork->opt;
1266 int csummode = CHECKSUM_NONE;
1267 unsigned int maxnonfragsize, headersize;
1268
1269 skb = skb_peek_tail(queue);
1270 if (!skb) {
1271 exthdrlen = opt ? opt->opt_flen : 0;
1272 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1273 }
1274
1275 mtu = cork->fragsize;
1276 orig_mtu = mtu;
1277
1278 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1279
1280 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1281 (opt ? opt->opt_nflen : 0);
1282 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1283 sizeof(struct frag_hdr);
1284
1285 headersize = sizeof(struct ipv6hdr) +
1286 (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1287 (dst_allfrag(&rt->dst) ?
1288 sizeof(struct frag_hdr) : 0) +
1289 rt->rt6i_nfheader_len;
1290
1291 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1292 * in the first fragment
1293 */
1294 if (headersize + transhdrlen > mtu)
1295 goto emsgsize;
1296
1297 if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1298 (sk->sk_protocol == IPPROTO_UDP ||
1299 sk->sk_protocol == IPPROTO_RAW)) {
1300 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1301 sizeof(struct ipv6hdr));
1302 goto emsgsize;
1303 }
1304
1305 if (ip6_sk_ignore_df(sk))
1306 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1307 else
1308 maxnonfragsize = mtu;
1309
1310 if (cork->length + length > maxnonfragsize - headersize) {
1311 emsgsize:
1312 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1313 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1314 return -EMSGSIZE;
1315 }
1316
1317 /* CHECKSUM_PARTIAL only with no extension headers and when
1318 * we are not going to fragment
1319 */
1320 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1321 headersize == sizeof(struct ipv6hdr) &&
1322 length <= mtu - headersize &&
1323 !(flags & MSG_MORE) &&
1324 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1325 csummode = CHECKSUM_PARTIAL;
1326
1327 if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1328 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1329 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1330 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1331 tskey = sk->sk_tskey++;
1332 }
1333
1334 /*
1335 * Let's try using as much space as possible.
1336 * Use MTU if total length of the message fits into the MTU.
1337 * Otherwise, we need to reserve fragment header and
1338 * fragment alignment (= 8-15 octets, in total).
1339 *
1340 * Note that we may need to "move" the data from the tail
1341 * of the buffer to the new fragment when we split
1342 * the message.
1343 *
1344 * FIXME: It may be fragmented into multiple chunks
1345 * at once if non-fragmentable extension headers
1346 * are too large.
1347 * --yoshfuji
1348 */
1349
1350 cork->length += length;
1351 if (!skb)
1352 goto alloc_new_skb;
1353
1354 while (length > 0) {
1355 /* Check if the remaining data fits into current packet. */
1356 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1357 if (copy < length)
1358 copy = maxfraglen - skb->len;
1359
1360 if (copy <= 0) {
1361 char *data;
1362 unsigned int datalen;
1363 unsigned int fraglen;
1364 unsigned int fraggap;
1365 unsigned int alloclen;
1366 alloc_new_skb:
1367 /* There's no room in the current skb */
1368 if (skb)
1369 fraggap = skb->len - maxfraglen;
1370 else
1371 fraggap = 0;
1372 /* update mtu and maxfraglen if necessary */
1373 if (!skb || !skb_prev)
1374 ip6_append_data_mtu(&mtu, &maxfraglen,
1375 fragheaderlen, skb, rt,
1376 orig_mtu);
1377
1378 skb_prev = skb;
1379
1380 /*
1381 * If remaining data exceeds the mtu,
1382 * we know we need more fragment(s).
1383 */
1384 datalen = length + fraggap;
1385
1386 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1387 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1388 if ((flags & MSG_MORE) &&
1389 !(rt->dst.dev->features&NETIF_F_SG))
1390 alloclen = mtu;
1391 else
1392 alloclen = datalen + fragheaderlen;
1393
1394 alloclen += dst_exthdrlen;
1395
1396 if (datalen != length + fraggap) {
1397 /*
1398 * this is not the last fragment; the trailer
1399 * space is regarded as data space.
1400 */
1401 datalen += rt->dst.trailer_len;
1402 }
1403
1404 alloclen += rt->dst.trailer_len;
1405 fraglen = datalen + fragheaderlen;
1406
1407 /*
1408 * We just reserve space for fragment header.
1409 * Note: this may be overallocation if the message
1410 * (without MSG_MORE) fits into the MTU.
1411 */
1412 alloclen += sizeof(struct frag_hdr);
1413
1414 copy = datalen - transhdrlen - fraggap;
1415 if (copy < 0) {
1416 err = -EINVAL;
1417 goto error;
1418 }
1419 if (transhdrlen) {
1420 skb = sock_alloc_send_skb(sk,
1421 alloclen + hh_len,
1422 (flags & MSG_DONTWAIT), &err);
1423 } else {
1424 skb = NULL;
1425 if (refcount_read(&sk->sk_wmem_alloc) <=
1426 2 * sk->sk_sndbuf)
1427 skb = sock_wmalloc(sk,
1428 alloclen + hh_len, 1,
1429 sk->sk_allocation);
1430 if (unlikely(!skb))
1431 err = -ENOBUFS;
1432 }
1433 if (!skb)
1434 goto error;
1435 /*
1436 * Fill in the control structures
1437 */
1438 skb->protocol = htons(ETH_P_IPV6);
1439 skb->ip_summed = csummode;
1440 skb->csum = 0;
1441 /* reserve for fragmentation and ipsec header */
1442 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1443 dst_exthdrlen);
1444
1445 /* Only the initial fragment is time stamped */
1446 skb_shinfo(skb)->tx_flags = tx_flags;
1447 tx_flags = 0;
1448 skb_shinfo(skb)->tskey = tskey;
1449 tskey = 0;
1450
1451 /*
1452 * Find where to start putting bytes
1453 */
1454 data = skb_put(skb, fraglen);
1455 skb_set_network_header(skb, exthdrlen);
1456 data += fragheaderlen;
1457 skb->transport_header = (skb->network_header +
1458 fragheaderlen);
1459 if (fraggap) {
1460 skb->csum = skb_copy_and_csum_bits(
1461 skb_prev, maxfraglen,
1462 data + transhdrlen, fraggap, 0);
1463 skb_prev->csum = csum_sub(skb_prev->csum,
1464 skb->csum);
1465 data += fraggap;
1466 pskb_trim_unique(skb_prev, maxfraglen);
1467 }
1468 if (copy > 0 &&
1469 getfrag(from, data + transhdrlen, offset,
1470 copy, fraggap, skb) < 0) {
1471 err = -EFAULT;
1472 kfree_skb(skb);
1473 goto error;
1474 }
1475
1476 offset += copy;
1477 length -= datalen - fraggap;
1478 transhdrlen = 0;
1479 exthdrlen = 0;
1480 dst_exthdrlen = 0;
1481
1482 if ((flags & MSG_CONFIRM) && !skb_prev)
1483 skb_set_dst_pending_confirm(skb, 1);
1484
1485 /*
1486 * Put the packet on the pending queue
1487 */
1488 __skb_queue_tail(queue, skb);
1489 continue;
1490 }
1491
1492 if (copy > length)
1493 copy = length;
1494
1495 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1496 skb_tailroom(skb) >= copy) {
1497 unsigned int off;
1498
1499 off = skb->len;
1500 if (getfrag(from, skb_put(skb, copy),
1501 offset, copy, off, skb) < 0) {
1502 __skb_trim(skb, off);
1503 err = -EFAULT;
1504 goto error;
1505 }
1506 } else {
1507 int i = skb_shinfo(skb)->nr_frags;
1508
1509 err = -ENOMEM;
1510 if (!sk_page_frag_refill(sk, pfrag))
1511 goto error;
1512
1513 if (!skb_can_coalesce(skb, i, pfrag->page,
1514 pfrag->offset)) {
1515 err = -EMSGSIZE;
1516 if (i == MAX_SKB_FRAGS)
1517 goto error;
1518
1519 __skb_fill_page_desc(skb, i, pfrag->page,
1520 pfrag->offset, 0);
1521 skb_shinfo(skb)->nr_frags = ++i;
1522 get_page(pfrag->page);
1523 }
1524 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1525 if (getfrag(from,
1526 page_address(pfrag->page) + pfrag->offset,
1527 offset, copy, skb->len, skb) < 0)
1528 goto error_efault;
1529
1530 pfrag->offset += copy;
1531 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1532 skb->len += copy;
1533 skb->data_len += copy;
1534 skb->truesize += copy;
1535 refcount_add(copy, &sk->sk_wmem_alloc);
1536 }
1537 offset += copy;
1538 length -= copy;
1539 }
1540
1541 return 0;
1542
1543 error_efault:
1544 err = -EFAULT;
1545 error:
1546 cork->length -= length;
1547 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1548 return err;
1549 }
1550
1551 int ip6_append_data(struct sock *sk,
1552 int getfrag(void *from, char *to, int offset, int len,
1553 int odd, struct sk_buff *skb),
1554 void *from, int length, int transhdrlen,
1555 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1556 struct rt6_info *rt, unsigned int flags,
1557 const struct sockcm_cookie *sockc)
1558 {
1559 struct inet_sock *inet = inet_sk(sk);
1560 struct ipv6_pinfo *np = inet6_sk(sk);
1561 int exthdrlen;
1562 int err;
1563
1564 if (flags&MSG_PROBE)
1565 return 0;
1566 if (skb_queue_empty(&sk->sk_write_queue)) {
1567 /*
1568 * setup for corking
1569 */
1570 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1571 ipc6, rt, fl6);
1572 if (err)
1573 return err;
1574
1575 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1576 length += exthdrlen;
1577 transhdrlen += exthdrlen;
1578 } else {
1579 fl6 = &inet->cork.fl.u.ip6;
1580 transhdrlen = 0;
1581 }
1582
1583 return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1584 &np->cork, sk_page_frag(sk), getfrag,
1585 from, length, transhdrlen, flags, ipc6, sockc);
1586 }
1587 EXPORT_SYMBOL_GPL(ip6_append_data);
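
/*
 * Usage sketch, not part of the original file: the corked-send pattern
 * built from the primitives above -- queue with ip6_append_data(),
 * then commit with ip6_push_pending_frames() or abort with
 * ip6_flush_pending_frames().  All arguments are assumed to be set up
 * the way a datagram sendmsg() path would set them up; transhdrlen is
 * left at 0 for brevity.
 */
static inline int
ip6_corked_send_sketch(struct sock *sk,
		       int getfrag(void *from, char *to, int offset,
				   int len, int odd, struct sk_buff *skb),
		       void *from, int length,
		       struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		       struct rt6_info *rt,
		       const struct sockcm_cookie *sockc)
{
	int err;

	lock_sock(sk);
	err = ip6_append_data(sk, getfrag, from, length, 0, ipc6, fl6,
			      rt, 0, sockc);
	if (err)
		ip6_flush_pending_frames(sk);	/* drop what was queued */
	else
		err = ip6_push_pending_frames(sk); /* build and send */
	release_sock(sk);
	return err;
}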
1588
1589 static void ip6_cork_release(struct inet_cork_full *cork,
1590 struct inet6_cork *v6_cork)
1591 {
1592 if (v6_cork->opt) {
1593 kfree(v6_cork->opt->dst0opt);
1594 kfree(v6_cork->opt->dst1opt);
1595 kfree(v6_cork->opt->hopopt);
1596 kfree(v6_cork->opt->srcrt);
1597 kfree(v6_cork->opt);
1598 v6_cork->opt = NULL;
1599 }
1600
1601 if (cork->base.dst) {
1602 dst_release(cork->base.dst);
1603 cork->base.dst = NULL;
1604 cork->base.flags &= ~IPCORK_ALLFRAG;
1605 }
1606 memset(&cork->fl, 0, sizeof(cork->fl));
1607 }
1608
1609 struct sk_buff *__ip6_make_skb(struct sock *sk,
1610 struct sk_buff_head *queue,
1611 struct inet_cork_full *cork,
1612 struct inet6_cork *v6_cork)
1613 {
1614 struct sk_buff *skb, *tmp_skb;
1615 struct sk_buff **tail_skb;
1616 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1617 struct ipv6_pinfo *np = inet6_sk(sk);
1618 struct net *net = sock_net(sk);
1619 struct ipv6hdr *hdr;
1620 struct ipv6_txoptions *opt = v6_cork->opt;
1621 struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1622 struct flowi6 *fl6 = &cork->fl.u.ip6;
1623 unsigned char proto = fl6->flowi6_proto;
1624
1625 skb = __skb_dequeue(queue);
1626 if (!skb)
1627 goto out;
1628 tail_skb = &(skb_shinfo(skb)->frag_list);
1629
1630 /* move skb->data to ip header from ext header */
1631 if (skb->data < skb_network_header(skb))
1632 __skb_pull(skb, skb_network_offset(skb));
1633 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1634 __skb_pull(tmp_skb, skb_network_header_len(skb));
1635 *tail_skb = tmp_skb;
1636 tail_skb = &(tmp_skb->next);
1637 skb->len += tmp_skb->len;
1638 skb->data_len += tmp_skb->len;
1639 skb->truesize += tmp_skb->truesize;
1640 tmp_skb->destructor = NULL;
1641 tmp_skb->sk = NULL;
1642 }
1643
1644 /* Allow local fragmentation. */
1645 skb->ignore_df = ip6_sk_ignore_df(sk);
1646
1647 *final_dst = fl6->daddr;
1648 __skb_pull(skb, skb_network_header_len(skb));
1649 if (opt && opt->opt_flen)
1650 ipv6_push_frag_opts(skb, opt, &proto);
1651 if (opt && opt->opt_nflen)
1652 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1653
1654 skb_push(skb, sizeof(struct ipv6hdr));
1655 skb_reset_network_header(skb);
1656 hdr = ipv6_hdr(skb);
1657
1658 ip6_flow_hdr(hdr, v6_cork->tclass,
1659 ip6_make_flowlabel(net, skb, fl6->flowlabel,
1660 ip6_autoflowlabel(net, np), fl6));
1661 hdr->hop_limit = v6_cork->hop_limit;
1662 hdr->nexthdr = proto;
1663 hdr->saddr = fl6->saddr;
1664 hdr->daddr = *final_dst;
1665
1666 skb->priority = sk->sk_priority;
1667 skb->mark = sk->sk_mark;
1668
1669 skb_dst_set(skb, dst_clone(&rt->dst));
1670 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1671 if (proto == IPPROTO_ICMPV6) {
1672 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1673
1674 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1675 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1676 }
1677
1678 ip6_cork_release(cork, v6_cork);
1679 out:
1680 return skb;
1681 }
1682
1683 int ip6_send_skb(struct sk_buff *skb)
1684 {
1685 struct net *net = sock_net(skb->sk);
1686 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1687 int err;
1688
1689 err = ip6_local_out(net, skb->sk, skb);
1690 if (err) {
1691 if (err > 0)
1692 err = net_xmit_errno(err);
1693 if (err)
1694 IP6_INC_STATS(net, rt->rt6i_idev,
1695 IPSTATS_MIB_OUTDISCARDS);
1696 }
1697
1698 return err;
1699 }
1700
1701 int ip6_push_pending_frames(struct sock *sk)
1702 {
1703 struct sk_buff *skb;
1704
1705 skb = ip6_finish_skb(sk);
1706 if (!skb)
1707 return 0;
1708
1709 return ip6_send_skb(skb);
1710 }
1711 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1712
1713 static void __ip6_flush_pending_frames(struct sock *sk,
1714 struct sk_buff_head *queue,
1715 struct inet_cork_full *cork,
1716 struct inet6_cork *v6_cork)
1717 {
1718 struct sk_buff *skb;
1719
1720 while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1721 if (skb_dst(skb))
1722 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1723 IPSTATS_MIB_OUTDISCARDS);
1724 kfree_skb(skb);
1725 }
1726
1727 ip6_cork_release(cork, v6_cork);
1728 }
1729
1730 void ip6_flush_pending_frames(struct sock *sk)
1731 {
1732 __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1733 &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1734 }
1735 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1736
1737 struct sk_buff *ip6_make_skb(struct sock *sk,
1738 int getfrag(void *from, char *to, int offset,
1739 int len, int odd, struct sk_buff *skb),
1740 void *from, int length, int transhdrlen,
1741 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1742 struct rt6_info *rt, unsigned int flags,
1743 const struct sockcm_cookie *sockc)
1744 {
1745 struct inet_cork_full cork;
1746 struct inet6_cork v6_cork;
1747 struct sk_buff_head queue;
1748 int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1749 int err;
1750
1751 if (flags & MSG_PROBE)
1752 return NULL;
1753
1754 __skb_queue_head_init(&queue);
1755
1756 cork.base.flags = 0;
1757 cork.base.addr = 0;
1758 cork.base.opt = NULL;
1759 cork.base.dst = NULL;
1760 v6_cork.opt = NULL;
1761 err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1762 if (err) {
1763 ip6_cork_release(&cork, &v6_cork);
1764 return ERR_PTR(err);
1765 }
1766 if (ipc6->dontfrag < 0)
1767 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1768
1769 err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1770 &current->task_frag, getfrag, from,
1771 length + exthdrlen, transhdrlen + exthdrlen,
1772 flags, ipc6, sockc);
1773 if (err) {
1774 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1775 return ERR_PTR(err);
1776 }
1777
1778 return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1779 }