net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
   16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40 #include <linux/slab.h>
  41
  42 #include <linux/bpf-cgroup.h>
  43 #include <linux/netfilter.h>
  44 #include <linux/netfilter_ipv6.h>
  45
  46 #include <net/sock.h>
  47 #include <net/snmp.h>
  48
  49 #include <net/ipv6.h>
  50 #include <net/ndisc.h>
  51 #include <net/protocol.h>
  52 #include <net/ip6_route.h>
  53 #include <net/addrconf.h>
  54 #include <net/rawv6.h>
  55 #include <net/icmp.h>
  56 #include <net/xfrm.h>
  57 #include <net/checksum.h>
  58 #include <linux/mroute6.h>
  59 #include <net/l3mdev.h>
  60 #include <net/lwtunnel.h>
  61
  62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  63 {
  64         struct dst_entry *dst = skb_dst(skb);
  65         struct net_device *dev = dst->dev;
  66         struct neighbour *neigh;
  67         struct in6_addr *nexthop;
  68         int ret;
  69
  70         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  71                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  72
  73                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  74                     ((mroute6_socket(net, skb) &&
  75                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
  76                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
  77                                          &ipv6_hdr(skb)->saddr))) {
  78                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
  79
  80                         /* Do not check for IFF_ALLMULTI; multicast routing
  81                            is not supported in any case.
  82                          */
  83                         if (newskb)
  84                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  85                                         net, sk, newskb, NULL, newskb->dev,
  86                                         dev_loopback_xmit);
  87
  88                         if (ipv6_hdr(skb)->hop_limit == 0) {
  89                                 IP6_INC_STATS(net, idev,
  90                                               IPSTATS_MIB_OUTDISCARDS);
  91                                 kfree_skb(skb);
  92                                 return 0;
  93                         }
  94                 }
  95
  96                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
  97
  98                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
  99                     IPV6_ADDR_SCOPE_NODELOCAL &&
 100                     !(dev->flags & IFF_LOOPBACK)) {
 101                         kfree_skb(skb);
 102                         return 0;
 103                 }
 104         }
 105
 106         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 107                 int res = lwtunnel_xmit(skb);
 108
 109                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
 110                         return res;
 111         }
 112
 113         rcu_read_lock_bh();
 114         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 115         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 116         if (unlikely(!neigh))
 117                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 118         if (!IS_ERR(neigh)) {
 119                 sock_confirm_neigh(skb, neigh);
 120                 ret = neigh_output(neigh, skb);
 121                 rcu_read_unlock_bh();
 122                 return ret;
 123         }
 124         rcu_read_unlock_bh();
 125
 126         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 127         kfree_skb(skb);
 128         return -EINVAL;
 129 }
 130
 131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 132 {
 133         int ret;
 134
 135         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
 136         if (ret) {
 137                 kfree_skb(skb);
 138                 return ret;
 139         }
 140
 141         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 142             dst_allfrag(skb_dst(skb)) ||
 143             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 144                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
 145         else
 146                 return ip6_finish_output2(net, sk, skb);
 147 }
 148
 149 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 150 {
 151         struct net_device *dev = skb_dst(skb)->dev;
 152         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 153
 154         skb->protocol = htons(ETH_P_IPV6);
 155         skb->dev = dev;
 156
 157         if (unlikely(idev->cnf.disable_ipv6)) {
 158                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 159                 kfree_skb(skb);
 160                 return 0;
 161         }
 162
 163         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 164                             net, sk, skb, NULL, dev,
 165                             ip6_finish_output,
 166                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 167 }
 168
 169 /*
 170  * xmit an sk_buff (used by TCP, SCTP and DCCP)
 171  * Note : socket lock is not held for SYNACK packets, but might be modified
 172  * by calls to skb_set_owner_w() and ipv6_local_error(),
 173  * which are using proper atomic operations or spinlocks.
 174  */
 175 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 176              __u32 mark, struct ipv6_txoptions *opt, int tclass)
 177 {
 178         struct net *net = sock_net(sk);
 179         const struct ipv6_pinfo *np = inet6_sk(sk);
 180         struct in6_addr *first_hop = &fl6->daddr;
 181         struct dst_entry *dst = skb_dst(skb);
 182         struct ipv6hdr *hdr;
 183         u8  proto = fl6->flowi6_proto;
 184         int seg_len = skb->len;
 185         int hlimit = -1;
 186         u32 mtu;
 187
 188         if (opt) {
 189                 unsigned int head_room;
 190
 191                 /* First: exthdrs may take lots of space (~8K for now)
 192                    MAX_HEADER is not enough.
 193                  */
 194                 head_room = opt->opt_nflen + opt->opt_flen;
 195                 seg_len += head_room;
 196                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 197
 198                 if (skb_headroom(skb) < head_room) {
 199                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 200                         if (!skb2) {
 201                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 202                                               IPSTATS_MIB_OUTDISCARDS);
 203                                 kfree_skb(skb);
 204                                 return -ENOBUFS;
 205                         }
 206                         consume_skb(skb);
 207                         skb = skb2;
 208                         /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
 209                          * it is safe to call in our context (socket lock not held)
 210                          */
 211                         skb_set_owner_w(skb, (struct sock *)sk);
 212                 }
 213                 if (opt->opt_flen)
 214                         ipv6_push_frag_opts(skb, opt, &proto);
 215                 if (opt->opt_nflen)
 216                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 217                                              &fl6->saddr);
 218         }
 219
 220         skb_push(skb, sizeof(struct ipv6hdr));
 221         skb_reset_network_header(skb);
 222         hdr = ipv6_hdr(skb);
 223
 224         /*
 225          *      Fill in the IPv6 header
 226          */
 227         if (np)
 228                 hlimit = np->hop_limit;
 229         if (hlimit < 0)
 230                 hlimit = ip6_dst_hoplimit(dst);
 231
 232         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 233                                                      np->autoflowlabel, fl6));
 234
 235         hdr->payload_len = htons(seg_len);
 236         hdr->nexthdr = proto;
 237         hdr->hop_limit = hlimit;
 238
 239         hdr->saddr = fl6->saddr;
 240         hdr->daddr = *first_hop;
 241
 242         skb->protocol = htons(ETH_P_IPV6);
 243         skb->priority = sk->sk_priority;
 244         skb->mark = mark;
 245
 246         mtu = dst_mtu(dst);
 247         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 248                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 249                               IPSTATS_MIB_OUT, skb->len);
 250
 251                 /* if egress device is enslaved to an L3 master device pass the
 252                  * skb to its handler for processing
 253                  */
 254                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
 255                 if (unlikely(!skb))
 256                         return 0;
 257
 258                 /* hooks should never assume socket lock is held.
 259                  * we promote our socket to non const
 260                  */
 261                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 262                                net, (struct sock *)sk, skb, NULL, dst->dev,
 263                                dst_output);
 264         }
 265
 266         skb->dev = dst->dev;
 267         /* ipv6_local_error() does not require socket lock,
 268          * we promote our socket to non const
 269          */
 270         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 271
 272         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 273         kfree_skb(skb);
 274         return -EMSGSIZE;
 275 }
 276 EXPORT_SYMBOL(ip6_xmit);
 277
 278 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 279 {
 280         struct ip6_ra_chain *ra;
 281         struct sock *last = NULL;
 282
 283         read_lock(&ip6_ra_lock);
 284         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 285                 struct sock *sk = ra->sk;
 286                 if (sk && ra->sel == sel &&
 287                     (!sk->sk_bound_dev_if ||
 288                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 289                         if (last) {
 290                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 291                                 if (skb2)
 292                                         rawv6_rcv(last, skb2);
 293                         }
 294                         last = sk;
 295                 }
 296         }
 297
 298         if (last) {
 299                 rawv6_rcv(last, skb);
 300                 read_unlock(&ip6_ra_lock);
 301                 return 1;
 302         }
 303         read_unlock(&ip6_ra_lock);
 304         return 0;
 305 }
 306
 307 static int ip6_forward_proxy_check(struct sk_buff *skb)
 308 {
 309         struct ipv6hdr *hdr = ipv6_hdr(skb);
 310         u8 nexthdr = hdr->nexthdr;
 311         __be16 frag_off;
 312         int offset;
 313
 314         if (ipv6_ext_hdr(nexthdr)) {
 315                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 316                 if (offset < 0)
 317                         return 0;
 318         } else
 319                 offset = sizeof(struct ipv6hdr);
 320
 321         if (nexthdr == IPPROTO_ICMPV6) {
 322                 struct icmp6hdr *icmp6;
 323
 324                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 325                                          offset + 1 - skb->data)))
 326                         return 0;
 327
 328                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 329
 330                 switch (icmp6->icmp6_type) {
 331                 case NDISC_ROUTER_SOLICITATION:
 332                 case NDISC_ROUTER_ADVERTISEMENT:
 333                 case NDISC_NEIGHBOUR_SOLICITATION:
 334                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 335                 case NDISC_REDIRECT:
 336                         /* For reaction involving unicast neighbor discovery
 337                          * message destined to the proxied address, pass it to
 338                          * input function.
 339                          */
 340                         return 1;
 341                 default:
 342                         break;
 343                 }
 344         }
 345
 346         /*
 347          * The proxying router can't forward traffic sent to a link-local
 348          * address, so signal the sender and discard the packet. This
 349          * behavior is clarified by the MIPv6 specification.
 350          */
 351         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 352                 dst_link_failure(skb);
 353                 return -1;
 354         }
 355
 356         return 0;
 357 }
 358
 359 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 360                                      struct sk_buff *skb)
 361 {
 362         return dst_output(net, sk, skb);
 363 }
 364
 365 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
 366 {
 367         unsigned int mtu;
 368         struct inet6_dev *idev;
 369
 370         if (dst_metric_locked(dst, RTAX_MTU)) {
 371                 mtu = dst_metric_raw(dst, RTAX_MTU);
 372                 if (mtu)
 373                         return mtu;
 374         }
 375
 376         mtu = IPV6_MIN_MTU;
 377         rcu_read_lock();
 378         idev = __in6_dev_get(dst->dev);
 379         if (idev)
 380                 mtu = idev->cnf.mtu6;
 381         rcu_read_unlock();
 382
 383         return mtu;
 384 }
 385
 386 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 387 {
 388         if (skb->len <= mtu)
 389                 return false;
 390
 391         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 392         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 393                 return true;
 394
 395         if (skb->ignore_df)
 396                 return false;
 397
 398         if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
 399                 return false;
 400
 401         return true;
 402 }
 403
 404 int ip6_forward(struct sk_buff *skb)
 405 {
 406         struct dst_entry *dst = skb_dst(skb);
 407         struct ipv6hdr *hdr = ipv6_hdr(skb);
 408         struct inet6_skb_parm *opt = IP6CB(skb);
 409         struct net *net = dev_net(dst->dev);
 410         u32 mtu;
 411
 412         if (net->ipv6.devconf_all->forwarding == 0)
 413                 goto error;
 414
 415         if (skb->pkt_type != PACKET_HOST)
 416                 goto drop;
 417
 418         if (unlikely(skb->sk))
 419                 goto drop;
 420
 421         if (skb_warn_if_lro(skb))
 422                 goto drop;
 423
 424         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 425                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 426                                 IPSTATS_MIB_INDISCARDS);
 427                 goto drop;
 428         }
 429
 430         skb_forward_csum(skb);
 431
 432         /*
 433          *      We DO NOT make any processing on
 434          *      RA packets, pushing them to user level AS IS
 435          *      without ane WARRANTY that application will be able
 436          *      to interpret them. The reason is that we
 437          *      cannot make anything clever here.
 438          *
 439          *      We are not end-node, so that if packet contains
 440          *      AH/ESP, we cannot make anything.
 441          *      Defragmentation also would be mistake, RA packets
 442          *      cannot be fragmented, because there is no warranty
 443          *      that different fragments will go along one path. --ANK
 444          */
 445         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 446                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 447                         return 0;
 448         }
 449
 450         /*
 451          *      check and decrement ttl
 452          */
 453         if (hdr->hop_limit <= 1) {
 454                 /* Force OUTPUT device used as source address */
 455                 skb->dev = dst->dev;
 456                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 457                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 458                                 IPSTATS_MIB_INHDRERRORS);
 459
 460                 kfree_skb(skb);
 461                 return -ETIMEDOUT;
 462         }
 463
 464         /* XXX: idev->cnf.proxy_ndp? */
 465         if (net->ipv6.devconf_all->proxy_ndp &&
 466             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 467                 int proxied = ip6_forward_proxy_check(skb);
 468                 if (proxied > 0)
 469                         return ip6_input(skb);
 470                 else if (proxied < 0) {
 471                         __IP6_INC_STATS(net, ip6_dst_idev(dst),
 472                                         IPSTATS_MIB_INDISCARDS);
 473                         goto drop;
 474                 }
 475         }
 476
 477         if (!xfrm6_route_forward(skb)) {
 478                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 479                                 IPSTATS_MIB_INDISCARDS);
 480                 goto drop;
 481         }
 482         dst = skb_dst(skb);
 483
 484         /* IPv6 specs say nothing about it, but it is clear that we cannot
 485            send redirects to source routed frames.
 486            We don't send redirects to frames decapsulated from IPsec.
 487          */
 488         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
 489                 struct in6_addr *target = NULL;
 490                 struct inet_peer *peer;
 491                 struct rt6_info *rt;
 492
 493                 /*
 494                  *      incoming and outgoing devices are the same
 495                  *      send a redirect.
 496                  */
 497
 498                 rt = (struct rt6_info *) dst;
 499                 if (rt->rt6i_flags & RTF_GATEWAY)
 500                         target = &rt->rt6i_gateway;
 501                 else
 502                         target = &hdr->daddr;
 503
 504                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 505
 506                 /* Limit redirects both by destination (here)
 507                    and by source (inside ndisc_send_redirect)
 508                  */
 509                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 510                         ndisc_send_redirect(skb, target);
 511                 if (peer)
 512                         inet_putpeer(peer);
 513         } else {
 514                 int addrtype = ipv6_addr_type(&hdr->saddr);
 515
 516                 /* This check is security critical. */
 517                 if (addrtype == IPV6_ADDR_ANY ||
 518                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 519                         goto error;
 520                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 521                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 522                                     ICMPV6_NOT_NEIGHBOUR, 0);
 523                         goto error;
 524                 }
 525         }
 526
 527         mtu = ip6_dst_mtu_forward(dst);
 528         if (mtu < IPV6_MIN_MTU)
 529                 mtu = IPV6_MIN_MTU;
 530
 531         if (ip6_pkt_too_big(skb, mtu)) {
 532                 /* Again, force OUTPUT device used as source address */
 533                 skb->dev = dst->dev;
 534                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 535                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 536                                 IPSTATS_MIB_INTOOBIGERRORS);
 537                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 538                                 IPSTATS_MIB_FRAGFAILS);
 539                 kfree_skb(skb);
 540                 return -EMSGSIZE;
 541         }
 542
 543         if (skb_cow(skb, dst->dev->hard_header_len)) {
 544                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 545                                 IPSTATS_MIB_OUTDISCARDS);
 546                 goto drop;
 547         }
 548
 549         hdr = ipv6_hdr(skb);
 550
 551         /* Mangling hops number delayed to point after skb COW */
 552
 553         hdr->hop_limit--;
 554
 555         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 556         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 557         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 558                        net, NULL, skb, skb->dev, dst->dev,
 559                        ip6_forward_finish);
 560
 561 error:
 562         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 563 drop:
 564         kfree_skb(skb);
 565         return -EINVAL;
 566 }
 567
 568 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 569 {
 570         to->pkt_type = from->pkt_type;
 571         to->priority = from->priority;
 572         to->protocol = from->protocol;
 573         skb_dst_drop(to);
 574         skb_dst_set(to, dst_clone(skb_dst(from)));
 575         to->dev = from->dev;
 576         to->mark = from->mark;
 577
 578 #ifdef CONFIG_NET_SCHED
 579         to->tc_index = from->tc_index;
 580 #endif
 581         nf_copy(to, from);
 582         skb_copy_secmark(to, from);
 583 }
 584
 585 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 586                  int (*output)(struct net *, struct sock *, struct sk_buff *))
 587 {
 588         struct sk_buff *frag;
 589         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 590         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 591                                 inet6_sk(skb->sk) : NULL;
 592         struct ipv6hdr *tmp_hdr;
 593         struct frag_hdr *fh;
 594         unsigned int mtu, hlen, left, len;
 595         int hroom, troom;
 596         __be32 frag_id;
 597         int ptr, offset = 0, err = 0;
 598         u8 *prevhdr, nexthdr = 0;
 599
 600         err = ip6_find_1stfragopt(skb, &prevhdr);
 601         if (err < 0)
 602                 goto fail;
 603         hlen = err;
 604         nexthdr = *prevhdr;
 605
 606         mtu = ip6_skb_dst_mtu(skb);
 607
 608         /* We must not fragment if the socket is set to force MTU discovery
  609          * or if the skb is not generated by a local socket.
 610          */
 611         if (unlikely(!skb->ignore_df && skb->len > mtu))
 612                 goto fail_toobig;
 613
 614         if (IP6CB(skb)->frag_max_size) {
 615                 if (IP6CB(skb)->frag_max_size > mtu)
 616                         goto fail_toobig;
 617
 618                 /* don't send fragments larger than what we received */
 619                 mtu = IP6CB(skb)->frag_max_size;
 620                 if (mtu < IPV6_MIN_MTU)
 621                         mtu = IPV6_MIN_MTU;
 622         }
 623
 624         if (np && np->frag_size < mtu) {
 625                 if (np->frag_size)
 626                         mtu = np->frag_size;
 627         }
 628         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
 629                 goto fail_toobig;
 630         mtu -= hlen + sizeof(struct frag_hdr);
 631
 632         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 633                                     &ipv6_hdr(skb)->saddr);
 634
 635         if (skb->ip_summed == CHECKSUM_PARTIAL &&
 636             (err = skb_checksum_help(skb)))
 637                 goto fail;
 638
 639         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 640         if (skb_has_frag_list(skb)) {
 641                 unsigned int first_len = skb_pagelen(skb);
 642                 struct sk_buff *frag2;
 643
 644                 if (first_len - hlen > mtu ||
 645                     ((first_len - hlen) & 7) ||
 646                     skb_cloned(skb) ||
 647                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 648                         goto slow_path;
 649
 650                 skb_walk_frags(skb, frag) {
 651                         /* Correct geometry. */
 652                         if (frag->len > mtu ||
 653                             ((frag->len & 7) && frag->next) ||
 654                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 655                                 goto slow_path_clean;
 656
 657                         /* Partially cloned skb? */
 658                         if (skb_shared(frag))
 659                                 goto slow_path_clean;
 660
 661                         BUG_ON(frag->sk);
 662                         if (skb->sk) {
 663                                 frag->sk = skb->sk;
 664                                 frag->destructor = sock_wfree;
 665                         }
 666                         skb->truesize -= frag->truesize;
 667                 }
 668
 669                 err = 0;
 670                 offset = 0;
 671                 /* BUILD HEADER */
 672
 673                 *prevhdr = NEXTHDR_FRAGMENT;
 674                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 675                 if (!tmp_hdr) {
 676                         err = -ENOMEM;
 677                         goto fail;
 678                 }
 679                 frag = skb_shinfo(skb)->frag_list;
 680                 skb_frag_list_init(skb);
 681
 682                 __skb_pull(skb, hlen);
 683                 fh = __skb_push(skb, sizeof(struct frag_hdr));
 684                 __skb_push(skb, hlen);
 685                 skb_reset_network_header(skb);
 686                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 687
 688                 fh->nexthdr = nexthdr;
 689                 fh->reserved = 0;
 690                 fh->frag_off = htons(IP6_MF);
 691                 fh->identification = frag_id;
 692
 693                 first_len = skb_pagelen(skb);
 694                 skb->data_len = first_len - skb_headlen(skb);
 695                 skb->len = first_len;
 696                 ipv6_hdr(skb)->payload_len = htons(first_len -
 697                                                    sizeof(struct ipv6hdr));
 698
 699                 for (;;) {
 700                         /* Prepare header of the next frame,
 701                          * before previous one went down. */
 702                         if (frag) {
 703                                 frag->ip_summed = CHECKSUM_NONE;
 704                                 skb_reset_transport_header(frag);
 705                                 fh = __skb_push(frag, sizeof(struct frag_hdr));
 706                                 __skb_push(frag, hlen);
 707                                 skb_reset_network_header(frag);
 708                                 memcpy(skb_network_header(frag), tmp_hdr,
 709                                        hlen);
 710                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 711                                 fh->nexthdr = nexthdr;
 712                                 fh->reserved = 0;
 713                                 fh->frag_off = htons(offset);
 714                                 if (frag->next)
 715                                         fh->frag_off |= htons(IP6_MF);
 716                                 fh->identification = frag_id;
 717                                 ipv6_hdr(frag)->payload_len =
 718                                                 htons(frag->len -
 719                                                       sizeof(struct ipv6hdr));
 720                                 ip6_copy_metadata(frag, skb);
 721                         }
 722
 723                         err = output(net, sk, skb);
 724                         if (!err)
 725                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 726                                               IPSTATS_MIB_FRAGCREATES);
 727
 728                         if (err || !frag)
 729                                 break;
 730
 731                         skb = frag;
 732                         frag = skb->next;
 733                         skb->next = NULL;
 734                 }
 735
 736                 kfree(tmp_hdr);
 737
 738                 if (err == 0) {
 739                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 740                                       IPSTATS_MIB_FRAGOKS);
 741                         return 0;
 742                 }
 743
 744                 kfree_skb_list(frag);
 745
 746                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 747                               IPSTATS_MIB_FRAGFAILS);
 748                 return err;
 749
 750 slow_path_clean:
 751                 skb_walk_frags(skb, frag2) {
 752                         if (frag2 == frag)
 753                                 break;
 754                         frag2->sk = NULL;
 755                         frag2->destructor = NULL;
 756                         skb->truesize += frag2->truesize;
 757                 }
 758         }
 759
 760 slow_path:
 761         left = skb->len - hlen;         /* Space per frame */
 762         ptr = hlen;                     /* Where to start from */
 763
 764         /*
 765          *      Fragment the datagram.
 766          */
 767
 768         troom = rt->dst.dev->needed_tailroom;
 769
 770         /*
 771          *      Keep copying data until we run out.
 772          */
 773         while (left > 0)        {
 774                 u8 *fragnexthdr_offset;
 775
 776                 len = left;
 777                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 778                 if (len > mtu)
 779                         len = mtu;
 780                 /* IF: we are not sending up to and including the packet end
 781                    then align the next start on an eight byte boundary */
 782                 if (len < left) {
 783                         len &= ~7;
 784                 }
 785
 786                 /* Allocate buffer */
 787                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
 788                                  hroom + troom, GFP_ATOMIC);
 789                 if (!frag) {
 790                         err = -ENOMEM;
 791                         goto fail;
 792                 }
 793
 794                 /*
 795                  *      Set up data on packet
 796                  */
 797
 798                 ip6_copy_metadata(frag, skb);
 799                 skb_reserve(frag, hroom);
 800                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 801                 skb_reset_network_header(frag);
 802                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 803                 frag->transport_header = (frag->network_header + hlen +
 804                                           sizeof(struct frag_hdr));
 805
 806                 /*
 807                  *      Charge the memory for the fragment to any owner
 808                  *      it might possess
 809                  */
 810                 if (skb->sk)
 811                         skb_set_owner_w(frag, skb->sk);
 812
 813                 /*
 814                  *      Copy the packet header into the new buffer.
 815                  */
 816                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 817
 818                 fragnexthdr_offset = skb_network_header(frag);
 819                 fragnexthdr_offset += prevhdr - skb_network_header(skb);
 820                 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
 821
 822                 /*
 823                  *      Build fragment header.
 824                  */
 825                 fh->nexthdr = nexthdr;
 826                 fh->reserved = 0;
 827                 fh->identification = frag_id;
 828
 829                 /*
 830                  *      Copy a block of the IP datagram.
 831                  */
 832                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
 833                                      len));
 834                 left -= len;
 835
 836                 fh->frag_off = htons(offset);
 837                 if (left > 0)
 838                         fh->frag_off |= htons(IP6_MF);
 839                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 840                                                     sizeof(struct ipv6hdr));
 841
 842                 ptr += len;
 843                 offset += len;
 844
 845                 /*
 846                  *      Put this fragment into the sending queue.
 847                  */
 848                 err = output(net, sk, frag);
 849                 if (err)
 850                         goto fail;
 851
 852                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 853                               IPSTATS_MIB_FRAGCREATES);
 854         }
 855         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 856                       IPSTATS_MIB_FRAGOKS);
 857         consume_skb(skb);
 858         return err;
 859
 860 fail_toobig:
 861         if (skb->sk && dst_allfrag(skb_dst(skb)))
 862                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 863
 864         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 865         err = -EMSGSIZE;
 866
 867 fail:
 868         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 869                       IPSTATS_MIB_FRAGFAILS);
 870         kfree_skb(skb);
 871         return err;
 872 }
 873
 874 static inline int ip6_rt_check(const struct rt6key *rt_key,
 875                                const struct in6_addr *fl_addr,
 876                                const struct in6_addr *addr_cache)
 877 {
 878         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 879                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
 880 }
 881
 882 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 883                                           struct dst_entry *dst,
 884                                           const struct flowi6 *fl6)
 885 {
 886         struct ipv6_pinfo *np = inet6_sk(sk);
 887         struct rt6_info *rt;
 888
 889         if (!dst)
 890                 goto out;
 891
 892         if (dst->ops->family != AF_INET6) {
 893                 dst_release(dst);
 894                 return NULL;
 895         }
 896
 897         rt = (struct rt6_info *)dst;
 898         /* Yes, checking route validity in not connected
 899          * case is not very simple. Take into account,
 900          * that we do not support routing by source, TOS,
 901          * and MSG_DONTROUTE            --ANK (980726)
 902          *
 903          * 1. ip6_rt_check(): If route was host route,
 904          *    check that cached destination is current.
 905          *    If it is network route, we still may
 906          *    check its validity using saved pointer
 907          *    to the last used address: daddr_cache.
 908          *    We do not want to save whole address now,
 909          *    (because main consumer of this service
 910          *    is tcp, which has not this problem),
 911          *    so that the last trick works only on connected
 912          *    sockets.
 913          * 2. oif also should be the same.
 914          */
 915         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
 916 #ifdef CONFIG_IPV6_SUBTREES
 917             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 918 #endif
 919            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
 920               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
 921                 dst_release(dst);
 922                 dst = NULL;
 923         }
 924
 925 out:
 926         return dst;
 927 }
 928
/*
 * ip6_dst_lookup_tail - common back end of the ip6_dst_lookup*() helpers
 * @net: passed to the route, source-address and ifaddr lookups and used
 *       for the OUTNOROUTES statistic
 * @sk: socket the lookup is done for; may be NULL (srcprefs then default to 0)
 * @dst: in/out; *@dst may already hold a dst entry on entry, in which
 *       case no initial route lookup is done for it
 * @fl6: flow to route; fl6->saddr is filled in when it was unspecified
 *
 * Returns 0 with *@dst set on success.  On any failure *@dst is released,
 * set to NULL and a negative errno is returned; -ENETUNREACH additionally
 * bumps IPSTATS_MIB_OUTNOROUTES.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
                /* NOTE(review): shadows the function-scope rt when
                 * CONFIG_IPV6_OPTIMISTIC_DAD is enabled.
                 */
                struct rt6_info *rt;
                bool had_dst = *dst != NULL;

                if (!had_dst)
                        *dst = ip6_route_output(net, sk, fl6);
                /* An error dst carries no usable route for saddr selection. */
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
                err = ip6_route_get_saddr(net, rt, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if (!had_dst && (*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                /* Only affects the second lookup below, if one happens. */
                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        /* (Re)do the lookup, now with the source address filled in. */
        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is not in a NUD_VALID
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        /* err is only a local marker here; it is not returned as such. */
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        /* Same flow, but with an all-zero destination. */
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif
        /* A v4-mapped source needs a v4-mapped or unspecified destination. */
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        /* Success, even if the neighbour check above left err non-zero. */
        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}
1040
/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @net: network namespace handed to the lookup
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *      *@dst is overwritten with NULL before the lookup, so whatever it
 *      held on entry is ignored and not released here.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        /* Start from scratch: never reuse a caller-supplied dst. */
        *dst = NULL;
        return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1058
1059 /**
1060  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1061  *      @sk: socket which provides route info
1062  *      @fl6: flow to lookup
1063  *      @final_dst: final destination address for ipsec lookup
1064  *
1065  *      This function performs a route lookup on the given flow.
1066  *
1067  *      It returns a valid dst pointer on success, or a pointer encoded
1068  *      error code.
1069  */
1070 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1071                                       const struct in6_addr *final_dst)
1072 {
1073         struct dst_entry *dst = NULL;
1074         int err;
1075
1076         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1077         if (err)
1078                 return ERR_PTR(err);
1079         if (final_dst)
1080                 fl6->daddr = *final_dst;
1081
1082         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1083 }
1084 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1085
1086 /**
1087  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1088  *      @sk: socket which provides the dst cache and route info
1089  *      @fl6: flow to lookup
1090  *      @final_dst: final destination address for ipsec lookup
1091  *
1092  *      This function performs a route lookup on the given flow with the
1093  *      possibility of using the cached route in the socket if it is valid.
1094  *      It will take the socket dst lock when operating on the dst cache.
1095  *      As a result, this function can only be used in process context.
1096  *
1097  *      It returns a valid dst pointer on success, or a pointer encoded
1098  *      error code.
1099  */
1100 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1101                                          const struct in6_addr *final_dst)
1102 {
1103         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1104
1105         dst = ip6_sk_dst_check(sk, dst, fl6);
1106         if (!dst)
1107                 dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1108
1109         return dst;
1110 }
1111 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1112
1113 static inline int ip6_ufo_append_data(struct sock *sk,
1114                         struct sk_buff_head *queue,
1115                         int getfrag(void *from, char *to, int offset, int len,
1116                         int odd, struct sk_buff *skb),
1117                         void *from, int length, int hh_len, int fragheaderlen,
1118                         int exthdrlen, int transhdrlen, int mtu,
1119                         unsigned int flags, const struct flowi6 *fl6)
1120
1121 {
1122         struct sk_buff *skb;
1123         int err;
1124
1125         /* There is support for UDP large send offload by network
1126          * device, so create one single skb packet containing complete
1127          * udp datagram
1128          */
1129         skb = skb_peek_tail(queue);
1130         if (!skb) {
1131                 skb = sock_alloc_send_skb(sk,
1132                         hh_len + fragheaderlen + transhdrlen + 20,
1133                         (flags & MSG_DONTWAIT), &err);
1134                 if (!skb)
1135                         return err;
1136
1137                 /* reserve space for Hardware header */
1138                 skb_reserve(skb, hh_len);
1139
1140                 /* create space for UDP/IP header */
1141                 skb_put(skb, fragheaderlen + transhdrlen);
1142
1143                 /* initialize network header pointer */
1144                 skb_set_network_header(skb, exthdrlen);
1145
1146                 /* initialize protocol header pointer */
1147                 skb->transport_header = skb->network_header + fragheaderlen;
1148
1149                 skb->protocol = htons(ETH_P_IPV6);
1150                 skb->csum = 0;
1151
1152                 if (flags & MSG_CONFIRM)
1153                         skb_set_dst_pending_confirm(skb, 1);
1154
1155                 __skb_queue_tail(queue, skb);
1156         } else if (skb_is_gso(skb)) {
1157                 goto append;
1158         }
1159
1160         skb->ip_summed = CHECKSUM_PARTIAL;
1161         /* Specify the length of each IPv6 datagram fragment.
1162          * It has to be a multiple of 8.
1163          */
1164         skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1165                                      sizeof(struct frag_hdr)) & ~7;
1166         skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1167         skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
1168                                                          &fl6->daddr,
1169                                                          &fl6->saddr);
1170
1171 append:
1172         return skb_append_datato_frags(sk, skb, getfrag, from,
1173                                        (length - transhdrlen));
1174 }
1175
1176 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1177                                                gfp_t gfp)
1178 {
1179         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1180 }
1181
1182 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1183                                                 gfp_t gfp)
1184 {
1185         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1186 }
1187
1188 static void ip6_append_data_mtu(unsigned int *mtu,
1189                                 int *maxfraglen,
1190                                 unsigned int fragheaderlen,
1191                                 struct sk_buff *skb,
1192                                 struct rt6_info *rt,
1193                                 unsigned int orig_mtu)
1194 {
1195         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1196                 if (!skb) {
1197                         /* first fragment, reserve header_len */
1198                         *mtu = orig_mtu - rt->dst.header_len;
1199
1200                 } else {
1201                         /*
1202                          * this fragment is not first, the headers
1203                          * space is regarded as data space.
1204                          */
1205                         *mtu = orig_mtu;
1206                 }
1207                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1208                               + fragheaderlen - sizeof(struct frag_hdr);
1209         }
1210 }
1211
1212 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1213                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1214                           struct rt6_info *rt, struct flowi6 *fl6)
1215 {
1216         struct ipv6_pinfo *np = inet6_sk(sk);
1217         unsigned int mtu;
1218         struct ipv6_txoptions *opt = ipc6->opt;
1219
1220         /*
1221          * setup for corking
1222          */
1223         if (opt) {
1224                 if (WARN_ON(v6_cork->opt))
1225                         return -EINVAL;
1226
1227                 v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1228                 if (unlikely(!v6_cork->opt))
1229                         return -ENOBUFS;
1230
1231                 v6_cork->opt->tot_len = opt->tot_len;
1232                 v6_cork->opt->opt_flen = opt->opt_flen;
1233                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1234
1235                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1236                                                     sk->sk_allocation);
1237                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1238                         return -ENOBUFS;
1239
1240                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1241                                                     sk->sk_allocation);
1242                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1243                         return -ENOBUFS;
1244
1245                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1246                                                    sk->sk_allocation);
1247                 if (opt->hopopt && !v6_cork->opt->hopopt)
1248                         return -ENOBUFS;
1249
1250                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1251                                                     sk->sk_allocation);
1252                 if (opt->srcrt && !v6_cork->opt->srcrt)
1253                         return -ENOBUFS;
1254
1255                 /* need source address above miyazawa*/
1256         }
1257         dst_hold(&rt->dst);
1258         cork->base.dst = &rt->dst;
1259         cork->fl.u.ip6 = *fl6;
1260         v6_cork->hop_limit = ipc6->hlimit;
1261         v6_cork->tclass = ipc6->tclass;
1262         if (rt->dst.flags & DST_XFRM_TUNNEL)
1263                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1264                       rt->dst.dev->mtu : dst_mtu(&rt->dst);
1265         else
1266                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1267                       rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1268         if (np->frag_size < mtu) {
1269                 if (np->frag_size)
1270                         mtu = np->frag_size;
1271         }
1272         cork->base.fragsize = mtu;
1273         if (dst_allfrag(rt->dst.path))
1274                 cork->base.flags |= IPCORK_ALLFRAG;
1275         cork->base.length = 0;
1276
1277         return 0;
1278 }
1279
1280 static int __ip6_append_data(struct sock *sk,
1281                              struct flowi6 *fl6,
1282                              struct sk_buff_head *queue,
1283                              struct inet_cork *cork,
1284                              struct inet6_cork *v6_cork,
1285                              struct page_frag *pfrag,
1286                              int getfrag(void *from, char *to, int offset,
1287                                          int len, int odd, struct sk_buff *skb),
1288                              void *from, int length, int transhdrlen,
1289                              unsigned int flags, struct ipcm6_cookie *ipc6,
1290                              const struct sockcm_cookie *sockc)
1291 {
1292         struct sk_buff *skb, *skb_prev = NULL;
1293         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1294         int exthdrlen = 0;
1295         int dst_exthdrlen = 0;
1296         int hh_len;
1297         int copy;
1298         int err;
1299         int offset = 0;
1300         __u8 tx_flags = 0;
1301         u32 tskey = 0;
1302         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1303         struct ipv6_txoptions *opt = v6_cork->opt;
1304         int csummode = CHECKSUM_NONE;
1305         unsigned int maxnonfragsize, headersize;
1306
1307         skb = skb_peek_tail(queue);
1308         if (!skb) {
1309                 exthdrlen = opt ? opt->opt_flen : 0;
1310                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1311         }
1312
1313         mtu = cork->fragsize;
1314         orig_mtu = mtu;
1315
1316         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1317
1318         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1319                         (opt ? opt->opt_nflen : 0);
1320         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1321                      sizeof(struct frag_hdr);
1322
1323         headersize = sizeof(struct ipv6hdr) +
1324                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1325                      (dst_allfrag(&rt->dst) ?
1326                       sizeof(struct frag_hdr) : 0) +
1327                      rt->rt6i_nfheader_len;
1328
1329         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1330             (sk->sk_protocol == IPPROTO_UDP ||
1331              sk->sk_protocol == IPPROTO_RAW)) {
1332                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1333                                 sizeof(struct ipv6hdr));
1334                 goto emsgsize;
1335         }
1336
1337         if (ip6_sk_ignore_df(sk))
1338                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1339         else
1340                 maxnonfragsize = mtu;
1341
1342         if (cork->length + length > maxnonfragsize - headersize) {
1343 emsgsize:
1344                 ipv6_local_error(sk, EMSGSIZE, fl6,
1345                                  mtu - headersize +
1346                                  sizeof(struct ipv6hdr));
1347                 return -EMSGSIZE;
1348         }
1349
1350         /* CHECKSUM_PARTIAL only with no extension headers and when
1351          * we are not going to fragment
1352          */
1353         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1354             headersize == sizeof(struct ipv6hdr) &&
1355             length <= mtu - headersize &&
1356             !(flags & MSG_MORE) &&
1357             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1358                 csummode = CHECKSUM_PARTIAL;
1359
1360         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1361                 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1362                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1363                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1364                         tskey = sk->sk_tskey++;
1365         }
1366
1367         /*
1368          * Let's try using as much space as possible.
1369          * Use MTU if total length of the message fits into the MTU.
1370          * Otherwise, we need to reserve fragment header and
1371          * fragment alignment (= 8-15 octects, in total).
1372          *
1373          * Note that we may need to "move" the data from the tail of
1374          * of the buffer to the new fragment when we split
1375          * the message.
1376          *
1377          * FIXME: It may be fragmented into multiple chunks
1378          *        at once if non-fragmentable extension headers
1379          *        are too large.
1380          * --yoshfuji
1381          */
1382
1383         cork->length += length;
1384         if ((((length + (skb ? skb->len : headersize)) > mtu) ||
1385              (skb && skb_is_gso(skb))) &&
1386             (sk->sk_protocol == IPPROTO_UDP) &&
1387             (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
1388             (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
1389                 err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1390                                           hh_len, fragheaderlen, exthdrlen,
1391                                           transhdrlen, mtu, flags, fl6);
1392                 if (err)
1393                         goto error;
1394                 return 0;
1395         }
1396
1397         if (!skb)
1398                 goto alloc_new_skb;
1399
1400         while (length > 0) {
1401                 /* Check if the remaining data fits into current packet. */
1402                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1403                 if (copy < length)
1404                         copy = maxfraglen - skb->len;
1405
1406                 if (copy <= 0) {
1407                         char *data;
1408                         unsigned int datalen;
1409                         unsigned int fraglen;
1410                         unsigned int fraggap;
1411                         unsigned int alloclen;
1412 alloc_new_skb:
1413                         /* There's no room in the current skb */
1414                         if (skb)
1415                                 fraggap = skb->len - maxfraglen;
1416                         else
1417                                 fraggap = 0;
1418                         /* update mtu and maxfraglen if necessary */
1419                         if (!skb || !skb_prev)
1420                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1421                                                     fragheaderlen, skb, rt,
1422                                                     orig_mtu);
1423
1424                         skb_prev = skb;
1425
1426                         /*
1427                          * If remaining data exceeds the mtu,
1428                          * we know we need more fragment(s).
1429                          */
1430                         datalen = length + fraggap;
1431
1432                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1433                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1434                         if ((flags & MSG_MORE) &&
1435                             !(rt->dst.dev->features&NETIF_F_SG))
1436                                 alloclen = mtu;
1437                         else
1438                                 alloclen = datalen + fragheaderlen;
1439
1440                         alloclen += dst_exthdrlen;
1441
1442                         if (datalen != length + fraggap) {
1443                                 /*
1444                                  * this is not the last fragment, the trailer
1445                                  * space is regarded as data space.
1446                                  */
1447                                 datalen += rt->dst.trailer_len;
1448                         }
1449
1450                         alloclen += rt->dst.trailer_len;
1451                         fraglen = datalen + fragheaderlen;
1452
1453                         /*
1454                          * We just reserve space for fragment header.
1455                          * Note: this may be overallocation if the message
1456                          * (without MSG_MORE) fits into the MTU.
1457                          */
1458                         alloclen += sizeof(struct frag_hdr);
1459
1460                         copy = datalen - transhdrlen - fraggap;
1461                         if (copy < 0) {
1462                                 err = -EINVAL;
1463                                 goto error;
1464                         }
1465                         if (transhdrlen) {
1466                                 skb = sock_alloc_send_skb(sk,
1467                                                 alloclen + hh_len,
1468                                                 (flags & MSG_DONTWAIT), &err);
1469                         } else {
1470                                 skb = NULL;
1471                                 if (refcount_read(&sk->sk_wmem_alloc) <=
1472                                     2 * sk->sk_sndbuf)
1473                                         skb = sock_wmalloc(sk,
1474                                                            alloclen + hh_len, 1,
1475                                                            sk->sk_allocation);
1476                                 if (unlikely(!skb))
1477                                         err = -ENOBUFS;
1478                         }
1479                         if (!skb)
1480                                 goto error;
1481                         /*
1482                          *      Fill in the control structures
1483                          */
1484                         skb->protocol = htons(ETH_P_IPV6);
1485                         skb->ip_summed = csummode;
1486                         skb->csum = 0;
1487                         /* reserve for fragmentation and ipsec header */
1488                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1489                                     dst_exthdrlen);
1490
1491                         /* Only the initial fragment is time stamped */
1492                         skb_shinfo(skb)->tx_flags = tx_flags;
1493                         tx_flags = 0;
1494                         skb_shinfo(skb)->tskey = tskey;
1495                         tskey = 0;
1496
1497                         /*
1498                          *      Find where to start putting bytes
1499                          */
1500                         data = skb_put(skb, fraglen);
1501                         skb_set_network_header(skb, exthdrlen);
1502                         data += fragheaderlen;
1503                         skb->transport_header = (skb->network_header +
1504                                                  fragheaderlen);
1505                         if (fraggap) {
1506                                 skb->csum = skb_copy_and_csum_bits(
1507                                         skb_prev, maxfraglen,
1508                                         data + transhdrlen, fraggap, 0);
1509                                 skb_prev->csum = csum_sub(skb_prev->csum,
1510                                                           skb->csum);
1511                                 data += fraggap;
1512                                 pskb_trim_unique(skb_prev, maxfraglen);
1513                         }
1514                         if (copy > 0 &&
1515                             getfrag(from, data + transhdrlen, offset,
1516                                     copy, fraggap, skb) < 0) {
1517                                 err = -EFAULT;
1518                                 kfree_skb(skb);
1519                                 goto error;
1520                         }
1521
1522                         offset += copy;
1523                         length -= datalen - fraggap;
1524                         transhdrlen = 0;
1525                         exthdrlen = 0;
1526                         dst_exthdrlen = 0;
1527
1528                         if ((flags & MSG_CONFIRM) && !skb_prev)
1529                                 skb_set_dst_pending_confirm(skb, 1);
1530
1531                         /*
1532                          * Put the packet on the pending queue
1533                          */
1534                         __skb_queue_tail(queue, skb);
1535                         continue;
1536                 }
1537
1538                 if (copy > length)
1539                         copy = length;
1540
1541                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1542                         unsigned int off;
1543
1544                         off = skb->len;
1545                         if (getfrag(from, skb_put(skb, copy),
1546                                                 offset, copy, off, skb) < 0) {
1547                                 __skb_trim(skb, off);
1548                                 err = -EFAULT;
1549                                 goto error;
1550                         }
1551                 } else {
1552                         int i = skb_shinfo(skb)->nr_frags;
1553
1554                         err = -ENOMEM;
1555                         if (!sk_page_frag_refill(sk, pfrag))
1556                                 goto error;
1557
1558                         if (!skb_can_coalesce(skb, i, pfrag->page,
1559                                               pfrag->offset)) {
1560                                 err = -EMSGSIZE;
1561                                 if (i == MAX_SKB_FRAGS)
1562                                         goto error;
1563
1564                                 __skb_fill_page_desc(skb, i, pfrag->page,
1565                                                      pfrag->offset, 0);
1566                                 skb_shinfo(skb)->nr_frags = ++i;
1567                                 get_page(pfrag->page);
1568                         }
1569                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1570                         if (getfrag(from,
1571                                     page_address(pfrag->page) + pfrag->offset,
1572                                     offset, copy, skb->len, skb) < 0)
1573                                 goto error_efault;
1574
1575                         pfrag->offset += copy;
1576                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1577                         skb->len += copy;
1578                         skb->data_len += copy;
1579                         skb->truesize += copy;
1580                         refcount_add(copy, &sk->sk_wmem_alloc);
1581                 }
1582                 offset += copy;
1583                 length -= copy;
1584         }
1585
1586         return 0;
1587
1588 error_efault:
1589         err = -EFAULT;
1590 error:
1591         cork->length -= length;
1592         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1593         return err;
1594 }
1595
1596 int ip6_append_data(struct sock *sk,
1597                     int getfrag(void *from, char *to, int offset, int len,
1598                                 int odd, struct sk_buff *skb),
1599                     void *from, int length, int transhdrlen,
1600                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1601                     struct rt6_info *rt, unsigned int flags,
1602                     const struct sockcm_cookie *sockc)
1603 {
1604         struct inet_sock *inet = inet_sk(sk);
1605         struct ipv6_pinfo *np = inet6_sk(sk);
1606         int exthdrlen;
1607         int err;
1608
1609         if (flags&MSG_PROBE)
1610                 return 0;
1611         if (skb_queue_empty(&sk->sk_write_queue)) {
1612                 /*
1613                  * setup for corking
1614                  */
1615                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1616                                      ipc6, rt, fl6);
1617                 if (err)
1618                         return err;
1619
1620                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1621                 length += exthdrlen;
1622                 transhdrlen += exthdrlen;
1623         } else {
1624                 fl6 = &inet->cork.fl.u.ip6;
1625                 transhdrlen = 0;
1626         }
1627
1628         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1629                                  &np->cork, sk_page_frag(sk), getfrag,
1630                                  from, length, transhdrlen, flags, ipc6, sockc);
1631 }
1632 EXPORT_SYMBOL_GPL(ip6_append_data);
1633
1634 static void ip6_cork_release(struct inet_cork_full *cork,
1635                              struct inet6_cork *v6_cork)
1636 {
1637         if (v6_cork->opt) {
1638                 kfree(v6_cork->opt->dst0opt);
1639                 kfree(v6_cork->opt->dst1opt);
1640                 kfree(v6_cork->opt->hopopt);
1641                 kfree(v6_cork->opt->srcrt);
1642                 kfree(v6_cork->opt);
1643                 v6_cork->opt = NULL;
1644         }
1645
1646         if (cork->base.dst) {
1647                 dst_release(cork->base.dst);
1648                 cork->base.dst = NULL;
1649                 cork->base.flags &= ~IPCORK_ALLFRAG;
1650         }
1651         memset(&cork->fl, 0, sizeof(cork->fl));
1652 }
1653
1654 struct sk_buff *__ip6_make_skb(struct sock *sk,
1655                                struct sk_buff_head *queue,
1656                                struct inet_cork_full *cork,
1657                                struct inet6_cork *v6_cork)
1658 {
1659         struct sk_buff *skb, *tmp_skb;
1660         struct sk_buff **tail_skb;
1661         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1662         struct ipv6_pinfo *np = inet6_sk(sk);
1663         struct net *net = sock_net(sk);
1664         struct ipv6hdr *hdr;
1665         struct ipv6_txoptions *opt = v6_cork->opt;
1666         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1667         struct flowi6 *fl6 = &cork->fl.u.ip6;
1668         unsigned char proto = fl6->flowi6_proto;
1669
1670         skb = __skb_dequeue(queue);
1671         if (!skb)
1672                 goto out;
1673         tail_skb = &(skb_shinfo(skb)->frag_list);
1674
1675         /* move skb->data to ip header from ext header */
1676         if (skb->data < skb_network_header(skb))
1677                 __skb_pull(skb, skb_network_offset(skb));
1678         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1679                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1680                 *tail_skb = tmp_skb;
1681                 tail_skb = &(tmp_skb->next);
1682                 skb->len += tmp_skb->len;
1683                 skb->data_len += tmp_skb->len;
1684                 skb->truesize += tmp_skb->truesize;
1685                 tmp_skb->destructor = NULL;
1686                 tmp_skb->sk = NULL;
1687         }
1688
1689         /* Allow local fragmentation. */
1690         skb->ignore_df = ip6_sk_ignore_df(sk);
1691
1692         *final_dst = fl6->daddr;
1693         __skb_pull(skb, skb_network_header_len(skb));
1694         if (opt && opt->opt_flen)
1695                 ipv6_push_frag_opts(skb, opt, &proto);
1696         if (opt && opt->opt_nflen)
1697                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1698
1699         skb_push(skb, sizeof(struct ipv6hdr));
1700         skb_reset_network_header(skb);
1701         hdr = ipv6_hdr(skb);
1702
1703         ip6_flow_hdr(hdr, v6_cork->tclass,
1704                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1705                                         np->autoflowlabel, fl6));
1706         hdr->hop_limit = v6_cork->hop_limit;
1707         hdr->nexthdr = proto;
1708         hdr->saddr = fl6->saddr;
1709         hdr->daddr = *final_dst;
1710
1711         skb->priority = sk->sk_priority;
1712         skb->mark = sk->sk_mark;
1713
1714         skb_dst_set(skb, dst_clone(&rt->dst));
1715         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1716         if (proto == IPPROTO_ICMPV6) {
1717                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1718
1719                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1720                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1721         }
1722
1723         ip6_cork_release(cork, v6_cork);
1724 out:
1725         return skb;
1726 }
1727
1728 int ip6_send_skb(struct sk_buff *skb)
1729 {
1730         struct net *net = sock_net(skb->sk);
1731         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1732         int err;
1733
1734         err = ip6_local_out(net, skb->sk, skb);
1735         if (err) {
1736                 if (err > 0)
1737                         err = net_xmit_errno(err);
1738                 if (err)
1739                         IP6_INC_STATS(net, rt->rt6i_idev,
1740                                       IPSTATS_MIB_OUTDISCARDS);
1741         }
1742
1743         return err;
1744 }
1745
/*
 *      Build one skb from the socket's pending data with ip6_finish_skb()
 *      and send it with ip6_send_skb().
 *
 *      Returns 0 when ip6_finish_skb() yields no skb, otherwise the result
 *      of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *pending = ip6_finish_skb(sk);

        return pending ? ip6_send_skb(pending) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1757
1758 static void __ip6_flush_pending_frames(struct sock *sk,
1759                                        struct sk_buff_head *queue,
1760                                        struct inet_cork_full *cork,
1761                                        struct inet6_cork *v6_cork)
1762 {
1763         struct sk_buff *skb;
1764
1765         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1766                 if (skb_dst(skb))
1767                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1768                                       IPSTATS_MIB_OUTDISCARDS);
1769                 kfree_skb(skb);
1770         }
1771
1772         ip6_cork_release(cork, v6_cork);
1773 }
1774
1775 void ip6_flush_pending_frames(struct sock *sk)
1776 {
1777         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1778                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1779 }
1780 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1781
1782 struct sk_buff *ip6_make_skb(struct sock *sk,
1783                              int getfrag(void *from, char *to, int offset,
1784                                          int len, int odd, struct sk_buff *skb),
1785                              void *from, int length, int transhdrlen,
1786                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1787                              struct rt6_info *rt, unsigned int flags,
1788                              const struct sockcm_cookie *sockc)
1789 {
1790         struct inet_cork_full cork;
1791         struct inet6_cork v6_cork;
1792         struct sk_buff_head queue;
1793         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1794         int err;
1795
1796         if (flags & MSG_PROBE)
1797                 return NULL;
1798
1799         __skb_queue_head_init(&queue);
1800
1801         cork.base.flags = 0;
1802         cork.base.addr = 0;
1803         cork.base.opt = NULL;
1804         v6_cork.opt = NULL;
1805         err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1806         if (err)
1807                 return ERR_PTR(err);
1808
1809         if (ipc6->dontfrag < 0)
1810                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1811
1812         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1813                                 &current->task_frag, getfrag, from,
1814                                 length + exthdrlen, transhdrlen + exthdrlen,
1815                                 flags, ipc6, sockc);
1816         if (err) {
1817                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1818                 return ERR_PTR(err);
1819         }
1820
1821         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1822 }