net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
   16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40 #include <linux/slab.h>
  41
  42 #include <linux/bpf-cgroup.h>
  43 #include <linux/netfilter.h>
  44 #include <linux/netfilter_ipv6.h>
  45
  46 #include <net/sock.h>
  47 #include <net/snmp.h>
  48
  49 #include <net/ipv6.h>
  50 #include <net/ndisc.h>
  51 #include <net/protocol.h>
  52 #include <net/ip6_route.h>
  53 #include <net/addrconf.h>
  54 #include <net/rawv6.h>
  55 #include <net/icmp.h>
  56 #include <net/xfrm.h>
  57 #include <net/checksum.h>
  58 #include <linux/mroute6.h>
  59 #include <net/l3mdev.h>
  60 #include <net/lwtunnel.h>
  61
  62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  63 {
  64         struct dst_entry *dst = skb_dst(skb);
  65         struct net_device *dev = dst->dev;
  66         struct neighbour *neigh;
  67         struct in6_addr *nexthop;
  68         int ret;
  69
  70         skb->protocol = htons(ETH_P_IPV6);
  71         skb->dev = dev;
  72
  73         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  74                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  75
  76                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  77                     ((mroute6_socket(net, skb) &&
  78                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
  79                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
  80                                          &ipv6_hdr(skb)->saddr))) {
  81                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
  82
  83                         /* Do not check for IFF_ALLMULTI; multicast routing
  84                            is not supported in any case.
  85                          */
  86                         if (newskb)
  87                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  88                                         net, sk, newskb, NULL, newskb->dev,
  89                                         dev_loopback_xmit);
  90
  91                         if (ipv6_hdr(skb)->hop_limit == 0) {
  92                                 IP6_INC_STATS(net, idev,
  93                                               IPSTATS_MIB_OUTDISCARDS);
  94                                 kfree_skb(skb);
  95                                 return 0;
  96                         }
  97                 }
  98
  99                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
 100
 101                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
 102                     IPV6_ADDR_SCOPE_NODELOCAL &&
 103                     !(dev->flags & IFF_LOOPBACK)) {
 104                         kfree_skb(skb);
 105                         return 0;
 106                 }
 107         }
 108
 109         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 110                 int res = lwtunnel_xmit(skb);
 111
 112                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
 113                         return res;
 114         }
 115
 116         rcu_read_lock_bh();
 117         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 118         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 119         if (unlikely(!neigh))
 120                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 121         if (!IS_ERR(neigh)) {
 122                 ret = dst_neigh_output(dst, neigh, skb);
 123                 rcu_read_unlock_bh();
 124                 return ret;
 125         }
 126         rcu_read_unlock_bh();
 127
 128         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 129         kfree_skb(skb);
 130         return -EINVAL;
 131 }
 132
 133 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 134 {
 135         int ret;
 136
 137         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
 138         if (ret) {
 139                 kfree_skb(skb);
 140                 return ret;
 141         }
 142
 143         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 144             dst_allfrag(skb_dst(skb)) ||
 145             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 146                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
 147         else
 148                 return ip6_finish_output2(net, sk, skb);
 149 }
 150
 151 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 152 {
 153         struct net_device *dev = skb_dst(skb)->dev;
 154         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 155
 156         if (unlikely(idev->cnf.disable_ipv6)) {
 157                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 158                 kfree_skb(skb);
 159                 return 0;
 160         }
 161
 162         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 163                             net, sk, skb, NULL, dev,
 164                             ip6_finish_output,
 165                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 166 }
 167
 168 /*
 169  * xmit an sk_buff (used by TCP, SCTP and DCCP)
 170  * Note : socket lock is not held for SYNACK packets, but might be modified
 171  * by calls to skb_set_owner_w() and ipv6_local_error(),
 172  * which are using proper atomic operations or spinlocks.
 173  */
 174 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 175              __u32 mark, struct ipv6_txoptions *opt, int tclass)
 176 {
 177         struct net *net = sock_net(sk);
 178         const struct ipv6_pinfo *np = inet6_sk(sk);
 179         struct in6_addr *first_hop = &fl6->daddr;
 180         struct dst_entry *dst = skb_dst(skb);
 181         struct ipv6hdr *hdr;
 182         u8  proto = fl6->flowi6_proto;
 183         int seg_len = skb->len;
 184         int hlimit = -1;
 185         u32 mtu;
 186
 187         if (opt) {
 188                 unsigned int head_room;
 189
 190                 /* First: exthdrs may take lots of space (~8K for now)
 191                    MAX_HEADER is not enough.
 192                  */
 193                 head_room = opt->opt_nflen + opt->opt_flen;
 194                 seg_len += head_room;
 195                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 196
 197                 if (skb_headroom(skb) < head_room) {
 198                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 199                         if (!skb2) {
 200                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 201                                               IPSTATS_MIB_OUTDISCARDS);
 202                                 kfree_skb(skb);
 203                                 return -ENOBUFS;
 204                         }
 205                         consume_skb(skb);
 206                         skb = skb2;
 207                         /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
 208                          * it is safe to call in our context (socket lock not held)
 209                          */
 210                         skb_set_owner_w(skb, (struct sock *)sk);
 211                 }
 212                 if (opt->opt_flen)
 213                         ipv6_push_frag_opts(skb, opt, &proto);
 214                 if (opt->opt_nflen)
 215                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 216                                              &fl6->saddr);
 217         }
 218
 219         skb_push(skb, sizeof(struct ipv6hdr));
 220         skb_reset_network_header(skb);
 221         hdr = ipv6_hdr(skb);
 222
 223         /*
 224          *      Fill in the IPv6 header
 225          */
 226         if (np)
 227                 hlimit = np->hop_limit;
 228         if (hlimit < 0)
 229                 hlimit = ip6_dst_hoplimit(dst);
 230
 231         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 232                                                      np->autoflowlabel, fl6));
 233
 234         hdr->payload_len = htons(seg_len);
 235         hdr->nexthdr = proto;
 236         hdr->hop_limit = hlimit;
 237
 238         hdr->saddr = fl6->saddr;
 239         hdr->daddr = *first_hop;
 240
 241         skb->protocol = htons(ETH_P_IPV6);
 242         skb->priority = sk->sk_priority;
 243         skb->mark = mark;
 244
 245         mtu = dst_mtu(dst);
 246         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 247                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 248                               IPSTATS_MIB_OUT, skb->len);
 249
 250                 /* if egress device is enslaved to an L3 master device pass the
 251                  * skb to its handler for processing
 252                  */
 253                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
 254                 if (unlikely(!skb))
 255                         return 0;
 256
 257                 /* hooks should never assume socket lock is held.
 258                  * we promote our socket to non const
 259                  */
 260                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 261                                net, (struct sock *)sk, skb, NULL, dst->dev,
 262                                dst_output);
 263         }
 264
 265         skb->dev = dst->dev;
 266         /* ipv6_local_error() does not require socket lock,
 267          * we promote our socket to non const
 268          */
 269         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 270
 271         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 272         kfree_skb(skb);
 273         return -EMSGSIZE;
 274 }
 275 EXPORT_SYMBOL(ip6_xmit);
 276
 277 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 278 {
 279         struct ip6_ra_chain *ra;
 280         struct sock *last = NULL;
 281
 282         read_lock(&ip6_ra_lock);
 283         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 284                 struct sock *sk = ra->sk;
 285                 if (sk && ra->sel == sel &&
 286                     (!sk->sk_bound_dev_if ||
 287                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 288                         if (last) {
 289                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 290                                 if (skb2)
 291                                         rawv6_rcv(last, skb2);
 292                         }
 293                         last = sk;
 294                 }
 295         }
 296
 297         if (last) {
 298                 rawv6_rcv(last, skb);
 299                 read_unlock(&ip6_ra_lock);
 300                 return 1;
 301         }
 302         read_unlock(&ip6_ra_lock);
 303         return 0;
 304 }
 305
 306 static int ip6_forward_proxy_check(struct sk_buff *skb)
 307 {
 308         struct ipv6hdr *hdr = ipv6_hdr(skb);
 309         u8 nexthdr = hdr->nexthdr;
 310         __be16 frag_off;
 311         int offset;
 312
 313         if (ipv6_ext_hdr(nexthdr)) {
 314                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 315                 if (offset < 0)
 316                         return 0;
 317         } else
 318                 offset = sizeof(struct ipv6hdr);
 319
 320         if (nexthdr == IPPROTO_ICMPV6) {
 321                 struct icmp6hdr *icmp6;
 322
 323                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 324                                          offset + 1 - skb->data)))
 325                         return 0;
 326
 327                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 328
 329                 switch (icmp6->icmp6_type) {
 330                 case NDISC_ROUTER_SOLICITATION:
 331                 case NDISC_ROUTER_ADVERTISEMENT:
 332                 case NDISC_NEIGHBOUR_SOLICITATION:
 333                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 334                 case NDISC_REDIRECT:
 335                         /* For reaction involving unicast neighbor discovery
 336                          * message destined to the proxied address, pass it to
 337                          * input function.
 338                          */
 339                         return 1;
 340                 default:
 341                         break;
 342                 }
 343         }
 344
 345         /*
 346          * The proxying router can't forward traffic sent to a link-local
 347          * address, so signal the sender and discard the packet. This
 348          * behavior is clarified by the MIPv6 specification.
 349          */
 350         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 351                 dst_link_failure(skb);
 352                 return -1;
 353         }
 354
 355         return 0;
 356 }
 357
 358 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 359                                      struct sk_buff *skb)
 360 {
 361         return dst_output(net, sk, skb);
 362 }
 363
 364 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
 365 {
 366         unsigned int mtu;
 367         struct inet6_dev *idev;
 368
 369         if (dst_metric_locked(dst, RTAX_MTU)) {
 370                 mtu = dst_metric_raw(dst, RTAX_MTU);
 371                 if (mtu)
 372                         return mtu;
 373         }
 374
 375         mtu = IPV6_MIN_MTU;
 376         rcu_read_lock();
 377         idev = __in6_dev_get(dst->dev);
 378         if (idev)
 379                 mtu = idev->cnf.mtu6;
 380         rcu_read_unlock();
 381
 382         return mtu;
 383 }
 384
 385 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 386 {
 387         if (skb->len <= mtu)
 388                 return false;
 389
 390         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 391         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 392                 return true;
 393
 394         if (skb->ignore_df)
 395                 return false;
 396
 397         if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
 398                 return false;
 399
 400         return true;
 401 }
 402
 403 int ip6_forward(struct sk_buff *skb)
 404 {
 405         struct dst_entry *dst = skb_dst(skb);
 406         struct ipv6hdr *hdr = ipv6_hdr(skb);
 407         struct inet6_skb_parm *opt = IP6CB(skb);
 408         struct net *net = dev_net(dst->dev);
 409         u32 mtu;
 410
 411         if (net->ipv6.devconf_all->forwarding == 0)
 412                 goto error;
 413
 414         if (skb->pkt_type != PACKET_HOST)
 415                 goto drop;
 416
 417         if (unlikely(skb->sk))
 418                 goto drop;
 419
 420         if (skb_warn_if_lro(skb))
 421                 goto drop;
 422
 423         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 424                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 425                                 IPSTATS_MIB_INDISCARDS);
 426                 goto drop;
 427         }
 428
 429         skb_forward_csum(skb);
 430
 431         /*
 432          *      We DO NOT make any processing on
 433          *      RA packets, pushing them to user level AS IS
 434          *      without ane WARRANTY that application will be able
 435          *      to interpret them. The reason is that we
 436          *      cannot make anything clever here.
 437          *
 438          *      We are not end-node, so that if packet contains
 439          *      AH/ESP, we cannot make anything.
 440          *      Defragmentation also would be mistake, RA packets
 441          *      cannot be fragmented, because there is no warranty
 442          *      that different fragments will go along one path. --ANK
 443          */
 444         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 445                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 446                         return 0;
 447         }
 448
 449         /*
 450          *      check and decrement ttl
 451          */
 452         if (hdr->hop_limit <= 1) {
 453                 /* Force OUTPUT device used as source address */
 454                 skb->dev = dst->dev;
 455                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 456                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 457                                 IPSTATS_MIB_INHDRERRORS);
 458
 459                 kfree_skb(skb);
 460                 return -ETIMEDOUT;
 461         }
 462
 463         /* XXX: idev->cnf.proxy_ndp? */
 464         if (net->ipv6.devconf_all->proxy_ndp &&
 465             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 466                 int proxied = ip6_forward_proxy_check(skb);
 467                 if (proxied > 0)
 468                         return ip6_input(skb);
 469                 else if (proxied < 0) {
 470                         __IP6_INC_STATS(net, ip6_dst_idev(dst),
 471                                         IPSTATS_MIB_INDISCARDS);
 472                         goto drop;
 473                 }
 474         }
 475
 476         if (!xfrm6_route_forward(skb)) {
 477                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 478                                 IPSTATS_MIB_INDISCARDS);
 479                 goto drop;
 480         }
 481         dst = skb_dst(skb);
 482
 483         /* IPv6 specs say nothing about it, but it is clear that we cannot
 484            send redirects to source routed frames.
 485            We don't send redirects to frames decapsulated from IPsec.
 486          */
 487         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
 488                 struct in6_addr *target = NULL;
 489                 struct inet_peer *peer;
 490                 struct rt6_info *rt;
 491
 492                 /*
 493                  *      incoming and outgoing devices are the same
 494                  *      send a redirect.
 495                  */
 496
 497                 rt = (struct rt6_info *) dst;
 498                 if (rt->rt6i_flags & RTF_GATEWAY)
 499                         target = &rt->rt6i_gateway;
 500                 else
 501                         target = &hdr->daddr;
 502
 503                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 504
 505                 /* Limit redirects both by destination (here)
 506                    and by source (inside ndisc_send_redirect)
 507                  */
 508                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 509                         ndisc_send_redirect(skb, target);
 510                 if (peer)
 511                         inet_putpeer(peer);
 512         } else {
 513                 int addrtype = ipv6_addr_type(&hdr->saddr);
 514
 515                 /* This check is security critical. */
 516                 if (addrtype == IPV6_ADDR_ANY ||
 517                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 518                         goto error;
 519                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 520                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 521                                     ICMPV6_NOT_NEIGHBOUR, 0);
 522                         goto error;
 523                 }
 524         }
 525
 526         mtu = ip6_dst_mtu_forward(dst);
 527         if (mtu < IPV6_MIN_MTU)
 528                 mtu = IPV6_MIN_MTU;
 529
 530         if (ip6_pkt_too_big(skb, mtu)) {
 531                 /* Again, force OUTPUT device used as source address */
 532                 skb->dev = dst->dev;
 533                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 534                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 535                                 IPSTATS_MIB_INTOOBIGERRORS);
 536                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 537                                 IPSTATS_MIB_FRAGFAILS);
 538                 kfree_skb(skb);
 539                 return -EMSGSIZE;
 540         }
 541
 542         if (skb_cow(skb, dst->dev->hard_header_len)) {
 543                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 544                                 IPSTATS_MIB_OUTDISCARDS);
 545                 goto drop;
 546         }
 547
 548         hdr = ipv6_hdr(skb);
 549
 550         /* Mangling hops number delayed to point after skb COW */
 551
 552         hdr->hop_limit--;
 553
 554         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 555         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 556         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 557                        net, NULL, skb, skb->dev, dst->dev,
 558                        ip6_forward_finish);
 559
 560 error:
 561         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 562 drop:
 563         kfree_skb(skb);
 564         return -EINVAL;
 565 }
 566
 567 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 568 {
 569         to->pkt_type = from->pkt_type;
 570         to->priority = from->priority;
 571         to->protocol = from->protocol;
 572         skb_dst_drop(to);
 573         skb_dst_set(to, dst_clone(skb_dst(from)));
 574         to->dev = from->dev;
 575         to->mark = from->mark;
 576
 577 #ifdef CONFIG_NET_SCHED
 578         to->tc_index = from->tc_index;
 579 #endif
 580         nf_copy(to, from);
 581         skb_copy_secmark(to, from);
 582 }
 583
 584 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 585                  int (*output)(struct net *, struct sock *, struct sk_buff *))
 586 {
 587         struct sk_buff *frag;
 588         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 589         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 590                                 inet6_sk(skb->sk) : NULL;
 591         struct ipv6hdr *tmp_hdr;
 592         struct frag_hdr *fh;
 593         unsigned int mtu, hlen, left, len;
 594         int hroom, troom;
 595         __be32 frag_id;
 596         int ptr, offset = 0, err = 0;
 597         u8 *prevhdr, nexthdr = 0;
 598
 599         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 600         nexthdr = *prevhdr;
 601
 602         mtu = ip6_skb_dst_mtu(skb);
 603
 604         /* We must not fragment if the socket is set to force MTU discovery
 605          * or if the skb it not generated by a local socket.
 606          */
 607         if (unlikely(!skb->ignore_df && skb->len > mtu))
 608                 goto fail_toobig;
 609
 610         if (IP6CB(skb)->frag_max_size) {
 611                 if (IP6CB(skb)->frag_max_size > mtu)
 612                         goto fail_toobig;
 613
 614                 /* don't send fragments larger than what we received */
 615                 mtu = IP6CB(skb)->frag_max_size;
 616                 if (mtu < IPV6_MIN_MTU)
 617                         mtu = IPV6_MIN_MTU;
 618         }
 619
 620         if (np && np->frag_size < mtu) {
 621                 if (np->frag_size)
 622                         mtu = np->frag_size;
 623         }
 624         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
 625                 goto fail_toobig;
 626         mtu -= hlen + sizeof(struct frag_hdr);
 627
 628         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 629                                     &ipv6_hdr(skb)->saddr);
 630
 631         if (skb->ip_summed == CHECKSUM_PARTIAL &&
 632             (err = skb_checksum_help(skb)))
 633                 goto fail;
 634
 635         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 636         if (skb_has_frag_list(skb)) {
 637                 unsigned int first_len = skb_pagelen(skb);
 638                 struct sk_buff *frag2;
 639
 640                 if (first_len - hlen > mtu ||
 641                     ((first_len - hlen) & 7) ||
 642                     skb_cloned(skb) ||
 643                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 644                         goto slow_path;
 645
 646                 skb_walk_frags(skb, frag) {
 647                         /* Correct geometry. */
 648                         if (frag->len > mtu ||
 649                             ((frag->len & 7) && frag->next) ||
 650                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 651                                 goto slow_path_clean;
 652
 653                         /* Partially cloned skb? */
 654                         if (skb_shared(frag))
 655                                 goto slow_path_clean;
 656
 657                         BUG_ON(frag->sk);
 658                         if (skb->sk) {
 659                                 frag->sk = skb->sk;
 660                                 frag->destructor = sock_wfree;
 661                         }
 662                         skb->truesize -= frag->truesize;
 663                 }
 664
 665                 err = 0;
 666                 offset = 0;
 667                 /* BUILD HEADER */
 668
 669                 *prevhdr = NEXTHDR_FRAGMENT;
 670                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 671                 if (!tmp_hdr) {
 672                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 673                                       IPSTATS_MIB_FRAGFAILS);
 674                         err = -ENOMEM;
 675                         goto fail;
 676                 }
 677                 frag = skb_shinfo(skb)->frag_list;
 678                 skb_frag_list_init(skb);
 679
 680                 __skb_pull(skb, hlen);
 681                 fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
 682                 __skb_push(skb, hlen);
 683                 skb_reset_network_header(skb);
 684                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 685
 686                 fh->nexthdr = nexthdr;
 687                 fh->reserved = 0;
 688                 fh->frag_off = htons(IP6_MF);
 689                 fh->identification = frag_id;
 690
 691                 first_len = skb_pagelen(skb);
 692                 skb->data_len = first_len - skb_headlen(skb);
 693                 skb->len = first_len;
 694                 ipv6_hdr(skb)->payload_len = htons(first_len -
 695                                                    sizeof(struct ipv6hdr));
 696
 697                 dst_hold(&rt->dst);
 698
 699                 for (;;) {
 700                         /* Prepare header of the next frame,
 701                          * before previous one went down. */
 702                         if (frag) {
 703                                 frag->ip_summed = CHECKSUM_NONE;
 704                                 skb_reset_transport_header(frag);
 705                                 fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
 706                                 __skb_push(frag, hlen);
 707                                 skb_reset_network_header(frag);
 708                                 memcpy(skb_network_header(frag), tmp_hdr,
 709                                        hlen);
 710                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 711                                 fh->nexthdr = nexthdr;
 712                                 fh->reserved = 0;
 713                                 fh->frag_off = htons(offset);
 714                                 if (frag->next)
 715                                         fh->frag_off |= htons(IP6_MF);
 716                                 fh->identification = frag_id;
 717                                 ipv6_hdr(frag)->payload_len =
 718                                                 htons(frag->len -
 719                                                       sizeof(struct ipv6hdr));
 720                                 ip6_copy_metadata(frag, skb);
 721                         }
 722
 723                         err = output(net, sk, skb);
 724                         if (!err)
 725                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 726                                               IPSTATS_MIB_FRAGCREATES);
 727
 728                         if (err || !frag)
 729                                 break;
 730
 731                         skb = frag;
 732                         frag = skb->next;
 733                         skb->next = NULL;
 734                 }
 735
 736                 kfree(tmp_hdr);
 737
 738                 if (err == 0) {
 739                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 740                                       IPSTATS_MIB_FRAGOKS);
 741                         ip6_rt_put(rt);
 742                         return 0;
 743                 }
 744
 745                 kfree_skb_list(frag);
 746
 747                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 748                               IPSTATS_MIB_FRAGFAILS);
 749                 ip6_rt_put(rt);
 750                 return err;
 751
 752 slow_path_clean:
 753                 skb_walk_frags(skb, frag2) {
 754                         if (frag2 == frag)
 755                                 break;
 756                         frag2->sk = NULL;
 757                         frag2->destructor = NULL;
 758                         skb->truesize += frag2->truesize;
 759                 }
 760         }
 761
 762 slow_path:
 763         left = skb->len - hlen;         /* Space per frame */
 764         ptr = hlen;                     /* Where to start from */
 765
 766         /*
 767          *      Fragment the datagram.
 768          */
 769
 770         *prevhdr = NEXTHDR_FRAGMENT;
 771         troom = rt->dst.dev->needed_tailroom;
 772
 773         /*
 774          *      Keep copying data until we run out.
 775          */
 776         while (left > 0)        {
 777                 len = left;
 778                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 779                 if (len > mtu)
 780                         len = mtu;
 781                 /* IF: we are not sending up to and including the packet end
 782                    then align the next start on an eight byte boundary */
 783                 if (len < left) {
 784                         len &= ~7;
 785                 }
 786
 787                 /* Allocate buffer */
 788                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
 789                                  hroom + troom, GFP_ATOMIC);
 790                 if (!frag) {
 791                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 792                                       IPSTATS_MIB_FRAGFAILS);
 793                         err = -ENOMEM;
 794                         goto fail;
 795                 }
 796
 797                 /*
 798                  *      Set up data on packet
 799                  */
 800
 801                 ip6_copy_metadata(frag, skb);
 802                 skb_reserve(frag, hroom);
 803                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 804                 skb_reset_network_header(frag);
 805                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 806                 frag->transport_header = (frag->network_header + hlen +
 807                                           sizeof(struct frag_hdr));
 808
 809                 /*
 810                  *      Charge the memory for the fragment to any owner
 811                  *      it might possess
 812                  */
 813                 if (skb->sk)
 814                         skb_set_owner_w(frag, skb->sk);
 815
 816                 /*
 817                  *      Copy the packet header into the new buffer.
 818                  */
 819                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 820
 821                 /*
 822                  *      Build fragment header.
 823                  */
 824                 fh->nexthdr = nexthdr;
 825                 fh->reserved = 0;
 826                 fh->identification = frag_id;
 827
 828                 /*
 829                  *      Copy a block of the IP datagram.
 830                  */
 831                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
 832                                      len));
 833                 left -= len;
 834
 835                 fh->frag_off = htons(offset);
 836                 if (left > 0)
 837                         fh->frag_off |= htons(IP6_MF);
 838                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 839                                                     sizeof(struct ipv6hdr));
 840
 841                 ptr += len;
 842                 offset += len;
 843
 844                 /*
 845                  *      Put this fragment into the sending queue.
 846                  */
 847                 err = output(net, sk, frag);
 848                 if (err)
 849                         goto fail;
 850
 851                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 852                               IPSTATS_MIB_FRAGCREATES);
 853         }
 854         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 855                       IPSTATS_MIB_FRAGOKS);
 856         consume_skb(skb);
 857         return err;
 858
 859 fail_toobig:
 860         if (skb->sk && dst_allfrag(skb_dst(skb)))
 861                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 862
 863         skb->dev = skb_dst(skb)->dev;
 864         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 865         err = -EMSGSIZE;
 866
 867 fail:
 868         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 869                       IPSTATS_MIB_FRAGFAILS);
 870         kfree_skb(skb);
 871         return err;
 872 }
 873
 874 static inline int ip6_rt_check(const struct rt6key *rt_key,
 875                                const struct in6_addr *fl_addr,
 876                                const struct in6_addr *addr_cache)
 877 {
 878         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 879                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
 880 }
 881
 882 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 883                                           struct dst_entry *dst,
 884                                           const struct flowi6 *fl6)
 885 {
 886         struct ipv6_pinfo *np = inet6_sk(sk);
 887         struct rt6_info *rt;
 888
 889         if (!dst)
 890                 goto out;
 891
 892         if (dst->ops->family != AF_INET6) {
 893                 dst_release(dst);
 894                 return NULL;
 895         }
 896
 897         rt = (struct rt6_info *)dst;
 898         /* Yes, checking route validity in not connected
 899          * case is not very simple. Take into account,
 900          * that we do not support routing by source, TOS,
 901          * and MSG_DONTROUTE            --ANK (980726)
 902          *
 903          * 1. ip6_rt_check(): If route was host route,
 904          *    check that cached destination is current.
 905          *    If it is network route, we still may
 906          *    check its validity using saved pointer
 907          *    to the last used address: daddr_cache.
 908          *    We do not want to save whole address now,
 909          *    (because main consumer of this service
 910          *    is tcp, which has not this problem),
 911          *    so that the last trick works only on connected
 912          *    sockets.
 913          * 2. oif also should be the same.
 914          */
 915         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
 916 #ifdef CONFIG_IPV6_SUBTREES
 917             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 918 #endif
 919            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
 920               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
 921                 dst_release(dst);
 922                 dst = NULL;
 923         }
 924
 925 out:
 926         return dst;
 927 }
 928
 929 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
 930                                struct dst_entry **dst, struct flowi6 *fl6)
 931 {
 932 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 933         struct neighbour *n;
 934         struct rt6_info *rt;
 935 #endif
 936         int err;
 937         int flags = 0;
 938
 939         /* The correct way to handle this would be to do
 940          * ip6_route_get_saddr, and then ip6_route_output; however,
 941          * the route-specific preferred source forces the
 942          * ip6_route_output call _before_ ip6_route_get_saddr.
 943          *
 944          * In source specific routing (no src=any default route),
 945          * ip6_route_output will fail given src=any saddr, though, so
 946          * that's why we try it again later.
 947          */
 948         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
 949                 struct rt6_info *rt;
 950                 bool had_dst = *dst != NULL;
 951
 952                 if (!had_dst)
 953                         *dst = ip6_route_output(net, sk, fl6);
 954                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
 955                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
 956                                           sk ? inet6_sk(sk)->srcprefs : 0,
 957                                           &fl6->saddr);
 958                 if (err)
 959                         goto out_err_release;
 960
 961                 /* If we had an erroneous initial result, pretend it
 962                  * never existed and let the SA-enabled version take
 963                  * over.
 964                  */
 965                 if (!had_dst && (*dst)->error) {
 966                         dst_release(*dst);
 967                         *dst = NULL;
 968                 }
 969
 970                 if (fl6->flowi6_oif)
 971                         flags |= RT6_LOOKUP_F_IFACE;
 972         }
 973
 974         if (!*dst)
 975                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
 976
 977         err = (*dst)->error;
 978         if (err)
 979                 goto out_err_release;
 980
 981 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 982         /*
 983          * Here if the dst entry we've looked up
 984          * has a neighbour entry that is in the INCOMPLETE
 985          * state and the src address from the flow is
 986          * marked as OPTIMISTIC, we release the found
 987          * dst entry and replace it instead with the
 988          * dst entry of the nexthop router
 989          */
 990         rt = (struct rt6_info *) *dst;
 991         rcu_read_lock_bh();
 992         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
 993                                       rt6_nexthop(rt, &fl6->daddr));
 994         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
 995         rcu_read_unlock_bh();
 996
 997         if (err) {
 998                 struct inet6_ifaddr *ifp;
 999                 struct flowi6 fl_gw6;
1000                 int redirect;
1001
1002                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1003                                       (*dst)->dev, 1);
1004
1005                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1006                 if (ifp)
1007                         in6_ifa_put(ifp);
1008
1009                 if (redirect) {
1010                         /*
1011                          * We need to get the dst entry for the
1012                          * default router instead
1013                          */
1014                         dst_release(*dst);
1015                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1016                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1017                         *dst = ip6_route_output(net, sk, &fl_gw6);
1018                         err = (*dst)->error;
1019                         if (err)
1020                                 goto out_err_release;
1021                 }
1022         }
1023 #endif
1024
1025         return 0;
1026
1027 out_err_release:
1028         dst_release(*dst);
1029         *dst = NULL;
1030
1031         if (err == -ENETUNREACH)
1032                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1033         return err;
1034 }
1035
1036 /**
1037  *      ip6_dst_lookup - perform route lookup on flow
1038  *      @sk: socket which provides route info
1039  *      @dst: pointer to dst_entry * for result
1040  *      @fl6: flow to lookup
1041  *
1042  *      This function performs a route lookup on the given flow.
1043  *
1044  *      It returns zero on success, or a standard errno code on error.
1045  */
1046 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1047                    struct flowi6 *fl6)
1048 {
1049         *dst = NULL;
1050         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1051 }
1052 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1053
1054 /**
1055  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1056  *      @sk: socket which provides route info
1057  *      @fl6: flow to lookup
1058  *      @final_dst: final destination address for ipsec lookup
1059  *
1060  *      This function performs a route lookup on the given flow.
1061  *
1062  *      It returns a valid dst pointer on success, or a pointer encoded
1063  *      error code.
1064  */
1065 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1066                                       const struct in6_addr *final_dst)
1067 {
1068         struct dst_entry *dst = NULL;
1069         int err;
1070
1071         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1072         if (err)
1073                 return ERR_PTR(err);
1074         if (final_dst)
1075                 fl6->daddr = *final_dst;
1076
1077         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1078 }
1079 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1080
1081 /**
1082  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1083  *      @sk: socket which provides the dst cache and route info
1084  *      @fl6: flow to lookup
1085  *      @final_dst: final destination address for ipsec lookup
1086  *
1087  *      This function performs a route lookup on the given flow with the
1088  *      possibility of using the cached route in the socket if it is valid.
1089  *      It will take the socket dst lock when operating on the dst cache.
1090  *      As a result, this function can only be used in process context.
1091  *
1092  *      It returns a valid dst pointer on success, or a pointer encoded
1093  *      error code.
1094  */
1095 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1096                                          const struct in6_addr *final_dst)
1097 {
1098         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1099
1100         dst = ip6_sk_dst_check(sk, dst, fl6);
1101         if (!dst)
1102                 dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1103
1104         return dst;
1105 }
1106 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1107
1108 static inline int ip6_ufo_append_data(struct sock *sk,
1109                         struct sk_buff_head *queue,
1110                         int getfrag(void *from, char *to, int offset, int len,
1111                         int odd, struct sk_buff *skb),
1112                         void *from, int length, int hh_len, int fragheaderlen,
1113                         int exthdrlen, int transhdrlen, int mtu,
1114                         unsigned int flags, const struct flowi6 *fl6)
1115
1116 {
1117         struct sk_buff *skb;
1118         int err;
1119
1120         /* There is support for UDP large send offload by network
1121          * device, so create one single skb packet containing complete
1122          * udp datagram
1123          */
1124         skb = skb_peek_tail(queue);
1125         if (!skb) {
1126                 skb = sock_alloc_send_skb(sk,
1127                         hh_len + fragheaderlen + transhdrlen + 20,
1128                         (flags & MSG_DONTWAIT), &err);
1129                 if (!skb)
1130                         return err;
1131
1132                 /* reserve space for Hardware header */
1133                 skb_reserve(skb, hh_len);
1134
1135                 /* create space for UDP/IP header */
1136                 skb_put(skb, fragheaderlen + transhdrlen);
1137
1138                 /* initialize network header pointer */
1139                 skb_set_network_header(skb, exthdrlen);
1140
1141                 /* initialize protocol header pointer */
1142                 skb->transport_header = skb->network_header + fragheaderlen;
1143
1144                 skb->protocol = htons(ETH_P_IPV6);
1145                 skb->csum = 0;
1146
1147                 __skb_queue_tail(queue, skb);
1148         } else if (skb_is_gso(skb)) {
1149                 goto append;
1150         }
1151
1152         skb->ip_summed = CHECKSUM_PARTIAL;
1153         /* Specify the length of each IPv6 datagram fragment.
1154          * It has to be a multiple of 8.
1155          */
1156         skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1157                                      sizeof(struct frag_hdr)) & ~7;
1158         skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1159         skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
1160                                                          &fl6->daddr,
1161                                                          &fl6->saddr);
1162
1163 append:
1164         return skb_append_datato_frags(sk, skb, getfrag, from,
1165                                        (length - transhdrlen));
1166 }
1167
1168 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1169                                                gfp_t gfp)
1170 {
1171         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1172 }
1173
1174 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1175                                                 gfp_t gfp)
1176 {
1177         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1178 }
1179
1180 static void ip6_append_data_mtu(unsigned int *mtu,
1181                                 int *maxfraglen,
1182                                 unsigned int fragheaderlen,
1183                                 struct sk_buff *skb,
1184                                 struct rt6_info *rt,
1185                                 unsigned int orig_mtu)
1186 {
1187         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1188                 if (!skb) {
1189                         /* first fragment, reserve header_len */
1190                         *mtu = orig_mtu - rt->dst.header_len;
1191
1192                 } else {
1193                         /*
1194                          * this fragment is not first, the headers
1195                          * space is regarded as data space.
1196                          */
1197                         *mtu = orig_mtu;
1198                 }
1199                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1200                               + fragheaderlen - sizeof(struct frag_hdr);
1201         }
1202 }
1203
1204 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1205                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1206                           struct rt6_info *rt, struct flowi6 *fl6)
1207 {
1208         struct ipv6_pinfo *np = inet6_sk(sk);
1209         unsigned int mtu;
1210         struct ipv6_txoptions *opt = ipc6->opt;
1211
1212         /*
1213          * setup for corking
1214          */
1215         if (opt) {
1216                 if (WARN_ON(v6_cork->opt))
1217                         return -EINVAL;
1218
1219                 v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1220                 if (unlikely(!v6_cork->opt))
1221                         return -ENOBUFS;
1222
1223                 v6_cork->opt->tot_len = opt->tot_len;
1224                 v6_cork->opt->opt_flen = opt->opt_flen;
1225                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1226
1227                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1228                                                     sk->sk_allocation);
1229                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1230                         return -ENOBUFS;
1231
1232                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1233                                                     sk->sk_allocation);
1234                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1235                         return -ENOBUFS;
1236
1237                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1238                                                    sk->sk_allocation);
1239                 if (opt->hopopt && !v6_cork->opt->hopopt)
1240                         return -ENOBUFS;
1241
1242                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1243                                                     sk->sk_allocation);
1244                 if (opt->srcrt && !v6_cork->opt->srcrt)
1245                         return -ENOBUFS;
1246
1247                 /* need source address above miyazawa*/
1248         }
1249         dst_hold(&rt->dst);
1250         cork->base.dst = &rt->dst;
1251         cork->fl.u.ip6 = *fl6;
1252         v6_cork->hop_limit = ipc6->hlimit;
1253         v6_cork->tclass = ipc6->tclass;
1254         if (rt->dst.flags & DST_XFRM_TUNNEL)
1255                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1256                       rt->dst.dev->mtu : dst_mtu(&rt->dst);
1257         else
1258                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1259                       rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1260         if (np->frag_size < mtu) {
1261                 if (np->frag_size)
1262                         mtu = np->frag_size;
1263         }
1264         cork->base.fragsize = mtu;
1265         if (dst_allfrag(rt->dst.path))
1266                 cork->base.flags |= IPCORK_ALLFRAG;
1267         cork->base.length = 0;
1268
1269         return 0;
1270 }
1271
1272 static int __ip6_append_data(struct sock *sk,
1273                              struct flowi6 *fl6,
1274                              struct sk_buff_head *queue,
1275                              struct inet_cork *cork,
1276                              struct inet6_cork *v6_cork,
1277                              struct page_frag *pfrag,
1278                              int getfrag(void *from, char *to, int offset,
1279                                          int len, int odd, struct sk_buff *skb),
1280                              void *from, int length, int transhdrlen,
1281                              unsigned int flags, struct ipcm6_cookie *ipc6,
1282                              const struct sockcm_cookie *sockc)
1283 {
1284         struct sk_buff *skb, *skb_prev = NULL;
1285         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1286         int exthdrlen = 0;
1287         int dst_exthdrlen = 0;
1288         int hh_len;
1289         int copy;
1290         int err;
1291         int offset = 0;
1292         __u8 tx_flags = 0;
1293         u32 tskey = 0;
1294         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1295         struct ipv6_txoptions *opt = v6_cork->opt;
1296         int csummode = CHECKSUM_NONE;
1297         unsigned int maxnonfragsize, headersize;
1298
1299         skb = skb_peek_tail(queue);
1300         if (!skb) {
1301                 exthdrlen = opt ? opt->opt_flen : 0;
1302                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1303         }
1304
1305         mtu = cork->fragsize;
1306         orig_mtu = mtu;
1307
1308         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1309
1310         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1311                         (opt ? opt->opt_nflen : 0);
1312         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1313                      sizeof(struct frag_hdr);
1314
1315         headersize = sizeof(struct ipv6hdr) +
1316                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1317                      (dst_allfrag(&rt->dst) ?
1318                       sizeof(struct frag_hdr) : 0) +
1319                      rt->rt6i_nfheader_len;
1320
1321         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1322             (sk->sk_protocol == IPPROTO_UDP ||
1323              sk->sk_protocol == IPPROTO_RAW)) {
1324                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1325                                 sizeof(struct ipv6hdr));
1326                 goto emsgsize;
1327         }
1328
1329         if (ip6_sk_ignore_df(sk))
1330                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1331         else
1332                 maxnonfragsize = mtu;
1333
1334         if (cork->length + length > maxnonfragsize - headersize) {
1335 emsgsize:
1336                 ipv6_local_error(sk, EMSGSIZE, fl6,
1337                                  mtu - headersize +
1338                                  sizeof(struct ipv6hdr));
1339                 return -EMSGSIZE;
1340         }
1341
1342         /* CHECKSUM_PARTIAL only with no extension headers and when
1343          * we are not going to fragment
1344          */
1345         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1346             headersize == sizeof(struct ipv6hdr) &&
1347             length <= mtu - headersize &&
1348             !(flags & MSG_MORE) &&
1349             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1350                 csummode = CHECKSUM_PARTIAL;
1351
1352         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1353                 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1354                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1355                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1356                         tskey = sk->sk_tskey++;
1357         }
1358
1359         /*
1360          * Let's try using as much space as possible.
1361          * Use MTU if total length of the message fits into the MTU.
1362          * Otherwise, we need to reserve fragment header and
1363          * fragment alignment (= 8-15 octects, in total).
1364          *
1365          * Note that we may need to "move" the data from the tail of
1366          * of the buffer to the new fragment when we split
1367          * the message.
1368          *
1369          * FIXME: It may be fragmented into multiple chunks
1370          *        at once if non-fragmentable extension headers
1371          *        are too large.
1372          * --yoshfuji
1373          */
1374
1375         cork->length += length;
1376         if ((((length + fragheaderlen) > mtu) ||
1377              (skb && skb_is_gso(skb))) &&
1378             (sk->sk_protocol == IPPROTO_UDP) &&
1379             (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
1380             (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
1381                 err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1382                                           hh_len, fragheaderlen, exthdrlen,
1383                                           transhdrlen, mtu, flags, fl6);
1384                 if (err)
1385                         goto error;
1386                 return 0;
1387         }
1388
1389         if (!skb)
1390                 goto alloc_new_skb;
1391
1392         while (length > 0) {
1393                 /* Check if the remaining data fits into current packet. */
1394                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1395                 if (copy < length)
1396                         copy = maxfraglen - skb->len;
1397
1398                 if (copy <= 0) {
1399                         char *data;
1400                         unsigned int datalen;
1401                         unsigned int fraglen;
1402                         unsigned int fraggap;
1403                         unsigned int alloclen;
1404 alloc_new_skb:
1405                         /* There's no room in the current skb */
1406                         if (skb)
1407                                 fraggap = skb->len - maxfraglen;
1408                         else
1409                                 fraggap = 0;
1410                         /* update mtu and maxfraglen if necessary */
1411                         if (!skb || !skb_prev)
1412                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1413                                                     fragheaderlen, skb, rt,
1414                                                     orig_mtu);
1415
1416                         skb_prev = skb;
1417
1418                         /*
1419                          * If remaining data exceeds the mtu,
1420                          * we know we need more fragment(s).
1421                          */
1422                         datalen = length + fraggap;
1423
1424                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1425                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1426                         if ((flags & MSG_MORE) &&
1427                             !(rt->dst.dev->features&NETIF_F_SG))
1428                                 alloclen = mtu;
1429                         else
1430                                 alloclen = datalen + fragheaderlen;
1431
1432                         alloclen += dst_exthdrlen;
1433
1434                         if (datalen != length + fraggap) {
1435                                 /*
1436                                  * this is not the last fragment, the trailer
1437                                  * space is regarded as data space.
1438                                  */
1439                                 datalen += rt->dst.trailer_len;
1440                         }
1441
1442                         alloclen += rt->dst.trailer_len;
1443                         fraglen = datalen + fragheaderlen;
1444
1445                         /*
1446                          * We just reserve space for fragment header.
1447                          * Note: this may be overallocation if the message
1448                          * (without MSG_MORE) fits into the MTU.
1449                          */
1450                         alloclen += sizeof(struct frag_hdr);
1451
1452                         if (transhdrlen) {
1453                                 skb = sock_alloc_send_skb(sk,
1454                                                 alloclen + hh_len,
1455                                                 (flags & MSG_DONTWAIT), &err);
1456                         } else {
1457                                 skb = NULL;
1458                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1459                                     2 * sk->sk_sndbuf)
1460                                         skb = sock_wmalloc(sk,
1461                                                            alloclen + hh_len, 1,
1462                                                            sk->sk_allocation);
1463                                 if (unlikely(!skb))
1464                                         err = -ENOBUFS;
1465                         }
1466                         if (!skb)
1467                                 goto error;
1468                         /*
1469                          *      Fill in the control structures
1470                          */
1471                         skb->protocol = htons(ETH_P_IPV6);
1472                         skb->ip_summed = csummode;
1473                         skb->csum = 0;
1474                         /* reserve for fragmentation and ipsec header */
1475                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1476                                     dst_exthdrlen);
1477
1478                         /* Only the initial fragment is time stamped */
1479                         skb_shinfo(skb)->tx_flags = tx_flags;
1480                         tx_flags = 0;
1481                         skb_shinfo(skb)->tskey = tskey;
1482                         tskey = 0;
1483
1484                         /*
1485                          *      Find where to start putting bytes
1486                          */
1487                         data = skb_put(skb, fraglen);
1488                         skb_set_network_header(skb, exthdrlen);
1489                         data += fragheaderlen;
1490                         skb->transport_header = (skb->network_header +
1491                                                  fragheaderlen);
1492                         if (fraggap) {
1493                                 skb->csum = skb_copy_and_csum_bits(
1494                                         skb_prev, maxfraglen,
1495                                         data + transhdrlen, fraggap, 0);
1496                                 skb_prev->csum = csum_sub(skb_prev->csum,
1497                                                           skb->csum);
1498                                 data += fraggap;
1499                                 pskb_trim_unique(skb_prev, maxfraglen);
1500                         }
1501                         copy = datalen - transhdrlen - fraggap;
1502
1503                         if (copy < 0) {
1504                                 err = -EINVAL;
1505                                 kfree_skb(skb);
1506                                 goto error;
1507                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1508                                 err = -EFAULT;
1509                                 kfree_skb(skb);
1510                                 goto error;
1511                         }
1512
1513                         offset += copy;
1514                         length -= datalen - fraggap;
1515                         transhdrlen = 0;
1516                         exthdrlen = 0;
1517                         dst_exthdrlen = 0;
1518
1519                         /*
1520                          * Put the packet on the pending queue
1521                          */
1522                         __skb_queue_tail(queue, skb);
1523                         continue;
1524                 }
1525
1526                 if (copy > length)
1527                         copy = length;
1528
1529                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1530                         unsigned int off;
1531
1532                         off = skb->len;
1533                         if (getfrag(from, skb_put(skb, copy),
1534                                                 offset, copy, off, skb) < 0) {
1535                                 __skb_trim(skb, off);
1536                                 err = -EFAULT;
1537                                 goto error;
1538                         }
1539                 } else {
1540                         int i = skb_shinfo(skb)->nr_frags;
1541
1542                         err = -ENOMEM;
1543                         if (!sk_page_frag_refill(sk, pfrag))
1544                                 goto error;
1545
1546                         if (!skb_can_coalesce(skb, i, pfrag->page,
1547                                               pfrag->offset)) {
1548                                 err = -EMSGSIZE;
1549                                 if (i == MAX_SKB_FRAGS)
1550                                         goto error;
1551
1552                                 __skb_fill_page_desc(skb, i, pfrag->page,
1553                                                      pfrag->offset, 0);
1554                                 skb_shinfo(skb)->nr_frags = ++i;
1555                                 get_page(pfrag->page);
1556                         }
1557                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1558                         if (getfrag(from,
1559                                     page_address(pfrag->page) + pfrag->offset,
1560                                     offset, copy, skb->len, skb) < 0)
1561                                 goto error_efault;
1562
1563                         pfrag->offset += copy;
1564                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1565                         skb->len += copy;
1566                         skb->data_len += copy;
1567                         skb->truesize += copy;
1568                         atomic_add(copy, &sk->sk_wmem_alloc);
1569                 }
1570                 offset += copy;
1571                 length -= copy;
1572         }
1573
1574         return 0;
1575
1576 error_efault:
1577         err = -EFAULT;
1578 error:
1579         cork->length -= length;
1580         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1581         return err;
1582 }
1583
1584 int ip6_append_data(struct sock *sk,
1585                     int getfrag(void *from, char *to, int offset, int len,
1586                                 int odd, struct sk_buff *skb),
1587                     void *from, int length, int transhdrlen,
1588                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1589                     struct rt6_info *rt, unsigned int flags,
1590                     const struct sockcm_cookie *sockc)
1591 {
1592         struct inet_sock *inet = inet_sk(sk);
1593         struct ipv6_pinfo *np = inet6_sk(sk);
1594         int exthdrlen;
1595         int err;
1596
1597         if (flags&MSG_PROBE)
1598                 return 0;
1599         if (skb_queue_empty(&sk->sk_write_queue)) {
1600                 /*
1601                  * setup for corking
1602                  */
1603                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1604                                      ipc6, rt, fl6);
1605                 if (err)
1606                         return err;
1607
1608                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1609                 length += exthdrlen;
1610                 transhdrlen += exthdrlen;
1611         } else {
1612                 fl6 = &inet->cork.fl.u.ip6;
1613                 transhdrlen = 0;
1614         }
1615
1616         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1617                                  &np->cork, sk_page_frag(sk), getfrag,
1618                                  from, length, transhdrlen, flags, ipc6, sockc);
1619 }
1620 EXPORT_SYMBOL_GPL(ip6_append_data);
1621
1622 static void ip6_cork_release(struct inet_cork_full *cork,
1623                              struct inet6_cork *v6_cork)
1624 {
1625         if (v6_cork->opt) {
1626                 kfree(v6_cork->opt->dst0opt);
1627                 kfree(v6_cork->opt->dst1opt);
1628                 kfree(v6_cork->opt->hopopt);
1629                 kfree(v6_cork->opt->srcrt);
1630                 kfree(v6_cork->opt);
1631                 v6_cork->opt = NULL;
1632         }
1633
1634         if (cork->base.dst) {
1635                 dst_release(cork->base.dst);
1636                 cork->base.dst = NULL;
1637                 cork->base.flags &= ~IPCORK_ALLFRAG;
1638         }
1639         memset(&cork->fl, 0, sizeof(cork->fl));
1640 }
1641
1642 struct sk_buff *__ip6_make_skb(struct sock *sk,
1643                                struct sk_buff_head *queue,
1644                                struct inet_cork_full *cork,
1645                                struct inet6_cork *v6_cork)
1646 {
1647         struct sk_buff *skb, *tmp_skb;
1648         struct sk_buff **tail_skb;
1649         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1650         struct ipv6_pinfo *np = inet6_sk(sk);
1651         struct net *net = sock_net(sk);
1652         struct ipv6hdr *hdr;
1653         struct ipv6_txoptions *opt = v6_cork->opt;
1654         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1655         struct flowi6 *fl6 = &cork->fl.u.ip6;
1656         unsigned char proto = fl6->flowi6_proto;
1657
1658         skb = __skb_dequeue(queue);
1659         if (!skb)
1660                 goto out;
1661         tail_skb = &(skb_shinfo(skb)->frag_list);
1662
1663         /* move skb->data to ip header from ext header */
1664         if (skb->data < skb_network_header(skb))
1665                 __skb_pull(skb, skb_network_offset(skb));
1666         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1667                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1668                 *tail_skb = tmp_skb;
1669                 tail_skb = &(tmp_skb->next);
1670                 skb->len += tmp_skb->len;
1671                 skb->data_len += tmp_skb->len;
1672                 skb->truesize += tmp_skb->truesize;
1673                 tmp_skb->destructor = NULL;
1674                 tmp_skb->sk = NULL;
1675         }
1676
1677         /* Allow local fragmentation. */
1678         skb->ignore_df = ip6_sk_ignore_df(sk);
1679
1680         *final_dst = fl6->daddr;
1681         __skb_pull(skb, skb_network_header_len(skb));
1682         if (opt && opt->opt_flen)
1683                 ipv6_push_frag_opts(skb, opt, &proto);
1684         if (opt && opt->opt_nflen)
1685                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1686
1687         skb_push(skb, sizeof(struct ipv6hdr));
1688         skb_reset_network_header(skb);
1689         hdr = ipv6_hdr(skb);
1690
1691         ip6_flow_hdr(hdr, v6_cork->tclass,
1692                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1693                                         np->autoflowlabel, fl6));
1694         hdr->hop_limit = v6_cork->hop_limit;
1695         hdr->nexthdr = proto;
1696         hdr->saddr = fl6->saddr;
1697         hdr->daddr = *final_dst;
1698
1699         skb->priority = sk->sk_priority;
1700         skb->mark = sk->sk_mark;
1701
1702         skb_dst_set(skb, dst_clone(&rt->dst));
1703         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1704         if (proto == IPPROTO_ICMPV6) {
1705                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1706
1707                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1708                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1709         }
1710
1711         ip6_cork_release(cork, v6_cork);
1712 out:
1713         return skb;
1714 }
1715
1716 int ip6_send_skb(struct sk_buff *skb)
1717 {
1718         struct net *net = sock_net(skb->sk);
1719         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1720         int err;
1721
1722         err = ip6_local_out(net, skb->sk, skb);
1723         if (err) {
1724                 if (err > 0)
1725                         err = net_xmit_errno(err);
1726                 if (err)
1727                         IP6_INC_STATS(net, rt->rt6i_idev,
1728                                       IPSTATS_MIB_OUTDISCARDS);
1729         }
1730
1731         return err;
1732 }
1733
/*
 *	ip6_push_pending_frames - build and transmit the socket's pending data
 *
 *	Obtains the finished packet from ip6_finish_skb() and sends it with
 *	ip6_send_skb().  Returns 0 when there was nothing to send, otherwise
 *	the result of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *pkt = ip6_finish_skb(sk);

	return pkt ? ip6_send_skb(pkt) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1745
1746 static void __ip6_flush_pending_frames(struct sock *sk,
1747                                        struct sk_buff_head *queue,
1748                                        struct inet_cork_full *cork,
1749                                        struct inet6_cork *v6_cork)
1750 {
1751         struct sk_buff *skb;
1752
1753         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1754                 if (skb_dst(skb))
1755                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1756                                       IPSTATS_MIB_OUTDISCARDS);
1757                 kfree_skb(skb);
1758         }
1759
1760         ip6_cork_release(cork, v6_cork);
1761 }
1762
1763 void ip6_flush_pending_frames(struct sock *sk)
1764 {
1765         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1766                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1767 }
1768 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1769
1770 struct sk_buff *ip6_make_skb(struct sock *sk,
1771                              int getfrag(void *from, char *to, int offset,
1772                                          int len, int odd, struct sk_buff *skb),
1773                              void *from, int length, int transhdrlen,
1774                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1775                              struct rt6_info *rt, unsigned int flags,
1776                              const struct sockcm_cookie *sockc)
1777 {
1778         struct inet_cork_full cork;
1779         struct inet6_cork v6_cork;
1780         struct sk_buff_head queue;
1781         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1782         int err;
1783
1784         if (flags & MSG_PROBE)
1785                 return NULL;
1786
1787         __skb_queue_head_init(&queue);
1788
1789         cork.base.flags = 0;
1790         cork.base.addr = 0;
1791         cork.base.opt = NULL;
1792         v6_cork.opt = NULL;
1793         err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1794         if (err)
1795                 return ERR_PTR(err);
1796
1797         if (ipc6->dontfrag < 0)
1798                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1799
1800         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1801                                 &current->task_frag, getfrag, from,
1802                                 length + exthdrlen, transhdrlen + exthdrlen,
1803                                 flags, ipc6, sockc);
1804         if (err) {
1805                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1806                 return ERR_PTR(err);
1807         }
1808
1809         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1810 }