1 /*
2 * IPv6 output functions
3 * Linux INET6 implementation
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * Based on linux/net/ipv4/ip_output.c
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * Changes:
16 * A.N.Kuznetsov : arithmetic in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
20 * etc.
21 *
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
26 * for datagram xmit
27 */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63 int len;
64
65 len = skb->len - sizeof(struct ipv6hdr);
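/* A payload larger than IPV6_MAXPLEN (65535) cannot be expressed in the
 * 16-bit payload_len field; leave it as 0, which is presumably the
 * jumbogram convention (the real length then travels in a Jumbo Payload
 * hop-by-hop option added elsewhere).
 */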
66 if (len > IPV6_MAXPLEN)
67 len = 0;
68 ipv6_hdr(skb)->payload_len = htons(len);
69
70 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 skb_dst(skb)->dev, dst_output);
72 }
73
74 int ip6_local_out(struct sk_buff *skb)
75 {
76 int err;
77
78 err = __ip6_local_out(skb);
79 if (likely(err == 1))
80 err = dst_output(skb);
81
82 return err;
83 }
84 EXPORT_SYMBOL_GPL(ip6_local_out);
85
86 static int ip6_finish_output2(struct sk_buff *skb)
87 {
88 struct dst_entry *dst = skb_dst(skb);
89 struct net_device *dev = dst->dev;
90 struct neighbour *neigh;
91
92 skb->protocol = htons(ETH_P_IPV6);
93 skb->dev = dev;
94
95 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
96 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
97
98 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
99 ((mroute6_socket(dev_net(dev), skb) &&
100 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
101 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
102 &ipv6_hdr(skb)->saddr))) {
103 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
104
105 /* Do not check for IFF_ALLMULTI; multicast routing
106 is not supported in any case.
107 */
108 if (newskb)
109 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
110 newskb, NULL, newskb->dev,
111 dev_loopback_xmit);
112
113 if (ipv6_hdr(skb)->hop_limit == 0) {
114 IP6_INC_STATS(dev_net(dev), idev,
115 IPSTATS_MIB_OUTDISCARDS);
116 kfree_skb(skb);
117 return 0;
118 }
119 }
120
121 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
122 skb->len);
123 }
124
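/* Hand the packet to the cached neighbour entry for this dst.
 * dst_neigh_output() is expected to take care of the (delayed)
 * neighbour reachability confirmation and of queueing the frame on
 * the device; without a neighbour entry we have nowhere to send it.
 */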
125 rcu_read_lock();
126 neigh = dst_get_neighbour_noref(dst);
127 if (neigh) {
128 int res = dst_neigh_output(dst, neigh, skb);
129
130 rcu_read_unlock();
131 return res;
132 }
133 rcu_read_unlock();
134 IP6_INC_STATS_BH(dev_net(dst->dev),
135 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
136 kfree_skb(skb);
137 return -EINVAL;
138 }
139
140 static int ip6_finish_output(struct sk_buff *skb)
141 {
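/* Fragment when the packet exceeds the path MTU and is not GSO
 * (GSO packets are segmented later), or when dst_allfrag() is set,
 * i.e. (roughly) the reported path MTU was below IPV6_MIN_MTU, so
 * every packet to this destination carries a fragment header.
 */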
142 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
143 dst_allfrag(skb_dst(skb)))
144 return ip6_fragment(skb, ip6_finish_output2);
145 else
146 return ip6_finish_output2(skb);
147 }
148
149 int ip6_output(struct sk_buff *skb)
150 {
151 struct net_device *dev = skb_dst(skb)->dev;
152 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
153 if (unlikely(idev->cnf.disable_ipv6)) {
154 IP6_INC_STATS(dev_net(dev), idev,
155 IPSTATS_MIB_OUTDISCARDS);
156 kfree_skb(skb);
157 return 0;
158 }
159
160 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
161 ip6_finish_output,
162 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
163 }
164
165 /*
166 * xmit an sk_buff (used by TCP, SCTP and DCCP)
167 */
168
169 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
170 struct ipv6_txoptions *opt, int tclass)
171 {
172 struct net *net = sock_net(sk);
173 struct ipv6_pinfo *np = inet6_sk(sk);
174 struct in6_addr *first_hop = &fl6->daddr;
175 struct dst_entry *dst = skb_dst(skb);
176 struct ipv6hdr *hdr;
177 u8 proto = fl6->flowi6_proto;
178 int seg_len = skb->len;
179 int hlimit = -1;
180 u32 mtu;
181
182 if (opt) {
183 unsigned int head_room;
184
185 /* First: exthdrs may take lots of space (~8K for now)
186 MAX_HEADER is not enough.
187 */
188 head_room = opt->opt_nflen + opt->opt_flen;
189 seg_len += head_room;
190 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
191
192 if (skb_headroom(skb) < head_room) {
193 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
194 if (skb2 == NULL) {
195 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
196 IPSTATS_MIB_OUTDISCARDS);
197 kfree_skb(skb);
198 return -ENOBUFS;
199 }
200 consume_skb(skb);
201 skb = skb2;
202 skb_set_owner_w(skb, sk);
203 }
204 if (opt->opt_flen)
205 ipv6_push_frag_opts(skb, opt, &proto);
206 if (opt->opt_nflen)
207 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
208 }
209
210 skb_push(skb, sizeof(struct ipv6hdr));
211 skb_reset_network_header(skb);
212 hdr = ipv6_hdr(skb);
213
214 /*
215 * Fill in the IPv6 header
216 */
217 if (np)
218 hlimit = np->hop_limit;
219 if (hlimit < 0)
220 hlimit = ip6_dst_hoplimit(dst);
221
222 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
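/* First 32 bits of the IPv6 header on the wire:
 * version (4 bits) = 6, traffic class (8 bits), flow label (20 bits).
 * E.g. tclass 0 and flow label 0 yield 0x60000000; fl6->flowlabel is
 * already in network byte order, so it is OR-ed in after htonl().
 */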
223
224 hdr->payload_len = htons(seg_len);
225 hdr->nexthdr = proto;
226 hdr->hop_limit = hlimit;
227
228 hdr->saddr = fl6->saddr;
229 hdr->daddr = *first_hop;
230
231 skb->priority = sk->sk_priority;
232 skb->mark = sk->sk_mark;
233
234 mtu = dst_mtu(dst);
235 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
236 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
237 IPSTATS_MIB_OUT, skb->len);
238 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
239 dst->dev, dst_output);
240 }
241
242 net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
243 skb->dev = dst->dev;
244 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
245 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
246 kfree_skb(skb);
247 return -EMSGSIZE;
248 }
249
250 EXPORT_SYMBOL(ip6_xmit);
251
252 /*
253 * To avoid extra problems ND packets are sent through this
254 * routine. It's code duplication, but I really want to avoid
255 * extra checks, since ipv6_build_header is used by TCP (which
256 * is performance critical for us).
257 */
258
259 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
260 const struct in6_addr *saddr, const struct in6_addr *daddr,
261 int proto, int len)
262 {
263 struct ipv6_pinfo *np = inet6_sk(sk);
264 struct ipv6hdr *hdr;
265
266 skb->protocol = htons(ETH_P_IPV6);
267 skb->dev = dev;
268
269 skb_reset_network_header(skb);
270 skb_put(skb, sizeof(struct ipv6hdr));
271 hdr = ipv6_hdr(skb);
272
273 *(__be32*)hdr = htonl(0x60000000);
274
275 hdr->payload_len = htons(len);
276 hdr->nexthdr = proto;
277 hdr->hop_limit = np->hop_limit;
278
279 hdr->saddr = *saddr;
280 hdr->daddr = *daddr;
281
282 return 0;
283 }
284
285 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
286 {
287 struct ip6_ra_chain *ra;
288 struct sock *last = NULL;
289
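/* Deliver the packet to every raw socket that registered for this
 * Router Alert value: every match except the last one gets a clone,
 * and the last match consumes the original skb (return 1).
 */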
290 read_lock(&ip6_ra_lock);
291 for (ra = ip6_ra_chain; ra; ra = ra->next) {
292 struct sock *sk = ra->sk;
293 if (sk && ra->sel == sel &&
294 (!sk->sk_bound_dev_if ||
295 sk->sk_bound_dev_if == skb->dev->ifindex)) {
296 if (last) {
297 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
298 if (skb2)
299 rawv6_rcv(last, skb2);
300 }
301 last = sk;
302 }
303 }
304
305 if (last) {
306 rawv6_rcv(last, skb);
307 read_unlock(&ip6_ra_lock);
308 return 1;
309 }
310 read_unlock(&ip6_ra_lock);
311 return 0;
312 }
313
314 static int ip6_forward_proxy_check(struct sk_buff *skb)
315 {
316 struct ipv6hdr *hdr = ipv6_hdr(skb);
317 u8 nexthdr = hdr->nexthdr;
318 __be16 frag_off;
319 int offset;
320
321 if (ipv6_ext_hdr(nexthdr)) {
322 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
323 if (offset < 0)
324 return 0;
325 } else
326 offset = sizeof(struct ipv6hdr);
327
328 if (nexthdr == IPPROTO_ICMPV6) {
329 struct icmp6hdr *icmp6;
330
331 if (!pskb_may_pull(skb, (skb_network_header(skb) +
332 offset + 1 - skb->data)))
333 return 0;
334
335 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
336
337 switch (icmp6->icmp6_type) {
338 case NDISC_ROUTER_SOLICITATION:
339 case NDISC_ROUTER_ADVERTISEMENT:
340 case NDISC_NEIGHBOUR_SOLICITATION:
341 case NDISC_NEIGHBOUR_ADVERTISEMENT:
342 case NDISC_REDIRECT:
343 /* Unicast neighbour discovery messages destined to the
344 * proxied address are passed to the input function so they
345 * can be handled locally.
346 */
347 return 1;
348 default:
349 break;
350 }
351 }
352
353 /*
354 * The proxying router can't forward traffic sent to a link-local
355 * address, so signal the sender and discard the packet. This
356 * behavior is clarified by the MIPv6 specification.
357 */
358 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
359 dst_link_failure(skb);
360 return -1;
361 }
362
363 return 0;
364 }
365
366 static inline int ip6_forward_finish(struct sk_buff *skb)
367 {
368 return dst_output(skb);
369 }
370
371 int ip6_forward(struct sk_buff *skb)
372 {
373 struct dst_entry *dst = skb_dst(skb);
374 struct ipv6hdr *hdr = ipv6_hdr(skb);
375 struct inet6_skb_parm *opt = IP6CB(skb);
376 struct net *net = dev_net(dst->dev);
377 u32 mtu;
378
379 if (net->ipv6.devconf_all->forwarding == 0)
380 goto error;
381
382 if (skb_warn_if_lro(skb))
383 goto drop;
384
385 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
386 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
387 goto drop;
388 }
389
390 if (skb->pkt_type != PACKET_HOST)
391 goto drop;
392
393 skb_forward_csum(skb);
394
395 /*
396 * We do not do any processing on
397 * RA packets, pushing them to user level AS IS
398 * without any warranty that the application will be able
399 * to interpret them. The reason is that we
400 * cannot do anything clever here.
401 *
402 * We are not the end node, so if the packet contains
403 * AH/ESP we cannot do anything with it.
404 * Defragmentation would also be a mistake; RA packets
405 * cannot be fragmented, because there is no guarantee
406 * that different fragments will go along one path. --ANK
407 */
408 if (opt->ra) {
409 u8 *ptr = skb_network_header(skb) + opt->ra;
410 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
411 return 0;
412 }
413
414 /*
415 * check and decrement ttl
416 */
417 if (hdr->hop_limit <= 1) {
418 /* Force OUTPUT device used as source address */
419 skb->dev = dst->dev;
420 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
421 IP6_INC_STATS_BH(net,
422 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
423
424 kfree_skb(skb);
425 return -ETIMEDOUT;
426 }
427
428 /* XXX: idev->cnf.proxy_ndp? */
429 if (net->ipv6.devconf_all->proxy_ndp &&
430 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
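/* We are proxying neighbour discovery for this destination:
 * ND messages for the proxied address are delivered locally
 * (proxied > 0), link-local destinations are rejected
 * (proxied < 0), and everything else keeps being forwarded.
 */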
431 int proxied = ip6_forward_proxy_check(skb);
432 if (proxied > 0)
433 return ip6_input(skb);
434 else if (proxied < 0) {
435 IP6_INC_STATS(net, ip6_dst_idev(dst),
436 IPSTATS_MIB_INDISCARDS);
437 goto drop;
438 }
439 }
440
441 if (!xfrm6_route_forward(skb)) {
442 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
443 goto drop;
444 }
445 dst = skb_dst(skb);
446
447 /* The IPv6 specs say nothing about it, but it is clear that we cannot
448 send redirects for source-routed frames.
449 We also don't send redirects for frames decapsulated from IPsec.
450 */
451 if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
452 struct in6_addr *target = NULL;
453 struct inet_peer *peer;
454 struct rt6_info *rt;
455
456 /*
457 * the incoming and outgoing devices are the same,
458 * so send a redirect.
459 */
460
461 rt = (struct rt6_info *) dst;
462 if (rt->rt6i_flags & RTF_GATEWAY)
463 target = &rt->rt6i_gateway;
464 else
465 target = &hdr->daddr;
466
467 peer = rt6_get_peer_create(rt);
468
469 /* Limit redirects both by destination (here)
470 and by source (inside ndisc_send_redirect)
471 */
472 if (inet_peer_xrlim_allow(peer, 1*HZ))
473 ndisc_send_redirect(skb, target);
474 } else {
475 int addrtype = ipv6_addr_type(&hdr->saddr);
476
477 /* This check is security critical. */
478 if (addrtype == IPV6_ADDR_ANY ||
479 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
480 goto error;
481 if (addrtype & IPV6_ADDR_LINKLOCAL) {
482 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
483 ICMPV6_NOT_NEIGHBOUR, 0);
484 goto error;
485 }
486 }
487
488 mtu = dst_mtu(dst);
489 if (mtu < IPV6_MIN_MTU)
490 mtu = IPV6_MIN_MTU;
491
492 if (skb->len > mtu && !skb_is_gso(skb)) {
493 /* Again, force OUTPUT device used as source address */
494 skb->dev = dst->dev;
495 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
496 IP6_INC_STATS_BH(net,
497 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
498 IP6_INC_STATS_BH(net,
499 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
500 kfree_skb(skb);
501 return -EMSGSIZE;
502 }
503
504 if (skb_cow(skb, dst->dev->hard_header_len)) {
505 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
506 goto drop;
507 }
508
509 hdr = ipv6_hdr(skb);
510
511 /* Decrementing the hop limit is delayed until after the skb COW */
512
513 hdr->hop_limit--;
514
515 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
516 IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
517 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
518 ip6_forward_finish);
519
520 error:
521 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
522 drop:
523 kfree_skb(skb);
524 return -EINVAL;
525 }
526
527 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
528 {
529 to->pkt_type = from->pkt_type;
530 to->priority = from->priority;
531 to->protocol = from->protocol;
532 skb_dst_drop(to);
533 skb_dst_set(to, dst_clone(skb_dst(from)));
534 to->dev = from->dev;
535 to->mark = from->mark;
536
537 #ifdef CONFIG_NET_SCHED
538 to->tc_index = from->tc_index;
539 #endif
540 nf_copy(to, from);
541 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
542 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
543 to->nf_trace = from->nf_trace;
544 #endif
545 skb_copy_secmark(to, from);
546 }
547
548 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
549 {
550 u16 offset = sizeof(struct ipv6hdr);
551 struct ipv6_opt_hdr *exthdr =
552 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
553 unsigned int packet_len = skb->tail - skb->network_header;
554 int found_rhdr = 0;
555 *nexthdr = &ipv6_hdr(skb)->nexthdr;
556
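/* Walk the unfragmentable part of the packet: hop-by-hop and routing
 * headers (and destination options seen before a routing header, or
 * ones carrying a Home Address option with MIPv6) stay in front of
 * the fragment header. The returned offset is where the fragment
 * header gets inserted.
 */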
557 while (offset + 1 <= packet_len) {
558
559 switch (**nexthdr) {
560
561 case NEXTHDR_HOP:
562 break;
563 case NEXTHDR_ROUTING:
564 found_rhdr = 1;
565 break;
566 case NEXTHDR_DEST:
567 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
568 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
569 break;
570 #endif
571 if (found_rhdr)
572 return offset;
573 break;
574 default :
575 return offset;
576 }
577
578 offset += ipv6_optlen(exthdr);
579 *nexthdr = &exthdr->nexthdr;
580 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
581 offset);
582 }
583
584 return offset;
585 }
586
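/* Pick the 32-bit fragment identification: prefer a per-destination
 * counter kept in the inet_peer entry, and fall back to a global
 * atomic counter when no peer is available or DST_NOPEER is set,
 * skipping 0, which the fragmentation code treats as "not yet chosen".
 */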
587 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
588 {
589 static atomic_t ipv6_fragmentation_id;
590 int old, new;
591
592 if (rt && !(rt->dst.flags & DST_NOPEER)) {
593 struct inet_peer *peer = rt6_get_peer_create(rt);
594
595 if (peer) {
596 fhdr->identification = htonl(inet_getid(peer, 0));
597 return;
598 }
599 }
600 do {
601 old = atomic_read(&ipv6_fragmentation_id);
602 new = old + 1;
603 if (!new)
604 new = 1;
605 } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
606 fhdr->identification = htonl(new);
607 }
608
609 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
610 {
611 struct sk_buff *frag;
612 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
613 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
614 struct ipv6hdr *tmp_hdr;
615 struct frag_hdr *fh;
616 unsigned int mtu, hlen, left, len;
617 int hroom, troom;
618 __be32 frag_id = 0;
619 int ptr, offset = 0, err=0;
620 u8 *prevhdr, nexthdr = 0;
621 struct net *net = dev_net(skb_dst(skb)->dev);
622
623 hlen = ip6_find_1stfragopt(skb, &prevhdr);
624 nexthdr = *prevhdr;
625
626 mtu = ip6_skb_dst_mtu(skb);
627
628 /* We must not fragment if the socket is set to force MTU discovery
629 * or if the skb is not generated by a local socket.
630 */
631 if (unlikely(!skb->local_df && skb->len > mtu)) {
632 if (skb->sk && dst_allfrag(skb_dst(skb)))
633 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
634
635 skb->dev = skb_dst(skb)->dev;
636 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
637 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
638 IPSTATS_MIB_FRAGFAILS);
639 kfree_skb(skb);
640 return -EMSGSIZE;
641 }
642
643 if (np && np->frag_size < mtu) {
644 if (np->frag_size)
645 mtu = np->frag_size;
646 }
647 mtu -= hlen + sizeof(struct frag_hdr);
648
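/* Fast path: if the skb already carries a frag_list with suitably
 * sized and aligned members, reuse those buffers as the fragments and
 * only build a fragment header in front of each of them. Otherwise
 * fall through to the slow path below, which allocates a fresh skb
 * per fragment and copies the payload into it.
 */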
649 if (skb_has_frag_list(skb)) {
650 int first_len = skb_pagelen(skb);
651 struct sk_buff *frag2;
652
653 if (first_len - hlen > mtu ||
654 ((first_len - hlen) & 7) ||
655 skb_cloned(skb))
656 goto slow_path;
657
658 skb_walk_frags(skb, frag) {
659 /* Correct geometry. */
660 if (frag->len > mtu ||
661 ((frag->len & 7) && frag->next) ||
662 skb_headroom(frag) < hlen)
663 goto slow_path_clean;
664
665 /* Partially cloned skb? */
666 if (skb_shared(frag))
667 goto slow_path_clean;
668
669 BUG_ON(frag->sk);
670 if (skb->sk) {
671 frag->sk = skb->sk;
672 frag->destructor = sock_wfree;
673 }
674 skb->truesize -= frag->truesize;
675 }
676
677 err = 0;
678 offset = 0;
679 frag = skb_shinfo(skb)->frag_list;
680 skb_frag_list_init(skb);
681 /* BUILD HEADER */
682
683 *prevhdr = NEXTHDR_FRAGMENT;
684 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
685 if (!tmp_hdr) {
686 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
687 IPSTATS_MIB_FRAGFAILS);
688 return -ENOMEM;
689 }
690
691 __skb_pull(skb, hlen);
692 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
693 __skb_push(skb, hlen);
694 skb_reset_network_header(skb);
695 memcpy(skb_network_header(skb), tmp_hdr, hlen);
696
697 ipv6_select_ident(fh, rt);
698 fh->nexthdr = nexthdr;
699 fh->reserved = 0;
700 fh->frag_off = htons(IP6_MF);
701 frag_id = fh->identification;
702
703 first_len = skb_pagelen(skb);
704 skb->data_len = first_len - skb_headlen(skb);
705 skb->len = first_len;
706 ipv6_hdr(skb)->payload_len = htons(first_len -
707 sizeof(struct ipv6hdr));
708
709 dst_hold(&rt->dst);
710
711 for (;;) {
712 /* Prepare the header of the next frame
713 * before the previous one is sent. */
714 if (frag) {
715 frag->ip_summed = CHECKSUM_NONE;
716 skb_reset_transport_header(frag);
717 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
718 __skb_push(frag, hlen);
719 skb_reset_network_header(frag);
720 memcpy(skb_network_header(frag), tmp_hdr,
721 hlen);
722 offset += skb->len - hlen - sizeof(struct frag_hdr);
723 fh->nexthdr = nexthdr;
724 fh->reserved = 0;
725 fh->frag_off = htons(offset);
726 if (frag->next != NULL)
727 fh->frag_off |= htons(IP6_MF);
728 fh->identification = frag_id;
729 ipv6_hdr(frag)->payload_len =
730 htons(frag->len -
731 sizeof(struct ipv6hdr));
732 ip6_copy_metadata(frag, skb);
733 }
734
735 err = output(skb);
736 if(!err)
737 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
738 IPSTATS_MIB_FRAGCREATES);
739
740 if (err || !frag)
741 break;
742
743 skb = frag;
744 frag = skb->next;
745 skb->next = NULL;
746 }
747
748 kfree(tmp_hdr);
749
750 if (err == 0) {
751 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
752 IPSTATS_MIB_FRAGOKS);
753 dst_release(&rt->dst);
754 return 0;
755 }
756
757 while (frag) {
758 skb = frag->next;
759 kfree_skb(frag);
760 frag = skb;
761 }
762
763 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
764 IPSTATS_MIB_FRAGFAILS);
765 dst_release(&rt->dst);
766 return err;
767
768 slow_path_clean:
769 skb_walk_frags(skb, frag2) {
770 if (frag2 == frag)
771 break;
772 frag2->sk = NULL;
773 frag2->destructor = NULL;
774 skb->truesize += frag2->truesize;
775 }
776 }
777
778 slow_path:
779 if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
780 skb_checksum_help(skb))
781 goto fail;
782
783 left = skb->len - hlen; /* Space per frame */
784 ptr = hlen; /* Where to start from */
785
786 /*
787 * Fragment the datagram.
788 */
789
790 *prevhdr = NEXTHDR_FRAGMENT;
791 hroom = LL_RESERVED_SPACE(rt->dst.dev);
792 troom = rt->dst.dev->needed_tailroom;
793
794 /*
795 * Keep copying data until we run out.
796 */
797 while(left > 0) {
798 len = left;
799 /* IF: it doesn't fit, use 'mtu' - the data space left */
800 if (len > mtu)
801 len = mtu;
802 /* IF: we are not sending up to and including the packet end
803 then align the next start on an eight byte boundary */
804 if (len < left) {
805 len &= ~7;
806 }
807 /*
808 * Allocate buffer.
809 */
810
811 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
812 hroom + troom, GFP_ATOMIC)) == NULL) {
813 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
814 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
815 IPSTATS_MIB_FRAGFAILS);
816 err = -ENOMEM;
817 goto fail;
818 }
819
820 /*
821 * Set up data on packet
822 */
823
824 ip6_copy_metadata(frag, skb);
825 skb_reserve(frag, hroom);
826 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
827 skb_reset_network_header(frag);
828 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
829 frag->transport_header = (frag->network_header + hlen +
830 sizeof(struct frag_hdr));
831
832 /*
833 * Charge the memory for the fragment to any owner
834 * it might possess
835 */
836 if (skb->sk)
837 skb_set_owner_w(frag, skb->sk);
838
839 /*
840 * Copy the packet header into the new buffer.
841 */
842 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
843
844 /*
845 * Build fragment header.
846 */
847 fh->nexthdr = nexthdr;
848 fh->reserved = 0;
849 if (!frag_id) {
850 ipv6_select_ident(fh, rt);
851 frag_id = fh->identification;
852 } else
853 fh->identification = frag_id;
854
855 /*
856 * Copy a block of the IP datagram.
857 */
858 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
859 BUG();
860 left -= len;
861
862 fh->frag_off = htons(offset);
863 if (left > 0)
864 fh->frag_off |= htons(IP6_MF);
865 ipv6_hdr(frag)->payload_len = htons(frag->len -
866 sizeof(struct ipv6hdr));
867
868 ptr += len;
869 offset += len;
870
871 /*
872 * Put this fragment into the sending queue.
873 */
874 err = output(frag);
875 if (err)
876 goto fail;
877
878 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
879 IPSTATS_MIB_FRAGCREATES);
880 }
881 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
882 IPSTATS_MIB_FRAGOKS);
883 consume_skb(skb);
884 return err;
885
886 fail:
887 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
888 IPSTATS_MIB_FRAGFAILS);
889 kfree_skb(skb);
890 return err;
891 }
892
893 static inline int ip6_rt_check(const struct rt6key *rt_key,
894 const struct in6_addr *fl_addr,
895 const struct in6_addr *addr_cache)
896 {
897 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
898 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
899 }
900
901 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
902 struct dst_entry *dst,
903 const struct flowi6 *fl6)
904 {
905 struct ipv6_pinfo *np = inet6_sk(sk);
906 struct rt6_info *rt = (struct rt6_info *)dst;
907
908 if (!dst)
909 goto out;
910
911 /* Yes, checking route validity in the not-connected
912 * case is not very simple. Take into account
913 * that we do not support routing by source, TOS,
914 * and MSG_DONTROUTE --ANK (980726)
915 *
916 * 1. ip6_rt_check(): If the route was a host route,
917 * check that the cached destination is current.
918 * If it is a network route, we can still
919 * check its validity using a saved pointer
920 * to the last used address: daddr_cache.
921 * We do not want to save the whole address now
922 * (because the main consumer of this service
923 * is TCP, which does not have this problem),
924 * so this last trick works only on connected
925 * sockets.
926 * 2. The oif should also be the same.
927 */
928 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
929 #ifdef CONFIG_IPV6_SUBTREES
930 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
931 #endif
932 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
933 dst_release(dst);
934 dst = NULL;
935 }
936
937 out:
938 return dst;
939 }
940
941 static int ip6_dst_lookup_tail(struct sock *sk,
942 struct dst_entry **dst, struct flowi6 *fl6)
943 {
944 struct net *net = sock_net(sk);
945 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
946 struct neighbour *n;
947 #endif
948 int err;
949
950 if (*dst == NULL)
951 *dst = ip6_route_output(net, sk, fl6);
952
953 if ((err = (*dst)->error))
954 goto out_err_release;
955
956 if (ipv6_addr_any(&fl6->saddr)) {
957 struct rt6_info *rt = (struct rt6_info *) *dst;
958 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
959 sk ? inet6_sk(sk)->srcprefs : 0,
960 &fl6->saddr);
961 if (err)
962 goto out_err_release;
963 }
964
965 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
966 /*
967 * If the dst entry we've looked up
968 * has a neighbour entry that is not yet valid
969 * (e.g. INCOMPLETE) and the src address from the flow is
970 * marked as OPTIMISTIC, we release the found
971 * dst entry and replace it with the
972 * dst entry of the next-hop router
973 */
974 rcu_read_lock();
975 n = dst_get_neighbour_noref(*dst);
976 if (n && !(n->nud_state & NUD_VALID)) {
977 struct inet6_ifaddr *ifp;
978 struct flowi6 fl_gw6;
979 int redirect;
980
981 rcu_read_unlock();
982 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
983 (*dst)->dev, 1);
984
985 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
986 if (ifp)
987 in6_ifa_put(ifp);
988
989 if (redirect) {
990 /*
991 * We need to get the dst entry for the
992 * default router instead
993 */
994 dst_release(*dst);
995 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
996 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
997 *dst = ip6_route_output(net, sk, &fl_gw6);
998 if ((err = (*dst)->error))
999 goto out_err_release;
1000 }
1001 } else {
1002 rcu_read_unlock();
1003 }
1004 #endif
1005
1006 return 0;
1007
1008 out_err_release:
1009 if (err == -ENETUNREACH)
1010 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1011 dst_release(*dst);
1012 *dst = NULL;
1013 return err;
1014 }
1015
1016 /**
1017 * ip6_dst_lookup - perform route lookup on flow
1018 * @sk: socket which provides route info
1019 * @dst: pointer to dst_entry * for result
1020 * @fl6: flow to lookup
1021 *
1022 * This function performs a route lookup on the given flow.
1023 *
1024 * It returns zero on success, or a standard errno code on error.
1025 */
1026 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1027 {
1028 *dst = NULL;
1029 return ip6_dst_lookup_tail(sk, dst, fl6);
1030 }
1031 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1032
1033 /**
1034 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1035 * @sk: socket which provides route info
1036 * @fl6: flow to lookup
1037 * @final_dst: final destination address for ipsec lookup
1038 * @can_sleep: we are in a sleepable context
1039 *
1040 * This function performs a route lookup on the given flow.
1041 *
1042 * It returns a valid dst pointer on success, or a pointer encoded
1043 * error code.
1044 */
1045 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1046 const struct in6_addr *final_dst,
1047 bool can_sleep)
1048 {
1049 struct dst_entry *dst = NULL;
1050 int err;
1051
1052 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1053 if (err)
1054 return ERR_PTR(err);
1055 if (final_dst)
1056 fl6->daddr = *final_dst;
1057 if (can_sleep)
1058 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1059
1060 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1061 }
1062 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1063
1064 /**
1065 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1066 * @sk: socket which provides the dst cache and route info
1067 * @fl6: flow to lookup
1068 * @final_dst: final destination address for ipsec lookup
1069 * @can_sleep: we are in a sleepable context
1070 *
1071 * This function performs a route lookup on the given flow with the
1072 * possibility of using the cached route in the socket if it is valid.
1073 * It will take the socket dst lock when operating on the dst cache.
1074 * As a result, this function can only be used in process context.
1075 *
1076 * It returns a valid dst pointer on success, or a pointer encoded
1077 * error code.
1078 */
1079 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1080 const struct in6_addr *final_dst,
1081 bool can_sleep)
1082 {
1083 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1084 int err;
1085
1086 dst = ip6_sk_dst_check(sk, dst, fl6);
1087
1088 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1089 if (err)
1090 return ERR_PTR(err);
1091 if (final_dst)
1092 fl6->daddr = *final_dst;
1093 if (can_sleep)
1094 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1095
1096 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1097 }
1098 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1099
1100 static inline int ip6_ufo_append_data(struct sock *sk,
1101 int getfrag(void *from, char *to, int offset, int len,
1102 int odd, struct sk_buff *skb),
1103 void *from, int length, int hh_len, int fragheaderlen,
1104 int transhdrlen, int mtu,unsigned int flags,
1105 struct rt6_info *rt)
1106
1107 {
1108 struct sk_buff *skb;
1109 int err;
1110
1111 /* The network device supports UDP large send offload, so
1112 * create one single skb packet containing the complete
1113 * UDP datagram.
1114 */
1115 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1116 skb = sock_alloc_send_skb(sk,
1117 hh_len + fragheaderlen + transhdrlen + 20,
1118 (flags & MSG_DONTWAIT), &err);
1119 if (skb == NULL)
1120 return err;
1121
1122 /* reserve space for Hardware header */
1123 skb_reserve(skb, hh_len);
1124
1125 /* create space for UDP/IP header */
1126 skb_put(skb,fragheaderlen + transhdrlen);
1127
1128 /* initialize network header pointer */
1129 skb_reset_network_header(skb);
1130
1131 /* initialize protocol header pointer */
1132 skb->transport_header = skb->network_header + fragheaderlen;
1133
1134 skb->ip_summed = CHECKSUM_PARTIAL;
1135 skb->csum = 0;
1136 }
1137
1138 err = skb_append_datato_frags(sk,skb, getfrag, from,
1139 (length - transhdrlen));
1140 if (!err) {
1141 struct frag_hdr fhdr;
1142
1143 /* Specify the length of each IPv6 datagram fragment.
1144 * It has to be a multiple of 8.
1145 */
1146 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1147 sizeof(struct frag_hdr)) & ~7;
1148 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1149 ipv6_select_ident(&fhdr, rt);
1150 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1151 __skb_queue_tail(&sk->sk_write_queue, skb);
1152
1153 return 0;
1154 }
1155 /* There is not enough support to do UDP LSO,
1156 * so follow the normal path
1157 */
1158 kfree_skb(skb);
1159
1160 return err;
1161 }
1162
1163 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1164 gfp_t gfp)
1165 {
1166 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1167 }
1168
1169 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1170 gfp_t gfp)
1171 {
1172 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1173 }
1174
1175 static void ip6_append_data_mtu(int *mtu,
1176 int *maxfraglen,
1177 unsigned int fragheaderlen,
1178 struct sk_buff *skb,
1179 struct rt6_info *rt)
1180 {
1181 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1182 if (skb == NULL) {
1183 /* first fragment, reserve header_len */
1184 *mtu = *mtu - rt->dst.header_len;
1185
1186 } else {
1187 /*
1188 * this fragment is not the first; the header
1189 * space is regarded as data space.
1190 */
1191 *mtu = dst_mtu(rt->dst.path);
1192 }
1193 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1194 + fragheaderlen - sizeof(struct frag_hdr);
1195 }
1196 }
1197
1198 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1199 int offset, int len, int odd, struct sk_buff *skb),
1200 void *from, int length, int transhdrlen,
1201 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1202 struct rt6_info *rt, unsigned int flags, int dontfrag)
1203 {
1204 struct inet_sock *inet = inet_sk(sk);
1205 struct ipv6_pinfo *np = inet6_sk(sk);
1206 struct inet_cork *cork;
1207 struct sk_buff *skb, *skb_prev = NULL;
1208 unsigned int maxfraglen, fragheaderlen;
1209 int exthdrlen;
1210 int dst_exthdrlen;
1211 int hh_len;
1212 int mtu;
1213 int copy;
1214 int err;
1215 int offset = 0;
1216 __u8 tx_flags = 0;
1217
1218 if (flags&MSG_PROBE)
1219 return 0;
1220 cork = &inet->cork.base;
1221 if (skb_queue_empty(&sk->sk_write_queue)) {
1222 /*
1223 * setup for corking
1224 */
1225 if (opt) {
1226 if (WARN_ON(np->cork.opt))
1227 return -EINVAL;
1228
1229 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1230 if (unlikely(np->cork.opt == NULL))
1231 return -ENOBUFS;
1232
1233 np->cork.opt->tot_len = opt->tot_len;
1234 np->cork.opt->opt_flen = opt->opt_flen;
1235 np->cork.opt->opt_nflen = opt->opt_nflen;
1236
1237 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1238 sk->sk_allocation);
1239 if (opt->dst0opt && !np->cork.opt->dst0opt)
1240 return -ENOBUFS;
1241
1242 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1243 sk->sk_allocation);
1244 if (opt->dst1opt && !np->cork.opt->dst1opt)
1245 return -ENOBUFS;
1246
1247 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1248 sk->sk_allocation);
1249 if (opt->hopopt && !np->cork.opt->hopopt)
1250 return -ENOBUFS;
1251
1252 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1253 sk->sk_allocation);
1254 if (opt->srcrt && !np->cork.opt->srcrt)
1255 return -ENOBUFS;
1256
1257 /* need source address above miyazawa*/
1258 }
1259 dst_hold(&rt->dst);
1260 cork->dst = &rt->dst;
1261 inet->cork.fl.u.ip6 = *fl6;
1262 np->cork.hop_limit = hlimit;
1263 np->cork.tclass = tclass;
1264 if (rt->dst.flags & DST_XFRM_TUNNEL)
1265 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1266 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1267 else
1268 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1269 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1270 if (np->frag_size < mtu) {
1271 if (np->frag_size)
1272 mtu = np->frag_size;
1273 }
1274 cork->fragsize = mtu;
1275 if (dst_allfrag(rt->dst.path))
1276 cork->flags |= IPCORK_ALLFRAG;
1277 cork->length = 0;
1278 sk->sk_sndmsg_page = NULL;
1279 sk->sk_sndmsg_off = 0;
1280 exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1281 length += exthdrlen;
1282 transhdrlen += exthdrlen;
1283 dst_exthdrlen = rt->dst.header_len;
1284 } else {
1285 rt = (struct rt6_info *)cork->dst;
1286 fl6 = &inet->cork.fl.u.ip6;
1287 opt = np->cork.opt;
1288 transhdrlen = 0;
1289 exthdrlen = 0;
1290 dst_exthdrlen = 0;
1291 mtu = cork->fragsize;
1292 }
1293
1294 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1295
1296 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1297 (opt ? opt->opt_nflen : 0);
1298 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
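/* maxfraglen is the largest packet (IPv6 header + payload) that still
 * leaves room for the 8-byte fragment header and keeps the fragmentable
 * part a multiple of 8. E.g. with mtu = 1500 and fragheaderlen = 40:
 * ((1500 - 40) & ~7) + 40 - 8 = 1488, i.e. 1448 bytes of data per
 * fragment (illustrative numbers only).
 */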
1299
1300 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1301 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1302 ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1303 return -EMSGSIZE;
1304 }
1305 }
1306
1307 /* For UDP, check if TX timestamp is enabled */
1308 if (sk->sk_type == SOCK_DGRAM) {
1309 err = sock_tx_timestamp(sk, &tx_flags);
1310 if (err)
1311 goto error;
1312 }
1313
1314 /*
1315 * Let's try using as much space as possible.
1316 * Use MTU if total length of the message fits into the MTU.
1317 * Otherwise, we need to reserve fragment header and
1318 * fragment alignment (= 8-15 octets, in total).
1319 *
1320 * Note that we may need to "move" the data from the tail
1321 * of the buffer to the new fragment when we split
1322 * the message.
1323 *
1324 * FIXME: It may be fragmented into multiple chunks
1325 * at once if non-fragmentable extension headers
1326 * are too large.
1327 * --yoshfuji
1328 */
1329
1330 cork->length += length;
1331 if (length > mtu) {
1332 int proto = sk->sk_protocol;
1333 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1334 ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1335 return -EMSGSIZE;
1336 }
1337
1338 if (proto == IPPROTO_UDP &&
1339 (rt->dst.dev->features & NETIF_F_UFO)) {
1340
1341 err = ip6_ufo_append_data(sk, getfrag, from, length,
1342 hh_len, fragheaderlen,
1343 transhdrlen, mtu, flags, rt);
1344 if (err)
1345 goto error;
1346 return 0;
1347 }
1348 }
1349
1350 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1351 goto alloc_new_skb;
1352
1353 while (length > 0) {
1354 /* Check if the remaining data fits into current packet. */
1355 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1356 if (copy < length)
1357 copy = maxfraglen - skb->len;
1358
1359 if (copy <= 0) {
1360 char *data;
1361 unsigned int datalen;
1362 unsigned int fraglen;
1363 unsigned int fraggap;
1364 unsigned int alloclen;
1365 alloc_new_skb:
1366 /* There's no room in the current skb */
1367 if (skb)
1368 fraggap = skb->len - maxfraglen;
1369 else
1370 fraggap = 0;
1371 /* update mtu and maxfraglen if necessary */
1372 if (skb == NULL || skb_prev == NULL)
1373 ip6_append_data_mtu(&mtu, &maxfraglen,
1374 fragheaderlen, skb, rt);
1375
1376 skb_prev = skb;
1377
1378 /*
1379 * If remaining data exceeds the mtu,
1380 * we know we need more fragment(s).
1381 */
1382 datalen = length + fraggap;
1383
1384 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1385 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1386 if ((flags & MSG_MORE) &&
1387 !(rt->dst.dev->features&NETIF_F_SG))
1388 alloclen = mtu;
1389 else
1390 alloclen = datalen + fragheaderlen;
1391
1392 alloclen += dst_exthdrlen;
1393
1394 if (datalen != length + fraggap) {
1395 /*
1396 * this is not the last fragment, the trailer
1397 * space is regarded as data space.
1398 */
1399 datalen += rt->dst.trailer_len;
1400 }
1401
1402 alloclen += rt->dst.trailer_len;
1403 fraglen = datalen + fragheaderlen;
1404
1405 /*
1406 * We just reserve space for fragment header.
1407 * Note: this may be overallocation if the message
1408 * (without MSG_MORE) fits into the MTU.
1409 */
1410 alloclen += sizeof(struct frag_hdr);
1411
1412 if (transhdrlen) {
1413 skb = sock_alloc_send_skb(sk,
1414 alloclen + hh_len,
1415 (flags & MSG_DONTWAIT), &err);
1416 } else {
1417 skb = NULL;
1418 if (atomic_read(&sk->sk_wmem_alloc) <=
1419 2 * sk->sk_sndbuf)
1420 skb = sock_wmalloc(sk,
1421 alloclen + hh_len, 1,
1422 sk->sk_allocation);
1423 if (unlikely(skb == NULL))
1424 err = -ENOBUFS;
1425 else {
1426 /* Only the initial fragment
1427 * is time stamped.
1428 */
1429 tx_flags = 0;
1430 }
1431 }
1432 if (skb == NULL)
1433 goto error;
1434 /*
1435 * Fill in the control structures
1436 */
1437 skb->ip_summed = CHECKSUM_NONE;
1438 skb->csum = 0;
1439 /* reserve for fragmentation and ipsec header */
1440 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1441 dst_exthdrlen);
1442
1443 if (sk->sk_type == SOCK_DGRAM)
1444 skb_shinfo(skb)->tx_flags = tx_flags;
1445
1446 /*
1447 * Find where to start putting bytes
1448 */
1449 data = skb_put(skb, fraglen);
1450 skb_set_network_header(skb, exthdrlen);
1451 data += fragheaderlen;
1452 skb->transport_header = (skb->network_header +
1453 fragheaderlen);
1454 if (fraggap) {
1455 skb->csum = skb_copy_and_csum_bits(
1456 skb_prev, maxfraglen,
1457 data + transhdrlen, fraggap, 0);
1458 skb_prev->csum = csum_sub(skb_prev->csum,
1459 skb->csum);
1460 data += fraggap;
1461 pskb_trim_unique(skb_prev, maxfraglen);
1462 }
1463 copy = datalen - transhdrlen - fraggap;
1464
1465 if (copy < 0) {
1466 err = -EINVAL;
1467 kfree_skb(skb);
1468 goto error;
1469 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1470 err = -EFAULT;
1471 kfree_skb(skb);
1472 goto error;
1473 }
1474
1475 offset += copy;
1476 length -= datalen - fraggap;
1477 transhdrlen = 0;
1478 exthdrlen = 0;
1479 dst_exthdrlen = 0;
1480
1481 /*
1482 * Put the packet on the pending queue
1483 */
1484 __skb_queue_tail(&sk->sk_write_queue, skb);
1485 continue;
1486 }
1487
1488 if (copy > length)
1489 copy = length;
1490
1491 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1492 unsigned int off;
1493
1494 off = skb->len;
1495 if (getfrag(from, skb_put(skb, copy),
1496 offset, copy, off, skb) < 0) {
1497 __skb_trim(skb, off);
1498 err = -EFAULT;
1499 goto error;
1500 }
1501 } else {
1502 int i = skb_shinfo(skb)->nr_frags;
1503 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1504 struct page *page = sk->sk_sndmsg_page;
1505 int off = sk->sk_sndmsg_off;
1506 unsigned int left;
1507
1508 if (page && (left = PAGE_SIZE - off) > 0) {
1509 if (copy >= left)
1510 copy = left;
1511 if (page != skb_frag_page(frag)) {
1512 if (i == MAX_SKB_FRAGS) {
1513 err = -EMSGSIZE;
1514 goto error;
1515 }
1516 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1517 skb_frag_ref(skb, i);
1518 frag = &skb_shinfo(skb)->frags[i];
1519 }
1520 } else if(i < MAX_SKB_FRAGS) {
1521 if (copy > PAGE_SIZE)
1522 copy = PAGE_SIZE;
1523 page = alloc_pages(sk->sk_allocation, 0);
1524 if (page == NULL) {
1525 err = -ENOMEM;
1526 goto error;
1527 }
1528 sk->sk_sndmsg_page = page;
1529 sk->sk_sndmsg_off = 0;
1530
1531 skb_fill_page_desc(skb, i, page, 0, 0);
1532 frag = &skb_shinfo(skb)->frags[i];
1533 } else {
1534 err = -EMSGSIZE;
1535 goto error;
1536 }
1537 if (getfrag(from,
1538 skb_frag_address(frag) + skb_frag_size(frag),
1539 offset, copy, skb->len, skb) < 0) {
1540 err = -EFAULT;
1541 goto error;
1542 }
1543 sk->sk_sndmsg_off += copy;
1544 skb_frag_size_add(frag, copy);
1545 skb->len += copy;
1546 skb->data_len += copy;
1547 skb->truesize += copy;
1548 atomic_add(copy, &sk->sk_wmem_alloc);
1549 }
1550 offset += copy;
1551 length -= copy;
1552 }
1553 return 0;
1554 error:
1555 cork->length -= length;
1556 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1557 return err;
1558 }
1559 EXPORT_SYMBOL_GPL(ip6_append_data);
1560
1561 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1562 {
1563 if (np->cork.opt) {
1564 kfree(np->cork.opt->dst0opt);
1565 kfree(np->cork.opt->dst1opt);
1566 kfree(np->cork.opt->hopopt);
1567 kfree(np->cork.opt->srcrt);
1568 kfree(np->cork.opt);
1569 np->cork.opt = NULL;
1570 }
1571
1572 if (inet->cork.base.dst) {
1573 dst_release(inet->cork.base.dst);
1574 inet->cork.base.dst = NULL;
1575 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1576 }
1577 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1578 }
1579
1580 int ip6_push_pending_frames(struct sock *sk)
1581 {
1582 struct sk_buff *skb, *tmp_skb;
1583 struct sk_buff **tail_skb;
1584 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1585 struct inet_sock *inet = inet_sk(sk);
1586 struct ipv6_pinfo *np = inet6_sk(sk);
1587 struct net *net = sock_net(sk);
1588 struct ipv6hdr *hdr;
1589 struct ipv6_txoptions *opt = np->cork.opt;
1590 struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1591 struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1592 unsigned char proto = fl6->flowi6_proto;
1593 int err = 0;
1594
1595 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1596 goto out;
1597 tail_skb = &(skb_shinfo(skb)->frag_list);
1598
1599 /* move skb->data to ip header from ext header */
1600 if (skb->data < skb_network_header(skb))
1601 __skb_pull(skb, skb_network_offset(skb));
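/* Coalesce the rest of the write queue onto the first skb's
 * frag_list: each queued skb loses its per-packet headers and
 * becomes payload of the head skb, so ip6_fragment() can later
 * emit them as individual fragments without copying.
 */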
1602 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1603 __skb_pull(tmp_skb, skb_network_header_len(skb));
1604 *tail_skb = tmp_skb;
1605 tail_skb = &(tmp_skb->next);
1606 skb->len += tmp_skb->len;
1607 skb->data_len += tmp_skb->len;
1608 skb->truesize += tmp_skb->truesize;
1609 tmp_skb->destructor = NULL;
1610 tmp_skb->sk = NULL;
1611 }
1612
1613 /* Allow local fragmentation. */
1614 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1615 skb->local_df = 1;
1616
1617 *final_dst = fl6->daddr;
1618 __skb_pull(skb, skb_network_header_len(skb));
1619 if (opt && opt->opt_flen)
1620 ipv6_push_frag_opts(skb, opt, &proto);
1621 if (opt && opt->opt_nflen)
1622 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1623
1624 skb_push(skb, sizeof(struct ipv6hdr));
1625 skb_reset_network_header(skb);
1626 hdr = ipv6_hdr(skb);
1627
1628 *(__be32*)hdr = fl6->flowlabel |
1629 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1630
1631 hdr->hop_limit = np->cork.hop_limit;
1632 hdr->nexthdr = proto;
1633 hdr->saddr = fl6->saddr;
1634 hdr->daddr = *final_dst;
1635
1636 skb->priority = sk->sk_priority;
1637 skb->mark = sk->sk_mark;
1638
1639 skb_dst_set(skb, dst_clone(&rt->dst));
1640 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1641 if (proto == IPPROTO_ICMPV6) {
1642 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1643
1644 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1645 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1646 }
1647
1648 err = ip6_local_out(skb);
1649 if (err) {
1650 if (err > 0)
1651 err = net_xmit_errno(err);
1652 if (err)
1653 goto error;
1654 }
1655
1656 out:
1657 ip6_cork_release(inet, np);
1658 return err;
1659 error:
1660 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1661 goto out;
1662 }
1663 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1664
1665 void ip6_flush_pending_frames(struct sock *sk)
1666 {
1667 struct sk_buff *skb;
1668
1669 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1670 if (skb_dst(skb))
1671 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1672 IPSTATS_MIB_OUTDISCARDS);
1673 kfree_skb(skb);
1674 }
1675
1676 ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1677 }
1678 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);