net/ipv6/ip6_output.c
1 /*
2 * IPv6 output functions
3 * Linux INET6 implementation
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * Based on linux/net/ipv4/ip_output.c
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * Changes:
16 * A.N.Kuznetsov : arithmetic in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
20 * etc.
21 *
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
26 * for datagram xmit
27 */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
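/* Fill in the IPv6 payload length (zeroed for jumbograms larger than
 * IPV6_MAXPLEN) and run the netfilter LOCAL_OUT hook; a return value of 1
 * from nf_hook() tells the caller to continue with dst_output().
 */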
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63 int len;
64
65 len = skb->len - sizeof(struct ipv6hdr);
66 if (len > IPV6_MAXPLEN)
67 len = 0;
68 ipv6_hdr(skb)->payload_len = htons(len);
69
70 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 skb_dst(skb)->dev, dst_output);
72 }
73
74 int ip6_local_out(struct sk_buff *skb)
75 {
76 int err;
77
78 err = __ip6_local_out(skb);
79 if (likely(err == 1))
80 err = dst_output(skb);
81
82 return err;
83 }
84 EXPORT_SYMBOL_GPL(ip6_local_out);
85
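/* Final per-packet output step: loop multicast copies back to the local
 * stack where required, update the multicast counters, then hand the
 * packet to the neighbour layer for transmission on dst->dev.
 */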
86 static int ip6_finish_output2(struct sk_buff *skb)
87 {
88 struct dst_entry *dst = skb_dst(skb);
89 struct net_device *dev = dst->dev;
90 struct neighbour *neigh;
91 struct rt6_info *rt;
92
93 skb->protocol = htons(ETH_P_IPV6);
94 skb->dev = dev;
95
96 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
97 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
98
99 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
100 ((mroute6_socket(dev_net(dev), skb) &&
101 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
102 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
103 &ipv6_hdr(skb)->saddr))) {
104 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
105
106 /* Do not check for IFF_ALLMULTI; multicast routing
107 is not supported in any case.
108 */
109 if (newskb)
110 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
111 newskb, NULL, newskb->dev,
112 dev_loopback_xmit);
113
114 if (ipv6_hdr(skb)->hop_limit == 0) {
115 IP6_INC_STATS(dev_net(dev), idev,
116 IPSTATS_MIB_OUTDISCARDS);
117 kfree_skb(skb);
118 return 0;
119 }
120 }
121
122 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
123 skb->len);
124 }
125
126 rcu_read_lock();
127 rt = (struct rt6_info *) dst;
128 neigh = rt->n;
129 if (neigh) {
130 int res = dst_neigh_output(dst, neigh, skb);
131
132 rcu_read_unlock();
133 return res;
134 }
135 rcu_read_unlock();
136 IP6_INC_STATS_BH(dev_net(dst->dev),
137 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
138 kfree_skb(skb);
139 return -EINVAL;
140 }
141
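/* Fragment the packet if it exceeds the path MTU (and is not GSO), or if
 * the route demands fragmentation of all packets; otherwise transmit it
 * directly via ip6_finish_output2().
 */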
142 static int ip6_finish_output(struct sk_buff *skb)
143 {
144 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
145 dst_allfrag(skb_dst(skb)))
146 return ip6_fragment(skb, ip6_finish_output2);
147 else
148 return ip6_finish_output2(skb);
149 }
150
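/* Standard IPv6 output routine: drop the packet if IPv6 is disabled on
 * the outgoing device, otherwise run the POST_ROUTING netfilter hook
 * (skipped for packets netfilter has already rerouted) before
 * ip6_finish_output().
 */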
151 int ip6_output(struct sk_buff *skb)
152 {
153 struct net_device *dev = skb_dst(skb)->dev;
154 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
155 if (unlikely(idev->cnf.disable_ipv6)) {
156 IP6_INC_STATS(dev_net(dev), idev,
157 IPSTATS_MIB_OUTDISCARDS);
158 kfree_skb(skb);
159 return 0;
160 }
161
162 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
163 ip6_finish_output,
164 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
165 }
166
167 /*
168 * xmit an sk_buff (used by TCP, SCTP and DCCP)
169 */
170
171 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
172 struct ipv6_txoptions *opt, int tclass)
173 {
174 struct net *net = sock_net(sk);
175 struct ipv6_pinfo *np = inet6_sk(sk);
176 struct in6_addr *first_hop = &fl6->daddr;
177 struct dst_entry *dst = skb_dst(skb);
178 struct ipv6hdr *hdr;
179 u8 proto = fl6->flowi6_proto;
180 int seg_len = skb->len;
181 int hlimit = -1;
182 u32 mtu;
183
184 if (opt) {
185 unsigned int head_room;
186
187 /* First: exthdrs may take lots of space (~8K for now);
188 MAX_HEADER is not enough.
189 */
190 head_room = opt->opt_nflen + opt->opt_flen;
191 seg_len += head_room;
192 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
193
194 if (skb_headroom(skb) < head_room) {
195 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
196 if (skb2 == NULL) {
197 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
198 IPSTATS_MIB_OUTDISCARDS);
199 kfree_skb(skb);
200 return -ENOBUFS;
201 }
202 consume_skb(skb);
203 skb = skb2;
204 skb_set_owner_w(skb, sk);
205 }
206 if (opt->opt_flen)
207 ipv6_push_frag_opts(skb, opt, &proto);
208 if (opt->opt_nflen)
209 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
210 }
211
212 skb_push(skb, sizeof(struct ipv6hdr));
213 skb_reset_network_header(skb);
214 hdr = ipv6_hdr(skb);
215
216 /*
217 * Fill in the IPv6 header
218 */
219 if (np)
220 hlimit = np->hop_limit;
221 if (hlimit < 0)
222 hlimit = ip6_dst_hoplimit(dst);
223
224 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
225
226 hdr->payload_len = htons(seg_len);
227 hdr->nexthdr = proto;
228 hdr->hop_limit = hlimit;
229
230 hdr->saddr = fl6->saddr;
231 hdr->daddr = *first_hop;
232
233 skb->priority = sk->sk_priority;
234 skb->mark = sk->sk_mark;
235
236 mtu = dst_mtu(dst);
237 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
238 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
239 IPSTATS_MIB_OUT, skb->len);
240 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
241 dst->dev, dst_output);
242 }
243
244 net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
245 skb->dev = dst->dev;
246 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
247 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
248 kfree_skb(skb);
249 return -EMSGSIZE;
250 }
251
252 EXPORT_SYMBOL(ip6_xmit);
253
254 /*
255 * To avoid extra problems ND packets are sent through this
256 * routine. It's code duplication but I really want to avoid
257 * extra checks since ipv6_build_header is used by TCP (which
258 * is performance critical for us)
259 */
260
261 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
262 const struct in6_addr *saddr, const struct in6_addr *daddr,
263 int proto, int len)
264 {
265 struct ipv6_pinfo *np = inet6_sk(sk);
266 struct ipv6hdr *hdr;
267
268 skb->protocol = htons(ETH_P_IPV6);
269 skb->dev = dev;
270
271 skb_reset_network_header(skb);
272 skb_put(skb, sizeof(struct ipv6hdr));
273 hdr = ipv6_hdr(skb);
274
275 *(__be32*)hdr = htonl(0x60000000);
276
277 hdr->payload_len = htons(len);
278 hdr->nexthdr = proto;
279 hdr->hop_limit = np->hop_limit;
280
281 hdr->saddr = *saddr;
282 hdr->daddr = *daddr;
283
284 return 0;
285 }
286
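/* Deliver a copy of a Router Alert packet to every raw socket registered
 * for this RA value (respecting any device binding). Returns 1 if the
 * original skb was consumed by a listener.
 */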
287 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
288 {
289 struct ip6_ra_chain *ra;
290 struct sock *last = NULL;
291
292 read_lock(&ip6_ra_lock);
293 for (ra = ip6_ra_chain; ra; ra = ra->next) {
294 struct sock *sk = ra->sk;
295 if (sk && ra->sel == sel &&
296 (!sk->sk_bound_dev_if ||
297 sk->sk_bound_dev_if == skb->dev->ifindex)) {
298 if (last) {
299 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
300 if (skb2)
301 rawv6_rcv(last, skb2);
302 }
303 last = sk;
304 }
305 }
306
307 if (last) {
308 rawv6_rcv(last, skb);
309 read_unlock(&ip6_ra_lock);
310 return 1;
311 }
312 read_unlock(&ip6_ra_lock);
313 return 0;
314 }
315
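/* Decide what to do with a packet destined to an address we proxy NDP
 * for: 1 means hand it to local input (neighbour discovery messages),
 * 0 means keep forwarding it, and -1 means it must be dropped
 * (link-local destinations cannot be proxied).
 */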
316 static int ip6_forward_proxy_check(struct sk_buff *skb)
317 {
318 struct ipv6hdr *hdr = ipv6_hdr(skb);
319 u8 nexthdr = hdr->nexthdr;
320 __be16 frag_off;
321 int offset;
322
323 if (ipv6_ext_hdr(nexthdr)) {
324 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
325 if (offset < 0)
326 return 0;
327 } else
328 offset = sizeof(struct ipv6hdr);
329
330 if (nexthdr == IPPROTO_ICMPV6) {
331 struct icmp6hdr *icmp6;
332
333 if (!pskb_may_pull(skb, (skb_network_header(skb) +
334 offset + 1 - skb->data)))
335 return 0;
336
337 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
338
339 switch (icmp6->icmp6_type) {
340 case NDISC_ROUTER_SOLICITATION:
341 case NDISC_ROUTER_ADVERTISEMENT:
342 case NDISC_NEIGHBOUR_SOLICITATION:
343 case NDISC_NEIGHBOUR_ADVERTISEMENT:
344 case NDISC_REDIRECT:
345 /* For a reaction involving a unicast neighbor discovery
346 * message destined to the proxied address, pass it to
347 * the input function.
348 */
349 return 1;
350 default:
351 break;
352 }
353 }
354
355 /*
356 * The proxying router can't forward traffic sent to a link-local
357 * address, so signal the sender and discard the packet. This
358 * behavior is clarified by the MIPv6 specification.
359 */
360 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
361 dst_link_failure(skb);
362 return -1;
363 }
364
365 return 0;
366 }
367
368 static inline int ip6_forward_finish(struct sk_buff *skb)
369 {
370 return dst_output(skb);
371 }
372
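/* Forwarding path: validate the hop limit, honour NDP proxying and Router
 * Alert options, emit redirects or ICMP errors where required, decrement
 * the hop limit and pass the packet to the FORWARD netfilter hook.
 */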
373 int ip6_forward(struct sk_buff *skb)
374 {
375 struct dst_entry *dst = skb_dst(skb);
376 struct ipv6hdr *hdr = ipv6_hdr(skb);
377 struct inet6_skb_parm *opt = IP6CB(skb);
378 struct net *net = dev_net(dst->dev);
379 u32 mtu;
380
381 if (net->ipv6.devconf_all->forwarding == 0)
382 goto error;
383
384 if (skb_warn_if_lro(skb))
385 goto drop;
386
387 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
388 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
389 goto drop;
390 }
391
392 if (skb->pkt_type != PACKET_HOST)
393 goto drop;
394
395 skb_forward_csum(skb);
396
397 /*
398 * We DO NOT do any processing on
399 * RA packets, pushing them to user level AS IS
400 * without any warranty that the application will be able
401 * to interpret them. The reason is that we
402 * cannot make anything clever here.
403 *
404 * We are not the end node, so if the packet contains
405 * AH/ESP, we cannot do anything.
406 * Defragmentation would also be a mistake; RA packets
407 * cannot be fragmented, because there is no guarantee
408 * that different fragments will follow the same path. --ANK
409 */
410 if (opt->ra) {
411 u8 *ptr = skb_network_header(skb) + opt->ra;
412 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
413 return 0;
414 }
415
416 /*
417 * check and decrement ttl
418 */
419 if (hdr->hop_limit <= 1) {
420 /* Force OUTPUT device used as source address */
421 skb->dev = dst->dev;
422 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
423 IP6_INC_STATS_BH(net,
424 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
425
426 kfree_skb(skb);
427 return -ETIMEDOUT;
428 }
429
430 /* XXX: idev->cnf.proxy_ndp? */
431 if (net->ipv6.devconf_all->proxy_ndp &&
432 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
433 int proxied = ip6_forward_proxy_check(skb);
434 if (proxied > 0)
435 return ip6_input(skb);
436 else if (proxied < 0) {
437 IP6_INC_STATS(net, ip6_dst_idev(dst),
438 IPSTATS_MIB_INDISCARDS);
439 goto drop;
440 }
441 }
442
443 if (!xfrm6_route_forward(skb)) {
444 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
445 goto drop;
446 }
447 dst = skb_dst(skb);
448
449 /* IPv6 specs say nothing about it, but it is clear that we cannot
450 send redirects to source routed frames.
451 We don't send redirects to frames decapsulated from IPsec.
452 */
453 if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
454 struct in6_addr *target = NULL;
455 struct inet_peer *peer;
456 struct rt6_info *rt;
457
458 /*
459 * incoming and outgoing devices are the same
460 * send a redirect.
461 */
462
463 rt = (struct rt6_info *) dst;
464 if (rt->rt6i_flags & RTF_GATEWAY)
465 target = &rt->rt6i_gateway;
466 else
467 target = &hdr->daddr;
468
469 peer = rt6_get_peer_create(rt);
470
471 /* Limit redirects both by destination (here)
472 and by source (inside ndisc_send_redirect)
473 */
474 if (inet_peer_xrlim_allow(peer, 1*HZ))
475 ndisc_send_redirect(skb, target);
476 } else {
477 int addrtype = ipv6_addr_type(&hdr->saddr);
478
479 /* This check is security critical. */
480 if (addrtype == IPV6_ADDR_ANY ||
481 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
482 goto error;
483 if (addrtype & IPV6_ADDR_LINKLOCAL) {
484 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
485 ICMPV6_NOT_NEIGHBOUR, 0);
486 goto error;
487 }
488 }
489
490 mtu = dst_mtu(dst);
491 if (mtu < IPV6_MIN_MTU)
492 mtu = IPV6_MIN_MTU;
493
494 if (skb->len > mtu && !skb_is_gso(skb)) {
495 /* Again, force OUTPUT device used as source address */
496 skb->dev = dst->dev;
497 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
498 IP6_INC_STATS_BH(net,
499 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
500 IP6_INC_STATS_BH(net,
501 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
502 kfree_skb(skb);
503 return -EMSGSIZE;
504 }
505
506 if (skb_cow(skb, dst->dev->hard_header_len)) {
507 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
508 goto drop;
509 }
510
511 hdr = ipv6_hdr(skb);
512
513 /* Mangling hops number delayed to point after skb COW */
514
515 hdr->hop_limit--;
516
517 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
518 IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
519 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
520 ip6_forward_finish);
521
522 error:
523 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
524 drop:
525 kfree_skb(skb);
526 return -EINVAL;
527 }
528
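/* Copy per-packet metadata (route, device, priority, mark, netfilter and
 * security state) from the original skb to a newly built fragment.
 */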
529 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
530 {
531 to->pkt_type = from->pkt_type;
532 to->priority = from->priority;
533 to->protocol = from->protocol;
534 skb_dst_drop(to);
535 skb_dst_set(to, dst_clone(skb_dst(from)));
536 to->dev = from->dev;
537 to->mark = from->mark;
538
539 #ifdef CONFIG_NET_SCHED
540 to->tc_index = from->tc_index;
541 #endif
542 nf_copy(to, from);
543 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
544 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
545 to->nf_trace = from->nf_trace;
546 #endif
547 skb_copy_secmark(to, from);
548 }
549
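/* Walk the extension header chain to find the offset at which a Fragment
 * header must be inserted (after Hop-by-Hop, Routing and any Destination
 * Options header that precedes a Routing header). *nexthdr is left
 * pointing at the Next Header field to be rewritten to NEXTHDR_FRAGMENT.
 */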
550 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
551 {
552 u16 offset = sizeof(struct ipv6hdr);
553 struct ipv6_opt_hdr *exthdr =
554 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
555 unsigned int packet_len = skb->tail - skb->network_header;
556 int found_rhdr = 0;
557 *nexthdr = &ipv6_hdr(skb)->nexthdr;
558
559 while (offset + 1 <= packet_len) {
560
561 switch (**nexthdr) {
562
563 case NEXTHDR_HOP:
564 break;
565 case NEXTHDR_ROUTING:
566 found_rhdr = 1;
567 break;
568 case NEXTHDR_DEST:
569 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
570 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
571 break;
572 #endif
573 if (found_rhdr)
574 return offset;
575 break;
576 default:
577 return offset;
578 }
579
580 offset += ipv6_optlen(exthdr);
581 *nexthdr = &exthdr->nexthdr;
582 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
583 offset);
584 }
585
586 return offset;
587 }
588
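/* Choose the 32-bit fragment identification: use the per-destination
 * inet_peer counter when one is available, otherwise fall back to a
 * global, never-zero counter.
 */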
589 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
590 {
591 static atomic_t ipv6_fragmentation_id;
592 int old, new;
593
594 if (rt && !(rt->dst.flags & DST_NOPEER)) {
595 struct inet_peer *peer = rt6_get_peer_create(rt);
596
597 if (peer) {
598 fhdr->identification = htonl(inet_getid(peer, 0));
599 return;
600 }
601 }
602 do {
603 old = atomic_read(&ipv6_fragmentation_id);
604 new = old + 1;
605 if (!new)
606 new = 1;
607 } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
608 fhdr->identification = htonl(new);
609 }
610
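/* Split an oversized skb into MTU-sized fragments. The fast path reuses
 * an existing frag_list when the geometry already matches; otherwise the
 * slow path allocates and copies each fragment individually.
 */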
611 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
612 {
613 struct sk_buff *frag;
614 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
615 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
616 struct ipv6hdr *tmp_hdr;
617 struct frag_hdr *fh;
618 unsigned int mtu, hlen, left, len;
619 int hroom, troom;
620 __be32 frag_id = 0;
621 int ptr, offset = 0, err = 0;
622 u8 *prevhdr, nexthdr = 0;
623 struct net *net = dev_net(skb_dst(skb)->dev);
624
625 hlen = ip6_find_1stfragopt(skb, &prevhdr);
626 nexthdr = *prevhdr;
627
628 mtu = ip6_skb_dst_mtu(skb);
629
630 /* We must not fragment if the socket is set to force MTU discovery
631 * or if the skb was not generated by a local socket.
632 */
633 if (unlikely(!skb->local_df && skb->len > mtu)) {
634 if (skb->sk && dst_allfrag(skb_dst(skb)))
635 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
636
637 skb->dev = skb_dst(skb)->dev;
638 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
639 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
640 IPSTATS_MIB_FRAGFAILS);
641 kfree_skb(skb);
642 return -EMSGSIZE;
643 }
644
645 if (np && np->frag_size < mtu) {
646 if (np->frag_size)
647 mtu = np->frag_size;
648 }
649 mtu -= hlen + sizeof(struct frag_hdr);
650
651 if (skb_has_frag_list(skb)) {
652 int first_len = skb_pagelen(skb);
653 struct sk_buff *frag2;
654
655 if (first_len - hlen > mtu ||
656 ((first_len - hlen) & 7) ||
657 skb_cloned(skb))
658 goto slow_path;
659
660 skb_walk_frags(skb, frag) {
661 /* Correct geometry. */
662 if (frag->len > mtu ||
663 ((frag->len & 7) && frag->next) ||
664 skb_headroom(frag) < hlen)
665 goto slow_path_clean;
666
667 /* Partially cloned skb? */
668 if (skb_shared(frag))
669 goto slow_path_clean;
670
671 BUG_ON(frag->sk);
672 if (skb->sk) {
673 frag->sk = skb->sk;
674 frag->destructor = sock_wfree;
675 }
676 skb->truesize -= frag->truesize;
677 }
678
679 err = 0;
680 offset = 0;
681 frag = skb_shinfo(skb)->frag_list;
682 skb_frag_list_init(skb);
683 /* BUILD HEADER */
684
685 *prevhdr = NEXTHDR_FRAGMENT;
686 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
687 if (!tmp_hdr) {
688 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
689 IPSTATS_MIB_FRAGFAILS);
690 return -ENOMEM;
691 }
692
693 __skb_pull(skb, hlen);
694 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
695 __skb_push(skb, hlen);
696 skb_reset_network_header(skb);
697 memcpy(skb_network_header(skb), tmp_hdr, hlen);
698
699 ipv6_select_ident(fh, rt);
700 fh->nexthdr = nexthdr;
701 fh->reserved = 0;
702 fh->frag_off = htons(IP6_MF);
703 frag_id = fh->identification;
704
705 first_len = skb_pagelen(skb);
706 skb->data_len = first_len - skb_headlen(skb);
707 skb->len = first_len;
708 ipv6_hdr(skb)->payload_len = htons(first_len -
709 sizeof(struct ipv6hdr));
710
711 dst_hold(&rt->dst);
712
713 for (;;) {
714 /* Prepare header of the next frame,
715 * before the previous one goes down. */
716 if (frag) {
717 frag->ip_summed = CHECKSUM_NONE;
718 skb_reset_transport_header(frag);
719 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
720 __skb_push(frag, hlen);
721 skb_reset_network_header(frag);
722 memcpy(skb_network_header(frag), tmp_hdr,
723 hlen);
724 offset += skb->len - hlen - sizeof(struct frag_hdr);
725 fh->nexthdr = nexthdr;
726 fh->reserved = 0;
727 fh->frag_off = htons(offset);
728 if (frag->next != NULL)
729 fh->frag_off |= htons(IP6_MF);
730 fh->identification = frag_id;
731 ipv6_hdr(frag)->payload_len =
732 htons(frag->len -
733 sizeof(struct ipv6hdr));
734 ip6_copy_metadata(frag, skb);
735 }
736
737 err = output(skb);
738 if (!err)
739 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
740 IPSTATS_MIB_FRAGCREATES);
741
742 if (err || !frag)
743 break;
744
745 skb = frag;
746 frag = skb->next;
747 skb->next = NULL;
748 }
749
750 kfree(tmp_hdr);
751
752 if (err == 0) {
753 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
754 IPSTATS_MIB_FRAGOKS);
755 dst_release(&rt->dst);
756 return 0;
757 }
758
759 while (frag) {
760 skb = frag->next;
761 kfree_skb(frag);
762 frag = skb;
763 }
764
765 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
766 IPSTATS_MIB_FRAGFAILS);
767 dst_release(&rt->dst);
768 return err;
769
770 slow_path_clean:
771 skb_walk_frags(skb, frag2) {
772 if (frag2 == frag)
773 break;
774 frag2->sk = NULL;
775 frag2->destructor = NULL;
776 skb->truesize += frag2->truesize;
777 }
778 }
779
780 slow_path:
781 if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
782 skb_checksum_help(skb))
783 goto fail;
784
785 left = skb->len - hlen; /* Space per frame */
786 ptr = hlen; /* Where to start from */
787
788 /*
789 * Fragment the datagram.
790 */
791
792 *prevhdr = NEXTHDR_FRAGMENT;
793 hroom = LL_RESERVED_SPACE(rt->dst.dev);
794 troom = rt->dst.dev->needed_tailroom;
795
796 /*
797 * Keep copying data until we run out.
798 */
799 while (left > 0) {
800 len = left;
801 /* IF: it doesn't fit, use 'mtu' - the data space left */
802 if (len > mtu)
803 len = mtu;
804 /* IF: we are not sending up to and including the packet end
805 then align the next start on an eight byte boundary */
806 if (len < left) {
807 len &= ~7;
808 }
809 /*
810 * Allocate buffer.
811 */
812
813 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
814 hroom + troom, GFP_ATOMIC)) == NULL) {
815 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
816 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
817 IPSTATS_MIB_FRAGFAILS);
818 err = -ENOMEM;
819 goto fail;
820 }
821
822 /*
823 * Set up data on packet
824 */
825
826 ip6_copy_metadata(frag, skb);
827 skb_reserve(frag, hroom);
828 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
829 skb_reset_network_header(frag);
830 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
831 frag->transport_header = (frag->network_header + hlen +
832 sizeof(struct frag_hdr));
833
834 /*
835 * Charge the memory for the fragment to any owner
836 * it might possess
837 */
838 if (skb->sk)
839 skb_set_owner_w(frag, skb->sk);
840
841 /*
842 * Copy the packet header into the new buffer.
843 */
844 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
845
846 /*
847 * Build fragment header.
848 */
849 fh->nexthdr = nexthdr;
850 fh->reserved = 0;
851 if (!frag_id) {
852 ipv6_select_ident(fh, rt);
853 frag_id = fh->identification;
854 } else
855 fh->identification = frag_id;
856
857 /*
858 * Copy a block of the IP datagram.
859 */
860 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
861 BUG();
862 left -= len;
863
864 fh->frag_off = htons(offset);
865 if (left > 0)
866 fh->frag_off |= htons(IP6_MF);
867 ipv6_hdr(frag)->payload_len = htons(frag->len -
868 sizeof(struct ipv6hdr));
869
870 ptr += len;
871 offset += len;
872
873 /*
874 * Put this fragment into the sending queue.
875 */
876 err = output(frag);
877 if (err)
878 goto fail;
879
880 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
881 IPSTATS_MIB_FRAGCREATES);
882 }
883 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
884 IPSTATS_MIB_FRAGOKS);
885 consume_skb(skb);
886 return err;
887
888 fail:
889 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
890 IPSTATS_MIB_FRAGFAILS);
891 kfree_skb(skb);
892 return err;
893 }
894
895 static inline int ip6_rt_check(const struct rt6key *rt_key,
896 const struct in6_addr *fl_addr,
897 const struct in6_addr *addr_cache)
898 {
899 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
900 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
901 }
902
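/* Validate a cached socket route against the flow: release it and return
 * NULL if the destination, source subtree or outgoing interface no longer
 * match.
 */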
903 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
904 struct dst_entry *dst,
905 const struct flowi6 *fl6)
906 {
907 struct ipv6_pinfo *np = inet6_sk(sk);
908 struct rt6_info *rt = (struct rt6_info *)dst;
909
910 if (!dst)
911 goto out;
912
913 /* Yes, checking route validity in the non-connected
914 * case is not very simple. Take into account
915 * that we do not support routing by source, TOS,
916 * and MSG_DONTROUTE --ANK (980726)
917 *
918 * 1. ip6_rt_check(): If route was host route,
919 * check that cached destination is current.
920 * If it is network route, we still may
921 * check its validity using saved pointer
922 * to the last used address: daddr_cache.
923 * We do not want to save whole address now,
924 * (because main consumer of this service
925 * is tcp, which does not have this problem),
926 * so that the last trick works only on connected
927 * sockets.
928 * 2. oif also should be the same.
929 */
930 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
931 #ifdef CONFIG_IPV6_SUBTREES
932 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
933 #endif
934 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
935 dst_release(dst);
936 dst = NULL;
937 }
938
939 out:
940 return dst;
941 }
942
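/* Common tail of the dst lookup helpers: perform the route lookup if
 * needed and pick a source address when the flow left it unspecified.
 * With optimistic DAD, if the source address is optimistic and the next
 * hop's neighbour entry is not yet valid, the lookup is redone towards
 * the default router instead.
 */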
943 static int ip6_dst_lookup_tail(struct sock *sk,
944 struct dst_entry **dst, struct flowi6 *fl6)
945 {
946 struct net *net = sock_net(sk);
947 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
948 struct neighbour *n;
949 struct rt6_info *rt;
950 #endif
951 int err;
952
953 if (*dst == NULL)
954 *dst = ip6_route_output(net, sk, fl6);
955
956 if ((err = (*dst)->error))
957 goto out_err_release;
958
959 if (ipv6_addr_any(&fl6->saddr)) {
960 struct rt6_info *rt = (struct rt6_info *) *dst;
961 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
962 sk ? inet6_sk(sk)->srcprefs : 0,
963 &fl6->saddr);
964 if (err)
965 goto out_err_release;
966 }
967
968 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
969 /*
970 * Here if the dst entry we've looked up
971 * has a neighbour entry that is in the INCOMPLETE
972 * state and the src address from the flow is
973 * marked as OPTIMISTIC, we release the found
974 * dst entry and replace it instead with the
975 * dst entry of the nexthop router
976 */
977 rcu_read_lock();
978 rt = (struct rt6_info *) *dst;
979 n = rt->n;
980 if (n && !(n->nud_state & NUD_VALID)) {
981 struct inet6_ifaddr *ifp;
982 struct flowi6 fl_gw6;
983 int redirect;
984
985 rcu_read_unlock();
986 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
987 (*dst)->dev, 1);
988
989 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
990 if (ifp)
991 in6_ifa_put(ifp);
992
993 if (redirect) {
994 /*
995 * We need to get the dst entry for the
996 * default router instead
997 */
998 dst_release(*dst);
999 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1000 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1001 *dst = ip6_route_output(net, sk, &fl_gw6);
1002 if ((err = (*dst)->error))
1003 goto out_err_release;
1004 }
1005 } else {
1006 rcu_read_unlock();
1007 }
1008 #endif
1009
1010 return 0;
1011
1012 out_err_release:
1013 if (err == -ENETUNREACH)
1014 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1015 dst_release(*dst);
1016 *dst = NULL;
1017 return err;
1018 }
1019
1020 /**
1021 * ip6_dst_lookup - perform route lookup on flow
1022 * @sk: socket which provides route info
1023 * @dst: pointer to dst_entry * for result
1024 * @fl6: flow to lookup
1025 *
1026 * This function performs a route lookup on the given flow.
1027 *
1028 * It returns zero on success, or a standard errno code on error.
1029 */
1030 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1031 {
1032 *dst = NULL;
1033 return ip6_dst_lookup_tail(sk, dst, fl6);
1034 }
1035 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1036
1037 /**
1038 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1039 * @sk: socket which provides route info
1040 * @fl6: flow to lookup
1041 * @final_dst: final destination address for ipsec lookup
1042 * @can_sleep: we are in a sleepable context
1043 *
1044 * This function performs a route lookup on the given flow.
1045 *
1046 * It returns a valid dst pointer on success, or a pointer encoded
1047 * error code.
1048 */
1049 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1050 const struct in6_addr *final_dst,
1051 bool can_sleep)
1052 {
1053 struct dst_entry *dst = NULL;
1054 int err;
1055
1056 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1057 if (err)
1058 return ERR_PTR(err);
1059 if (final_dst)
1060 fl6->daddr = *final_dst;
1061 if (can_sleep)
1062 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1063
1064 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1065 }
1066 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1067
1068 /**
1069 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1070 * @sk: socket which provides the dst cache and route info
1071 * @fl6: flow to lookup
1072 * @final_dst: final destination address for ipsec lookup
1073 * @can_sleep: we are in a sleepable context
1074 *
1075 * This function performs a route lookup on the given flow with the
1076 * possibility of using the cached route in the socket if it is valid.
1077 * It will take the socket dst lock when operating on the dst cache.
1078 * As a result, this function can only be used in process context.
1079 *
1080 * It returns a valid dst pointer on success, or a pointer encoded
1081 * error code.
1082 */
1083 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1084 const struct in6_addr *final_dst,
1085 bool can_sleep)
1086 {
1087 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1088 int err;
1089
1090 dst = ip6_sk_dst_check(sk, dst, fl6);
1091
1092 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1093 if (err)
1094 return ERR_PTR(err);
1095 if (final_dst)
1096 fl6->daddr = *final_dst;
1097 if (can_sleep)
1098 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1099
1100 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1101 }
1102 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1103
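/* Append datagram data as page fragments on a single skb so that a
 * UFO-capable device can segment the oversized UDP datagram instead of
 * our fragmenting it here.
 */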
1104 static inline int ip6_ufo_append_data(struct sock *sk,
1105 int getfrag(void *from, char *to, int offset, int len,
1106 int odd, struct sk_buff *skb),
1107 void *from, int length, int hh_len, int fragheaderlen,
1108 int transhdrlen, int mtu, unsigned int flags,
1109 struct rt6_info *rt)
1110
1111 {
1112 struct sk_buff *skb;
1113 int err;
1114
1115 /* There is support for UDP large send offload by the network
1116 * device, so create one single skb packet containing the complete
1117 * UDP datagram
1118 */
1119 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1120 skb = sock_alloc_send_skb(sk,
1121 hh_len + fragheaderlen + transhdrlen + 20,
1122 (flags & MSG_DONTWAIT), &err);
1123 if (skb == NULL)
1124 return err;
1125
1126 /* reserve space for Hardware header */
1127 skb_reserve(skb, hh_len);
1128
1129 /* create space for UDP/IP header */
1130 skb_put(skb, fragheaderlen + transhdrlen);
1131
1132 /* initialize network header pointer */
1133 skb_reset_network_header(skb);
1134
1135 /* initialize protocol header pointer */
1136 skb->transport_header = skb->network_header + fragheaderlen;
1137
1138 skb->ip_summed = CHECKSUM_PARTIAL;
1139 skb->csum = 0;
1140 }
1141
1142 err = skb_append_datato_frags(sk, skb, getfrag, from,
1143 (length - transhdrlen));
1144 if (!err) {
1145 struct frag_hdr fhdr;
1146
1147 /* Specify the length of each IPv6 datagram fragment.
1148 * It has to be a multiple of 8.
1149 */
1150 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1151 sizeof(struct frag_hdr)) & ~7;
1152 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1153 ipv6_select_ident(&fhdr, rt);
1154 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1155 __skb_queue_tail(&sk->sk_write_queue, skb);
1156
1157 return 0;
1158 }
1159 /* There is not enough support to do UDP LSO,
1160 * so follow the normal path
1161 */
1162 kfree_skb(skb);
1163
1164 return err;
1165 }
1166
1167 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1168 gfp_t gfp)
1169 {
1170 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1171 }
1172
1173 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1174 gfp_t gfp)
1175 {
1176 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1177 }
1178
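/* Recompute the usable MTU and maximum fragment length once the first
 * fragment has been queued: for non-tunnel routes the header_len reserve
 * only applies to the first fragment.
 */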
1179 static void ip6_append_data_mtu(int *mtu,
1180 int *maxfraglen,
1181 unsigned int fragheaderlen,
1182 struct sk_buff *skb,
1183 struct rt6_info *rt)
1184 {
1185 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1186 if (skb == NULL) {
1187 /* first fragment, reserve header_len */
1188 *mtu = *mtu - rt->dst.header_len;
1189
1190 } else {
1191 /*
1192 * this fragment is not first, the headers
1193 * space is regarded as data space.
1194 */
1195 *mtu = dst_mtu(rt->dst.path);
1196 }
1197 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1198 + fragheaderlen - sizeof(struct frag_hdr);
1199 }
1200 }
1201
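/* Queue data on the socket's write queue, building packets of at most
 * one MTU (minus the fragment header reserve) as it goes. The first call
 * on an empty queue sets up the cork state (options, route, MTU); the
 * queued packets are finally sent by ip6_push_pending_frames().
 */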
1202 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1203 int offset, int len, int odd, struct sk_buff *skb),
1204 void *from, int length, int transhdrlen,
1205 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1206 struct rt6_info *rt, unsigned int flags, int dontfrag)
1207 {
1208 struct inet_sock *inet = inet_sk(sk);
1209 struct ipv6_pinfo *np = inet6_sk(sk);
1210 struct inet_cork *cork;
1211 struct sk_buff *skb, *skb_prev = NULL;
1212 unsigned int maxfraglen, fragheaderlen;
1213 int exthdrlen;
1214 int dst_exthdrlen;
1215 int hh_len;
1216 int mtu;
1217 int copy;
1218 int err;
1219 int offset = 0;
1220 __u8 tx_flags = 0;
1221
1222 if (flags&MSG_PROBE)
1223 return 0;
1224 cork = &inet->cork.base;
1225 if (skb_queue_empty(&sk->sk_write_queue)) {
1226 /*
1227 * setup for corking
1228 */
1229 if (opt) {
1230 if (WARN_ON(np->cork.opt))
1231 return -EINVAL;
1232
1233 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1234 if (unlikely(np->cork.opt == NULL))
1235 return -ENOBUFS;
1236
1237 np->cork.opt->tot_len = opt->tot_len;
1238 np->cork.opt->opt_flen = opt->opt_flen;
1239 np->cork.opt->opt_nflen = opt->opt_nflen;
1240
1241 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1242 sk->sk_allocation);
1243 if (opt->dst0opt && !np->cork.opt->dst0opt)
1244 return -ENOBUFS;
1245
1246 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1247 sk->sk_allocation);
1248 if (opt->dst1opt && !np->cork.opt->dst1opt)
1249 return -ENOBUFS;
1250
1251 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1252 sk->sk_allocation);
1253 if (opt->hopopt && !np->cork.opt->hopopt)
1254 return -ENOBUFS;
1255
1256 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1257 sk->sk_allocation);
1258 if (opt->srcrt && !np->cork.opt->srcrt)
1259 return -ENOBUFS;
1260
1261 /* need source address above --miyazawa */
1262 }
1263 dst_hold(&rt->dst);
1264 cork->dst = &rt->dst;
1265 inet->cork.fl.u.ip6 = *fl6;
1266 np->cork.hop_limit = hlimit;
1267 np->cork.tclass = tclass;
1268 if (rt->dst.flags & DST_XFRM_TUNNEL)
1269 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1270 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1271 else
1272 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1273 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1274 if (np->frag_size < mtu) {
1275 if (np->frag_size)
1276 mtu = np->frag_size;
1277 }
1278 cork->fragsize = mtu;
1279 if (dst_allfrag(rt->dst.path))
1280 cork->flags |= IPCORK_ALLFRAG;
1281 cork->length = 0;
1282 sk->sk_sndmsg_page = NULL;
1283 sk->sk_sndmsg_off = 0;
1284 exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1285 length += exthdrlen;
1286 transhdrlen += exthdrlen;
1287 dst_exthdrlen = rt->dst.header_len;
1288 } else {
1289 rt = (struct rt6_info *)cork->dst;
1290 fl6 = &inet->cork.fl.u.ip6;
1291 opt = np->cork.opt;
1292 transhdrlen = 0;
1293 exthdrlen = 0;
1294 dst_exthdrlen = 0;
1295 mtu = cork->fragsize;
1296 }
1297
1298 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1299
1300 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1301 (opt ? opt->opt_nflen : 0);
1302 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1303
1304 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1305 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1306 ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1307 return -EMSGSIZE;
1308 }
1309 }
1310
1311 /* For UDP, check if TX timestamp is enabled */
1312 if (sk->sk_type == SOCK_DGRAM) {
1313 err = sock_tx_timestamp(sk, &tx_flags);
1314 if (err)
1315 goto error;
1316 }
1317
1318 /*
1319 * Let's try using as much space as possible.
1320 * Use MTU if total length of the message fits into the MTU.
1321 * Otherwise, we need to reserve fragment header and
1322 * fragment alignment (= 8-15 octets, in total).
1323 *
1324 * Note that we may need to "move" the data from the tail
1325 * of the buffer to the new fragment when we split
1326 * the message.
1327 *
1328 * FIXME: It may be fragmented into multiple chunks
1329 * at once if non-fragmentable extension headers
1330 * are too large.
1331 * --yoshfuji
1332 */
1333
1334 cork->length += length;
1335 if (length > mtu) {
1336 int proto = sk->sk_protocol;
1337 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
1338 ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1339 return -EMSGSIZE;
1340 }
1341
1342 if (proto == IPPROTO_UDP &&
1343 (rt->dst.dev->features & NETIF_F_UFO)) {
1344
1345 err = ip6_ufo_append_data(sk, getfrag, from, length,
1346 hh_len, fragheaderlen,
1347 transhdrlen, mtu, flags, rt);
1348 if (err)
1349 goto error;
1350 return 0;
1351 }
1352 }
1353
1354 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1355 goto alloc_new_skb;
1356
1357 while (length > 0) {
1358 /* Check if the remaining data fits into current packet. */
1359 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1360 if (copy < length)
1361 copy = maxfraglen - skb->len;
1362
1363 if (copy <= 0) {
1364 char *data;
1365 unsigned int datalen;
1366 unsigned int fraglen;
1367 unsigned int fraggap;
1368 unsigned int alloclen;
1369 alloc_new_skb:
1370 /* There's no room in the current skb */
1371 if (skb)
1372 fraggap = skb->len - maxfraglen;
1373 else
1374 fraggap = 0;
1375 /* update mtu and maxfraglen if necessary */
1376 if (skb == NULL || skb_prev == NULL)
1377 ip6_append_data_mtu(&mtu, &maxfraglen,
1378 fragheaderlen, skb, rt);
1379
1380 skb_prev = skb;
1381
1382 /*
1383 * If remaining data exceeds the mtu,
1384 * we know we need more fragment(s).
1385 */
1386 datalen = length + fraggap;
1387
1388 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1389 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1390 if ((flags & MSG_MORE) &&
1391 !(rt->dst.dev->features&NETIF_F_SG))
1392 alloclen = mtu;
1393 else
1394 alloclen = datalen + fragheaderlen;
1395
1396 alloclen += dst_exthdrlen;
1397
1398 if (datalen != length + fraggap) {
1399 /*
1400 * this is not the last fragment, the trailer
1401 * space is regarded as data space.
1402 */
1403 datalen += rt->dst.trailer_len;
1404 }
1405
1406 alloclen += rt->dst.trailer_len;
1407 fraglen = datalen + fragheaderlen;
1408
1409 /*
1410 * We just reserve space for fragment header.
1411 * Note: this may be overallocation if the message
1412 * (without MSG_MORE) fits into the MTU.
1413 */
1414 alloclen += sizeof(struct frag_hdr);
1415
1416 if (transhdrlen) {
1417 skb = sock_alloc_send_skb(sk,
1418 alloclen + hh_len,
1419 (flags & MSG_DONTWAIT), &err);
1420 } else {
1421 skb = NULL;
1422 if (atomic_read(&sk->sk_wmem_alloc) <=
1423 2 * sk->sk_sndbuf)
1424 skb = sock_wmalloc(sk,
1425 alloclen + hh_len, 1,
1426 sk->sk_allocation);
1427 if (unlikely(skb == NULL))
1428 err = -ENOBUFS;
1429 else {
1430 /* Only the initial fragment
1431 * is time stamped.
1432 */
1433 tx_flags = 0;
1434 }
1435 }
1436 if (skb == NULL)
1437 goto error;
1438 /*
1439 * Fill in the control structures
1440 */
1441 skb->ip_summed = CHECKSUM_NONE;
1442 skb->csum = 0;
1443 /* reserve for fragmentation and ipsec header */
1444 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1445 dst_exthdrlen);
1446
1447 if (sk->sk_type == SOCK_DGRAM)
1448 skb_shinfo(skb)->tx_flags = tx_flags;
1449
1450 /*
1451 * Find where to start putting bytes
1452 */
1453 data = skb_put(skb, fraglen);
1454 skb_set_network_header(skb, exthdrlen);
1455 data += fragheaderlen;
1456 skb->transport_header = (skb->network_header +
1457 fragheaderlen);
1458 if (fraggap) {
1459 skb->csum = skb_copy_and_csum_bits(
1460 skb_prev, maxfraglen,
1461 data + transhdrlen, fraggap, 0);
1462 skb_prev->csum = csum_sub(skb_prev->csum,
1463 skb->csum);
1464 data += fraggap;
1465 pskb_trim_unique(skb_prev, maxfraglen);
1466 }
1467 copy = datalen - transhdrlen - fraggap;
1468
1469 if (copy < 0) {
1470 err = -EINVAL;
1471 kfree_skb(skb);
1472 goto error;
1473 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1474 err = -EFAULT;
1475 kfree_skb(skb);
1476 goto error;
1477 }
1478
1479 offset += copy;
1480 length -= datalen - fraggap;
1481 transhdrlen = 0;
1482 exthdrlen = 0;
1483 dst_exthdrlen = 0;
1484
1485 /*
1486 * Put the packet on the pending queue
1487 */
1488 __skb_queue_tail(&sk->sk_write_queue, skb);
1489 continue;
1490 }
1491
1492 if (copy > length)
1493 copy = length;
1494
1495 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1496 unsigned int off;
1497
1498 off = skb->len;
1499 if (getfrag(from, skb_put(skb, copy),
1500 offset, copy, off, skb) < 0) {
1501 __skb_trim(skb, off);
1502 err = -EFAULT;
1503 goto error;
1504 }
1505 } else {
1506 int i = skb_shinfo(skb)->nr_frags;
1507 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1508 struct page *page = sk->sk_sndmsg_page;
1509 int off = sk->sk_sndmsg_off;
1510 unsigned int left;
1511
1512 if (page && (left = PAGE_SIZE - off) > 0) {
1513 if (copy >= left)
1514 copy = left;
1515 if (page != skb_frag_page(frag)) {
1516 if (i == MAX_SKB_FRAGS) {
1517 err = -EMSGSIZE;
1518 goto error;
1519 }
1520 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1521 skb_frag_ref(skb, i);
1522 frag = &skb_shinfo(skb)->frags[i];
1523 }
1524 } else if (i < MAX_SKB_FRAGS) {
1525 if (copy > PAGE_SIZE)
1526 copy = PAGE_SIZE;
1527 page = alloc_pages(sk->sk_allocation, 0);
1528 if (page == NULL) {
1529 err = -ENOMEM;
1530 goto error;
1531 }
1532 sk->sk_sndmsg_page = page;
1533 sk->sk_sndmsg_off = 0;
1534
1535 skb_fill_page_desc(skb, i, page, 0, 0);
1536 frag = &skb_shinfo(skb)->frags[i];
1537 } else {
1538 err = -EMSGSIZE;
1539 goto error;
1540 }
1541 if (getfrag(from,
1542 skb_frag_address(frag) + skb_frag_size(frag),
1543 offset, copy, skb->len, skb) < 0) {
1544 err = -EFAULT;
1545 goto error;
1546 }
1547 sk->sk_sndmsg_off += copy;
1548 skb_frag_size_add(frag, copy);
1549 skb->len += copy;
1550 skb->data_len += copy;
1551 skb->truesize += copy;
1552 atomic_add(copy, &sk->sk_wmem_alloc);
1553 }
1554 offset += copy;
1555 length -= copy;
1556 }
1557 return 0;
1558 error:
1559 cork->length -= length;
1560 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1561 return err;
1562 }
1563 EXPORT_SYMBOL_GPL(ip6_append_data);
1564
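/* Free the duplicated cork options and drop the cork's route reference. */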
1565 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1566 {
1567 if (np->cork.opt) {
1568 kfree(np->cork.opt->dst0opt);
1569 kfree(np->cork.opt->dst1opt);
1570 kfree(np->cork.opt->hopopt);
1571 kfree(np->cork.opt->srcrt);
1572 kfree(np->cork.opt);
1573 np->cork.opt = NULL;
1574 }
1575
1576 if (inet->cork.base.dst) {
1577 dst_release(inet->cork.base.dst);
1578 inet->cork.base.dst = NULL;
1579 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1580 }
1581 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1582 }
1583
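/* Collapse the socket's write queue into one skb chain, prepend the IPv6
 * header (and any corked extension headers) and hand the result to
 * ip6_local_out().
 */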
1584 int ip6_push_pending_frames(struct sock *sk)
1585 {
1586 struct sk_buff *skb, *tmp_skb;
1587 struct sk_buff **tail_skb;
1588 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1589 struct inet_sock *inet = inet_sk(sk);
1590 struct ipv6_pinfo *np = inet6_sk(sk);
1591 struct net *net = sock_net(sk);
1592 struct ipv6hdr *hdr;
1593 struct ipv6_txoptions *opt = np->cork.opt;
1594 struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1595 struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1596 unsigned char proto = fl6->flowi6_proto;
1597 int err = 0;
1598
1599 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1600 goto out;
1601 tail_skb = &(skb_shinfo(skb)->frag_list);
1602
1603 /* move skb->data to ip header from ext header */
1604 if (skb->data < skb_network_header(skb))
1605 __skb_pull(skb, skb_network_offset(skb));
1606 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1607 __skb_pull(tmp_skb, skb_network_header_len(skb));
1608 *tail_skb = tmp_skb;
1609 tail_skb = &(tmp_skb->next);
1610 skb->len += tmp_skb->len;
1611 skb->data_len += tmp_skb->len;
1612 skb->truesize += tmp_skb->truesize;
1613 tmp_skb->destructor = NULL;
1614 tmp_skb->sk = NULL;
1615 }
1616
1617 /* Allow local fragmentation. */
1618 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1619 skb->local_df = 1;
1620
1621 *final_dst = fl6->daddr;
1622 __skb_pull(skb, skb_network_header_len(skb));
1623 if (opt && opt->opt_flen)
1624 ipv6_push_frag_opts(skb, opt, &proto);
1625 if (opt && opt->opt_nflen)
1626 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1627
1628 skb_push(skb, sizeof(struct ipv6hdr));
1629 skb_reset_network_header(skb);
1630 hdr = ipv6_hdr(skb);
1631
1632 *(__be32*)hdr = fl6->flowlabel |
1633 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1634
1635 hdr->hop_limit = np->cork.hop_limit;
1636 hdr->nexthdr = proto;
1637 hdr->saddr = fl6->saddr;
1638 hdr->daddr = *final_dst;
1639
1640 skb->priority = sk->sk_priority;
1641 skb->mark = sk->sk_mark;
1642
1643 skb_dst_set(skb, dst_clone(&rt->dst));
1644 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1645 if (proto == IPPROTO_ICMPV6) {
1646 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1647
1648 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1649 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1650 }
1651
1652 err = ip6_local_out(skb);
1653 if (err) {
1654 if (err > 0)
1655 err = net_xmit_errno(err);
1656 if (err)
1657 goto error;
1658 }
1659
1660 out:
1661 ip6_cork_release(inet, np);
1662 return err;
1663 error:
1664 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1665 goto out;
1666 }
1667 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1668
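/* Discard everything queued by ip6_append_data() and release the cork. */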
1669 void ip6_flush_pending_frames(struct sock *sk)
1670 {
1671 struct sk_buff *skb;
1672
1673 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1674 if (skb_dst(skb))
1675 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1676 IPSTATS_MIB_OUTDISCARDS);
1677 kfree_skb(skb);
1678 }
1679
1680 ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1681 }
1682 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);