net/ipv6/ip6_output.c
1 /*
2 * IPv6 output functions
3 * Linux INET6 implementation
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * Based on linux/net/ipv4/ip_output.c
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * Changes:
16 * A.N.Kuznetsov : arithmetic in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
20 * etc.
21 *
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
26 * for datagram xmit
27 */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
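/*
 * __ip6_local_out - fill in the payload length and run the LOCAL_OUT hook
 *
 * Sets payload_len from skb->len (0 if it would exceed IPV6_MAXPLEN, i.e.
 * a jumbogram) and passes the packet to the NF_INET_LOCAL_OUT netfilter
 * hook with dst_output() as the continuation.
 */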
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63 int len;
64
65 len = skb->len - sizeof(struct ipv6hdr);
66 if (len > IPV6_MAXPLEN)
67 len = 0;
68 ipv6_hdr(skb)->payload_len = htons(len);
69
70 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 skb_dst(skb)->dev, dst_output);
72 }
73
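/*
 * ip6_local_out - output a locally generated IPv6 packet
 *
 * Runs __ip6_local_out(); if the netfilter hook accepts the packet
 * (return value 1), hand it to dst_output() for transmission.
 */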
74 int ip6_local_out(struct sk_buff *skb)
75 {
76 int err;
77
78 err = __ip6_local_out(skb);
79 if (likely(err == 1))
80 err = dst_output(skb);
81
82 return err;
83 }
84 EXPORT_SYMBOL_GPL(ip6_local_out);
85
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89 skb_reset_mac_header(newskb);
90 __skb_pull(newskb, skb_network_offset(newskb));
91 newskb->pkt_type = PACKET_LOOPBACK;
92 newskb->ip_summed = CHECKSUM_UNNECESSARY;
93 WARN_ON(!skb_dst(newskb));
94
95 netif_rx_ni(newskb);
96 return 0;
97 }
98
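/*
 * ip6_finish_output2 - last step of the output path
 *
 * For multicast destinations, loop a copy back to the local stack when
 * required and update the OUTMCAST counters.  Then resolve the neighbour
 * for the dst entry and hand the packet to neigh_output(); if no
 * neighbour is available the packet is dropped and OUTNOROUTES is bumped.
 */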
99 static int ip6_finish_output2(struct sk_buff *skb)
100 {
101 struct dst_entry *dst = skb_dst(skb);
102 struct net_device *dev = dst->dev;
103 struct neighbour *neigh;
104
105 skb->protocol = htons(ETH_P_IPV6);
106 skb->dev = dev;
107
108 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
109 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
110
111 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
112 ((mroute6_socket(dev_net(dev), skb) &&
113 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
114 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
115 &ipv6_hdr(skb)->saddr))) {
116 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
117
118 /* Do not check for IFF_ALLMULTI; multicast routing
119 is not supported in any case.
120 */
121 if (newskb)
122 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
123 newskb, NULL, newskb->dev,
124 ip6_dev_loopback_xmit);
125
126 if (ipv6_hdr(skb)->hop_limit == 0) {
127 IP6_INC_STATS(dev_net(dev), idev,
128 IPSTATS_MIB_OUTDISCARDS);
129 kfree_skb(skb);
130 return 0;
131 }
132 }
133
134 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
135 skb->len);
136 }
137
138 rcu_read_lock();
139 neigh = dst_get_neighbour_noref(dst);
140 if (neigh) {
141 int res = neigh_output(neigh, skb);
142
143 rcu_read_unlock();
144 return res;
145 }
146 rcu_read_unlock();
147 IP6_INC_STATS_BH(dev_net(dst->dev),
148 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
149 kfree_skb(skb);
150 return -EINVAL;
151 }
152
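/* Fragment the packet when it exceeds the path MTU (and is not GSO) or
 * when the route requires fragmenting everything (dst_allfrag), otherwise
 * go straight to ip6_finish_output2().
 */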
153 static int ip6_finish_output(struct sk_buff *skb)
154 {
155 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156 dst_allfrag(skb_dst(skb)))
157 return ip6_fragment(skb, ip6_finish_output2);
158 else
159 return ip6_finish_output2(skb);
160 }
161
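/*
 * ip6_output - netfilter-aware entry point for outgoing packets
 *
 * Drops the packet if IPv6 is administratively disabled on the output
 * device, otherwise runs the NF_INET_POST_ROUTING hook (skipped for
 * rerouted packets) and continues with ip6_finish_output().
 */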
162 int ip6_output(struct sk_buff *skb)
163 {
164 struct net_device *dev = skb_dst(skb)->dev;
165 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
166 if (unlikely(idev->cnf.disable_ipv6)) {
167 IP6_INC_STATS(dev_net(dev), idev,
168 IPSTATS_MIB_OUTDISCARDS);
169 kfree_skb(skb);
170 return 0;
171 }
172
173 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
174 ip6_finish_output,
175 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
176 }
177
178 /*
179 * xmit an sk_buff (used by TCP, SCTP and DCCP)
180 */
181
182 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
183 struct ipv6_txoptions *opt, int tclass)
184 {
185 struct net *net = sock_net(sk);
186 struct ipv6_pinfo *np = inet6_sk(sk);
187 struct in6_addr *first_hop = &fl6->daddr;
188 struct dst_entry *dst = skb_dst(skb);
189 struct ipv6hdr *hdr;
190 u8 proto = fl6->flowi6_proto;
191 int seg_len = skb->len;
192 int hlimit = -1;
193 u32 mtu;
194
195 if (opt) {
196 unsigned int head_room;
197
198 /* First: extension headers may take lots of space (~8K for now);
199 MAX_HEADER is not enough.
200 */
201 head_room = opt->opt_nflen + opt->opt_flen;
202 seg_len += head_room;
203 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
204
205 if (skb_headroom(skb) < head_room) {
206 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
207 if (skb2 == NULL) {
208 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
209 IPSTATS_MIB_OUTDISCARDS);
210 kfree_skb(skb);
211 return -ENOBUFS;
212 }
213 kfree_skb(skb);
214 skb = skb2;
215 skb_set_owner_w(skb, sk);
216 }
217 if (opt->opt_flen)
218 ipv6_push_frag_opts(skb, opt, &proto);
219 if (opt->opt_nflen)
220 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
221 }
222
223 skb_push(skb, sizeof(struct ipv6hdr));
224 skb_reset_network_header(skb);
225 hdr = ipv6_hdr(skb);
226
227 /*
228 * Fill in the IPv6 header
229 */
230 if (np)
231 hlimit = np->hop_limit;
232 if (hlimit < 0)
233 hlimit = ip6_dst_hoplimit(dst);
234
235 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
236
237 hdr->payload_len = htons(seg_len);
238 hdr->nexthdr = proto;
239 hdr->hop_limit = hlimit;
240
241 hdr->saddr = fl6->saddr;
242 hdr->daddr = *first_hop;
243
244 skb->priority = sk->sk_priority;
245 skb->mark = sk->sk_mark;
246
247 mtu = dst_mtu(dst);
248 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
249 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
250 IPSTATS_MIB_OUT, skb->len);
251 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
252 dst->dev, dst_output);
253 }
254
255 if (net_ratelimit())
256 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
257 skb->dev = dst->dev;
258 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
259 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
260 kfree_skb(skb);
261 return -EMSGSIZE;
262 }
263
264 EXPORT_SYMBOL(ip6_xmit);
265
266 /*
267 * To avoid extra problems, ND packets are sent through this
268 * routine. It's code duplication, but I really want to avoid
269 * extra checks since ipv6_build_header is used by TCP (which
270 * is performance critical for us).
271 */
272
273 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
274 const struct in6_addr *saddr, const struct in6_addr *daddr,
275 int proto, int len)
276 {
277 struct ipv6_pinfo *np = inet6_sk(sk);
278 struct ipv6hdr *hdr;
279
280 skb->protocol = htons(ETH_P_IPV6);
281 skb->dev = dev;
282
283 skb_reset_network_header(skb);
284 skb_put(skb, sizeof(struct ipv6hdr));
285 hdr = ipv6_hdr(skb);
286
287 *(__be32*)hdr = htonl(0x60000000);
288
289 hdr->payload_len = htons(len);
290 hdr->nexthdr = proto;
291 hdr->hop_limit = np->hop_limit;
292
293 hdr->saddr = *saddr;
294 hdr->daddr = *daddr;
295
296 return 0;
297 }
298
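/*
 * ip6_call_ra_chain - deliver a Router Alert packet to interested sockets
 *
 * Walks the global ip6_ra_chain and clones the packet to every raw socket
 * registered for the given Router Alert value (and bound to the incoming
 * device, if bound at all).  Returns 1 if at least one socket consumed
 * the packet, 0 otherwise.
 */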
299 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
300 {
301 struct ip6_ra_chain *ra;
302 struct sock *last = NULL;
303
304 read_lock(&ip6_ra_lock);
305 for (ra = ip6_ra_chain; ra; ra = ra->next) {
306 struct sock *sk = ra->sk;
307 if (sk && ra->sel == sel &&
308 (!sk->sk_bound_dev_if ||
309 sk->sk_bound_dev_if == skb->dev->ifindex)) {
310 if (last) {
311 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
312 if (skb2)
313 rawv6_rcv(last, skb2);
314 }
315 last = sk;
316 }
317 }
318
319 if (last) {
320 rawv6_rcv(last, skb);
321 read_unlock(&ip6_ra_lock);
322 return 1;
323 }
324 read_unlock(&ip6_ra_lock);
325 return 0;
326 }
327
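/*
 * ip6_forward_proxy_check - decide how to treat a packet for a proxied address
 *
 * Returns 1 if the packet is a neighbour discovery message that should be
 * handed to the local input path, -1 if it must be discarded (link-local
 * destination), and 0 if it can simply be forwarded.
 */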
328 static int ip6_forward_proxy_check(struct sk_buff *skb)
329 {
330 struct ipv6hdr *hdr = ipv6_hdr(skb);
331 u8 nexthdr = hdr->nexthdr;
332 __be16 frag_off;
333 int offset;
334
335 if (ipv6_ext_hdr(nexthdr)) {
336 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
337 if (offset < 0)
338 return 0;
339 } else
340 offset = sizeof(struct ipv6hdr);
341
342 if (nexthdr == IPPROTO_ICMPV6) {
343 struct icmp6hdr *icmp6;
344
345 if (!pskb_may_pull(skb, (skb_network_header(skb) +
346 offset + 1 - skb->data)))
347 return 0;
348
349 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
350
351 switch (icmp6->icmp6_type) {
352 case NDISC_ROUTER_SOLICITATION:
353 case NDISC_ROUTER_ADVERTISEMENT:
354 case NDISC_NEIGHBOUR_SOLICITATION:
355 case NDISC_NEIGHBOUR_ADVERTISEMENT:
356 case NDISC_REDIRECT:
357 /* For reactions involving a unicast neighbour discovery
358 * message destined to the proxied address, pass it to
359 * the input function.
360 */
361 return 1;
362 default:
363 break;
364 }
365 }
366
367 /*
368 * The proxying router can't forward traffic sent to a link-local
369 * address, so signal the sender and discard the packet. This
370 * behavior is clarified by the MIPv6 specification.
371 */
372 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
373 dst_link_failure(skb);
374 return -1;
375 }
376
377 return 0;
378 }
379
380 static inline int ip6_forward_finish(struct sk_buff *skb)
381 {
382 return dst_output(skb);
383 }
384
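/*
 * ip6_forward - forward a packet that is not addressed to this host
 *
 * Performs the policy and sanity checks for forwarding: forwarding must
 * be enabled, the hop limit must be large enough, XFRM policy must allow
 * it, and the packet must fit the outgoing MTU (otherwise an ICMPv6
 * "packet too big" error is returned to the sender).  Handles Router
 * Alert delivery, NDP proxying and redirect generation, decrements the
 * hop limit and finally passes the packet through the NF_INET_FORWARD
 * hook to ip6_forward_finish().
 */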
385 int ip6_forward(struct sk_buff *skb)
386 {
387 struct dst_entry *dst = skb_dst(skb);
388 struct ipv6hdr *hdr = ipv6_hdr(skb);
389 struct inet6_skb_parm *opt = IP6CB(skb);
390 struct net *net = dev_net(dst->dev);
391 u32 mtu;
392
393 if (net->ipv6.devconf_all->forwarding == 0)
394 goto error;
395
396 if (skb_warn_if_lro(skb))
397 goto drop;
398
399 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
400 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
401 goto drop;
402 }
403
404 if (skb->pkt_type != PACKET_HOST)
405 goto drop;
406
407 skb_forward_csum(skb);
408
409 /*
410 * We do not do any processing on
411 * RA packets, pushing them to user level AS IS
412 * without any warranty that the application will be able
413 * to interpret them. The reason is that we
414 * cannot do anything clever here.
415 *
416 * We are not the end node, so if the packet contains
417 * AH/ESP we cannot do anything with it.
418 * Defragmentation would also be a mistake; RA packets
419 * cannot be fragmented, because there is no guarantee
420 * that different fragments will go along one path. --ANK
421 */
422 if (opt->ra) {
423 u8 *ptr = skb_network_header(skb) + opt->ra;
424 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
425 return 0;
426 }
427
428 /*
429 * check and decrement ttl
430 */
431 if (hdr->hop_limit <= 1) {
432 /* Force OUTPUT device used as source address */
433 skb->dev = dst->dev;
434 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
435 IP6_INC_STATS_BH(net,
436 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
437
438 kfree_skb(skb);
439 return -ETIMEDOUT;
440 }
441
442 /* XXX: idev->cnf.proxy_ndp? */
443 if (net->ipv6.devconf_all->proxy_ndp &&
444 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
445 int proxied = ip6_forward_proxy_check(skb);
446 if (proxied > 0)
447 return ip6_input(skb);
448 else if (proxied < 0) {
449 IP6_INC_STATS(net, ip6_dst_idev(dst),
450 IPSTATS_MIB_INDISCARDS);
451 goto drop;
452 }
453 }
454
455 if (!xfrm6_route_forward(skb)) {
456 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
457 goto drop;
458 }
459 dst = skb_dst(skb);
460
461 /* IPv6 specs say nothing about it, but it is clear that we cannot
462 send redirects to source routed frames.
463 We don't send redirects to frames decapsulated from IPsec.
464 */
465 if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
466 struct in6_addr *target = NULL;
467 struct rt6_info *rt;
468
469 /*
470 * incoming and outgoing devices are the same
471 * send a redirect.
472 */
473
474 rt = (struct rt6_info *) dst;
475 if (rt->rt6i_flags & RTF_GATEWAY)
476 target = &rt->rt6i_gateway;
477 else
478 target = &hdr->daddr;
479
480 if (!rt->rt6i_peer)
481 rt6_bind_peer(rt, 1);
482
483 /* Limit redirects both by destination (here)
484 and by source (inside ndisc_send_redirect)
485 */
486 if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
487 ndisc_send_redirect(skb, target);
488 } else {
489 int addrtype = ipv6_addr_type(&hdr->saddr);
490
491 /* This check is security critical. */
492 if (addrtype == IPV6_ADDR_ANY ||
493 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
494 goto error;
495 if (addrtype & IPV6_ADDR_LINKLOCAL) {
496 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
497 ICMPV6_NOT_NEIGHBOUR, 0);
498 goto error;
499 }
500 }
501
502 mtu = dst_mtu(dst);
503 if (mtu < IPV6_MIN_MTU)
504 mtu = IPV6_MIN_MTU;
505
506 if (skb->len > mtu && !skb_is_gso(skb)) {
507 /* Again, force OUTPUT device used as source address */
508 skb->dev = dst->dev;
509 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
510 IP6_INC_STATS_BH(net,
511 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
512 IP6_INC_STATS_BH(net,
513 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
514 kfree_skb(skb);
515 return -EMSGSIZE;
516 }
517
518 if (skb_cow(skb, dst->dev->hard_header_len)) {
519 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
520 goto drop;
521 }
522
523 hdr = ipv6_hdr(skb);
524
525 /* Mangling the hop limit is delayed until after the skb COW */
526
527 hdr->hop_limit--;
528
529 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
530 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
531 ip6_forward_finish);
532
533 error:
534 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
535 drop:
536 kfree_skb(skb);
537 return -EINVAL;
538 }
539
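/* Copy the per-packet metadata (type, priority, dst, marks, netfilter
 * and security state) from the original skb to a newly built fragment.
 */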
540 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
541 {
542 to->pkt_type = from->pkt_type;
543 to->priority = from->priority;
544 to->protocol = from->protocol;
545 skb_dst_drop(to);
546 skb_dst_set(to, dst_clone(skb_dst(from)));
547 to->dev = from->dev;
548 to->mark = from->mark;
549
550 #ifdef CONFIG_NET_SCHED
551 to->tc_index = from->tc_index;
552 #endif
553 nf_copy(to, from);
554 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
555 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
556 to->nf_trace = from->nf_trace;
557 #endif
558 skb_copy_secmark(to, from);
559 }
560
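/*
 * ip6_find_1stfragopt - locate where the Fragment header must be inserted
 *
 * Walks the extension header chain and returns the length of the
 * unfragmentable part of the packet, i.e. the offset at which the
 * Fragment header has to be inserted; *nexthdr is left pointing at the
 * "next header" field that will be rewritten to NEXTHDR_FRAGMENT.
 */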
561 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
562 {
563 u16 offset = sizeof(struct ipv6hdr);
564 struct ipv6_opt_hdr *exthdr =
565 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
566 unsigned int packet_len = skb->tail - skb->network_header;
567 int found_rhdr = 0;
568 *nexthdr = &ipv6_hdr(skb)->nexthdr;
569
570 while (offset + 1 <= packet_len) {
571
572 switch (**nexthdr) {
573
574 case NEXTHDR_HOP:
575 break;
576 case NEXTHDR_ROUTING:
577 found_rhdr = 1;
578 break;
579 case NEXTHDR_DEST:
580 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
581 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
582 break;
583 #endif
584 if (found_rhdr)
585 return offset;
586 break;
587 default:
588 return offset;
589 }
590
591 offset += ipv6_optlen(exthdr);
592 *nexthdr = &exthdr->nexthdr;
593 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
594 offset);
595 }
596
597 return offset;
598 }
599
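/*
 * ipv6_select_ident - choose the Identification value for a Fragment header
 *
 * Prefers a per-destination counter kept in the route's inet_peer entry;
 * if no peer is available (or DST_NOPEER is set) it falls back to a
 * global atomic counter, skipping the value 0.
 */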
600 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
601 {
602 static atomic_t ipv6_fragmentation_id;
603 int old, new;
604
605 if (rt && !(rt->dst.flags & DST_NOPEER)) {
606 struct inet_peer *peer;
607
608 if (!rt->rt6i_peer)
609 rt6_bind_peer(rt, 1);
610 peer = rt->rt6i_peer;
611 if (peer) {
612 fhdr->identification = htonl(inet_getid(peer, 0));
613 return;
614 }
615 }
616 do {
617 old = atomic_read(&ipv6_fragmentation_id);
618 new = old + 1;
619 if (!new)
620 new = 1;
621 } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
622 fhdr->identification = htonl(new);
623 }
624
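/*
 * ip6_fragment - split an oversized packet into MTU-sized fragments
 *
 * When the skb already carries a well-formed frag list a fast path reuses
 * those buffers; otherwise the slow path allocates a new skb per fragment
 * and copies the data.  Each fragment gets its own Fragment extension
 * header and is handed to @output.  Returns 0 on success or a negative
 * errno, updating the FRAGOKS/FRAGFAILS/FRAGCREATES counters as it goes.
 */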
625 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
626 {
627 struct sk_buff *frag;
628 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
629 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
630 struct ipv6hdr *tmp_hdr;
631 struct frag_hdr *fh;
632 unsigned int mtu, hlen, left, len;
633 int hroom, troom;
634 __be32 frag_id = 0;
635 int ptr, offset = 0, err = 0;
636 u8 *prevhdr, nexthdr = 0;
637 struct net *net = dev_net(skb_dst(skb)->dev);
638
639 hlen = ip6_find_1stfragopt(skb, &prevhdr);
640 nexthdr = *prevhdr;
641
642 mtu = ip6_skb_dst_mtu(skb);
643
644 /* We must not fragment if the socket is set to force MTU discovery
645 * or if the skb is not generated by a local socket.
646 */
647 if (!skb->local_df && skb->len > mtu) {
648 skb->dev = skb_dst(skb)->dev;
649 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
650 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
651 IPSTATS_MIB_FRAGFAILS);
652 kfree_skb(skb);
653 return -EMSGSIZE;
654 }
655
656 if (np && np->frag_size < mtu) {
657 if (np->frag_size)
658 mtu = np->frag_size;
659 }
660 mtu -= hlen + sizeof(struct frag_hdr);
661
662 if (skb_has_frag_list(skb)) {
663 int first_len = skb_pagelen(skb);
664 struct sk_buff *frag2;
665
666 if (first_len - hlen > mtu ||
667 ((first_len - hlen) & 7) ||
668 skb_cloned(skb))
669 goto slow_path;
670
671 skb_walk_frags(skb, frag) {
672 /* Correct geometry. */
673 if (frag->len > mtu ||
674 ((frag->len & 7) && frag->next) ||
675 skb_headroom(frag) < hlen)
676 goto slow_path_clean;
677
678 /* Partially cloned skb? */
679 if (skb_shared(frag))
680 goto slow_path_clean;
681
682 BUG_ON(frag->sk);
683 if (skb->sk) {
684 frag->sk = skb->sk;
685 frag->destructor = sock_wfree;
686 }
687 skb->truesize -= frag->truesize;
688 }
689
690 err = 0;
691 offset = 0;
692 frag = skb_shinfo(skb)->frag_list;
693 skb_frag_list_init(skb);
694 /* BUILD HEADER */
695
696 *prevhdr = NEXTHDR_FRAGMENT;
697 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
698 if (!tmp_hdr) {
699 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
700 IPSTATS_MIB_FRAGFAILS);
701 return -ENOMEM;
702 }
703
704 __skb_pull(skb, hlen);
705 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
706 __skb_push(skb, hlen);
707 skb_reset_network_header(skb);
708 memcpy(skb_network_header(skb), tmp_hdr, hlen);
709
710 ipv6_select_ident(fh, rt);
711 fh->nexthdr = nexthdr;
712 fh->reserved = 0;
713 fh->frag_off = htons(IP6_MF);
714 frag_id = fh->identification;
715
716 first_len = skb_pagelen(skb);
717 skb->data_len = first_len - skb_headlen(skb);
718 skb->len = first_len;
719 ipv6_hdr(skb)->payload_len = htons(first_len -
720 sizeof(struct ipv6hdr));
721
722 dst_hold(&rt->dst);
723
724 for (;;) {
725 /* Prepare the header of the next frame
726 * before the previous one goes down. */
727 if (frag) {
728 frag->ip_summed = CHECKSUM_NONE;
729 skb_reset_transport_header(frag);
730 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
731 __skb_push(frag, hlen);
732 skb_reset_network_header(frag);
733 memcpy(skb_network_header(frag), tmp_hdr,
734 hlen);
735 offset += skb->len - hlen - sizeof(struct frag_hdr);
736 fh->nexthdr = nexthdr;
737 fh->reserved = 0;
738 fh->frag_off = htons(offset);
739 if (frag->next != NULL)
740 fh->frag_off |= htons(IP6_MF);
741 fh->identification = frag_id;
742 ipv6_hdr(frag)->payload_len =
743 htons(frag->len -
744 sizeof(struct ipv6hdr));
745 ip6_copy_metadata(frag, skb);
746 }
747
748 err = output(skb);
749 if (!err)
750 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
751 IPSTATS_MIB_FRAGCREATES);
752
753 if (err || !frag)
754 break;
755
756 skb = frag;
757 frag = skb->next;
758 skb->next = NULL;
759 }
760
761 kfree(tmp_hdr);
762
763 if (err == 0) {
764 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
765 IPSTATS_MIB_FRAGOKS);
766 dst_release(&rt->dst);
767 return 0;
768 }
769
770 while (frag) {
771 skb = frag->next;
772 kfree_skb(frag);
773 frag = skb;
774 }
775
776 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
777 IPSTATS_MIB_FRAGFAILS);
778 dst_release(&rt->dst);
779 return err;
780
781 slow_path_clean:
782 skb_walk_frags(skb, frag2) {
783 if (frag2 == frag)
784 break;
785 frag2->sk = NULL;
786 frag2->destructor = NULL;
787 skb->truesize += frag2->truesize;
788 }
789 }
790
791 slow_path:
792 left = skb->len - hlen; /* Space per frame */
793 ptr = hlen; /* Where to start from */
794
795 /*
796 * Fragment the datagram.
797 */
798
799 *prevhdr = NEXTHDR_FRAGMENT;
800 hroom = LL_RESERVED_SPACE(rt->dst.dev);
801 troom = rt->dst.dev->needed_tailroom;
802
803 /*
804 * Keep copying data until we run out.
805 */
806 while (left > 0) {
807 len = left;
808 /* IF: it doesn't fit, use 'mtu' - the data space left */
809 if (len > mtu)
810 len = mtu;
811 /* IF: we are not sending up to and including the packet end
812 then align the next start on an eight byte boundary */
813 if (len < left) {
814 len &= ~7;
815 }
816 /*
817 * Allocate buffer.
818 */
819
820 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
821 hroom + troom, GFP_ATOMIC)) == NULL) {
822 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
823 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
824 IPSTATS_MIB_FRAGFAILS);
825 err = -ENOMEM;
826 goto fail;
827 }
828
829 /*
830 * Set up data on packet
831 */
832
833 ip6_copy_metadata(frag, skb);
834 skb_reserve(frag, hroom);
835 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
836 skb_reset_network_header(frag);
837 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
838 frag->transport_header = (frag->network_header + hlen +
839 sizeof(struct frag_hdr));
840
841 /*
842 * Charge the memory for the fragment to any owner
843 * it might possess
844 */
845 if (skb->sk)
846 skb_set_owner_w(frag, skb->sk);
847
848 /*
849 * Copy the packet header into the new buffer.
850 */
851 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
852
853 /*
854 * Build fragment header.
855 */
856 fh->nexthdr = nexthdr;
857 fh->reserved = 0;
858 if (!frag_id) {
859 ipv6_select_ident(fh, rt);
860 frag_id = fh->identification;
861 } else
862 fh->identification = frag_id;
863
864 /*
865 * Copy a block of the IP datagram.
866 */
867 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
868 BUG();
869 left -= len;
870
871 fh->frag_off = htons(offset);
872 if (left > 0)
873 fh->frag_off |= htons(IP6_MF);
874 ipv6_hdr(frag)->payload_len = htons(frag->len -
875 sizeof(struct ipv6hdr));
876
877 ptr += len;
878 offset += len;
879
880 /*
881 * Put this fragment into the sending queue.
882 */
883 err = output(frag);
884 if (err)
885 goto fail;
886
887 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
888 IPSTATS_MIB_FRAGCREATES);
889 }
890 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
891 IPSTATS_MIB_FRAGOKS);
892 kfree_skb(skb);
893 return err;
894
895 fail:
896 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
897 IPSTATS_MIB_FRAGFAILS);
898 kfree_skb(skb);
899 return err;
900 }
901
902 static inline int ip6_rt_check(const struct rt6key *rt_key,
903 const struct in6_addr *fl_addr,
904 const struct in6_addr *addr_cache)
905 {
906 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
907 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
908 }
909
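/*
 * ip6_sk_dst_check - validate a cached socket route against a flow
 *
 * Returns the dst if it is still usable for this flow, otherwise releases
 * it and returns NULL so the caller performs a fresh lookup.
 */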
910 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
911 struct dst_entry *dst,
912 const struct flowi6 *fl6)
913 {
914 struct ipv6_pinfo *np = inet6_sk(sk);
915 struct rt6_info *rt = (struct rt6_info *)dst;
916
917 if (!dst)
918 goto out;
919
920 /* Yes, checking route validity in the not-connected
921 * case is not very simple. Take into account
922 * that we do not support routing by source, TOS,
923 * or MSG_DONTROUTE --ANK (980726)
924 *
925 * 1. ip6_rt_check(): If the route was a host route,
926 * check that the cached destination is current.
927 * If it is a network route, we still may
928 * check its validity using a saved pointer
929 * to the last used address: daddr_cache.
930 * We do not want to save the whole address now
931 * (because the main consumer of this service
932 * is tcp, which does not have this problem),
933 * so the last trick works only on connected
934 * sockets.
935 * 2. oif also should be the same.
936 */
937 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
938 #ifdef CONFIG_IPV6_SUBTREES
939 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
940 #endif
941 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
942 dst_release(dst);
943 dst = NULL;
944 }
945
946 out:
947 return dst;
948 }
949
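/*
 * ip6_dst_lookup_tail - common tail of the dst lookup helpers
 *
 * Performs the routing lookup if *dst is not already set, fills in a
 * source address for the flow when none was given, and (with optimistic
 * DAD enabled) may redo the lookup towards the default router when the
 * chosen source address is still optimistic and the neighbour entry is
 * not yet valid.  On failure *dst is released and set to NULL.
 */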
950 static int ip6_dst_lookup_tail(struct sock *sk,
951 struct dst_entry **dst, struct flowi6 *fl6)
952 {
953 struct net *net = sock_net(sk);
954 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
955 struct neighbour *n;
956 #endif
957 int err;
958
959 if (*dst == NULL)
960 *dst = ip6_route_output(net, sk, fl6);
961
962 if ((err = (*dst)->error))
963 goto out_err_release;
964
965 if (ipv6_addr_any(&fl6->saddr)) {
966 struct rt6_info *rt = (struct rt6_info *) *dst;
967 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
968 sk ? inet6_sk(sk)->srcprefs : 0,
969 &fl6->saddr);
970 if (err)
971 goto out_err_release;
972 }
973
974 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
975 /*
976 * Here, if the dst entry we've looked up
977 * has a neighbour entry that is in the INCOMPLETE
978 * state and the src address from the flow is
979 * marked as OPTIMISTIC, we release the found
980 * dst entry and replace it with the
981 * dst entry of the nexthop router.
982 */
983 rcu_read_lock();
984 n = dst_get_neighbour_noref(*dst);
985 if (n && !(n->nud_state & NUD_VALID)) {
986 struct inet6_ifaddr *ifp;
987 struct flowi6 fl_gw6;
988 int redirect;
989
990 rcu_read_unlock();
991 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
992 (*dst)->dev, 1);
993
994 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
995 if (ifp)
996 in6_ifa_put(ifp);
997
998 if (redirect) {
999 /*
1000 * We need to get the dst entry for the
1001 * default router instead
1002 */
1003 dst_release(*dst);
1004 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1005 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1006 *dst = ip6_route_output(net, sk, &fl_gw6);
1007 if ((err = (*dst)->error))
1008 goto out_err_release;
1009 }
1010 } else {
1011 rcu_read_unlock();
1012 }
1013 #endif
1014
1015 return 0;
1016
1017 out_err_release:
1018 if (err == -ENETUNREACH)
1019 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1020 dst_release(*dst);
1021 *dst = NULL;
1022 return err;
1023 }
1024
1025 /**
1026 * ip6_dst_lookup - perform route lookup on flow
1027 * @sk: socket which provides route info
1028 * @dst: pointer to dst_entry * for result
1029 * @fl6: flow to lookup
1030 *
1031 * This function performs a route lookup on the given flow.
1032 *
1033 * It returns zero on success, or a standard errno code on error.
1034 */
1035 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1036 {
1037 *dst = NULL;
1038 return ip6_dst_lookup_tail(sk, dst, fl6);
1039 }
1040 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1041
1042 /**
1043 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1044 * @sk: socket which provides route info
1045 * @fl6: flow to lookup
1046 * @final_dst: final destination address for ipsec lookup
1047 * @can_sleep: we are in a sleepable context
1048 *
1049 * This function performs a route lookup on the given flow.
1050 *
1051 * It returns a valid dst pointer on success, or a pointer encoded
1052 * error code.
1053 */
1054 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1055 const struct in6_addr *final_dst,
1056 bool can_sleep)
1057 {
1058 struct dst_entry *dst = NULL;
1059 int err;
1060
1061 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1062 if (err)
1063 return ERR_PTR(err);
1064 if (final_dst)
1065 fl6->daddr = *final_dst;
1066 if (can_sleep)
1067 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1068
1069 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1070 }
1071 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1072
1073 /**
1074 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1075 * @sk: socket which provides the dst cache and route info
1076 * @fl6: flow to lookup
1077 * @final_dst: final destination address for ipsec lookup
1078 * @can_sleep: we are in a sleepable context
1079 *
1080 * This function performs a route lookup on the given flow with the
1081 * possibility of using the cached route in the socket if it is valid.
1082 * It will take the socket dst lock when operating on the dst cache.
1083 * As a result, this function can only be used in process context.
1084 *
1085 * It returns a valid dst pointer on success, or a pointer encoded
1086 * error code.
1087 */
1088 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1089 const struct in6_addr *final_dst,
1090 bool can_sleep)
1091 {
1092 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1093 int err;
1094
1095 dst = ip6_sk_dst_check(sk, dst, fl6);
1096
1097 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1098 if (err)
1099 return ERR_PTR(err);
1100 if (final_dst)
1101 fl6->daddr = *final_dst;
1102 if (can_sleep)
1103 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1104
1105 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1106 }
1107 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1108
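/*
 * ip6_ufo_append_data - queue data as one large skb for UDP fragmentation offload
 *
 * Builds (or extends) a single skb on the socket write queue, stores the
 * payload in page fragments and records the GSO size and fragment id so
 * that the device (or the GSO layer) can segment the datagram later.
 */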
1109 static inline int ip6_ufo_append_data(struct sock *sk,
1110 int getfrag(void *from, char *to, int offset, int len,
1111 int odd, struct sk_buff *skb),
1112 void *from, int length, int hh_len, int fragheaderlen,
1113 int transhdrlen, int mtu, unsigned int flags,
1114 struct rt6_info *rt)
1115
1116 {
1117 struct sk_buff *skb;
1118 int err;
1119
1120 /* The network device supports UDP large send offload,
1121 * so create one single skb containing the complete
1122 * UDP datagram.
1123 */
1124 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1125 skb = sock_alloc_send_skb(sk,
1126 hh_len + fragheaderlen + transhdrlen + 20,
1127 (flags & MSG_DONTWAIT), &err);
1128 if (skb == NULL)
1129 return err;
1130
1131 /* reserve space for Hardware header */
1132 skb_reserve(skb, hh_len);
1133
1134 /* create space for UDP/IP header */
1135 skb_put(skb, fragheaderlen + transhdrlen);
1136
1137 /* initialize network header pointer */
1138 skb_reset_network_header(skb);
1139
1140 /* initialize protocol header pointer */
1141 skb->transport_header = skb->network_header + fragheaderlen;
1142
1143 skb->ip_summed = CHECKSUM_PARTIAL;
1144 skb->csum = 0;
1145 }
1146
1147 err = skb_append_datato_frags(sk, skb, getfrag, from,
1148 (length - transhdrlen));
1149 if (!err) {
1150 struct frag_hdr fhdr;
1151
1152 /* Specify the length of each IPv6 datagram fragment.
1153 * It has to be a multiple of 8.
1154 */
1155 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1156 sizeof(struct frag_hdr)) & ~7;
1157 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1158 ipv6_select_ident(&fhdr, rt);
1159 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1160 __skb_queue_tail(&sk->sk_write_queue, skb);
1161
1162 return 0;
1163 }
1164 /* There is not enough support to do UDP LSO,
1165 * so follow the normal path.
1166 */
1167 kfree_skb(skb);
1168
1169 return err;
1170 }
1171
1172 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1173 gfp_t gfp)
1174 {
1175 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1176 }
1177
1178 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1179 gfp_t gfp)
1180 {
1181 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1182 }
1183
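/*
 * ip6_append_data - append user data to the socket's pending (corked) queue
 *
 * On the first call it sets up the cork state (duplicates the tx options,
 * takes a reference on the route, records the MTU); subsequent calls
 * reuse that state.  Data is packed into skbs no larger than the MTU
 * (or handed to ip6_ufo_append_data() on UFO-capable devices), ready to
 * be sent by ip6_push_pending_frames().
 */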
1184 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1185 int offset, int len, int odd, struct sk_buff *skb),
1186 void *from, int length, int transhdrlen,
1187 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1188 struct rt6_info *rt, unsigned int flags, int dontfrag)
1189 {
1190 struct inet_sock *inet = inet_sk(sk);
1191 struct ipv6_pinfo *np = inet6_sk(sk);
1192 struct inet_cork *cork;
1193 struct sk_buff *skb;
1194 unsigned int maxfraglen, fragheaderlen;
1195 int exthdrlen;
1196 int dst_exthdrlen;
1197 int hh_len;
1198 int mtu;
1199 int copy;
1200 int err;
1201 int offset = 0;
1202 int csummode = CHECKSUM_NONE;
1203 __u8 tx_flags = 0;
1204
1205 if (flags&MSG_PROBE)
1206 return 0;
1207 cork = &inet->cork.base;
1208 if (skb_queue_empty(&sk->sk_write_queue)) {
1209 /*
1210 * setup for corking
1211 */
1212 if (opt) {
1213 if (WARN_ON(np->cork.opt))
1214 return -EINVAL;
1215
1216 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1217 if (unlikely(np->cork.opt == NULL))
1218 return -ENOBUFS;
1219
1220 np->cork.opt->tot_len = opt->tot_len;
1221 np->cork.opt->opt_flen = opt->opt_flen;
1222 np->cork.opt->opt_nflen = opt->opt_nflen;
1223
1224 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1225 sk->sk_allocation);
1226 if (opt->dst0opt && !np->cork.opt->dst0opt)
1227 return -ENOBUFS;
1228
1229 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1230 sk->sk_allocation);
1231 if (opt->dst1opt && !np->cork.opt->dst1opt)
1232 return -ENOBUFS;
1233
1234 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1235 sk->sk_allocation);
1236 if (opt->hopopt && !np->cork.opt->hopopt)
1237 return -ENOBUFS;
1238
1239 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1240 sk->sk_allocation);
1241 if (opt->srcrt && !np->cork.opt->srcrt)
1242 return -ENOBUFS;
1243
1244 /* need source address above miyazawa */
1245 }
1246 dst_hold(&rt->dst);
1247 cork->dst = &rt->dst;
1248 inet->cork.fl.u.ip6 = *fl6;
1249 np->cork.hop_limit = hlimit;
1250 np->cork.tclass = tclass;
1251 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1252 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1253 if (np->frag_size < mtu) {
1254 if (np->frag_size)
1255 mtu = np->frag_size;
1256 }
1257 cork->fragsize = mtu;
1258 if (dst_allfrag(rt->dst.path))
1259 cork->flags |= IPCORK_ALLFRAG;
1260 cork->length = 0;
1261 sk->sk_sndmsg_page = NULL;
1262 sk->sk_sndmsg_off = 0;
1263 exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1264 length += exthdrlen;
1265 transhdrlen += exthdrlen;
1266 dst_exthdrlen = rt->dst.header_len;
1267 } else {
1268 rt = (struct rt6_info *)cork->dst;
1269 fl6 = &inet->cork.fl.u.ip6;
1270 opt = np->cork.opt;
1271 transhdrlen = 0;
1272 exthdrlen = 0;
1273 dst_exthdrlen = 0;
1274 mtu = cork->fragsize;
1275 }
1276
1277 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1278
1279 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1280 (opt ? opt->opt_nflen : 0);
1281 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1282
1283 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1284 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1285 ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1286 return -EMSGSIZE;
1287 }
1288 }
1289
1290 /* For UDP, check if TX timestamp is enabled */
1291 if (sk->sk_type == SOCK_DGRAM) {
1292 err = sock_tx_timestamp(sk, &tx_flags);
1293 if (err)
1294 goto error;
1295 }
1296
1297 /*
1298 * Let's try using as much space as possible.
1299 * Use MTU if total length of the message fits into the MTU.
1300 * Otherwise, we need to reserve fragment header and
1301 * fragment alignment (= 8-15 octets, in total).
1302 *
1303 * Note that we may need to "move" the data from the tail
1304 * of the buffer to the new fragment when we split
1305 * the message.
1306 *
1307 * FIXME: It may be fragmented into multiple chunks
1308 * at once if non-fragmentable extension headers
1309 * are too large.
1310 * --yoshfuji
1311 */
1312
1313 cork->length += length;
1314 if (length > mtu) {
1315 int proto = sk->sk_protocol;
1316 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1317 ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1318 return -EMSGSIZE;
1319 }
1320
1321 if (proto == IPPROTO_UDP &&
1322 (rt->dst.dev->features & NETIF_F_UFO)) {
1323
1324 err = ip6_ufo_append_data(sk, getfrag, from, length,
1325 hh_len, fragheaderlen,
1326 transhdrlen, mtu, flags, rt);
1327 if (err)
1328 goto error;
1329 return 0;
1330 }
1331 }
1332
1333 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1334 goto alloc_new_skb;
1335
1336 while (length > 0) {
1337 /* Check if the remaining data fits into current packet. */
1338 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1339 if (copy < length)
1340 copy = maxfraglen - skb->len;
1341
1342 if (copy <= 0) {
1343 char *data;
1344 unsigned int datalen;
1345 unsigned int fraglen;
1346 unsigned int fraggap;
1347 unsigned int alloclen;
1348 struct sk_buff *skb_prev;
1349 alloc_new_skb:
1350 skb_prev = skb;
1351
1352 /* There's no room in the current skb */
1353 if (skb_prev)
1354 fraggap = skb_prev->len - maxfraglen;
1355 else
1356 fraggap = 0;
1357
1358 /*
1359 * If remaining data exceeds the mtu,
1360 * we know we need more fragment(s).
1361 */
1362 datalen = length + fraggap;
1363 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1364 datalen = maxfraglen - fragheaderlen;
1365
1366 fraglen = datalen + fragheaderlen;
1367 if ((flags & MSG_MORE) &&
1368 !(rt->dst.dev->features&NETIF_F_SG))
1369 alloclen = mtu;
1370 else
1371 alloclen = datalen + fragheaderlen;
1372
1373 alloclen += dst_exthdrlen;
1374
1375 /*
1376 * The last fragment gets additional space at tail.
1377 * Note: we overallocate on fragments with MSG_MORE
1378 * because we have no idea if we're the last one.
1379 */
1380 if (datalen == length + fraggap)
1381 alloclen += rt->dst.trailer_len;
1382
1383 /*
1384 * We just reserve space for fragment header.
1385 * Note: this may be overallocation if the message
1386 * (without MSG_MORE) fits into the MTU.
1387 */
1388 alloclen += sizeof(struct frag_hdr);
1389
1390 if (transhdrlen) {
1391 skb = sock_alloc_send_skb(sk,
1392 alloclen + hh_len,
1393 (flags & MSG_DONTWAIT), &err);
1394 } else {
1395 skb = NULL;
1396 if (atomic_read(&sk->sk_wmem_alloc) <=
1397 2 * sk->sk_sndbuf)
1398 skb = sock_wmalloc(sk,
1399 alloclen + hh_len, 1,
1400 sk->sk_allocation);
1401 if (unlikely(skb == NULL))
1402 err = -ENOBUFS;
1403 else {
1404 /* Only the initial fragment
1405 * is time stamped.
1406 */
1407 tx_flags = 0;
1408 }
1409 }
1410 if (skb == NULL)
1411 goto error;
1412 /*
1413 * Fill in the control structures
1414 */
1415 skb->ip_summed = csummode;
1416 skb->csum = 0;
1417 /* reserve for fragmentation and ipsec header */
1418 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1419 dst_exthdrlen);
1420
1421 if (sk->sk_type == SOCK_DGRAM)
1422 skb_shinfo(skb)->tx_flags = tx_flags;
1423
1424 /*
1425 * Find where to start putting bytes
1426 */
1427 data = skb_put(skb, fraglen);
1428 skb_set_network_header(skb, exthdrlen);
1429 data += fragheaderlen;
1430 skb->transport_header = (skb->network_header +
1431 fragheaderlen);
1432 if (fraggap) {
1433 skb->csum = skb_copy_and_csum_bits(
1434 skb_prev, maxfraglen,
1435 data + transhdrlen, fraggap, 0);
1436 skb_prev->csum = csum_sub(skb_prev->csum,
1437 skb->csum);
1438 data += fraggap;
1439 pskb_trim_unique(skb_prev, maxfraglen);
1440 }
1441 copy = datalen - transhdrlen - fraggap;
1442
1443 if (copy < 0) {
1444 err = -EINVAL;
1445 kfree_skb(skb);
1446 goto error;
1447 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1448 err = -EFAULT;
1449 kfree_skb(skb);
1450 goto error;
1451 }
1452
1453 offset += copy;
1454 length -= datalen - fraggap;
1455 transhdrlen = 0;
1456 exthdrlen = 0;
1457 dst_exthdrlen = 0;
1458 csummode = CHECKSUM_NONE;
1459
1460 /*
1461 * Put the packet on the pending queue
1462 */
1463 __skb_queue_tail(&sk->sk_write_queue, skb);
1464 continue;
1465 }
1466
1467 if (copy > length)
1468 copy = length;
1469
1470 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1471 unsigned int off;
1472
1473 off = skb->len;
1474 if (getfrag(from, skb_put(skb, copy),
1475 offset, copy, off, skb) < 0) {
1476 __skb_trim(skb, off);
1477 err = -EFAULT;
1478 goto error;
1479 }
1480 } else {
1481 int i = skb_shinfo(skb)->nr_frags;
1482 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1483 struct page *page = sk->sk_sndmsg_page;
1484 int off = sk->sk_sndmsg_off;
1485 unsigned int left;
1486
1487 if (page && (left = PAGE_SIZE - off) > 0) {
1488 if (copy >= left)
1489 copy = left;
1490 if (page != skb_frag_page(frag)) {
1491 if (i == MAX_SKB_FRAGS) {
1492 err = -EMSGSIZE;
1493 goto error;
1494 }
1495 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1496 skb_frag_ref(skb, i);
1497 frag = &skb_shinfo(skb)->frags[i];
1498 }
1499 } else if (i < MAX_SKB_FRAGS) {
1500 if (copy > PAGE_SIZE)
1501 copy = PAGE_SIZE;
1502 page = alloc_pages(sk->sk_allocation, 0);
1503 if (page == NULL) {
1504 err = -ENOMEM;
1505 goto error;
1506 }
1507 sk->sk_sndmsg_page = page;
1508 sk->sk_sndmsg_off = 0;
1509
1510 skb_fill_page_desc(skb, i, page, 0, 0);
1511 frag = &skb_shinfo(skb)->frags[i];
1512 } else {
1513 err = -EMSGSIZE;
1514 goto error;
1515 }
1516 if (getfrag(from,
1517 skb_frag_address(frag) + skb_frag_size(frag),
1518 offset, copy, skb->len, skb) < 0) {
1519 err = -EFAULT;
1520 goto error;
1521 }
1522 sk->sk_sndmsg_off += copy;
1523 skb_frag_size_add(frag, copy);
1524 skb->len += copy;
1525 skb->data_len += copy;
1526 skb->truesize += copy;
1527 atomic_add(copy, &sk->sk_wmem_alloc);
1528 }
1529 offset += copy;
1530 length -= copy;
1531 }
1532 return 0;
1533 error:
1534 cork->length -= length;
1535 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1536 return err;
1537 }
1538
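/* Release the per-socket cork state: the duplicated options, the cached
 * route and the stored flow information.
 */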
1539 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1540 {
1541 if (np->cork.opt) {
1542 kfree(np->cork.opt->dst0opt);
1543 kfree(np->cork.opt->dst1opt);
1544 kfree(np->cork.opt->hopopt);
1545 kfree(np->cork.opt->srcrt);
1546 kfree(np->cork.opt);
1547 np->cork.opt = NULL;
1548 }
1549
1550 if (inet->cork.base.dst) {
1551 dst_release(inet->cork.base.dst);
1552 inet->cork.base.dst = NULL;
1553 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1554 }
1555 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1556 }
1557
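/*
 * ip6_push_pending_frames - transmit everything queued by ip6_append_data
 *
 * Collapses the skbs on the write queue into a single packet (with a
 * frag list), pushes the extension headers and the IPv6 header built
 * from the corked flow information, updates the MIB counters and sends
 * the result via ip6_local_out().  The cork state is released in all
 * cases.
 */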
1558 int ip6_push_pending_frames(struct sock *sk)
1559 {
1560 struct sk_buff *skb, *tmp_skb;
1561 struct sk_buff **tail_skb;
1562 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1563 struct inet_sock *inet = inet_sk(sk);
1564 struct ipv6_pinfo *np = inet6_sk(sk);
1565 struct net *net = sock_net(sk);
1566 struct ipv6hdr *hdr;
1567 struct ipv6_txoptions *opt = np->cork.opt;
1568 struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1569 struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1570 unsigned char proto = fl6->flowi6_proto;
1571 int err = 0;
1572
1573 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1574 goto out;
1575 tail_skb = &(skb_shinfo(skb)->frag_list);
1576
1577 /* move skb->data to ip header from ext header */
1578 if (skb->data < skb_network_header(skb))
1579 __skb_pull(skb, skb_network_offset(skb));
1580 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1581 __skb_pull(tmp_skb, skb_network_header_len(skb));
1582 *tail_skb = tmp_skb;
1583 tail_skb = &(tmp_skb->next);
1584 skb->len += tmp_skb->len;
1585 skb->data_len += tmp_skb->len;
1586 skb->truesize += tmp_skb->truesize;
1587 tmp_skb->destructor = NULL;
1588 tmp_skb->sk = NULL;
1589 }
1590
1591 /* Allow local fragmentation. */
1592 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1593 skb->local_df = 1;
1594
1595 *final_dst = fl6->daddr;
1596 __skb_pull(skb, skb_network_header_len(skb));
1597 if (opt && opt->opt_flen)
1598 ipv6_push_frag_opts(skb, opt, &proto);
1599 if (opt && opt->opt_nflen)
1600 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1601
1602 skb_push(skb, sizeof(struct ipv6hdr));
1603 skb_reset_network_header(skb);
1604 hdr = ipv6_hdr(skb);
1605
1606 *(__be32*)hdr = fl6->flowlabel |
1607 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1608
1609 hdr->hop_limit = np->cork.hop_limit;
1610 hdr->nexthdr = proto;
1611 hdr->saddr = fl6->saddr;
1612 hdr->daddr = *final_dst;
1613
1614 skb->priority = sk->sk_priority;
1615 skb->mark = sk->sk_mark;
1616
1617 skb_dst_set(skb, dst_clone(&rt->dst));
1618 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1619 if (proto == IPPROTO_ICMPV6) {
1620 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1621
1622 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1623 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1624 }
1625
1626 err = ip6_local_out(skb);
1627 if (err) {
1628 if (err > 0)
1629 err = net_xmit_errno(err);
1630 if (err)
1631 goto error;
1632 }
1633
1634 out:
1635 ip6_cork_release(inet, np);
1636 return err;
1637 error:
1638 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1639 goto out;
1640 }
1641
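/* Drop everything still sitting on the socket write queue and release
 * the cork state, counting the discarded packets as OUTDISCARDS.
 */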
1642 void ip6_flush_pending_frames(struct sock *sk)
1643 {
1644 struct sk_buff *skb;
1645
1646 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1647 if (skb_dst(skb))
1648 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1649 IPSTATS_MIB_OUTDISCARDS);
1650 kfree_skb(skb);
1651 }
1652
1653 ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1654 }