/*
 * IPv6 output functions
 * Linux INET6 implementation
 *
 * Authors:
 * Pedro Roque <roque@di.fc.ul.pt>
 *
 * Based on linux/net/ipv4/ip_output.c
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Changes:
 * A.N.Kuznetsov : arithmetic in fragmentation.
 *                 extension headers are implemented.
 *                 route changes now work.
 *                 ip6_forward does not confuse sniffers.
 *                 etc.
 *
 * H. von Brand : Added missing #include <linux/string.h>
 * Imran Patel : frag id should be in NBO
 * Kazunori MIYAZAWA @USAGI
 *             : add ip6_append_data and related functions
 *               for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

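/*
 * Set the IPv6 payload length and run the netfilter LOCAL_OUT hook.
 * payload_len counts everything after the fixed 40-byte IPv6 header;
 * 0 is written when the payload exceeds IPV6_MAXPLEN (65535), the
 * convention used for jumbograms (RFC 2675).
 */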
int __ip6_local_out(struct sk_buff *skb)
{
        int len;

        len = skb->len - sizeof(struct ipv6hdr);
        if (len > IPV6_MAXPLEN)
                len = 0;
        ipv6_hdr(skb)->payload_len = htons(len);

        return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
                       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
        int err;

        err = __ip6_local_out(skb);
        /* nf_hook() returns 1 when the hooks accepted the packet;
         * anything else means it was stolen, queued or dropped.
         */
        if (likely(err == 1))
                err = dst_output(skb);

        return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
        skb_reset_mac_header(newskb);
        __skb_pull(newskb, skb_network_offset(newskb));
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        WARN_ON(!skb_dst(newskb));

        netif_rx_ni(newskb);
        return 0;
}

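/*
 * Final output step: hand the packet to the neighbour layer. Multicast
 * packets that the local node should also receive (sk_mc_loop(), or a
 * multicast-routing socket that has not already forwarded them) are
 * cloned and looped back through ip6_dev_loopback_xmit() before the
 * original is transmitted.
 */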
static int ip6_finish_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct neighbour *neigh;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
                    ((mroute6_socket(dev_net(dev), skb) &&
                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
                                         &ipv6_hdr(skb)->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                         * is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        newskb, NULL, newskb->dev,
                                        ip6_dev_loopback_xmit);

                        if (ipv6_hdr(skb)->hop_limit == 0) {
                                IP6_INC_STATS(dev_net(dev), idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
                                 skb->len);
        }

        rcu_read_lock();
        neigh = dst_get_neighbour_noref(dst);
        if (neigh) {
                int res = neigh_output(neigh, skb);

                rcu_read_unlock();
                return res;
        }
        rcu_read_unlock();
        IP6_INC_STATS_BH(dev_net(dst->dev),
                         ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

static int ip6_finish_output(struct sk_buff *skb)
{
        if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)))
                return ip6_fragment(skb, ip6_finish_output2);
        else
                return ip6_finish_output2(skb);
}

int ip6_output(struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(dev_net(dev), idev,
                              IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             struct ipv6_txoptions *opt, int tclass)
{
        struct net *net = sock_net(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr;
        u8 proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        if (opt) {
                unsigned int head_room;

                /* First: exthdrs may take lots of space (~8K for now);
                 * MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        if (skb2 == NULL) {
                                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return -ENOBUFS;
                        }
                        consume_skb(skb);
                        skb = skb2;
                        skb_set_owner_w(skb, sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         * Fill in the IPv6 header
         */
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

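        /* First 32-bit word of the IPv6 header: version (6) in the top
         * four bits, traffic class in the next eight, flow label in the
         * low twenty. 0x60000000 supplies the version, tclass is shifted
         * into place, and fl6->flowlabel is already in network byte order.
         */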
        *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                 IPSTATS_MIB_OUT, skb->len);
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
                               dst->dev, dst_output);
        }

        net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
        skb->dev = dst->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);

/*
 * To avoid extra problems ND packets are sent through this
 * routine. It is code duplication, but we really want to avoid
 * extra checks, since ipv6_build_header is used by TCP (which
 * is performance critical for us).
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
               const struct in6_addr *saddr, const struct in6_addr *daddr,
               int proto, int len)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        skb_reset_network_header(skb);
        skb_put(skb, sizeof(struct ipv6hdr));
        hdr = ipv6_hdr(skb);

        *(__be32 *)hdr = htonl(0x60000000);

        hdr->payload_len = htons(len);
        hdr->nexthdr = proto;
        hdr->hop_limit = np->hop_limit;

        hdr->saddr = *saddr;
        hdr->daddr = *daddr;

        return 0;
}

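/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * registered for that alert value: each matching socket but the last
 * receives a clone, the last one consumes the original skb, and the
 * function returns 1 so the caller stops forwarding it.
 */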
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* Unicast neighbour discovery messages destined
                         * to the proxied address are passed to the input
                         * function for local processing.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
        return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        u32 mtu;

        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        skb_forward_csum(skb);

        /*
         * We DO NOT make any processing on
         * RA packets, pushing them to user level AS IS
         * without any WARRANTY that the application will be able
         * to interpret them. The reason is that we
         * cannot make anything clever here.
         *
         * We are not end-node, so that if packet contains
         * AH/ESP, we cannot make anything.
         * Defragmentation would also be a mistake; RA packets
         * cannot be fragmented, because there is no guarantee
         * that different fragments will go along one path. --ANK
         */
        if (opt->ra) {
                u8 *ptr = skb_network_header(skb) + opt->ra;
                if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
                        return 0;
        }

        /*
         * check and decrement ttl
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                IP6_INC_STATS_BH(net,
                                 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0)
                        return ip6_input(skb);
                else if (proxied < 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(dst),
                                      IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb_dst(skb);

        /* IPv6 specs say nothing about it, but it is clear that we cannot
         * send redirects to source routed frames.
         * We don't send redirects to frames decapsulated from IPsec.
         */
        if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct rt6_info *rt;

                /*
                 * incoming and outgoing devices are the same
                 * send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                if (!rt->rt6i_peer)
                        rt6_bind_peer(rt, 1);

                /* Limit redirects both by destination (here)
                 * and by source (inside ndisc_send_redirect)
                 */
                if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = dst_mtu(dst);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (skb->len > mtu && !skb_is_gso(skb)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP6_INC_STATS_BH(net,
                                 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
                IP6_INC_STATS_BH(net,
                                 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Mangling hops number delayed to point after skb COW */

        hdr->hop_limit--;

        IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
        to->nf_trace = from->nf_trace;
#endif
        skb_copy_secmark(to, from);
}

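/*
 * Find the offset at which a Fragment header must be inserted: skip
 * the extension headers that must appear in every fragment (hop-by-hop,
 * the routing header and any destination options preceding it; with
 * MIPv6, a destination header carrying a Home Address option is also
 * kept with every fragment). *nexthdr is left pointing at the nexthdr
 * byte that will be rewritten to NEXTHDR_FRAGMENT.
 */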
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
        u16 offset = sizeof(struct ipv6hdr);
        struct ipv6_opt_hdr *exthdr =
                                (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
        unsigned int packet_len = skb->tail - skb->network_header;
        int found_rhdr = 0;
        *nexthdr = &ipv6_hdr(skb)->nexthdr;

        while (offset + 1 <= packet_len) {

                switch (**nexthdr) {

                case NEXTHDR_HOP:
                        break;
                case NEXTHDR_ROUTING:
                        found_rhdr = 1;
                        break;
                case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
                        if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
                                break;
#endif
                        if (found_rhdr)
                                return offset;
                        break;
                default:
                        return offset;
                }

                offset += ipv6_optlen(exthdr);
                *nexthdr = &exthdr->nexthdr;
                exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
                                                 offset);
        }

        return offset;
}

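/*
 * Choose the 32-bit fragment identification. When the route has an
 * inet_peer, a per-destination counter is used so IDs advance
 * independently per peer; otherwise fall back to a global atomic
 * counter, skipping the value 0 (ip6_fragment() uses a frag_id of 0
 * as its "not yet chosen" sentinel).
 */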
void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
        static atomic_t ipv6_fragmentation_id;
        int old, new;

        if (rt && !(rt->dst.flags & DST_NOPEER)) {
                struct inet_peer *peer;

                if (!rt->rt6i_peer)
                        rt6_bind_peer(rt, 1);
                peer = rt->rt6i_peer;
                if (peer) {
                        fhdr->identification = htonl(inet_getid(peer, 0));
                        return;
                }
        }
        do {
                old = atomic_read(&ipv6_fragmentation_id);
                new = old + 1;
                if (!new)
                        new = 1;
        } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
        fhdr->identification = htonl(new);
}

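/*
 * Fragment an outgoing datagram. Two paths exist: a fast path that
 * reuses an existing frag_list when every element already has the
 * right geometry (each piece fits the MTU, all but the last are
 * multiples of 8 bytes, and enough headroom exists for the headers),
 * and a slow path that allocates a fresh skb per fragment and copies
 * the payload into it.
 */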
int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        int hroom, troom;
        __be32 frag_id = 0;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;
        struct net *net = dev_net(skb_dst(skb)->dev);

        hlen = ip6_find_1stfragopt(skb, &prevhdr);
        nexthdr = *prevhdr;

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb was not generated by a local socket.
         */
        if (unlikely(!skb->local_df && skb->len > mtu)) {
                if (skb->sk && dst_allfrag(skb_dst(skb)))
                        sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

                skb->dev = skb_dst(skb)->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        mtu -= hlen + sizeof(struct frag_hdr);

        if (skb_has_frag_list(skb)) {
                int first_len = skb_pagelen(skb);
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);
                /* BUILD HEADER */

                *prevhdr = NEXTHDR_FRAGMENT;
                tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        return -ENOMEM;
                }

                __skb_pull(skb, hlen);
                fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
                __skb_push(skb, hlen);
                skb_reset_network_header(skb);
                memcpy(skb_network_header(skb), tmp_hdr, hlen);

                ipv6_select_ident(fh, rt);
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                frag_id = fh->identification;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                ipv6_hdr(skb)->payload_len = htons(first_len -
                                                   sizeof(struct ipv6hdr));

                dst_hold(&rt->dst);

                for (;;) {
                        /* Prepare the header of the next frame,
                         * before the previous one goes down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), tmp_hdr,
                                       hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next != NULL)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                ipv6_hdr(frag)->payload_len =
                                                htons(frag->len -
                                                      sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        dst_release(&rt->dst);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                dst_release(&rt->dst);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
            skb_checksum_help(skb))
                goto fail;

        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         * Fragment the datagram.
         */

        *prevhdr = NEXTHDR_FRAGMENT;
        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        troom = rt->dst.dev->needed_tailroom;

        /*
         * Keep copying data until we run out.
         */
        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                 * then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 * Allocate buffer.
                 */

                if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                      hroom + troom, GFP_ATOMIC)) == NULL) {
                        NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 * Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, hroom);
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                skb_reset_network_header(frag);
                fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
                frag->transport_header = (frag->network_header + hlen +
                                          sizeof(struct frag_hdr));

                /*
                 * Charge the memory for the fragment to any owner
                 * it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 * Copy the packet header into the new buffer.
                 */
                skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

                /*
                 * Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                if (!frag_id) {
                        ipv6_select_ident(fh, rt);
                        frag_id = fh->identification;
                } else
                        fh->identification = frag_id;

                /*
                 * Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
                        BUG();
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                ipv6_hdr(frag)->payload_len = htons(frag->len -
                                                    sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 * Put this fragment into the sending queue.
                 */
                err = output(frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}

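/*
 * Returns nonzero when a cached route can no longer be trusted for
 * this flow: the route is not a /128 host route to fl_addr, and the
 * socket's cached destination address (if any) differs from fl_addr.
 */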
static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
               (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt = (struct rt6_info *)dst;

        if (!dst)
                goto out;

        /* Yes, checking route validity in the not connected
         * case is not very simple. Take into account
         * that we do not support routing by source, TOS,
         * and MSG_DONTROUTE           --ANK (980726)
         *
         * 1. ip6_rt_check(): If route was host route,
         *    check that cached destination is current.
         *    If it is network route, we still may
         *    check its validity using saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save whole address now,
         *    (because main consumer of this service
         *    is tcp, which does not have this problem),
         *    so that the last trick works only on connected
         *    sockets.
         * 2. oif also should be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
            (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
        struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
#endif
        int err;

        if (*dst == NULL)
                *dst = ip6_route_output(net, sk, fl6);

        if ((err = (*dst)->error))
                goto out_err_release;

        if (ipv6_addr_any(&fl6->saddr)) {
                struct rt6_info *rt = (struct rt6_info *) *dst;
                err = ip6_route_get_saddr(net, rt, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                if (err)
                        goto out_err_release;
        }

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        rcu_read_lock();
        n = dst_get_neighbour_noref(*dst);
        if (n && !(n->nud_state & NUD_VALID)) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                rcu_read_unlock();
                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        if ((err = (*dst)->error))
                                goto out_err_release;
                }
        } else {
                rcu_read_unlock();
        }
#endif

        return 0;

out_err_release:
        if (err == -ENETUNREACH)
                IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        dst_release(*dst);
        *dst = NULL;
        return err;
}

/**
 * ip6_dst_lookup - perform route lookup on flow
 * @sk: socket which provides route info
 * @dst: pointer to dst_entry * for result
 * @fl6: flow to lookup
 *
 * This function performs a route lookup on the given flow.
 *
 * It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 * @sk: socket which provides route info
 * @fl6: flow to lookup
 * @final_dst: final destination address for ipsec lookup
 * @can_sleep: we are in a sleepable context
 *
 * This function performs a route lookup on the given flow.
 *
 * It returns a valid dst pointer on success, or a pointer-encoded
 * error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst,
                                      bool can_sleep)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;
        if (can_sleep)
                fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

        return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 * @sk: socket which provides the dst cache and route info
 * @fl6: flow to lookup
 * @final_dst: final destination address for ipsec lookup
 * @can_sleep: we are in a sleepable context
 *
 * This function performs a route lookup on the given flow with the
 * possibility of using the cached route in the socket if it is valid.
 * It will take the socket dst lock when operating on the dst cache.
 * As a result, this function can only be used in process context.
 *
 * It returns a valid dst pointer on success, or a pointer-encoded
 * error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst,
                                         bool can_sleep)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
        int err;

        dst = ip6_sk_dst_check(sk, dst, fl6);

        err = ip6_dst_lookup_tail(sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;
        if (can_sleep)
                fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

        return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline int ip6_ufo_append_data(struct sock *sk,
                        int getfrag(void *from, char *to, int offset, int len,
                                    int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int transhdrlen, int mtu, unsigned int flags,
                        struct rt6_info *rt)

{
        struct sk_buff *skb;
        int err;

        /* The network device supports UDP large send offload,
         * so create one single skb packet containing the complete
         * UDP datagram.
         */
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);
                if (skb == NULL)
                        return err;

                /* reserve space for Hardware header */
                skb_reserve(skb, hh_len);

                /* create space for UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize network header pointer */
                skb_reset_network_header(skb);

                /* initialize protocol header pointer */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->ip_summed = CHECKSUM_PARTIAL;
                skb->csum = 0;
        }

        err = skb_append_datato_frags(sk, skb, getfrag, from,
                                      (length - transhdrlen));
        if (!err) {
                struct frag_hdr fhdr;

                /* Specify the length of each IPv6 datagram fragment.
                 * It has to be a multiple of 8.
                 */
                skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
                                             sizeof(struct frag_hdr)) & ~7;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
                ipv6_select_ident(&fhdr, rt);
                skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
                __skb_queue_tail(&sk->sk_write_queue, skb);

                return 0;
        }
        /* There is not enough support to do UDP LSO,
         * so follow the normal path.
         */
        kfree_skb(skb);

        return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

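/*
 * Recompute mtu and maxfraglen once corked data spills into a second
 * fragment: the first fragment must reserve the dst's header_len
 * (such as an IPsec header), while later fragments can be sized
 * against the path MTU directly.
 */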
static void ip6_append_data_mtu(int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (skb == NULL) {
                        /* first fragment, reserve header_len */
                        *mtu = *mtu - rt->dst.header_len;

                } else {
                        /*
                         * this fragment is not first, the headers
                         * space is regarded as data space.
                         */
                        *mtu = dst_mtu(rt->dst.path);
                }
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}

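/*
 * Append data to the pending (corked) queue on sk->sk_write_queue,
 * growing the tail skb or allocating new MTU-sized skbs as needed.
 * Nothing is transmitted here; ip6_push_pending_frames() later builds
 * the IPv6 header and sends the queue as one datagram.
 */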
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
                    int offset, int len, int odd, struct sk_buff *skb),
        void *from, int length, int transhdrlen,
        int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
        struct rt6_info *rt, unsigned int flags, int dontfrag)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct inet_cork *cork;
        struct sk_buff *skb, *skb_prev = NULL;
        unsigned int maxfraglen, fragheaderlen;
        int exthdrlen;
        int dst_exthdrlen;
        int hh_len;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        __u8 tx_flags = 0;

        if (flags & MSG_PROBE)
                return 0;
        cork = &inet->cork.base;
        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking
                 */
                if (opt) {
                        if (WARN_ON(np->cork.opt))
                                return -EINVAL;

                        np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
                        if (unlikely(np->cork.opt == NULL))
                                return -ENOBUFS;

                        np->cork.opt->tot_len = opt->tot_len;
                        np->cork.opt->opt_flen = opt->opt_flen;
                        np->cork.opt->opt_nflen = opt->opt_nflen;

                        np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                            sk->sk_allocation);
                        if (opt->dst0opt && !np->cork.opt->dst0opt)
                                return -ENOBUFS;

                        np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                            sk->sk_allocation);
                        if (opt->dst1opt && !np->cork.opt->dst1opt)
                                return -ENOBUFS;

                        np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                           sk->sk_allocation);
                        if (opt->hopopt && !np->cork.opt->hopopt)
                                return -ENOBUFS;

                        np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                            sk->sk_allocation);
                        if (opt->srcrt && !np->cork.opt->srcrt)
                                return -ENOBUFS;

                        /* need source address above miyazawa */
                }
                dst_hold(&rt->dst);
                cork->dst = &rt->dst;
                inet->cork.fl.u.ip6 = *fl6;
                np->cork.hop_limit = hlimit;
                np->cork.tclass = tclass;
                if (rt->dst.flags & DST_XFRM_TUNNEL)
                        mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
                              rt->dst.dev->mtu : dst_mtu(&rt->dst);
                else
                        mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
                              rt->dst.dev->mtu : dst_mtu(rt->dst.path);
                if (np->frag_size < mtu) {
                        if (np->frag_size)
                                mtu = np->frag_size;
                }
                cork->fragsize = mtu;
                if (dst_allfrag(rt->dst.path))
                        cork->flags |= IPCORK_ALLFRAG;
                cork->length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
                length += exthdrlen;
                transhdrlen += exthdrlen;
                dst_exthdrlen = rt->dst.header_len;
        } else {
                rt = (struct rt6_info *)cork->dst;
                fl6 = &inet->cork.fl.u.ip6;
                opt = np->cork.opt;
                transhdrlen = 0;
                exthdrlen = 0;
                dst_exthdrlen = 0;
                mtu = cork->fragsize;
        }

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
                        (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

        if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
                if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
                        ipv6_local_error(sk, EMSGSIZE, fl6, mtu - exthdrlen);
                        return -EMSGSIZE;
                }
        }

        /* For UDP, check if TX timestamp is enabled */
        if (sk->sk_type == SOCK_DGRAM) {
                err = sock_tx_timestamp(sk, &tx_flags);
                if (err)
                        goto error;
        }

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */

        cork->length += length;
        if (length > mtu) {
                int proto = sk->sk_protocol;
                if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
                        ipv6_local_rxpmtu(sk, fl6, mtu - exthdrlen);
                        return -EMSGSIZE;
                }

                if (proto == IPPROTO_UDP &&
                    (rt->dst.dev->features & NETIF_F_UFO)) {

                        err = ip6_ufo_append_data(sk, getfrag, from, length,
                                                  hh_len, fragheaderlen,
                                                  transhdrlen, mtu, flags, rt);
                        if (err)
                                goto error;
                        return 0;
                }
        }

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;

                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
alloc_new_skb:
                        /* There's no room in the current skb */
                        if (skb)
                                fraggap = skb->len - maxfraglen;
                        else
                                fraggap = 0;
                        /* update mtu and maxfraglen if necessary */
                        if (skb == NULL || skb_prev == NULL)
                                ip6_append_data_mtu(&mtu, &maxfraglen,
                                                    fragheaderlen, skb, rt);

                        skb_prev = skb;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;

                        if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
                        if ((flags & MSG_MORE) &&
                            !(rt->dst.dev->features & NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;

                        alloclen += dst_exthdrlen;

                        if (datalen != length + fraggap) {
                                /*
                                 * this is not the last fragment, the trailer
                                 * space is regarded as data space.
                                 */
                                datalen += rt->dst.trailer_len;
                        }

                        alloclen += rt->dst.trailer_len;
                        fraglen = datalen + fragheaderlen;

                        /*
                         * We just reserve space for fragment header.
                         * Note: this may be overallocation if the message
                         * (without MSG_MORE) fits into the MTU.
                         */
                        alloclen += sizeof(struct frag_hdr);

                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                                else {
                                        /* Only the initial fragment
                                         * is time stamped.
                                         */
                                        tx_flags = 0;
                                }
                        }
                        if (skb == NULL)
                                goto error;
                        /*
                         * Fill in the control structures
                         */
                        skb->ip_summed = CHECKSUM_NONE;
                        skb->csum = 0;
                        /* reserve for fragmentation and ipsec header */
                        skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
                                    dst_exthdrlen);

                        if (sk->sk_type == SOCK_DGRAM)
                                skb_shinfo(skb)->tx_flags = tx_flags;

                        /*
                         * Find where to start putting bytes
                         */
                        data = skb_put(skb, fraglen);
                        skb_set_network_header(skb, exthdrlen);
                        data += fragheaderlen;
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }
                        copy = datalen - transhdrlen - fraggap;

                        if (copy < 0) {
                                err = -EINVAL;
                                kfree_skb(skb);
                                goto error;
                        } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        dst_exthdrlen = 0;

                        /*
                         * Put the packet on the pending queue
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->dst.dev->features & NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                    offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != skb_frag_page(frag)) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        skb_frag_ref(skb, i);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL) {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from,
                                    skb_frag_address(frag) + skb_frag_size(frag),
                                    offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        sk->sk_sndmsg_off += copy;
                        skb_frag_size_add(frag, copy);
                        skb->len += copy;
                        skb->data_len += copy;
                        skb->truesize += copy;
                        atomic_add(copy, &sk->sk_wmem_alloc);
                }
                offset += copy;
                length -= copy;
        }
        return 0;
error:
        cork->length -= length;
        IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
        return err;
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
        if (np->cork.opt) {
                kfree(np->cork.opt->dst0opt);
                kfree(np->cork.opt->dst1opt);
                kfree(np->cork.opt->hopopt);
                kfree(np->cork.opt->srcrt);
                kfree(np->cork.opt);
                np->cork.opt = NULL;
        }

        if (inet->cork.base.dst) {
                dst_release(inet->cork.base.dst);
                inet->cork.base.dst = NULL;
                inet->cork.base.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

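/*
 * Flush the corked queue as a single datagram: the queued skbs are
 * chained onto the first one's frag_list, the extension headers and
 * IPv6 header are prepended, and the result is handed to
 * ip6_local_out(). The cork state is released in all cases.
 */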
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct net *net = sock_net(sk);
        struct ipv6hdr *hdr;
        struct ipv6_txoptions *opt = np->cork.opt;
        struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
        struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
        unsigned char proto = fl6->flowi6_proto;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb_network_header(skb))
                __skb_pull(skb, skb_network_offset(skb));
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb_network_header_len(skb));
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Allow local fragmentation. */
        if (np->pmtudisc < IPV6_PMTUDISC_DO)
                skb->local_df = 1;

        *final_dst = fl6->daddr;
        __skb_pull(skb, skb_network_header_len(skb));
        if (opt && opt->opt_flen)
                ipv6_push_frag_opts(skb, opt, &proto);
        if (opt && opt->opt_nflen)
                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        *(__be32 *)hdr = fl6->flowlabel |
                         htonl(0x60000000 | ((int)np->cork.tclass << 20));

        hdr->hop_limit = np->cork.hop_limit;
        hdr->nexthdr = proto;
        hdr->saddr = fl6->saddr;
        hdr->daddr = *final_dst;

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        skb_dst_set(skb, dst_clone(&rt->dst));
        IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
        if (proto == IPPROTO_ICMPV6) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
                ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
        }

        err = ip6_local_out(skb);
        if (err) {
                if (err > 0)
                        err = net_xmit_errno(err);
                if (err)
                        goto error;
        }

out:
        ip6_cork_release(inet, np);
        return err;
error:
        IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
        goto out;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

void ip6_flush_pending_frames(struct sock *sk)
{
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
                if (skb_dst(skb))
                        IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
        }

        ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);