net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
  16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40
  41 #include <linux/netfilter.h>
  42 #include <linux/netfilter_ipv6.h>
  43
  44 #include <net/sock.h>
  45 #include <net/snmp.h>
  46
  47 #include <net/ipv6.h>
  48 #include <net/ndisc.h>
  49 #include <net/protocol.h>
  50 #include <net/ip6_route.h>
  51 #include <net/addrconf.h>
  52 #include <net/rawv6.h>
  53 #include <net/icmp.h>
  54 #include <net/xfrm.h>
  55 #include <net/checksum.h>
  56 #include <linux/mroute6.h>
  57
  58 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
  59
  60 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
  61 {
  62         static u32 ipv6_fragmentation_id = 1;
  63         static DEFINE_SPINLOCK(ip6_id_lock);
  64
  65         spin_lock_bh(&ip6_id_lock);
  66         fhdr->identification = htonl(ipv6_fragmentation_id);
  67         if (++ipv6_fragmentation_id == 0)
  68                 ipv6_fragmentation_id = 1;
  69         spin_unlock_bh(&ip6_id_lock);
  70 }
  71
  72 int __ip6_local_out(struct sk_buff *skb)
  73 {
  74         int len;
  75
  76         len = skb->len - sizeof(struct ipv6hdr);
  77         if (len > IPV6_MAXPLEN)
  78                 len = 0;
  79         ipv6_hdr(skb)->payload_len = htons(len);
  80
  81         return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
  82                        dst_output);
  83 }
  84
  85 int ip6_local_out(struct sk_buff *skb)
  86 {
  87         int err;
  88
  89         err = __ip6_local_out(skb);
  90         if (likely(err == 1))
  91                 err = dst_output(skb);
  92
  93         return err;
  94 }
  95 EXPORT_SYMBOL_GPL(ip6_local_out);
  96
  97 static int ip6_output_finish(struct sk_buff *skb)
  98 {
  99         struct dst_entry *dst = skb->dst;
 100
 101         if (dst->hh)
 102                 return neigh_hh_output(dst->hh, skb);
 103         else if (dst->neighbour)
 104                 return dst->neighbour->output(skb);
 105
 106         IP6_INC_STATS_BH(dev_net(dst->dev),
 107                          ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 108         kfree_skb(skb);
 109         return -EINVAL;
 110
 111 }
 112
 113 /* dev_loopback_xmit for use with netfilter. */
 114 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
 115 {
 116         skb_reset_mac_header(newskb);
 117         __skb_pull(newskb, skb_network_offset(newskb));
 118         newskb->pkt_type = PACKET_LOOPBACK;
 119         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 120         WARN_ON(!newskb->dst);
 121
 122         netif_rx(newskb);
 123         return 0;
 124 }
 125
 126
 127 static int ip6_output2(struct sk_buff *skb)
 128 {
 129         struct dst_entry *dst = skb->dst;
 130         struct net_device *dev = dst->dev;
 131
 132         skb->protocol = htons(ETH_P_IPV6);
 133         skb->dev = dev;
 134
 135         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
 136                 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
 137                 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
 138
 139                 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
 140                     ((mroute6_socket && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
 141                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 142                                          &ipv6_hdr(skb)->saddr))) {
 143                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 144
 145                         /* Do not check for IFF_ALLMULTI; multicast routing
 146                            is not supported in any case.
 147                          */
 148                         if (newskb)
 149                                 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
 150                                         NULL, newskb->dev,
 151                                         ip6_dev_loopback_xmit);
 152
 153                         if (ipv6_hdr(skb)->hop_limit == 0) {
 154                                 IP6_INC_STATS(dev_net(dev), idev,
 155                                               IPSTATS_MIB_OUTDISCARDS);
 156                                 kfree_skb(skb);
 157                                 return 0;
 158                         }
 159                 }
 160
 161                 IP6_INC_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCASTPKTS);
 162         }
 163
 164         return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
 165                        ip6_output_finish);
 166 }
 167
 168 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
 169 {
 170         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 171
 172         return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
 173                skb->dst->dev->mtu : dst_mtu(skb->dst);
 174 }
 175
 176 int ip6_output(struct sk_buff *skb)
 177 {
 178         struct inet6_dev *idev = ip6_dst_idev(skb->dst);
 179         if (unlikely(idev->cnf.disable_ipv6)) {
 180                 IP6_INC_STATS(dev_net(skb->dst->dev), idev,
 181                               IPSTATS_MIB_OUTDISCARDS);
 182                 kfree_skb(skb);
 183                 return 0;
 184         }
 185
 186         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 187                                 dst_allfrag(skb->dst))
 188                 return ip6_fragment(skb, ip6_output2);
 189         else
 190                 return ip6_output2(skb);
 191 }
 192
 193 /*
 194  *      xmit an sk_buff (used by TCP)
 195  */
 196
 197 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
 198              struct ipv6_txoptions *opt, int ipfragok)
 199 {
 200         struct net *net = sock_net(sk);
 201         struct ipv6_pinfo *np = inet6_sk(sk);
 202         struct in6_addr *first_hop = &fl->fl6_dst;
 203         struct dst_entry *dst = skb->dst;
 204         struct ipv6hdr *hdr;
 205         u8  proto = fl->proto;
 206         int seg_len = skb->len;
 207         int hlimit, tclass;
 208         u32 mtu;
 209
 210         if (opt) {
 211                 unsigned int head_room;
 212
 213                 /* First: exthdrs may take lots of space (~8K for now)
 214                    MAX_HEADER is not enough.
 215                  */
 216                 head_room = opt->opt_nflen + opt->opt_flen;
 217                 seg_len += head_room;
 218                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 219
 220                 if (skb_headroom(skb) < head_room) {
 221                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 222                         if (skb2 == NULL) {
 223                                 IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
 224                                               IPSTATS_MIB_OUTDISCARDS);
 225                                 kfree_skb(skb);
 226                                 return -ENOBUFS;
 227                         }
 228                         kfree_skb(skb);
 229                         skb = skb2;
 230                         if (sk)
 231                                 skb_set_owner_w(skb, sk);
 232                 }
 233                 if (opt->opt_flen)
 234                         ipv6_push_frag_opts(skb, opt, &proto);
 235                 if (opt->opt_nflen)
 236                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 237         }
 238
 239         skb_push(skb, sizeof(struct ipv6hdr));
 240         skb_reset_network_header(skb);
 241         hdr = ipv6_hdr(skb);
 242
 243         /* Allow local fragmentation. */
 244         if (ipfragok)
 245                 skb->local_df = 1;
 246
 247         /*
 248          *      Fill in the IPv6 header
 249          */
 250
 251         hlimit = -1;
 252         if (np)
 253                 hlimit = np->hop_limit;
 254         if (hlimit < 0)
 255                 hlimit = ip6_dst_hoplimit(dst);
 256
 257         tclass = -1;
 258         if (np)
 259                 tclass = np->tclass;
 260         if (tclass < 0)
 261                 tclass = 0;
 262
 263         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
 264
 265         hdr->payload_len = htons(seg_len);
 266         hdr->nexthdr = proto;
 267         hdr->hop_limit = hlimit;
 268
 269         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
 270         ipv6_addr_copy(&hdr->daddr, first_hop);
 271
 272         skb->priority = sk->sk_priority;
 273         skb->mark = sk->sk_mark;
 274
 275         mtu = dst_mtu(dst);
 276         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
 277                 IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
 278                               IPSTATS_MIB_OUTREQUESTS);
 279                 return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
 280                                 dst_output);
 281         }
 282
 283         if (net_ratelimit())
 284                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
 285         skb->dev = dst->dev;
 286         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
 287         IP6_INC_STATS(net, ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
 288         kfree_skb(skb);
 289         return -EMSGSIZE;
 290 }
 291
 292 EXPORT_SYMBOL(ip6_xmit);
 293
 294 /*
 295  *      To avoid extra problems ND packets are send through this
 296  *      routine. It's code duplication but I really want to avoid
 297  *      extra checks since ipv6_build_header is used by TCP (which
 298  *      is for us performance critical)
 299  */
 300
 301 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
 302                const struct in6_addr *saddr, const struct in6_addr *daddr,
 303                int proto, int len)
 304 {
 305         struct ipv6_pinfo *np = inet6_sk(sk);
 306         struct ipv6hdr *hdr;
 307         int totlen;
 308
 309         skb->protocol = htons(ETH_P_IPV6);
 310         skb->dev = dev;
 311
 312         totlen = len + sizeof(struct ipv6hdr);
 313
 314         skb_reset_network_header(skb);
 315         skb_put(skb, sizeof(struct ipv6hdr));
 316         hdr = ipv6_hdr(skb);
 317
 318         *(__be32*)hdr = htonl(0x60000000);
 319
 320         hdr->payload_len = htons(len);
 321         hdr->nexthdr = proto;
 322         hdr->hop_limit = np->hop_limit;
 323
 324         ipv6_addr_copy(&hdr->saddr, saddr);
 325         ipv6_addr_copy(&hdr->daddr, daddr);
 326
 327         return 0;
 328 }
 329
 330 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 331 {
 332         struct ip6_ra_chain *ra;
 333         struct sock *last = NULL;
 334
 335         read_lock(&ip6_ra_lock);
 336         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 337                 struct sock *sk = ra->sk;
 338                 if (sk && ra->sel == sel &&
 339                     (!sk->sk_bound_dev_if ||
 340                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 341                         if (last) {
 342                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 343                                 if (skb2)
 344                                         rawv6_rcv(last, skb2);
 345                         }
 346                         last = sk;
 347                 }
 348         }
 349
 350         if (last) {
 351                 rawv6_rcv(last, skb);
 352                 read_unlock(&ip6_ra_lock);
 353                 return 1;
 354         }
 355         read_unlock(&ip6_ra_lock);
 356         return 0;
 357 }
 358
 359 static int ip6_forward_proxy_check(struct sk_buff *skb)
 360 {
 361         struct ipv6hdr *hdr = ipv6_hdr(skb);
 362         u8 nexthdr = hdr->nexthdr;
 363         int offset;
 364
 365         if (ipv6_ext_hdr(nexthdr)) {
 366                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
 367                 if (offset < 0)
 368                         return 0;
 369         } else
 370                 offset = sizeof(struct ipv6hdr);
 371
 372         if (nexthdr == IPPROTO_ICMPV6) {
 373                 struct icmp6hdr *icmp6;
 374
 375                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 376                                          offset + 1 - skb->data)))
 377                         return 0;
 378
 379                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 380
 381                 switch (icmp6->icmp6_type) {
 382                 case NDISC_ROUTER_SOLICITATION:
 383                 case NDISC_ROUTER_ADVERTISEMENT:
 384                 case NDISC_NEIGHBOUR_SOLICITATION:
 385                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 386                 case NDISC_REDIRECT:
 387                         /* For reaction involving unicast neighbor discovery
 388                          * message destined to the proxied address, pass it to
 389                          * input function.
 390                          */
 391                         return 1;
 392                 default:
 393                         break;
 394                 }
 395         }
 396
 397         /*
 398          * The proxying router can't forward traffic sent to a link-local
 399          * address, so signal the sender and discard the packet. This
 400          * behavior is clarified by the MIPv6 specification.
 401          */
 402         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 403                 dst_link_failure(skb);
 404                 return -1;
 405         }
 406
 407         return 0;
 408 }
 409
 410 static inline int ip6_forward_finish(struct sk_buff *skb)
 411 {
 412         return dst_output(skb);
 413 }
 414
 415 int ip6_forward(struct sk_buff *skb)
 416 {
 417         struct dst_entry *dst = skb->dst;
 418         struct ipv6hdr *hdr = ipv6_hdr(skb);
 419         struct inet6_skb_parm *opt = IP6CB(skb);
 420         struct net *net = dev_net(dst->dev);
 421
 422         if (net->ipv6.devconf_all->forwarding == 0)
 423                 goto error;
 424
 425         if (skb_warn_if_lro(skb))
 426                 goto drop;
 427
 428         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 429                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 430                 goto drop;
 431         }
 432
 433         skb_forward_csum(skb);
 434
 435         /*
 436          *      We DO NOT make any processing on
 437          *      RA packets, pushing them to user level AS IS
 438          *      without ane WARRANTY that application will be able
 439          *      to interpret them. The reason is that we
 440          *      cannot make anything clever here.
 441          *
 442          *      We are not end-node, so that if packet contains
 443          *      AH/ESP, we cannot make anything.
 444          *      Defragmentation also would be mistake, RA packets
 445          *      cannot be fragmented, because there is no warranty
 446          *      that different fragments will go along one path. --ANK
 447          */
 448         if (opt->ra) {
 449                 u8 *ptr = skb_network_header(skb) + opt->ra;
 450                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
 451                         return 0;
 452         }
 453
 454         /*
 455          *      check and decrement ttl
 456          */
 457         if (hdr->hop_limit <= 1) {
 458                 /* Force OUTPUT device used as source address */
 459                 skb->dev = dst->dev;
 460                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
 461                             0, skb->dev);
 462                 IP6_INC_STATS_BH(net,
 463                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
 464
 465                 kfree_skb(skb);
 466                 return -ETIMEDOUT;
 467         }
 468
 469         /* XXX: idev->cnf.proxy_ndp? */
 470         if (net->ipv6.devconf_all->proxy_ndp &&
 471             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 472                 int proxied = ip6_forward_proxy_check(skb);
 473                 if (proxied > 0)
 474                         return ip6_input(skb);
 475                 else if (proxied < 0) {
 476                         IP6_INC_STATS(net, ip6_dst_idev(dst),
 477                                       IPSTATS_MIB_INDISCARDS);
 478                         goto drop;
 479                 }
 480         }
 481
 482         if (!xfrm6_route_forward(skb)) {
 483                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 484                 goto drop;
 485         }
 486         dst = skb->dst;
 487
 488         /* IPv6 specs say nothing about it, but it is clear that we cannot
 489            send redirects to source routed frames.
 490            We don't send redirects to frames decapsulated from IPsec.
 491          */
 492         if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
 493             !skb->sp) {
 494                 struct in6_addr *target = NULL;
 495                 struct rt6_info *rt;
 496                 struct neighbour *n = dst->neighbour;
 497
 498                 /*
 499                  *      incoming and outgoing devices are the same
 500                  *      send a redirect.
 501                  */
 502
 503                 rt = (struct rt6_info *) dst;
 504                 if ((rt->rt6i_flags & RTF_GATEWAY))
 505                         target = (struct in6_addr*)&n->primary_key;
 506                 else
 507                         target = &hdr->daddr;
 508
 509                 /* Limit redirects both by destination (here)
 510                    and by source (inside ndisc_send_redirect)
 511                  */
 512                 if (xrlim_allow(dst, 1*HZ))
 513                         ndisc_send_redirect(skb, n, target);
 514         } else {
 515                 int addrtype = ipv6_addr_type(&hdr->saddr);
 516
 517                 /* This check is security critical. */
 518                 if (addrtype == IPV6_ADDR_ANY ||
 519                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 520                         goto error;
 521                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 522                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 523                                 ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
 524                         goto error;
 525                 }
 526         }
 527
 528         if (skb->len > dst_mtu(dst)) {
 529                 /* Again, force OUTPUT device used as source address */
 530                 skb->dev = dst->dev;
 531                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
 532                 IP6_INC_STATS_BH(net,
 533                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
 534                 IP6_INC_STATS_BH(net,
 535                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
 536                 kfree_skb(skb);
 537                 return -EMSGSIZE;
 538         }
 539
 540         if (skb_cow(skb, dst->dev->hard_header_len)) {
 541                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
 542                 goto drop;
 543         }
 544
 545         hdr = ipv6_hdr(skb);
 546
 547         /* Mangling hops number delayed to point after skb COW */
 548
 549         hdr->hop_limit--;
 550
 551         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 552         return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
 553                        ip6_forward_finish);
 554
 555 error:
 556         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 557 drop:
 558         kfree_skb(skb);
 559         return -EINVAL;
 560 }
 561
 562 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 563 {
 564         to->pkt_type = from->pkt_type;
 565         to->priority = from->priority;
 566         to->protocol = from->protocol;
 567         dst_release(to->dst);
 568         to->dst = dst_clone(from->dst);
 569         to->dev = from->dev;
 570         to->mark = from->mark;
 571
 572 #ifdef CONFIG_NET_SCHED
 573         to->tc_index = from->tc_index;
 574 #endif
 575         nf_copy(to, from);
 576 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
 577     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 578         to->nf_trace = from->nf_trace;
 579 #endif
 580         skb_copy_secmark(to, from);
 581 }
 582
 583 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 584 {
 585         u16 offset = sizeof(struct ipv6hdr);
 586         struct ipv6_opt_hdr *exthdr =
 587                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
 588         unsigned int packet_len = skb->tail - skb->network_header;
 589         int found_rhdr = 0;
 590         *nexthdr = &ipv6_hdr(skb)->nexthdr;
 591
 592         while (offset + 1 <= packet_len) {
 593
 594                 switch (**nexthdr) {
 595
 596                 case NEXTHDR_HOP:
 597                         break;
 598                 case NEXTHDR_ROUTING:
 599                         found_rhdr = 1;
 600                         break;
 601                 case NEXTHDR_DEST:
 602 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 603                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
 604                                 break;
 605 #endif
 606                         if (found_rhdr)
 607                                 return offset;
 608                         break;
 609                 default :
 610                         return offset;
 611                 }
 612
 613                 offset += ipv6_optlen(exthdr);
 614                 *nexthdr = &exthdr->nexthdr;
 615                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
 616                                                  offset);
 617         }
 618
 619         return offset;
 620 }
 621
 622 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 623 {
 624         struct sk_buff *frag;
 625         struct rt6_info *rt = (struct rt6_info*)skb->dst;
 626         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 627         struct ipv6hdr *tmp_hdr;
 628         struct frag_hdr *fh;
 629         unsigned int mtu, hlen, left, len;
 630         __be32 frag_id = 0;
 631         int ptr, offset = 0, err=0;
 632         u8 *prevhdr, nexthdr = 0;
 633         struct net *net = dev_net(skb->dst->dev);
 634
 635         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 636         nexthdr = *prevhdr;
 637
 638         mtu = ip6_skb_dst_mtu(skb);
 639
 640         /* We must not fragment if the socket is set to force MTU discovery
 641          * or if the skb it not generated by a local socket.  (This last
 642          * check should be redundant, but it's free.)
 643          */
 644         if (!skb->local_df) {
 645                 skb->dev = skb->dst->dev;
 646                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
 647                 IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
 648                               IPSTATS_MIB_FRAGFAILS);
 649                 kfree_skb(skb);
 650                 return -EMSGSIZE;
 651         }
 652
 653         if (np && np->frag_size < mtu) {
 654                 if (np->frag_size)
 655                         mtu = np->frag_size;
 656         }
 657         mtu -= hlen + sizeof(struct frag_hdr);
 658
 659         if (skb_shinfo(skb)->frag_list) {
 660                 int first_len = skb_pagelen(skb);
 661                 int truesizes = 0;
 662
 663                 if (first_len - hlen > mtu ||
 664                     ((first_len - hlen) & 7) ||
 665                     skb_cloned(skb))
 666                         goto slow_path;
 667
 668                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 669                         /* Correct geometry. */
 670                         if (frag->len > mtu ||
 671                             ((frag->len & 7) && frag->next) ||
 672                             skb_headroom(frag) < hlen)
 673                             goto slow_path;
 674
 675                         /* Partially cloned skb? */
 676                         if (skb_shared(frag))
 677                                 goto slow_path;
 678
 679                         BUG_ON(frag->sk);
 680                         if (skb->sk) {
 681                                 sock_hold(skb->sk);
 682                                 frag->sk = skb->sk;
 683                                 frag->destructor = sock_wfree;
 684                                 truesizes += frag->truesize;
 685                         }
 686                 }
 687
 688                 err = 0;
 689                 offset = 0;
 690                 frag = skb_shinfo(skb)->frag_list;
 691                 skb_shinfo(skb)->frag_list = NULL;
 692                 /* BUILD HEADER */
 693
 694                 *prevhdr = NEXTHDR_FRAGMENT;
 695                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 696                 if (!tmp_hdr) {
 697                         IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
 698                                       IPSTATS_MIB_FRAGFAILS);
 699                         return -ENOMEM;
 700                 }
 701
 702                 __skb_pull(skb, hlen);
 703                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 704                 __skb_push(skb, hlen);
 705                 skb_reset_network_header(skb);
 706                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 707
 708                 ipv6_select_ident(skb, fh);
 709                 fh->nexthdr = nexthdr;
 710                 fh->reserved = 0;
 711                 fh->frag_off = htons(IP6_MF);
 712                 frag_id = fh->identification;
 713
 714                 first_len = skb_pagelen(skb);
 715                 skb->data_len = first_len - skb_headlen(skb);
 716                 skb->truesize -= truesizes;
 717                 skb->len = first_len;
 718                 ipv6_hdr(skb)->payload_len = htons(first_len -
 719                                                    sizeof(struct ipv6hdr));
 720
 721                 dst_hold(&rt->u.dst);
 722
 723                 for (;;) {
 724                         /* Prepare header of the next frame,
 725                          * before previous one went down. */
 726                         if (frag) {
 727                                 frag->ip_summed = CHECKSUM_NONE;
 728                                 skb_reset_transport_header(frag);
 729                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
 730                                 __skb_push(frag, hlen);
 731                                 skb_reset_network_header(frag);
 732                                 memcpy(skb_network_header(frag), tmp_hdr,
 733                                        hlen);
 734                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 735                                 fh->nexthdr = nexthdr;
 736                                 fh->reserved = 0;
 737                                 fh->frag_off = htons(offset);
 738                                 if (frag->next != NULL)
 739                                         fh->frag_off |= htons(IP6_MF);
 740                                 fh->identification = frag_id;
 741                                 ipv6_hdr(frag)->payload_len =
 742                                                 htons(frag->len -
 743                                                       sizeof(struct ipv6hdr));
 744                                 ip6_copy_metadata(frag, skb);
 745                         }
 746
 747                         err = output(skb);
 748                         if(!err)
 749                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
 750                                               IPSTATS_MIB_FRAGCREATES);
 751
 752                         if (err || !frag)
 753                                 break;
 754
 755                         skb = frag;
 756                         frag = skb->next;
 757                         skb->next = NULL;
 758                 }
 759
 760                 kfree(tmp_hdr);
 761
 762                 if (err == 0) {
 763                         IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
 764                                       IPSTATS_MIB_FRAGOKS);
 765                         dst_release(&rt->u.dst);
 766                         return 0;
 767                 }
 768
 769                 while (frag) {
 770                         skb = frag->next;
 771                         kfree_skb(frag);
 772                         frag = skb;
 773                 }
 774
 775                 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
 776                               IPSTATS_MIB_FRAGFAILS);
 777                 dst_release(&rt->u.dst);
 778                 return err;
 779         }
 780
 781 slow_path:
 782         left = skb->len - hlen;         /* Space per frame */
 783         ptr = hlen;                     /* Where to start from */
 784
 785         /*
 786          *      Fragment the datagram.
 787          */
 788
 789         *prevhdr = NEXTHDR_FRAGMENT;
 790
 791         /*
 792          *      Keep copying data until we run out.
 793          */
 794         while(left > 0) {
 795                 len = left;
 796                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 797                 if (len > mtu)
 798                         len = mtu;
 799                 /* IF: we are not sending upto and including the packet end
 800                    then align the next start on an eight byte boundary */
 801                 if (len < left) {
 802                         len &= ~7;
 803                 }
 804                 /*
 805                  *      Allocate buffer.
 806                  */
 807
 808                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
 809                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
 810                         IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
 811                                       IPSTATS_MIB_FRAGFAILS);
 812                         err = -ENOMEM;
 813                         goto fail;
 814                 }
 815
 816                 /*
 817                  *      Set up data on packet
 818                  */
 819
 820                 ip6_copy_metadata(frag, skb);
 821                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
 822                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 823                 skb_reset_network_header(frag);
 824                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 825                 frag->transport_header = (frag->network_header + hlen +
 826                                           sizeof(struct frag_hdr));
 827
 828                 /*
 829                  *      Charge the memory for the fragment to any owner
 830                  *      it might possess
 831                  */
 832                 if (skb->sk)
 833                         skb_set_owner_w(frag, skb->sk);
 834
 835                 /*
 836                  *      Copy the packet header into the new buffer.
 837                  */
 838                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 839
 840                 /*
 841                  *      Build fragment header.
 842                  */
 843                 fh->nexthdr = nexthdr;
 844                 fh->reserved = 0;
 845                 if (!frag_id) {
 846                         ipv6_select_ident(skb, fh);
 847                         frag_id = fh->identification;
 848                 } else
 849                         fh->identification = frag_id;
 850
 851                 /*
 852                  *      Copy a block of the IP datagram.
 853                  */
 854                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
 855                         BUG();
 856                 left -= len;
 857
 858                 fh->frag_off = htons(offset);
 859                 if (left > 0)
 860                         fh->frag_off |= htons(IP6_MF);
 861                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 862                                                     sizeof(struct ipv6hdr));
 863
 864                 ptr += len;
 865                 offset += len;
 866
 867                 /*
 868                  *      Put this fragment into the sending queue.
 869                  */
 870                 err = output(frag);
 871                 if (err)
 872                         goto fail;
 873
 874                 IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
 875                               IPSTATS_MIB_FRAGCREATES);
 876         }
 877         IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
 878                       IPSTATS_MIB_FRAGOKS);
 879         kfree_skb(skb);
 880         return err;
 881
 882 fail:
 883         IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
 884                       IPSTATS_MIB_FRAGFAILS);
 885         kfree_skb(skb);
 886         return err;
 887 }
 888
 889 static inline int ip6_rt_check(struct rt6key *rt_key,
 890                                struct in6_addr *fl_addr,
 891                                struct in6_addr *addr_cache)
 892 {
 893         return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 894                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
 895 }
 896
 897 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 898                                           struct dst_entry *dst,
 899                                           struct flowi *fl)
 900 {
 901         struct ipv6_pinfo *np = inet6_sk(sk);
 902         struct rt6_info *rt = (struct rt6_info *)dst;
 903
 904         if (!dst)
 905                 goto out;
 906
 907         /* Yes, checking route validity in not connected
 908          * case is not very simple. Take into account,
 909          * that we do not support routing by source, TOS,
 910          * and MSG_DONTROUTE            --ANK (980726)
 911          *
 912          * 1. ip6_rt_check(): If route was host route,
 913          *    check that cached destination is current.
 914          *    If it is network route, we still may
 915          *    check its validity using saved pointer
 916          *    to the last used address: daddr_cache.
 917          *    We do not want to save whole address now,
 918          *    (because main consumer of this service
 919          *    is tcp, which has not this problem),
 920          *    so that the last trick works only on connected
 921          *    sockets.
 922          * 2. oif also should be the same.
 923          */
 924         if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
 925 #ifdef CONFIG_IPV6_SUBTREES
 926             ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
 927 #endif
 928             (fl->oif && fl->oif != dst->dev->ifindex)) {
 929                 dst_release(dst);
 930                 dst = NULL;
 931         }
 932
 933 out:
 934         return dst;
 935 }
 936
     /*
      *      ip6_dst_lookup_tail - common tail of the route lookup helpers
      *      @sk: socket used for the namespace and source-address preferences
      *      @dst: in/out route pointer; when *@dst is NULL on entry a route is
      *            looked up with ip6_route_output()
      *      @fl: flow; fl->fl6_src is filled in when it is the unspecified address
      *
      *      Returns 0 with *@dst set on success.  On failure the route is
      *      released, *@dst is set to NULL and the error is returned; the error
      *      is either the route's ->error value or the result of
      *      ipv6_dev_get_saddr().  IPSTATS_MIB_OUTNOROUTES is bumped when the
      *      error is -ENETUNREACH.
      *
      *      NOTE(review): sock_net(sk) is evaluated unconditionally below while
      *      the source-address selection guards against a NULL @sk -- confirm
      *      whether callers may really pass NULL.
      */
 937 static int ip6_dst_lookup_tail(struct sock *sk,
 938                                struct dst_entry **dst, struct flowi *fl)
 939 {
 940         int err;
 941         struct net *net = sock_net(sk);
 942
             /* No (valid) cached route was handed in: do a fresh lookup. */
 943         if (*dst == NULL)
 944                 *dst = ip6_route_output(net, sk, fl);
 945
             /* The looked-up entry carries its own error code; non-zero fails. */
 946         if ((err = (*dst)->error))
 947                 goto out_err_release;
 948
             /* Source still unspecified: pick one for the route's device. */
 949         if (ipv6_addr_any(&fl->fl6_src)) {
 950                 err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
 951                                          &fl->fl6_dst,
 952                                          sk ? inet6_sk(sk)->srcprefs : 0,
 953                                          &fl->fl6_src);
 954                 if (err)
 955                         goto out_err_release;
 956         }
 957
 958 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 959         /*
 960          * Here if the dst entry we've looked up
 961          * has a neighbour entry whose state is not
 962          * NUD_VALID and the src address from the flow is
 963          * marked as OPTIMISTIC, we release the found
 964          * dst entry and replace it instead with the
 965          * dst entry of the nexthop router
 966          */
 967         if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
 968                 struct inet6_ifaddr *ifp;
 969                 struct flowi fl_gw;
 970                 int redirect;
 971
 972                 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
 973                                       (*dst)->dev, 1);
 974
                     /* Sample the flag first, then drop the address reference. */
 975                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 976                 if (ifp)
 977                         in6_ifa_put(ifp);
 978
 979                 if (redirect) {
 980                         /*
 981                          * We need to get the dst entry for the
 982                          * default router instead
 983                          */
 984                         dst_release(*dst);
                             /* Same flow, but with an all-zero destination. */
 985                         memcpy(&fl_gw, fl, sizeof(struct flowi));
 986                         memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
 987                         *dst = ip6_route_output(net, sk, &fl_gw);
 988                         if ((err = (*dst)->error))
 989                                 goto out_err_release;
 990                 }
 991         }
 992 #endif
 993
 994         return 0;
 995
             /* Failure: account "no route", drop the reference, clear the result. */
 996 out_err_release:
 997         if (err == -ENETUNREACH)
 998                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
 999         dst_release(*dst);
1000         *dst = NULL;
1001         return err;
1002 }
1003
1004 /**
1005  *      ip6_dst_lookup - perform route lookup on flow
1006  *      @sk: socket which provides route info
1007  *      @dst: pointer to dst_entry * for result
1008  *      @fl: flow to lookup
1009  *
1010  *      This function performs a route lookup on the given flow.
1011  *
1012  *      It returns zero on success, or a standard errno code on error.
1013  */
1014 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1015 {
1016         *dst = NULL;
1017         return ip6_dst_lookup_tail(sk, dst, fl);
1018 }
1019 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1020
1021 /**
1022  *      ip6_sk_dst_lookup - perform socket cached route lookup on flow
1023  *      @sk: socket which provides the dst cache and route info
1024  *      @dst: pointer to dst_entry * for result
1025  *      @fl: flow to lookup
1026  *
1027  *      This function performs a route lookup on the given flow with the
1028  *      possibility of using the cached route in the socket if it is valid.
1029  *      It will take the socket dst lock when operating on the dst cache.
1030  *      As a result, this function can only be used in process context.
1031  *
1032  *      It returns zero on success, or a standard errno code on error.
1033  */
1034 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1035 {
1036         *dst = NULL;
1037         if (sk) {
1038                 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1039                 *dst = ip6_sk_dst_check(sk, *dst, fl);
1040         }
1041
1042         return ip6_dst_lookup_tail(sk, dst, fl);
1043 }
1044 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1045
1046 static inline int ip6_ufo_append_data(struct sock *sk,
1047                         int getfrag(void *from, char *to, int offset, int len,
1048                         int odd, struct sk_buff *skb),
1049                         void *from, int length, int hh_len, int fragheaderlen,
1050                         int transhdrlen, int mtu,unsigned int flags)
1051
1052 {
1053         struct sk_buff *skb;
1054         int err;
1055
1056         /* There is support for UDP large send offload by network
1057          * device, so create one single skb packet containing complete
1058          * udp datagram
1059          */
1060         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1061                 skb = sock_alloc_send_skb(sk,
1062                         hh_len + fragheaderlen + transhdrlen + 20,
1063                         (flags & MSG_DONTWAIT), &err);
1064                 if (skb == NULL)
1065                         return -ENOMEM;
1066
1067                 /* reserve space for Hardware header */
1068                 skb_reserve(skb, hh_len);
1069
1070                 /* create space for UDP/IP header */
1071                 skb_put(skb,fragheaderlen + transhdrlen);
1072
1073                 /* initialize network header pointer */
1074                 skb_reset_network_header(skb);
1075
1076                 /* initialize protocol header pointer */
1077                 skb->transport_header = skb->network_header + fragheaderlen;
1078
1079                 skb->ip_summed = CHECKSUM_PARTIAL;
1080                 skb->csum = 0;
1081                 sk->sk_sndmsg_off = 0;
1082         }
1083
1084         err = skb_append_datato_frags(sk,skb, getfrag, from,
1085                                       (length - transhdrlen));
1086         if (!err) {
1087                 struct frag_hdr fhdr;
1088
1089                 /* specify the length of each IP datagram fragment*/
1090                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
1091                                             sizeof(struct frag_hdr);
1092                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1093                 ipv6_select_ident(skb, &fhdr);
1094                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1095                 __skb_queue_tail(&sk->sk_write_queue, skb);
1096
1097                 return 0;
1098         }
1099         /* There is not enough support do UPD LSO,
1100          * so follow normal path
1101          */
1102         kfree_skb(skb);
1103
1104         return err;
1105 }
1106
     /*
      *      ip6_append_data - add data to the datagram pending on a socket
      *      @sk: socket whose sk_write_queue collects the pending skbs
      *      @getfrag: callback copying payload from @from into the skb; a
      *              negative return makes this function fail with -EFAULT
      *      @from: opaque cookie handed to @getfrag
      *      @length: bytes to add; on the first call this counts the
      *              @transhdrlen bytes of transport header as well
      *      @transhdrlen: transport header length (forced to 0 when the queue
      *              is not empty)
      *      @hlimit: hop limit, saved in the cork state on the first call
      *      @tclass: traffic class, saved in the cork state on the first call
      *      @opt: tx options, copied to np->cork.opt on the first call
      *      @fl: flow, copied to inet->cork.fl on the first call
      *      @rt: route; a reference is taken and stored in inet->cork.dst on
      *              the first call
      *      @flags: MSG_* flags; MSG_PROBE, MSG_MORE and MSG_DONTWAIT are examined
      *
      *      When the write queue is empty the cork state is initialised from
      *      the arguments.  Otherwise @rt, @fl, the MTU and (if IPCORK_OPT is
      *      set) @opt are taken from the cork state and the corresponding
      *      arguments are ignored.  Data is then placed into the tail skb or
      *      into newly allocated skbs sized by the fragment limit.
      *
      *      Returns 0 on success (and immediately for MSG_PROBE).  Errors seen
      *      in this function: -ENOBUFS, -EINVAL, -EMSGSIZE, -EFAULT, -ENOMEM, or
      *      whatever sock_alloc_send_skb() / ip6_ufo_append_data() report.
      *
      *      NOTE(review): nothing here undoes the cork setup or frees queued
      *      skbs on failure; presumably the caller flushes the pending frames.
      */
1107 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1108         int offset, int len, int odd, struct sk_buff *skb),
1109         void *from, int length, int transhdrlen,
1110         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1111         struct rt6_info *rt, unsigned int flags)
1112 {
1113         struct inet_sock *inet = inet_sk(sk);
1114         struct ipv6_pinfo *np = inet6_sk(sk);
1115         struct sk_buff *skb;
1116         unsigned int maxfraglen, fragheaderlen;
1117         int exthdrlen;
1118         int hh_len;
1119         int mtu;
1120         int copy;
1121         int err;
1122         int offset = 0;
1123         int csummode = CHECKSUM_NONE;
1124
1125         if (flags&MSG_PROBE)
1126                 return 0;
1127         if (skb_queue_empty(&sk->sk_write_queue)) {
1128                 /*
1129                  * setup for corking
1130                  */
1131                 if (opt) {
                             /* Reuse an existing cork buffer only if it is big enough. */
1132                         if (np->cork.opt == NULL) {
1133                                 np->cork.opt = kmalloc(opt->tot_len,
1134                                                        sk->sk_allocation);
1135                                 if (unlikely(np->cork.opt == NULL))
1136                                         return -ENOBUFS;
1137                         } else if (np->cork.opt->tot_len < opt->tot_len) {
1138                                 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
1139                                 return -EINVAL;
1140                         }
1141                         memcpy(np->cork.opt, opt, opt->tot_len);
1142                         inet->cork.flags |= IPCORK_OPT;
1143                         /* need source address above miyazawa*/
1144                 }
1145                 dst_hold(&rt->u.dst);
1146                 inet->cork.dst = &rt->u.dst;
1147                 inet->cork.fl = *fl;
1148                 np->cork.hop_limit = hlimit;
1149                 np->cork.tclass = tclass;
                     /* Device MTU when probing, path MTU otherwise. */
1150                 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1151                       rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
                     /* A non-zero np->frag_size below the MTU overrides it. */
1152                 if (np->frag_size < mtu) {
1153                         if (np->frag_size)
1154                                 mtu = np->frag_size;
1155                 }
1156                 inet->cork.fragsize = mtu;
1157                 if (dst_allfrag(rt->u.dst.path))
1158                         inet->cork.flags |= IPCORK_ALLFRAG;
1159                 inet->cork.length = 0;
1160                 sk->sk_sndmsg_page = NULL;
1161                 sk->sk_sndmsg_off = 0;
1162                 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1163                             rt->rt6i_nfheader_len;
1164                 length += exthdrlen;
1165                 transhdrlen += exthdrlen;
1166         } else {
                     /* Datagram already corked: take everything from the cork state. */
1167                 rt = (struct rt6_info *)inet->cork.dst;
1168                 fl = &inet->cork.fl;
1169                 if (inet->cork.flags & IPCORK_OPT)
1170                         opt = np->cork.opt;
1171                 transhdrlen = 0;
1172                 exthdrlen = 0;
1173                 mtu = inet->cork.fragsize;
1174         }
1175
1176         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1177
             /*
              * fragheaderlen: bytes in front of the fragmentable part.
              * maxfraglen: largest skb length that leaves room for a fragment
              * header and keeps the fragmentable part a multiple of 8 bytes.
              */
1178         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1179                         (opt ? opt->opt_nflen : 0);
1180         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1181
             /* Refuse datagrams that would exceed the maximum payload length. */
1182         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1183                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1184                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1185                         return -EMSGSIZE;
1186                 }
1187         }
1188
1189         /*
1190          * Let's try using as much space as possible.
1191          * Use MTU if total length of the message fits into the MTU.
1192          * Otherwise, we need to reserve fragment header and
1193          * fragment alignment (= 8-15 octets, in total).
1194          *
1195          * Note that we may need to "move" the data from the tail
1196          * of the buffer to the new fragment when we split
1197          * the message.
1198          *
1199          * FIXME: It may be fragmented into multiple chunks
1200          *        at once if non-fragmentable extension headers
1201          *        are too large.
1202          * --yoshfuji
1203          */
1204
1205         inet->cork.length += length;
             /* Oversized UDP datagram on a UFO-capable device: build one big skb. */
1206         if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1207             (rt->u.dst.dev->features & NETIF_F_UFO)) {
1208
1209                 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1210                                           fragheaderlen, transhdrlen, mtu,
1211                                           flags);
1212                 if (err)
1213                         goto error;
1214                 return 0;
1215         }
1216
             /* Empty queue: jump into the loop body to allocate the first skb. */
1217         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1218                 goto alloc_new_skb;
1219
1220         while (length > 0) {
1221                 /* Check if the remaining data fits into current packet. */
1222                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1223                 if (copy < length)
1224                         copy = maxfraglen - skb->len;
1225
1226                 if (copy <= 0) {
1227                         char *data;
1228                         unsigned int datalen;
1229                         unsigned int fraglen;
1230                         unsigned int fraggap;
1231                         unsigned int alloclen;
1232                         struct sk_buff *skb_prev;
1233 alloc_new_skb:
1234                         skb_prev = skb;
1235
1236                         /* There's no room in the current skb */
                             /* fraggap: bytes of the previous skb beyond maxfraglen. */
1237                         if (skb_prev)
1238                                 fraggap = skb_prev->len - maxfraglen;
1239                         else
1240                                 fraggap = 0;
1241
1242                         /*
1243                          * If remaining data exceeds the mtu,
1244                          * we know we need more fragment(s).
1245                          */
1246                         datalen = length + fraggap;
1247                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1248                                 datalen = maxfraglen - fragheaderlen;
1249
1250                         fraglen = datalen + fragheaderlen;
                             /* More data expected and no scatter-gather: allocate a full MTU. */
1251                         if ((flags & MSG_MORE) &&
1252                             !(rt->u.dst.dev->features&NETIF_F_SG))
1253                                 alloclen = mtu;
1254                         else
1255                                 alloclen = datalen + fragheaderlen;
1256
1257                         /*
1258                          * The last fragment gets additional space at tail.
1259                          * Note: we overallocate on fragments with MSG_MORE
1260                          * because we have no idea if we're the last one.
1261                          */
1262                         if (datalen == length + fraggap)
1263                                 alloclen += rt->u.dst.trailer_len;
1264
1265                         /*
1266                          * We just reserve space for fragment header.
1267                          * Note: this may be overallocation if the message
1268                          * (without MSG_MORE) fits into the MTU.
1269                          */
1270                         alloclen += sizeof(struct frag_hdr);
1271
                             /*
                              * First skb (carries the transport header): use
                              * sock_alloc_send_skb().  Later skbs use
                              * sock_wmalloc(), and only while sk_wmem_alloc is
                              * at most twice sk_sndbuf.
                              */
1272                         if (transhdrlen) {
1273                                 skb = sock_alloc_send_skb(sk,
1274                                                 alloclen + hh_len,
1275                                                 (flags & MSG_DONTWAIT), &err);
1276                         } else {
1277                                 skb = NULL;
1278                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1279                                     2 * sk->sk_sndbuf)
1280                                         skb = sock_wmalloc(sk,
1281                                                            alloclen + hh_len, 1,
1282                                                            sk->sk_allocation);
1283                                 if (unlikely(skb == NULL))
1284                                         err = -ENOBUFS;
1285                         }
1286                         if (skb == NULL)
1287                                 goto error;
1288                         /*
1289                          *      Fill in the control structures
1290                          */
1291                         skb->ip_summed = csummode;
1292                         skb->csum = 0;
1293                         /* reserve for fragmentation */
1294                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1295
1296                         /*
1297                          *      Find where to start putting bytes
1298                          */
1299                         data = skb_put(skb, fraglen);
1300                         skb_set_network_header(skb, exthdrlen);
1301                         data += fragheaderlen;
1302                         skb->transport_header = (skb->network_header +
1303                                                  fragheaderlen);
                             /*
                              * Move the excess tail of the previous skb into the
                              * new one, fix up both checksums and trim the old skb.
                              */
1304                         if (fraggap) {
1305                                 skb->csum = skb_copy_and_csum_bits(
1306                                         skb_prev, maxfraglen,
1307                                         data + transhdrlen, fraggap, 0);
1308                                 skb_prev->csum = csum_sub(skb_prev->csum,
1309                                                           skb->csum);
1310                                 data += fraggap;
1311                                 pskb_trim_unique(skb_prev, maxfraglen);
1312                         }
                             /* What is left of datalen is fetched from the caller. */
1313                         copy = datalen - transhdrlen - fraggap;
1314                         if (copy < 0) {
1315                                 err = -EINVAL;
1316                                 kfree_skb(skb);
1317                                 goto error;
1318                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1319                                 err = -EFAULT;
1320                                 kfree_skb(skb);
1321                                 goto error;
1322                         }
1323
1324                         offset += copy;
1325                         length -= datalen - fraggap;
                             /* Headers are only accounted for in the first skb. */
1326                         transhdrlen = 0;
1327                         exthdrlen = 0;
1328                         csummode = CHECKSUM_NONE;
1329
1330                         /*
1331                          * Put the packet on the pending queue
1332                          */
1333                         __skb_queue_tail(&sk->sk_write_queue, skb);
1334                         continue;
1335                 }
1336
1337                 if (copy > length)
1338                         copy = length;
1339
                     /* No scatter-gather: copy into the linear area of the tail skb. */
1340                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1341                         unsigned int off;
1342
1343                         off = skb->len;
1344                         if (getfrag(from, skb_put(skb, copy),
1345                                                 offset, copy, off, skb) < 0) {
1346                                 __skb_trim(skb, off);
1347                                 err = -EFAULT;
1348                                 goto error;
1349                         }
1350                 } else {
                             /*
                              * Scatter-gather: copy into the socket's current send
                              * page, attaching it (or a newly allocated page) as a
                              * page fragment of the skb.
                              *
                              * NOTE(review): with nr_frags == 0, frag points one
                              * element before frags[] and is read by the
                              * page != frag->page test below -- confirm intended.
                              */
1351                         int i = skb_shinfo(skb)->nr_frags;
1352                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1353                         struct page *page = sk->sk_sndmsg_page;
1354                         int off = sk->sk_sndmsg_off;
1355                         unsigned int left;
1356
1357                         if (page && (left = PAGE_SIZE - off) > 0) {
1358                                 if (copy >= left)
1359                                         copy = left;
1360                                 if (page != frag->page) {
1361                                         if (i == MAX_SKB_FRAGS) {
1362                                                 err = -EMSGSIZE;
1363                                                 goto error;
1364                                         }
1365                                         get_page(page);
1366                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1367                                         frag = &skb_shinfo(skb)->frags[i];
1368                                 }
1369                         } else if(i < MAX_SKB_FRAGS) {
1370                                 if (copy > PAGE_SIZE)
1371                                         copy = PAGE_SIZE;
1372                                 page = alloc_pages(sk->sk_allocation, 0);
1373                                 if (page == NULL) {
1374                                         err = -ENOMEM;
1375                                         goto error;
1376                                 }
1377                                 sk->sk_sndmsg_page = page;
1378                                 sk->sk_sndmsg_off = 0;
1379
1380                                 skb_fill_page_desc(skb, i, page, 0, 0);
1381                                 frag = &skb_shinfo(skb)->frags[i];
1382                         } else {
1383                                 err = -EMSGSIZE;
1384                                 goto error;
1385                         }
1386                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1387                                 err = -EFAULT;
1388                                 goto error;
1389                         }
                             /* Account the copied bytes in the page, the skb and the socket. */
1390                         sk->sk_sndmsg_off += copy;
1391                         frag->size += copy;
1392                         skb->len += copy;
1393                         skb->data_len += copy;
1394                         skb->truesize += copy;
1395                         atomic_add(copy, &sk->sk_wmem_alloc);
1396                 }
1397                 offset += copy;
1398                 length -= copy;
1399         }
1400         return 0;
             /* Take the bytes that were not queued back out of the corked length. */
1401 error:
1402         inet->cork.length -= length;
1403         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1404         return err;
1405 }
1406
1407 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1408 {
1409         inet->cork.flags &= ~IPCORK_OPT;
1410         kfree(np->cork.opt);
1411         np->cork.opt = NULL;
1412         if (inet->cork.dst) {
1413                 dst_release(inet->cork.dst);
1414                 inet->cork.dst = NULL;
1415                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1416         }
1417         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1418 }
1419
     /*
      *      ip6_push_pending_frames - build and send the datagram queued on @sk
      *      @sk: socket whose sk_write_queue holds the pending skbs
      *
      *      Takes all skbs off the write queue, hangs the second and later ones
      *      on the first skb's frag_list, prepends the extension headers from
      *      the corked options and the IPv6 header built from the cork state,
      *      and passes the result to ip6_local_out().
      *
      *      Returns 0 on success or when the queue was empty.  A negative error
      *      from ip6_local_out() is returned as is; a positive one is mapped
      *      with net_xmit_errno() only when np->recverr is set and is otherwise
      *      reported as 0.  The cork state is released on every path.
      */
1420 int ip6_push_pending_frames(struct sock *sk)
1421 {
1422         struct sk_buff *skb, *tmp_skb;
1423         struct sk_buff **tail_skb;
1424         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1425         struct inet_sock *inet = inet_sk(sk);
1426         struct ipv6_pinfo *np = inet6_sk(sk);
1427         struct net *net = sock_net(sk);
1428         struct ipv6hdr *hdr;
1429         struct ipv6_txoptions *opt = np->cork.opt;
1430         struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1431         struct flowi *fl = &inet->cork.fl;
1432         unsigned char proto = fl->proto;
1433         int err = 0;
1434
             /* Nothing queued: just release the cork state. */
1435         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1436                 goto out;
1437         tail_skb = &(skb_shinfo(skb)->frag_list);
1438
1439         /* move skb->data to ip header from ext header */
1440         if (skb->data < skb_network_header(skb))
1441                 __skb_pull(skb, skb_network_offset(skb));
             /*
              * Chain the remaining skbs behind the head skb.  Each one is
              * pulled by the head skb's network header length, its sizes are
              * added to the head, and its socket ownership is dropped.
              */
1442         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1443                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1444                 *tail_skb = tmp_skb;
1445                 tail_skb = &(tmp_skb->next);
1446                 skb->len += tmp_skb->len;
1447                 skb->data_len += tmp_skb->len;
1448                 skb->truesize += tmp_skb->truesize;
1449                 __sock_put(tmp_skb->sk);
1450                 tmp_skb->destructor = NULL;
1451                 tmp_skb->sk = NULL;
1452         }
1453
1454         /* Allow local fragmentation. */
1455         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1456                 skb->local_df = 1;
1457
             /*
              * Start from the flow destination; ipv6_push_nfrag_opts() receives
              * &final_dst and may change which address ends up in the header.
              */
1458         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1459         __skb_pull(skb, skb_network_header_len(skb));
1460         if (opt && opt->opt_flen)
1461                 ipv6_push_frag_opts(skb, opt, &proto);
1462         if (opt && opt->opt_nflen)
1463                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1464
1465         skb_push(skb, sizeof(struct ipv6hdr));
1466         skb_reset_network_header(skb);
1467         hdr = ipv6_hdr(skb);
1468
             /* First 32 bits: version 6, corked traffic class, flow label. */
1469         *(__be32*)hdr = fl->fl6_flowlabel |
1470                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1471
1472         hdr->hop_limit = np->cork.hop_limit;
1473         hdr->nexthdr = proto;
1474         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1475         ipv6_addr_copy(&hdr->daddr, final_dst);
1476
1477         skb->priority = sk->sk_priority;
1478         skb->mark = sk->sk_mark;
1479
             /* The skb gets its own route reference; the cork's is dropped at out. */
1480         skb->dst = dst_clone(&rt->u.dst);
1481         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1482         if (proto == IPPROTO_ICMPV6) {
1483                 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
1484
1485                 ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
1486                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1487         }
1488
1489         err = ip6_local_out(skb);
1490         if (err) {
                     /* Positive codes only become an errno if recverr is set. */
1491                 if (err > 0)
1492                         err = np->recverr ? net_xmit_errno(err) : 0;
1493                 if (err)
1494                         goto error;
1495         }
1496
1497 out:
1498         ip6_cork_release(inet, np);
1499         return err;
             /* The error path does nothing extra; it shares the cleanup at out. */
1500 error:
1501         goto out;
1502 }
1503
1504 void ip6_flush_pending_frames(struct sock *sk)
1505 {
1506         struct sk_buff *skb;
1507
1508         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1509                 if (skb->dst)
1510                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb->dst),
1511                                       IPSTATS_MIB_OUTDISCARDS);
1512                 kfree_skb(skb);
1513         }
1514
1515         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1516 }