net/ipv6/ip6_output.c
1 /*
2 * IPv6 output functions
3 * Linux INET6 implementation
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * Based on linux/net/ipv4/ip_output.c
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * Changes:
16 * A.N.Kuznetsov : arithmetics in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
20 * etc.
21 *
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
26 * for datagram xmit
27 */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
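/*
 * __ip6_local_out() fixes up the IPv6 payload length and runs the
 * NF_INET_LOCAL_OUT netfilter hook; nf_hook() returns 1 when the hooks
 * accept the packet without stealing it, which is why ip6_local_out()
 * below only calls dst_output() on that return value.
 */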
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63 int len;
64
65 len = skb->len - sizeof(struct ipv6hdr);
66 if (len > IPV6_MAXPLEN)
67 len = 0;
68 ipv6_hdr(skb)->payload_len = htons(len);
69
70 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 skb_dst(skb)->dev, dst_output);
72 }
73
74 int ip6_local_out(struct sk_buff *skb)
75 {
76 int err;
77
78 err = __ip6_local_out(skb);
79 if (likely(err == 1))
80 err = dst_output(skb);
81
82 return err;
83 }
84 EXPORT_SYMBOL_GPL(ip6_local_out);
85
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89 skb_reset_mac_header(newskb);
90 __skb_pull(newskb, skb_network_offset(newskb));
91 newskb->pkt_type = PACKET_LOOPBACK;
92 newskb->ip_summed = CHECKSUM_UNNECESSARY;
93 WARN_ON(!skb_dst(newskb));
94
95 netif_rx_ni(newskb);
96 return 0;
97 }
98
99 static int ip6_finish_output2(struct sk_buff *skb)
100 {
101 struct dst_entry *dst = skb_dst(skb);
102 struct net_device *dev = dst->dev;
103 struct neighbour *neigh;
104
105 skb->protocol = htons(ETH_P_IPV6);
106 skb->dev = dev;
107
108 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
109 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
110
111 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
112 ((mroute6_socket(dev_net(dev), skb) &&
113 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
114 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
115 &ipv6_hdr(skb)->saddr))) {
116 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
117
118 /* Do not check for IFF_ALLMULTI; multicast routing
119 is not supported in any case.
120 */
121 if (newskb)
122 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
123 newskb, NULL, newskb->dev,
124 ip6_dev_loopback_xmit);
125
126 if (ipv6_hdr(skb)->hop_limit == 0) {
127 IP6_INC_STATS(dev_net(dev), idev,
128 IPSTATS_MIB_OUTDISCARDS);
129 kfree_skb(skb);
130 return 0;
131 }
132 }
133
134 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
135 skb->len);
136 }
137
138 rcu_read_lock();
139 neigh = dst_get_neighbour_noref(dst);
140 if (neigh) {
141 int res = neigh_output(neigh, skb);
142
143 rcu_read_unlock();
144 return res;
145 }
146 rcu_read_unlock();
147 IP6_INC_STATS_BH(dev_net(dst->dev),
148 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
149 kfree_skb(skb);
150 return -EINVAL;
151 }
152
153 static int ip6_finish_output(struct sk_buff *skb)
154 {
155 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156 dst_allfrag(skb_dst(skb)))
157 return ip6_fragment(skb, ip6_finish_output2);
158 else
159 return ip6_finish_output2(skb);
160 }
161
162 int ip6_output(struct sk_buff *skb)
163 {
164 struct net_device *dev = skb_dst(skb)->dev;
165 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
166 if (unlikely(idev->cnf.disable_ipv6)) {
167 IP6_INC_STATS(dev_net(dev), idev,
168 IPSTATS_MIB_OUTDISCARDS);
169 kfree_skb(skb);
170 return 0;
171 }
172
173 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
174 ip6_finish_output,
175 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
176 }
177
178 /*
179 * xmit an sk_buff (used by TCP, SCTP and DCCP)
180 */
181
182 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
183 struct ipv6_txoptions *opt, int tclass)
184 {
185 struct net *net = sock_net(sk);
186 struct ipv6_pinfo *np = inet6_sk(sk);
187 struct in6_addr *first_hop = &fl6->daddr;
188 struct dst_entry *dst = skb_dst(skb);
189 struct ipv6hdr *hdr;
190 u8 proto = fl6->flowi6_proto;
191 int seg_len = skb->len;
192 int hlimit = -1;
193 u32 mtu;
194
195 if (opt) {
196 unsigned int head_room;
197
198 /* First: exthdrs may take lots of space (~8K for now)
199 MAX_HEADER is not enough.
200 */
201 head_room = opt->opt_nflen + opt->opt_flen;
202 seg_len += head_room;
203 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
204
205 if (skb_headroom(skb) < head_room) {
206 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
207 if (skb2 == NULL) {
208 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
209 IPSTATS_MIB_OUTDISCARDS);
210 kfree_skb(skb);
211 return -ENOBUFS;
212 }
213 consume_skb(skb);
214 skb = skb2;
215 skb_set_owner_w(skb, sk);
216 }
217 if (opt->opt_flen)
218 ipv6_push_frag_opts(skb, opt, &proto);
219 if (opt->opt_nflen)
220 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
221 }
222
223 skb_push(skb, sizeof(struct ipv6hdr));
224 skb_reset_network_header(skb);
225 hdr = ipv6_hdr(skb);
226
227 /*
228 * Fill in the IPv6 header
229 */
230 if (np)
231 hlimit = np->hop_limit;
232 if (hlimit < 0)
233 hlimit = ip6_dst_hoplimit(dst);
234
235 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
236
237 hdr->payload_len = htons(seg_len);
238 hdr->nexthdr = proto;
239 hdr->hop_limit = hlimit;
240
241 hdr->saddr = fl6->saddr;
242 hdr->daddr = *first_hop;
243
244 skb->priority = sk->sk_priority;
245 skb->mark = sk->sk_mark;
246
247 mtu = dst_mtu(dst);
248 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
249 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
250 IPSTATS_MIB_OUT, skb->len);
251 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
252 dst->dev, dst_output);
253 }
254
255 net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
256 skb->dev = dst->dev;
257 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
258 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
259 kfree_skb(skb);
260 return -EMSGSIZE;
261 }
262
263 EXPORT_SYMBOL(ip6_xmit);
264
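/*
 * Usage sketch (illustrative only, not part of this file): a connected
 * transport such as TCP builds the segment, makes sure a route is
 * attached to the skb (skb_dst_set()), fills a flowi6 describing the
 * socket's flow and then calls
 *
 *	err = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
 *
 * where np is the socket's ipv6_pinfo; the caller shown here is an
 * assumption, only the signature matches ip6_xmit() above.
 */
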
265 /*
266 * To avoid extra problems ND packets are sent through this
267 * routine. It's code duplication but I really want to avoid
268 * extra checks since ipv6_build_header is used by TCP (which
269 * is for us performance critical)
270 */
271
272 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
273 const struct in6_addr *saddr, const struct in6_addr *daddr,
274 int proto, int len)
275 {
276 struct ipv6_pinfo *np = inet6_sk(sk);
277 struct ipv6hdr *hdr;
278
279 skb->protocol = htons(ETH_P_IPV6);
280 skb->dev = dev;
281
282 skb_reset_network_header(skb);
283 skb_put(skb, sizeof(struct ipv6hdr));
284 hdr = ipv6_hdr(skb);
285
286 *(__be32*)hdr = htonl(0x60000000);
287
288 hdr->payload_len = htons(len);
289 hdr->nexthdr = proto;
290 hdr->hop_limit = np->hop_limit;
291
292 hdr->saddr = *saddr;
293 hdr->daddr = *daddr;
294
295 return 0;
296 }
297
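/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * that registered interest in this alert value (ip6_ra_chain); returns
 * 1 when the packet has been consumed by such a socket, 0 otherwise.
 */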
298 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
299 {
300 struct ip6_ra_chain *ra;
301 struct sock *last = NULL;
302
303 read_lock(&ip6_ra_lock);
304 for (ra = ip6_ra_chain; ra; ra = ra->next) {
305 struct sock *sk = ra->sk;
306 if (sk && ra->sel == sel &&
307 (!sk->sk_bound_dev_if ||
308 sk->sk_bound_dev_if == skb->dev->ifindex)) {
309 if (last) {
310 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
311 if (skb2)
312 rawv6_rcv(last, skb2);
313 }
314 last = sk;
315 }
316 }
317
318 if (last) {
319 rawv6_rcv(last, skb);
320 read_unlock(&ip6_ra_lock);
321 return 1;
322 }
323 read_unlock(&ip6_ra_lock);
324 return 0;
325 }
326
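/*
 * Return value: 1 means the packet is a neighbour discovery message for
 * the proxied address and must be delivered locally, 0 means forward it
 * as usual, -1 means drop it (link-local destination that we cannot
 * proxy).
 */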
327 static int ip6_forward_proxy_check(struct sk_buff *skb)
328 {
329 struct ipv6hdr *hdr = ipv6_hdr(skb);
330 u8 nexthdr = hdr->nexthdr;
331 __be16 frag_off;
332 int offset;
333
334 if (ipv6_ext_hdr(nexthdr)) {
335 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
336 if (offset < 0)
337 return 0;
338 } else
339 offset = sizeof(struct ipv6hdr);
340
341 if (nexthdr == IPPROTO_ICMPV6) {
342 struct icmp6hdr *icmp6;
343
344 if (!pskb_may_pull(skb, (skb_network_header(skb) +
345 offset + 1 - skb->data)))
346 return 0;
347
348 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
349
350 switch (icmp6->icmp6_type) {
351 case NDISC_ROUTER_SOLICITATION:
352 case NDISC_ROUTER_ADVERTISEMENT:
353 case NDISC_NEIGHBOUR_SOLICITATION:
354 case NDISC_NEIGHBOUR_ADVERTISEMENT:
355 case NDISC_REDIRECT:
356 /* A unicast neighbour discovery message destined
357 * to the proxied address is passed to the input
358 * function so that proxying can react to it.
359 */
360 return 1;
361 default:
362 break;
363 }
364 }
365
366 /*
367 * The proxying router can't forward traffic sent to a link-local
368 * address, so signal the sender and discard the packet. This
369 * behavior is clarified by the MIPv6 specification.
370 */
371 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
372 dst_link_failure(skb);
373 return -1;
374 }
375
376 return 0;
377 }
378
379 static inline int ip6_forward_finish(struct sk_buff *skb)
380 {
381 return dst_output(skb);
382 }
383
384 int ip6_forward(struct sk_buff *skb)
385 {
386 struct dst_entry *dst = skb_dst(skb);
387 struct ipv6hdr *hdr = ipv6_hdr(skb);
388 struct inet6_skb_parm *opt = IP6CB(skb);
389 struct net *net = dev_net(dst->dev);
390 u32 mtu;
391
392 if (net->ipv6.devconf_all->forwarding == 0)
393 goto error;
394
395 if (skb_warn_if_lro(skb))
396 goto drop;
397
398 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
399 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
400 goto drop;
401 }
402
403 if (skb->pkt_type != PACKET_HOST)
404 goto drop;
405
406 skb_forward_csum(skb);
407
408 /*
409 * We DO NOT make any processing on
410 * RA packets, pushing them to user level AS IS
411 * without any WARRANTY that the application will be able
412 * to interpret them. The reason is that we
413 * cannot make anything clever here.
414 *
415 * We are not end-node, so that if packet contains
416 * AH/ESP, we cannot make anything.
417 * Defragmentation also would be a mistake; RA packets
418 * cannot be fragmented, because there is no guarantee
419 * that different fragments will go along one path. --ANK
420 */
421 if (opt->ra) {
422 u8 *ptr = skb_network_header(skb) + opt->ra;
423 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
424 return 0;
425 }
426
427 /*
428 * check and decrement ttl
429 */
430 if (hdr->hop_limit <= 1) {
431 /* Force OUTPUT device used as source address */
432 skb->dev = dst->dev;
433 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
434 IP6_INC_STATS_BH(net,
435 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
436
437 kfree_skb(skb);
438 return -ETIMEDOUT;
439 }
440
441 /* XXX: idev->cnf.proxy_ndp? */
442 if (net->ipv6.devconf_all->proxy_ndp &&
443 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
444 int proxied = ip6_forward_proxy_check(skb);
445 if (proxied > 0)
446 return ip6_input(skb);
447 else if (proxied < 0) {
448 IP6_INC_STATS(net, ip6_dst_idev(dst),
449 IPSTATS_MIB_INDISCARDS);
450 goto drop;
451 }
452 }
453
454 if (!xfrm6_route_forward(skb)) {
455 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
456 goto drop;
457 }
458 dst = skb_dst(skb);
459
460 /* IPv6 specs say nothing about it, but it is clear that we cannot
461 send redirects to source routed frames.
462 We don't send redirects to frames decapsulated from IPsec.
463 */
464 if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
465 struct in6_addr *target = NULL;
466 struct inet_peer *peer;
467 struct rt6_info *rt;
468
469 /*
470 * incoming and outgoing devices are the same
471 * send a redirect.
472 */
473
474 rt = (struct rt6_info *) dst;
475 if (rt->rt6i_flags & RTF_GATEWAY)
476 target = &rt->rt6i_gateway;
477 else
478 target = &hdr->daddr;
479
480 peer = rt6_get_peer_create(rt);
481
482 /* Limit redirects both by destination (here)
483 and by source (inside ndisc_send_redirect)
484 */
485 if (inet_peer_xrlim_allow(peer, 1*HZ))
486 ndisc_send_redirect(skb, target);
487 } else {
488 int addrtype = ipv6_addr_type(&hdr->saddr);
489
490 /* This check is security critical. */
491 if (addrtype == IPV6_ADDR_ANY ||
492 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
493 goto error;
494 if (addrtype & IPV6_ADDR_LINKLOCAL) {
495 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
496 ICMPV6_NOT_NEIGHBOUR, 0);
497 goto error;
498 }
499 }
500
501 mtu = dst_mtu(dst);
502 if (mtu < IPV6_MIN_MTU)
503 mtu = IPV6_MIN_MTU;
504
505 if (skb->len > mtu && !skb_is_gso(skb)) {
506 /* Again, force OUTPUT device used as source address */
507 skb->dev = dst->dev;
508 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
509 IP6_INC_STATS_BH(net,
510 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
511 IP6_INC_STATS_BH(net,
512 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
513 kfree_skb(skb);
514 return -EMSGSIZE;
515 }
516
517 if (skb_cow(skb, dst->dev->hard_header_len)) {
518 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
519 goto drop;
520 }
521
522 hdr = ipv6_hdr(skb);
523
524 /* Mangling hops number delayed to point after skb COW */
525
526 hdr->hop_limit--;
527
528 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
529 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
530 ip6_forward_finish);
531
532 error:
533 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
534 drop:
535 kfree_skb(skb);
536 return -EINVAL;
537 }
538
539 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
540 {
541 to->pkt_type = from->pkt_type;
542 to->priority = from->priority;
543 to->protocol = from->protocol;
544 skb_dst_drop(to);
545 skb_dst_set(to, dst_clone(skb_dst(from)));
546 to->dev = from->dev;
547 to->mark = from->mark;
548
549 #ifdef CONFIG_NET_SCHED
550 to->tc_index = from->tc_index;
551 #endif
552 nf_copy(to, from);
553 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
554 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
555 to->nf_trace = from->nf_trace;
556 #endif
557 skb_copy_secmark(to, from);
558 }
559
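/*
 * Walk the unfragmentable part of the packet (hop-by-hop, routing and,
 * with MIPv6, a destination options header carrying a home address
 * option) and return the offset at which a fragment header has to be
 * inserted; *nexthdr is left pointing at the "next header" byte that
 * the caller patches to NEXTHDR_FRAGMENT.
 */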
560 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
561 {
562 u16 offset = sizeof(struct ipv6hdr);
563 struct ipv6_opt_hdr *exthdr =
564 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
565 unsigned int packet_len = skb->tail - skb->network_header;
566 int found_rhdr = 0;
567 *nexthdr = &ipv6_hdr(skb)->nexthdr;
568
569 while (offset + 1 <= packet_len) {
570
571 switch (**nexthdr) {
572
573 case NEXTHDR_HOP:
574 break;
575 case NEXTHDR_ROUTING:
576 found_rhdr = 1;
577 break;
578 case NEXTHDR_DEST:
579 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
580 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
581 break;
582 #endif
583 if (found_rhdr)
584 return offset;
585 break;
586 default :
587 return offset;
588 }
589
590 offset += ipv6_optlen(exthdr);
591 *nexthdr = &exthdr->nexthdr;
592 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
593 offset);
594 }
595
596 return offset;
597 }
598
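/*
 * Choose the identification value for a fragment header: prefer the per
 * destination counter kept in the inet_peer when the route has one,
 * otherwise fall back to a global counter that never hands out 0.
 */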
599 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
600 {
601 static atomic_t ipv6_fragmentation_id;
602 int old, new;
603
604 if (rt && !(rt->dst.flags & DST_NOPEER)) {
605 struct inet_peer *peer = rt6_get_peer_create(rt);
606
607 if (peer) {
608 fhdr->identification = htonl(inet_getid(peer, 0));
609 return;
610 }
611 }
612 do {
613 old = atomic_read(&ipv6_fragmentation_id);
614 new = old + 1;
615 if (!new)
616 new = 1;
617 } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
618 fhdr->identification = htonl(new);
619 }
620
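/*
 * Two strategies are used below: when the skb already carries a
 * frag_list whose members have suitable geometry, each member is turned
 * into a fragment in place (fast path); otherwise the payload is copied
 * piece by piece into freshly allocated skbs (slow path).
 */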
621 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
622 {
623 struct sk_buff *frag;
624 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
625 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
626 struct ipv6hdr *tmp_hdr;
627 struct frag_hdr *fh;
628 unsigned int mtu, hlen, left, len;
629 int hroom, troom;
630 __be32 frag_id = 0;
631 int ptr, offset = 0, err=0;
632 u8 *prevhdr, nexthdr = 0;
633 struct net *net = dev_net(skb_dst(skb)->dev);
634
635 hlen = ip6_find_1stfragopt(skb, &prevhdr);
636 nexthdr = *prevhdr;
637
638 mtu = ip6_skb_dst_mtu(skb);
639
640 /* We must not fragment if the socket is set to force MTU discovery
641 * or if the skb is not generated by a local socket.
642 */
643 if (unlikely(!skb->local_df && skb->len > mtu)) {
644 if (skb->sk && dst_allfrag(skb_dst(skb)))
645 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
646
647 skb->dev = skb_dst(skb)->dev;
648 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
649 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
650 IPSTATS_MIB_FRAGFAILS);
651 kfree_skb(skb);
652 return -EMSGSIZE;
653 }
654
655 if (np && np->frag_size < mtu) {
656 if (np->frag_size)
657 mtu = np->frag_size;
658 }
659 mtu -= hlen + sizeof(struct frag_hdr);
660
661 if (skb_has_frag_list(skb)) {
662 int first_len = skb_pagelen(skb);
663 struct sk_buff *frag2;
664
665 if (first_len - hlen > mtu ||
666 ((first_len - hlen) & 7) ||
667 skb_cloned(skb))
668 goto slow_path;
669
670 skb_walk_frags(skb, frag) {
671 /* Correct geometry. */
672 if (frag->len > mtu ||
673 ((frag->len & 7) && frag->next) ||
674 skb_headroom(frag) < hlen)
675 goto slow_path_clean;
676
677 /* Partially cloned skb? */
678 if (skb_shared(frag))
679 goto slow_path_clean;
680
681 BUG_ON(frag->sk);
682 if (skb->sk) {
683 frag->sk = skb->sk;
684 frag->destructor = sock_wfree;
685 }
686 skb->truesize -= frag->truesize;
687 }
688
689 err = 0;
690 offset = 0;
691 frag = skb_shinfo(skb)->frag_list;
692 skb_frag_list_init(skb);
693 /* BUILD HEADER */
694
695 *prevhdr = NEXTHDR_FRAGMENT;
696 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
697 if (!tmp_hdr) {
698 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
699 IPSTATS_MIB_FRAGFAILS);
700 return -ENOMEM;
701 }
702
703 __skb_pull(skb, hlen);
704 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
705 __skb_push(skb, hlen);
706 skb_reset_network_header(skb);
707 memcpy(skb_network_header(skb), tmp_hdr, hlen);
708
709 ipv6_select_ident(fh, rt);
710 fh->nexthdr = nexthdr;
711 fh->reserved = 0;
712 fh->frag_off = htons(IP6_MF);
713 frag_id = fh->identification;
714
715 first_len = skb_pagelen(skb);
716 skb->data_len = first_len - skb_headlen(skb);
717 skb->len = first_len;
718 ipv6_hdr(skb)->payload_len = htons(first_len -
719 sizeof(struct ipv6hdr));
720
721 dst_hold(&rt->dst);
722
723 for (;;) {
724 /* Prepare header of the next frame,
725 * before previous one went down. */
726 if (frag) {
727 frag->ip_summed = CHECKSUM_NONE;
728 skb_reset_transport_header(frag);
729 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
730 __skb_push(frag, hlen);
731 skb_reset_network_header(frag);
732 memcpy(skb_network_header(frag), tmp_hdr,
733 hlen);
734 offset += skb->len - hlen - sizeof(struct frag_hdr);
735 fh->nexthdr = nexthdr;
736 fh->reserved = 0;
737 fh->frag_off = htons(offset);
738 if (frag->next != NULL)
739 fh->frag_off |= htons(IP6_MF);
740 fh->identification = frag_id;
741 ipv6_hdr(frag)->payload_len =
742 htons(frag->len -
743 sizeof(struct ipv6hdr));
744 ip6_copy_metadata(frag, skb);
745 }
746
747 err = output(skb);
748 if (!err)
749 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
750 IPSTATS_MIB_FRAGCREATES);
751
752 if (err || !frag)
753 break;
754
755 skb = frag;
756 frag = skb->next;
757 skb->next = NULL;
758 }
759
760 kfree(tmp_hdr);
761
762 if (err == 0) {
763 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
764 IPSTATS_MIB_FRAGOKS);
765 dst_release(&rt->dst);
766 return 0;
767 }
768
769 while (frag) {
770 skb = frag->next;
771 kfree_skb(frag);
772 frag = skb;
773 }
774
775 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
776 IPSTATS_MIB_FRAGFAILS);
777 dst_release(&rt->dst);
778 return err;
779
780 slow_path_clean:
781 skb_walk_frags(skb, frag2) {
782 if (frag2 == frag)
783 break;
784 frag2->sk = NULL;
785 frag2->destructor = NULL;
786 skb->truesize += frag2->truesize;
787 }
788 }
789
790 slow_path:
791 if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
792 skb_checksum_help(skb))
793 goto fail;
794
795 left = skb->len - hlen; /* Space per frame */
796 ptr = hlen; /* Where to start from */
797
798 /*
799 * Fragment the datagram.
800 */
801
802 *prevhdr = NEXTHDR_FRAGMENT;
803 hroom = LL_RESERVED_SPACE(rt->dst.dev);
804 troom = rt->dst.dev->needed_tailroom;
805
806 /*
807 * Keep copying data until we run out.
808 */
809 while(left > 0) {
810 len = left;
811 /* IF: it doesn't fit, use 'mtu' - the data space left */
812 if (len > mtu)
813 len = mtu;
814 /* IF: we are not sending up to and including the packet end
815 then align the next start on an eight byte boundary */
816 if (len < left) {
817 len &= ~7;
818 }
819 /*
820 * Allocate buffer.
821 */
822
823 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
824 hroom + troom, GFP_ATOMIC)) == NULL) {
825 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
826 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
827 IPSTATS_MIB_FRAGFAILS);
828 err = -ENOMEM;
829 goto fail;
830 }
831
832 /*
833 * Set up data on packet
834 */
835
836 ip6_copy_metadata(frag, skb);
837 skb_reserve(frag, hroom);
838 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
839 skb_reset_network_header(frag);
840 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
841 frag->transport_header = (frag->network_header + hlen +
842 sizeof(struct frag_hdr));
843
844 /*
845 * Charge the memory for the fragment to any owner
846 * it might possess
847 */
848 if (skb->sk)
849 skb_set_owner_w(frag, skb->sk);
850
851 /*
852 * Copy the packet header into the new buffer.
853 */
854 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
855
856 /*
857 * Build fragment header.
858 */
859 fh->nexthdr = nexthdr;
860 fh->reserved = 0;
861 if (!frag_id) {
862 ipv6_select_ident(fh, rt);
863 frag_id = fh->identification;
864 } else
865 fh->identification = frag_id;
866
867 /*
868 * Copy a block of the IP datagram.
869 */
870 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
871 BUG();
872 left -= len;
873
874 fh->frag_off = htons(offset);
875 if (left > 0)
876 fh->frag_off |= htons(IP6_MF);
877 ipv6_hdr(frag)->payload_len = htons(frag->len -
878 sizeof(struct ipv6hdr));
879
880 ptr += len;
881 offset += len;
882
883 /*
884 * Put this fragment into the sending queue.
885 */
886 err = output(frag);
887 if (err)
888 goto fail;
889
890 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
891 IPSTATS_MIB_FRAGCREATES);
892 }
893 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
894 IPSTATS_MIB_FRAGOKS);
895 consume_skb(skb);
896 return err;
897
898 fail:
899 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
900 IPSTATS_MIB_FRAGFAILS);
901 kfree_skb(skb);
902 return err;
903 }
904
905 static inline int ip6_rt_check(const struct rt6key *rt_key,
906 const struct in6_addr *fl_addr,
907 const struct in6_addr *addr_cache)
908 {
909 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
910 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
911 }
912
913 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
914 struct dst_entry *dst,
915 const struct flowi6 *fl6)
916 {
917 struct ipv6_pinfo *np = inet6_sk(sk);
918 struct rt6_info *rt = (struct rt6_info *)dst;
919
920 if (!dst)
921 goto out;
922
923 /* Yes, checking route validity in the not-connected
924 * case is not very simple. Take into account
925 * that we do not support routing by source, TOS,
926 * and MSG_DONTROUTE --ANK (980726)
927 *
928 * 1. ip6_rt_check(): If route was host route,
929 * check that cached destination is current.
930 * If it is network route, we still may
931 * check its validity using saved pointer
932 * to the last used address: daddr_cache.
933 * We do not want to save whole address now,
934 * (because main consumer of this service
935 * is tcp, which does not have this problem),
936 * so that the last trick works only on connected
937 * sockets.
938 * 2. oif also should be the same.
939 */
940 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
941 #ifdef CONFIG_IPV6_SUBTREES
942 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
943 #endif
944 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
945 dst_release(dst);
946 dst = NULL;
947 }
948
949 out:
950 return dst;
951 }
952
953 static int ip6_dst_lookup_tail(struct sock *sk,
954 struct dst_entry **dst, struct flowi6 *fl6)
955 {
956 struct net *net = sock_net(sk);
957 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
958 struct neighbour *n;
959 #endif
960 int err;
961
962 if (*dst == NULL)
963 *dst = ip6_route_output(net, sk, fl6);
964
965 if ((err = (*dst)->error))
966 goto out_err_release;
967
968 if (ipv6_addr_any(&fl6->saddr)) {
969 struct rt6_info *rt = (struct rt6_info *) *dst;
970 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
971 sk ? inet6_sk(sk)->srcprefs : 0,
972 &fl6->saddr);
973 if (err)
974 goto out_err_release;
975 }
976
977 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
978 /*
979 * Here if the dst entry we've looked up
980 * has a neighbour entry that is in the INCOMPLETE
981 * state and the src address from the flow is
982 * marked as OPTIMISTIC, we release the found
983 * dst entry and replace it instead with the
984 * dst entry of the nexthop router
985 */
986 rcu_read_lock();
987 n = dst_get_neighbour_noref(*dst);
988 if (n && !(n->nud_state & NUD_VALID)) {
989 struct inet6_ifaddr *ifp;
990 struct flowi6 fl_gw6;
991 int redirect;
992
993 rcu_read_unlock();
994 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
995 (*dst)->dev, 1);
996
997 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
998 if (ifp)
999 in6_ifa_put(ifp);
1000
1001 if (redirect) {
1002 /*
1003 * We need to get the dst entry for the
1004 * default router instead
1005 */
1006 dst_release(*dst);
1007 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1008 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1009 *dst = ip6_route_output(net, sk, &fl_gw6);
1010 if ((err = (*dst)->error))
1011 goto out_err_release;
1012 }
1013 } else {
1014 rcu_read_unlock();
1015 }
1016 #endif
1017
1018 return 0;
1019
1020 out_err_release:
1021 if (err == -ENETUNREACH)
1022 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1023 dst_release(*dst);
1024 *dst = NULL;
1025 return err;
1026 }
1027
1028 /**
1029 * ip6_dst_lookup - perform route lookup on flow
1030 * @sk: socket which provides route info
1031 * @dst: pointer to dst_entry * for result
1032 * @fl6: flow to lookup
1033 *
1034 * This function performs a route lookup on the given flow.
1035 *
1036 * It returns zero on success, or a standard errno code on error.
1037 */
1038 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1039 {
1040 *dst = NULL;
1041 return ip6_dst_lookup_tail(sk, dst, fl6);
1042 }
1043 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1044
1045 /**
1046 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1047 * @sk: socket which provides route info
1048 * @fl6: flow to lookup
1049 * @final_dst: final destination address for ipsec lookup
1050 * @can_sleep: we are in a sleepable context
1051 *
1052 * This function performs a route lookup on the given flow.
1053 *
1054 * It returns a valid dst pointer on success, or a pointer encoded
1055 * error code.
1056 */
1057 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1058 const struct in6_addr *final_dst,
1059 bool can_sleep)
1060 {
1061 struct dst_entry *dst = NULL;
1062 int err;
1063
1064 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1065 if (err)
1066 return ERR_PTR(err);
1067 if (final_dst)
1068 fl6->daddr = *final_dst;
1069 if (can_sleep)
1070 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1071
1072 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1073 }
1074 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
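
/*
 * Usage sketch (illustrative only): because the function returns a
 * pointer-encoded error, callers typically do
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true);
 *	if (IS_ERR(dst)) {
 *		err = PTR_ERR(dst);
 *		goto failure;
 *	}
 *
 * where final_p and the failure label are assumptions for the example.
 */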
1075
1076 /**
1077 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1078 * @sk: socket which provides the dst cache and route info
1079 * @fl6: flow to lookup
1080 * @final_dst: final destination address for ipsec lookup
1081 * @can_sleep: we are in a sleepable context
1082 *
1083 * This function performs a route lookup on the given flow with the
1084 * possibility of using the cached route in the socket if it is valid.
1085 * It will take the socket dst lock when operating on the dst cache.
1086 * As a result, this function can only be used in process context.
1087 *
1088 * It returns a valid dst pointer on success, or a pointer encoded
1089 * error code.
1090 */
1091 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1092 const struct in6_addr *final_dst,
1093 bool can_sleep)
1094 {
1095 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1096 int err;
1097
1098 dst = ip6_sk_dst_check(sk, dst, fl6);
1099
1100 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1101 if (err)
1102 return ERR_PTR(err);
1103 if (final_dst)
1104 fl6->daddr = *final_dst;
1105 if (can_sleep)
1106 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1107
1108 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1109 }
1110 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1111
1112 static inline int ip6_ufo_append_data(struct sock *sk,
1113 int getfrag(void *from, char *to, int offset, int len,
1114 int odd, struct sk_buff *skb),
1115 void *from, int length, int hh_len, int fragheaderlen,
1116 int transhdrlen, int mtu, unsigned int flags,
1117 struct rt6_info *rt)
1118
1119 {
1120 struct sk_buff *skb;
1121 int err;
1122
1123 /* The network device supports UDP large send offload, so
1124 * create one single skb containing the complete UDP
1125 * datagram.
1126 */
1127 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1128 skb = sock_alloc_send_skb(sk,
1129 hh_len + fragheaderlen + transhdrlen + 20,
1130 (flags & MSG_DONTWAIT), &err);
1131 if (skb == NULL)
1132 return err;
1133
1134 /* reserve space for Hardware header */
1135 skb_reserve(skb, hh_len);
1136
1137 /* create space for UDP/IP header */
1138 skb_put(skb,fragheaderlen + transhdrlen);
1139
1140 /* initialize network header pointer */
1141 skb_reset_network_header(skb);
1142
1143 /* initialize protocol header pointer */
1144 skb->transport_header = skb->network_header + fragheaderlen;
1145
1146 skb->ip_summed = CHECKSUM_PARTIAL;
1147 skb->csum = 0;
1148 }
1149
1150 err = skb_append_datato_frags(sk,skb, getfrag, from,
1151 (length - transhdrlen));
1152 if (!err) {
1153 struct frag_hdr fhdr;
1154
1155 /* Specify the length of each IPv6 datagram fragment.
1156 * It has to be a multiple of 8.
1157 */
1158 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1159 sizeof(struct frag_hdr)) & ~7;
1160 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1161 ipv6_select_ident(&fhdr, rt);
1162 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1163 __skb_queue_tail(&sk->sk_write_queue, skb);
1164
1165 return 0;
1166 }
1167 /* There is not enough support to do UDP LSO,
1168 * so follow the normal path
1169 */
1170 kfree_skb(skb);
1171
1172 return err;
1173 }
1174
1175 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1176 gfp_t gfp)
1177 {
1178 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1179 }
1180
1181 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1182 gfp_t gfp)
1183 {
1184 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1185 }
1186
1187 static void ip6_append_data_mtu(int *mtu,
1188 int *maxfraglen,
1189 unsigned int fragheaderlen,
1190 struct sk_buff *skb,
1191 struct rt6_info *rt)
1192 {
1193 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1194 if (skb == NULL) {
1195 /* first fragment, reserve header_len */
1196 *mtu = *mtu - rt->dst.header_len;
1197
1198 } else {
1199 /*
1200 * this fragment is not the first one, so the header
1201 * space is regarded as data space.
1202 */
1203 *mtu = dst_mtu(rt->dst.path);
1204 }
1205 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1206 + fragheaderlen - sizeof(struct frag_hdr);
1207 }
1208 }
1209
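/*
 * Append data to the socket's pending write queue, splitting it into
 * skbs no larger than maxfraglen so that ip6_push_pending_frames()
 * can later build and transmit the packet (fragmenting if required).
 */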
1210 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1211 int offset, int len, int odd, struct sk_buff *skb),
1212 void *from, int length, int transhdrlen,
1213 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1214 struct rt6_info *rt, unsigned int flags, int dontfrag)
1215 {
1216 struct inet_sock *inet = inet_sk(sk);
1217 struct ipv6_pinfo *np = inet6_sk(sk);
1218 struct inet_cork *cork;
1219 struct sk_buff *skb, *skb_prev = NULL;
1220 unsigned int maxfraglen, fragheaderlen;
1221 int exthdrlen;
1222 int dst_exthdrlen;
1223 int hh_len;
1224 int mtu;
1225 int copy;
1226 int err;
1227 int offset = 0;
1228 __u8 tx_flags = 0;
1229
1230 if (flags&MSG_PROBE)
1231 return 0;
1232 cork = &inet->cork.base;
1233 if (skb_queue_empty(&sk->sk_write_queue)) {
1234 /*
1235 * setup for corking
1236 */
1237 if (opt) {
1238 if (WARN_ON(np->cork.opt))
1239 return -EINVAL;
1240
1241 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1242 if (unlikely(np->cork.opt == NULL))
1243 return -ENOBUFS;
1244
1245 np->cork.opt->tot_len = opt->tot_len;
1246 np->cork.opt->opt_flen = opt->opt_flen;
1247 np->cork.opt->opt_nflen = opt->opt_nflen;
1248
1249 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1250 sk->sk_allocation);
1251 if (opt->dst0opt && !np->cork.opt->dst0opt)
1252 return -ENOBUFS;
1253
1254 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1255 sk->sk_allocation);
1256 if (opt->dst1opt && !np->cork.opt->dst1opt)
1257 return -ENOBUFS;
1258
1259 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1260 sk->sk_allocation);
1261 if (opt->hopopt && !np->cork.opt->hopopt)
1262 return -ENOBUFS;
1263
1264 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1265 sk->sk_allocation);
1266 if (opt->srcrt && !np->cork.opt->srcrt)
1267 return -ENOBUFS;
1268
1269 /* need source address above miyazawa*/
1270 }
1271 dst_hold(&rt->dst);
1272 cork->dst = &rt->dst;
1273 inet->cork.fl.u.ip6 = *fl6;
1274 np->cork.hop_limit = hlimit;
1275 np->cork.tclass = tclass;
1276 if (rt->dst.flags & DST_XFRM_TUNNEL)
1277 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1278 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1279 else
1280 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1281 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1282 if (np->frag_size < mtu) {
1283 if (np->frag_size)
1284 mtu = np->frag_size;
1285 }
1286 cork->fragsize = mtu;
1287 if (dst_allfrag(rt->dst.path))
1288 cork->flags |= IPCORK_ALLFRAG;
1289 cork->length = 0;
1290 sk->sk_sndmsg_page = NULL;
1291 sk->sk_sndmsg_off = 0;
1292 exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1293 length += exthdrlen;
1294 transhdrlen += exthdrlen;
1295 dst_exthdrlen = rt->dst.header_len;
1296 } else {
1297 rt = (struct rt6_info *)cork->dst;
1298 fl6 = &inet->cork.fl.u.ip6;
1299 opt = np->cork.opt;
1300 transhdrlen = 0;
1301 exthdrlen = 0;
1302 dst_exthdrlen = 0;
1303 mtu = cork->fragsize;
1304 }
1305
1306 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1307
1308 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1309 (opt ? opt->opt_nflen : 0);
1310 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
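/* Worked example (illustrative, assuming mtu 1500 and a bare 40 byte
 * IPv6 header so fragheaderlen == 40): maxfraglen becomes
 * ((1500 - 40) & ~7) + 40 - 8 = 1488, which leaves room for the 8 byte
 * fragment header while keeping every fragment's payload a multiple
 * of 8.
 */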
1311
1312 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1313 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1314 ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1315 return -EMSGSIZE;
1316 }
1317 }
1318
1319 /* For UDP, check if TX timestamp is enabled */
1320 if (sk->sk_type == SOCK_DGRAM) {
1321 err = sock_tx_timestamp(sk, &tx_flags);
1322 if (err)
1323 goto error;
1324 }
1325
1326 /*
1327 * Let's try using as much space as possible.
1328 * Use MTU if total length of the message fits into the MTU.
1329 * Otherwise, we need to reserve fragment header and
1330 * fragment alignment (= 8-15 octets, in total).
1331 *
1332 * Note that we may need to "move" the data from the tail
1333 * of the buffer to the new fragment when we split
1334 * the message.
1335 *
1336 * FIXME: It may be fragmented into multiple chunks
1337 * at once if non-fragmentable extension headers
1338 * are too large.
1339 * --yoshfuji
1340 */
1341
1342 cork->length += length;
1343 if (length > mtu) {
1344 int proto = sk->sk_protocol;
1345 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1346 ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1347 return -EMSGSIZE;
1348 }
1349
1350 if (proto == IPPROTO_UDP &&
1351 (rt->dst.dev->features & NETIF_F_UFO)) {
1352
1353 err = ip6_ufo_append_data(sk, getfrag, from, length,
1354 hh_len, fragheaderlen,
1355 transhdrlen, mtu, flags, rt);
1356 if (err)
1357 goto error;
1358 return 0;
1359 }
1360 }
1361
1362 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1363 goto alloc_new_skb;
1364
1365 while (length > 0) {
1366 /* Check if the remaining data fits into current packet. */
1367 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1368 if (copy < length)
1369 copy = maxfraglen - skb->len;
1370
1371 if (copy <= 0) {
1372 char *data;
1373 unsigned int datalen;
1374 unsigned int fraglen;
1375 unsigned int fraggap;
1376 unsigned int alloclen;
1377 alloc_new_skb:
1378 /* There's no room in the current skb */
1379 if (skb)
1380 fraggap = skb->len - maxfraglen;
1381 else
1382 fraggap = 0;
1383 /* update mtu and maxfraglen if necessary */
1384 if (skb == NULL || skb_prev == NULL)
1385 ip6_append_data_mtu(&mtu, &maxfraglen,
1386 fragheaderlen, skb, rt);
1387
1388 skb_prev = skb;
1389
1390 /*
1391 * If remaining data exceeds the mtu,
1392 * we know we need more fragment(s).
1393 */
1394 datalen = length + fraggap;
1395
1396 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1397 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1398 if ((flags & MSG_MORE) &&
1399 !(rt->dst.dev->features&NETIF_F_SG))
1400 alloclen = mtu;
1401 else
1402 alloclen = datalen + fragheaderlen;
1403
1404 alloclen += dst_exthdrlen;
1405
1406 if (datalen != length + fraggap) {
1407 /*
1408 * this is not the last fragment, the trailer
1409 * space is regarded as data space.
1410 */
1411 datalen += rt->dst.trailer_len;
1412 }
1413
1414 alloclen += rt->dst.trailer_len;
1415 fraglen = datalen + fragheaderlen;
1416
1417 /*
1418 * We just reserve space for fragment header.
1419 * Note: this may be overallocation if the message
1420 * (without MSG_MORE) fits into the MTU.
1421 */
1422 alloclen += sizeof(struct frag_hdr);
1423
1424 if (transhdrlen) {
1425 skb = sock_alloc_send_skb(sk,
1426 alloclen + hh_len,
1427 (flags & MSG_DONTWAIT), &err);
1428 } else {
1429 skb = NULL;
1430 if (atomic_read(&sk->sk_wmem_alloc) <=
1431 2 * sk->sk_sndbuf)
1432 skb = sock_wmalloc(sk,
1433 alloclen + hh_len, 1,
1434 sk->sk_allocation);
1435 if (unlikely(skb == NULL))
1436 err = -ENOBUFS;
1437 else {
1438 /* Only the initial fragment
1439 * is time stamped.
1440 */
1441 tx_flags = 0;
1442 }
1443 }
1444 if (skb == NULL)
1445 goto error;
1446 /*
1447 * Fill in the control structures
1448 */
1449 skb->ip_summed = CHECKSUM_NONE;
1450 skb->csum = 0;
1451 /* reserve for fragmentation and ipsec header */
1452 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1453 dst_exthdrlen);
1454
1455 if (sk->sk_type == SOCK_DGRAM)
1456 skb_shinfo(skb)->tx_flags = tx_flags;
1457
1458 /*
1459 * Find where to start putting bytes
1460 */
1461 data = skb_put(skb, fraglen);
1462 skb_set_network_header(skb, exthdrlen);
1463 data += fragheaderlen;
1464 skb->transport_header = (skb->network_header +
1465 fragheaderlen);
1466 if (fraggap) {
1467 skb->csum = skb_copy_and_csum_bits(
1468 skb_prev, maxfraglen,
1469 data + transhdrlen, fraggap, 0);
1470 skb_prev->csum = csum_sub(skb_prev->csum,
1471 skb->csum);
1472 data += fraggap;
1473 pskb_trim_unique(skb_prev, maxfraglen);
1474 }
1475 copy = datalen - transhdrlen - fraggap;
1476
1477 if (copy < 0) {
1478 err = -EINVAL;
1479 kfree_skb(skb);
1480 goto error;
1481 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1482 err = -EFAULT;
1483 kfree_skb(skb);
1484 goto error;
1485 }
1486
1487 offset += copy;
1488 length -= datalen - fraggap;
1489 transhdrlen = 0;
1490 exthdrlen = 0;
1491 dst_exthdrlen = 0;
1492
1493 /*
1494 * Put the packet on the pending queue
1495 */
1496 __skb_queue_tail(&sk->sk_write_queue, skb);
1497 continue;
1498 }
1499
1500 if (copy > length)
1501 copy = length;
1502
1503 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1504 unsigned int off;
1505
1506 off = skb->len;
1507 if (getfrag(from, skb_put(skb, copy),
1508 offset, copy, off, skb) < 0) {
1509 __skb_trim(skb, off);
1510 err = -EFAULT;
1511 goto error;
1512 }
1513 } else {
1514 int i = skb_shinfo(skb)->nr_frags;
1515 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1516 struct page *page = sk->sk_sndmsg_page;
1517 int off = sk->sk_sndmsg_off;
1518 unsigned int left;
1519
1520 if (page && (left = PAGE_SIZE - off) > 0) {
1521 if (copy >= left)
1522 copy = left;
1523 if (page != skb_frag_page(frag)) {
1524 if (i == MAX_SKB_FRAGS) {
1525 err = -EMSGSIZE;
1526 goto error;
1527 }
1528 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1529 skb_frag_ref(skb, i);
1530 frag = &skb_shinfo(skb)->frags[i];
1531 }
1532 } else if(i < MAX_SKB_FRAGS) {
1533 if (copy > PAGE_SIZE)
1534 copy = PAGE_SIZE;
1535 page = alloc_pages(sk->sk_allocation, 0);
1536 if (page == NULL) {
1537 err = -ENOMEM;
1538 goto error;
1539 }
1540 sk->sk_sndmsg_page = page;
1541 sk->sk_sndmsg_off = 0;
1542
1543 skb_fill_page_desc(skb, i, page, 0, 0);
1544 frag = &skb_shinfo(skb)->frags[i];
1545 } else {
1546 err = -EMSGSIZE;
1547 goto error;
1548 }
1549 if (getfrag(from,
1550 skb_frag_address(frag) + skb_frag_size(frag),
1551 offset, copy, skb->len, skb) < 0) {
1552 err = -EFAULT;
1553 goto error;
1554 }
1555 sk->sk_sndmsg_off += copy;
1556 skb_frag_size_add(frag, copy);
1557 skb->len += copy;
1558 skb->data_len += copy;
1559 skb->truesize += copy;
1560 atomic_add(copy, &sk->sk_wmem_alloc);
1561 }
1562 offset += copy;
1563 length -= copy;
1564 }
1565 return 0;
1566 error:
1567 cork->length -= length;
1568 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1569 return err;
1570 }
1571 EXPORT_SYMBOL_GPL(ip6_append_data);
1572
1573 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1574 {
1575 if (np->cork.opt) {
1576 kfree(np->cork.opt->dst0opt);
1577 kfree(np->cork.opt->dst1opt);
1578 kfree(np->cork.opt->hopopt);
1579 kfree(np->cork.opt->srcrt);
1580 kfree(np->cork.opt);
1581 np->cork.opt = NULL;
1582 }
1583
1584 if (inet->cork.base.dst) {
1585 dst_release(inet->cork.base.dst);
1586 inet->cork.base.dst = NULL;
1587 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1588 }
1589 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1590 }
1591
1592 int ip6_push_pending_frames(struct sock *sk)
1593 {
1594 struct sk_buff *skb, *tmp_skb;
1595 struct sk_buff **tail_skb;
1596 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1597 struct inet_sock *inet = inet_sk(sk);
1598 struct ipv6_pinfo *np = inet6_sk(sk);
1599 struct net *net = sock_net(sk);
1600 struct ipv6hdr *hdr;
1601 struct ipv6_txoptions *opt = np->cork.opt;
1602 struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1603 struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1604 unsigned char proto = fl6->flowi6_proto;
1605 int err = 0;
1606
1607 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1608 goto out;
1609 tail_skb = &(skb_shinfo(skb)->frag_list);
1610
1611 /* move skb->data to ip header from ext header */
1612 if (skb->data < skb_network_header(skb))
1613 __skb_pull(skb, skb_network_offset(skb));
1614 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1615 __skb_pull(tmp_skb, skb_network_header_len(skb));
1616 *tail_skb = tmp_skb;
1617 tail_skb = &(tmp_skb->next);
1618 skb->len += tmp_skb->len;
1619 skb->data_len += tmp_skb->len;
1620 skb->truesize += tmp_skb->truesize;
1621 tmp_skb->destructor = NULL;
1622 tmp_skb->sk = NULL;
1623 }
1624
1625 /* Allow local fragmentation. */
1626 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1627 skb->local_df = 1;
1628
1629 *final_dst = fl6->daddr;
1630 __skb_pull(skb, skb_network_header_len(skb));
1631 if (opt && opt->opt_flen)
1632 ipv6_push_frag_opts(skb, opt, &proto);
1633 if (opt && opt->opt_nflen)
1634 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1635
1636 skb_push(skb, sizeof(struct ipv6hdr));
1637 skb_reset_network_header(skb);
1638 hdr = ipv6_hdr(skb);
1639
1640 *(__be32*)hdr = fl6->flowlabel |
1641 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1642
1643 hdr->hop_limit = np->cork.hop_limit;
1644 hdr->nexthdr = proto;
1645 hdr->saddr = fl6->saddr;
1646 hdr->daddr = *final_dst;
1647
1648 skb->priority = sk->sk_priority;
1649 skb->mark = sk->sk_mark;
1650
1651 skb_dst_set(skb, dst_clone(&rt->dst));
1652 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1653 if (proto == IPPROTO_ICMPV6) {
1654 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1655
1656 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1657 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1658 }
1659
1660 err = ip6_local_out(skb);
1661 if (err) {
1662 if (err > 0)
1663 err = net_xmit_errno(err);
1664 if (err)
1665 goto error;
1666 }
1667
1668 out:
1669 ip6_cork_release(inet, np);
1670 return err;
1671 error:
1672 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1673 goto out;
1674 }
1675 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1676
1677 void ip6_flush_pending_frames(struct sock *sk)
1678 {
1679 struct sk_buff *skb;
1680
1681 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1682 if (skb_dst(skb))
1683 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1684 IPSTATS_MIB_OUTDISCARDS);
1685 kfree_skb(skb);
1686 }
1687
1688 ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1689 }
1690 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);