net/ipv6/ip6_output.c
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

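/*
 * Final transmit step: resolve the neighbour entry for the route's
 * next hop and hand the packet to the neighbour output path.  For
 * multicast destinations, the packet may first be looped back to
 * local listeners and is subject to scope checks.
 */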
static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				 skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		ret = dst_neigh_output(dst, neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(dev_net(dst->dev),
		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

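/*
 * Choose between plain transmission and fragmentation: packets larger
 * than the path MTU (unless GSO), packets on an allfrag route, and
 * packets above a conntrack-recorded frag_max_size are fragmented.
 */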
static int ip6_finish_output(struct sock *sk, struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(sk, skb);
}

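/*
 * dst output entry point for IPv6.  Drops the packet if IPv6 is
 * administratively disabled on the device, otherwise runs the
 * POST_ROUTING netfilter hook before ip6_finish_output().
 */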
int ip6_output(struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, sk, skb,
			    NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 */
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now).
		 * MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (!skb2) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 * Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
						     np->autoflowlabel));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb,
			       NULL, dst->dev, dst_output_sk);
	}

	skb->dev = dst->dev;
	ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

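/*
 * Deliver a Router Alert packet to every raw socket registered for
 * the given alert value: earlier matches receive clones, the last
 * match consumes the original skb.  Returns 1 if any socket took the
 * packet, 0 otherwise.
 */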
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

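/*
 * Classify a packet arriving for an NDP-proxied address: 1 means it
 * is a neighbour discovery message that belongs to the local input
 * path, -1 means it must be dropped (link-local destination), and 0
 * means it may be forwarded normally.
 */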
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sock *sk, struct sk_buff *skb)
{
	skb_sender_cpu_clear(skb);
	return dst_output_sk(sk, skb);
}

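/*
 * MTU to use when forwarding: a locked RTAX_MTU route metric wins,
 * otherwise fall back to the outgoing device's IPv6 MTU.
 */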
static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
	unsigned int mtu;
	struct inet6_dev *idev;

	if (dst_metric_locked(dst, RTAX_MTU)) {
		mtu = dst_metric_raw(dst, RTAX_MTU);
		if (mtu)
			return mtu;
	}

	mtu = IPV6_MIN_MTU;
	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

	return mtu;
}

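/*
 * Decide whether a forwarded packet exceeds the path MTU, accounting
 * for conntrack defrag state, the ignore_df flag and the GSO segment
 * size.
 */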
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
		return false;

	return true;
}

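/*
 * Main forwarding path: validate hop limit, source address class and
 * xfrm policy, emit redirects or ICMPv6 errors where required, then
 * decrement the hop limit and hand the packet to NF_INET_FORWARD.
 */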
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 * We do not process RA packets; they are pushed to user level
	 * as is, with no guarantee that the application will be able
	 * to interpret them.  The reason is that we cannot do anything
	 * clever here.
	 *
	 * We are not the end node, so if the packet contains AH/ESP we
	 * cannot do anything either.  Defragmentation would also be a
	 * mistake: RA packets must not be fragmented, because there is
	 * no guarantee that different fragments will follow the same
	 * path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 * check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
					 IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we
	 * cannot send redirects for source-routed frames.  We also do
	 * not send redirects for frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 * incoming and outgoing devices are the same:
		 * send a redirect.
		 */
		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb,
		       skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

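/* Propagate per-packet metadata from the original skb to a fragment. */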
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}

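/*
 * Fragment an IPv6 packet.  The fast path reuses an existing frag
 * list whose geometry already fits the MTU; otherwise the slow path
 * copies the payload into newly allocated fragments.  Every fragment
 * is handed to @output.
 */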
int ip6_fragment(struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu) ||
		     (IP6CB(skb)->frag_max_size &&
		      IP6CB(skb)->frag_max_size > mtu)) {
		if (skb->sk && dst_allfrag(skb_dst(skb)))
			sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(net, fh, rt);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			ip6_rt_put(rt);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		ip6_rt_put(rt);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
	    skb_checksum_help(skb))
		goto fail;

	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 * Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	troom = rt->dst.dev->needed_tailroom;

	/*
	 * Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)
			len &= ~7;

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 * Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 * Charge the memory for the fragment to any owner
		 * it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 * Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 * Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(net, fh, rt);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 * Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 * Put this fragment into the sending queue.
		 */
		err = output(sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

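/*
 * Helpers for validating a socket's cached route against a flow; see
 * the comment in ip6_sk_dst_check() below.
 */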
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the unconnected case is not
	 * very simple.  Take into account that we do not support
	 * routing by source, TOS, and MSG_DONTROUTE --ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route, check that
	 *    the cached destination is current.  If it is a network
	 *    route, we may still check its validity using the saved
	 *    pointer to the last used address: daddr_cache.  We do not
	 *    want to save the whole address now (the main consumer of
	 *    this service is TCP, which does not have this problem),
	 *    so this last trick works only on connected sockets.
	 * 2. oif must also be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

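/*
 * Common tail of the dst lookup helpers: perform the route lookup if
 * none was cached, pick a source address for wildcard flows and, with
 * optimistic DAD, fall back to the default router's dst entry while
 * the next-hop neighbour is not yet known to be valid.
 */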
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;

	if (!*dst)
		*dst = ip6_route_output(net, sk, fl6);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * If the dst entry we have looked up has a neighbour entry
	 * that is in the INCOMPLETE state and the source address from
	 * the flow is marked as OPTIMISTIC, we release the found dst
	 * entry and replace it with the dst entry of the nexthop
	 * router instead.
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 * ip6_dst_lookup - perform route lookup on flow
 * @sk: socket which provides route info
 * @dst: pointer to dst_entry * for result
 * @fl6: flow to lookup
 *
 * This function performs a route lookup on the given flow.
 *
 * It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 * @sk: socket which provides route info
 * @fl6: flow to lookup
 * @final_dst: final destination address for ipsec lookup
 *
 * This function performs a route lookup on the given flow.
 *
 * It returns a valid dst pointer on success, or a pointer encoded
 * error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 * @sk: socket which provides the dst cache and route info
 * @fl6: flow to lookup
 * @final_dst: final destination address for ipsec lookup
 *
 * This function performs a route lookup on the given flow with the
 * possibility of using the cached route in the socket if it is valid.
 * It will take the socket dst lock when operating on the dst cache.
 * As a result, this function can only be used in process context.
 *
 * It returns a valid dst pointer on success, or a pointer encoded
 * error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl6);

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

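/*
 * UFO path of the append machinery: build (or extend) one large skb
 * and let the device segment it, instead of fragmenting in software.
 */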
static inline int ip6_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
				    int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags,
			struct rt6_info *rt)
{
	struct sk_buff *skb;
	struct frag_hdr fhdr;
	int err;

	/* The network device supports UDP large send offload, so create
	 * one single skb containing the complete UDP datagram.
	 */
	skb = skb_peek_tail(queue);
	if (!skb) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (!skb)
			return err;

		/* reserve space for the hardware header */
		skb_reserve(skb, hh_len);

		/* create space for the UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize the network header pointer */
		skb_reset_network_header(skb);

		/* initialize the protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->protocol = htons(ETH_P_IPV6);
		skb->csum = 0;

		__skb_queue_tail(queue, skb);
	} else if (skb_is_gso(skb)) {
		goto append;
	}

	skb->ip_summed = CHECKSUM_PARTIAL;
	/* Specify the length of each IPv6 datagram fragment.
	 * It has to be a multiple of 8.
	 */
	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
				     sizeof(struct frag_hdr)) & ~7;
	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	ipv6_select_ident(sock_net(sk), &fhdr, rt);
	skb_shinfo(skb)->ip6_frag_id = fhdr.identification;

append:
	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

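/*
 * Recompute mtu/maxfraglen while appending: only the first fragment
 * must reserve the dst header_len; later fragments may use that room
 * for data, except when the route is an XFRM tunnel.
 */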
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not the first; the header
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

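/*
 * Initialise cork state for a corked send: duplicate the tx options,
 * take a reference on the route, and record the hop limit, traffic
 * class and fragment size derived from the PMTU discovery mode.
 */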
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork,
			  int hlimit, int tclass, struct ipv6_txoptions *opt,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = opt->tot_len;
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = hlimit;
	v6_cork->tclass = tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	cork->base.fragsize = mtu;
	if (dst_allfrag(rt->dst.path))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	return 0;
}

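/*
 * Core of the append machinery: add user data to the queue of pending
 * skbs, growing the tail skb, coalescing into page fragments when the
 * device supports scatter/gather, or allocating new fragment-sized
 * skbs as needed.
 */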
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, int dontfrag)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	mtu = cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		unsigned int maxnonfragsize, headersize;

		headersize = sizeof(struct ipv6hdr) +
			     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
			     (dst_allfrag(&rt->dst) ?
			      sizeof(struct frag_hdr) : 0) +
			     rt->rt6i_nfheader_len;

		if (ip6_sk_ignore_df(sk))
			maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
		else
			maxnonfragsize = mtu;

		/* dontfrag active */
		if ((cork->length + length > mtu - headersize) && dontfrag &&
		    (sk->sk_protocol == IPPROTO_UDP ||
		     sk->sk_protocol == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
						   sizeof(struct ipv6hdr));
			goto emsgsize;
		}

		if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
			ipv6_local_error(sk, EMSGSIZE, fl6,
					 mtu - headersize +
					 sizeof(struct ipv6hdr));
			return -EMSGSIZE;
		}
	}

	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/* If this is the first and only packet and device
	 * supports checksum offloading, let's use it.
	 */
	if (!skb && sk->sk_protocol == IPPROTO_UDP &&
	    length + fragheaderlen < mtu &&
	    rt->dst.dev->features & NETIF_F_V6_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;
	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (((length > mtu) ||
	     (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) &&
	    (sk->sk_type == SOCK_DGRAM)) {
		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
					  hh_len, fragheaderlen,
					  transhdrlen, mtu, flags, rt);
		if (err)
			goto error;
		return 0;
	}

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

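/*
 * Append data to the socket's pending write queue, setting up cork
 * state on the first call.  The packet is completed and transmitted
 * by ip6_push_pending_frames().
 */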
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen, int hlimit,
		    int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
				     tclass, opt, rt, fl6);
		if (err)
			return err;

		exthdrlen = (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, dontfrag);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

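/* Free the duplicated tx options and drop the cork's route reference. */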
static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

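/*
 * Collapse the queued skbs into a single packet with a frag_list,
 * push the extension headers and the IPv6 header, update the output
 * stats and release the cork state.
 */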
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					np->autoflowlabel));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

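/*
 * Transmit a packet built by __ip6_make_skb() via ip6_local_out(),
 * normalising the qdisc return code and accounting for drops.
 */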
int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

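/*
 * One-shot variant of the append/push API: cork onto a private queue,
 * append all the data and return the finished skb (or an ERR_PTR).
 */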
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     int hlimit, int tclass,
			     struct ipv6_txoptions *opt, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     int dontfrag)
{
	struct inet_cork_full cork;
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (opt ? opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.base.flags = 0;
	cork.base.addr = 0;
	cork.base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
	if (err)
		return ERR_PTR(err);

	if (dontfrag < 0)
		dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, dontfrag);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
}