net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
  16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40 #include <linux/slab.h>
  41
  42 #include <linux/bpf-cgroup.h>
  43 #include <linux/netfilter.h>
  44 #include <linux/netfilter_ipv6.h>
  45
  46 #include <net/sock.h>
  47 #include <net/snmp.h>
  48
  49 #include <net/ipv6.h>
  50 #include <net/ndisc.h>
  51 #include <net/protocol.h>
  52 #include <net/ip6_route.h>
  53 #include <net/addrconf.h>
  54 #include <net/rawv6.h>
  55 #include <net/icmp.h>
  56 #include <net/xfrm.h>
  57 #include <net/checksum.h>
  58 #include <linux/mroute6.h>
  59 #include <net/l3mdev.h>
  60 #include <net/lwtunnel.h>
  61
  62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  63 {
  64         struct dst_entry *dst = skb_dst(skb);
  65         struct net_device *dev = dst->dev;
  66         struct neighbour *neigh;
  67         struct in6_addr *nexthop;
  68         int ret;
  69
  70         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  71                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  72
  73                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  74                     ((mroute6_socket(net, skb) &&
  75                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
  76                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
  77                                          &ipv6_hdr(skb)->saddr))) {
  78                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
  79
  80                         /* Do not check for IFF_ALLMULTI; multicast routing
  81                            is not supported in any case.
  82                          */
  83                         if (newskb)
  84                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  85                                         net, sk, newskb, NULL, newskb->dev,
  86                                         dev_loopback_xmit);
  87
  88                         if (ipv6_hdr(skb)->hop_limit == 0) {
  89                                 IP6_INC_STATS(net, idev,
  90                                               IPSTATS_MIB_OUTDISCARDS);
  91                                 kfree_skb(skb);
  92                                 return 0;
  93                         }
  94                 }
  95
  96                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
  97
  98                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
  99                     IPV6_ADDR_SCOPE_NODELOCAL &&
 100                     !(dev->flags & IFF_LOOPBACK)) {
 101                         kfree_skb(skb);
 102                         return 0;
 103                 }
 104         }
 105
 106         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 107                 int res = lwtunnel_xmit(skb);
 108
 109                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
 110                         return res;
 111         }
 112
 113         rcu_read_lock_bh();
 114         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 115         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 116         if (unlikely(!neigh))
 117                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 118         if (!IS_ERR(neigh)) {
 119                 sock_confirm_neigh(skb, neigh);
 120                 ret = neigh_output(neigh, skb);
 121                 rcu_read_unlock_bh();
 122                 return ret;
 123         }
 124         rcu_read_unlock_bh();
 125
 126         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 127         kfree_skb(skb);
 128         return -EINVAL;
 129 }
 130
 131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 132 {
 133         int ret;
 134
 135         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
 136         if (ret) {
 137                 kfree_skb(skb);
 138                 return ret;
 139         }
 140
 141         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 142             dst_allfrag(skb_dst(skb)) ||
 143             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 144                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
 145         else
 146                 return ip6_finish_output2(net, sk, skb);
 147 }
 148
 149 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 150 {
 151         struct net_device *dev = skb_dst(skb)->dev;
 152         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 153
 154         skb->protocol = htons(ETH_P_IPV6);
 155         skb->dev = dev;
 156
 157         if (unlikely(idev->cnf.disable_ipv6)) {
 158                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 159                 kfree_skb(skb);
 160                 return 0;
 161         }
 162
 163         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 164                             net, sk, skb, NULL, dev,
 165                             ip6_finish_output,
 166                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 167 }
 168
 169 static bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
 170 {
 171         if (!np->autoflowlabel_set)
 172                 return ip6_default_np_autolabel(net);
 173         else
 174                 return np->autoflowlabel;
 175 }
 176
 177 /*
 178  * xmit an sk_buff (used by TCP, SCTP and DCCP)
 179  * Note : socket lock is not held for SYNACK packets, but might be modified
 180  * by calls to skb_set_owner_w() and ipv6_local_error(),
 181  * which are using proper atomic operations or spinlocks.
 182  */
 183 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 184              __u32 mark, struct ipv6_txoptions *opt, int tclass)
 185 {
 186         struct net *net = sock_net(sk);
 187         const struct ipv6_pinfo *np = inet6_sk(sk);
 188         struct in6_addr *first_hop = &fl6->daddr;
 189         struct dst_entry *dst = skb_dst(skb);
 190         struct ipv6hdr *hdr;
 191         u8  proto = fl6->flowi6_proto;
 192         int seg_len = skb->len;
 193         int hlimit = -1;
 194         u32 mtu;
 195
 196         if (opt) {
 197                 unsigned int head_room;
 198
 199                 /* First: exthdrs may take lots of space (~8K for now)
 200                    MAX_HEADER is not enough.
 201                  */
 202                 head_room = opt->opt_nflen + opt->opt_flen;
 203                 seg_len += head_room;
 204                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 205
 206                 if (skb_headroom(skb) < head_room) {
 207                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 208                         if (!skb2) {
 209                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 210                                               IPSTATS_MIB_OUTDISCARDS);
 211                                 kfree_skb(skb);
 212                                 return -ENOBUFS;
 213                         }
 214                         consume_skb(skb);
 215                         skb = skb2;
 216                         /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
 217                          * it is safe to call in our context (socket lock not held)
 218                          */
 219                         skb_set_owner_w(skb, (struct sock *)sk);
 220                 }
 221                 if (opt->opt_flen)
 222                         ipv6_push_frag_opts(skb, opt, &proto);
 223                 if (opt->opt_nflen)
 224                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 225                                              &fl6->saddr);
 226         }
 227
 228         skb_push(skb, sizeof(struct ipv6hdr));
 229         skb_reset_network_header(skb);
 230         hdr = ipv6_hdr(skb);
 231
 232         /*
 233          *      Fill in the IPv6 header
 234          */
 235         if (np)
 236                 hlimit = np->hop_limit;
 237         if (hlimit < 0)
 238                 hlimit = ip6_dst_hoplimit(dst);
 239
 240         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 241                                 ip6_autoflowlabel(net, np), fl6));
 242
 243         hdr->payload_len = htons(seg_len);
 244         hdr->nexthdr = proto;
 245         hdr->hop_limit = hlimit;
 246
 247         hdr->saddr = fl6->saddr;
 248         hdr->daddr = *first_hop;
 249
 250         skb->protocol = htons(ETH_P_IPV6);
 251         skb->priority = sk->sk_priority;
 252         skb->mark = mark;
 253
 254         mtu = dst_mtu(dst);
 255         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 256                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 257                               IPSTATS_MIB_OUT, skb->len);
 258
 259                 /* if egress device is enslaved to an L3 master device pass the
 260                  * skb to its handler for processing
 261                  */
 262                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
 263                 if (unlikely(!skb))
 264                         return 0;
 265
 266                 /* hooks should never assume socket lock is held.
 267                  * we promote our socket to non const
 268                  */
 269                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 270                                net, (struct sock *)sk, skb, NULL, dst->dev,
 271                                dst_output);
 272         }
 273
 274         skb->dev = dst->dev;
 275         /* ipv6_local_error() does not require socket lock,
 276          * we promote our socket to non const
 277          */
 278         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 279
 280         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 281         kfree_skb(skb);
 282         return -EMSGSIZE;
 283 }
 284 EXPORT_SYMBOL(ip6_xmit);
 285
 286 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 287 {
 288         struct ip6_ra_chain *ra;
 289         struct sock *last = NULL;
 290
 291         read_lock(&ip6_ra_lock);
 292         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 293                 struct sock *sk = ra->sk;
 294                 if (sk && ra->sel == sel &&
 295                     (!sk->sk_bound_dev_if ||
 296                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 297                         if (last) {
 298                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 299                                 if (skb2)
 300                                         rawv6_rcv(last, skb2);
 301                         }
 302                         last = sk;
 303                 }
 304         }
 305
 306         if (last) {
 307                 rawv6_rcv(last, skb);
 308                 read_unlock(&ip6_ra_lock);
 309                 return 1;
 310         }
 311         read_unlock(&ip6_ra_lock);
 312         return 0;
 313 }
 314
 315 static int ip6_forward_proxy_check(struct sk_buff *skb)
 316 {
 317         struct ipv6hdr *hdr = ipv6_hdr(skb);
 318         u8 nexthdr = hdr->nexthdr;
 319         __be16 frag_off;
 320         int offset;
 321
 322         if (ipv6_ext_hdr(nexthdr)) {
 323                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 324                 if (offset < 0)
 325                         return 0;
 326         } else
 327                 offset = sizeof(struct ipv6hdr);
 328
 329         if (nexthdr == IPPROTO_ICMPV6) {
 330                 struct icmp6hdr *icmp6;
 331
 332                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 333                                          offset + 1 - skb->data)))
 334                         return 0;
 335
 336                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 337
 338                 switch (icmp6->icmp6_type) {
 339                 case NDISC_ROUTER_SOLICITATION:
 340                 case NDISC_ROUTER_ADVERTISEMENT:
 341                 case NDISC_NEIGHBOUR_SOLICITATION:
 342                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 343                 case NDISC_REDIRECT:
 344                         /* For reaction involving unicast neighbor discovery
 345                          * message destined to the proxied address, pass it to
 346                          * input function.
 347                          */
 348                         return 1;
 349                 default:
 350                         break;
 351                 }
 352         }
 353
 354         /*
 355          * The proxying router can't forward traffic sent to a link-local
 356          * address, so signal the sender and discard the packet. This
 357          * behavior is clarified by the MIPv6 specification.
 358          */
 359         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 360                 dst_link_failure(skb);
 361                 return -1;
 362         }
 363
 364         return 0;
 365 }
 366
 367 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 368                                      struct sk_buff *skb)
 369 {
 370         return dst_output(net, sk, skb);
 371 }
 372
 373 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
 374 {
 375         unsigned int mtu;
 376         struct inet6_dev *idev;
 377
 378         if (dst_metric_locked(dst, RTAX_MTU)) {
 379                 mtu = dst_metric_raw(dst, RTAX_MTU);
 380                 if (mtu)
 381                         return mtu;
 382         }
 383
 384         mtu = IPV6_MIN_MTU;
 385         rcu_read_lock();
 386         idev = __in6_dev_get(dst->dev);
 387         if (idev)
 388                 mtu = idev->cnf.mtu6;
 389         rcu_read_unlock();
 390
 391         return mtu;
 392 }
 393
 394 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 395 {
 396         if (skb->len <= mtu)
 397                 return false;
 398
 399         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 400         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 401                 return true;
 402
 403         if (skb->ignore_df)
 404                 return false;
 405
 406         if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
 407                 return false;
 408
 409         return true;
 410 }
 411
 412 int ip6_forward(struct sk_buff *skb)
 413 {
 414         struct dst_entry *dst = skb_dst(skb);
 415         struct ipv6hdr *hdr = ipv6_hdr(skb);
 416         struct inet6_skb_parm *opt = IP6CB(skb);
 417         struct net *net = dev_net(dst->dev);
 418         u32 mtu;
 419
 420         if (net->ipv6.devconf_all->forwarding == 0)
 421                 goto error;
 422
 423         if (skb->pkt_type != PACKET_HOST)
 424                 goto drop;
 425
 426         if (unlikely(skb->sk))
 427                 goto drop;
 428
 429         if (skb_warn_if_lro(skb))
 430                 goto drop;
 431
 432         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 433                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 434                                 IPSTATS_MIB_INDISCARDS);
 435                 goto drop;
 436         }
 437
 438         skb_forward_csum(skb);
 439
 440         /*
 441          *      We DO NOT make any processing on
 442          *      RA packets, pushing them to user level AS IS
 443          *      without ane WARRANTY that application will be able
 444          *      to interpret them. The reason is that we
 445          *      cannot make anything clever here.
 446          *
 447          *      We are not end-node, so that if packet contains
 448          *      AH/ESP, we cannot make anything.
 449          *      Defragmentation also would be mistake, RA packets
 450          *      cannot be fragmented, because there is no warranty
 451          *      that different fragments will go along one path. --ANK
 452          */
 453         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 454                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 455                         return 0;
 456         }
 457
 458         /*
 459          *      check and decrement ttl
 460          */
 461         if (hdr->hop_limit <= 1) {
 462                 /* Force OUTPUT device used as source address */
 463                 skb->dev = dst->dev;
 464                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 465                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 466                                 IPSTATS_MIB_INHDRERRORS);
 467
 468                 kfree_skb(skb);
 469                 return -ETIMEDOUT;
 470         }
 471
 472         /* XXX: idev->cnf.proxy_ndp? */
 473         if (net->ipv6.devconf_all->proxy_ndp &&
 474             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 475                 int proxied = ip6_forward_proxy_check(skb);
 476                 if (proxied > 0)
 477                         return ip6_input(skb);
 478                 else if (proxied < 0) {
 479                         __IP6_INC_STATS(net, ip6_dst_idev(dst),
 480                                         IPSTATS_MIB_INDISCARDS);
 481                         goto drop;
 482                 }
 483         }
 484
 485         if (!xfrm6_route_forward(skb)) {
 486                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 487                                 IPSTATS_MIB_INDISCARDS);
 488                 goto drop;
 489         }
 490         dst = skb_dst(skb);
 491
 492         /* IPv6 specs say nothing about it, but it is clear that we cannot
 493            send redirects to source routed frames.
 494            We don't send redirects to frames decapsulated from IPsec.
 495          */
 496         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
 497                 struct in6_addr *target = NULL;
 498                 struct inet_peer *peer;
 499                 struct rt6_info *rt;
 500
 501                 /*
 502                  *      incoming and outgoing devices are the same
 503                  *      send a redirect.
 504                  */
 505
 506                 rt = (struct rt6_info *) dst;
 507                 if (rt->rt6i_flags & RTF_GATEWAY)
 508                         target = &rt->rt6i_gateway;
 509                 else
 510                         target = &hdr->daddr;
 511
 512                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 513
 514                 /* Limit redirects both by destination (here)
 515                    and by source (inside ndisc_send_redirect)
 516                  */
 517                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 518                         ndisc_send_redirect(skb, target);
 519                 if (peer)
 520                         inet_putpeer(peer);
 521         } else {
 522                 int addrtype = ipv6_addr_type(&hdr->saddr);
 523
 524                 /* This check is security critical. */
 525                 if (addrtype == IPV6_ADDR_ANY ||
 526                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 527                         goto error;
 528                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 529                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 530                                     ICMPV6_NOT_NEIGHBOUR, 0);
 531                         goto error;
 532                 }
 533         }
 534
 535         mtu = ip6_dst_mtu_forward(dst);
 536         if (mtu < IPV6_MIN_MTU)
 537                 mtu = IPV6_MIN_MTU;
 538
 539         if (ip6_pkt_too_big(skb, mtu)) {
 540                 /* Again, force OUTPUT device used as source address */
 541                 skb->dev = dst->dev;
 542                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 543                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 544                                 IPSTATS_MIB_INTOOBIGERRORS);
 545                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 546                                 IPSTATS_MIB_FRAGFAILS);
 547                 kfree_skb(skb);
 548                 return -EMSGSIZE;
 549         }
 550
 551         if (skb_cow(skb, dst->dev->hard_header_len)) {
 552                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 553                                 IPSTATS_MIB_OUTDISCARDS);
 554                 goto drop;
 555         }
 556
 557         hdr = ipv6_hdr(skb);
 558
 559         /* Mangling hops number delayed to point after skb COW */
 560
 561         hdr->hop_limit--;
 562
 563         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 564         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 565         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 566                        net, NULL, skb, skb->dev, dst->dev,
 567                        ip6_forward_finish);
 568
 569 error:
 570         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 571 drop:
 572         kfree_skb(skb);
 573         return -EINVAL;
 574 }
 575
 576 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 577 {
 578         to->pkt_type = from->pkt_type;
 579         to->priority = from->priority;
 580         to->protocol = from->protocol;
 581         skb_dst_drop(to);
 582         skb_dst_set(to, dst_clone(skb_dst(from)));
 583         to->dev = from->dev;
 584         to->mark = from->mark;
 585
 586 #ifdef CONFIG_NET_SCHED
 587         to->tc_index = from->tc_index;
 588 #endif
 589         nf_copy(to, from);
 590         skb_copy_secmark(to, from);
 591 }
 592
 593 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 594                  int (*output)(struct net *, struct sock *, struct sk_buff *))
 595 {
 596         struct sk_buff *frag;
 597         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 598         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 599                                 inet6_sk(skb->sk) : NULL;
 600         struct ipv6hdr *tmp_hdr;
 601         struct frag_hdr *fh;
 602         unsigned int mtu, hlen, left, len;
 603         int hroom, troom;
 604         __be32 frag_id;
 605         int ptr, offset = 0, err = 0;
 606         u8 *prevhdr, nexthdr = 0;
 607
 608         err = ip6_find_1stfragopt(skb, &prevhdr);
 609         if (err < 0)
 610                 goto fail;
 611         hlen = err;
 612         nexthdr = *prevhdr;
 613
 614         mtu = ip6_skb_dst_mtu(skb);
 615
 616         /* We must not fragment if the socket is set to force MTU discovery
 617          * or if the skb it not generated by a local socket.
 618          */
 619         if (unlikely(!skb->ignore_df && skb->len > mtu))
 620                 goto fail_toobig;
 621
 622         if (IP6CB(skb)->frag_max_size) {
 623                 if (IP6CB(skb)->frag_max_size > mtu)
 624                         goto fail_toobig;
 625
 626                 /* don't send fragments larger than what we received */
 627                 mtu = IP6CB(skb)->frag_max_size;
 628                 if (mtu < IPV6_MIN_MTU)
 629                         mtu = IPV6_MIN_MTU;
 630         }
 631
 632         if (np && np->frag_size < mtu) {
 633                 if (np->frag_size)
 634                         mtu = np->frag_size;
 635         }
 636         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
 637                 goto fail_toobig;
 638         mtu -= hlen + sizeof(struct frag_hdr);
 639
 640         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 641                                     &ipv6_hdr(skb)->saddr);
 642
 643         if (skb->ip_summed == CHECKSUM_PARTIAL &&
 644             (err = skb_checksum_help(skb)))
 645                 goto fail;
 646
 647         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 648         if (skb_has_frag_list(skb)) {
 649                 unsigned int first_len = skb_pagelen(skb);
 650                 struct sk_buff *frag2;
 651
 652                 if (first_len - hlen > mtu ||
 653                     ((first_len - hlen) & 7) ||
 654                     skb_cloned(skb) ||
 655                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 656                         goto slow_path;
 657
 658                 skb_walk_frags(skb, frag) {
 659                         /* Correct geometry. */
 660                         if (frag->len > mtu ||
 661                             ((frag->len & 7) && frag->next) ||
 662                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 663                                 goto slow_path_clean;
 664
 665                         /* Partially cloned skb? */
 666                         if (skb_shared(frag))
 667                                 goto slow_path_clean;
 668
 669                         BUG_ON(frag->sk);
 670                         if (skb->sk) {
 671                                 frag->sk = skb->sk;
 672                                 frag->destructor = sock_wfree;
 673                         }
 674                         skb->truesize -= frag->truesize;
 675                 }
 676
 677                 err = 0;
 678                 offset = 0;
 679                 /* BUILD HEADER */
 680
 681                 *prevhdr = NEXTHDR_FRAGMENT;
 682                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 683                 if (!tmp_hdr) {
 684                         err = -ENOMEM;
 685                         goto fail;
 686                 }
 687                 frag = skb_shinfo(skb)->frag_list;
 688                 skb_frag_list_init(skb);
 689
 690                 __skb_pull(skb, hlen);
 691                 fh = __skb_push(skb, sizeof(struct frag_hdr));
 692                 __skb_push(skb, hlen);
 693                 skb_reset_network_header(skb);
 694                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 695
 696                 fh->nexthdr = nexthdr;
 697                 fh->reserved = 0;
 698                 fh->frag_off = htons(IP6_MF);
 699                 fh->identification = frag_id;
 700
 701                 first_len = skb_pagelen(skb);
 702                 skb->data_len = first_len - skb_headlen(skb);
 703                 skb->len = first_len;
 704                 ipv6_hdr(skb)->payload_len = htons(first_len -
 705                                                    sizeof(struct ipv6hdr));
 706
 707                 for (;;) {
 708                         /* Prepare header of the next frame,
 709                          * before previous one went down. */
 710                         if (frag) {
 711                                 frag->ip_summed = CHECKSUM_NONE;
 712                                 skb_reset_transport_header(frag);
 713                                 fh = __skb_push(frag, sizeof(struct frag_hdr));
 714                                 __skb_push(frag, hlen);
 715                                 skb_reset_network_header(frag);
 716                                 memcpy(skb_network_header(frag), tmp_hdr,
 717                                        hlen);
 718                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 719                                 fh->nexthdr = nexthdr;
 720                                 fh->reserved = 0;
 721                                 fh->frag_off = htons(offset);
 722                                 if (frag->next)
 723                                         fh->frag_off |= htons(IP6_MF);
 724                                 fh->identification = frag_id;
 725                                 ipv6_hdr(frag)->payload_len =
 726                                                 htons(frag->len -
 727                                                       sizeof(struct ipv6hdr));
 728                                 ip6_copy_metadata(frag, skb);
 729                         }
 730
 731                         err = output(net, sk, skb);
 732                         if (!err)
 733                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 734                                               IPSTATS_MIB_FRAGCREATES);
 735
 736                         if (err || !frag)
 737                                 break;
 738
 739                         skb = frag;
 740                         frag = skb->next;
 741                         skb->next = NULL;
 742                 }
 743
 744                 kfree(tmp_hdr);
 745
 746                 if (err == 0) {
 747                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 748                                       IPSTATS_MIB_FRAGOKS);
 749                         return 0;
 750                 }
 751
 752                 kfree_skb_list(frag);
 753
 754                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 755                               IPSTATS_MIB_FRAGFAILS);
 756                 return err;
 757
 758 slow_path_clean:
 759                 skb_walk_frags(skb, frag2) {
 760                         if (frag2 == frag)
 761                                 break;
 762                         frag2->sk = NULL;
 763                         frag2->destructor = NULL;
 764                         skb->truesize += frag2->truesize;
 765                 }
 766         }
 767
 768 slow_path:
 769         left = skb->len - hlen;         /* Space per frame */
 770         ptr = hlen;                     /* Where to start from */
 771
 772         /*
 773          *      Fragment the datagram.
 774          */
 775
 776         troom = rt->dst.dev->needed_tailroom;
 777
 778         /*
 779          *      Keep copying data until we run out.
 780          */
 781         while (left > 0)        {
 782                 u8 *fragnexthdr_offset;
 783
 784                 len = left;
 785                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 786                 if (len > mtu)
 787                         len = mtu;
 788                 /* IF: we are not sending up to and including the packet end
 789                    then align the next start on an eight byte boundary */
 790                 if (len < left) {
 791                         len &= ~7;
 792                 }
 793
 794                 /* Allocate buffer */
 795                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
 796                                  hroom + troom, GFP_ATOMIC);
 797                 if (!frag) {
 798                         err = -ENOMEM;
 799                         goto fail;
 800                 }
 801
 802                 /*
 803                  *      Set up data on packet
 804                  */
 805
 806                 ip6_copy_metadata(frag, skb);
 807                 skb_reserve(frag, hroom);
 808                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 809                 skb_reset_network_header(frag);
 810                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 811                 frag->transport_header = (frag->network_header + hlen +
 812                                           sizeof(struct frag_hdr));
 813
 814                 /*
 815                  *      Charge the memory for the fragment to any owner
 816                  *      it might possess
 817                  */
 818                 if (skb->sk)
 819                         skb_set_owner_w(frag, skb->sk);
 820
 821                 /*
 822                  *      Copy the packet header into the new buffer.
 823                  */
 824                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 825
 826                 fragnexthdr_offset = skb_network_header(frag);
 827                 fragnexthdr_offset += prevhdr - skb_network_header(skb);
 828                 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
 829
 830                 /*
 831                  *      Build fragment header.
 832                  */
 833                 fh->nexthdr = nexthdr;
 834                 fh->reserved = 0;
 835                 fh->identification = frag_id;
 836
 837                 /*
 838                  *      Copy a block of the IP datagram.
 839                  */
 840                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
 841                                      len));
 842                 left -= len;
 843
 844                 fh->frag_off = htons(offset);
 845                 if (left > 0)
 846                         fh->frag_off |= htons(IP6_MF);
 847                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 848                                                     sizeof(struct ipv6hdr));
 849
 850                 ptr += len;
 851                 offset += len;
 852
 853                 /*
 854                  *      Put this fragment into the sending queue.
 855                  */
 856                 err = output(net, sk, frag);
 857                 if (err)
 858                         goto fail;
 859
 860                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 861                               IPSTATS_MIB_FRAGCREATES);
 862         }
 863         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 864                       IPSTATS_MIB_FRAGOKS);
 865         consume_skb(skb);
 866         return err;
 867
 868 fail_toobig:
 869         if (skb->sk && dst_allfrag(skb_dst(skb)))
 870                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 871
 872         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 873         err = -EMSGSIZE;
 874
 875 fail:
 876         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 877                       IPSTATS_MIB_FRAGFAILS);
 878         kfree_skb(skb);
 879         return err;
 880 }
 881
 882 static inline int ip6_rt_check(const struct rt6key *rt_key,
 883                                const struct in6_addr *fl_addr,
 884                                const struct in6_addr *addr_cache)
 885 {
 886         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 887                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
 888 }
 889
 890 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 891                                           struct dst_entry *dst,
 892                                           const struct flowi6 *fl6)
 893 {
 894         struct ipv6_pinfo *np = inet6_sk(sk);
 895         struct rt6_info *rt;
 896
 897         if (!dst)
 898                 goto out;
 899
 900         if (dst->ops->family != AF_INET6) {
 901                 dst_release(dst);
 902                 return NULL;
 903         }
 904
 905         rt = (struct rt6_info *)dst;
 906         /* Yes, checking route validity in not connected
 907          * case is not very simple. Take into account,
 908          * that we do not support routing by source, TOS,
 909          * and MSG_DONTROUTE            --ANK (980726)
 910          *
 911          * 1. ip6_rt_check(): If route was host route,
 912          *    check that cached destination is current.
 913          *    If it is network route, we still may
 914          *    check its validity using saved pointer
 915          *    to the last used address: daddr_cache.
 916          *    We do not want to save whole address now,
 917          *    (because main consumer of this service
 918          *    is tcp, which has not this problem),
 919          *    so that the last trick works only on connected
 920          *    sockets.
 921          * 2. oif also should be the same.
 922          */
 923         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
 924 #ifdef CONFIG_IPV6_SUBTREES
 925             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 926 #endif
 927            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
 928               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
 929                 dst_release(dst);
 930                 dst = NULL;
 931         }
 932
 933 out:
 934         return dst;
 935 }
 936
 937 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
 938                                struct dst_entry **dst, struct flowi6 *fl6)
 939 {
 940 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 941         struct neighbour *n;
 942         struct rt6_info *rt;
 943 #endif
 944         int err;
 945         int flags = 0;
 946
 947         /* The correct way to handle this would be to do
 948          * ip6_route_get_saddr, and then ip6_route_output; however,
 949          * the route-specific preferred source forces the
 950          * ip6_route_output call _before_ ip6_route_get_saddr.
 951          *
 952          * In source specific routing (no src=any default route),
 953          * ip6_route_output will fail given src=any saddr, though, so
 954          * that's why we try it again later.
 955          */
 956         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
 957                 struct rt6_info *rt;
 958                 bool had_dst = *dst != NULL;
 959
 960                 if (!had_dst)
 961                         *dst = ip6_route_output(net, sk, fl6);
 962                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
 963                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
 964                                           sk ? inet6_sk(sk)->srcprefs : 0,
 965                                           &fl6->saddr);
 966                 if (err)
 967                         goto out_err_release;
 968
 969                 /* If we had an erroneous initial result, pretend it
 970                  * never existed and let the SA-enabled version take
 971                  * over.
 972                  */
 973                 if (!had_dst && (*dst)->error) {
 974                         dst_release(*dst);
 975                         *dst = NULL;
 976                 }
 977
 978                 if (fl6->flowi6_oif)
 979                         flags |= RT6_LOOKUP_F_IFACE;
 980         }
 981
 982         if (!*dst)
 983                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
 984
 985         err = (*dst)->error;
 986         if (err)
 987                 goto out_err_release;
 988
 989 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 990         /*
 991          * Here if the dst entry we've looked up
 992          * has a neighbour entry that is in the INCOMPLETE
 993          * state and the src address from the flow is
 994          * marked as OPTIMISTIC, we release the found
 995          * dst entry and replace it instead with the
 996          * dst entry of the nexthop router
 997          */
 998         rt = (struct rt6_info *) *dst;
 999         rcu_read_lock_bh();
1000         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1001                                       rt6_nexthop(rt, &fl6->daddr));
1002         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1003         rcu_read_unlock_bh();
1004
1005         if (err) {
1006                 struct inet6_ifaddr *ifp;
1007                 struct flowi6 fl_gw6;
1008                 int redirect;
1009
1010                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1011                                       (*dst)->dev, 1);
1012
1013                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1014                 if (ifp)
1015                         in6_ifa_put(ifp);
1016
1017                 if (redirect) {
1018                         /*
1019                          * We need to get the dst entry for the
1020                          * default router instead
1021                          */
1022                         dst_release(*dst);
1023                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1024                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1025                         *dst = ip6_route_output(net, sk, &fl_gw6);
1026                         err = (*dst)->error;
1027                         if (err)
1028                                 goto out_err_release;
1029                 }
1030         }
1031 #endif
1032         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1033             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1034                 err = -EAFNOSUPPORT;
1035                 goto out_err_release;
1036         }
1037
1038         return 0;
1039
1040 out_err_release:
1041         dst_release(*dst);
1042         *dst = NULL;
1043
1044         if (err == -ENETUNREACH)
1045                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1046         return err;
1047 }
1048
1049 /**
1050  *      ip6_dst_lookup - perform route lookup on flow
1051  *      @sk: socket which provides route info
1052  *      @dst: pointer to dst_entry * for result
1053  *      @fl6: flow to lookup
1054  *
1055  *      This function performs a route lookup on the given flow.
1056  *
1057  *      It returns zero on success, or a standard errno code on error.
1058  */
1059 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1060                    struct flowi6 *fl6)
1061 {
1062         *dst = NULL;
1063         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1064 }
1065 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1066
1067 /**
1068  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1069  *      @sk: socket which provides route info
1070  *      @fl6: flow to lookup
1071  *      @final_dst: final destination address for ipsec lookup
1072  *
1073  *      This function performs a route lookup on the given flow.
1074  *
1075  *      It returns a valid dst pointer on success, or a pointer encoded
1076  *      error code.
1077  */
1078 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1079                                       const struct in6_addr *final_dst)
1080 {
1081         struct dst_entry *dst = NULL;
1082         int err;
1083
1084         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1085         if (err)
1086                 return ERR_PTR(err);
1087         if (final_dst)
1088                 fl6->daddr = *final_dst;
1089
1090         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1091 }
1092 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1093
1094 /**
1095  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1096  *      @sk: socket which provides the dst cache and route info
1097  *      @fl6: flow to lookup
1098  *      @final_dst: final destination address for ipsec lookup
1099  *
1100  *      This function performs a route lookup on the given flow with the
1101  *      possibility of using the cached route in the socket if it is valid.
1102  *      It will take the socket dst lock when operating on the dst cache.
1103  *      As a result, this function can only be used in process context.
1104  *
1105  *      It returns a valid dst pointer on success, or a pointer encoded
1106  *      error code.
1107  */
1108 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1109                                          const struct in6_addr *final_dst)
1110 {
1111         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1112
1113         dst = ip6_sk_dst_check(sk, dst, fl6);
1114         if (!dst)
1115                 dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1116
1117         return dst;
1118 }
1119 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1120
1121 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1122                                                gfp_t gfp)
1123 {
1124         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1125 }
1126
1127 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1128                                                 gfp_t gfp)
1129 {
1130         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1131 }
1132
1133 static void ip6_append_data_mtu(unsigned int *mtu,
1134                                 int *maxfraglen,
1135                                 unsigned int fragheaderlen,
1136                                 struct sk_buff *skb,
1137                                 struct rt6_info *rt,
1138                                 unsigned int orig_mtu)
1139 {
1140         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1141                 if (!skb) {
1142                         /* first fragment, reserve header_len */
1143                         *mtu = orig_mtu - rt->dst.header_len;
1144
1145                 } else {
1146                         /*
1147                          * this fragment is not first, the headers
1148                          * space is regarded as data space.
1149                          */
1150                         *mtu = orig_mtu;
1151                 }
1152                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1153                               + fragheaderlen - sizeof(struct frag_hdr);
1154         }
1155 }
1156
1157 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1158                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1159                           struct rt6_info *rt, struct flowi6 *fl6)
1160 {
1161         struct ipv6_pinfo *np = inet6_sk(sk);
1162         unsigned int mtu;
1163         struct ipv6_txoptions *opt = ipc6->opt;
1164
1165         /*
1166          * setup for corking
1167          */
1168         if (opt) {
1169                 if (WARN_ON(v6_cork->opt))
1170                         return -EINVAL;
1171
1172                 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1173                 if (unlikely(!v6_cork->opt))
1174                         return -ENOBUFS;
1175
1176                 v6_cork->opt->tot_len = sizeof(*opt);
1177                 v6_cork->opt->opt_flen = opt->opt_flen;
1178                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1179
1180                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1181                                                     sk->sk_allocation);
1182                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1183                         return -ENOBUFS;
1184
1185                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1186                                                     sk->sk_allocation);
1187                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1188                         return -ENOBUFS;
1189
1190                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1191                                                    sk->sk_allocation);
1192                 if (opt->hopopt && !v6_cork->opt->hopopt)
1193                         return -ENOBUFS;
1194
1195                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1196                                                     sk->sk_allocation);
1197                 if (opt->srcrt && !v6_cork->opt->srcrt)
1198                         return -ENOBUFS;
1199
1200                 /* need source address above miyazawa*/
1201         }
1202         dst_hold(&rt->dst);
1203         cork->base.dst = &rt->dst;
1204         cork->fl.u.ip6 = *fl6;
1205         v6_cork->hop_limit = ipc6->hlimit;
1206         v6_cork->tclass = ipc6->tclass;
1207         if (rt->dst.flags & DST_XFRM_TUNNEL)
1208                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1209                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1210         else
1211                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1212                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path);
1213         if (np->frag_size < mtu) {
1214                 if (np->frag_size)
1215                         mtu = np->frag_size;
1216         }
1217         if (mtu < IPV6_MIN_MTU)
1218                 return -EINVAL;
1219         cork->base.fragsize = mtu;
1220         if (dst_allfrag(rt->dst.path))
1221                 cork->base.flags |= IPCORK_ALLFRAG;
1222         cork->base.length = 0;
1223
1224         return 0;
1225 }
1226
1227 static int __ip6_append_data(struct sock *sk,
1228                              struct flowi6 *fl6,
1229                              struct sk_buff_head *queue,
1230                              struct inet_cork *cork,
1231                              struct inet6_cork *v6_cork,
1232                              struct page_frag *pfrag,
1233                              int getfrag(void *from, char *to, int offset,
1234                                          int len, int odd, struct sk_buff *skb),
1235                              void *from, int length, int transhdrlen,
1236                              unsigned int flags, struct ipcm6_cookie *ipc6,
1237                              const struct sockcm_cookie *sockc)
1238 {
1239         struct sk_buff *skb, *skb_prev = NULL;
1240         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1241         int exthdrlen = 0;
1242         int dst_exthdrlen = 0;
1243         int hh_len;
1244         int copy;
1245         int err;
1246         int offset = 0;
1247         __u8 tx_flags = 0;
1248         u32 tskey = 0;
1249         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1250         struct ipv6_txoptions *opt = v6_cork->opt;
1251         int csummode = CHECKSUM_NONE;
1252         unsigned int maxnonfragsize, headersize;
1253
1254         skb = skb_peek_tail(queue);
1255         if (!skb) {
1256                 exthdrlen = opt ? opt->opt_flen : 0;
1257                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1258         }
1259
1260         mtu = cork->fragsize;
1261         orig_mtu = mtu;
1262
1263         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1264
1265         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1266                         (opt ? opt->opt_nflen : 0);
1267         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1268                      sizeof(struct frag_hdr);
1269
1270         headersize = sizeof(struct ipv6hdr) +
1271                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1272                      (dst_allfrag(&rt->dst) ?
1273                       sizeof(struct frag_hdr) : 0) +
1274                      rt->rt6i_nfheader_len;
1275
1276         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1277             (sk->sk_protocol == IPPROTO_UDP ||
1278              sk->sk_protocol == IPPROTO_RAW)) {
1279                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1280                                 sizeof(struct ipv6hdr));
1281                 goto emsgsize;
1282         }
1283
1284         if (ip6_sk_ignore_df(sk))
1285                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1286         else
1287                 maxnonfragsize = mtu;
1288
1289         if (cork->length + length > maxnonfragsize - headersize) {
1290 emsgsize:
1291                 ipv6_local_error(sk, EMSGSIZE, fl6,
1292                                  mtu - headersize +
1293                                  sizeof(struct ipv6hdr));
1294                 return -EMSGSIZE;
1295         }
1296
1297         /* CHECKSUM_PARTIAL only with no extension headers and when
1298          * we are not going to fragment
1299          */
1300         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1301             headersize == sizeof(struct ipv6hdr) &&
1302             length <= mtu - headersize &&
1303             !(flags & MSG_MORE) &&
1304             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1305                 csummode = CHECKSUM_PARTIAL;
1306
1307         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1308                 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1309                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1310                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1311                         tskey = sk->sk_tskey++;
1312         }
1313
1314         /*
1315          * Let's try using as much space as possible.
1316          * Use MTU if total length of the message fits into the MTU.
1317          * Otherwise, we need to reserve fragment header and
1318          * fragment alignment (= 8-15 octects, in total).
1319          *
1320          * Note that we may need to "move" the data from the tail of
1321          * of the buffer to the new fragment when we split
1322          * the message.
1323          *
1324          * FIXME: It may be fragmented into multiple chunks
1325          *        at once if non-fragmentable extension headers
1326          *        are too large.
1327          * --yoshfuji
1328          */
1329
1330         cork->length += length;
1331         if (!skb)
1332                 goto alloc_new_skb;
1333
1334         while (length > 0) {
1335                 /* Check if the remaining data fits into current packet. */
1336                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1337                 if (copy < length)
1338                         copy = maxfraglen - skb->len;
1339
1340                 if (copy <= 0) {
1341                         char *data;
1342                         unsigned int datalen;
1343                         unsigned int fraglen;
1344                         unsigned int fraggap;
1345                         unsigned int alloclen;
1346 alloc_new_skb:
1347                         /* There's no room in the current skb */
1348                         if (skb)
1349                                 fraggap = skb->len - maxfraglen;
1350                         else
1351                                 fraggap = 0;
1352                         /* update mtu and maxfraglen if necessary */
1353                         if (!skb || !skb_prev)
1354                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1355                                                     fragheaderlen, skb, rt,
1356                                                     orig_mtu);
1357
1358                         skb_prev = skb;
1359
1360                         /*
1361                          * If remaining data exceeds the mtu,
1362                          * we know we need more fragment(s).
1363                          */
1364                         datalen = length + fraggap;
1365
1366                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1367                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1368                         if ((flags & MSG_MORE) &&
1369                             !(rt->dst.dev->features&NETIF_F_SG))
1370                                 alloclen = mtu;
1371                         else
1372                                 alloclen = datalen + fragheaderlen;
1373
1374                         alloclen += dst_exthdrlen;
1375
1376                         if (datalen != length + fraggap) {
1377                                 /*
1378                                  * this is not the last fragment, the trailer
1379                                  * space is regarded as data space.
1380                                  */
1381                                 datalen += rt->dst.trailer_len;
1382                         }
1383
1384                         alloclen += rt->dst.trailer_len;
1385                         fraglen = datalen + fragheaderlen;
1386
1387                         /*
1388                          * We just reserve space for fragment header.
1389                          * Note: this may be overallocation if the message
1390                          * (without MSG_MORE) fits into the MTU.
1391                          */
1392                         alloclen += sizeof(struct frag_hdr);
1393
1394                         copy = datalen - transhdrlen - fraggap;
1395                         if (copy < 0) {
1396                                 err = -EINVAL;
1397                                 goto error;
1398                         }
1399                         if (transhdrlen) {
1400                                 skb = sock_alloc_send_skb(sk,
1401                                                 alloclen + hh_len,
1402                                                 (flags & MSG_DONTWAIT), &err);
1403                         } else {
1404                                 skb = NULL;
1405                                 if (refcount_read(&sk->sk_wmem_alloc) <=
1406                                     2 * sk->sk_sndbuf)
1407                                         skb = sock_wmalloc(sk,
1408                                                            alloclen + hh_len, 1,
1409                                                            sk->sk_allocation);
1410                                 if (unlikely(!skb))
1411                                         err = -ENOBUFS;
1412                         }
1413                         if (!skb)
1414                                 goto error;
1415                         /*
1416                          *      Fill in the control structures
1417                          */
1418                         skb->protocol = htons(ETH_P_IPV6);
1419                         skb->ip_summed = csummode;
1420                         skb->csum = 0;
1421                         /* reserve for fragmentation and ipsec header */
1422                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1423                                     dst_exthdrlen);
1424
1425                         /* Only the initial fragment is time stamped */
1426                         skb_shinfo(skb)->tx_flags = tx_flags;
1427                         tx_flags = 0;
1428                         skb_shinfo(skb)->tskey = tskey;
1429                         tskey = 0;
1430
1431                         /*
1432                          *      Find where to start putting bytes
1433                          */
1434                         data = skb_put(skb, fraglen);
1435                         skb_set_network_header(skb, exthdrlen);
1436                         data += fragheaderlen;
1437                         skb->transport_header = (skb->network_header +
1438                                                  fragheaderlen);
1439                         if (fraggap) {
1440                                 skb->csum = skb_copy_and_csum_bits(
1441                                         skb_prev, maxfraglen,
1442                                         data + transhdrlen, fraggap, 0);
1443                                 skb_prev->csum = csum_sub(skb_prev->csum,
1444                                                           skb->csum);
1445                                 data += fraggap;
1446                                 pskb_trim_unique(skb_prev, maxfraglen);
1447                         }
1448                         if (copy > 0 &&
1449                             getfrag(from, data + transhdrlen, offset,
1450                                     copy, fraggap, skb) < 0) {
1451                                 err = -EFAULT;
1452                                 kfree_skb(skb);
1453                                 goto error;
1454                         }
1455
1456                         offset += copy;
1457                         length -= datalen - fraggap;
1458                         transhdrlen = 0;
1459                         exthdrlen = 0;
1460                         dst_exthdrlen = 0;
1461
1462                         if ((flags & MSG_CONFIRM) && !skb_prev)
1463                                 skb_set_dst_pending_confirm(skb, 1);
1464
1465                         /*
1466                          * Put the packet on the pending queue
1467                          */
1468                         __skb_queue_tail(queue, skb);
1469                         continue;
1470                 }
1471
1472                 if (copy > length)
1473                         copy = length;
1474
1475                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1476                         unsigned int off;
1477
1478                         off = skb->len;
1479                         if (getfrag(from, skb_put(skb, copy),
1480                                                 offset, copy, off, skb) < 0) {
1481                                 __skb_trim(skb, off);
1482                                 err = -EFAULT;
1483                                 goto error;
1484                         }
1485                 } else {
1486                         int i = skb_shinfo(skb)->nr_frags;
1487
1488                         err = -ENOMEM;
1489                         if (!sk_page_frag_refill(sk, pfrag))
1490                                 goto error;
1491
1492                         if (!skb_can_coalesce(skb, i, pfrag->page,
1493                                               pfrag->offset)) {
1494                                 err = -EMSGSIZE;
1495                                 if (i == MAX_SKB_FRAGS)
1496                                         goto error;
1497
1498                                 __skb_fill_page_desc(skb, i, pfrag->page,
1499                                                      pfrag->offset, 0);
1500                                 skb_shinfo(skb)->nr_frags = ++i;
1501                                 get_page(pfrag->page);
1502                         }
1503                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1504                         if (getfrag(from,
1505                                     page_address(pfrag->page) + pfrag->offset,
1506                                     offset, copy, skb->len, skb) < 0)
1507                                 goto error_efault;
1508
1509                         pfrag->offset += copy;
1510                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1511                         skb->len += copy;
1512                         skb->data_len += copy;
1513                         skb->truesize += copy;
1514                         refcount_add(copy, &sk->sk_wmem_alloc);
1515                 }
1516                 offset += copy;
1517                 length -= copy;
1518         }
1519
1520         return 0;
1521
1522 error_efault:
1523         err = -EFAULT;
1524 error:
1525         cork->length -= length;
1526         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1527         return err;
1528 }
1529
1530 int ip6_append_data(struct sock *sk,
1531                     int getfrag(void *from, char *to, int offset, int len,
1532                                 int odd, struct sk_buff *skb),
1533                     void *from, int length, int transhdrlen,
1534                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1535                     struct rt6_info *rt, unsigned int flags,
1536                     const struct sockcm_cookie *sockc)
1537 {
1538         struct inet_sock *inet = inet_sk(sk);
1539         struct ipv6_pinfo *np = inet6_sk(sk);
1540         int exthdrlen;
1541         int err;
1542
1543         if (flags&MSG_PROBE)
1544                 return 0;
1545         if (skb_queue_empty(&sk->sk_write_queue)) {
1546                 /*
1547                  * setup for corking
1548                  */
1549                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1550                                      ipc6, rt, fl6);
1551                 if (err)
1552                         return err;
1553
1554                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1555                 length += exthdrlen;
1556                 transhdrlen += exthdrlen;
1557         } else {
1558                 fl6 = &inet->cork.fl.u.ip6;
1559                 transhdrlen = 0;
1560         }
1561
1562         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1563                                  &np->cork, sk_page_frag(sk), getfrag,
1564                                  from, length, transhdrlen, flags, ipc6, sockc);
1565 }
1566 EXPORT_SYMBOL_GPL(ip6_append_data);
1567
1568 static void ip6_cork_release(struct inet_cork_full *cork,
1569                              struct inet6_cork *v6_cork)
1570 {
1571         if (v6_cork->opt) {
1572                 kfree(v6_cork->opt->dst0opt);
1573                 kfree(v6_cork->opt->dst1opt);
1574                 kfree(v6_cork->opt->hopopt);
1575                 kfree(v6_cork->opt->srcrt);
1576                 kfree(v6_cork->opt);
1577                 v6_cork->opt = NULL;
1578         }
1579
1580         if (cork->base.dst) {
1581                 dst_release(cork->base.dst);
1582                 cork->base.dst = NULL;
1583                 cork->base.flags &= ~IPCORK_ALLFRAG;
1584         }
1585         memset(&cork->fl, 0, sizeof(cork->fl));
1586 }
1587
1588 struct sk_buff *__ip6_make_skb(struct sock *sk,
1589                                struct sk_buff_head *queue,
1590                                struct inet_cork_full *cork,
1591                                struct inet6_cork *v6_cork)
1592 {
1593         struct sk_buff *skb, *tmp_skb;
1594         struct sk_buff **tail_skb;
1595         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1596         struct ipv6_pinfo *np = inet6_sk(sk);
1597         struct net *net = sock_net(sk);
1598         struct ipv6hdr *hdr;
1599         struct ipv6_txoptions *opt = v6_cork->opt;
1600         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1601         struct flowi6 *fl6 = &cork->fl.u.ip6;
1602         unsigned char proto = fl6->flowi6_proto;
1603
1604         skb = __skb_dequeue(queue);
1605         if (!skb)
1606                 goto out;
1607         tail_skb = &(skb_shinfo(skb)->frag_list);
1608
1609         /* move skb->data to ip header from ext header */
1610         if (skb->data < skb_network_header(skb))
1611                 __skb_pull(skb, skb_network_offset(skb));
1612         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1613                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1614                 *tail_skb = tmp_skb;
1615                 tail_skb = &(tmp_skb->next);
1616                 skb->len += tmp_skb->len;
1617                 skb->data_len += tmp_skb->len;
1618                 skb->truesize += tmp_skb->truesize;
1619                 tmp_skb->destructor = NULL;
1620                 tmp_skb->sk = NULL;
1621         }
1622
1623         /* Allow local fragmentation. */
1624         skb->ignore_df = ip6_sk_ignore_df(sk);
1625
1626         *final_dst = fl6->daddr;
1627         __skb_pull(skb, skb_network_header_len(skb));
1628         if (opt && opt->opt_flen)
1629                 ipv6_push_frag_opts(skb, opt, &proto);
1630         if (opt && opt->opt_nflen)
1631                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1632
1633         skb_push(skb, sizeof(struct ipv6hdr));
1634         skb_reset_network_header(skb);
1635         hdr = ipv6_hdr(skb);
1636
1637         ip6_flow_hdr(hdr, v6_cork->tclass,
1638                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1639                                         ip6_autoflowlabel(net, np), fl6));
1640         hdr->hop_limit = v6_cork->hop_limit;
1641         hdr->nexthdr = proto;
1642         hdr->saddr = fl6->saddr;
1643         hdr->daddr = *final_dst;
1644
1645         skb->priority = sk->sk_priority;
1646         skb->mark = sk->sk_mark;
1647
1648         skb_dst_set(skb, dst_clone(&rt->dst));
1649         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1650         if (proto == IPPROTO_ICMPV6) {
1651                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1652
1653                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1654                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1655         }
1656
1657         ip6_cork_release(cork, v6_cork);
1658 out:
1659         return skb;
1660 }
1661
1662 int ip6_send_skb(struct sk_buff *skb)
1663 {
1664         struct net *net = sock_net(skb->sk);
1665         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1666         int err;
1667
1668         err = ip6_local_out(net, skb->sk, skb);
1669         if (err) {
1670                 if (err > 0)
1671                         err = net_xmit_errno(err);
1672                 if (err)
1673                         IP6_INC_STATS(net, rt->rt6i_idev,
1674                                       IPSTATS_MIB_OUTDISCARDS);
1675         }
1676
1677         return err;
1678 }
1679
/*
 * ip6_push_pending_frames - finalise and transmit the socket's pending data.
 *
 * Builds the packet with ip6_finish_skb(); when that yields no skb there
 * is nothing to send and 0 is returned.  Otherwise the result of
 * ip6_send_skb() is returned.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *pending = ip6_finish_skb(sk);

	return pending ? ip6_send_skb(pending) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1691
1692 static void __ip6_flush_pending_frames(struct sock *sk,
1693                                        struct sk_buff_head *queue,
1694                                        struct inet_cork_full *cork,
1695                                        struct inet6_cork *v6_cork)
1696 {
1697         struct sk_buff *skb;
1698
1699         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1700                 if (skb_dst(skb))
1701                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1702                                       IPSTATS_MIB_OUTDISCARDS);
1703                 kfree_skb(skb);
1704         }
1705
1706         ip6_cork_release(cork, v6_cork);
1707 }
1708
1709 void ip6_flush_pending_frames(struct sock *sk)
1710 {
1711         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1712                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1713 }
1714 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1715
1716 struct sk_buff *ip6_make_skb(struct sock *sk,
1717                              int getfrag(void *from, char *to, int offset,
1718                                          int len, int odd, struct sk_buff *skb),
1719                              void *from, int length, int transhdrlen,
1720                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1721                              struct rt6_info *rt, unsigned int flags,
1722                              const struct sockcm_cookie *sockc)
1723 {
1724         struct inet_cork_full cork;
1725         struct inet6_cork v6_cork;
1726         struct sk_buff_head queue;
1727         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1728         int err;
1729
1730         if (flags & MSG_PROBE)
1731                 return NULL;
1732
1733         __skb_queue_head_init(&queue);
1734
1735         cork.base.flags = 0;
1736         cork.base.addr = 0;
1737         cork.base.opt = NULL;
1738         cork.base.dst = NULL;
1739         v6_cork.opt = NULL;
1740         err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1741         if (err) {
1742                 ip6_cork_release(&cork, &v6_cork);
1743                 return ERR_PTR(err);
1744         }
1745         if (ipc6->dontfrag < 0)
1746                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1747
1748         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1749                                 &current->task_frag, getfrag, from,
1750                                 length + exthdrlen, transhdrlen + exthdrlen,
1751                                 flags, ipc6, sockc);
1752         if (err) {
1753                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1754                 return ERR_PTR(err);
1755         }
1756
1757         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1758 }