/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case if packet not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */
#include <asm/uaccess.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);
/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);
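
/*
 * Usage sketch (illustrative only, not a new API): ip_fast_csum() sums
 * the whole ihl-word header, so any later rewrite of a header field
 * must be followed by a fresh ip_send_check().  A forwarding-style
 * path that decrements the TTL, assuming skb already carries a valid
 * IPv4 header, would do:
 *
 *	struct iphdr *iph = ip_hdr(skb);
 *
 *	iph->ttl--;
 *	ip_send_check(iph);
 */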
int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}
int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = ip4_dst_hoplimit(dst);
	return ttl;
}
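
/*
 * Convention sketch (for illustration): inet->uc_ttl stays at -1 unless
 * the application pinned a value, so a per-socket TTL takes precedence
 * and the route's hop limit (backed by sysctl_ip_default_ttl) is only
 * the fallback.  From userspace the override would look like:
 *
 *	int ttl = 5;
 *	setsockopt(fd, IPPROTO_IP, IP_TTL, &ttl, sizeof(ttl));
 */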
/*
 *		Add an ip header to a skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
	iph->saddr    = saddr;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(iph, &rt->dst, sk);

	if (opt && opt->opt.optlen) {
		iph->ihl += opt->opt.optlen >> 2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
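
/*
 * Caller sketch (illustrative; exact arguments depend on the caller):
 * a transport holding an already-routed skb can emit it with nothing
 * more than the addresses and options, the way a TCP SYN-ACK reply
 * path might:
 *
 *	err = ip_build_and_send_pkt(skb, sk, saddr, daddr, opt);
 *	if (err > 0)
 *		err = net_xmit_errno(err);
 */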
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;
	u32 nexthop;

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	rcu_read_lock_bh();
	nexthop = rt->rt_gateway ? rt->rt_gateway : ip_hdr(skb)->daddr;
	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
	if (!IS_ERR(neigh)) {
		int res = dst_neigh_output(dst, neigh, skb);

		rcu_read_unlock_bh();
		return res;
	}
	rcu_read_unlock_bh();

	net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
			    __func__);
	kfree_skb(skb);
	return -EINVAL;
}
static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}
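
/*
 * Worked example (assumed numbers): device MTU 1500, route PMTU 1400.
 * A socket in IP_PMTUDISC_PROBE mode is sized against the device (1500)
 * and probes the path itself; every other socket is clamped to the
 * destination cache value (1400).  Userspace opts in with:
 *
 *	int val = IP_PMTUDISC_PROBE;
 *	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 */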
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loopback not local frames,
		   which returned after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note, that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, dev_loopback_xmit);
	}

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
/*
 * copy saddr and daddr, possibly using 64bit load/stores
 * Equivalent to :
 *   iph->saddr = fl4->saddr;
 *   iph->daddr = fl4->daddr;
 */
static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
{
	BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
		     offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
	memcpy(&iph->saddr, &fl4->saddr,
	       sizeof(fl4->saddr) + sizeof(fl4->daddr));
}
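
/*
 * Why the BUILD_BUG_ON above holds (illustrative): both struct flowi4
 * and struct iphdr lay the source address immediately before the
 * destination address, so a single 8-byte memcpy can move the pair.
 * The same idea in miniature, with a hypothetical struct:
 *
 *	struct addr_pair { __be32 saddr, daddr; };
 *
 *	BUILD_BUG_ON(offsetof(struct addr_pair, daddr) !=
 *		     offsetof(struct addr_pair, saddr) + sizeof(__be32));
 *	memcpy(&dst->saddr, &src->saddr, 2 * sizeof(__be32));
 */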
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct flowi4 *fl4;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * f.e. by something like SCTP.
	 */
	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	fl4 = &fl->u.ip4;
	rt = skb_rtable(skb);
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;

		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times
		 * itself out.
		 */
		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS(sk),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	ip_copy_addrs(iph, fl4);

	/* Transport layer set skb->h.foo itself. */

	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);
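
/*
 * Caller sketch (illustrative): connected transports pass their cached
 * flow so the route can be rebuilt here if the socket's dst has been
 * invalidated, e.g. from a TCP transmit path:
 *
 *	err = ip_queue_xmit(skb, &inet->cork.fl);
 *	if (err > 0)
 *		err = net_xmit_errno(err);
 */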
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}
/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}
	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create wrong frag_list or break existing
	 * one, it is not prohibited. In this case fall back to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when see the first bad fragment.
	 */
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    ip_is_fragment(iph) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}
		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset >> 3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}
		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}
slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)
			len &= ~7;
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF on each bit
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	consume_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_fragment);
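
/*
 * Worked example (assumed numbers): a 4000-byte datagram payload with a
 * 20-byte header over a 1500-byte MTU leaves 1480 bytes of data space,
 * conveniently a multiple of 8.  Offsets travel in 8-byte units:
 *
 *	frag 0: bytes 0..1479,    frag_off = htons((0 >> 3) | IP_MF)
 *	frag 1: bytes 1480..2959, frag_off = htons((1480 >> 3) | IP_MF)
 *	frag 2: bytes 2960..3999, frag_off = htons(2960 >> 3)
 */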
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
EXPORT_SYMBOL(ip_generic_getfrag);
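
/*
 * Contract sketch (illustrative): a getfrag callback copies len payload
 * bytes, starting at offset within the source, to the kernel buffer at
 * to, and folds a running checksum into skb->csum whenever the device
 * cannot checksum for us (i.e. not CHECKSUM_PARTIAL).  odd is the byte
 * parity of the destination so csum_block_add() can rotate correctly.
 * A minimal kernel-space variant that skips checksumming might be:
 *
 *	static int plain_getfrag(void *from, char *to, int offset,
 *				 int len, int odd, struct sk_buff *skb)
 *	{
 *		memcpy(to, (char *)from + offset, len);
 *		return 0;
 *	}
 */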
static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;

	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}
static inline int ip_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int maxfraglen, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP fragmentation offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}
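
/*
 * Worked example (assumed numbers): with a 1500-byte MTU and a plain
 * 20-byte header, maxfraglen is 1500 and fragheaderlen is 20, so the
 * single UFO skb is tagged gso_size = 1480.  The NIC, or the software
 * GSO fallback, later slices the oversized datagram into on-wire IP
 * fragments by itself.
 */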
static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct ip_options *opt = cork->opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;

	skb = skb_peek_tail(queue);

	exthdrlen = !skb ? rt->dst.header_len : 0;
	mtu = cork->fragsize;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
	if (cork->length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu - exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	cork->length += length;
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
					 hh_len, fragheaderlen, transhdrlen,
					 maxfraglen, flags);
		if (err)
			goto error;
		return 0;
	}
	/* So, what's going on in the loop below?
	 *
	 * We use calculated fragment length to generate chained skb,
	 * each of segments is IP fragment ready for sending to network after
	 * adding appropriate IP header.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = fraglen;

			alloclen += exthdrlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					cork->tx_flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			skb_shinfo(skb)->tx_flags = cork->tx_flags;

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen + exthdrlen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen,
						offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}
		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = cork->page;
			int off = cork->off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != skb_frag_page(frag)) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, off, 0);
					skb_frag_ref(skb, i);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				cork->page = page;
				cork->off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, skb_frag_address(frag) + skb_frag_size(frag),
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			cork->off += copy;
			skb_frag_size_add(frag, copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;
error:
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *opt;
	struct rtable *rt;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
		if (cork->opt == NULL) {
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
			if (unlikely(cork->opt == NULL))
				return -ENOBUFS;
		}
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;
	/*
	 * We steal reference to this route, caller should not release it
	 */
	*rtp = NULL;
	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
	cork->dst = &rt->dst;
	cork->length = 0;
	cork->tx_flags = ipc->tx_flags;
	cork->page = NULL;
	cork->off = 0;

	return 0;
}
/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data.  Each piece will be held on the socket
 *	until ip_push_pending_frames() is called.  Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP, other transport protocols - e.g. raw sockets - can use
 *	this interface potentially.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags & MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}

	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
				from, length, transhdrlen, flags);
}
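
/*
 * Usage sketch (illustrative; error handling and setup elided): a
 * UDP-style sender corks pieces onto the queue and pushes them as one
 * datagram, assuming fl4, ipc and rt were prepared by the caller:
 *
 *	err = ip_append_data(sk, fl4, ip_generic_getfrag, msg->msg_iov,
 *			     len, sizeof(struct udphdr), &ipc, &rt,
 *			     msg->msg_flags);
 *	if (!err && !(msg->msg_flags & MSG_MORE))
 *		err = ip_push_pending_frames(sk, fl4);
 */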
ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags & MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (!(rt->dst.dev->features & NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	cork->length += size;
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}
	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {
			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}
static void ip_cork_release(struct inet_cork *cork)
{
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
}
/*
 *	Combine all pending IP fragments on the socket as one IP datagram
 *	and push them out.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
			      struct flowi4 *fl4,
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)cork->dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;

	if ((skb = __skb_dequeue(queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
	 * to fragment the frame generated here.  No matter how transforms
	 * change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow to fragment this frame
	 * locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	iph->frag_off = df;
	ip_select_ident(iph, &rt->dst, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	ip_copy_addrs(iph, fl4);

	if (opt) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
	cork->dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	ip_cork_release(cork);
out:
	return skb;
}
int ip_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	int err;

	err = ip_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb;

	skb = ip_finish_skb(sk, fl4);
	if (!skb)
		return 0;

	/* Netfilter gets the whole, not yet fragmented skb. */
	return ip_send_skb(skb);
}
/*
 *	Throw away all pending data on the socket.
 */
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL)
		kfree_skb(skb);

	ip_cork_release(cork);
}

void ip_flush_pending_frames(struct sock *sk)
{
	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
}
struct sk_buff *ip_make_skb(struct sock *sk,
			    struct flowi4 *fl4,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
			    unsigned int flags)
{
	struct inet_cork cork;
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.flags = 0;
	cork.addr = 0;
	cork.opt = NULL;
	err = ip_setup_cork(sk, &cork, ipc, rtp);
	if (err)
		return ERR_PTR(err);

	err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
			       from, length, transhdrlen, flags);
	if (err) {
		__ip_flush_pending_frames(sk, &queue, &cork);
		return ERR_PTR(err);
	}

	return __ip_make_skb(sk, fl4, &queue, &cork);
}
/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr + offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}
/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
			   __be32 saddr, const struct ip_reply_arg *arg,
			   unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_data replyopts;
	struct ipcm_cookie ipc;
	struct flowi4 fl4;
	struct rtable *rt = skb_rtable(skb);

	if (ip_options_echo(&replyopts.opt.opt, skb))
		return;

	ipc.addr = daddr;
	ipc.opt = NULL;
	ipc.tx_flags = 0;

	if (replyopts.opt.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
	}

	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
			   RT_TOS(arg->tos),
			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
			   ip_reply_arg_flowi_flags(arg),
			   daddr, saddr,
			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
	rt = ip_route_output_key(sock_net(sk), &fl4);
	if (IS_ERR(rt))
		return;

	/* And let IP do all the hard work.

	   This chunk is not reenterable, hence spinlock.
	   Note that it uses the fact, that this function is called
	   with locally disabled BH and that sk cannot be already spinlocked.
	 */
	bh_lock_sock(sk);
	inet->tos = arg->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, &rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk, &fl4);
	}
	bh_unlock_sock(sk);

	ip_rt_put(rt);
}
void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}