net/ipv4/ip_output.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              The Internet Protocol (IP) output module.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Donald Becker, <becker@super.org>
  11  *              Alan Cox, <Alan.Cox@linux.org>
  12  *              Richard Underwood
  13  *              Stefan Becker, <stefanb@yello.ping.de>
  14  *              Jorge Cwik, <jorge@laser.satlink.net>
  15  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  16  *              Hirokazu Takahashi, <taka@valinux.co.jp>
  17  *
  18  *      See ip_input.c for original log
  19  *
  20  *      Fixes:
  21  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  22  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  23  *              Bradford Johnson:       Fix faulty handling of some frames when
  24  *                                      no route is found.
  25  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  26  *                                      (in case if packet not accepted by
  27  *                                      output firewall rules)
  28  *              Mike McLagan    :       Routing by source
  29  *              Alexey Kuznetsov:       use new route cache
  30  *              Andi Kleen:             Fix broken PMTU recovery and remove
  31  *                                      some redundant tests.
  32  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  33  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  34  *              Andi Kleen      :       Split fast and slow ip_build_xmit path
  35  *                                      for decreased register pressure on x86
   36  *                                      and more readability.
  37  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  38  *                                      silently drop skb instead of failing with -EPERM.
  39  *              Detlev Wengorz  :       Copy protocol for fragments.
  40  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
  41  *                                      datagrams.
  42  *              Hirokazu Takahashi:     sendfile() on UDP works now.
  43  */
  44
  45 #include <asm/uaccess.h>
  46 #include <linux/module.h>
  47 #include <linux/types.h>
  48 #include <linux/kernel.h>
  49 #include <linux/mm.h>
  50 #include <linux/string.h>
  51 #include <linux/errno.h>
  52 #include <linux/highmem.h>
  53 #include <linux/slab.h>
  54
  55 #include <linux/socket.h>
  56 #include <linux/sockios.h>
  57 #include <linux/in.h>
  58 #include <linux/inet.h>
  59 #include <linux/netdevice.h>
  60 #include <linux/etherdevice.h>
  61 #include <linux/proc_fs.h>
  62 #include <linux/stat.h>
  63 #include <linux/init.h>
  64
  65 #include <net/snmp.h>
  66 #include <net/ip.h>
  67 #include <net/protocol.h>
  68 #include <net/route.h>
  69 #include <net/xfrm.h>
  70 #include <linux/skbuff.h>
  71 #include <net/sock.h>
  72 #include <net/arp.h>
  73 #include <net/icmp.h>
  74 #include <net/checksum.h>
  75 #include <net/inetpeer.h>
  76 #include <linux/igmp.h>
  77 #include <linux/netfilter_ipv4.h>
  78 #include <linux/netfilter_bridge.h>
  79 #include <linux/mroute.h>
  80 #include <linux/netlink.h>
  81 #include <linux/tcp.h>
  82
   /* Default IP TTL (per its name), initialised to IPDEFTTL and exported to
    * other modules. NOTE(review): presumably tunable via sysctl - confirm. */
   83 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
   84 EXPORT_SYMBOL(sysctl_ip_default_ttl);
  85
  86 /* Generate a checksum for an outgoing IP datagram. */
  87 __inline__ void ip_send_check(struct iphdr *iph)
  88 {
  89         iph->check = 0;
  90         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
  91 }
  92 EXPORT_SYMBOL(ip_send_check);
  93
  94 int __ip_local_out(struct sk_buff *skb)
  95 {
  96         struct iphdr *iph = ip_hdr(skb);
  97
  98         iph->tot_len = htons(skb->len);
  99         ip_send_check(iph);
 100         return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
 101                        skb_dst(skb)->dev, dst_output);
 102 }
 103
 104 int ip_local_out(struct sk_buff *skb)
 105 {
 106         int err;
 107
 108         err = __ip_local_out(skb);
 109         if (likely(err == 1))
 110                 err = dst_output(skb);
 111
 112         return err;
 113 }
 114 EXPORT_SYMBOL_GPL(ip_local_out);
 115
 116 /* dev_loopback_xmit for use with netfilter. */
 117 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
 118 {
 119         skb_reset_mac_header(newskb);
 120         __skb_pull(newskb, skb_network_offset(newskb));
 121         newskb->pkt_type = PACKET_LOOPBACK;
 122         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 123         WARN_ON(!skb_dst(newskb));
 124         skb_dst_force(newskb);
 125         netif_rx_ni(newskb);
 126         return 0;
 127 }
 128
 129 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 130 {
 131         int ttl = inet->uc_ttl;
 132
 133         if (ttl < 0)
 134                 ttl = ip4_dst_hoplimit(dst);
 135         return ttl;
 136 }
 137
 138 /*
 139  *              Add an ip header to a skbuff and send it out.
 140  *
 141  */
 142 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 143                           __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
 144 {
 145         struct inet_sock *inet = inet_sk(sk);
 146         struct rtable *rt = skb_rtable(skb);
 147         struct iphdr *iph;
 148
 149         /* Build the IP header. */
 150         skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
 151         skb_reset_network_header(skb);
 152         iph = ip_hdr(skb);
 153         iph->version  = 4;
 154         iph->ihl      = 5;
 155         iph->tos      = inet->tos;
 156         if (ip_dont_fragment(sk, &rt->dst))
 157                 iph->frag_off = htons(IP_DF);
 158         else
 159                 iph->frag_off = 0;
 160         iph->ttl      = ip_select_ttl(inet, &rt->dst);
 161         iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
 162         iph->saddr    = saddr;
 163         iph->protocol = sk->sk_protocol;
 164         ip_select_ident(iph, &rt->dst, sk);
 165
 166         if (opt && opt->opt.optlen) {
 167                 iph->ihl += opt->opt.optlen>>2;
 168                 ip_options_build(skb, &opt->opt, daddr, rt, 0);
 169         }
 170
 171         skb->priority = sk->sk_priority;
 172         skb->mark = sk->sk_mark;
 173
 174         /* Send it out. */
 175         return ip_local_out(skb);
 176 }
 177 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
 178
 179 static inline int ip_finish_output2(struct sk_buff *skb)
 180 {
 181         struct dst_entry *dst = skb_dst(skb);
 182         struct rtable *rt = (struct rtable *)dst;
 183         struct net_device *dev = dst->dev;
 184         unsigned int hh_len = LL_RESERVED_SPACE(dev);
 185         struct neighbour *neigh;
 186
 187         if (rt->rt_type == RTN_MULTICAST) {
 188                 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
 189         } else if (rt->rt_type == RTN_BROADCAST)
 190                 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
 191
 192         /* Be paranoid, rather than too clever. */
 193         if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
 194                 struct sk_buff *skb2;
 195
 196                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
 197                 if (skb2 == NULL) {
 198                         kfree_skb(skb);
 199                         return -ENOMEM;
 200                 }
 201                 if (skb->sk)
 202                         skb_set_owner_w(skb2, skb->sk);
 203                 kfree_skb(skb);
 204                 skb = skb2;
 205         }
 206
 207         rcu_read_lock();
 208         neigh = dst_get_neighbour_noref(dst);
 209         if (neigh) {
 210                 int res = neigh_output(neigh, skb);
 211
 212                 rcu_read_unlock();
 213                 return res;
 214         }
 215         rcu_read_unlock();
 216
 217         net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
 218                             __func__);
 219         kfree_skb(skb);
 220         return -EINVAL;
 221 }
 222
 223 static inline int ip_skb_dst_mtu(struct sk_buff *skb)
 224 {
 225         struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
 226
 227         return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
 228                skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
 229 }
 230
 231 static int ip_finish_output(struct sk_buff *skb)
 232 {
 233 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 234         /* Policy lookup after SNAT yielded a new policy */
 235         if (skb_dst(skb)->xfrm != NULL) {
 236                 IPCB(skb)->flags |= IPSKB_REROUTED;
 237                 return dst_output(skb);
 238         }
 239 #endif
 240         if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
 241                 return ip_fragment(skb, ip_finish_output2);
 242         else
 243                 return ip_finish_output2(skb);
 244 }
 245
 246 int ip_mc_output(struct sk_buff *skb)
 247 {
 248         struct sock *sk = skb->sk;
 249         struct rtable *rt = skb_rtable(skb);
 250         struct net_device *dev = rt->dst.dev;
 251
 252         /*
 253          *      If the indicated interface is up and running, send the packet.
 254          */
 255         IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
 256
 257         skb->dev = dev;
 258         skb->protocol = htons(ETH_P_IP);
 259
 260         /*
 261          *      Multicasts are looped back for other local users
 262          */
 263
 264         if (rt->rt_flags&RTCF_MULTICAST) {
 265                 if (sk_mc_loop(sk)
 266 #ifdef CONFIG_IP_MROUTE
 267                 /* Small optimization: do not loopback not local frames,
 268                    which returned after forwarding; they will be  dropped
 269                    by ip_mr_input in any case.
 270                    Note, that local frames are looped back to be delivered
 271                    to local recipients.
 272
 273                    This check is duplicated in ip_mr_input at the moment.
 274                  */
 275                     &&
 276                     ((rt->rt_flags & RTCF_LOCAL) ||
 277                      !(IPCB(skb)->flags & IPSKB_FORWARDED))
 278 #endif
 279                    ) {
 280                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 281                         if (newskb)
 282                                 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
 283                                         newskb, NULL, newskb->dev,
 284                                         ip_dev_loopback_xmit);
 285                 }
 286
 287                 /* Multicasts with ttl 0 must not go beyond the host */
 288
 289                 if (ip_hdr(skb)->ttl == 0) {
 290                         kfree_skb(skb);
 291                         return 0;
 292                 }
 293         }
 294
 295         if (rt->rt_flags&RTCF_BROADCAST) {
 296                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 297                 if (newskb)
 298                         NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
 299                                 NULL, newskb->dev, ip_dev_loopback_xmit);
 300         }
 301
 302         return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
 303                             skb->dev, ip_finish_output,
 304                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 305 }
 306
 307 int ip_output(struct sk_buff *skb)
 308 {
 309         struct net_device *dev = skb_dst(skb)->dev;
 310
 311         IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
 312
 313         skb->dev = dev;
 314         skb->protocol = htons(ETH_P_IP);
 315
 316         return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
 317                             ip_finish_output,
 318                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 319 }
 320
 321 /*
 322  * copy saddr and daddr, possibly using 64bit load/stores
 323  * Equivalent to :
 324  *   iph->saddr = fl4->saddr;
 325  *   iph->daddr = fl4->daddr;
 326  */
 327 static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
 328 {
 329         BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
 330                      offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
 331         memcpy(&iph->saddr, &fl4->saddr,
 332                sizeof(fl4->saddr) + sizeof(fl4->daddr));
 333 }
 334
 335 int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
 336 {
 337         struct sock *sk = skb->sk;
 338         struct inet_sock *inet = inet_sk(sk);
 339         struct ip_options_rcu *inet_opt;
 340         struct flowi4 *fl4;
 341         struct rtable *rt;
 342         struct iphdr *iph;
 343         int res;
 344
 345         /* Skip all of this if the packet is already routed,
 346          * f.e. by something like SCTP.
 347          */
 348         rcu_read_lock();
 349         inet_opt = rcu_dereference(inet->inet_opt);
 350         fl4 = &fl->u.ip4;
 351         rt = skb_rtable(skb);
 352         if (rt != NULL)
 353                 goto packet_routed;
 354
 355         /* Make sure we can route this packet. */
 356         rt = (struct rtable *)__sk_dst_check(sk, 0);
 357         if (rt == NULL) {
 358                 __be32 daddr;
 359
 360                 /* Use correct destination address if we have options. */
 361                 daddr = inet->inet_daddr;
 362                 if (inet_opt && inet_opt->opt.srr)
 363                         daddr = inet_opt->opt.faddr;
 364
 365                 /* If this fails, retransmit mechanism of transport layer will
 366                  * keep trying until route appears or the connection times
 367                  * itself out.
 368                  */
 369                 rt = ip_route_output_ports(sock_net(sk), fl4, sk,
 370                                            daddr, inet->inet_saddr,
 371                                            inet->inet_dport,
 372                                            inet->inet_sport,
 373                                            sk->sk_protocol,
 374                                            RT_CONN_FLAGS(sk),
 375                                            sk->sk_bound_dev_if);
 376                 if (IS_ERR(rt))
 377                         goto no_route;
 378                 sk_setup_caps(sk, &rt->dst);
 379         }
 380         skb_dst_set_noref(skb, &rt->dst);
 381
 382 packet_routed:
 383         if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
 384                 goto no_route;
 385
 386         /* OK, we know where to send it, allocate and build IP header. */
 387         skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
 388         skb_reset_network_header(skb);
 389         iph = ip_hdr(skb);
 390         *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
 391         if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
 392                 iph->frag_off = htons(IP_DF);
 393         else
 394                 iph->frag_off = 0;
 395         iph->ttl      = ip_select_ttl(inet, &rt->dst);
 396         iph->protocol = sk->sk_protocol;
 397         ip_copy_addrs(iph, fl4);
 398
 399         /* Transport layer set skb->h.foo itself. */
 400
 401         if (inet_opt && inet_opt->opt.optlen) {
 402                 iph->ihl += inet_opt->opt.optlen >> 2;
 403                 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
 404         }
 405
 406         ip_select_ident_more(iph, &rt->dst, sk,
 407                              (skb_shinfo(skb)->gso_segs ?: 1) - 1);
 408
 409         skb->priority = sk->sk_priority;
 410         skb->mark = sk->sk_mark;
 411
 412         res = ip_local_out(skb);
 413         rcu_read_unlock();
 414         return res;
 415
 416 no_route:
 417         rcu_read_unlock();
 418         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 419         kfree_skb(skb);
 420         return -EHOSTUNREACH;
 421 }
 422 EXPORT_SYMBOL(ip_queue_xmit);
 423
 424
 425 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 426 {
 427         to->pkt_type = from->pkt_type;
 428         to->priority = from->priority;
 429         to->protocol = from->protocol;
 430         skb_dst_drop(to);
 431         skb_dst_copy(to, from);
 432         to->dev = from->dev;
 433         to->mark = from->mark;
 434
 435         /* Copy the flags to each fragment. */
 436         IPCB(to)->flags = IPCB(from)->flags;
 437
 438 #ifdef CONFIG_NET_SCHED
 439         to->tc_index = from->tc_index;
 440 #endif
 441         nf_copy(to, from);
 442 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
 443     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 444         to->nf_trace = from->nf_trace;
 445 #endif
 446 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 447         to->ipvs_property = from->ipvs_property;
 448 #endif
 449         skb_copy_secmark(to, from);
 450 }
 451
 452 /*
 453  *      This IP datagram is too large to be sent in one piece.  Break it up into
 454  *      smaller pieces (each of size equal to IP header plus
 455  *      a block of the data of the original IP data part) that will yet fit in a
 456  *      single device frame, and queue such a frame for sending.
 457  */
 458
 459 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 460 {
 461         struct iphdr *iph;
 462         int ptr;
 463         struct net_device *dev;
 464         struct sk_buff *skb2;
 465         unsigned int mtu, hlen, left, len, ll_rs;
 466         int offset;
 467         __be16 not_last_frag;
 468         struct rtable *rt = skb_rtable(skb);
 469         int err = 0;
 470
 471         dev = rt->dst.dev;
 472
 473         /*
 474          *      Point into the IP datagram header.
 475          */
 476
 477         iph = ip_hdr(skb);
 478
 479         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
 480                 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
 481                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 482                           htonl(ip_skb_dst_mtu(skb)));
 483                 kfree_skb(skb);
 484                 return -EMSGSIZE;
 485         }
 486
 487         /*
 488          *      Setup starting values.
 489          */
 490
 491         hlen = iph->ihl * 4;
 492         mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
 493 #ifdef CONFIG_BRIDGE_NETFILTER
 494         if (skb->nf_bridge)
 495                 mtu -= nf_bridge_mtu_reduction(skb);
 496 #endif
 497         IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
 498
 499         /* When frag_list is given, use it. First, check its validity:
 500          * some transformers could create wrong frag_list or break existing
 501          * one, it is not prohibited. In this case fall back to copying.
 502          *
 503          * LATER: this step can be merged to real generation of fragments,
 504          * we can switch to copy when see the first bad fragment.
 505          */
 506         if (skb_has_frag_list(skb)) {
 507                 struct sk_buff *frag, *frag2;
 508                 int first_len = skb_pagelen(skb);
 509
 510                 if (first_len - hlen > mtu ||
 511                     ((first_len - hlen) & 7) ||
 512                     ip_is_fragment(iph) ||
 513                     skb_cloned(skb))
 514                         goto slow_path;
 515
 516                 skb_walk_frags(skb, frag) {
 517                         /* Correct geometry. */
 518                         if (frag->len > mtu ||
 519                             ((frag->len & 7) && frag->next) ||
 520                             skb_headroom(frag) < hlen)
 521                                 goto slow_path_clean;
 522
 523                         /* Partially cloned skb? */
 524                         if (skb_shared(frag))
 525                                 goto slow_path_clean;
 526
 527                         BUG_ON(frag->sk);
 528                         if (skb->sk) {
 529                                 frag->sk = skb->sk;
 530                                 frag->destructor = sock_wfree;
 531                         }
 532                         skb->truesize -= frag->truesize;
 533                 }
 534
 535                 /* Everything is OK. Generate! */
 536
 537                 err = 0;
 538                 offset = 0;
 539                 frag = skb_shinfo(skb)->frag_list;
 540                 skb_frag_list_init(skb);
 541                 skb->data_len = first_len - skb_headlen(skb);
 542                 skb->len = first_len;
 543                 iph->tot_len = htons(first_len);
 544                 iph->frag_off = htons(IP_MF);
 545                 ip_send_check(iph);
 546
 547                 for (;;) {
 548                         /* Prepare header of the next frame,
 549                          * before previous one went down. */
 550                         if (frag) {
 551                                 frag->ip_summed = CHECKSUM_NONE;
 552                                 skb_reset_transport_header(frag);
 553                                 __skb_push(frag, hlen);
 554                                 skb_reset_network_header(frag);
 555                                 memcpy(skb_network_header(frag), iph, hlen);
 556                                 iph = ip_hdr(frag);
 557                                 iph->tot_len = htons(frag->len);
 558                                 ip_copy_metadata(frag, skb);
 559                                 if (offset == 0)
 560                                         ip_options_fragment(frag);
 561                                 offset += skb->len - hlen;
 562                                 iph->frag_off = htons(offset>>3);
 563                                 if (frag->next != NULL)
 564                                         iph->frag_off |= htons(IP_MF);
 565                                 /* Ready, complete checksum */
 566                                 ip_send_check(iph);
 567                         }
 568
 569                         err = output(skb);
 570
 571                         if (!err)
 572                                 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
 573                         if (err || !frag)
 574                                 break;
 575
 576                         skb = frag;
 577                         frag = skb->next;
 578                         skb->next = NULL;
 579                 }
 580
 581                 if (err == 0) {
 582                         IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
 583                         return 0;
 584                 }
 585
 586                 while (frag) {
 587                         skb = frag->next;
 588                         kfree_skb(frag);
 589                         frag = skb;
 590                 }
 591                 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
 592                 return err;
 593
 594 slow_path_clean:
 595                 skb_walk_frags(skb, frag2) {
 596                         if (frag2 == frag)
 597                                 break;
 598                         frag2->sk = NULL;
 599                         frag2->destructor = NULL;
 600                         skb->truesize += frag2->truesize;
 601                 }
 602         }
 603
 604 slow_path:
 605         left = skb->len - hlen;         /* Space per frame */
 606         ptr = hlen;             /* Where to start from */
 607
 608         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
 609          * we need to make room for the encapsulating header
 610          */
 611         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
 612
 613         /*
 614          *      Fragment the datagram.
 615          */
 616
 617         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 618         not_last_frag = iph->frag_off & htons(IP_MF);
 619
 620         /*
 621          *      Keep copying data until we run out.
 622          */
 623
 624         while (left > 0) {
 625                 len = left;
 626                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 627                 if (len > mtu)
 628                         len = mtu;
 629                 /* IF: we are not sending up to and including the packet end
 630                    then align the next start on an eight byte boundary */
 631                 if (len < left) {
 632                         len &= ~7;
 633                 }
 634                 /*
 635                  *      Allocate buffer.
 636                  */
 637
 638                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
 639                         NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
 640                         err = -ENOMEM;
 641                         goto fail;
 642                 }
 643
 644                 /*
 645                  *      Set up data on packet
 646                  */
 647
 648                 ip_copy_metadata(skb2, skb);
 649                 skb_reserve(skb2, ll_rs);
 650                 skb_put(skb2, len + hlen);
 651                 skb_reset_network_header(skb2);
 652                 skb2->transport_header = skb2->network_header + hlen;
 653
 654                 /*
 655                  *      Charge the memory for the fragment to any owner
 656                  *      it might possess
 657                  */
 658
 659                 if (skb->sk)
 660                         skb_set_owner_w(skb2, skb->sk);
 661
 662                 /*
 663                  *      Copy the packet header into the new buffer.
 664                  */
 665
 666                 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
 667
 668                 /*
 669                  *      Copy a block of the IP datagram.
 670                  */
 671                 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
 672                         BUG();
 673                 left -= len;
 674
 675                 /*
 676                  *      Fill in the new header fields.
 677                  */
 678                 iph = ip_hdr(skb2);
 679                 iph->frag_off = htons((offset >> 3));
 680
 681                 /* ANK: dirty, but effective trick. Upgrade options only if
 682                  * the segment to be fragmented was THE FIRST (otherwise,
 683                  * options are already fixed) and make it ONCE
 684                  * on the initial skb, so that all the following fragments
 685                  * will inherit fixed options.
 686                  */
 687                 if (offset == 0)
 688                         ip_options_fragment(skb);
 689
 690                 /*
 691                  *      Added AC : If we are fragmenting a fragment that's not the
 692                  *                 last fragment then keep MF on each bit
 693                  */
 694                 if (left > 0 || not_last_frag)
 695                         iph->frag_off |= htons(IP_MF);
 696                 ptr += len;
 697                 offset += len;
 698
 699                 /*
 700                  *      Put this fragment into the sending queue.
 701                  */
 702                 iph->tot_len = htons(len + hlen);
 703
 704                 ip_send_check(iph);
 705
 706                 err = output(skb2);
 707                 if (err)
 708                         goto fail;
 709
 710                 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
 711         }
 712         kfree_skb(skb);
 713         IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
 714         return err;
 715
 716 fail:
 717         kfree_skb(skb);
 718         IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
 719         return err;
 720 }
 721 EXPORT_SYMBOL(ip_fragment);
 722
 723 int
 724 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
 725 {
 726         struct iovec *iov = from;
 727
 728         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 729                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
 730                         return -EFAULT;
 731         } else {
 732                 __wsum csum = 0;
 733                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
 734                         return -EFAULT;
 735                 skb->csum = csum_block_add(skb->csum, csum, odd);
 736         }
 737         return 0;
 738 }
 739 EXPORT_SYMBOL(ip_generic_getfrag);
 740
 741 static inline __wsum
 742 csum_page(struct page *page, int offset, int copy)
 743 {
 744         char *kaddr;
 745         __wsum csum;
 746         kaddr = kmap(page);
 747         csum = csum_partial(kaddr + offset, copy, 0);
 748         kunmap(page);
 749         return csum;
 750 }
 751
 752 static inline int ip_ufo_append_data(struct sock *sk,
 753                         struct sk_buff_head *queue,
 754                         int getfrag(void *from, char *to, int offset, int len,
 755                                int odd, struct sk_buff *skb),
 756                         void *from, int length, int hh_len, int fragheaderlen,
 757                         int transhdrlen, int maxfraglen, unsigned int flags)
 758 {
 759         struct sk_buff *skb;
 760         int err;
 761
 762         /* There is support for UDP fragmentation offload by network
 763          * device, so create one single skb packet containing complete
 764          * udp datagram
 765          */
 766         if ((skb = skb_peek_tail(queue)) == NULL) {
 767                 skb = sock_alloc_send_skb(sk,
 768                         hh_len + fragheaderlen + transhdrlen + 20,
 769                         (flags & MSG_DONTWAIT), &err);
 770
 771                 if (skb == NULL)
 772                         return err;
 773
 774                 /* reserve space for Hardware header */
 775                 skb_reserve(skb, hh_len);
 776
 777                 /* create space for UDP/IP header */
 778                 skb_put(skb, fragheaderlen + transhdrlen);
 779
 780                 /* initialize network header pointer */
 781                 skb_reset_network_header(skb);
 782
 783                 /* initialize protocol header pointer */
 784                 skb->transport_header = skb->network_header + fragheaderlen;
 785
 786                 skb->ip_summed = CHECKSUM_PARTIAL;
 787                 skb->csum = 0;
 788
 789                 /* specify the length of each IP datagram fragment */
 790                 skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
 791                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
 792                 __skb_queue_tail(queue, skb);
 793         }
 794
 795         return skb_append_datato_frags(sk, skb, getfrag, from,
 796                                        (length - transhdrlen));
 797 }
 798
 /*
  * __ip_append_data - append @length bytes, fetched through @getfrag, to the
  * chain of pending skbs on @queue.
  *
  * @sk:          socket the data is charged to
  * @fl4:         flow; only fl4->daddr is read here (for ip_local_error())
  * @queue:       pending skbs; data goes into the tail skb and/or into newly
  *               allocated skbs that are added at the tail
  * @cork:        cork state (options, dst, fragsize, running length,
  *               tx_flags, current page and offset)
  * @getfrag:     callback that copies @len bytes found at @offset of @from
  *               to @to; a negative return is reported as -EFAULT
  * @from:        opaque source cookie handed to @getfrag
  * @length:      number of payload bytes to append
  * @transhdrlen: room to leave for the transport header in the first skb
  * @flags:       MSG_* flags; MSG_DONTWAIT and MSG_MORE are examined here
  *
  * Returns 0 on success or a negative errno. Failures that go through the
  * "error" label subtract the bytes not yet appended from cork->length and
  * bump IPSTATS_MIB_OUTDISCARDS; the early -EMSGSIZE return does neither.
  */
 799 static int __ip_append_data(struct sock *sk,
 800                             struct flowi4 *fl4,
 801                             struct sk_buff_head *queue,
 802                             struct inet_cork *cork,
 803                             int getfrag(void *from, char *to, int offset,
 804                                         int len, int odd, struct sk_buff *skb),
 805                             void *from, int length, int transhdrlen,
 806                             unsigned int flags)
 807 {
 808         struct inet_sock *inet = inet_sk(sk);
 809         struct sk_buff *skb;
 810
 811         struct ip_options *opt = cork->opt;
 812         int hh_len;
 813         int exthdrlen;
 814         int mtu;
 815         int copy;
 816         int err;
 817         int offset = 0;
 818         unsigned int maxfraglen, fragheaderlen;
 819         int csummode = CHECKSUM_NONE;
 820         struct rtable *rt = (struct rtable *)cork->dst;
 821
 822         skb = skb_peek_tail(queue);
 823
             /*
              * dst.header_len is only accounted for while the queue is still
              * empty, i.e. in the first skb of the datagram.
              */
 824         exthdrlen = !skb ? rt->dst.header_len : 0;
 825         mtu = cork->fragsize;
 826
 827         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
 828
             /*
              * fragheaderlen: IP header plus options, present in every fragment.
              * maxfraglen: largest fragment length whose payload part is a
              * multiple of 8 bytes.
              */
 829         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 830         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 831
             /* Refuse data that would push the datagram past 0xFFFF bytes. */
 832         if (cork->length + length > 0xFFFF - fragheaderlen) {
 833                 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
 834                                mtu-exthdrlen);
 835                 return -EMSGSIZE;
 836         }
 837
 838         /*
 839          * transhdrlen > 0 means that this is the first fragment and we wish
 840          * it won't be fragmented in the future.
 841          */
 842         if (transhdrlen &&
 843             length + fragheaderlen <= mtu &&
 844             rt->dst.dev->features & NETIF_F_V4_CSUM &&
 845             !exthdrlen)
 846                 csummode = CHECKSUM_PARTIAL;
 847
 848         cork->length += length;
             /*
              * UDP data that exceeds the mtu (or a tail skb that is already
              * GSO) on a NETIF_F_UFO device without dst.header_len is handed
              * to ip_ufo_append_data() instead of the fragment loop below.
              */
 849         if (((length > mtu) || (skb && skb_is_gso(skb))) &&
 850             (sk->sk_protocol == IPPROTO_UDP) &&
 851             (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
 852                 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
 853                                          hh_len, fragheaderlen, transhdrlen,
 854                                          maxfraglen, flags);
 855                 if (err)
 856                         goto error;
 857                 return 0;
 858         }
 859
 860         /* So, what's going on in the loop below?
 861          *
 862          * We use calculated fragment length to generate chained skb,
 863          * each of segments is IP fragment ready for sending to network after
 864          * adding appropriate IP header.
 865          */
 866
             /* Empty queue: there is no tail skb to fill, allocate the first. */
 867         if (!skb)
 868                 goto alloc_new_skb;
 869
 870         while (length > 0) {
 871                 /* Check if the remaining data fits into current packet. */
 872                 copy = mtu - skb->len;
                     /*
                      * It does not fit: this skb will not be the last fragment,
                      * so it may only be filled up to maxfraglen.
                      */
 873                 if (copy < length)
 874                         copy = maxfraglen - skb->len;
 875                 if (copy <= 0) {
 876                         char *data;
 877                         unsigned int datalen;
 878                         unsigned int fraglen;
 879                         unsigned int fraggap;
 880                         unsigned int alloclen;
 881                         struct sk_buff *skb_prev;
 882 alloc_new_skb:
 883                         skb_prev = skb;
                             /*
                              * fraggap: bytes the previous skb holds beyond
                              * maxfraglen; they are moved into the new skb.
                              */
 884                         if (skb_prev)
 885                                 fraggap = skb_prev->len - maxfraglen;
 886                         else
 887                                 fraggap = 0;
 888
 889                         /*
 890                          * If remaining data exceeds the mtu,
 891                          * we know we need more fragment(s).
 892                          */
 893                         datalen = length + fraggap;
 894                         if (datalen > mtu - fragheaderlen)
 895                                 datalen = maxfraglen - fragheaderlen;
 896                         fraglen = datalen + fragheaderlen;
 897
                             /*
                              * With MSG_MORE and no scatter-gather support a
                              * full mtu is allocated rather than just fraglen.
                              */
 898                         if ((flags & MSG_MORE) &&
 899                             !(rt->dst.dev->features&NETIF_F_SG))
 900                                 alloclen = mtu;
 901                         else
 902                                 alloclen = fraglen;
 903
 904                         alloclen += exthdrlen;
 905
 906                         /* The last fragment gets additional space at tail.
 907                          * Note, with MSG_MORE we overallocate on fragments,
 908                          * because we have no idea what fragment will be
 909                          * the last.
 910                          */
 911                         if (datalen == length + fraggap)
 912                                 alloclen += rt->dst.trailer_len;
 913
                             /*
                              * transhdrlen is non-zero only until the first skb
                              * has been built (it is cleared below). That skb is
                              * allocated with sock_alloc_send_skb() honouring
                              * MSG_DONTWAIT; later skbs use sock_wmalloc() and
                              * only while sk_wmem_alloc <= 2 * sk_sndbuf.
                              */
 914                         if (transhdrlen) {
 915                                 skb = sock_alloc_send_skb(sk,
 916                                                 alloclen + hh_len + 15,
 917                                                 (flags & MSG_DONTWAIT), &err);
 918                         } else {
 919                                 skb = NULL;
 920                                 if (atomic_read(&sk->sk_wmem_alloc) <=
 921                                     2 * sk->sk_sndbuf)
 922                                         skb = sock_wmalloc(sk,
 923                                                            alloclen + hh_len + 15, 1,
 924                                                            sk->sk_allocation);
 925                                 if (unlikely(skb == NULL))
 926                                         err = -ENOBUFS;
 927                                 else
 928                                         /* only the initial fragment is
 929                                            time stamped */
 930                                         cork->tx_flags = 0;
 931                         }
 932                         if (skb == NULL)
 933                                 goto error;
 934
 935                         /*
 936                          *      Fill in the control structures
 937                          */
 938                         skb->ip_summed = csummode;
 939                         skb->csum = 0;
 940                         skb_reserve(skb, hh_len);
 941                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
 942
 943                         /*
 944                          *      Find where to start putting bytes.
 945                          */
 946                         data = skb_put(skb, fraglen + exthdrlen);
 947                         skb_set_network_header(skb, exthdrlen);
 948                         skb->transport_header = (skb->network_header +
 949                                                  fragheaderlen);
 950                         data += fragheaderlen + exthdrlen;
 951
                             /*
                              * Move the excess tail of the previous skb into
                              * this one, adjust both checksums and trim the
                              * previous skb back to maxfraglen.
                              */
 952                         if (fraggap) {
 953                                 skb->csum = skb_copy_and_csum_bits(
 954                                         skb_prev, maxfraglen,
 955                                         data + transhdrlen, fraggap, 0);
 956                                 skb_prev->csum = csum_sub(skb_prev->csum,
 957                                                           skb->csum);
 958                                 data += fraggap;
 959                                 pskb_trim_unique(skb_prev, maxfraglen);
 960                         }
 961
                             /* Payload still to be fetched from the caller. */
 962                         copy = datalen - transhdrlen - fraggap;
 963                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 964                                 err = -EFAULT;
 965                                 kfree_skb(skb);
 966                                 goto error;
 967                         }
 968
 969                         offset += copy;
 970                         length -= datalen - fraggap;
                             /*
                              * Transport header room, dst.header_len and the
                              * initial checksum mode apply to the first skb only.
                              */
 971                         transhdrlen = 0;
 972                         exthdrlen = 0;
 973                         csummode = CHECKSUM_NONE;
 974
 975                         /*
 976                          * Put the packet on the pending queue.
 977                          */
 978                         __skb_queue_tail(queue, skb);
 979                         continue;
 980                 }
 981
 982                 if (copy > length)
 983                         copy = length;
 984
 985                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
                             /* No scatter-gather: copy into the skb's linear tail. */
 986                         unsigned int off;
 987
 988                         off = skb->len;
 989                         if (getfrag(from, skb_put(skb, copy),
 990                                         offset, copy, off, skb) < 0) {
 991                                 __skb_trim(skb, off);
 992                                 err = -EFAULT;
 993                                 goto error;
 994                         }
 995                 } else {
                             /*
                              * Scatter-gather: copy into page fragments, reusing
                              * cork->page while it still has room.
                              */
 996                         int i = skb_shinfo(skb)->nr_frags;
 997                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
 998                         struct page *page = cork->page;
 999                         int off = cork->off;
1000                         unsigned int left;
1001
1002                         if (page && (left = PAGE_SIZE - off) > 0) {
1003                                 if (copy >= left)
1004                                         copy = left;
                                     /*
                                      * The cork page is not this skb's last frag:
                                      * attach it as a new frag (skb_frag_ref()
                                      * presumably takes the page reference for
                                      * it - confirm).
                                      */
1005                                 if (page != skb_frag_page(frag)) {
1006                                         if (i == MAX_SKB_FRAGS) {
1007                                                 err = -EMSGSIZE;
1008                                                 goto error;
1009                                         }
1010                                         skb_fill_page_desc(skb, i, page, off, 0);
1011                                         skb_frag_ref(skb, i);
1012                                         frag = &skb_shinfo(skb)->frags[i];
1013                                 }
1014                         } else if (i < MAX_SKB_FRAGS) {
                                     /* No usable cork page: start a fresh one. */
1015                                 if (copy > PAGE_SIZE)
1016                                         copy = PAGE_SIZE;
1017                                 page = alloc_pages(sk->sk_allocation, 0);
1018                                 if (page == NULL)  {
1019                                         err = -ENOMEM;
1020                                         goto error;
1021                                 }
1022                                 cork->page = page;
1023                                 cork->off = 0;
1024
1025                                 skb_fill_page_desc(skb, i, page, 0, 0);
1026                                 frag = &skb_shinfo(skb)->frags[i];
1027                         } else {
1028                                 err = -EMSGSIZE;
1029                                 goto error;
1030                         }
1031                         if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
1032                                     offset, copy, skb->len, skb) < 0) {
1033                                 err = -EFAULT;
1034                                 goto error;
1035                         }
                             /*
                              * Account the copied bytes in the cork page offset,
                              * the frag, the skb and the socket's write memory.
                              */
1036                         cork->off += copy;
1037                         skb_frag_size_add(frag, copy);
1038                         skb->len += copy;
1039                         skb->data_len += copy;
1040                         skb->truesize += copy;
1041                         atomic_add(copy, &sk->sk_wmem_alloc);
1042                 }
1043                 offset += copy;
1044                 length -= copy;
1045         }
1046
1047         return 0;
1048
1049 error:
             /* Take the bytes that were not appended out of cork->length again. */
1050         cork->length -= length;
1051         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1052         return err;
1053 }
1054
1055 static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1056                          struct ipcm_cookie *ipc, struct rtable **rtp)
1057 {
1058         struct inet_sock *inet = inet_sk(sk);
1059         struct ip_options_rcu *opt;
1060         struct rtable *rt;
1061
1062         /*
1063          * setup for corking.
1064          */
1065         opt = ipc->opt;
1066         if (opt) {
1067                 if (cork->opt == NULL) {
1068                         cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1069                                             sk->sk_allocation);
1070                         if (unlikely(cork->opt == NULL))
1071                                 return -ENOBUFS;
1072                 }
1073                 memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1074                 cork->flags |= IPCORK_OPT;
1075                 cork->addr = ipc->addr;
1076         }
1077         rt = *rtp;
1078         if (unlikely(!rt))
1079                 return -EFAULT;
1080         /*
1081          * We steal reference to this route, caller should not release it
1082          */
1083         *rtp = NULL;
1084         cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1085                          rt->dst.dev->mtu : dst_mtu(&rt->dst);
1086         cork->dst = &rt->dst;
1087         cork->length = 0;
1088         cork->tx_flags = ipc->tx_flags;
1089         cork->page = NULL;
1090         cork->off = 0;
1091
1092         return 0;
1093 }
1094
1095 /*
1096  *      ip_append_data() and ip_append_page() can make one large IP datagram
1097  *      from many pieces of data. Each pieces will be holded on the socket
1098  *      until ip_push_pending_frames() is called. Each piece can be a page
1099  *      or non-page data.
1100  *
1101  *      Not only UDP, other transport protocols - e.g. raw sockets - can use
1102  *      this interface potentially.
1103  *
1104  *      LATER: length must be adjusted by pad at tail, when it is required.
1105  */
1106 int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1107                    int getfrag(void *from, char *to, int offset, int len,
1108                                int odd, struct sk_buff *skb),
1109                    void *from, int length, int transhdrlen,
1110                    struct ipcm_cookie *ipc, struct rtable **rtp,
1111                    unsigned int flags)
1112 {
1113         struct inet_sock *inet = inet_sk(sk);
1114         int err;
1115
1116         if (flags&MSG_PROBE)
1117                 return 0;
1118
1119         if (skb_queue_empty(&sk->sk_write_queue)) {
1120                 err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1121                 if (err)
1122                         return err;
1123         } else {
1124                 transhdrlen = 0;
1125         }
1126
1127         return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1128                                 from, length, transhdrlen, flags);
1129 }
1130
/*
 * ip_append_page - append @size bytes of @page, starting at @offset, as page
 * fragments to the datagram pending on sk->sk_write_queue.
 *
 * The socket must already hold pending data: -EINVAL is returned when the
 * write queue is empty. -EPERM is returned when inet->hdrincl is set and
 * -EOPNOTSUPP when the output device lacks NETIF_F_SG. With MSG_PROBE in
 * @flags nothing is done and 0 is returned. -EMSGSIZE is returned when the
 * datagram would exceed 0xFFFF bytes or an skb has no fragment slot left,
 * -ENOBUFS when a new skb cannot be allocated. Returns 0 on success.
 *
 * Failures inside the loop subtract the bytes not yet appended from
 * cork->length and bump IPSTATS_MIB_OUTDISCARDS.
 */
1131 ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1132                        int offset, size_t size, int flags)
1133 {
1134         struct inet_sock *inet = inet_sk(sk);
1135         struct sk_buff *skb;
1136         struct rtable *rt;
1137         struct ip_options *opt = NULL;
1138         struct inet_cork *cork;
1139         int hh_len;
1140         int mtu;
1141         int len;
1142         int err;
1143         unsigned int maxfraglen, fragheaderlen, fraggap;
1144
1145         if (inet->hdrincl)
1146                 return -EPERM;
1147
1148         if (flags&MSG_PROBE)
1149                 return 0;
1150
             /* Pages can only be added to a datagram that is already pending. */
1151         if (skb_queue_empty(&sk->sk_write_queue))
1152                 return -EINVAL;
1153
1154         cork = &inet->cork.base;
1155         rt = (struct rtable *)cork->dst;
1156         if (cork->flags & IPCORK_OPT)
1157                 opt = cork->opt;
1158
             /* Page fragments require NETIF_F_SG on the output device. */
1159         if (!(rt->dst.dev->features&NETIF_F_SG))
1160                 return -EOPNOTSUPP;
1161
1162         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1163         mtu = cork->fragsize;
1164
             /*
              * fragheaderlen: IP header plus options, present in every fragment.
              * maxfraglen: largest fragment length whose payload part is a
              * multiple of 8 bytes.
              */
1165         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1166         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1167
             /* Refuse data that would push the datagram past 0xFFFF bytes. */
1168         if (cork->length + size > 0xFFFF - fragheaderlen) {
1169                 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
1170                 return -EMSGSIZE;
1171         }
1172
1173         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1174                 return -EINVAL;
1175
1176         cork->length += size;
             /*
              * UDP data overflowing the tail skb on a NETIF_F_UFO device: mark
              * the tail skb as UDP GSO; the loop below then appends all of
              * @size to it.
              */
1177         if ((size + skb->len > mtu) &&
1178             (sk->sk_protocol == IPPROTO_UDP) &&
1179             (rt->dst.dev->features & NETIF_F_UFO)) {
1180                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1181                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1182         }
1183
1184
1185         while (size > 0) {
1186                 int i;
1187
1188                 if (skb_is_gso(skb))
1189                         len = size;
1190                 else {
1191
1192                         /* Check if the remaining data fits into current packet. */
1193                         len = mtu - skb->len;
1194                         if (len < size)
1195                                 len = maxfraglen - skb->len;
1196                 }
                     /*
                      * Tail skb is full: start a new skb that holds only the
                      * fragment header plus any excess (fraggap) moved over
                      * from the previous skb.
                      */
1197                 if (len <= 0) {
1198                         struct sk_buff *skb_prev;
1199                         int alloclen;
1200
1201                         skb_prev = skb;
1202                         fraggap = skb_prev->len - maxfraglen;
1203
1204                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1205                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1206                         if (unlikely(!skb)) {
1207                                 err = -ENOBUFS;
1208                                 goto error;
1209                         }
1210
1211                         /*
1212                          *      Fill in the control structures
1213                          */
1214                         skb->ip_summed = CHECKSUM_NONE;
1215                         skb->csum = 0;
1216                         skb_reserve(skb, hh_len);
1217
1218                         /*
1219                          *      Find where to start putting bytes.
1220                          */
1221                         skb_put(skb, fragheaderlen + fraggap);
1222                         skb_reset_network_header(skb);
1223                         skb->transport_header = (skb->network_header +
1224                                                  fragheaderlen);
                             /*
                              * Move the excess tail of the previous skb into
                              * this one, adjust both checksums and trim the
                              * previous skb back to maxfraglen.
                              */
1225                         if (fraggap) {
1226                                 skb->csum = skb_copy_and_csum_bits(skb_prev,
1227                                                                    maxfraglen,
1228                                                     skb_transport_header(skb),
1229                                                                    fraggap, 0);
1230                                 skb_prev->csum = csum_sub(skb_prev->csum,
1231                                                           skb->csum);
1232                                 pskb_trim_unique(skb_prev, maxfraglen);
1233                         }
1234
1235                         /*
1236                          * Put the packet on the pending queue.
1237                          */
1238                         __skb_queue_tail(&sk->sk_write_queue, skb);
1239                         continue;
1240                 }
1241
1242                 i = skb_shinfo(skb)->nr_frags;
1243                 if (len > size)
1244                         len = size;
                     /*
                      * Grow the last frag when skb_can_coalesce() allows it,
                      * otherwise take a page reference and use a new frag slot.
                      */
1245                 if (skb_can_coalesce(skb, i, page, offset)) {
1246                         skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
1247                 } else if (i < MAX_SKB_FRAGS) {
1248                         get_page(page);
1249                         skb_fill_page_desc(skb, i, page, offset, len);
1250                 } else {
1251                         err = -EMSGSIZE;
1252                         goto error;
1253                 }
1254
                     /* Software checksum: fold the new bytes into skb->csum. */
1255                 if (skb->ip_summed == CHECKSUM_NONE) {
1256                         __wsum csum;
1257                         csum = csum_page(page, offset, len);
1258                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1259                 }
1260
                     /* Account the new bytes in the skb and the socket's write memory. */
1261                 skb->len += len;
1262                 skb->data_len += len;
1263                 skb->truesize += len;
1264                 atomic_add(len, &sk->sk_wmem_alloc);
1265                 offset += len;
1266                 size -= len;
1267         }
1268         return 0;
1269
1270 error:
             /* Take the bytes that were not appended out of cork->length again. */
1271         cork->length -= size;
1272         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1273         return err;
1274 }
1275
1276 static void ip_cork_release(struct inet_cork *cork)
1277 {
1278         cork->flags &= ~IPCORK_OPT;
1279         kfree(cork->opt);
1280         cork->opt = NULL;
1281         dst_release(cork->dst);
1282         cork->dst = NULL;
1283 }
1284
1285 /*
1286  *      Combined all pending IP fragments on the socket as one IP datagram
1287  *      and push them out.
1288  */
1289 struct sk_buff *__ip_make_skb(struct sock *sk,
1290                               struct flowi4 *fl4,
1291                               struct sk_buff_head *queue,
1292                               struct inet_cork *cork)
1293 {
1294         struct sk_buff *skb, *tmp_skb;
1295         struct sk_buff **tail_skb;
1296         struct inet_sock *inet = inet_sk(sk);
1297         struct net *net = sock_net(sk);
1298         struct ip_options *opt = NULL;
1299         struct rtable *rt = (struct rtable *)cork->dst;
1300         struct iphdr *iph;
1301         __be16 df = 0;
1302         __u8 ttl;
1303
1304         if ((skb = __skb_dequeue(queue)) == NULL)
1305                 goto out;
1306         tail_skb = &(skb_shinfo(skb)->frag_list);
1307
1308         /* move skb->data to ip header from ext header */
1309         if (skb->data < skb_network_header(skb))
1310                 __skb_pull(skb, skb_network_offset(skb));
1311         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1312                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1313                 *tail_skb = tmp_skb;
1314                 tail_skb = &(tmp_skb->next);
1315                 skb->len += tmp_skb->len;
1316                 skb->data_len += tmp_skb->len;
1317                 skb->truesize += tmp_skb->truesize;
1318                 tmp_skb->destructor = NULL;
1319                 tmp_skb->sk = NULL;
1320         }
1321
1322         /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1323          * to fragment the frame generated here. No matter, what transforms
1324          * how transforms change size of the packet, it will come out.
1325          */
1326         if (inet->pmtudisc < IP_PMTUDISC_DO)
1327                 skb->local_df = 1;
1328
1329         /* DF bit is set when we want to see DF on outgoing frames.
1330          * If local_df is set too, we still allow to fragment this frame
1331          * locally. */
1332         if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1333             (skb->len <= dst_mtu(&rt->dst) &&
1334              ip_dont_fragment(sk, &rt->dst)))
1335                 df = htons(IP_DF);
1336
1337         if (cork->flags & IPCORK_OPT)
1338                 opt = cork->opt;
1339
1340         if (rt->rt_type == RTN_MULTICAST)
1341                 ttl = inet->mc_ttl;
1342         else
1343                 ttl = ip_select_ttl(inet, &rt->dst);
1344
1345         iph = (struct iphdr *)skb->data;
1346         iph->version = 4;
1347         iph->ihl = 5;
1348         iph->tos = inet->tos;
1349         iph->frag_off = df;
1350         ip_select_ident(iph, &rt->dst, sk);
1351         iph->ttl = ttl;
1352         iph->protocol = sk->sk_protocol;
1353         ip_copy_addrs(iph, fl4);
1354
1355         if (opt) {
1356                 iph->ihl += opt->optlen>>2;
1357                 ip_options_build(skb, opt, cork->addr, rt, 0);
1358         }
1359
1360         skb->priority = sk->sk_priority;
1361         skb->mark = sk->sk_mark;
1362         /*
1363          * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1364          * on dst refcount
1365          */
1366         cork->dst = NULL;
1367         skb_dst_set(skb, &rt->dst);
1368
1369         if (iph->protocol == IPPROTO_ICMP)
1370                 icmp_out_count(net, ((struct icmphdr *)
1371                         skb_transport_header(skb))->type);
1372
1373         ip_cork_release(cork);
1374 out:
1375         return skb;
1376 }
1377
1378 int ip_send_skb(struct sk_buff *skb)
1379 {
1380         struct net *net = sock_net(skb->sk);
1381         int err;
1382
1383         err = ip_local_out(skb);
1384         if (err) {
1385                 if (err > 0)
1386                         err = net_xmit_errno(err);
1387                 if (err)
1388                         IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1389         }
1390
1391         return err;
1392 }
1393
/*
 *      Build one datagram from the data pending on the socket and send it.
 *      Returns 0 when ip_finish_skb() produced nothing to send, otherwise
 *      the result of ip_send_skb().
 */
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
        struct sk_buff *skb = ip_finish_skb(sk, fl4);

        /* Netfilter gets whole the not fragmented skb. */
        return skb ? ip_send_skb(skb) : 0;
}
1405
1406 /*
1407  *      Throw away all pending data on the socket.
1408  */
1409 static void __ip_flush_pending_frames(struct sock *sk,
1410                                       struct sk_buff_head *queue,
1411                                       struct inet_cork *cork)
1412 {
1413         struct sk_buff *skb;
1414
1415         while ((skb = __skb_dequeue_tail(queue)) != NULL)
1416                 kfree_skb(skb);
1417
1418         ip_cork_release(cork);
1419 }
1420
1421 void ip_flush_pending_frames(struct sock *sk)
1422 {
1423         __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1424 }
1425
1426 struct sk_buff *ip_make_skb(struct sock *sk,
1427                             struct flowi4 *fl4,
1428                             int getfrag(void *from, char *to, int offset,
1429                                         int len, int odd, struct sk_buff *skb),
1430                             void *from, int length, int transhdrlen,
1431                             struct ipcm_cookie *ipc, struct rtable **rtp,
1432                             unsigned int flags)
1433 {
1434         struct inet_cork cork;
1435         struct sk_buff_head queue;
1436         int err;
1437
1438         if (flags & MSG_PROBE)
1439                 return NULL;
1440
1441         __skb_queue_head_init(&queue);
1442
1443         cork.flags = 0;
1444         cork.addr = 0;
1445         cork.opt = NULL;
1446         err = ip_setup_cork(sk, &cork, ipc, rtp);
1447         if (err)
1448                 return ERR_PTR(err);
1449
1450         err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1451                                from, length, transhdrlen, flags);
1452         if (err) {
1453                 __ip_flush_pending_frames(sk, &queue, &cork);
1454                 return ERR_PTR(err);
1455         }
1456
1457         return __ip_make_skb(sk, fl4, &queue, &cork);
1458 }
1459
1460 /*
1461  *      Fetch data from kernel space and fill in checksum if needed.
1462  */
1463 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1464                               int len, int odd, struct sk_buff *skb)
1465 {
1466         __wsum csum;
1467
1468         csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1469         skb->csum = csum_block_add(skb->csum, csum, odd);
1470         return 0;
1471 }
1472
1473 /*
1474  *      Generic function to send a packet as reply to another packet.
1475  *      Used to send TCP resets so far. ICMP should use this function too.
1476  *
1477  *      Should run single threaded per socket because it uses the sock
1478  *      structure to pass arguments.
1479  */
1480 void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1481                    const struct ip_reply_arg *arg, unsigned int len)
1482 {
1483         struct inet_sock *inet = inet_sk(sk);
1484         struct ip_options_data replyopts;
1485         struct ipcm_cookie ipc;
1486         struct flowi4 fl4;
1487         struct rtable *rt = skb_rtable(skb);
1488
1489         if (ip_options_echo(&replyopts.opt.opt, skb))
1490                 return;
1491
1492         ipc.addr = daddr;
1493         ipc.opt = NULL;
1494         ipc.tx_flags = 0;
1495
1496         if (replyopts.opt.opt.optlen) {
1497                 ipc.opt = &replyopts.opt;
1498
1499                 if (replyopts.opt.opt.srr)
1500                         daddr = replyopts.opt.opt.faddr;
1501         }
1502
1503         flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1504                            RT_TOS(arg->tos),
1505                            RT_SCOPE_UNIVERSE, sk->sk_protocol,
1506                            ip_reply_arg_flowi_flags(arg),
1507                            daddr, rt->rt_spec_dst,
1508                            tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1509         security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1510         rt = ip_route_output_key(sock_net(sk), &fl4);
1511         if (IS_ERR(rt))
1512                 return;
1513
1514         /* And let IP do all the hard work.
1515
1516            This chunk is not reenterable, hence spinlock.
1517            Note that it uses the fact, that this function is called
1518            with locally disabled BH and that sk cannot be already spinlocked.
1519          */
1520         bh_lock_sock(sk);
1521         inet->tos = arg->tos;
1522         sk->sk_priority = skb->priority;
1523         sk->sk_protocol = ip_hdr(skb)->protocol;
1524         sk->sk_bound_dev_if = arg->bound_dev_if;
1525         ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1526                        &ipc, &rt, MSG_DONTWAIT);
1527         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1528                 if (arg->csumoffset >= 0)
1529                         *((__sum16 *)skb_transport_header(skb) +
1530                           arg->csumoffset) = csum_fold(csum_add(skb->csum,
1531                                                                 arg->csum));
1532                 skb->ip_summed = CHECKSUM_NONE;
1533                 ip_push_pending_frames(sk, &fl4);
1534         }
1535
1536         bh_unlock_sock(sk);
1537
1538         ip_rt_put(rt);
1539 }
1540
1541 void __init ip_init(void)
1542 {
1543         ip_rt_init();
1544         inet_initpeers();
1545
1546 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1547         igmp_mc_proc_init();
1548 #endif
1549 }