net/ipv4/ip_output.c (mirror_ubuntu-zesty-kernel, at commit "net: Do delayed neigh confirmation.")
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The Internet Protocol (IP) output module.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Donald Becker, <becker@super.org>
11 * Alan Cox, <Alan.Cox@linux.org>
12 * Richard Underwood
13 * Stefan Becker, <stefanb@yello.ping.de>
14 * Jorge Cwik, <jorge@laser.satlink.net>
15 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16 * Hirokazu Takahashi, <taka@valinux.co.jp>
17 *
18 * See ip_input.c for original log
19 *
20 * Fixes:
21 * Alan Cox : Missing nonblock feature in ip_build_xmit.
22 * Mike Kilburn : htons() missing in ip_build_xmit.
23 * Bradford Johnson: Fix faulty handling of some frames when
24 * no route is found.
25 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
26 * (in case a packet is not accepted by
27 * output firewall rules)
28 * Mike McLagan : Routing by source
29 * Alexey Kuznetsov: use new route cache
30 * Andi Kleen: Fix broken PMTU recovery and remove
31 * some redundant tests.
32 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
33 * Andi Kleen : Replace ip_reply with ip_send_reply.
34 * Andi Kleen : Split fast and slow ip_build_xmit path
35 * for decreased register pressure on x86
36 * and more readability.
37 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
38 * silently drop skb instead of failing with -EPERM.
39 * Detlev Wengorz : Copy protocol for fragments.
40 * Hirokazu Takahashi: HW checksumming for outgoing UDP
41 * datagrams.
42 * Hirokazu Takahashi: sendfile() on UDP works now.
43 */
44
45 #include <asm/uaccess.h>
46 #include <linux/module.h>
47 #include <linux/types.h>
48 #include <linux/kernel.h>
49 #include <linux/mm.h>
50 #include <linux/string.h>
51 #include <linux/errno.h>
52 #include <linux/highmem.h>
53 #include <linux/slab.h>
54
55 #include <linux/socket.h>
56 #include <linux/sockios.h>
57 #include <linux/in.h>
58 #include <linux/inet.h>
59 #include <linux/netdevice.h>
60 #include <linux/etherdevice.h>
61 #include <linux/proc_fs.h>
62 #include <linux/stat.h>
63 #include <linux/init.h>
64
65 #include <net/snmp.h>
66 #include <net/ip.h>
67 #include <net/protocol.h>
68 #include <net/route.h>
69 #include <net/xfrm.h>
70 #include <linux/skbuff.h>
71 #include <net/sock.h>
72 #include <net/arp.h>
73 #include <net/icmp.h>
74 #include <net/checksum.h>
75 #include <net/inetpeer.h>
76 #include <linux/igmp.h>
77 #include <linux/netfilter_ipv4.h>
78 #include <linux/netfilter_bridge.h>
79 #include <linux/mroute.h>
80 #include <linux/netlink.h>
81 #include <linux/tcp.h>
82
83 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
84 EXPORT_SYMBOL(sysctl_ip_default_ttl);
85
86 /* Generate a checksum for an outgoing IP datagram. */
87 __inline__ void ip_send_check(struct iphdr *iph)
88 {
89 iph->check = 0;
90 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
91 }
92 EXPORT_SYMBOL(ip_send_check);
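/*
 * Note on the checksum above: ip_fast_csum() folds a 16-bit ones'
 * complement sum over iph->ihl 32-bit words, so the check field has to
 * be zeroed first. A receiver that recomputes ip_fast_csum() over a
 * valid header, check field included, gets 0 back.
 */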
93
94 int __ip_local_out(struct sk_buff *skb)
95 {
96 struct iphdr *iph = ip_hdr(skb);
97
98 iph->tot_len = htons(skb->len);
99 ip_send_check(iph);
100 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
101 skb_dst(skb)->dev, dst_output);
102 }
103
104 int ip_local_out(struct sk_buff *skb)
105 {
106 int err;
107
108 err = __ip_local_out(skb);
109 if (likely(err == 1))
110 err = dst_output(skb);
111
112 return err;
113 }
114 EXPORT_SYMBOL_GPL(ip_local_out);
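/*
 * The "err == 1" test above follows the nf_hook() return convention in
 * this tree: 1 means the NF_INET_LOCAL_OUT hooks (if any) accepted the
 * packet and the caller should hand it to dst_output(); anything else
 * means the hooks already consumed, queued, stole or dropped the skb.
 */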
115
116 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
117 {
118 int ttl = inet->uc_ttl;
119
120 if (ttl < 0)
121 ttl = ip4_dst_hoplimit(dst);
122 return ttl;
123 }
124
125 /*
126 * Add an ip header to a skbuff and send it out.
127 *
128 */
129 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
130 __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
131 {
132 struct inet_sock *inet = inet_sk(sk);
133 struct rtable *rt = skb_rtable(skb);
134 struct iphdr *iph;
135
136 /* Build the IP header. */
137 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
138 skb_reset_network_header(skb);
139 iph = ip_hdr(skb);
140 iph->version = 4;
141 iph->ihl = 5;
142 iph->tos = inet->tos;
143 if (ip_dont_fragment(sk, &rt->dst))
144 iph->frag_off = htons(IP_DF);
145 else
146 iph->frag_off = 0;
147 iph->ttl = ip_select_ttl(inet, &rt->dst);
148 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
149 iph->saddr = saddr;
150 iph->protocol = sk->sk_protocol;
151 ip_select_ident(iph, &rt->dst, sk);
152
153 if (opt && opt->opt.optlen) {
154 iph->ihl += opt->opt.optlen>>2;
155 ip_options_build(skb, &opt->opt, daddr, rt, 0);
156 }
157
158 skb->priority = sk->sk_priority;
159 skb->mark = sk->sk_mark;
160
161 /* Send it out. */
162 return ip_local_out(skb);
163 }
164 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
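/*
 * ip_build_and_send_pkt() is the "build a header from scratch" helper:
 * the caller supplies a routed skb plus explicit saddr/daddr, and a
 * fresh 20-byte (plus options) IP header is pushed in front of it.
 * Connection-request replies, for example TCP SYN-ACKs sent on behalf
 * of a request sock, go through this path.
 */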
165
166 static inline int ip_finish_output2(struct sk_buff *skb)
167 {
168 struct dst_entry *dst = skb_dst(skb);
169 struct rtable *rt = (struct rtable *)dst;
170 struct net_device *dev = dst->dev;
171 unsigned int hh_len = LL_RESERVED_SPACE(dev);
172 struct neighbour *neigh;
173 u32 nexthop;
174
175 if (rt->rt_type == RTN_MULTICAST) {
176 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
177 } else if (rt->rt_type == RTN_BROADCAST)
178 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
179
180 /* Be paranoid, rather than too clever. */
181 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
182 struct sk_buff *skb2;
183
184 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
185 if (skb2 == NULL) {
186 kfree_skb(skb);
187 return -ENOMEM;
188 }
189 if (skb->sk)
190 skb_set_owner_w(skb2, skb->sk);
191 consume_skb(skb);
192 skb = skb2;
193 }
194
195 rcu_read_lock_bh();
196 nexthop = rt->rt_gateway ? rt->rt_gateway : ip_hdr(skb)->daddr;
197 neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
198 if (unlikely(!neigh))
199 neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
200 if (neigh) {
201 int res = dst_neigh_output(dst, neigh, skb);
202
203 rcu_read_unlock_bh();
204 return res;
205 }
206 rcu_read_unlock_bh();
207
208 net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
209 __func__);
210 kfree_skb(skb);
211 return -EINVAL;
212 }
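/*
 * Next-hop resolution above: the route's gateway is used when one is
 * set, otherwise the packet's own destination is on-link and is used
 * directly. The neighbour is looked up (or created) under
 * rcu_read_lock_bh(), and dst_neigh_output() then either reuses the
 * cached hardware header or falls back to neigh->output(); it is also
 * where neighbour reachability confirmation is handled, lazily rather
 * than by every transmit.
 */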
213
214 static inline int ip_skb_dst_mtu(struct sk_buff *skb)
215 {
216 struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
217
218 return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
219 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
220 }
221
222 static int ip_finish_output(struct sk_buff *skb)
223 {
224 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
225 /* Policy lookup after SNAT yielded a new policy */
226 if (skb_dst(skb)->xfrm != NULL) {
227 IPCB(skb)->flags |= IPSKB_REROUTED;
228 return dst_output(skb);
229 }
230 #endif
231 if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
232 return ip_fragment(skb, ip_finish_output2);
233 else
234 return ip_finish_output2(skb);
235 }
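/*
 * Only non-GSO packets larger than the destination MTU are fragmented
 * here; GSO skbs are passed through intact because they are cut into
 * MTU-sized packets later, either by the device or by the software GSO
 * fallback on the transmit path.
 */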
236
237 int ip_mc_output(struct sk_buff *skb)
238 {
239 struct sock *sk = skb->sk;
240 struct rtable *rt = skb_rtable(skb);
241 struct net_device *dev = rt->dst.dev;
242
243 /*
244 * If the indicated interface is up and running, send the packet.
245 */
246 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
247
248 skb->dev = dev;
249 skb->protocol = htons(ETH_P_IP);
250
251 /*
252 * Multicasts are looped back for other local users
253 */
254
255 if (rt->rt_flags&RTCF_MULTICAST) {
256 if (sk_mc_loop(sk)
257 #ifdef CONFIG_IP_MROUTE
258 /* Small optimization: do not loop back non-local frames
259 that returned after forwarding; they will be dropped
260 by ip_mr_input in any case.
261 Note that local frames are looped back to be delivered
262 to local recipients.
263
264 This check is duplicated in ip_mr_input at the moment.
265 */
266 &&
267 ((rt->rt_flags & RTCF_LOCAL) ||
268 !(IPCB(skb)->flags & IPSKB_FORWARDED))
269 #endif
270 ) {
271 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
272 if (newskb)
273 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
274 newskb, NULL, newskb->dev,
275 dev_loopback_xmit);
276 }
277
278 /* Multicasts with ttl 0 must not go beyond the host */
279
280 if (ip_hdr(skb)->ttl == 0) {
281 kfree_skb(skb);
282 return 0;
283 }
284 }
285
286 if (rt->rt_flags&RTCF_BROADCAST) {
287 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
288 if (newskb)
289 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
290 NULL, newskb->dev, dev_loopback_xmit);
291 }
292
293 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
294 skb->dev, ip_finish_output,
295 !(IPCB(skb)->flags & IPSKB_REROUTED));
296 }
297
298 int ip_output(struct sk_buff *skb)
299 {
300 struct net_device *dev = skb_dst(skb)->dev;
301
302 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
303
304 skb->dev = dev;
305 skb->protocol = htons(ETH_P_IP);
306
307 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
308 ip_finish_output,
309 !(IPCB(skb)->flags & IPSKB_REROUTED));
310 }
311
312 /*
313 * copy saddr and daddr, possibly using 64bit load/stores
314 * Equivalent to :
315 * iph->saddr = fl4->saddr;
316 * iph->daddr = fl4->daddr;
317 */
318 static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
319 {
320 BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
321 offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
322 memcpy(&iph->saddr, &fl4->saddr,
323 sizeof(fl4->saddr) + sizeof(fl4->daddr));
324 }
325
326 int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
327 {
328 struct sock *sk = skb->sk;
329 struct inet_sock *inet = inet_sk(sk);
330 struct ip_options_rcu *inet_opt;
331 struct flowi4 *fl4;
332 struct rtable *rt;
333 struct iphdr *iph;
334 int res;
335
336 /* Skip all of this if the packet is already routed,
337 * f.e. by something like SCTP.
338 */
339 rcu_read_lock();
340 inet_opt = rcu_dereference(inet->inet_opt);
341 fl4 = &fl->u.ip4;
342 rt = skb_rtable(skb);
343 if (rt != NULL)
344 goto packet_routed;
345
346 /* Make sure we can route this packet. */
347 rt = (struct rtable *)__sk_dst_check(sk, 0);
348 if (rt == NULL) {
349 __be32 daddr;
350
351 /* Use correct destination address if we have options. */
352 daddr = inet->inet_daddr;
353 if (inet_opt && inet_opt->opt.srr)
354 daddr = inet_opt->opt.faddr;
355
356 /* If this fails, the transport layer's retransmission mechanism
357 * will keep trying until the route appears or the connection
358 * times itself out.
359 */
360 rt = ip_route_output_ports(sock_net(sk), fl4, sk,
361 daddr, inet->inet_saddr,
362 inet->inet_dport,
363 inet->inet_sport,
364 sk->sk_protocol,
365 RT_CONN_FLAGS(sk),
366 sk->sk_bound_dev_if);
367 if (IS_ERR(rt))
368 goto no_route;
369 sk_setup_caps(sk, &rt->dst);
370 }
371 skb_dst_set_noref(skb, &rt->dst);
372
373 packet_routed:
374 if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
375 goto no_route;
376
377 /* OK, we know where to send it, allocate and build IP header. */
378 skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
379 skb_reset_network_header(skb);
380 iph = ip_hdr(skb);
381 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
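/* The 16-bit store above packs version, ihl and tos at once:
 * (4 << 12) | (5 << 8) is 0x4500, i.e. version 4 and a 5-word header
 * in the first byte, with the TOS byte following it.
 */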
382 if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
383 iph->frag_off = htons(IP_DF);
384 else
385 iph->frag_off = 0;
386 iph->ttl = ip_select_ttl(inet, &rt->dst);
387 iph->protocol = sk->sk_protocol;
388 ip_copy_addrs(iph, fl4);
389
390 /* The transport layer has already set skb->h.foo itself. */
391
392 if (inet_opt && inet_opt->opt.optlen) {
393 iph->ihl += inet_opt->opt.optlen >> 2;
394 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
395 }
396
397 ip_select_ident_more(iph, &rt->dst, sk,
398 (skb_shinfo(skb)->gso_segs ?: 1) - 1);
399
400 skb->priority = sk->sk_priority;
401 skb->mark = sk->sk_mark;
402
403 res = ip_local_out(skb);
404 rcu_read_unlock();
405 return res;
406
407 no_route:
408 rcu_read_unlock();
409 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
410 kfree_skb(skb);
411 return -EHOSTUNREACH;
412 }
413 EXPORT_SYMBOL(ip_queue_xmit);
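/*
 * Rough shape of ip_queue_xmit(): if the skb is not already routed
 * (SCTP, for instance, routes its own chunks), a cached socket route is
 * reused when still valid, otherwise a new one is looked up with
 * ip_route_output_ports() and cached via sk_setup_caps(). Note also the
 * ip_select_ident_more() call, which is passed gso_segs - 1 so that a
 * GSO super-packet reserves one IP ID for every on-the-wire segment it
 * will later be split into.
 */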
414
415
416 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
417 {
418 to->pkt_type = from->pkt_type;
419 to->priority = from->priority;
420 to->protocol = from->protocol;
421 skb_dst_drop(to);
422 skb_dst_copy(to, from);
423 to->dev = from->dev;
424 to->mark = from->mark;
425
426 /* Copy the flags to each fragment. */
427 IPCB(to)->flags = IPCB(from)->flags;
428
429 #ifdef CONFIG_NET_SCHED
430 to->tc_index = from->tc_index;
431 #endif
432 nf_copy(to, from);
433 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
434 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
435 to->nf_trace = from->nf_trace;
436 #endif
437 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
438 to->ipvs_property = from->ipvs_property;
439 #endif
440 skb_copy_secmark(to, from);
441 }
442
443 /*
444 * This IP datagram is too large to be sent in one piece. Break it up
445 * into smaller pieces (each consisting of an IP header plus a block of
446 * the original datagram's data) that will still fit in a single device
447 * frame, and queue each such frame for sending.
448 */
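/*
 * Two strategies are used below: when the skb already carries a
 * suitable frag_list (fragment-sized, 8-byte aligned pieces with enough
 * headroom and no sharing/cloning), each list member is turned into a
 * fragment in place; otherwise the slow path allocates a new skb per
 * fragment and copies the data into it.
 */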
449
450 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
451 {
452 struct iphdr *iph;
453 int ptr;
454 struct net_device *dev;
455 struct sk_buff *skb2;
456 unsigned int mtu, hlen, left, len, ll_rs;
457 int offset;
458 __be16 not_last_frag;
459 struct rtable *rt = skb_rtable(skb);
460 int err = 0;
461
462 dev = rt->dst.dev;
463
464 /*
465 * Point into the IP datagram header.
466 */
467
468 iph = ip_hdr(skb);
469
470 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
471 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
472 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
473 htonl(ip_skb_dst_mtu(skb)));
474 kfree_skb(skb);
475 return -EMSGSIZE;
476 }
477
478 /*
479 * Setup starting values.
480 */
481
482 hlen = iph->ihl * 4;
483 mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
484 #ifdef CONFIG_BRIDGE_NETFILTER
485 if (skb->nf_bridge)
486 mtu -= nf_bridge_mtu_reduction(skb);
487 #endif
488 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
489
490 /* When a frag_list is given, use it. First, check its validity:
491 * some transformers could create a wrong frag_list or break an
492 * existing one; that is not prohibited. In such cases fall back to copying.
493 *
494 * LATER: this step can be merged into the real generation of fragments;
495 * we can switch to copying when we see the first bad fragment.
496 */
497 if (skb_has_frag_list(skb)) {
498 struct sk_buff *frag, *frag2;
499 int first_len = skb_pagelen(skb);
500
501 if (first_len - hlen > mtu ||
502 ((first_len - hlen) & 7) ||
503 ip_is_fragment(iph) ||
504 skb_cloned(skb))
505 goto slow_path;
506
507 skb_walk_frags(skb, frag) {
508 /* Correct geometry. */
509 if (frag->len > mtu ||
510 ((frag->len & 7) && frag->next) ||
511 skb_headroom(frag) < hlen)
512 goto slow_path_clean;
513
514 /* Partially cloned skb? */
515 if (skb_shared(frag))
516 goto slow_path_clean;
517
518 BUG_ON(frag->sk);
519 if (skb->sk) {
520 frag->sk = skb->sk;
521 frag->destructor = sock_wfree;
522 }
523 skb->truesize -= frag->truesize;
524 }
525
526 /* Everything is OK. Generate! */
527
528 err = 0;
529 offset = 0;
530 frag = skb_shinfo(skb)->frag_list;
531 skb_frag_list_init(skb);
532 skb->data_len = first_len - skb_headlen(skb);
533 skb->len = first_len;
534 iph->tot_len = htons(first_len);
535 iph->frag_off = htons(IP_MF);
536 ip_send_check(iph);
537
538 for (;;) {
539 /* Prepare the header of the next frame
540 * before the previous one goes down. */
541 if (frag) {
542 frag->ip_summed = CHECKSUM_NONE;
543 skb_reset_transport_header(frag);
544 __skb_push(frag, hlen);
545 skb_reset_network_header(frag);
546 memcpy(skb_network_header(frag), iph, hlen);
547 iph = ip_hdr(frag);
548 iph->tot_len = htons(frag->len);
549 ip_copy_metadata(frag, skb);
550 if (offset == 0)
551 ip_options_fragment(frag);
552 offset += skb->len - hlen;
553 iph->frag_off = htons(offset>>3);
554 if (frag->next != NULL)
555 iph->frag_off |= htons(IP_MF);
556 /* Ready, complete checksum */
557 ip_send_check(iph);
558 }
559
560 err = output(skb);
561
562 if (!err)
563 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
564 if (err || !frag)
565 break;
566
567 skb = frag;
568 frag = skb->next;
569 skb->next = NULL;
570 }
571
572 if (err == 0) {
573 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
574 return 0;
575 }
576
577 while (frag) {
578 skb = frag->next;
579 kfree_skb(frag);
580 frag = skb;
581 }
582 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
583 return err;
584
585 slow_path_clean:
586 skb_walk_frags(skb, frag2) {
587 if (frag2 == frag)
588 break;
589 frag2->sk = NULL;
590 frag2->destructor = NULL;
591 skb->truesize += frag2->truesize;
592 }
593 }
594
595 slow_path:
596 left = skb->len - hlen; /* Space per frame */
597 ptr = hlen; /* Where to start from */
598
599 /* for bridged IP traffic encapsulated inside f.e. a vlan header,
600 * we need to make room for the encapsulating header
601 */
602 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
603
604 /*
605 * Fragment the datagram.
606 */
607
608 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
609 not_last_frag = iph->frag_off & htons(IP_MF);
610
611 /*
612 * Keep copying data until we run out.
613 */
614
615 while (left > 0) {
616 len = left;
617 /* IF: it doesn't fit, use 'mtu' - the data space left */
618 if (len > mtu)
619 len = mtu;
620 /* IF: we are not sending up to and including the packet end
621 then align the next start on an eight byte boundary */
622 if (len < left) {
623 len &= ~7;
624 }
625 /*
626 * Allocate buffer.
627 */
628
629 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
630 NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
631 err = -ENOMEM;
632 goto fail;
633 }
634
635 /*
636 * Set up data on packet
637 */
638
639 ip_copy_metadata(skb2, skb);
640 skb_reserve(skb2, ll_rs);
641 skb_put(skb2, len + hlen);
642 skb_reset_network_header(skb2);
643 skb2->transport_header = skb2->network_header + hlen;
644
645 /*
646 * Charge the memory for the fragment to any owner
647 * it might possess
648 */
649
650 if (skb->sk)
651 skb_set_owner_w(skb2, skb->sk);
652
653 /*
654 * Copy the packet header into the new buffer.
655 */
656
657 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
658
659 /*
660 * Copy a block of the IP datagram.
661 */
662 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
663 BUG();
664 left -= len;
665
666 /*
667 * Fill in the new header fields.
668 */
669 iph = ip_hdr(skb2);
670 iph->frag_off = htons((offset >> 3));
671
672 /* ANK: dirty, but effective trick. Upgrade options only if
673 * the segment to be fragmented was THE FIRST (otherwise,
674 * options are already fixed) and make it ONCE
675 * on the initial skb, so that all the following fragments
676 * will inherit fixed options.
677 */
678 if (offset == 0)
679 ip_options_fragment(skb);
680
681 /*
682 * Added AC : If we are fragmenting a fragment that's not the
683 * last fragment then keep the MF bit set on each fragment
684 */
685 if (left > 0 || not_last_frag)
686 iph->frag_off |= htons(IP_MF);
687 ptr += len;
688 offset += len;
689
690 /*
691 * Put this fragment into the sending queue.
692 */
693 iph->tot_len = htons(len + hlen);
694
695 ip_send_check(iph);
696
697 err = output(skb2);
698 if (err)
699 goto fail;
700
701 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
702 }
703 consume_skb(skb);
704 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
705 return err;
706
707 fail:
708 kfree_skb(skb);
709 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
710 return err;
711 }
712 EXPORT_SYMBOL(ip_fragment);
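/*
 * Illustrative numbers, assuming a plain 20-byte header and a 1500-byte
 * MTU: a 4020-byte datagram (4000 bytes of payload) has 1480 bytes of
 * data space per fragment, so it leaves as three fragments carrying
 * 1480, 1480 and 1040 payload bytes, with frag_off values of 0, 185 and
 * 370 (offsets are in 8-byte units) and MF set on all but the last.
 */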
713
714 int
715 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
716 {
717 struct iovec *iov = from;
718
719 if (skb->ip_summed == CHECKSUM_PARTIAL) {
720 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
721 return -EFAULT;
722 } else {
723 __wsum csum = 0;
724 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
725 return -EFAULT;
726 skb->csum = csum_block_add(skb->csum, csum, odd);
727 }
728 return 0;
729 }
730 EXPORT_SYMBOL(ip_generic_getfrag);
731
732 static inline __wsum
733 csum_page(struct page *page, int offset, int copy)
734 {
735 char *kaddr;
736 __wsum csum;
737 kaddr = kmap(page);
738 csum = csum_partial(kaddr + offset, copy, 0);
739 kunmap(page);
740 return csum;
741 }
742
743 static inline int ip_ufo_append_data(struct sock *sk,
744 struct sk_buff_head *queue,
745 int getfrag(void *from, char *to, int offset, int len,
746 int odd, struct sk_buff *skb),
747 void *from, int length, int hh_len, int fragheaderlen,
748 int transhdrlen, int maxfraglen, unsigned int flags)
749 {
750 struct sk_buff *skb;
751 int err;
752
753 /* The network device supports UDP fragmentation offload, so
754 * create one single skb packet containing the complete UDP
755 * datagram.
756 */
757 if ((skb = skb_peek_tail(queue)) == NULL) {
758 skb = sock_alloc_send_skb(sk,
759 hh_len + fragheaderlen + transhdrlen + 20,
760 (flags & MSG_DONTWAIT), &err);
761
762 if (skb == NULL)
763 return err;
764
765 /* reserve space for Hardware header */
766 skb_reserve(skb, hh_len);
767
768 /* create space for UDP/IP header */
769 skb_put(skb, fragheaderlen + transhdrlen);
770
771 /* initialize network header pointer */
772 skb_reset_network_header(skb);
773
774 /* initialize protocol header pointer */
775 skb->transport_header = skb->network_header + fragheaderlen;
776
777 skb->ip_summed = CHECKSUM_PARTIAL;
778 skb->csum = 0;
779
780 /* specify the length of each IP datagram fragment */
781 skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
782 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
783 __skb_queue_tail(queue, skb);
784 }
785
786 return skb_append_datato_frags(sk, skb, getfrag, from,
787 (length - transhdrlen));
788 }
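/*
 * With UDP fragmentation offload a single oversized skb is queued for
 * the whole datagram: gso_size records the per-fragment payload size
 * (maxfraglen - fragheaderlen) for the device or the software fallback
 * to cut against, and the data itself is attached as page fragments by
 * skb_append_datato_frags().
 */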
789
790 static int __ip_append_data(struct sock *sk,
791 struct flowi4 *fl4,
792 struct sk_buff_head *queue,
793 struct inet_cork *cork,
794 int getfrag(void *from, char *to, int offset,
795 int len, int odd, struct sk_buff *skb),
796 void *from, int length, int transhdrlen,
797 unsigned int flags)
798 {
799 struct inet_sock *inet = inet_sk(sk);
800 struct sk_buff *skb;
801
802 struct ip_options *opt = cork->opt;
803 int hh_len;
804 int exthdrlen;
805 int mtu;
806 int copy;
807 int err;
808 int offset = 0;
809 unsigned int maxfraglen, fragheaderlen;
810 int csummode = CHECKSUM_NONE;
811 struct rtable *rt = (struct rtable *)cork->dst;
812
813 skb = skb_peek_tail(queue);
814
815 exthdrlen = !skb ? rt->dst.header_len : 0;
816 mtu = cork->fragsize;
817
818 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
819
820 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
821 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
822
823 if (cork->length + length > 0xFFFF - fragheaderlen) {
824 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
825 mtu-exthdrlen);
826 return -EMSGSIZE;
827 }
828
829 /*
830 * transhdrlen > 0 means that this is the first fragment and we wish
831 * it not to be fragmented any further.
832 */
833 if (transhdrlen &&
834 length + fragheaderlen <= mtu &&
835 rt->dst.dev->features & NETIF_F_V4_CSUM &&
836 !exthdrlen)
837 csummode = CHECKSUM_PARTIAL;
838
839 cork->length += length;
840 if (((length > mtu) || (skb && skb_is_gso(skb))) &&
841 (sk->sk_protocol == IPPROTO_UDP) &&
842 (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
843 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
844 hh_len, fragheaderlen, transhdrlen,
845 maxfraglen, flags);
846 if (err)
847 goto error;
848 return 0;
849 }
850
851 /* So, what's going on in the loop below?
852 *
853 * We use the calculated fragment length to generate a chain of skbs;
854 * each segment is an IP fragment, ready for sending to the network
855 * once the appropriate IP header has been added.
856 */
857
858 if (!skb)
859 goto alloc_new_skb;
860
861 while (length > 0) {
862 /* Check if the remaining data fits into current packet. */
863 copy = mtu - skb->len;
864 if (copy < length)
865 copy = maxfraglen - skb->len;
866 if (copy <= 0) {
867 char *data;
868 unsigned int datalen;
869 unsigned int fraglen;
870 unsigned int fraggap;
871 unsigned int alloclen;
872 struct sk_buff *skb_prev;
873 alloc_new_skb:
874 skb_prev = skb;
875 if (skb_prev)
876 fraggap = skb_prev->len - maxfraglen;
877 else
878 fraggap = 0;
879
880 /*
881 * If remaining data exceeds the mtu,
882 * we know we need more fragment(s).
883 */
884 datalen = length + fraggap;
885 if (datalen > mtu - fragheaderlen)
886 datalen = maxfraglen - fragheaderlen;
887 fraglen = datalen + fragheaderlen;
888
889 if ((flags & MSG_MORE) &&
890 !(rt->dst.dev->features&NETIF_F_SG))
891 alloclen = mtu;
892 else
893 alloclen = fraglen;
894
895 alloclen += exthdrlen;
896
897 /* The last fragment gets additional space at tail.
898 * Note, with MSG_MORE we overallocate on fragments,
899 * because we have no idea which fragment will be
900 * the last.
901 */
902 if (datalen == length + fraggap)
903 alloclen += rt->dst.trailer_len;
904
905 if (transhdrlen) {
906 skb = sock_alloc_send_skb(sk,
907 alloclen + hh_len + 15,
908 (flags & MSG_DONTWAIT), &err);
909 } else {
910 skb = NULL;
911 if (atomic_read(&sk->sk_wmem_alloc) <=
912 2 * sk->sk_sndbuf)
913 skb = sock_wmalloc(sk,
914 alloclen + hh_len + 15, 1,
915 sk->sk_allocation);
916 if (unlikely(skb == NULL))
917 err = -ENOBUFS;
918 else
919 /* only the initial fragment is
920 time stamped */
921 cork->tx_flags = 0;
922 }
923 if (skb == NULL)
924 goto error;
925
926 /*
927 * Fill in the control structures
928 */
929 skb->ip_summed = csummode;
930 skb->csum = 0;
931 skb_reserve(skb, hh_len);
932 skb_shinfo(skb)->tx_flags = cork->tx_flags;
933
934 /*
935 * Find where to start putting bytes.
936 */
937 data = skb_put(skb, fraglen + exthdrlen);
938 skb_set_network_header(skb, exthdrlen);
939 skb->transport_header = (skb->network_header +
940 fragheaderlen);
941 data += fragheaderlen + exthdrlen;
942
943 if (fraggap) {
944 skb->csum = skb_copy_and_csum_bits(
945 skb_prev, maxfraglen,
946 data + transhdrlen, fraggap, 0);
947 skb_prev->csum = csum_sub(skb_prev->csum,
948 skb->csum);
949 data += fraggap;
950 pskb_trim_unique(skb_prev, maxfraglen);
951 }
952
953 copy = datalen - transhdrlen - fraggap;
954 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
955 err = -EFAULT;
956 kfree_skb(skb);
957 goto error;
958 }
959
960 offset += copy;
961 length -= datalen - fraggap;
962 transhdrlen = 0;
963 exthdrlen = 0;
964 csummode = CHECKSUM_NONE;
965
966 /*
967 * Put the packet on the pending queue.
968 */
969 __skb_queue_tail(queue, skb);
970 continue;
971 }
972
973 if (copy > length)
974 copy = length;
975
976 if (!(rt->dst.dev->features&NETIF_F_SG)) {
977 unsigned int off;
978
979 off = skb->len;
980 if (getfrag(from, skb_put(skb, copy),
981 offset, copy, off, skb) < 0) {
982 __skb_trim(skb, off);
983 err = -EFAULT;
984 goto error;
985 }
986 } else {
987 int i = skb_shinfo(skb)->nr_frags;
988 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
989 struct page *page = cork->page;
990 int off = cork->off;
991 unsigned int left;
992
993 if (page && (left = PAGE_SIZE - off) > 0) {
994 if (copy >= left)
995 copy = left;
996 if (page != skb_frag_page(frag)) {
997 if (i == MAX_SKB_FRAGS) {
998 err = -EMSGSIZE;
999 goto error;
1000 }
1001 skb_fill_page_desc(skb, i, page, off, 0);
1002 skb_frag_ref(skb, i);
1003 frag = &skb_shinfo(skb)->frags[i];
1004 }
1005 } else if (i < MAX_SKB_FRAGS) {
1006 if (copy > PAGE_SIZE)
1007 copy = PAGE_SIZE;
1008 page = alloc_pages(sk->sk_allocation, 0);
1009 if (page == NULL) {
1010 err = -ENOMEM;
1011 goto error;
1012 }
1013 cork->page = page;
1014 cork->off = 0;
1015
1016 skb_fill_page_desc(skb, i, page, 0, 0);
1017 frag = &skb_shinfo(skb)->frags[i];
1018 } else {
1019 err = -EMSGSIZE;
1020 goto error;
1021 }
1022 if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
1023 offset, copy, skb->len, skb) < 0) {
1024 err = -EFAULT;
1025 goto error;
1026 }
1027 cork->off += copy;
1028 skb_frag_size_add(frag, copy);
1029 skb->len += copy;
1030 skb->data_len += copy;
1031 skb->truesize += copy;
1032 atomic_add(copy, &sk->sk_wmem_alloc);
1033 }
1034 offset += copy;
1035 length -= copy;
1036 }
1037
1038 return 0;
1039
1040 error:
1041 cork->length -= length;
1042 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1043 return err;
1044 }
1045
1046 static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1047 struct ipcm_cookie *ipc, struct rtable **rtp)
1048 {
1049 struct inet_sock *inet = inet_sk(sk);
1050 struct ip_options_rcu *opt;
1051 struct rtable *rt;
1052
1053 /*
1054 * setup for corking.
1055 */
1056 opt = ipc->opt;
1057 if (opt) {
1058 if (cork->opt == NULL) {
1059 cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1060 sk->sk_allocation);
1061 if (unlikely(cork->opt == NULL))
1062 return -ENOBUFS;
1063 }
1064 memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1065 cork->flags |= IPCORK_OPT;
1066 cork->addr = ipc->addr;
1067 }
1068 rt = *rtp;
1069 if (unlikely(!rt))
1070 return -EFAULT;
1071 /*
1072 * We steal a reference to this route; the caller should not release it
1073 */
1074 *rtp = NULL;
1075 cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1076 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1077 cork->dst = &rt->dst;
1078 cork->length = 0;
1079 cork->tx_flags = ipc->tx_flags;
1080 cork->page = NULL;
1081 cork->off = 0;
1082
1083 return 0;
1084 }
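/*
 * The "+ 40" in the cork->opt allocation above is the worst case for IP
 * options: a maximum 60-byte header minus the 20-byte fixed part, so
 * the buffer is allocated once at full size and reused for later sends.
 */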
1085
1086 /*
1087 * ip_append_data() and ip_append_page() can make one large IP datagram
1088 * from many pieces of data. Each piece will be held on the socket
1089 * until ip_push_pending_frames() is called. Each piece can be a page
1090 * or non-page data.
1091 *
1092 * Not only UDP but also other transport protocols, e.g. raw sockets,
1093 * can potentially use this interface.
1094 *
1095 * LATER: length must be adjusted by pad at tail, when it is required.
1096 */
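/*
 * The usual corked-UDP pattern built on top of this is therefore: one
 * or more ip_append_data() calls filling sk_write_queue, followed by
 * ip_push_pending_frames() to emit the datagram as a single skb chain,
 * or ip_flush_pending_frames() to discard it on error.
 */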
1097 int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1098 int getfrag(void *from, char *to, int offset, int len,
1099 int odd, struct sk_buff *skb),
1100 void *from, int length, int transhdrlen,
1101 struct ipcm_cookie *ipc, struct rtable **rtp,
1102 unsigned int flags)
1103 {
1104 struct inet_sock *inet = inet_sk(sk);
1105 int err;
1106
1107 if (flags&MSG_PROBE)
1108 return 0;
1109
1110 if (skb_queue_empty(&sk->sk_write_queue)) {
1111 err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1112 if (err)
1113 return err;
1114 } else {
1115 transhdrlen = 0;
1116 }
1117
1118 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1119 from, length, transhdrlen, flags);
1120 }
1121
1122 ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1123 int offset, size_t size, int flags)
1124 {
1125 struct inet_sock *inet = inet_sk(sk);
1126 struct sk_buff *skb;
1127 struct rtable *rt;
1128 struct ip_options *opt = NULL;
1129 struct inet_cork *cork;
1130 int hh_len;
1131 int mtu;
1132 int len;
1133 int err;
1134 unsigned int maxfraglen, fragheaderlen, fraggap;
1135
1136 if (inet->hdrincl)
1137 return -EPERM;
1138
1139 if (flags&MSG_PROBE)
1140 return 0;
1141
1142 if (skb_queue_empty(&sk->sk_write_queue))
1143 return -EINVAL;
1144
1145 cork = &inet->cork.base;
1146 rt = (struct rtable *)cork->dst;
1147 if (cork->flags & IPCORK_OPT)
1148 opt = cork->opt;
1149
1150 if (!(rt->dst.dev->features&NETIF_F_SG))
1151 return -EOPNOTSUPP;
1152
1153 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1154 mtu = cork->fragsize;
1155
1156 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1157 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1158
1159 if (cork->length + size > 0xFFFF - fragheaderlen) {
1160 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
1161 return -EMSGSIZE;
1162 }
1163
1164 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1165 return -EINVAL;
1166
1167 cork->length += size;
1168 if ((size + skb->len > mtu) &&
1169 (sk->sk_protocol == IPPROTO_UDP) &&
1170 (rt->dst.dev->features & NETIF_F_UFO)) {
1171 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1172 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1173 }
1174
1175
1176 while (size > 0) {
1177 int i;
1178
1179 if (skb_is_gso(skb))
1180 len = size;
1181 else {
1182
1183 /* Check if the remaining data fits into current packet. */
1184 len = mtu - skb->len;
1185 if (len < size)
1186 len = maxfraglen - skb->len;
1187 }
1188 if (len <= 0) {
1189 struct sk_buff *skb_prev;
1190 int alloclen;
1191
1192 skb_prev = skb;
1193 fraggap = skb_prev->len - maxfraglen;
1194
1195 alloclen = fragheaderlen + hh_len + fraggap + 15;
1196 skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1197 if (unlikely(!skb)) {
1198 err = -ENOBUFS;
1199 goto error;
1200 }
1201
1202 /*
1203 * Fill in the control structures
1204 */
1205 skb->ip_summed = CHECKSUM_NONE;
1206 skb->csum = 0;
1207 skb_reserve(skb, hh_len);
1208
1209 /*
1210 * Find where to start putting bytes.
1211 */
1212 skb_put(skb, fragheaderlen + fraggap);
1213 skb_reset_network_header(skb);
1214 skb->transport_header = (skb->network_header +
1215 fragheaderlen);
1216 if (fraggap) {
1217 skb->csum = skb_copy_and_csum_bits(skb_prev,
1218 maxfraglen,
1219 skb_transport_header(skb),
1220 fraggap, 0);
1221 skb_prev->csum = csum_sub(skb_prev->csum,
1222 skb->csum);
1223 pskb_trim_unique(skb_prev, maxfraglen);
1224 }
1225
1226 /*
1227 * Put the packet on the pending queue.
1228 */
1229 __skb_queue_tail(&sk->sk_write_queue, skb);
1230 continue;
1231 }
1232
1233 i = skb_shinfo(skb)->nr_frags;
1234 if (len > size)
1235 len = size;
1236 if (skb_can_coalesce(skb, i, page, offset)) {
1237 skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
1238 } else if (i < MAX_SKB_FRAGS) {
1239 get_page(page);
1240 skb_fill_page_desc(skb, i, page, offset, len);
1241 } else {
1242 err = -EMSGSIZE;
1243 goto error;
1244 }
1245
1246 if (skb->ip_summed == CHECKSUM_NONE) {
1247 __wsum csum;
1248 csum = csum_page(page, offset, len);
1249 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1250 }
1251
1252 skb->len += len;
1253 skb->data_len += len;
1254 skb->truesize += len;
1255 atomic_add(len, &sk->sk_wmem_alloc);
1256 offset += len;
1257 size -= len;
1258 }
1259 return 0;
1260
1261 error:
1262 cork->length -= size;
1263 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1264 return err;
1265 }
1266
1267 static void ip_cork_release(struct inet_cork *cork)
1268 {
1269 cork->flags &= ~IPCORK_OPT;
1270 kfree(cork->opt);
1271 cork->opt = NULL;
1272 dst_release(cork->dst);
1273 cork->dst = NULL;
1274 }
1275
1276 /*
1277 * Combine all pending IP fragments on the socket into one IP datagram
1278 * and push them out.
1279 */
1280 struct sk_buff *__ip_make_skb(struct sock *sk,
1281 struct flowi4 *fl4,
1282 struct sk_buff_head *queue,
1283 struct inet_cork *cork)
1284 {
1285 struct sk_buff *skb, *tmp_skb;
1286 struct sk_buff **tail_skb;
1287 struct inet_sock *inet = inet_sk(sk);
1288 struct net *net = sock_net(sk);
1289 struct ip_options *opt = NULL;
1290 struct rtable *rt = (struct rtable *)cork->dst;
1291 struct iphdr *iph;
1292 __be16 df = 0;
1293 __u8 ttl;
1294
1295 if ((skb = __skb_dequeue(queue)) == NULL)
1296 goto out;
1297 tail_skb = &(skb_shinfo(skb)->frag_list);
1298
1299 /* move skb->data to ip header from ext header */
1300 if (skb->data < skb_network_header(skb))
1301 __skb_pull(skb, skb_network_offset(skb));
1302 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1303 __skb_pull(tmp_skb, skb_network_header_len(skb));
1304 *tail_skb = tmp_skb;
1305 tail_skb = &(tmp_skb->next);
1306 skb->len += tmp_skb->len;
1307 skb->data_len += tmp_skb->len;
1308 skb->truesize += tmp_skb->truesize;
1309 tmp_skb->destructor = NULL;
1310 tmp_skb->sk = NULL;
1311 }
1312
1313 /* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
1314 * allow the frame generated here to be fragmented. No matter how
1315 * transforms change the size of the packet, it will go out.
1316 */
1317 if (inet->pmtudisc < IP_PMTUDISC_DO)
1318 skb->local_df = 1;
1319
1320 /* DF bit is set when we want to see DF on outgoing frames.
1321 * If local_df is also set, we still allow this frame to be fragmented
1322 * locally. */
1323 if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1324 (skb->len <= dst_mtu(&rt->dst) &&
1325 ip_dont_fragment(sk, &rt->dst)))
1326 df = htons(IP_DF);
1327
1328 if (cork->flags & IPCORK_OPT)
1329 opt = cork->opt;
1330
1331 if (rt->rt_type == RTN_MULTICAST)
1332 ttl = inet->mc_ttl;
1333 else
1334 ttl = ip_select_ttl(inet, &rt->dst);
1335
1336 iph = (struct iphdr *)skb->data;
1337 iph->version = 4;
1338 iph->ihl = 5;
1339 iph->tos = inet->tos;
1340 iph->frag_off = df;
1341 ip_select_ident(iph, &rt->dst, sk);
1342 iph->ttl = ttl;
1343 iph->protocol = sk->sk_protocol;
1344 ip_copy_addrs(iph, fl4);
1345
1346 if (opt) {
1347 iph->ihl += opt->optlen>>2;
1348 ip_options_build(skb, opt, cork->addr, rt, 0);
1349 }
1350
1351 skb->priority = sk->sk_priority;
1352 skb->mark = sk->sk_mark;
1353 /*
1354 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1355 * on dst refcount
1356 */
1357 cork->dst = NULL;
1358 skb_dst_set(skb, &rt->dst);
1359
1360 if (iph->protocol == IPPROTO_ICMP)
1361 icmp_out_count(net, ((struct icmphdr *)
1362 skb_transport_header(skb))->type);
1363
1364 ip_cork_release(cork);
1365 out:
1366 return skb;
1367 }
1368
1369 int ip_send_skb(struct sk_buff *skb)
1370 {
1371 struct net *net = sock_net(skb->sk);
1372 int err;
1373
1374 err = ip_local_out(skb);
1375 if (err) {
1376 if (err > 0)
1377 err = net_xmit_errno(err);
1378 if (err)
1379 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1380 }
1381
1382 return err;
1383 }
1384
1385 int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
1386 {
1387 struct sk_buff *skb;
1388
1389 skb = ip_finish_skb(sk, fl4);
1390 if (!skb)
1391 return 0;
1392
1393 /* Netfilter gets the whole, not yet fragmented, skb. */
1394 return ip_send_skb(skb);
1395 }
1396
1397 /*
1398 * Throw away all pending data on the socket.
1399 */
1400 static void __ip_flush_pending_frames(struct sock *sk,
1401 struct sk_buff_head *queue,
1402 struct inet_cork *cork)
1403 {
1404 struct sk_buff *skb;
1405
1406 while ((skb = __skb_dequeue_tail(queue)) != NULL)
1407 kfree_skb(skb);
1408
1409 ip_cork_release(cork);
1410 }
1411
1412 void ip_flush_pending_frames(struct sock *sk)
1413 {
1414 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1415 }
1416
1417 struct sk_buff *ip_make_skb(struct sock *sk,
1418 struct flowi4 *fl4,
1419 int getfrag(void *from, char *to, int offset,
1420 int len, int odd, struct sk_buff *skb),
1421 void *from, int length, int transhdrlen,
1422 struct ipcm_cookie *ipc, struct rtable **rtp,
1423 unsigned int flags)
1424 {
1425 struct inet_cork cork;
1426 struct sk_buff_head queue;
1427 int err;
1428
1429 if (flags & MSG_PROBE)
1430 return NULL;
1431
1432 __skb_queue_head_init(&queue);
1433
1434 cork.flags = 0;
1435 cork.addr = 0;
1436 cork.opt = NULL;
1437 err = ip_setup_cork(sk, &cork, ipc, rtp);
1438 if (err)
1439 return ERR_PTR(err);
1440
1441 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1442 from, length, transhdrlen, flags);
1443 if (err) {
1444 __ip_flush_pending_frames(sk, &queue, &cork);
1445 return ERR_PTR(err);
1446 }
1447
1448 return __ip_make_skb(sk, fl4, &queue, &cork);
1449 }
1450
1451 /*
1452 * Fetch data from kernel space and fill in checksum if needed.
1453 */
1454 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1455 int len, int odd, struct sk_buff *skb)
1456 {
1457 __wsum csum;
1458
1459 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1460 skb->csum = csum_block_add(skb->csum, csum, odd);
1461 return 0;
1462 }
1463
1464 /*
1465 * Generic function to send a packet as reply to another packet.
1466 * So far it is used only to send TCP resets.
1467 *
1468 * Should run single-threaded per socket because it uses the sock
1469 * structure to pass arguments.
1470 */
1471 void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1472 __be32 saddr, const struct ip_reply_arg *arg,
1473 unsigned int len)
1474 {
1475 struct inet_sock *inet = inet_sk(sk);
1476 struct ip_options_data replyopts;
1477 struct ipcm_cookie ipc;
1478 struct flowi4 fl4;
1479 struct rtable *rt = skb_rtable(skb);
1480
1481 if (ip_options_echo(&replyopts.opt.opt, skb))
1482 return;
1483
1484 ipc.addr = daddr;
1485 ipc.opt = NULL;
1486 ipc.tx_flags = 0;
1487
1488 if (replyopts.opt.opt.optlen) {
1489 ipc.opt = &replyopts.opt;
1490
1491 if (replyopts.opt.opt.srr)
1492 daddr = replyopts.opt.opt.faddr;
1493 }
1494
1495 flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1496 RT_TOS(arg->tos),
1497 RT_SCOPE_UNIVERSE, sk->sk_protocol,
1498 ip_reply_arg_flowi_flags(arg),
1499 daddr, saddr,
1500 tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1501 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1502 rt = ip_route_output_key(sock_net(sk), &fl4);
1503 if (IS_ERR(rt))
1504 return;
1505
1506 /* And let IP do all the hard work.
1507
1508 This chunk is not reentrant, hence the spinlock.
1509 Note that it relies on the fact that this function is called
1510 with BHs locally disabled and that sk cannot already be spinlocked.
1511 */
1512 bh_lock_sock(sk);
1513 inet->tos = arg->tos;
1514 sk->sk_priority = skb->priority;
1515 sk->sk_protocol = ip_hdr(skb)->protocol;
1516 sk->sk_bound_dev_if = arg->bound_dev_if;
1517 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1518 &ipc, &rt, MSG_DONTWAIT);
1519 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1520 if (arg->csumoffset >= 0)
1521 *((__sum16 *)skb_transport_header(skb) +
1522 arg->csumoffset) = csum_fold(csum_add(skb->csum,
1523 arg->csum));
1524 skb->ip_summed = CHECKSUM_NONE;
1525 ip_push_pending_frames(sk, &fl4);
1526 }
1527
1528 bh_unlock_sock(sk);
1529
1530 ip_rt_put(rt);
1531 }
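/*
 * ip_send_unicast_reply() runs on a control socket with BHs disabled
 * (TCP's reset/ack generation is the main user); arg->csumoffset is the
 * position of the transport checksum field, counted in 16-bit words
 * from the start of the transport header, which is why the fold above
 * is written to skb_transport_header(skb) + arg->csumoffset.
 */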
1532
1533 void __init ip_init(void)
1534 {
1535 ip_rt_init();
1536 inet_initpeers();
1537
1538 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1539 igmp_mc_proc_init();
1540 #endif
1541 }