]> git.proxmox.com Git - mirror_ubuntu-zesty-kernel.git/blob - net/ipv4/ip_output.c
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/linville/wirel...
[mirror_ubuntu-zesty-kernel.git] / net / ipv4 / ip_output.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The Internet Protocol (IP) output module.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Donald Becker, <becker@super.org>
11 * Alan Cox, <Alan.Cox@linux.org>
12 * Richard Underwood
13 * Stefan Becker, <stefanb@yello.ping.de>
14 * Jorge Cwik, <jorge@laser.satlink.net>
15 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16 * Hirokazu Takahashi, <taka@valinux.co.jp>
17 *
18 * See ip_input.c for original log
19 *
20 * Fixes:
21 * Alan Cox : Missing nonblock feature in ip_build_xmit.
22 * Mike Kilburn : htons() missing in ip_build_xmit.
23 * Bradford Johnson: Fix faulty handling of some frames when
24 * no route is found.
25 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
26 * (in case if packet not accepted by
27 * output firewall rules)
28 * Mike McLagan : Routing by source
29 * Alexey Kuznetsov: use new route cache
30 * Andi Kleen: Fix broken PMTU recovery and remove
31 * some redundant tests.
32 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
33 * Andi Kleen : Replace ip_reply with ip_send_reply.
34 * Andi Kleen : Split fast and slow ip_build_xmit path
35 * for decreased register pressure on x86
36 * and more readability.
37 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
38 * silently drop skb instead of failing with -EPERM.
39 * Detlev Wengorz : Copy protocol for fragments.
40 * Hirokazu Takahashi: HW checksumming for outgoing UDP
41 * datagrams.
42 * Hirokazu Takahashi: sendfile() on UDP works now.
43 */
44
45 #include <asm/uaccess.h>
46 #include <linux/module.h>
47 #include <linux/types.h>
48 #include <linux/kernel.h>
49 #include <linux/mm.h>
50 #include <linux/string.h>
51 #include <linux/errno.h>
52 #include <linux/highmem.h>
53 #include <linux/slab.h>
54
55 #include <linux/socket.h>
56 #include <linux/sockios.h>
57 #include <linux/in.h>
58 #include <linux/inet.h>
59 #include <linux/netdevice.h>
60 #include <linux/etherdevice.h>
61 #include <linux/proc_fs.h>
62 #include <linux/stat.h>
63 #include <linux/init.h>
64
65 #include <net/snmp.h>
66 #include <net/ip.h>
67 #include <net/protocol.h>
68 #include <net/route.h>
69 #include <net/xfrm.h>
70 #include <linux/skbuff.h>
71 #include <net/sock.h>
72 #include <net/arp.h>
73 #include <net/icmp.h>
74 #include <net/checksum.h>
75 #include <net/inetpeer.h>
76 #include <linux/igmp.h>
77 #include <linux/netfilter_ipv4.h>
78 #include <linux/netfilter_bridge.h>
79 #include <linux/mroute.h>
80 #include <linux/netlink.h>
81 #include <linux/tcp.h>
82
83 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
84 EXPORT_SYMBOL(sysctl_ip_default_ttl);
85
86 /* Generate a checksum for an outgoing IP datagram. */
87 __inline__ void ip_send_check(struct iphdr *iph)
88 {
89 iph->check = 0;
90 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
91 }
92 EXPORT_SYMBOL(ip_send_check);
93
94 int __ip_local_out(struct sk_buff *skb)
95 {
96 struct iphdr *iph = ip_hdr(skb);
97
98 iph->tot_len = htons(skb->len);
99 ip_send_check(iph);
100 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
101 skb_dst(skb)->dev, dst_output);
102 }
103
/*
 * Send a locally generated IP packet.
 *
 * Runs __ip_local_out(); when that returns 1 the packet is passed on to
 * dst_output() and its result is returned.  Any other value is returned
 * to the caller unchanged.
 */
int ip_local_out(struct sk_buff *skb)
{
	int err = __ip_local_out(skb);

	if (unlikely(err != 1))
		return err;

	return dst_output(skb);
}
EXPORT_SYMBOL_GPL(ip_local_out);
115
116 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
117 {
118 int ttl = inet->uc_ttl;
119
120 if (ttl < 0)
121 ttl = ip4_dst_hoplimit(dst);
122 return ttl;
123 }
124
125 /*
126 * Add an ip header to a skbuff and send it out.
127 *
128 */
129 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
130 __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
131 {
132 struct inet_sock *inet = inet_sk(sk);
133 struct rtable *rt = skb_rtable(skb);
134 struct iphdr *iph;
135
136 /* Build the IP header. */
137 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
138 skb_reset_network_header(skb);
139 iph = ip_hdr(skb);
140 iph->version = 4;
141 iph->ihl = 5;
142 iph->tos = inet->tos;
143 if (ip_dont_fragment(sk, &rt->dst))
144 iph->frag_off = htons(IP_DF);
145 else
146 iph->frag_off = 0;
147 iph->ttl = ip_select_ttl(inet, &rt->dst);
148 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
149 iph->saddr = saddr;
150 iph->protocol = sk->sk_protocol;
151 ip_select_ident(iph, &rt->dst, sk);
152
153 if (opt && opt->opt.optlen) {
154 iph->ihl += opt->opt.optlen>>2;
155 ip_options_build(skb, &opt->opt, daddr, rt, 0);
156 }
157
158 skb->priority = sk->sk_priority;
159 skb->mark = sk->sk_mark;
160
161 /* Send it out. */
162 return ip_local_out(skb);
163 }
164 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
165
166 static inline int ip_finish_output2(struct sk_buff *skb)
167 {
168 struct dst_entry *dst = skb_dst(skb);
169 struct rtable *rt = (struct rtable *)dst;
170 struct net_device *dev = dst->dev;
171 unsigned int hh_len = LL_RESERVED_SPACE(dev);
172 struct neighbour *neigh;
173
174 if (rt->rt_type == RTN_MULTICAST) {
175 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
176 } else if (rt->rt_type == RTN_BROADCAST)
177 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
178
179 /* Be paranoid, rather than too clever. */
180 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
181 struct sk_buff *skb2;
182
183 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
184 if (skb2 == NULL) {
185 kfree_skb(skb);
186 return -ENOMEM;
187 }
188 if (skb->sk)
189 skb_set_owner_w(skb2, skb->sk);
190 consume_skb(skb);
191 skb = skb2;
192 }
193
194 rcu_read_lock();
195 neigh = dst_get_neighbour_noref(dst);
196 if (neigh) {
197 int res = neigh_output(neigh, skb);
198
199 rcu_read_unlock();
200 return res;
201 }
202 rcu_read_unlock();
203
204 net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
205 __func__);
206 kfree_skb(skb);
207 return -EINVAL;
208 }
209
210 static inline int ip_skb_dst_mtu(struct sk_buff *skb)
211 {
212 struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
213
214 return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
215 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
216 }
217
218 static int ip_finish_output(struct sk_buff *skb)
219 {
220 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
221 /* Policy lookup after SNAT yielded a new policy */
222 if (skb_dst(skb)->xfrm != NULL) {
223 IPCB(skb)->flags |= IPSKB_REROUTED;
224 return dst_output(skb);
225 }
226 #endif
227 if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
228 return ip_fragment(skb, ip_finish_output2);
229 else
230 return ip_finish_output2(skb);
231 }
232
233 int ip_mc_output(struct sk_buff *skb)
234 {
235 struct sock *sk = skb->sk;
236 struct rtable *rt = skb_rtable(skb);
237 struct net_device *dev = rt->dst.dev;
238
239 /*
240 * If the indicated interface is up and running, send the packet.
241 */
242 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
243
244 skb->dev = dev;
245 skb->protocol = htons(ETH_P_IP);
246
247 /*
248 * Multicasts are looped back for other local users
249 */
250
251 if (rt->rt_flags&RTCF_MULTICAST) {
252 if (sk_mc_loop(sk)
253 #ifdef CONFIG_IP_MROUTE
254 /* Small optimization: do not loopback not local frames,
255 which returned after forwarding; they will be dropped
256 by ip_mr_input in any case.
257 Note, that local frames are looped back to be delivered
258 to local recipients.
259
260 This check is duplicated in ip_mr_input at the moment.
261 */
262 &&
263 ((rt->rt_flags & RTCF_LOCAL) ||
264 !(IPCB(skb)->flags & IPSKB_FORWARDED))
265 #endif
266 ) {
267 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
268 if (newskb)
269 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
270 newskb, NULL, newskb->dev,
271 dev_loopback_xmit);
272 }
273
274 /* Multicasts with ttl 0 must not go beyond the host */
275
276 if (ip_hdr(skb)->ttl == 0) {
277 kfree_skb(skb);
278 return 0;
279 }
280 }
281
282 if (rt->rt_flags&RTCF_BROADCAST) {
283 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
284 if (newskb)
285 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
286 NULL, newskb->dev, dev_loopback_xmit);
287 }
288
289 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
290 skb->dev, ip_finish_output,
291 !(IPCB(skb)->flags & IPSKB_REROUTED));
292 }
293
294 int ip_output(struct sk_buff *skb)
295 {
296 struct net_device *dev = skb_dst(skb)->dev;
297
298 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
299
300 skb->dev = dev;
301 skb->protocol = htons(ETH_P_IP);
302
303 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
304 ip_finish_output,
305 !(IPCB(skb)->flags & IPSKB_REROUTED));
306 }
307
308 /*
309 * copy saddr and daddr, possibly using 64bit load/stores
310 * Equivalent to :
311 * iph->saddr = fl4->saddr;
312 * iph->daddr = fl4->daddr;
313 */
314 static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
315 {
316 BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
317 offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
318 memcpy(&iph->saddr, &fl4->saddr,
319 sizeof(fl4->saddr) + sizeof(fl4->daddr));
320 }
321
322 int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
323 {
324 struct sock *sk = skb->sk;
325 struct inet_sock *inet = inet_sk(sk);
326 struct ip_options_rcu *inet_opt;
327 struct flowi4 *fl4;
328 struct rtable *rt;
329 struct iphdr *iph;
330 int res;
331
332 /* Skip all of this if the packet is already routed,
333 * f.e. by something like SCTP.
334 */
335 rcu_read_lock();
336 inet_opt = rcu_dereference(inet->inet_opt);
337 fl4 = &fl->u.ip4;
338 rt = skb_rtable(skb);
339 if (rt != NULL)
340 goto packet_routed;
341
342 /* Make sure we can route this packet. */
343 rt = (struct rtable *)__sk_dst_check(sk, 0);
344 if (rt == NULL) {
345 __be32 daddr;
346
347 /* Use correct destination address if we have options. */
348 daddr = inet->inet_daddr;
349 if (inet_opt && inet_opt->opt.srr)
350 daddr = inet_opt->opt.faddr;
351
352 /* If this fails, retransmit mechanism of transport layer will
353 * keep trying until route appears or the connection times
354 * itself out.
355 */
356 rt = ip_route_output_ports(sock_net(sk), fl4, sk,
357 daddr, inet->inet_saddr,
358 inet->inet_dport,
359 inet->inet_sport,
360 sk->sk_protocol,
361 RT_CONN_FLAGS(sk),
362 sk->sk_bound_dev_if);
363 if (IS_ERR(rt))
364 goto no_route;
365 sk_setup_caps(sk, &rt->dst);
366 }
367 skb_dst_set_noref(skb, &rt->dst);
368
369 packet_routed:
370 if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
371 goto no_route;
372
373 /* OK, we know where to send it, allocate and build IP header. */
374 skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
375 skb_reset_network_header(skb);
376 iph = ip_hdr(skb);
377 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
378 if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
379 iph->frag_off = htons(IP_DF);
380 else
381 iph->frag_off = 0;
382 iph->ttl = ip_select_ttl(inet, &rt->dst);
383 iph->protocol = sk->sk_protocol;
384 ip_copy_addrs(iph, fl4);
385
386 /* Transport layer set skb->h.foo itself. */
387
388 if (inet_opt && inet_opt->opt.optlen) {
389 iph->ihl += inet_opt->opt.optlen >> 2;
390 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
391 }
392
393 ip_select_ident_more(iph, &rt->dst, sk,
394 (skb_shinfo(skb)->gso_segs ?: 1) - 1);
395
396 skb->priority = sk->sk_priority;
397 skb->mark = sk->sk_mark;
398
399 res = ip_local_out(skb);
400 rcu_read_unlock();
401 return res;
402
403 no_route:
404 rcu_read_unlock();
405 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
406 kfree_skb(skb);
407 return -EHOSTUNREACH;
408 }
409 EXPORT_SYMBOL(ip_queue_xmit);
410
411
412 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
413 {
414 to->pkt_type = from->pkt_type;
415 to->priority = from->priority;
416 to->protocol = from->protocol;
417 skb_dst_drop(to);
418 skb_dst_copy(to, from);
419 to->dev = from->dev;
420 to->mark = from->mark;
421
422 /* Copy the flags to each fragment. */
423 IPCB(to)->flags = IPCB(from)->flags;
424
425 #ifdef CONFIG_NET_SCHED
426 to->tc_index = from->tc_index;
427 #endif
428 nf_copy(to, from);
429 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
430 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
431 to->nf_trace = from->nf_trace;
432 #endif
433 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
434 to->ipvs_property = from->ipvs_property;
435 #endif
436 skb_copy_secmark(to, from);
437 }
438
439 /*
440 * This IP datagram is too large to be sent in one piece. Break it up into
441 * smaller pieces (each of size equal to IP header plus
442 * a block of the data of the original IP data part) that will yet fit in a
443 * single device frame, and queue such a frame for sending.
444 */
445
446 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
447 {
448 struct iphdr *iph;
449 int ptr;
450 struct net_device *dev;
451 struct sk_buff *skb2;
452 unsigned int mtu, hlen, left, len, ll_rs;
453 int offset;
454 __be16 not_last_frag;
455 struct rtable *rt = skb_rtable(skb);
456 int err = 0;
457
458 dev = rt->dst.dev;
459
460 /*
461 * Point into the IP datagram header.
462 */
463
464 iph = ip_hdr(skb);
465
466 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
467 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
468 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
469 htonl(ip_skb_dst_mtu(skb)));
470 kfree_skb(skb);
471 return -EMSGSIZE;
472 }
473
474 /*
475 * Setup starting values.
476 */
477
478 hlen = iph->ihl * 4;
479 mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
480 #ifdef CONFIG_BRIDGE_NETFILTER
481 if (skb->nf_bridge)
482 mtu -= nf_bridge_mtu_reduction(skb);
483 #endif
484 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
485
486 /* When frag_list is given, use it. First, check its validity:
487 * some transformers could create wrong frag_list or break existing
488 * one, it is not prohibited. In this case fall back to copying.
489 *
490 * LATER: this step can be merged to real generation of fragments,
491 * we can switch to copy when see the first bad fragment.
492 */
493 if (skb_has_frag_list(skb)) {
494 struct sk_buff *frag, *frag2;
495 int first_len = skb_pagelen(skb);
496
497 if (first_len - hlen > mtu ||
498 ((first_len - hlen) & 7) ||
499 ip_is_fragment(iph) ||
500 skb_cloned(skb))
501 goto slow_path;
502
503 skb_walk_frags(skb, frag) {
504 /* Correct geometry. */
505 if (frag->len > mtu ||
506 ((frag->len & 7) && frag->next) ||
507 skb_headroom(frag) < hlen)
508 goto slow_path_clean;
509
510 /* Partially cloned skb? */
511 if (skb_shared(frag))
512 goto slow_path_clean;
513
514 BUG_ON(frag->sk);
515 if (skb->sk) {
516 frag->sk = skb->sk;
517 frag->destructor = sock_wfree;
518 }
519 skb->truesize -= frag->truesize;
520 }
521
522 /* Everything is OK. Generate! */
523
524 err = 0;
525 offset = 0;
526 frag = skb_shinfo(skb)->frag_list;
527 skb_frag_list_init(skb);
528 skb->data_len = first_len - skb_headlen(skb);
529 skb->len = first_len;
530 iph->tot_len = htons(first_len);
531 iph->frag_off = htons(IP_MF);
532 ip_send_check(iph);
533
534 for (;;) {
535 /* Prepare header of the next frame,
536 * before previous one went down. */
537 if (frag) {
538 frag->ip_summed = CHECKSUM_NONE;
539 skb_reset_transport_header(frag);
540 __skb_push(frag, hlen);
541 skb_reset_network_header(frag);
542 memcpy(skb_network_header(frag), iph, hlen);
543 iph = ip_hdr(frag);
544 iph->tot_len = htons(frag->len);
545 ip_copy_metadata(frag, skb);
546 if (offset == 0)
547 ip_options_fragment(frag);
548 offset += skb->len - hlen;
549 iph->frag_off = htons(offset>>3);
550 if (frag->next != NULL)
551 iph->frag_off |= htons(IP_MF);
552 /* Ready, complete checksum */
553 ip_send_check(iph);
554 }
555
556 err = output(skb);
557
558 if (!err)
559 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
560 if (err || !frag)
561 break;
562
563 skb = frag;
564 frag = skb->next;
565 skb->next = NULL;
566 }
567
568 if (err == 0) {
569 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
570 return 0;
571 }
572
573 while (frag) {
574 skb = frag->next;
575 kfree_skb(frag);
576 frag = skb;
577 }
578 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
579 return err;
580
581 slow_path_clean:
582 skb_walk_frags(skb, frag2) {
583 if (frag2 == frag)
584 break;
585 frag2->sk = NULL;
586 frag2->destructor = NULL;
587 skb->truesize += frag2->truesize;
588 }
589 }
590
591 slow_path:
592 left = skb->len - hlen; /* Space per frame */
593 ptr = hlen; /* Where to start from */
594
595 /* for bridged IP traffic encapsulated inside f.e. a vlan header,
596 * we need to make room for the encapsulating header
597 */
598 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
599
600 /*
601 * Fragment the datagram.
602 */
603
604 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
605 not_last_frag = iph->frag_off & htons(IP_MF);
606
607 /*
608 * Keep copying data until we run out.
609 */
610
611 while (left > 0) {
612 len = left;
613 /* IF: it doesn't fit, use 'mtu' - the data space left */
614 if (len > mtu)
615 len = mtu;
616 /* IF: we are not sending up to and including the packet end
617 then align the next start on an eight byte boundary */
618 if (len < left) {
619 len &= ~7;
620 }
621 /*
622 * Allocate buffer.
623 */
624
625 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
626 NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
627 err = -ENOMEM;
628 goto fail;
629 }
630
631 /*
632 * Set up data on packet
633 */
634
635 ip_copy_metadata(skb2, skb);
636 skb_reserve(skb2, ll_rs);
637 skb_put(skb2, len + hlen);
638 skb_reset_network_header(skb2);
639 skb2->transport_header = skb2->network_header + hlen;
640
641 /*
642 * Charge the memory for the fragment to any owner
643 * it might possess
644 */
645
646 if (skb->sk)
647 skb_set_owner_w(skb2, skb->sk);
648
649 /*
650 * Copy the packet header into the new buffer.
651 */
652
653 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
654
655 /*
656 * Copy a block of the IP datagram.
657 */
658 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
659 BUG();
660 left -= len;
661
662 /*
663 * Fill in the new header fields.
664 */
665 iph = ip_hdr(skb2);
666 iph->frag_off = htons((offset >> 3));
667
668 /* ANK: dirty, but effective trick. Upgrade options only if
669 * the segment to be fragmented was THE FIRST (otherwise,
670 * options are already fixed) and make it ONCE
671 * on the initial skb, so that all the following fragments
672 * will inherit fixed options.
673 */
674 if (offset == 0)
675 ip_options_fragment(skb);
676
677 /*
678 * Added AC : If we are fragmenting a fragment that's not the
679 * last fragment then keep MF on each bit
680 */
681 if (left > 0 || not_last_frag)
682 iph->frag_off |= htons(IP_MF);
683 ptr += len;
684 offset += len;
685
686 /*
687 * Put this fragment into the sending queue.
688 */
689 iph->tot_len = htons(len + hlen);
690
691 ip_send_check(iph);
692
693 err = output(skb2);
694 if (err)
695 goto fail;
696
697 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
698 }
699 consume_skb(skb);
700 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
701 return err;
702
703 fail:
704 kfree_skb(skb);
705 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
706 return err;
707 }
708 EXPORT_SYMBOL(ip_fragment);
709
710 int
711 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
712 {
713 struct iovec *iov = from;
714
715 if (skb->ip_summed == CHECKSUM_PARTIAL) {
716 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
717 return -EFAULT;
718 } else {
719 __wsum csum = 0;
720 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
721 return -EFAULT;
722 skb->csum = csum_block_add(skb->csum, csum, odd);
723 }
724 return 0;
725 }
726 EXPORT_SYMBOL(ip_generic_getfrag);
727
728 static inline __wsum
729 csum_page(struct page *page, int offset, int copy)
730 {
731 char *kaddr;
732 __wsum csum;
733 kaddr = kmap(page);
734 csum = csum_partial(kaddr + offset, copy, 0);
735 kunmap(page);
736 return csum;
737 }
738
739 static inline int ip_ufo_append_data(struct sock *sk,
740 struct sk_buff_head *queue,
741 int getfrag(void *from, char *to, int offset, int len,
742 int odd, struct sk_buff *skb),
743 void *from, int length, int hh_len, int fragheaderlen,
744 int transhdrlen, int maxfraglen, unsigned int flags)
745 {
746 struct sk_buff *skb;
747 int err;
748
749 /* There is support for UDP fragmentation offload by network
750 * device, so create one single skb packet containing complete
751 * udp datagram
752 */
753 if ((skb = skb_peek_tail(queue)) == NULL) {
754 skb = sock_alloc_send_skb(sk,
755 hh_len + fragheaderlen + transhdrlen + 20,
756 (flags & MSG_DONTWAIT), &err);
757
758 if (skb == NULL)
759 return err;
760
761 /* reserve space for Hardware header */
762 skb_reserve(skb, hh_len);
763
764 /* create space for UDP/IP header */
765 skb_put(skb, fragheaderlen + transhdrlen);
766
767 /* initialize network header pointer */
768 skb_reset_network_header(skb);
769
770 /* initialize protocol header pointer */
771 skb->transport_header = skb->network_header + fragheaderlen;
772
773 skb->ip_summed = CHECKSUM_PARTIAL;
774 skb->csum = 0;
775
776 /* specify the length of each IP datagram fragment */
777 skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
778 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
779 __skb_queue_tail(queue, skb);
780 }
781
782 return skb_append_datato_frags(sk, skb, getfrag, from,
783 (length - transhdrlen));
784 }
785
786 static int __ip_append_data(struct sock *sk,
787 struct flowi4 *fl4,
788 struct sk_buff_head *queue,
789 struct inet_cork *cork,
790 int getfrag(void *from, char *to, int offset,
791 int len, int odd, struct sk_buff *skb),
792 void *from, int length, int transhdrlen,
793 unsigned int flags)
794 {
795 struct inet_sock *inet = inet_sk(sk);
796 struct sk_buff *skb;
797
798 struct ip_options *opt = cork->opt;
799 int hh_len;
800 int exthdrlen;
801 int mtu;
802 int copy;
803 int err;
804 int offset = 0;
805 unsigned int maxfraglen, fragheaderlen;
806 int csummode = CHECKSUM_NONE;
807 struct rtable *rt = (struct rtable *)cork->dst;
808
809 skb = skb_peek_tail(queue);
810
811 exthdrlen = !skb ? rt->dst.header_len : 0;
812 mtu = cork->fragsize;
813
814 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
815
816 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
817 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
818
819 if (cork->length + length > 0xFFFF - fragheaderlen) {
820 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
821 mtu-exthdrlen);
822 return -EMSGSIZE;
823 }
824
825 /*
826 * transhdrlen > 0 means that this is the first fragment and we wish
827 * it won't be fragmented in the future.
828 */
829 if (transhdrlen &&
830 length + fragheaderlen <= mtu &&
831 rt->dst.dev->features & NETIF_F_V4_CSUM &&
832 !exthdrlen)
833 csummode = CHECKSUM_PARTIAL;
834
835 cork->length += length;
836 if (((length > mtu) || (skb && skb_is_gso(skb))) &&
837 (sk->sk_protocol == IPPROTO_UDP) &&
838 (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
839 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
840 hh_len, fragheaderlen, transhdrlen,
841 maxfraglen, flags);
842 if (err)
843 goto error;
844 return 0;
845 }
846
847 /* So, what's going on in the loop below?
848 *
849 * We use calculated fragment length to generate chained skb,
850 * each of segments is IP fragment ready for sending to network after
851 * adding appropriate IP header.
852 */
853
854 if (!skb)
855 goto alloc_new_skb;
856
857 while (length > 0) {
858 /* Check if the remaining data fits into current packet. */
859 copy = mtu - skb->len;
860 if (copy < length)
861 copy = maxfraglen - skb->len;
862 if (copy <= 0) {
863 char *data;
864 unsigned int datalen;
865 unsigned int fraglen;
866 unsigned int fraggap;
867 unsigned int alloclen;
868 struct sk_buff *skb_prev;
869 alloc_new_skb:
870 skb_prev = skb;
871 if (skb_prev)
872 fraggap = skb_prev->len - maxfraglen;
873 else
874 fraggap = 0;
875
876 /*
877 * If remaining data exceeds the mtu,
878 * we know we need more fragment(s).
879 */
880 datalen = length + fraggap;
881 if (datalen > mtu - fragheaderlen)
882 datalen = maxfraglen - fragheaderlen;
883 fraglen = datalen + fragheaderlen;
884
885 if ((flags & MSG_MORE) &&
886 !(rt->dst.dev->features&NETIF_F_SG))
887 alloclen = mtu;
888 else
889 alloclen = fraglen;
890
891 alloclen += exthdrlen;
892
893 /* The last fragment gets additional space at tail.
894 * Note, with MSG_MORE we overallocate on fragments,
895 * because we have no idea what fragment will be
896 * the last.
897 */
898 if (datalen == length + fraggap)
899 alloclen += rt->dst.trailer_len;
900
901 if (transhdrlen) {
902 skb = sock_alloc_send_skb(sk,
903 alloclen + hh_len + 15,
904 (flags & MSG_DONTWAIT), &err);
905 } else {
906 skb = NULL;
907 if (atomic_read(&sk->sk_wmem_alloc) <=
908 2 * sk->sk_sndbuf)
909 skb = sock_wmalloc(sk,
910 alloclen + hh_len + 15, 1,
911 sk->sk_allocation);
912 if (unlikely(skb == NULL))
913 err = -ENOBUFS;
914 else
915 /* only the initial fragment is
916 time stamped */
917 cork->tx_flags = 0;
918 }
919 if (skb == NULL)
920 goto error;
921
922 /*
923 * Fill in the control structures
924 */
925 skb->ip_summed = csummode;
926 skb->csum = 0;
927 skb_reserve(skb, hh_len);
928 skb_shinfo(skb)->tx_flags = cork->tx_flags;
929
930 /*
931 * Find where to start putting bytes.
932 */
933 data = skb_put(skb, fraglen + exthdrlen);
934 skb_set_network_header(skb, exthdrlen);
935 skb->transport_header = (skb->network_header +
936 fragheaderlen);
937 data += fragheaderlen + exthdrlen;
938
939 if (fraggap) {
940 skb->csum = skb_copy_and_csum_bits(
941 skb_prev, maxfraglen,
942 data + transhdrlen, fraggap, 0);
943 skb_prev->csum = csum_sub(skb_prev->csum,
944 skb->csum);
945 data += fraggap;
946 pskb_trim_unique(skb_prev, maxfraglen);
947 }
948
949 copy = datalen - transhdrlen - fraggap;
950 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
951 err = -EFAULT;
952 kfree_skb(skb);
953 goto error;
954 }
955
956 offset += copy;
957 length -= datalen - fraggap;
958 transhdrlen = 0;
959 exthdrlen = 0;
960 csummode = CHECKSUM_NONE;
961
962 /*
963 * Put the packet on the pending queue.
964 */
965 __skb_queue_tail(queue, skb);
966 continue;
967 }
968
969 if (copy > length)
970 copy = length;
971
972 if (!(rt->dst.dev->features&NETIF_F_SG)) {
973 unsigned int off;
974
975 off = skb->len;
976 if (getfrag(from, skb_put(skb, copy),
977 offset, copy, off, skb) < 0) {
978 __skb_trim(skb, off);
979 err = -EFAULT;
980 goto error;
981 }
982 } else {
983 int i = skb_shinfo(skb)->nr_frags;
984 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
985 struct page *page = cork->page;
986 int off = cork->off;
987 unsigned int left;
988
989 if (page && (left = PAGE_SIZE - off) > 0) {
990 if (copy >= left)
991 copy = left;
992 if (page != skb_frag_page(frag)) {
993 if (i == MAX_SKB_FRAGS) {
994 err = -EMSGSIZE;
995 goto error;
996 }
997 skb_fill_page_desc(skb, i, page, off, 0);
998 skb_frag_ref(skb, i);
999 frag = &skb_shinfo(skb)->frags[i];
1000 }
1001 } else if (i < MAX_SKB_FRAGS) {
1002 if (copy > PAGE_SIZE)
1003 copy = PAGE_SIZE;
1004 page = alloc_pages(sk->sk_allocation, 0);
1005 if (page == NULL) {
1006 err = -ENOMEM;
1007 goto error;
1008 }
1009 cork->page = page;
1010 cork->off = 0;
1011
1012 skb_fill_page_desc(skb, i, page, 0, 0);
1013 frag = &skb_shinfo(skb)->frags[i];
1014 } else {
1015 err = -EMSGSIZE;
1016 goto error;
1017 }
1018 if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
1019 offset, copy, skb->len, skb) < 0) {
1020 err = -EFAULT;
1021 goto error;
1022 }
1023 cork->off += copy;
1024 skb_frag_size_add(frag, copy);
1025 skb->len += copy;
1026 skb->data_len += copy;
1027 skb->truesize += copy;
1028 atomic_add(copy, &sk->sk_wmem_alloc);
1029 }
1030 offset += copy;
1031 length -= copy;
1032 }
1033
1034 return 0;
1035
1036 error:
1037 cork->length -= length;
1038 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1039 return err;
1040 }
1041
1042 static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1043 struct ipcm_cookie *ipc, struct rtable **rtp)
1044 {
1045 struct inet_sock *inet = inet_sk(sk);
1046 struct ip_options_rcu *opt;
1047 struct rtable *rt;
1048
1049 /*
1050 * setup for corking.
1051 */
1052 opt = ipc->opt;
1053 if (opt) {
1054 if (cork->opt == NULL) {
1055 cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1056 sk->sk_allocation);
1057 if (unlikely(cork->opt == NULL))
1058 return -ENOBUFS;
1059 }
1060 memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1061 cork->flags |= IPCORK_OPT;
1062 cork->addr = ipc->addr;
1063 }
1064 rt = *rtp;
1065 if (unlikely(!rt))
1066 return -EFAULT;
1067 /*
1068 * We steal reference to this route, caller should not release it
1069 */
1070 *rtp = NULL;
1071 cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1072 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1073 cork->dst = &rt->dst;
1074 cork->length = 0;
1075 cork->tx_flags = ipc->tx_flags;
1076 cork->page = NULL;
1077 cork->off = 0;
1078
1079 return 0;
1080 }
1081
1082 /*
1083 * ip_append_data() and ip_append_page() can make one large IP datagram
1084 * from many pieces of data. Each pieces will be holded on the socket
1085 * until ip_push_pending_frames() is called. Each piece can be a page
1086 * or non-page data.
1087 *
1088 * Not only UDP, other transport protocols - e.g. raw sockets - can use
1089 * this interface potentially.
1090 *
1091 * LATER: length must be adjusted by pad at tail, when it is required.
1092 */
1093 int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1094 int getfrag(void *from, char *to, int offset, int len,
1095 int odd, struct sk_buff *skb),
1096 void *from, int length, int transhdrlen,
1097 struct ipcm_cookie *ipc, struct rtable **rtp,
1098 unsigned int flags)
1099 {
1100 struct inet_sock *inet = inet_sk(sk);
1101 int err;
1102
1103 if (flags&MSG_PROBE)
1104 return 0;
1105
1106 if (skb_queue_empty(&sk->sk_write_queue)) {
1107 err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1108 if (err)
1109 return err;
1110 } else {
1111 transhdrlen = 0;
1112 }
1113
1114 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1115 from, length, transhdrlen, flags);
1116 }
1117
1118 ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1119 int offset, size_t size, int flags)
1120 {
1121 struct inet_sock *inet = inet_sk(sk);
1122 struct sk_buff *skb;
1123 struct rtable *rt;
1124 struct ip_options *opt = NULL;
1125 struct inet_cork *cork;
1126 int hh_len;
1127 int mtu;
1128 int len;
1129 int err;
1130 unsigned int maxfraglen, fragheaderlen, fraggap;
1131
1132 if (inet->hdrincl)
1133 return -EPERM;
1134
1135 if (flags&MSG_PROBE)
1136 return 0;
1137
1138 if (skb_queue_empty(&sk->sk_write_queue))
1139 return -EINVAL;
1140
1141 cork = &inet->cork.base;
1142 rt = (struct rtable *)cork->dst;
1143 if (cork->flags & IPCORK_OPT)
1144 opt = cork->opt;
1145
1146 if (!(rt->dst.dev->features&NETIF_F_SG))
1147 return -EOPNOTSUPP;
1148
1149 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1150 mtu = cork->fragsize;
1151
1152 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1153 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1154
1155 if (cork->length + size > 0xFFFF - fragheaderlen) {
1156 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
1157 return -EMSGSIZE;
1158 }
1159
1160 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1161 return -EINVAL;
1162
1163 cork->length += size;
1164 if ((size + skb->len > mtu) &&
1165 (sk->sk_protocol == IPPROTO_UDP) &&
1166 (rt->dst.dev->features & NETIF_F_UFO)) {
1167 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1168 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1169 }
1170
1171
1172 while (size > 0) {
1173 int i;
1174
1175 if (skb_is_gso(skb))
1176 len = size;
1177 else {
1178
1179 /* Check if the remaining data fits into current packet. */
1180 len = mtu - skb->len;
1181 if (len < size)
1182 len = maxfraglen - skb->len;
1183 }
1184 if (len <= 0) {
1185 struct sk_buff *skb_prev;
1186 int alloclen;
1187
1188 skb_prev = skb;
1189 fraggap = skb_prev->len - maxfraglen;
1190
1191 alloclen = fragheaderlen + hh_len + fraggap + 15;
1192 skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1193 if (unlikely(!skb)) {
1194 err = -ENOBUFS;
1195 goto error;
1196 }
1197
1198 /*
1199 * Fill in the control structures
1200 */
1201 skb->ip_summed = CHECKSUM_NONE;
1202 skb->csum = 0;
1203 skb_reserve(skb, hh_len);
1204
1205 /*
1206 * Find where to start putting bytes.
1207 */
1208 skb_put(skb, fragheaderlen + fraggap);
1209 skb_reset_network_header(skb);
1210 skb->transport_header = (skb->network_header +
1211 fragheaderlen);
1212 if (fraggap) {
1213 skb->csum = skb_copy_and_csum_bits(skb_prev,
1214 maxfraglen,
1215 skb_transport_header(skb),
1216 fraggap, 0);
1217 skb_prev->csum = csum_sub(skb_prev->csum,
1218 skb->csum);
1219 pskb_trim_unique(skb_prev, maxfraglen);
1220 }
1221
1222 /*
1223 * Put the packet on the pending queue.
1224 */
1225 __skb_queue_tail(&sk->sk_write_queue, skb);
1226 continue;
1227 }
1228
1229 i = skb_shinfo(skb)->nr_frags;
1230 if (len > size)
1231 len = size;
1232 if (skb_can_coalesce(skb, i, page, offset)) {
1233 skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
1234 } else if (i < MAX_SKB_FRAGS) {
1235 get_page(page);
1236 skb_fill_page_desc(skb, i, page, offset, len);
1237 } else {
1238 err = -EMSGSIZE;
1239 goto error;
1240 }
1241
1242 if (skb->ip_summed == CHECKSUM_NONE) {
1243 __wsum csum;
1244 csum = csum_page(page, offset, len);
1245 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1246 }
1247
1248 skb->len += len;
1249 skb->data_len += len;
1250 skb->truesize += len;
1251 atomic_add(len, &sk->sk_wmem_alloc);
1252 offset += len;
1253 size -= len;
1254 }
1255 return 0;
1256
1257 error:
1258 cork->length -= size;
1259 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1260 return err;
1261 }
1262
1263 static void ip_cork_release(struct inet_cork *cork)
1264 {
1265 cork->flags &= ~IPCORK_OPT;
1266 kfree(cork->opt);
1267 cork->opt = NULL;
1268 dst_release(cork->dst);
1269 cork->dst = NULL;
1270 }
1271
1272 /*
1273 * Combined all pending IP fragments on the socket as one IP datagram
1274 * and push them out.
1275 */
1276 struct sk_buff *__ip_make_skb(struct sock *sk,
1277 struct flowi4 *fl4,
1278 struct sk_buff_head *queue,
1279 struct inet_cork *cork)
1280 {
1281 struct sk_buff *skb, *tmp_skb;
1282 struct sk_buff **tail_skb;
1283 struct inet_sock *inet = inet_sk(sk);
1284 struct net *net = sock_net(sk);
1285 struct ip_options *opt = NULL;
1286 struct rtable *rt = (struct rtable *)cork->dst;
1287 struct iphdr *iph;
1288 __be16 df = 0;
1289 __u8 ttl;
1290
1291 if ((skb = __skb_dequeue(queue)) == NULL)
1292 goto out;
1293 tail_skb = &(skb_shinfo(skb)->frag_list);
1294
1295 /* move skb->data to ip header from ext header */
1296 if (skb->data < skb_network_header(skb))
1297 __skb_pull(skb, skb_network_offset(skb));
1298 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1299 __skb_pull(tmp_skb, skb_network_header_len(skb));
1300 *tail_skb = tmp_skb;
1301 tail_skb = &(tmp_skb->next);
1302 skb->len += tmp_skb->len;
1303 skb->data_len += tmp_skb->len;
1304 skb->truesize += tmp_skb->truesize;
1305 tmp_skb->destructor = NULL;
1306 tmp_skb->sk = NULL;
1307 }
1308
1309 /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1310 * to fragment the frame generated here. No matter, what transforms
1311 * how transforms change size of the packet, it will come out.
1312 */
1313 if (inet->pmtudisc < IP_PMTUDISC_DO)
1314 skb->local_df = 1;
1315
1316 /* DF bit is set when we want to see DF on outgoing frames.
1317 * If local_df is set too, we still allow to fragment this frame
1318 * locally. */
1319 if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1320 (skb->len <= dst_mtu(&rt->dst) &&
1321 ip_dont_fragment(sk, &rt->dst)))
1322 df = htons(IP_DF);
1323
1324 if (cork->flags & IPCORK_OPT)
1325 opt = cork->opt;
1326
1327 if (rt->rt_type == RTN_MULTICAST)
1328 ttl = inet->mc_ttl;
1329 else
1330 ttl = ip_select_ttl(inet, &rt->dst);
1331
1332 iph = (struct iphdr *)skb->data;
1333 iph->version = 4;
1334 iph->ihl = 5;
1335 iph->tos = inet->tos;
1336 iph->frag_off = df;
1337 ip_select_ident(iph, &rt->dst, sk);
1338 iph->ttl = ttl;
1339 iph->protocol = sk->sk_protocol;
1340 ip_copy_addrs(iph, fl4);
1341
1342 if (opt) {
1343 iph->ihl += opt->optlen>>2;
1344 ip_options_build(skb, opt, cork->addr, rt, 0);
1345 }
1346
1347 skb->priority = sk->sk_priority;
1348 skb->mark = sk->sk_mark;
1349 /*
1350 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1351 * on dst refcount
1352 */
1353 cork->dst = NULL;
1354 skb_dst_set(skb, &rt->dst);
1355
1356 if (iph->protocol == IPPROTO_ICMP)
1357 icmp_out_count(net, ((struct icmphdr *)
1358 skb_transport_header(skb))->type);
1359
1360 ip_cork_release(cork);
1361 out:
1362 return skb;
1363 }
1364
1365 int ip_send_skb(struct sk_buff *skb)
1366 {
1367 struct net *net = sock_net(skb->sk);
1368 int err;
1369
1370 err = ip_local_out(skb);
1371 if (err) {
1372 if (err > 0)
1373 err = net_xmit_errno(err);
1374 if (err)
1375 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1376 }
1377
1378 return err;
1379 }
1380
/*
 *	Build one skb out of everything pending on the socket and send it.
 *
 *	Returns 0 when ip_finish_skb() yields no skb, otherwise the result of
 *	ip_send_skb().
 */
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb = ip_finish_skb(sk, fl4);

	/* Netfilter gets the whole, not yet fragmented skb. */
	return skb ? ip_send_skb(skb) : 0;
}
1392
/*
 *	Throw away all pending data: free every skb on @queue, taking them
 *	from the tail, then release what @cork holds.
 */
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
{
	for (;;) {
		struct sk_buff *skb = __skb_dequeue_tail(queue);

		if (!skb)
			break;
		kfree_skb(skb);
	}

	ip_cork_release(cork);
}
1407
1408 void ip_flush_pending_frames(struct sock *sk)
1409 {
1410 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1411 }
1412
1413 struct sk_buff *ip_make_skb(struct sock *sk,
1414 struct flowi4 *fl4,
1415 int getfrag(void *from, char *to, int offset,
1416 int len, int odd, struct sk_buff *skb),
1417 void *from, int length, int transhdrlen,
1418 struct ipcm_cookie *ipc, struct rtable **rtp,
1419 unsigned int flags)
1420 {
1421 struct inet_cork cork;
1422 struct sk_buff_head queue;
1423 int err;
1424
1425 if (flags & MSG_PROBE)
1426 return NULL;
1427
1428 __skb_queue_head_init(&queue);
1429
1430 cork.flags = 0;
1431 cork.addr = 0;
1432 cork.opt = NULL;
1433 err = ip_setup_cork(sk, &cork, ipc, rtp);
1434 if (err)
1435 return ERR_PTR(err);
1436
1437 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1438 from, length, transhdrlen, flags);
1439 if (err) {
1440 __ip_flush_pending_frames(sk, &queue, &cork);
1441 return ERR_PTR(err);
1442 }
1443
1444 return __ip_make_skb(sk, fl4, &queue, &cork);
1445 }
1446
1447 /*
1448 * Fetch data from kernel space and fill in checksum if needed.
1449 */
1450 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1451 int len, int odd, struct sk_buff *skb)
1452 {
1453 __wsum csum;
1454
1455 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1456 skb->csum = csum_block_add(skb->csum, csum, odd);
1457 return 0;
1458 }
1459
1460 /*
1461 * Generic function to send a packet as reply to another packet.
1462 * Used to send TCP resets so far. ICMP should use this function too.
1463 *
1464 * Should run single threaded per socket because it uses the sock
1465 * structure to pass arguments.
1466 */
1467 void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1468 const struct ip_reply_arg *arg, unsigned int len)
1469 {
1470 struct inet_sock *inet = inet_sk(sk);
1471 struct ip_options_data replyopts;
1472 struct ipcm_cookie ipc;
1473 struct flowi4 fl4;
1474 struct rtable *rt = skb_rtable(skb);
1475
1476 if (ip_options_echo(&replyopts.opt.opt, skb))
1477 return;
1478
1479 ipc.addr = daddr;
1480 ipc.opt = NULL;
1481 ipc.tx_flags = 0;
1482
1483 if (replyopts.opt.opt.optlen) {
1484 ipc.opt = &replyopts.opt;
1485
1486 if (replyopts.opt.opt.srr)
1487 daddr = replyopts.opt.opt.faddr;
1488 }
1489
1490 flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1491 RT_TOS(arg->tos),
1492 RT_SCOPE_UNIVERSE, sk->sk_protocol,
1493 ip_reply_arg_flowi_flags(arg),
1494 daddr, rt->rt_spec_dst,
1495 tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1496 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1497 rt = ip_route_output_key(sock_net(sk), &fl4);
1498 if (IS_ERR(rt))
1499 return;
1500
1501 /* And let IP do all the hard work.
1502
1503 This chunk is not reenterable, hence spinlock.
1504 Note that it uses the fact, that this function is called
1505 with locally disabled BH and that sk cannot be already spinlocked.
1506 */
1507 bh_lock_sock(sk);
1508 inet->tos = arg->tos;
1509 sk->sk_priority = skb->priority;
1510 sk->sk_protocol = ip_hdr(skb)->protocol;
1511 sk->sk_bound_dev_if = arg->bound_dev_if;
1512 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1513 &ipc, &rt, MSG_DONTWAIT);
1514 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1515 if (arg->csumoffset >= 0)
1516 *((__sum16 *)skb_transport_header(skb) +
1517 arg->csumoffset) = csum_fold(csum_add(skb->csum,
1518 arg->csum));
1519 skb->ip_summed = CHECKSUM_NONE;
1520 ip_push_pending_frames(sk, &fl4);
1521 }
1522
1523 bh_unlock_sock(sk);
1524
1525 ip_rt_put(rt);
1526 }
1527
1528 void __init ip_init(void)
1529 {
1530 ip_rt_init();
1531 inet_initpeers();
1532
1533 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1534 igmp_mc_proc_init();
1535 #endif
1536 }