/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case if packet not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *		Vitaly E. Lavrov :	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);

int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);

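/*
 * Note: nf_hook() returns 1 when the LOCAL_OUT hooks accept the packet
 * without stealing or queueing it; only then does ip_local_out() hand
 * the skb on to dst_output(). Any other return value means a hook has
 * taken over (or dropped) the packet.
 */
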
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = ip4_dst_hoplimit(dst);
	return ttl;
}

/*
 *		Add an ip header to a skbuff and send it out.
 *
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
	iph->saddr    = saddr;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(iph, &rt->dst, sk);

	if (opt && opt->opt.optlen) {
		iph->ihl += opt->opt.optlen>>2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

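/*
 * Usage note: ip_build_and_send_pkt() serves callers that already hold
 * a routed skb and only need an IPv4 header prepended; TCP, for
 * instance, uses it to emit SYN-ACK segments from the request socket
 * path. (Exact call sites vary between kernel versions.)
 */
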
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;
	u32 nexthop;

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	rcu_read_lock_bh();
	nexthop = rt->rt_gateway ? rt->rt_gateway : ip_hdr(skb)->daddr;
	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
	if (!IS_ERR(neigh)) {
		int res = dst_neigh_output(dst, neigh, skb);

		rcu_read_unlock_bh();
		return res;
	}
	rcu_read_unlock_bh();

	net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
			    __func__);
	kfree_skb(skb);
	return -EINVAL;
}

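/*
 * Note: the next hop above is the route's gateway when one exists,
 * otherwise the packet's destination itself (an on-link host). The
 * lookup runs under rcu_read_lock_bh(), so the neighbour entry can be
 * used without taking a reference on it.
 */
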
static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}

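/*
 * GSO packets are deliberately not fragmented here even when they
 * exceed the MTU: they are segmented into MTU-sized packets later, at
 * dev_queue_xmit() time or by the hardware itself.
 */
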
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that returned after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, dev_loopback_xmit);
	}

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

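/*
 * NF_HOOK_COND only traverses the POST_ROUTING chain when the condition
 * holds; packets already rerouted through xfrm (IPSKB_REROUTED) have
 * passed it once and go straight to ip_finish_output().
 */
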
/*
 * copy saddr and daddr, possibly using 64bit load/stores
 * Equivalent to :
 *   iph->saddr = fl4->saddr;
 *   iph->daddr = fl4->daddr;
 */
static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
{
	BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
		     offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
	memcpy(&iph->saddr, &fl4->saddr,
	       sizeof(fl4->saddr) + sizeof(fl4->daddr));
}

int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct flowi4 *fl4;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * f.e. by something like SCTP.
	 */
	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	fl4 = &fl->u.ip4;
	rt = skb_rtable(skb);
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;

		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times
		 * itself out.
		 */
		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS(sk),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	ip_copy_addrs(iph, fl4);

	/* Transport layer set skb->h.foo itself. */

	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);


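/*
 * Usage sketch: connection-oriented transports call this once the
 * transport header is in place; TCP, for instance, transmits via
 * icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl), which resolves to
 * ip_queue_xmit() for IPv4 sockets. (The exact plumbing differs across
 * kernel versions.)
 */
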
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create wrong frag_list or break existing
	 * one, it is not prohibited. In this case fall back to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when we see the first bad fragment.
	 */
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    ip_is_fragment(iph) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF set on each fragment.
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	consume_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_fragment);

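/*
 * A quick sketch of the two paths above: when the skb already carries a
 * well-formed frag_list (e.g. built by __ip_append_data()), each list
 * member becomes a fragment in place, avoiding a copy. Otherwise the
 * slow path allocates a fresh skb per fragment and copies data into it.
 * In both cases the fragment offset field counts 8-byte units, which is
 * why every fragment but the last must carry a multiple-of-8 payload
 * (the "len &= ~7" and "& 7" checks).
 */
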
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
EXPORT_SYMBOL(ip_generic_getfrag);

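/*
 * This is the stock getfrag callback for sockets copying data from a
 * userspace iovec; udp_sendmsg(), for one, passes it to ip_append_data()
 * with the message iovec as the "from" cookie, so checksumming happens
 * while the data is being copied.
 */
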
static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

static inline int ip_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int maxfraglen, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* The network device supports UDP fragmentation offload, so
	 * create a single skb containing the complete UDP datagram.
	 */
	if ((skb = skb_peek_tail(queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

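/*
 * With UFO the stack hands the device one oversized UDP datagram and
 * records the per-fragment payload size in gso_size; the device (or the
 * software GSO fallback) later splits it into on-the-wire IP fragments,
 * so ip_fragment() never sees such a packet.
 */
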
static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = cork->opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;

	skb = skb_peek_tail(queue);

	exthdrlen = !skb ? rt->dst.header_len : 0;
	mtu = cork->fragsize;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it will not be fragmented later.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	cork->length += length;
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
					 hh_len, fragheaderlen, transhdrlen,
					 maxfraglen, flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use the calculated fragment length to generate a chained skb;
	 * each segment is an IP fragment ready for sending to the network
	 * once an appropriate IP header is added.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = fraglen;

			alloclen += exthdrlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					cork->tx_flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			skb_shinfo(skb)->tx_flags = cork->tx_flags;

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen + exthdrlen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = cork->page;
			int off = cork->off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != skb_frag_page(frag)) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, off, 0);
					skb_frag_ref(skb, i);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL)  {
					err = -ENOMEM;
					goto error;
				}
				cork->page = page;
				cork->off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			cork->off += copy;
			skb_frag_size_add(frag, copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

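/*
 * In short: __ip_append_data() grows a queue of pending skbs, each
 * shaped so it can become one IP fragment. Data lands in the linear
 * area on non-SG devices and in page frags otherwise; maxfraglen keeps
 * every fragment payload except the last a multiple of 8 bytes, as the
 * IP fragment offset field requires.
 */
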
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *opt;
	struct rtable *rt;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
		if (cork->opt == NULL) {
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
			if (unlikely(cork->opt == NULL))
				return -ENOBUFS;
		}
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;
	/*
	 * We steal reference to this route, caller should not release it
	 */
	*rtp = NULL;
	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
	cork->dst = &rt->dst;
	cork->length = 0;
	cork->tx_flags = ipc->tx_flags;
	cork->page = NULL;
	cork->off = 0;

	return 0;
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data. Each piece will be held on the socket
 *	until ip_push_pending_frames() is called. Each piece can be a page
 *	or non-page data.
 *
 *	Besides UDP, other transport protocols - e.g. raw sockets - can
 *	potentially use this interface.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}

	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
				from, length, transhdrlen, flags);
}

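/*
 * Typical corked-send sequence (a sketch; error handling and locking
 * omitted, and the exact arguments are illustrative):
 *
 *	err = ip_append_data(sk, &fl4, ip_generic_getfrag, msg->msg_iov,
 *			     len, sizeof(struct udphdr), &ipc, &rt,
 *			     msg->msg_flags);
 *	if (!err && !(msg->msg_flags & MSG_MORE))
 *		err = ip_push_pending_frames(sk, &fl4);
 *
 * This mirrors roughly what udp_sendmsg() does when the socket is
 * corked.
 */
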
ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	cork->length += size;
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}


	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip_cork_release(struct inet_cork *cork)
{
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
}

/*
 *	Combined all pending IP fragments on the socket as one IP datagram
 *	and push them out.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
			      struct flowi4 *fl4,
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)cork->dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;

	if ((skb = __skb_dequeue(queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
	 * to fragment the frame generated here. No matter how transforms
	 * change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow to fragment this frame
	 * locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	iph->frag_off = df;
	ip_select_ident(iph, &rt->dst, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	ip_copy_addrs(iph, fl4);

	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
	cork->dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	ip_cork_release(cork);
out:
	return skb;
}

int ip_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	int err;

	err = ip_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb;

	skb = ip_finish_skb(sk, fl4);
	if (!skb)
		return 0;

	/* Netfilter gets the whole, not yet fragmented skb. */
	return ip_send_skb(skb);
}

/*
 *	Throw away all pending data on the socket.
 */
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL)
		kfree_skb(skb);

	ip_cork_release(cork);
}

void ip_flush_pending_frames(struct sock *sk)
{
	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
}

struct sk_buff *ip_make_skb(struct sock *sk,
			    struct flowi4 *fl4,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
			    unsigned int flags)
{
	struct inet_cork cork;
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.flags = 0;
	cork.addr = 0;
	cork.opt = NULL;
	err = ip_setup_cork(sk, &cork, ipc, rtp);
	if (err)
		return ERR_PTR(err);

	err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
			       from, length, transhdrlen, flags);
	if (err) {
		__ip_flush_pending_frames(sk, &queue, &cork);
		return ERR_PTR(err);
	}

	return __ip_make_skb(sk, fl4, &queue, &cork);
}

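/*
 * Unlike ip_append_data(), ip_make_skb() assembles the datagram on a
 * private queue with an on-stack cork, so an uncorked UDP send can
 * build and transmit a packet without touching sk->sk_write_queue or
 * holding the socket lock for the append phase.
 */
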
/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}

/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
			   __be32 saddr, const struct ip_reply_arg *arg,
			   unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_data replyopts;
	struct ipcm_cookie ipc;
	struct flowi4 fl4;
	struct rtable *rt = skb_rtable(skb);

	if (ip_options_echo(&replyopts.opt.opt, skb))
		return;

	ipc.addr = daddr;
	ipc.opt = NULL;
	ipc.tx_flags = 0;

	if (replyopts.opt.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
	}

	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
			   RT_TOS(arg->tos),
			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
			   ip_reply_arg_flowi_flags(arg),
			   daddr, saddr,
			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
	rt = ip_route_output_key(sock_net(sk), &fl4);
	if (IS_ERR(rt))
		return;

	/* And let IP do all the hard work.

	   This chunk is not reentrant, hence the spinlock.
	   Note that it relies on the fact that this function is called
	   with BHs locally disabled and that sk cannot already be
	   spinlocked.
	 */
	bh_lock_sock(sk);
	inet->tos = arg->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, &rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk, &fl4);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}

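/*
 * TCP uses this to answer segments that have no full socket of their
 * own, e.g. tcp_v4_send_reset() and tcp_v4_send_ack(); such replies go
 * out via a dedicated kernel control socket, and the call sites run
 * with BHs disabled, matching the locking comment above.
 */
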
void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}