/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case if packet not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);

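/*
 * Finalise the IP header (total length and checksum) and run the packet
 * through the NF_INET_LOCAL_OUT hook; ip_local_out() then hands packets
 * that the hook accepted straight to dst_output().
 */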
int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));
	netif_rx_ni(newskb);
	return 0;
}

static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = ip4_dst_hoplimit(dst);
	return ttl;
}

/*
 *		Add an ip header to a skbuff and send it out.
 *
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
	iph->saddr    = saddr;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(iph, &rt->dst, sk);

	if (opt && opt->opt.optlen) {
		iph->ihl += opt->opt.optlen>>2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

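/*
 * Last step before handing the packet to the device: make sure there is
 * enough headroom for the link-layer header, then use the cached hardware
 * header (dst->hh) or the neighbour output function to transmit it.
 */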
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}

static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

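/*
 * Runs as the okfn of the POST_ROUTING hook: if an xfrm policy was attached
 * after NAT, loop the packet back through dst_output(); otherwise fragment
 * anything larger than the path MTU (unless GSO will segment it) and pass
 * the result to ip_finish_output2().
 */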
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}

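/*
 * Output path for locally generated multicast and broadcast packets:
 * loop a copy back to local listeners where required, then send the
 * original through the POST_ROUTING hook.
 */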
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loopback not local frames,
		   which returned after forwarding; they will be  dropped
		   by ip_mr_input in any case.
		   Note, that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, ip_dev_loopback_xmit);
	}

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

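/*
 * Standard unicast output path: account the packet in the OUT counters
 * and run it through the POST_ROUTING hook before ip_finish_output().
 */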
int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

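/*
 * Transmit entry point for connection-oriented sockets such as TCP:
 * route the packet unless it is routed already, build the IP header from
 * the socket and flow information, then queue it via ip_local_out().
 */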
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct flowi4 *fl4;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * f.e. by something like SCTP.
	 */
	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	fl4 = &fl->u.ip4;
	rt = skb_rtable(skb);
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;

		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times
		 * itself out.
		 */
		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS(sk),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr    = fl4->saddr;
	iph->daddr    = fl4->daddr;
	/* Transport layer set skb->h.foo itself. */

	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);


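/*
 * Propagate per-packet metadata (packet type, priority, dst, device,
 * netfilter and scheduler state) from the original skb to a fragment.
 */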
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create wrong frag_list or break existing
	 * one, it is not prohibited. In this case fall back to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when see the first bad fragment.
	 */
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF on each bit
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_fragment);

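/*
 * Default getfrag() callback for ip_append_data(): copy data from the
 * caller's iovec into the skb, accumulating the checksum in software
 * unless the packet is marked CHECKSUM_PARTIAL for the device.
 */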
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
EXPORT_SYMBOL(ip_generic_getfrag);

static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

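/*
 * UDP fragmentation offload path: rather than fragmenting in software,
 * queue one large skb marked SKB_GSO_UDP and let the device segment it.
 */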
static inline int ip_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP fragmentation offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

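/*
 * Work horse behind ip_append_data() and ip_make_skb(): append the
 * caller's data to the given queue as a chain of MTU-sized fragments,
 * charging the memory to the socket.
 */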
static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = cork->opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;

	exthdrlen = transhdrlen ? rt->dst.header_len : 0;
	length += exthdrlen;
	transhdrlen += exthdrlen;
	mtu = cork->fragsize;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
			       mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	skb = skb_peek_tail(queue);

	cork->length += length;
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
					 hh_len, fragheaderlen, transhdrlen,
					 mtu, flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use calculated fragment length to generate chained skb,
	 * each of segments is IP fragment ready for sending to network after
	 * adding appropriate IP header.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = fraglen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap) {
				alloclen += rt->dst.trailer_len;
				/* make sure mtu is not reached */
				if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
					datalen -= ALIGN(rt->dst.trailer_len, 8);
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					cork->tx_flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			skb_shinfo(skb)->tx_flags = cork->tx_flags;

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = cork->page;
			int off = cork->off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL)  {
					err = -ENOMEM;
					goto error;
				}
				cork->page = page;
				cork->off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			cork->off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

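/*
 * Initialise the cork from the per-call control information (options,
 * destination, fragment size) and steal the caller's route reference.
 */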
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *opt;
	struct rtable *rt;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
		if (cork->opt == NULL) {
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
			if (unlikely(cork->opt == NULL))
				return -ENOBUFS;
		}
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;
	/*
	 * We steal reference to this route, caller should not release it
	 */
	*rtp = NULL;
	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
			 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
	cork->dst = &rt->dst;
	cork->length = 0;
	cork->tx_flags = ipc->tx_flags;
	cork->page = NULL;
	cork->off = 0;

	return 0;
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data. Each piece will be held on the socket
 *	until ip_push_pending_frames() is called. Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP, other transport protocols - e.g. raw sockets - can use
 *	this interface potentially.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}

	return __ip_append_data(sk, &sk->sk_write_queue, &inet->cork.base, getfrag,
				from, length, transhdrlen, flags);
}

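/*
 * Zero-copy companion to ip_append_data(): attach already existing page
 * fragments to the last pending skb. Requires scatter/gather support
 * (NETIF_F_SG) on the output device.
 */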
ssize_t	ip_append_page(struct sock *sk, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	cork->length += size;
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}


	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i-1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip_cork_release(struct inet_cork *cork)
{
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
}

/*
 *	Combine all pending IP fragments on the socket into one IP datagram
 *	and push them out.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)cork->dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;

	if ((skb = __skb_dequeue(queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
	 * to fragment the frame generated here. No matter how transforms
	 * change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow to fragment this frame
	 * locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}
	iph->tos = inet->tos;
	iph->frag_off = df;
	ip_select_ident(iph, &rt->dst, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = rt->rt_src;
	iph->daddr = rt->rt_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
	cork->dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	ip_cork_release(cork);
out:
	return skb;
}

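/*
 * Hand a completed datagram to ip_local_out() and fold any NET_XMIT_*
 * return code into an errno for the caller.
 */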
int ip_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	int err;

	err = ip_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip_finish_skb(sk);
	if (!skb)
		return 0;

	/* Netfilter gets the whole, not yet fragmented skb. */
	return ip_send_skb(skb);
}

/*
 *	Throw away all pending data on the socket.
 */
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL)
		kfree_skb(skb);

	ip_cork_release(cork);
}

void ip_flush_pending_frames(struct sock *sk)
{
	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
}

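/*
 * Variant of ip_append_data() that builds a single datagram on a private
 * queue with an on-stack cork, so it never touches sk_write_queue; used
 * when the socket is not corked.
 */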
struct sk_buff *ip_make_skb(struct sock *sk,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
			    unsigned int flags)
{
	struct inet_cork cork;
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.flags = 0;
	cork.addr = 0;
	cork.opt = NULL;
	err = ip_setup_cork(sk, &cork, ipc, rtp);
	if (err)
		return ERR_PTR(err);

	err = __ip_append_data(sk, &queue, &cork, getfrag,
			       from, length, transhdrlen, flags);
	if (err) {
		__ip_flush_pending_frames(sk, &queue, &cork);
		return ERR_PTR(err);
	}

	return __ip_make_skb(sk, &queue, &cork);
}

/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}

/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far. ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *     	structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
		   unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_data replyopts;
	struct ipcm_cookie ipc;
	__be32 daddr;
	struct rtable *rt = skb_rtable(skb);

	if (ip_options_echo(&replyopts.opt.opt, skb))
		return;

	daddr = ipc.addr = rt->rt_src;
	ipc.opt = NULL;
	ipc.tx_flags = 0;

	if (replyopts.opt.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
	}

	{
		struct flowi4 fl4;

		flowi4_init_output(&fl4, arg->bound_dev_if, 0,
				   RT_TOS(ip_hdr(skb)->tos),
				   RT_SCOPE_UNIVERSE, sk->sk_protocol,
				   ip_reply_arg_flowi_flags(arg),
				   daddr, rt->rt_spec_dst,
				   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
		security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
		rt = ip_route_output_key(sock_net(sk), &fl4);
		if (IS_ERR(rt))
			return;
	}

	/* And let IP do all the hard work.

	   This chunk is not reenterable, hence spinlock.
	   Note that it uses the fact, that this function is called
	   with locally disabled BH and that sk cannot be already spinlocked.
	 */
	bh_lock_sock(sk);
	inet->tos = ip_hdr(skb)->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, &rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}

void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}