/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case packet is not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen	:	Fix broken PMTU recovery and remove
 *					some redundant tests.
 *		Vitaly E. Lavrov:	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);

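/*
 * __ip_local_out() finalizes the IP header (total length and checksum)
 * and runs the LOCAL_OUT netfilter hook.  nf_hook() returns 1 when the
 * hook chain accepts the packet without queueing or stealing it, which
 * is why ip_local_out() below only calls dst_output() in that case.
 */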
int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));
	skb_dst_force(newskb);
	netif_rx_ni(newskb);
	return 0;
}

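/*
 * A per-socket unicast TTL (set with the IP_TTL socket option) wins;
 * otherwise fall back to the route's default hop limit.
 */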
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = ip4_dst_hoplimit(dst);
	return ttl;
}

/*
 *		Add an ip header to a skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
	iph->saddr    = saddr;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(iph, &rt->dst, sk);

	if (opt && opt->opt.optlen) {
		iph->ihl += opt->opt.optlen >> 2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

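/*
 * Last step of the IP output path: make sure there is enough headroom
 * for the device's link-layer header, then hand the packet to the
 * neighbour layer, which fills in the L2 header and queues the frame
 * on the device.
 */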
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	rcu_read_lock();
	neigh = dst_get_neighbour(dst);
	if (neigh) {
		int res = neigh_output(neigh, skb);

		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}

static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

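/*
 * Called once POST_ROUTING has accepted the packet.  If SNAT attached a
 * new xfrm policy to the route, loop through dst_output() again so the
 * transform is applied; otherwise fragment when the packet exceeds the
 * path MTU and is not GSO, then finish in ip_finish_output2().
 */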
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}

int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loopback not local frames,
		   which returned after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note, that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, ip_dev_loopback_xmit);
	}

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

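/*
 * Standard transmit path for locally generated unicast packets, invoked
 * via dst_output().  Runs the POST_ROUTING hook unless the packet has
 * already been rerouted through it (IPSKB_REROUTED), then finishes in
 * ip_finish_output().
 */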
int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

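/*
 * Main transmit entry point for connection-oriented transports such as
 * TCP.  Reuses the route cached on the socket when it is still valid,
 * otherwise performs a fresh route lookup, then builds the IP header
 * and hands the packet to ip_local_out().
 */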
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct flowi4 *fl4;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * f.e. by something like SCTP.
	 */
	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	fl4 = &fl->u.ip4;
	rt = skb_rtable(skb);
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;

		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times
		 * itself out.
		 */
		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS(sk),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr    = fl4->saddr;
	iph->daddr    = fl4->daddr;
	/* Transport layer set skb->h.foo itself. */

	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);

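/*
 * Propagate the per-packet metadata (priority, mark, dst, netfilter and
 * scheduling state) from the original skb to each fragment, so that the
 * fragments are treated exactly like the packet they came from.
 */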
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */

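/*
 * Two strategies are used below: if the skb already carries a suitable
 * frag_list (e.g. built by ip_append_data()), each list member becomes
 * a fragment in place; otherwise the slow path copies the payload into
 * freshly allocated skbs, one per fragment.
 */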
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create wrong frag_list or break existing
	 * one, it is not prohibited. In this case fall back to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when see the first bad fragment.
	 */
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    ip_is_fragment(iph) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF on each bit
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_fragment);

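/*
 * Copy callback used with ip_append_data() for user-space iovecs.  When
 * the device will checksum the packet (CHECKSUM_PARTIAL) a plain copy
 * suffices; otherwise the checksum is folded in while copying.
 */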
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
EXPORT_SYMBOL(ip_generic_getfrag);

static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

static inline int ip_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
				    int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int maxfraglen, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP fragmentation offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

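/*
 * Core of the corking machinery.  Appends user data to the given queue,
 * carving it into MTU-sized skbs with room left for the IP header and
 * options; __ip_make_skb() later stitches the queue into one datagram.
 * Takes the UFO shortcut above when the device can segment UDP itself.
 */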
static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = cork->opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;

	skb = skb_peek_tail(queue);

	exthdrlen = !skb ? rt->dst.header_len : 0;
	mtu = cork->fragsize;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	cork->length += length;
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
					 hh_len, fragheaderlen, transhdrlen,
					 maxfraglen, flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use calculated fragment length to generate chained skb,
	 * each of segments is IP fragment ready for sending to network after
	 * adding appropriate IP header.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = fraglen;

			alloclen += exthdrlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					cork->tx_flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			skb_shinfo(skb)->tx_flags = cork->tx_flags;

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen + exthdrlen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = cork->page;
			int off = cork->off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != skb_frag_page(frag)) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, off, 0);
					skb_frag_ref(skb, i);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				cork->page = page;
				cork->off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			cork->off += copy;
			skb_frag_size_add(frag, copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

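/*
 * Prepare a cork for a new datagram: capture the IP options and the
 * fragment size, and take over the caller's route reference (*rtp is
 * cleared so the caller does not release it).
 */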
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *opt;
	struct rtable *rt;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
		if (cork->opt == NULL) {
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
			if (unlikely(cork->opt == NULL))
				return -ENOBUFS;
		}
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;
	/*
	 * We steal reference to this route, caller should not release it
	 */
	*rtp = NULL;
	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
	cork->dst = &rt->dst;
	cork->length = 0;
	cork->tx_flags = ipc->tx_flags;
	cork->page = NULL;
	cork->off = 0;

	return 0;
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data.  Each piece will be held on the socket
 *	until ip_push_pending_frames() is called. Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP: other transport protocols, e.g. raw sockets, can
 *	potentially use this interface too.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}

	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
				from, length, transhdrlen, flags);
}

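/*
 * Zero-copy companion to ip_append_data(): links caller-supplied pages
 * straight into the corked skb instead of copying.  Only usable on a
 * scatter-gather capable device; this is what sendpage() on a corked
 * UDP socket ultimately reaches.
 */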
ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	cork->length += size;
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}

	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip_cork_release(struct inet_cork *cork)
{
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
}

/*
 *	Combine all pending IP fragments on the socket as one IP datagram
 *	and push them out.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
			      struct flowi4 *fl4,
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)cork->dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;

	if ((skb = __skb_dequeue(queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
	 * the frame generated here to be fragmented.  No matter how transforms
	 * change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow this frame to be
	 * fragmented locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	iph->frag_off = df;
	ip_select_ident(iph, &rt->dst, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = fl4->saddr;
	iph->daddr = fl4->daddr;

	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
	cork->dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	ip_cork_release(cork);
out:
	return skb;
}

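/*
 * Hand a finished datagram to the IP output path.  Positive return
 * values from ip_local_out() are congestion-notification codes and are
 * translated through net_xmit_errno(); only real errors are counted as
 * output discards.
 */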
int ip_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	int err;

	err = ip_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb;

	skb = ip_finish_skb(sk, fl4);
	if (!skb)
		return 0;

	/* Netfilter gets the whole, not yet fragmented skb. */
	return ip_send_skb(skb);
}

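/*
 * Typical use of the corking API, roughly as a datagram protocol would
 * drive it (a simplified sketch with hypothetical locals; UDP, for
 * instance, pushes through a wrapper that first fills in its own header):
 *
 *	err = ip_append_data(sk, &fl4, ip_generic_getfrag, msg->msg_iov,
 *			     len, sizeof(struct udphdr), &ipc, &rt,
 *			     msg->msg_flags);
 *	if (err)
 *		ip_flush_pending_frames(sk);
 *	else if (!corked)
 *		err = ip_push_pending_frames(sk, &fl4);
 *
 * With the socket corked (UDP_CORK or MSG_MORE), several appends may
 * precede the final push, and they all end up in one IP datagram.
 */
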
/*
 *	Throw away all pending data on the socket.
 */
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL)
		kfree_skb(skb);

	ip_cork_release(cork);
}

void ip_flush_pending_frames(struct sock *sk)
{
	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
}

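/*
 * Lockless variant of the corking machinery: builds the whole datagram
 * on a private queue with an on-stack cork, never touching the socket's
 * write queue, and returns the finished skb (or an ERR_PTR).
 */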
struct sk_buff *ip_make_skb(struct sock *sk,
			    struct flowi4 *fl4,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
			    unsigned int flags)
{
	struct inet_cork cork;
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.flags = 0;
	cork.addr = 0;
	cork.opt = NULL;
	err = ip_setup_cork(sk, &cork, ipc, rtp);
	if (err)
		return ERR_PTR(err);

	err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
			       from, length, transhdrlen, flags);
	if (err) {
		__ip_flush_pending_frames(sk, &queue, &cork);
		return ERR_PTR(err);
	}

	return __ip_make_skb(sk, fl4, &queue, &cork);
}

/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}

1459}
1460
e905a9ed 1461/*
1da177e4
LT
1462 * Generic function to send a packet as reply to another packet.
1463 * Used to send TCP resets so far. ICMP should use this function too.
1464 *
e905a9ed 1465 * Should run single threaded per socket because it uses the sock
1da177e4 1466 * structure to pass arguments.
1da177e4 1467 */
0a5ebb80 1468void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
66b13d99 1469 const struct ip_reply_arg *arg, unsigned int len)
1da177e4
LT
1470{
1471 struct inet_sock *inet = inet_sk(sk);
f6d8bd05 1472 struct ip_options_data replyopts;
1da177e4 1473 struct ipcm_cookie ipc;
77968b78 1474 struct flowi4 fl4;
511c3f92 1475 struct rtable *rt = skb_rtable(skb);
1da177e4 1476
f6d8bd05 1477 if (ip_options_echo(&replyopts.opt.opt, skb))
1da177e4
LT
1478 return;
1479
0a5ebb80 1480 ipc.addr = daddr;
1da177e4 1481 ipc.opt = NULL;
2244d07b 1482 ipc.tx_flags = 0;
1da177e4 1483
f6d8bd05 1484 if (replyopts.opt.opt.optlen) {
1da177e4
LT
1485 ipc.opt = &replyopts.opt;
1486
f6d8bd05
ED
1487 if (replyopts.opt.opt.srr)
1488 daddr = replyopts.opt.opt.faddr;
1da177e4
LT
1489 }
1490
77968b78 1491 flowi4_init_output(&fl4, arg->bound_dev_if, 0,
66b13d99 1492 RT_TOS(arg->tos),
77968b78
DM
1493 RT_SCOPE_UNIVERSE, sk->sk_protocol,
1494 ip_reply_arg_flowi_flags(arg),
1495 daddr, rt->rt_spec_dst,
1496 tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1497 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1498 rt = ip_route_output_key(sock_net(sk), &fl4);
1499 if (IS_ERR(rt))
1500 return;
1da177e4
LT
1501
1502 /* And let IP do all the hard work.
1503
1504 This chunk is not reenterable, hence spinlock.
1505 Note that it uses the fact, that this function is called
1506 with locally disabled BH and that sk cannot be already spinlocked.
1507 */
1508 bh_lock_sock(sk);
66b13d99 1509 inet->tos = arg->tos;
1da177e4 1510 sk->sk_priority = skb->priority;
eddc9ec5 1511 sk->sk_protocol = ip_hdr(skb)->protocol;
f0e48dbf 1512 sk->sk_bound_dev_if = arg->bound_dev_if;
f5fca608 1513 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
2e77d89b 1514 &ipc, &rt, MSG_DONTWAIT);
1da177e4
LT
1515 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1516 if (arg->csumoffset >= 0)
9c70220b
ACM
1517 *((__sum16 *)skb_transport_header(skb) +
1518 arg->csumoffset) = csum_fold(csum_add(skb->csum,
1519 arg->csum));
1da177e4 1520 skb->ip_summed = CHECKSUM_NONE;
77968b78 1521 ip_push_pending_frames(sk, &fl4);
1da177e4
LT
1522 }
1523
1524 bh_unlock_sock(sk);
1525
1526 ip_rt_put(rt);
1527}
1528
void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}