/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

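/*
 * Pick the identification field for a new fragment header: a single
 * system-wide counter, advanced under a spinlock and wrapped so that
 * the value 0 is never handed out (the slow fragmentation path below
 * uses 0 to mean "no id chosen yet").
 */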
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}

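/*
 * Finalise the IPv6 payload length and run the NF_INET_LOCAL_OUT hook.
 * __ip6_local_out() stops after the hook (a return value of 1 means
 * the hook accepted the packet); ip6_local_out() also pushes the
 * packet on to dst_output() in that case.
 */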
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
		       dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

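/*
 * Final transmission step: use the cached hardware header if the route
 * has one, otherwise go through the neighbour output function. With no
 * neighbour entry at all, the packet is dropped and counted as
 * OUTNOROUTES.
 */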
static int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!newskb->dst);

	netif_rx(newskb);
	return 0;
}

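/*
 * Second-stage output: stamp the packet with the IPv6 protocol and the
 * egress device and, for multicast destinations, loop a clone back to
 * local group members (and to the multicast routing daemon, if one is
 * running) before the NF_INET_POST_ROUTING hook hands the packet to
 * ip6_output_finish().
 */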
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ((mroute6_socket && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}

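/*
 * Effective MTU for this skb: the route MTU, unless the sending socket
 * is probing path MTU (IPV6_PMTUDISC_PROBE), in which case the device
 * MTU is used directly.
 */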
static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
	       skb->dst->dev->mtu : dst_mtu(skb->dst);
}

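/*
 * Entry point from dst_output(): drop silently if IPv6 is disabled on
 * the egress device; fragment when the packet exceeds the path MTU and
 * is not GSO, or when the route demands fragmentation of everything
 * (dst_allfrag); otherwise go straight to ip6_output2().
 */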
int ip6_output(struct sk_buff *skb)
{
	struct inet6_dev *idev = ip6_dst_idev(skb->dst);

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb->dst))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}

/*
 *	xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8 proto = fl->proto;
	int seg_len = skb->len;
	int hlimit, tclass;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(ip6_dst_idev(skb->dst),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* Allow local fragmentation. */
	if (ipfragok)
		skb->local_df = 1;

	/*
	 *	Fill in the IPv6 header
	 */

	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	tclass = -1;
	if (np)
		tclass = np->tclass;
	if (tclass < 0)
		tclass = 0;

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_INC_STATS(ip6_dst_idev(skb->dst),
			      IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
			       dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is performance critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

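/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * that registered for this alert value (and, where set, is bound to
 * the ingress device). All listeners but the last get a clone; the
 * last consumes the original. Returns 1 if anyone took the packet.
 */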
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

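/*
 * Decide what to do with a packet addressed to a proxied neighbour:
 * return 1 for unicast neighbour-discovery messages (handle locally),
 * -1 for link-local destinations (refuse and signal the sender), and
 * 0 to forward as usual.
 */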
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

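/*
 * Forward a packet that is not addressed to us: enforce the global
 * forwarding and XFRM policies, hand Router Alert packets to
 * interested raw sockets, generate Time Exceeded when the hop limit
 * runs out, honour neighbour-discovery proxying, emit redirects where
 * appropriate, and check the outgoing MTU before the NF_INET_FORWARD
 * hook passes the packet to dst_output().
 */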
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be a mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb->dst;

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb->sp) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
			goto error;
		}
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

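/*
 * Copy the per-packet metadata (route, device, priority, mark,
 * netfilter and security state) from the original skb onto a freshly
 * built fragment.
 */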
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

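/*
 * Walk the extension-header chain to find where a Fragment header must
 * be inserted: after any Hop-by-Hop, Routing and (with Mobile IPv6)
 * Home-Address-carrying Destination Options headers, i.e. after the
 * unfragmentable part of the packet. Returns that byte offset and
 * points *nexthdr at the next-header field which precedes it.
 */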
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
			(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

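/*
 * Split a too-large packet into fragments and feed each one to
 * 'output'. If the packet arrives as a frag_list with suitably sized
 * and aligned members, the existing buffers are reused and only the
 * headers are rebuilt (fast path); otherwise the slow path copies the
 * payload into freshly allocated skbs, MTU-sized and 8-byte aligned.
 */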
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct net_device *dev;
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb->dst;
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	dev = rt->u.dst.dev;
	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb was not generated by a local socket. (This last
	 * check should be redundant, but it's free.)
	 */
	if (!skb->local_df) {
		skb->dev = skb->dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_shinfo(skb)->frag_list) {
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->u.dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->u.dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->u.dst);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(skb, fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

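/*
 * Helper for ip6_sk_dst_check(): returns true when neither the host
 * route key nor the cached last-used address proves that the cached
 * route still matches the flow address.
 */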
static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the unconnected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

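/*
 * Common tail of the route lookup helpers: perform the routing lookup
 * if the caller did not supply a dst, pick a source address when the
 * flow has none, and (with optimistic DAD) re-route via the default
 * router while our chosen source address is still tentative.
 */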
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	if (!((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);

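/*
 * Fast path for UDP fragmentation offload (UFO): instead of building
 * MTU-sized fragments in software, queue one large skb and record the
 * per-fragment gso_size and fragment id so that the device (or the
 * software GSO layer) can segment it on transmit.
 */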
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
				    int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
					    sizeof(struct frag_hdr);
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(skb, &fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path
	 */
	kfree_skb(skb);

	return err;
}

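/*
 * Append data to the pending (corked) queue for this socket. The first
 * call sets up the cork state (copied options, route, per-socket MTU);
 * later calls keep appending until ip6_push_pending_frames() builds
 * the final header chain and transmits, or ip6_flush_pending_frames()
 * discards the lot. Each queued skb is sized so that the push path can
 * turn it into a single fragment.
 */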
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (np->cork.opt == NULL) {
				np->cork.opt = kmalloc(opt->tot_len,
						       sk->sk_allocation);
				if (unlikely(np->cork.opt == NULL))
					return -ENOBUFS;
			} else if (np->cork.opt->tot_len < opt->tot_len) {
				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
				return -EINVAL;
			}
			memcpy(np->cork.opt, opt, opt->tot_len);
			inet->cork.flags |= IPCORK_OPT;
			/* need source address above miyazawa*/
		}
		dst_hold(&rt->u.dst);
		inet->cork.dst = &rt->u.dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		if (inet->cork.flags & IPCORK_OPT)
			opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

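/*
 * Drop the per-socket cork state: free the copied options, release the
 * route reference and clear the cached flow.
 */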
static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	inet->cork.flags &= ~IPCORK_OPT;
	kfree(np->cork.opt);
	np->cork.opt = NULL;
	if (inet->cork.dst) {
		dst_release(inet->cork.dst);
		inet->cork.dst = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

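/*
 * Turn the queue of corked skbs into a single packet: chain the later
 * skbs onto the first one's frag_list, push the extension headers and
 * IPv6 header built from the cork state, and send the result through
 * ip6_local_out().
 */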
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl->fl6_flowlabel |
			 htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = np->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	goto out;
}

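/*
 * Abort a corked send: discard every queued skb (counting each as an
 * output discard) and release the cork state.
 */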
void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb->dst)
			IP6_INC_STATS(ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
1da177e4 1502}