net/ipv6/ip6_output.c
1 /*
2 * IPv6 output functions
3 * Linux INET6 implementation
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * Based on linux/net/ipv4/ip_output.c
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * Changes:
16 * A.N.Kuznetsov : arithmetic in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
20 * etc.
21 *
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
26 * for datagram xmit
27 */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
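/* Fill in the IPv6 payload length (zeroed for jumbograms larger than
 * IPV6_MAXPLEN) and run the netfilter LOCAL_OUT hook; a return value of 1
 * from nf_hook() tells the caller to continue with dst_output().
 */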
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63 int len;
64
65 len = skb->len - sizeof(struct ipv6hdr);
66 if (len > IPV6_MAXPLEN)
67 len = 0;
68 ipv6_hdr(skb)->payload_len = htons(len);
69
70 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 skb_dst(skb)->dev, dst_output);
72 }
73
74 int ip6_local_out(struct sk_buff *skb)
75 {
76 int err;
77
78 err = __ip6_local_out(skb);
79 if (likely(err == 1))
80 err = dst_output(skb);
81
82 return err;
83 }
84 EXPORT_SYMBOL_GPL(ip6_local_out);
85
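/* Final per-packet output step: loop multicast copies back to the local
 * stack where required, update the multicast counters, then hand the
 * packet to the neighbour layer for transmission on dst->dev.
 */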
86 static int ip6_finish_output2(struct sk_buff *skb)
87 {
88 struct dst_entry *dst = skb_dst(skb);
89 struct net_device *dev = dst->dev;
90 struct neighbour *neigh;
91 struct rt6_info *rt;
92
93 skb->protocol = htons(ETH_P_IPV6);
94 skb->dev = dev;
95
96 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
97 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
98
99 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
100 ((mroute6_socket(dev_net(dev), skb) &&
101 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
102 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
103 &ipv6_hdr(skb)->saddr))) {
104 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
105
106 /* Do not check for IFF_ALLMULTI; multicast routing
107 is not supported in any case.
108 */
109 if (newskb)
110 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
111 newskb, NULL, newskb->dev,
112 dev_loopback_xmit);
113
114 if (ipv6_hdr(skb)->hop_limit == 0) {
115 IP6_INC_STATS(dev_net(dev), idev,
116 IPSTATS_MIB_OUTDISCARDS);
117 kfree_skb(skb);
118 return 0;
119 }
120 }
121
122 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
123 skb->len);
124 }
125
126 rcu_read_lock();
127 rt = (struct rt6_info *) dst;
128 neigh = rt->n;
129 if (neigh) {
130 int res = dst_neigh_output(dst, neigh, skb);
131
132 rcu_read_unlock();
133 return res;
134 }
135 rcu_read_unlock();
136 IP6_INC_STATS_BH(dev_net(dst->dev),
137 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
138 kfree_skb(skb);
139 return -EINVAL;
140 }
141
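/* Fragment the packet if it exceeds the path MTU (and is not GSO), or if
 * the route demands fragmentation of all packets; otherwise transmit it
 * directly via ip6_finish_output2().
 */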
142 static int ip6_finish_output(struct sk_buff *skb)
143 {
144 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
145 dst_allfrag(skb_dst(skb)))
146 return ip6_fragment(skb, ip6_finish_output2);
147 else
148 return ip6_finish_output2(skb);
149 }
150
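/* Standard IPv6 output routine: drop the packet if IPv6 is disabled on
 * the outgoing device, otherwise run the POST_ROUTING netfilter hook
 * (skipped for packets netfilter has already rerouted) before
 * ip6_finish_output().
 */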
151 int ip6_output(struct sk_buff *skb)
152 {
153 struct net_device *dev = skb_dst(skb)->dev;
154 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
155 if (unlikely(idev->cnf.disable_ipv6)) {
156 IP6_INC_STATS(dev_net(dev), idev,
157 IPSTATS_MIB_OUTDISCARDS);
158 kfree_skb(skb);
159 return 0;
160 }
161
162 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
163 ip6_finish_output,
164 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
165 }
166
167 /*
168 * xmit an sk_buff (used by TCP, SCTP and DCCP)
169 */
170
171 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
172 struct ipv6_txoptions *opt, int tclass)
173 {
174 struct net *net = sock_net(sk);
175 struct ipv6_pinfo *np = inet6_sk(sk);
176 struct in6_addr *first_hop = &fl6->daddr;
177 struct dst_entry *dst = skb_dst(skb);
178 struct ipv6hdr *hdr;
179 u8 proto = fl6->flowi6_proto;
180 int seg_len = skb->len;
181 int hlimit = -1;
182 u32 mtu;
183
184 if (opt) {
185 unsigned int head_room;
186
187 /* First: exthdrs may take lots of space (~8K for now);
188 MAX_HEADER is not enough.
189 */
190 head_room = opt->opt_nflen + opt->opt_flen;
191 seg_len += head_room;
192 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
193
194 if (skb_headroom(skb) < head_room) {
195 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
196 if (skb2 == NULL) {
197 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
198 IPSTATS_MIB_OUTDISCARDS);
199 kfree_skb(skb);
200 return -ENOBUFS;
201 }
202 consume_skb(skb);
203 skb = skb2;
204 skb_set_owner_w(skb, sk);
205 }
206 if (opt->opt_flen)
207 ipv6_push_frag_opts(skb, opt, &proto);
208 if (opt->opt_nflen)
209 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
210 }
211
212 skb_push(skb, sizeof(struct ipv6hdr));
213 skb_reset_network_header(skb);
214 hdr = ipv6_hdr(skb);
215
216 /*
217 * Fill in the IPv6 header
218 */
219 if (np)
220 hlimit = np->hop_limit;
221 if (hlimit < 0)
222 hlimit = ip6_dst_hoplimit(dst);
223
224 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
225
226 hdr->payload_len = htons(seg_len);
227 hdr->nexthdr = proto;
228 hdr->hop_limit = hlimit;
229
230 hdr->saddr = fl6->saddr;
231 hdr->daddr = *first_hop;
232
233 skb->priority = sk->sk_priority;
234 skb->mark = sk->sk_mark;
235
236 mtu = dst_mtu(dst);
237 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
238 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
239 IPSTATS_MIB_OUT, skb->len);
240 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
241 dst->dev, dst_output);
242 }
243
244 net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
245 skb->dev = dst->dev;
246 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
247 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
248 kfree_skb(skb);
249 return -EMSGSIZE;
250 }
251
252 EXPORT_SYMBOL(ip6_xmit);
253
254 /*
255 * To avoid extra problems ND packets are sent through this
256 * routine. It's code duplication but I really want to avoid
257 * extra checks since ipv6_build_header is used by TCP (which
258 * is performance critical for us)
259 */
260
261 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
262 const struct in6_addr *saddr, const struct in6_addr *daddr,
263 int proto, int len)
264 {
265 struct ipv6_pinfo *np = inet6_sk(sk);
266 struct ipv6hdr *hdr;
267
268 skb->protocol = htons(ETH_P_IPV6);
269 skb->dev = dev;
270
271 skb_reset_network_header(skb);
272 skb_put(skb, sizeof(struct ipv6hdr));
273 hdr = ipv6_hdr(skb);
274
275 *(__be32*)hdr = htonl(0x60000000);
276
277 hdr->payload_len = htons(len);
278 hdr->nexthdr = proto;
279 hdr->hop_limit = np->hop_limit;
280
281 hdr->saddr = *saddr;
282 hdr->daddr = *daddr;
283
284 return 0;
285 }
286
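/* Deliver a copy of a Router Alert packet to every raw socket registered
 * for this RA value (respecting any device binding). Returns 1 if the
 * original skb was consumed by a listener.
 */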
287 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
288 {
289 struct ip6_ra_chain *ra;
290 struct sock *last = NULL;
291
292 read_lock(&ip6_ra_lock);
293 for (ra = ip6_ra_chain; ra; ra = ra->next) {
294 struct sock *sk = ra->sk;
295 if (sk && ra->sel == sel &&
296 (!sk->sk_bound_dev_if ||
297 sk->sk_bound_dev_if == skb->dev->ifindex)) {
298 if (last) {
299 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
300 if (skb2)
301 rawv6_rcv(last, skb2);
302 }
303 last = sk;
304 }
305 }
306
307 if (last) {
308 rawv6_rcv(last, skb);
309 read_unlock(&ip6_ra_lock);
310 return 1;
311 }
312 read_unlock(&ip6_ra_lock);
313 return 0;
314 }
315
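/* Decide what to do with a packet destined to an address we proxy NDP
 * for: 1 means hand it to local input (neighbour discovery messages),
 * 0 means keep forwarding it, and -1 means it must be dropped
 * (link-local destinations cannot be proxied).
 */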
316 static int ip6_forward_proxy_check(struct sk_buff *skb)
317 {
318 struct ipv6hdr *hdr = ipv6_hdr(skb);
319 u8 nexthdr = hdr->nexthdr;
320 __be16 frag_off;
321 int offset;
322
323 if (ipv6_ext_hdr(nexthdr)) {
324 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
325 if (offset < 0)
326 return 0;
327 } else
328 offset = sizeof(struct ipv6hdr);
329
330 if (nexthdr == IPPROTO_ICMPV6) {
331 struct icmp6hdr *icmp6;
332
333 if (!pskb_may_pull(skb, (skb_network_header(skb) +
334 offset + 1 - skb->data)))
335 return 0;
336
337 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
338
339 switch (icmp6->icmp6_type) {
340 case NDISC_ROUTER_SOLICITATION:
341 case NDISC_ROUTER_ADVERTISEMENT:
342 case NDISC_NEIGHBOUR_SOLICITATION:
343 case NDISC_NEIGHBOUR_ADVERTISEMENT:
344 case NDISC_REDIRECT:
345 /* For a reaction involving a unicast neighbor discovery
346 * message destined to the proxied address, pass it to
347 * the input function.
348 */
349 return 1;
350 default:
351 break;
352 }
353 }
354
355 /*
356 * The proxying router can't forward traffic sent to a link-local
357 * address, so signal the sender and discard the packet. This
358 * behavior is clarified by the MIPv6 specification.
359 */
360 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
361 dst_link_failure(skb);
362 return -1;
363 }
364
365 return 0;
366 }
367
368 static inline int ip6_forward_finish(struct sk_buff *skb)
369 {
370 return dst_output(skb);
371 }
372
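/* Forwarding path: validate the hop limit, honour NDP proxying and Router
 * Alert options, emit redirects or ICMP errors where required, decrement
 * the hop limit and pass the packet to the FORWARD netfilter hook.
 */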
373 int ip6_forward(struct sk_buff *skb)
374 {
375 struct dst_entry *dst = skb_dst(skb);
376 struct ipv6hdr *hdr = ipv6_hdr(skb);
377 struct inet6_skb_parm *opt = IP6CB(skb);
378 struct net *net = dev_net(dst->dev);
379 u32 mtu;
380
381 if (net->ipv6.devconf_all->forwarding == 0)
382 goto error;
383
384 if (skb_warn_if_lro(skb))
385 goto drop;
386
387 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
388 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
389 goto drop;
390 }
391
392 if (skb->pkt_type != PACKET_HOST)
393 goto drop;
394
395 skb_forward_csum(skb);
396
397 /*
398 * We DO NOT do any processing on
399 * RA packets, pushing them to user level AS IS
400 * without any warranty that the application will be able
401 * to interpret them. The reason is that we
402 * cannot make anything clever here.
403 *
404 * We are not the end node, so if the packet contains
405 * AH/ESP, we cannot do anything.
406 * Defragmentation would also be a mistake; RA packets
407 * cannot be fragmented, because there is no guarantee
408 * that different fragments will follow the same path. --ANK
409 */
410 if (opt->ra) {
411 u8 *ptr = skb_network_header(skb) + opt->ra;
412 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
413 return 0;
414 }
415
416 /*
417 * check and decrement ttl
418 */
419 if (hdr->hop_limit <= 1) {
420 /* Force OUTPUT device used as source address */
421 skb->dev = dst->dev;
422 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
423 IP6_INC_STATS_BH(net,
424 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
425
426 kfree_skb(skb);
427 return -ETIMEDOUT;
428 }
429
430 /* XXX: idev->cnf.proxy_ndp? */
431 if (net->ipv6.devconf_all->proxy_ndp &&
432 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
433 int proxied = ip6_forward_proxy_check(skb);
434 if (proxied > 0)
435 return ip6_input(skb);
436 else if (proxied < 0) {
437 IP6_INC_STATS(net, ip6_dst_idev(dst),
438 IPSTATS_MIB_INDISCARDS);
439 goto drop;
440 }
441 }
442
443 if (!xfrm6_route_forward(skb)) {
444 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
445 goto drop;
446 }
447 dst = skb_dst(skb);
448
449 /* IPv6 specs say nothing about it, but it is clear that we cannot
450 send redirects to source routed frames.
451 We don't send redirects to frames decapsulated from IPsec.
452 */
453 if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
454 struct in6_addr *target = NULL;
455 struct inet_peer *peer;
456 struct rt6_info *rt;
457
458 /*
459 * incoming and outgoing devices are the same
460 * send a redirect.
461 */
462
463 rt = (struct rt6_info *) dst;
464 if (rt->rt6i_flags & RTF_GATEWAY)
465 target = &rt->rt6i_gateway;
466 else
467 target = &hdr->daddr;
468
469 peer = rt6_get_peer_create(rt);
470
471 /* Limit redirects both by destination (here)
472 and by source (inside ndisc_send_redirect)
473 */
474 if (inet_peer_xrlim_allow(peer, 1*HZ))
475 ndisc_send_redirect(skb, target);
476 } else {
477 int addrtype = ipv6_addr_type(&hdr->saddr);
478
479 /* This check is security critical. */
480 if (addrtype == IPV6_ADDR_ANY ||
481 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
482 goto error;
483 if (addrtype & IPV6_ADDR_LINKLOCAL) {
484 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
485 ICMPV6_NOT_NEIGHBOUR, 0);
486 goto error;
487 }
488 }
489
490 mtu = dst_mtu(dst);
491 if (mtu < IPV6_MIN_MTU)
492 mtu = IPV6_MIN_MTU;
493
494 if (skb->len > mtu && !skb_is_gso(skb)) {
495 /* Again, force OUTPUT device used as source address */
496 skb->dev = dst->dev;
497 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
498 IP6_INC_STATS_BH(net,
499 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
500 IP6_INC_STATS_BH(net,
501 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
502 kfree_skb(skb);
503 return -EMSGSIZE;
504 }
505
506 if (skb_cow(skb, dst->dev->hard_header_len)) {
507 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
508 goto drop;
509 }
510
511 hdr = ipv6_hdr(skb);
512
513 /* Mangling hops number delayed to point after skb COW */
514
515 hdr->hop_limit--;
516
517 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
518 IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
519 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
520 ip6_forward_finish);
521
522 error:
523 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
524 drop:
525 kfree_skb(skb);
526 return -EINVAL;
527 }
528
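/* Copy per-packet metadata (route, device, priority, mark, netfilter and
 * security state) from the original skb to a newly built fragment.
 */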
529 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
530 {
531 to->pkt_type = from->pkt_type;
532 to->priority = from->priority;
533 to->protocol = from->protocol;
534 skb_dst_drop(to);
535 skb_dst_set(to, dst_clone(skb_dst(from)));
536 to->dev = from->dev;
537 to->mark = from->mark;
538
539 #ifdef CONFIG_NET_SCHED
540 to->tc_index = from->tc_index;
541 #endif
542 nf_copy(to, from);
543 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
544 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
545 to->nf_trace = from->nf_trace;
546 #endif
547 skb_copy_secmark(to, from);
548 }
549
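/* Walk the extension header chain to find the offset at which a Fragment
 * header must be inserted (after Hop-by-Hop, Routing and any Destination
 * Options header that precedes a Routing header). *nexthdr is left
 * pointing at the Next Header field to be rewritten to NEXTHDR_FRAGMENT.
 */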
550 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
551 {
552 u16 offset = sizeof(struct ipv6hdr);
553 struct ipv6_opt_hdr *exthdr =
554 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
555 unsigned int packet_len = skb->tail - skb->network_header;
556 int found_rhdr = 0;
557 *nexthdr = &ipv6_hdr(skb)->nexthdr;
558
559 while (offset + 1 <= packet_len) {
560
561 switch (**nexthdr) {
562
563 case NEXTHDR_HOP:
564 break;
565 case NEXTHDR_ROUTING:
566 found_rhdr = 1;
567 break;
568 case NEXTHDR_DEST:
569 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
570 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
571 break;
572 #endif
573 if (found_rhdr)
574 return offset;
575 break;
576 default:
577 return offset;
578 }
579
580 offset += ipv6_optlen(exthdr);
581 *nexthdr = &exthdr->nexthdr;
582 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
583 offset);
584 }
585
586 return offset;
587 }
588
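/* Choose the 32-bit fragment identification: use the per-destination
 * inet_peer counter when one is available, otherwise fall back to a
 * global, never-zero counter.
 */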
589 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
590 {
591 static atomic_t ipv6_fragmentation_id;
592 int old, new;
593
594 if (rt && !(rt->dst.flags & DST_NOPEER)) {
595 struct inet_peer *peer = rt6_get_peer_create(rt);
596
597 if (peer) {
598 fhdr->identification = htonl(inet_getid(peer, 0));
599 return;
600 }
601 }
602 do {
603 old = atomic_read(&ipv6_fragmentation_id);
604 new = old + 1;
605 if (!new)
606 new = 1;
607 } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
608 fhdr->identification = htonl(new);
609 }
610
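/* Split an oversized skb into MTU-sized fragments. The fast path reuses
 * an existing frag_list when the geometry already matches; otherwise the
 * slow path allocates and copies each fragment individually.
 */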
611 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
612 {
613 struct sk_buff *frag;
614 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
615 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
616 struct ipv6hdr *tmp_hdr;
617 struct frag_hdr *fh;
618 unsigned int mtu, hlen, left, len;
619 int hroom, troom;
620 __be32 frag_id = 0;
621 int ptr, offset = 0, err = 0;
622 u8 *prevhdr, nexthdr = 0;
623 struct net *net = dev_net(skb_dst(skb)->dev);
624
625 hlen = ip6_find_1stfragopt(skb, &prevhdr);
626 nexthdr = *prevhdr;
627
628 mtu = ip6_skb_dst_mtu(skb);
629
630 /* We must not fragment if the socket is set to force MTU discovery
631 * or if the skb was not generated by a local socket.
632 */
633 if (unlikely(!skb->local_df && skb->len > mtu)) {
634 if (skb->sk && dst_allfrag(skb_dst(skb)))
635 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
636
637 skb->dev = skb_dst(skb)->dev;
638 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
639 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
640 IPSTATS_MIB_FRAGFAILS);
641 kfree_skb(skb);
642 return -EMSGSIZE;
643 }
644
645 if (np && np->frag_size < mtu) {
646 if (np->frag_size)
647 mtu = np->frag_size;
648 }
649 mtu -= hlen + sizeof(struct frag_hdr);
650
651 if (skb_has_frag_list(skb)) {
652 int first_len = skb_pagelen(skb);
653 struct sk_buff *frag2;
654
655 if (first_len - hlen > mtu ||
656 ((first_len - hlen) & 7) ||
657 skb_cloned(skb))
658 goto slow_path;
659
660 skb_walk_frags(skb, frag) {
661 /* Correct geometry. */
662 if (frag->len > mtu ||
663 ((frag->len & 7) && frag->next) ||
664 skb_headroom(frag) < hlen)
665 goto slow_path_clean;
666
667 /* Partially cloned skb? */
668 if (skb_shared(frag))
669 goto slow_path_clean;
670
671 BUG_ON(frag->sk);
672 if (skb->sk) {
673 frag->sk = skb->sk;
674 frag->destructor = sock_wfree;
675 }
676 skb->truesize -= frag->truesize;
677 }
678
679 err = 0;
680 offset = 0;
681 frag = skb_shinfo(skb)->frag_list;
682 skb_frag_list_init(skb);
683 /* BUILD HEADER */
684
685 *prevhdr = NEXTHDR_FRAGMENT;
686 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
687 if (!tmp_hdr) {
688 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
689 IPSTATS_MIB_FRAGFAILS);
690 return -ENOMEM;
691 }
692
693 __skb_pull(skb, hlen);
694 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
695 __skb_push(skb, hlen);
696 skb_reset_network_header(skb);
697 memcpy(skb_network_header(skb), tmp_hdr, hlen);
698
699 ipv6_select_ident(fh, rt);
700 fh->nexthdr = nexthdr;
701 fh->reserved = 0;
702 fh->frag_off = htons(IP6_MF);
703 frag_id = fh->identification;
704
705 first_len = skb_pagelen(skb);
706 skb->data_len = first_len - skb_headlen(skb);
707 skb->len = first_len;
708 ipv6_hdr(skb)->payload_len = htons(first_len -
709 sizeof(struct ipv6hdr));
710
711 dst_hold(&rt->dst);
712
713 for (;;) {
714 /* Prepare header of the next frame,
715 * before the previous one goes down. */
716 if (frag) {
717 frag->ip_summed = CHECKSUM_NONE;
718 skb_reset_transport_header(frag);
719 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
720 __skb_push(frag, hlen);
721 skb_reset_network_header(frag);
722 memcpy(skb_network_header(frag), tmp_hdr,
723 hlen);
724 offset += skb->len - hlen - sizeof(struct frag_hdr);
725 fh->nexthdr = nexthdr;
726 fh->reserved = 0;
727 fh->frag_off = htons(offset);
728 if (frag->next != NULL)
729 fh->frag_off |= htons(IP6_MF);
730 fh->identification = frag_id;
731 ipv6_hdr(frag)->payload_len =
732 htons(frag->len -
733 sizeof(struct ipv6hdr));
734 ip6_copy_metadata(frag, skb);
735 }
736
737 err = output(skb);
738 if (!err)
739 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
740 IPSTATS_MIB_FRAGCREATES);
741
742 if (err || !frag)
743 break;
744
745 skb = frag;
746 frag = skb->next;
747 skb->next = NULL;
748 }
749
750 kfree(tmp_hdr);
751
752 if (err == 0) {
753 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
754 IPSTATS_MIB_FRAGOKS);
755 dst_release(&rt->dst);
756 return 0;
757 }
758
759 while (frag) {
760 skb = frag->next;
761 kfree_skb(frag);
762 frag = skb;
763 }
764
765 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
766 IPSTATS_MIB_FRAGFAILS);
767 dst_release(&rt->dst);
768 return err;
769
770 slow_path_clean:
771 skb_walk_frags(skb, frag2) {
772 if (frag2 == frag)
773 break;
774 frag2->sk = NULL;
775 frag2->destructor = NULL;
776 skb->truesize += frag2->truesize;
777 }
778 }
779
780 slow_path:
781 if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
782 skb_checksum_help(skb))
783 goto fail;
784
785 left = skb->len - hlen; /* Space per frame */
786 ptr = hlen; /* Where to start from */
787
788 /*
789 * Fragment the datagram.
790 */
791
792 *prevhdr = NEXTHDR_FRAGMENT;
793 hroom = LL_RESERVED_SPACE(rt->dst.dev);
794 troom = rt->dst.dev->needed_tailroom;
795
796 /*
797 * Keep copying data until we run out.
798 */
799 while (left > 0) {
800 len = left;
801 /* IF: it doesn't fit, use 'mtu' - the data space left */
802 if (len > mtu)
803 len = mtu;
804 /* IF: we are not sending up to and including the packet end
805 then align the next start on an eight byte boundary */
806 if (len < left) {
807 len &= ~7;
808 }
809 /*
810 * Allocate buffer.
811 */
812
813 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
814 hroom + troom, GFP_ATOMIC)) == NULL) {
815 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
816 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
817 IPSTATS_MIB_FRAGFAILS);
818 err = -ENOMEM;
819 goto fail;
820 }
821
822 /*
823 * Set up data on packet
824 */
825
826 ip6_copy_metadata(frag, skb);
827 skb_reserve(frag, hroom);
828 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
829 skb_reset_network_header(frag);
830 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
831 frag->transport_header = (frag->network_header + hlen +
832 sizeof(struct frag_hdr));
833
834 /*
835 * Charge the memory for the fragment to any owner
836 * it might possess
837 */
838 if (skb->sk)
839 skb_set_owner_w(frag, skb->sk);
840
841 /*
842 * Copy the packet header into the new buffer.
843 */
844 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
845
846 /*
847 * Build fragment header.
848 */
849 fh->nexthdr = nexthdr;
850 fh->reserved = 0;
851 if (!frag_id) {
852 ipv6_select_ident(fh, rt);
853 frag_id = fh->identification;
854 } else
855 fh->identification = frag_id;
856
857 /*
858 * Copy a block of the IP datagram.
859 */
860 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
861 BUG();
862 left -= len;
863
864 fh->frag_off = htons(offset);
865 if (left > 0)
866 fh->frag_off |= htons(IP6_MF);
867 ipv6_hdr(frag)->payload_len = htons(frag->len -
868 sizeof(struct ipv6hdr));
869
870 ptr += len;
871 offset += len;
872
873 /*
874 * Put this fragment into the sending queue.
875 */
876 err = output(frag);
877 if (err)
878 goto fail;
879
880 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
881 IPSTATS_MIB_FRAGCREATES);
882 }
883 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
884 IPSTATS_MIB_FRAGOKS);
885 consume_skb(skb);
886 return err;
887
888 fail:
889 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
890 IPSTATS_MIB_FRAGFAILS);
891 kfree_skb(skb);
892 return err;
893 }
894
895 static inline int ip6_rt_check(const struct rt6key *rt_key,
896 const struct in6_addr *fl_addr,
897 const struct in6_addr *addr_cache)
898 {
899 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
900 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
901 }
902
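/* Validate a cached socket route against the flow: release it and return
 * NULL if the destination, source subtree or outgoing interface no longer
 * match.
 */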
903 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
904 struct dst_entry *dst,
905 const struct flowi6 *fl6)
906 {
907 struct ipv6_pinfo *np = inet6_sk(sk);
908 struct rt6_info *rt = (struct rt6_info *)dst;
909
910 if (!dst)
911 goto out;
912
913 /* Yes, checking route validity in the non-connected
914 * case is not very simple. Take into account
915 * that we do not support routing by source, TOS,
916 * and MSG_DONTROUTE --ANK (980726)
917 *
918 * 1. ip6_rt_check(): If route was host route,
919 * check that cached destination is current.
920 * If it is network route, we still may
921 * check its validity using saved pointer
922 * to the last used address: daddr_cache.
923 * We do not want to save whole address now,
924 * (because main consumer of this service
925 * is tcp, which does not have this problem),
926 * so that the last trick works only on connected
927 * sockets.
928 * 2. oif also should be the same.
929 */
930 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
931 #ifdef CONFIG_IPV6_SUBTREES
932 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
933 #endif
934 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
935 dst_release(dst);
936 dst = NULL;
937 }
938
939 out:
940 return dst;
941 }
942
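/* Common tail of the dst lookup helpers: perform the route lookup if
 * needed and pick a source address when the flow left it unspecified.
 * With optimistic DAD, if the source address is optimistic and the next
 * hop's neighbour entry is not yet valid, the lookup is redone towards
 * the default router instead.
 */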
943 static int ip6_dst_lookup_tail(struct sock *sk,
944 struct dst_entry **dst, struct flowi6 *fl6)
945 {
946 struct net *net = sock_net(sk);
947 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
948 struct neighbour *n;
949 struct rt6_info *rt;
950 #endif
951 int err;
952
953 if (*dst == NULL)
954 *dst = ip6_route_output(net, sk, fl6);
955
956 if ((err = (*dst)->error))
957 goto out_err_release;
958
959 if (ipv6_addr_any(&fl6->saddr)) {
960 struct rt6_info *rt = (struct rt6_info *) *dst;
961 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
962 sk ? inet6_sk(sk)->srcprefs : 0,
963 &fl6->saddr);
964 if (err)
965 goto out_err_release;
966 }
967
968 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
969 /*
970 * Here if the dst entry we've looked up
971 * has a neighbour entry that is in the INCOMPLETE
972 * state and the src address from the flow is
973 * marked as OPTIMISTIC, we release the found
974 * dst entry and replace it instead with the
975 * dst entry of the nexthop router
976 */
977 rcu_read_lock();
978 rt = (struct rt6_info *) *dst;
979 n = rt->n;
980 if (n && !(n->nud_state & NUD_VALID)) {
981 struct inet6_ifaddr *ifp;
982 struct flowi6 fl_gw6;
983 int redirect;
984
985 rcu_read_unlock();
986 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
987 (*dst)->dev, 1);
988
989 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
990 if (ifp)
991 in6_ifa_put(ifp);
992
993 if (redirect) {
994 /*
995 * We need to get the dst entry for the
996 * default router instead
997 */
998 dst_release(*dst);
999 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1000 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1001 *dst = ip6_route_output(net, sk, &fl_gw6);
1002 if ((err = (*dst)->error))
1003 goto out_err_release;
1004 }
1005 } else {
1006 rcu_read_unlock();
1007 }
1008 #endif
1009
1010 return 0;
1011
1012 out_err_release:
1013 if (err == -ENETUNREACH)
1014 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1015 dst_release(*dst);
1016 *dst = NULL;
1017 return err;
1018 }
1019
1020 /**
1021 * ip6_dst_lookup - perform route lookup on flow
1022 * @sk: socket which provides route info
1023 * @dst: pointer to dst_entry * for result
1024 * @fl6: flow to lookup
1025 *
1026 * This function performs a route lookup on the given flow.
1027 *
1028 * It returns zero on success, or a standard errno code on error.
1029 */
1030 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1031 {
1032 *dst = NULL;
1033 return ip6_dst_lookup_tail(sk, dst, fl6);
1034 }
1035 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1036
1037 /**
1038 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1039 * @sk: socket which provides route info
1040 * @fl6: flow to lookup
1041 * @final_dst: final destination address for ipsec lookup
1042 * @can_sleep: we are in a sleepable context
1043 *
1044 * This function performs a route lookup on the given flow.
1045 *
1046 * It returns a valid dst pointer on success, or a pointer encoded
1047 * error code.
1048 */
1049 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1050 const struct in6_addr *final_dst,
1051 bool can_sleep)
1052 {
1053 struct dst_entry *dst = NULL;
1054 int err;
1055
1056 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1057 if (err)
1058 return ERR_PTR(err);
1059 if (final_dst)
1060 fl6->daddr = *final_dst;
1061 if (can_sleep)
1062 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1063
1064 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1065 }
1066 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1067
1068 /**
1069 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1070 * @sk: socket which provides the dst cache and route info
1071 * @fl6: flow to lookup
1072 * @final_dst: final destination address for ipsec lookup
1073 * @can_sleep: we are in a sleepable context
1074 *
1075 * This function performs a route lookup on the given flow with the
1076 * possibility of using the cached route in the socket if it is valid.
1077 * It will take the socket dst lock when operating on the dst cache.
1078 * As a result, this function can only be used in process context.
1079 *
1080 * It returns a valid dst pointer on success, or a pointer encoded
1081 * error code.
1082 */
1083 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1084 const struct in6_addr *final_dst,
1085 bool can_sleep)
1086 {
1087 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1088 int err;
1089
1090 dst = ip6_sk_dst_check(sk, dst, fl6);
1091
1092 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1093 if (err)
1094 return ERR_PTR(err);
1095 if (final_dst)
1096 fl6->daddr = *final_dst;
1097 if (can_sleep)
1098 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1099
1100 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1101 }
1102 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1103
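/* Append datagram data as page fragments on a single skb so that a
 * UFO-capable device can segment the oversized UDP datagram instead of
 * our fragmenting it here.
 */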
1104 static inline int ip6_ufo_append_data(struct sock *sk,
1105 int getfrag(void *from, char *to, int offset, int len,
1106 int odd, struct sk_buff *skb),
1107 void *from, int length, int hh_len, int fragheaderlen,
1108 int transhdrlen, int mtu, unsigned int flags,
1109 struct rt6_info *rt)
1110
1111 {
1112 struct sk_buff *skb;
1113 int err;
1114
1115 /* There is support for UDP large send offload by the network
1116 * device, so create one single skb packet containing the complete
1117 * UDP datagram
1118 */
1119 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1120 skb = sock_alloc_send_skb(sk,
1121 hh_len + fragheaderlen + transhdrlen + 20,
1122 (flags & MSG_DONTWAIT), &err);
1123 if (skb == NULL)
1124 return err;
1125
1126 /* reserve space for Hardware header */
1127 skb_reserve(skb, hh_len);
1128
1129 /* create space for UDP/IP header */
1130 skb_put(skb, fragheaderlen + transhdrlen);
1131
1132 /* initialize network header pointer */
1133 skb_reset_network_header(skb);
1134
1135 /* initialize protocol header pointer */
1136 skb->transport_header = skb->network_header + fragheaderlen;
1137
1138 skb->ip_summed = CHECKSUM_PARTIAL;
1139 skb->csum = 0;
1140 }
1141
1142 err = skb_append_datato_frags(sk, skb, getfrag, from,
1143 (length - transhdrlen));
1144 if (!err) {
1145 struct frag_hdr fhdr;
1146
1147 /* Specify the length of each IPv6 datagram fragment.
1148 * It has to be a multiple of 8.
1149 */
1150 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1151 sizeof(struct frag_hdr)) & ~7;
1152 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1153 ipv6_select_ident(&fhdr, rt);
1154 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1155 __skb_queue_tail(&sk->sk_write_queue, skb);
1156
1157 return 0;
1158 }
1159 /* There is not enough support to do UDP LSO,
1160 * so follow the normal path
1161 */
1162 kfree_skb(skb);
1163
1164 return err;
1165 }
1166
1167 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1168 gfp_t gfp)
1169 {
1170 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1171 }
1172
1173 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1174 gfp_t gfp)
1175 {
1176 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1177 }
1178
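/* Recompute the usable MTU and maximum fragment length once the first
 * fragment has been queued: for non-tunnel routes the header_len reserve
 * only applies to the first fragment.
 */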
1179 static void ip6_append_data_mtu(int *mtu,
1180 int *maxfraglen,
1181 unsigned int fragheaderlen,
1182 struct sk_buff *skb,
1183 struct rt6_info *rt)
1184 {
1185 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1186 if (skb == NULL) {
1187 /* first fragment, reserve header_len */
1188 *mtu = *mtu - rt->dst.header_len;
1189
1190 } else {
1191 /*
1192 * this fragment is not first, the headers
1193 * space is regarded as data space.
1194 */
1195 *mtu = dst_mtu(rt->dst.path);
1196 }
1197 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1198 + fragheaderlen - sizeof(struct frag_hdr);
1199 }
1200 }
1201
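/* Queue data on the socket's write queue, building packets of at most
 * one MTU (minus the fragment header reserve) as it goes. The first call
 * on an empty queue sets up the cork state (options, route, MTU); the
 * queued packets are finally sent by ip6_push_pending_frames().
 */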
1202 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1203 int offset, int len, int odd, struct sk_buff *skb),
1204 void *from, int length, int transhdrlen,
1205 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1206 struct rt6_info *rt, unsigned int flags, int dontfrag)
1207 {
1208 struct inet_sock *inet = inet_sk(sk);
1209 struct ipv6_pinfo *np = inet6_sk(sk);
1210 struct inet_cork *cork;
1211 struct sk_buff *skb, *skb_prev = NULL;
1212 unsigned int maxfraglen, fragheaderlen;
1213 int exthdrlen;
1214 int dst_exthdrlen;
1215 int hh_len;
1216 int mtu;
1217 int copy;
1218 int err;
1219 int offset = 0;
1220 __u8 tx_flags = 0;
1221
1222 if (flags&MSG_PROBE)
1223 return 0;
1224 cork = &inet->cork.base;
1225 if (skb_queue_empty(&sk->sk_write_queue)) {
1226 /*
1227 * setup for corking
1228 */
1229 if (opt) {
1230 if (WARN_ON(np->cork.opt))
1231 return -EINVAL;
1232
1233 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1234 if (unlikely(np->cork.opt == NULL))
1235 return -ENOBUFS;
1236
1237 np->cork.opt->tot_len = opt->tot_len;
1238 np->cork.opt->opt_flen = opt->opt_flen;
1239 np->cork.opt->opt_nflen = opt->opt_nflen;
1240
1241 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1242 sk->sk_allocation);
1243 if (opt->dst0opt && !np->cork.opt->dst0opt)
1244 return -ENOBUFS;
1245
1246 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1247 sk->sk_allocation);
1248 if (opt->dst1opt && !np->cork.opt->dst1opt)
1249 return -ENOBUFS;
1250
1251 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1252 sk->sk_allocation);
1253 if (opt->hopopt && !np->cork.opt->hopopt)
1254 return -ENOBUFS;
1255
1256 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1257 sk->sk_allocation);
1258 if (opt->srcrt && !np->cork.opt->srcrt)
1259 return -ENOBUFS;
1260
1261 /* need source address above --miyazawa */
1262 }
1263 dst_hold(&rt->dst);
1264 cork->dst = &rt->dst;
1265 inet->cork.fl.u.ip6 = *fl6;
1266 np->cork.hop_limit = hlimit;
1267 np->cork.tclass = tclass;
1268 if (rt->dst.flags & DST_XFRM_TUNNEL)
1269 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1270 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1271 else
1272 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1273 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1274 if (np->frag_size < mtu) {
1275 if (np->frag_size)
1276 mtu = np->frag_size;
1277 }
1278 cork->fragsize = mtu;
1279 if (dst_allfrag(rt->dst.path))
1280 cork->flags |= IPCORK_ALLFRAG;
1281 cork->length = 0;
1282 sk->sk_sndmsg_page = NULL;
1283 sk->sk_sndmsg_off = 0;
1284 exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1285 length += exthdrlen;
1286 transhdrlen += exthdrlen;
1287 dst_exthdrlen = rt->dst.header_len;
1288 } else {
1289 rt = (struct rt6_info *)cork->dst;
1290 fl6 = &inet->cork.fl.u.ip6;
1291 opt = np->cork.opt;
1292 transhdrlen = 0;
1293 exthdrlen = 0;
1294 dst_exthdrlen = 0;
1295 mtu = cork->fragsize;
1296 }
1297
1298 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1299
1300 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1301 (opt ? opt->opt_nflen : 0);
1302 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1303
1304 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1305 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1306 ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1307 return -EMSGSIZE;
1308 }
1309 }
1310
1311 /* For UDP, check if TX timestamp is enabled */
1312 if (sk->sk_type == SOCK_DGRAM) {
1313 err = sock_tx_timestamp(sk, &tx_flags);
1314 if (err)
1315 goto error;
1316 }
1317
1318 /*
1319 * Let's try using as much space as possible.
1320 * Use MTU if total length of the message fits into the MTU.
1321 * Otherwise, we need to reserve fragment header and
1322 * fragment alignment (= 8-15 octets, in total).
1323 *
1324 * Note that we may need to "move" the data from the tail
1325 * of the buffer to the new fragment when we split
1326 * the message.
1327 *
1328 * FIXME: It may be fragmented into multiple chunks
1329 * at once if non-fragmentable extension headers
1330 * are too large.
1331 * --yoshfuji
1332 */
1333
1334 cork->length += length;
1335 if (length > mtu) {
1336 int proto = sk->sk_protocol;
1337 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
1338 ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1339 return -EMSGSIZE;
1340 }
1341
1342 if (proto == IPPROTO_UDP &&
1343 (rt->dst.dev->features & NETIF_F_UFO)) {
1344
1345 err = ip6_ufo_append_data(sk, getfrag, from, length,
1346 hh_len, fragheaderlen,
1347 transhdrlen, mtu, flags, rt);
1348 if (err)
1349 goto error;
1350 return 0;
1351 }
1352 }
1353
1354 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1355 goto alloc_new_skb;
1356
1357 while (length > 0) {
1358 /* Check if the remaining data fits into current packet. */
1359 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1360 if (copy < length)
1361 copy = maxfraglen - skb->len;
1362
1363 if (copy <= 0) {
1364 char *data;
1365 unsigned int datalen;
1366 unsigned int fraglen;
1367 unsigned int fraggap;
1368 unsigned int alloclen;
1369 alloc_new_skb:
1370 /* There's no room in the current skb */
1371 if (skb)
1372 fraggap = skb->len - maxfraglen;
1373 else
1374 fraggap = 0;
1375 /* update mtu and maxfraglen if necessary */
1376 if (skb == NULL || skb_prev == NULL)
1377 ip6_append_data_mtu(&mtu, &maxfraglen,
1378 fragheaderlen, skb, rt);
1379
1380 skb_prev = skb;
1381
1382 /*
1383 * If remaining data exceeds the mtu,
1384 * we know we need more fragment(s).
1385 */
1386 datalen = length + fraggap;
1387
1388 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1389 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1390 if ((flags & MSG_MORE) &&
1391 !(rt->dst.dev->features&NETIF_F_SG))
1392 alloclen = mtu;
1393 else
1394 alloclen = datalen + fragheaderlen;
1395
1396 alloclen += dst_exthdrlen;
1397
1398 if (datalen != length + fraggap) {
1399 /*
1400 * this is not the last fragment, the trailer
1401 * space is regarded as data space.
1402 */
1403 datalen += rt->dst.trailer_len;
1404 }
1405
1406 alloclen += rt->dst.trailer_len;
1407 fraglen = datalen + fragheaderlen;
1408
1409 /*
1410 * We just reserve space for fragment header.
1411 * Note: this may be overallocation if the message
1412 * (without MSG_MORE) fits into the MTU.
1413 */
1414 alloclen += sizeof(struct frag_hdr);
1415
1416 if (transhdrlen) {
1417 skb = sock_alloc_send_skb(sk,
1418 alloclen + hh_len,
1419 (flags & MSG_DONTWAIT), &err);
1420 } else {
1421 skb = NULL;
1422 if (atomic_read(&sk->sk_wmem_alloc) <=
1423 2 * sk->sk_sndbuf)
1424 skb = sock_wmalloc(sk,
1425 alloclen + hh_len, 1,
1426 sk->sk_allocation);
1427 if (unlikely(skb == NULL))
1428 err = -ENOBUFS;
1429 else {
1430 /* Only the initial fragment
1431 * is time stamped.
1432 */
1433 tx_flags = 0;
1434 }
1435 }
1436 if (skb == NULL)
1437 goto error;
1438 /*
1439 * Fill in the control structures
1440 */
1441 skb->ip_summed = CHECKSUM_NONE;
1442 skb->csum = 0;
1443 /* reserve for fragmentation and ipsec header */
1444 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1445 dst_exthdrlen);
1446
1447 if (sk->sk_type == SOCK_DGRAM)
1448 skb_shinfo(skb)->tx_flags = tx_flags;
1449
1450 /*
1451 * Find where to start putting bytes
1452 */
1453 data = skb_put(skb, fraglen);
1454 skb_set_network_header(skb, exthdrlen);
1455 data += fragheaderlen;
1456 skb->transport_header = (skb->network_header +
1457 fragheaderlen);
1458 if (fraggap) {
1459 skb->csum = skb_copy_and_csum_bits(
1460 skb_prev, maxfraglen,
1461 data + transhdrlen, fraggap, 0);
1462 skb_prev->csum = csum_sub(skb_prev->csum,
1463 skb->csum);
1464 data += fraggap;
1465 pskb_trim_unique(skb_prev, maxfraglen);
1466 }
1467 copy = datalen - transhdrlen - fraggap;
1468
1469 if (copy < 0) {
1470 err = -EINVAL;
1471 kfree_skb(skb);
1472 goto error;
1473 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1474 err = -EFAULT;
1475 kfree_skb(skb);
1476 goto error;
1477 }
1478
1479 offset += copy;
1480 length -= datalen - fraggap;
1481 transhdrlen = 0;
1482 exthdrlen = 0;
1483 dst_exthdrlen = 0;
1484
1485 /*
1486 * Put the packet on the pending queue
1487 */
1488 __skb_queue_tail(&sk->sk_write_queue, skb);
1489 continue;
1490 }
1491
1492 if (copy > length)
1493 copy = length;
1494
1495 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1496 unsigned int off;
1497
1498 off = skb->len;
1499 if (getfrag(from, skb_put(skb, copy),
1500 offset, copy, off, skb) < 0) {
1501 __skb_trim(skb, off);
1502 err = -EFAULT;
1503 goto error;
1504 }
1505 } else {
1506 int i = skb_shinfo(skb)->nr_frags;
1507 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1508 struct page *page = sk->sk_sndmsg_page;
1509 int off = sk->sk_sndmsg_off;
1510 unsigned int left;
1511
1512 if (page && (left = PAGE_SIZE - off) > 0) {
1513 if (copy >= left)
1514 copy = left;
1515 if (page != skb_frag_page(frag)) {
1516 if (i == MAX_SKB_FRAGS) {
1517 err = -EMSGSIZE;
1518 goto error;
1519 }
1520 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1521 skb_frag_ref(skb, i);
1522 frag = &skb_shinfo(skb)->frags[i];
1523 }
1524 } else if (i < MAX_SKB_FRAGS) {
1525 if (copy > PAGE_SIZE)
1526 copy = PAGE_SIZE;
1527 page = alloc_pages(sk->sk_allocation, 0);
1528 if (page == NULL) {
1529 err = -ENOMEM;
1530 goto error;
1531 }
1532 sk->sk_sndmsg_page = page;
1533 sk->sk_sndmsg_off = 0;
1534
1535 skb_fill_page_desc(skb, i, page, 0, 0);
1536 frag = &skb_shinfo(skb)->frags[i];
1537 } else {
1538 err = -EMSGSIZE;
1539 goto error;
1540 }
1541 if (getfrag(from,
1542 skb_frag_address(frag) + skb_frag_size(frag),
1543 offset, copy, skb->len, skb) < 0) {
1544 err = -EFAULT;
1545 goto error;
1546 }
1547 sk->sk_sndmsg_off += copy;
1548 skb_frag_size_add(frag, copy);
1549 skb->len += copy;
1550 skb->data_len += copy;
1551 skb->truesize += copy;
1552 atomic_add(copy, &sk->sk_wmem_alloc);
1553 }
1554 offset += copy;
1555 length -= copy;
1556 }
1557 return 0;
1558 error:
1559 cork->length -= length;
1560 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1561 return err;
1562 }
1563 EXPORT_SYMBOL_GPL(ip6_append_data);
1564
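/* Free the duplicated cork options and drop the cork's route reference. */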
1565 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1566 {
1567 if (np->cork.opt) {
1568 kfree(np->cork.opt->dst0opt);
1569 kfree(np->cork.opt->dst1opt);
1570 kfree(np->cork.opt->hopopt);
1571 kfree(np->cork.opt->srcrt);
1572 kfree(np->cork.opt);
1573 np->cork.opt = NULL;
1574 }
1575
1576 if (inet->cork.base.dst) {
1577 dst_release(inet->cork.base.dst);
1578 inet->cork.base.dst = NULL;
1579 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1580 }
1581 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1582 }
1583
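/* Collapse the socket's write queue into one skb chain, prepend the IPv6
 * header (and any corked extension headers) and hand the result to
 * ip6_local_out().
 */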
1584 int ip6_push_pending_frames(struct sock *sk)
1585 {
1586 struct sk_buff *skb, *tmp_skb;
1587 struct sk_buff **tail_skb;
1588 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1589 struct inet_sock *inet = inet_sk(sk);
1590 struct ipv6_pinfo *np = inet6_sk(sk);
1591 struct net *net = sock_net(sk);
1592 struct ipv6hdr *hdr;
1593 struct ipv6_txoptions *opt = np->cork.opt;
1594 struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1595 struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1596 unsigned char proto = fl6->flowi6_proto;
1597 int err = 0;
1598
1599 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1600 goto out;
1601 tail_skb = &(skb_shinfo(skb)->frag_list);
1602
1603 /* move skb->data to ip header from ext header */
1604 if (skb->data < skb_network_header(skb))
1605 __skb_pull(skb, skb_network_offset(skb));
1606 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1607 __skb_pull(tmp_skb, skb_network_header_len(skb));
1608 *tail_skb = tmp_skb;
1609 tail_skb = &(tmp_skb->next);
1610 skb->len += tmp_skb->len;
1611 skb->data_len += tmp_skb->len;
1612 skb->truesize += tmp_skb->truesize;
1613 tmp_skb->destructor = NULL;
1614 tmp_skb->sk = NULL;
1615 }
1616
1617 /* Allow local fragmentation. */
1618 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1619 skb->local_df = 1;
1620
1621 *final_dst = fl6->daddr;
1622 __skb_pull(skb, skb_network_header_len(skb));
1623 if (opt && opt->opt_flen)
1624 ipv6_push_frag_opts(skb, opt, &proto);
1625 if (opt && opt->opt_nflen)
1626 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1627
1628 skb_push(skb, sizeof(struct ipv6hdr));
1629 skb_reset_network_header(skb);
1630 hdr = ipv6_hdr(skb);
1631
1632 *(__be32*)hdr = fl6->flowlabel |
1633 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1634
1635 hdr->hop_limit = np->cork.hop_limit;
1636 hdr->nexthdr = proto;
1637 hdr->saddr = fl6->saddr;
1638 hdr->daddr = *final_dst;
1639
1640 skb->priority = sk->sk_priority;
1641 skb->mark = sk->sk_mark;
1642
1643 skb_dst_set(skb, dst_clone(&rt->dst));
1644 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1645 if (proto == IPPROTO_ICMPV6) {
1646 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1647
1648 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1649 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1650 }
1651
1652 err = ip6_local_out(skb);
1653 if (err) {
1654 if (err > 0)
1655 err = net_xmit_errno(err);
1656 if (err)
1657 goto error;
1658 }
1659
1660 out:
1661 ip6_cork_release(inet, np);
1662 return err;
1663 error:
1664 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1665 goto out;
1666 }
1667 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1668
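/* Discard everything queued by ip6_append_data() and release the cork. */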
1669 void ip6_flush_pending_frames(struct sock *sk)
1670 {
1671 struct sk_buff *skb;
1672
1673 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1674 if (skb_dst(skb))
1675 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1676 IPSTATS_MIB_OUTDISCARDS);
1677 kfree_skb(skb);
1678 }
1679
1680 ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1681 }
1682 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);