1 /*
2 * IPv6 output functions
3 * Linux INET6 implementation
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * Based on linux/net/ipv4/ip_output.c
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * Changes:
16 * A.N.Kuznetsov : arithmetic in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
20 * etc.
21 *
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
26 * for datagram xmit
27 */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63 int len;
64
65 len = skb->len - sizeof(struct ipv6hdr);
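/* A payload larger than IPV6_MAXPLEN (65535) cannot be expressed in the
 * 16-bit payload_len field; leave it as 0, which is presumably the
 * jumbogram convention (the real length then travels in a Jumbo Payload
 * hop-by-hop option added elsewhere).
 */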
66 if (len > IPV6_MAXPLEN)
67 len = 0;
68 ipv6_hdr(skb)->payload_len = htons(len);
69
70 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 skb_dst(skb)->dev, dst_output);
72 }
73
74 int ip6_local_out(struct sk_buff *skb)
75 {
76 int err;
77
78 err = __ip6_local_out(skb);
79 if (likely(err == 1))
80 err = dst_output(skb);
81
82 return err;
83 }
84 EXPORT_SYMBOL_GPL(ip6_local_out);
85
86 static int ip6_finish_output2(struct sk_buff *skb)
87 {
88 struct dst_entry *dst = skb_dst(skb);
89 struct net_device *dev = dst->dev;
90 struct neighbour *neigh;
91
92 skb->protocol = htons(ETH_P_IPV6);
93 skb->dev = dev;
94
95 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
96 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
97
98 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
99 ((mroute6_socket(dev_net(dev), skb) &&
100 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
101 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
102 &ipv6_hdr(skb)->saddr))) {
103 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
104
105 /* Do not check for IFF_ALLMULTI; multicast routing
106 is not supported in any case.
107 */
108 if (newskb)
109 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
110 newskb, NULL, newskb->dev,
111 dev_loopback_xmit);
112
113 if (ipv6_hdr(skb)->hop_limit == 0) {
114 IP6_INC_STATS(dev_net(dev), idev,
115 IPSTATS_MIB_OUTDISCARDS);
116 kfree_skb(skb);
117 return 0;
118 }
119 }
120
121 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
122 skb->len);
123 }
124
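/* Hand the packet to the cached neighbour entry for this dst.
 * dst_neigh_output() is expected to take care of the (delayed)
 * neighbour reachability confirmation and of queueing the frame on
 * the device; without a neighbour entry we have nowhere to send it.
 */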
125 rcu_read_lock();
126 neigh = dst_get_neighbour_noref(dst);
127 if (neigh) {
128 int res = dst_neigh_output(dst, neigh, skb);
129
130 rcu_read_unlock();
131 return res;
132 }
133 rcu_read_unlock();
134 IP6_INC_STATS_BH(dev_net(dst->dev),
135 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
136 kfree_skb(skb);
137 return -EINVAL;
138 }
139
140 static int ip6_finish_output(struct sk_buff *skb)
141 {
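/* Fragment when the packet exceeds the path MTU and is not GSO
 * (GSO packets are segmented later), or when dst_allfrag() is set,
 * i.e. (roughly) the reported path MTU was below IPV6_MIN_MTU, so
 * every packet to this destination carries a fragment header.
 */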
142 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
143 dst_allfrag(skb_dst(skb)))
144 return ip6_fragment(skb, ip6_finish_output2);
145 else
146 return ip6_finish_output2(skb);
147 }
148
149 int ip6_output(struct sk_buff *skb)
150 {
151 struct net_device *dev = skb_dst(skb)->dev;
152 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
153 if (unlikely(idev->cnf.disable_ipv6)) {
154 IP6_INC_STATS(dev_net(dev), idev,
155 IPSTATS_MIB_OUTDISCARDS);
156 kfree_skb(skb);
157 return 0;
158 }
159
160 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
161 ip6_finish_output,
162 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
163 }
164
165 /*
166 * xmit an sk_buff (used by TCP, SCTP and DCCP)
167 */
168
169 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
170 struct ipv6_txoptions *opt, int tclass)
171 {
172 struct net *net = sock_net(sk);
173 struct ipv6_pinfo *np = inet6_sk(sk);
174 struct in6_addr *first_hop = &fl6->daddr;
175 struct dst_entry *dst = skb_dst(skb);
176 struct ipv6hdr *hdr;
177 u8 proto = fl6->flowi6_proto;
178 int seg_len = skb->len;
179 int hlimit = -1;
180 u32 mtu;
181
182 if (opt) {
183 unsigned int head_room;
184
185 /* First: exthdrs may take lots of space (~8K for now)
186 MAX_HEADER is not enough.
187 */
188 head_room = opt->opt_nflen + opt->opt_flen;
189 seg_len += head_room;
190 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
191
192 if (skb_headroom(skb) < head_room) {
193 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
194 if (skb2 == NULL) {
195 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
196 IPSTATS_MIB_OUTDISCARDS);
197 kfree_skb(skb);
198 return -ENOBUFS;
199 }
200 consume_skb(skb);
201 skb = skb2;
202 skb_set_owner_w(skb, sk);
203 }
204 if (opt->opt_flen)
205 ipv6_push_frag_opts(skb, opt, &proto);
206 if (opt->opt_nflen)
207 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
208 }
209
210 skb_push(skb, sizeof(struct ipv6hdr));
211 skb_reset_network_header(skb);
212 hdr = ipv6_hdr(skb);
213
214 /*
215 * Fill in the IPv6 header
216 */
217 if (np)
218 hlimit = np->hop_limit;
219 if (hlimit < 0)
220 hlimit = ip6_dst_hoplimit(dst);
221
222 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
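/* First 32 bits of the IPv6 header on the wire:
 * version (4 bits) = 6, traffic class (8 bits), flow label (20 bits).
 * E.g. tclass 0 and flow label 0 yield 0x60000000; fl6->flowlabel is
 * already in network byte order, so it is OR-ed in after htonl().
 */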
223
224 hdr->payload_len = htons(seg_len);
225 hdr->nexthdr = proto;
226 hdr->hop_limit = hlimit;
227
228 hdr->saddr = fl6->saddr;
229 hdr->daddr = *first_hop;
230
231 skb->priority = sk->sk_priority;
232 skb->mark = sk->sk_mark;
233
234 mtu = dst_mtu(dst);
235 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
236 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
237 IPSTATS_MIB_OUT, skb->len);
238 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
239 dst->dev, dst_output);
240 }
241
242 net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
243 skb->dev = dst->dev;
244 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
245 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
246 kfree_skb(skb);
247 return -EMSGSIZE;
248 }
249
250 EXPORT_SYMBOL(ip6_xmit);
251
252 /*
253 * To avoid extra problems ND packets are sent through this
254 * routine. It's code duplication, but I really want to avoid
255 * extra checks, since ipv6_build_header is used by TCP (which
256 * is performance critical for us).
257 */
258
259 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
260 const struct in6_addr *saddr, const struct in6_addr *daddr,
261 int proto, int len)
262 {
263 struct ipv6_pinfo *np = inet6_sk(sk);
264 struct ipv6hdr *hdr;
265
266 skb->protocol = htons(ETH_P_IPV6);
267 skb->dev = dev;
268
269 skb_reset_network_header(skb);
270 skb_put(skb, sizeof(struct ipv6hdr));
271 hdr = ipv6_hdr(skb);
272
273 *(__be32*)hdr = htonl(0x60000000);
274
275 hdr->payload_len = htons(len);
276 hdr->nexthdr = proto;
277 hdr->hop_limit = np->hop_limit;
278
279 hdr->saddr = *saddr;
280 hdr->daddr = *daddr;
281
282 return 0;
283 }
284
285 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
286 {
287 struct ip6_ra_chain *ra;
288 struct sock *last = NULL;
289
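/* Deliver the packet to every raw socket that registered for this
 * Router Alert value: every match except the last one gets a clone,
 * and the last match consumes the original skb (return 1).
 */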
290 read_lock(&ip6_ra_lock);
291 for (ra = ip6_ra_chain; ra; ra = ra->next) {
292 struct sock *sk = ra->sk;
293 if (sk && ra->sel == sel &&
294 (!sk->sk_bound_dev_if ||
295 sk->sk_bound_dev_if == skb->dev->ifindex)) {
296 if (last) {
297 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
298 if (skb2)
299 rawv6_rcv(last, skb2);
300 }
301 last = sk;
302 }
303 }
304
305 if (last) {
306 rawv6_rcv(last, skb);
307 read_unlock(&ip6_ra_lock);
308 return 1;
309 }
310 read_unlock(&ip6_ra_lock);
311 return 0;
312 }
313
314 static int ip6_forward_proxy_check(struct sk_buff *skb)
315 {
316 struct ipv6hdr *hdr = ipv6_hdr(skb);
317 u8 nexthdr = hdr->nexthdr;
318 __be16 frag_off;
319 int offset;
320
321 if (ipv6_ext_hdr(nexthdr)) {
322 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
323 if (offset < 0)
324 return 0;
325 } else
326 offset = sizeof(struct ipv6hdr);
327
328 if (nexthdr == IPPROTO_ICMPV6) {
329 struct icmp6hdr *icmp6;
330
331 if (!pskb_may_pull(skb, (skb_network_header(skb) +
332 offset + 1 - skb->data)))
333 return 0;
334
335 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
336
337 switch (icmp6->icmp6_type) {
338 case NDISC_ROUTER_SOLICITATION:
339 case NDISC_ROUTER_ADVERTISEMENT:
340 case NDISC_NEIGHBOUR_SOLICITATION:
341 case NDISC_NEIGHBOUR_ADVERTISEMENT:
342 case NDISC_REDIRECT:
343 /* Unicast neighbour discovery messages destined to the
344 * proxied address are passed to the input function so they
345 * can be handled locally.
346 */
347 return 1;
348 default:
349 break;
350 }
351 }
352
353 /*
354 * The proxying router can't forward traffic sent to a link-local
355 * address, so signal the sender and discard the packet. This
356 * behavior is clarified by the MIPv6 specification.
357 */
358 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
359 dst_link_failure(skb);
360 return -1;
361 }
362
363 return 0;
364 }
365
366 static inline int ip6_forward_finish(struct sk_buff *skb)
367 {
368 return dst_output(skb);
369 }
370
371 int ip6_forward(struct sk_buff *skb)
372 {
373 struct dst_entry *dst = skb_dst(skb);
374 struct ipv6hdr *hdr = ipv6_hdr(skb);
375 struct inet6_skb_parm *opt = IP6CB(skb);
376 struct net *net = dev_net(dst->dev);
377 u32 mtu;
378
379 if (net->ipv6.devconf_all->forwarding == 0)
380 goto error;
381
382 if (skb_warn_if_lro(skb))
383 goto drop;
384
385 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
386 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
387 goto drop;
388 }
389
390 if (skb->pkt_type != PACKET_HOST)
391 goto drop;
392
393 skb_forward_csum(skb);
394
395 /*
396 * We do not do any processing on
397 * RA packets, pushing them to user level AS IS
398 * without any warranty that the application will be able
399 * to interpret them. The reason is that we
400 * cannot do anything clever here.
401 *
402 * We are not the end node, so if the packet contains
403 * AH/ESP we cannot do anything with it.
404 * Defragmentation would also be a mistake; RA packets
405 * cannot be fragmented, because there is no guarantee
406 * that different fragments will go along one path. --ANK
407 */
408 if (opt->ra) {
409 u8 *ptr = skb_network_header(skb) + opt->ra;
410 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
411 return 0;
412 }
413
414 /*
415 * check and decrement ttl
416 */
417 if (hdr->hop_limit <= 1) {
418 /* Force OUTPUT device used as source address */
419 skb->dev = dst->dev;
420 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
421 IP6_INC_STATS_BH(net,
422 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
423
424 kfree_skb(skb);
425 return -ETIMEDOUT;
426 }
427
428 /* XXX: idev->cnf.proxy_ndp? */
429 if (net->ipv6.devconf_all->proxy_ndp &&
430 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
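/* We are proxying neighbour discovery for this destination:
 * ND messages for the proxied address are delivered locally
 * (proxied > 0), link-local destinations are rejected
 * (proxied < 0), and everything else keeps being forwarded.
 */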
431 int proxied = ip6_forward_proxy_check(skb);
432 if (proxied > 0)
433 return ip6_input(skb);
434 else if (proxied < 0) {
435 IP6_INC_STATS(net, ip6_dst_idev(dst),
436 IPSTATS_MIB_INDISCARDS);
437 goto drop;
438 }
439 }
440
441 if (!xfrm6_route_forward(skb)) {
442 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
443 goto drop;
444 }
445 dst = skb_dst(skb);
446
447 /* The IPv6 specs say nothing about it, but it is clear that we cannot
448 send redirects for source-routed frames.
449 We also don't send redirects for frames decapsulated from IPsec.
450 */
451 if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
452 struct in6_addr *target = NULL;
453 struct inet_peer *peer;
454 struct rt6_info *rt;
455
456 /*
457 * the incoming and outgoing devices are the same,
458 * so send a redirect.
459 */
460
461 rt = (struct rt6_info *) dst;
462 if (rt->rt6i_flags & RTF_GATEWAY)
463 target = &rt->rt6i_gateway;
464 else
465 target = &hdr->daddr;
466
467 peer = rt6_get_peer_create(rt);
468
469 /* Limit redirects both by destination (here)
470 and by source (inside ndisc_send_redirect)
471 */
472 if (inet_peer_xrlim_allow(peer, 1*HZ))
473 ndisc_send_redirect(skb, target);
474 } else {
475 int addrtype = ipv6_addr_type(&hdr->saddr);
476
477 /* This check is security critical. */
478 if (addrtype == IPV6_ADDR_ANY ||
479 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
480 goto error;
481 if (addrtype & IPV6_ADDR_LINKLOCAL) {
482 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
483 ICMPV6_NOT_NEIGHBOUR, 0);
484 goto error;
485 }
486 }
487
488 mtu = dst_mtu(dst);
489 if (mtu < IPV6_MIN_MTU)
490 mtu = IPV6_MIN_MTU;
491
492 if (skb->len > mtu && !skb_is_gso(skb)) {
493 /* Again, force OUTPUT device used as source address */
494 skb->dev = dst->dev;
495 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
496 IP6_INC_STATS_BH(net,
497 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
498 IP6_INC_STATS_BH(net,
499 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
500 kfree_skb(skb);
501 return -EMSGSIZE;
502 }
503
504 if (skb_cow(skb, dst->dev->hard_header_len)) {
505 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
506 goto drop;
507 }
508
509 hdr = ipv6_hdr(skb);
510
511 /* Decrementing the hop limit is delayed until after the skb COW */
512
513 hdr->hop_limit--;
514
515 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
516 IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
517 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
518 ip6_forward_finish);
519
520 error:
521 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
522 drop:
523 kfree_skb(skb);
524 return -EINVAL;
525 }
526
527 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
528 {
529 to->pkt_type = from->pkt_type;
530 to->priority = from->priority;
531 to->protocol = from->protocol;
532 skb_dst_drop(to);
533 skb_dst_set(to, dst_clone(skb_dst(from)));
534 to->dev = from->dev;
535 to->mark = from->mark;
536
537 #ifdef CONFIG_NET_SCHED
538 to->tc_index = from->tc_index;
539 #endif
540 nf_copy(to, from);
541 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
542 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
543 to->nf_trace = from->nf_trace;
544 #endif
545 skb_copy_secmark(to, from);
546 }
547
548 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
549 {
550 u16 offset = sizeof(struct ipv6hdr);
551 struct ipv6_opt_hdr *exthdr =
552 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
553 unsigned int packet_len = skb->tail - skb->network_header;
554 int found_rhdr = 0;
555 *nexthdr = &ipv6_hdr(skb)->nexthdr;
556
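/* Walk the unfragmentable part of the packet: hop-by-hop and routing
 * headers (and destination options seen before a routing header, or
 * ones carrying a Home Address option with MIPv6) stay in front of
 * the fragment header. The returned offset is where the fragment
 * header gets inserted.
 */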
557 while (offset + 1 <= packet_len) {
558
559 switch (**nexthdr) {
560
561 case NEXTHDR_HOP:
562 break;
563 case NEXTHDR_ROUTING:
564 found_rhdr = 1;
565 break;
566 case NEXTHDR_DEST:
567 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
568 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
569 break;
570 #endif
571 if (found_rhdr)
572 return offset;
573 break;
574 default :
575 return offset;
576 }
577
578 offset += ipv6_optlen(exthdr);
579 *nexthdr = &exthdr->nexthdr;
580 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
581 offset);
582 }
583
584 return offset;
585 }
586
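/* Pick the 32-bit fragment identification: prefer a per-destination
 * counter kept in the inet_peer entry, and fall back to a global
 * atomic counter when no peer is available or DST_NOPEER is set,
 * skipping 0, which the fragmentation code treats as "not yet chosen".
 */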
587 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
588 {
589 static atomic_t ipv6_fragmentation_id;
590 int old, new;
591
592 if (rt && !(rt->dst.flags & DST_NOPEER)) {
593 struct inet_peer *peer = rt6_get_peer_create(rt);
594
595 if (peer) {
596 fhdr->identification = htonl(inet_getid(peer, 0));
597 return;
598 }
599 }
600 do {
601 old = atomic_read(&ipv6_fragmentation_id);
602 new = old + 1;
603 if (!new)
604 new = 1;
605 } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
606 fhdr->identification = htonl(new);
607 }
608
609 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
610 {
611 struct sk_buff *frag;
612 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
613 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
614 struct ipv6hdr *tmp_hdr;
615 struct frag_hdr *fh;
616 unsigned int mtu, hlen, left, len;
617 int hroom, troom;
618 __be32 frag_id = 0;
619 int ptr, offset = 0, err=0;
620 u8 *prevhdr, nexthdr = 0;
621 struct net *net = dev_net(skb_dst(skb)->dev);
622
623 hlen = ip6_find_1stfragopt(skb, &prevhdr);
624 nexthdr = *prevhdr;
625
626 mtu = ip6_skb_dst_mtu(skb);
627
628 /* We must not fragment if the socket is set to force MTU discovery
629 * or if the skb is not generated by a local socket.
630 */
631 if (unlikely(!skb->local_df && skb->len > mtu)) {
632 if (skb->sk && dst_allfrag(skb_dst(skb)))
633 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
634
635 skb->dev = skb_dst(skb)->dev;
636 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
637 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
638 IPSTATS_MIB_FRAGFAILS);
639 kfree_skb(skb);
640 return -EMSGSIZE;
641 }
642
643 if (np && np->frag_size < mtu) {
644 if (np->frag_size)
645 mtu = np->frag_size;
646 }
647 mtu -= hlen + sizeof(struct frag_hdr);
648
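/* Fast path: if the skb already carries a frag_list with suitably
 * sized and aligned members, reuse those buffers as the fragments and
 * only build a fragment header in front of each of them. Otherwise
 * fall through to the slow path below, which allocates a fresh skb
 * per fragment and copies the payload into it.
 */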
649 if (skb_has_frag_list(skb)) {
650 int first_len = skb_pagelen(skb);
651 struct sk_buff *frag2;
652
653 if (first_len - hlen > mtu ||
654 ((first_len - hlen) & 7) ||
655 skb_cloned(skb))
656 goto slow_path;
657
658 skb_walk_frags(skb, frag) {
659 /* Correct geometry. */
660 if (frag->len > mtu ||
661 ((frag->len & 7) && frag->next) ||
662 skb_headroom(frag) < hlen)
663 goto slow_path_clean;
664
665 /* Partially cloned skb? */
666 if (skb_shared(frag))
667 goto slow_path_clean;
668
669 BUG_ON(frag->sk);
670 if (skb->sk) {
671 frag->sk = skb->sk;
672 frag->destructor = sock_wfree;
673 }
674 skb->truesize -= frag->truesize;
675 }
676
677 err = 0;
678 offset = 0;
679 frag = skb_shinfo(skb)->frag_list;
680 skb_frag_list_init(skb);
681 /* BUILD HEADER */
682
683 *prevhdr = NEXTHDR_FRAGMENT;
684 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
685 if (!tmp_hdr) {
686 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
687 IPSTATS_MIB_FRAGFAILS);
688 return -ENOMEM;
689 }
690
691 __skb_pull(skb, hlen);
692 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
693 __skb_push(skb, hlen);
694 skb_reset_network_header(skb);
695 memcpy(skb_network_header(skb), tmp_hdr, hlen);
696
697 ipv6_select_ident(fh, rt);
698 fh->nexthdr = nexthdr;
699 fh->reserved = 0;
700 fh->frag_off = htons(IP6_MF);
701 frag_id = fh->identification;
702
703 first_len = skb_pagelen(skb);
704 skb->data_len = first_len - skb_headlen(skb);
705 skb->len = first_len;
706 ipv6_hdr(skb)->payload_len = htons(first_len -
707 sizeof(struct ipv6hdr));
708
709 dst_hold(&rt->dst);
710
711 for (;;) {
712 /* Prepare the header of the next frame
713 * before the previous one is sent. */
714 if (frag) {
715 frag->ip_summed = CHECKSUM_NONE;
716 skb_reset_transport_header(frag);
717 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
718 __skb_push(frag, hlen);
719 skb_reset_network_header(frag);
720 memcpy(skb_network_header(frag), tmp_hdr,
721 hlen);
722 offset += skb->len - hlen - sizeof(struct frag_hdr);
723 fh->nexthdr = nexthdr;
724 fh->reserved = 0;
725 fh->frag_off = htons(offset);
726 if (frag->next != NULL)
727 fh->frag_off |= htons(IP6_MF);
728 fh->identification = frag_id;
729 ipv6_hdr(frag)->payload_len =
730 htons(frag->len -
731 sizeof(struct ipv6hdr));
732 ip6_copy_metadata(frag, skb);
733 }
734
735 err = output(skb);
736 if(!err)
737 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
738 IPSTATS_MIB_FRAGCREATES);
739
740 if (err || !frag)
741 break;
742
743 skb = frag;
744 frag = skb->next;
745 skb->next = NULL;
746 }
747
748 kfree(tmp_hdr);
749
750 if (err == 0) {
751 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
752 IPSTATS_MIB_FRAGOKS);
753 dst_release(&rt->dst);
754 return 0;
755 }
756
757 while (frag) {
758 skb = frag->next;
759 kfree_skb(frag);
760 frag = skb;
761 }
762
763 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
764 IPSTATS_MIB_FRAGFAILS);
765 dst_release(&rt->dst);
766 return err;
767
768 slow_path_clean:
769 skb_walk_frags(skb, frag2) {
770 if (frag2 == frag)
771 break;
772 frag2->sk = NULL;
773 frag2->destructor = NULL;
774 skb->truesize += frag2->truesize;
775 }
776 }
777
778 slow_path:
779 if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
780 skb_checksum_help(skb))
781 goto fail;
782
783 left = skb->len - hlen; /* Space per frame */
784 ptr = hlen; /* Where to start from */
785
786 /*
787 * Fragment the datagram.
788 */
789
790 *prevhdr = NEXTHDR_FRAGMENT;
791 hroom = LL_RESERVED_SPACE(rt->dst.dev);
792 troom = rt->dst.dev->needed_tailroom;
793
794 /*
795 * Keep copying data until we run out.
796 */
797 while(left > 0) {
798 len = left;
799 /* IF: it doesn't fit, use 'mtu' - the data space left */
800 if (len > mtu)
801 len = mtu;
802 /* IF: we are not sending up to and including the packet end
803 then align the next start on an eight byte boundary */
804 if (len < left) {
805 len &= ~7;
806 }
807 /*
808 * Allocate buffer.
809 */
810
811 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
812 hroom + troom, GFP_ATOMIC)) == NULL) {
813 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
814 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
815 IPSTATS_MIB_FRAGFAILS);
816 err = -ENOMEM;
817 goto fail;
818 }
819
820 /*
821 * Set up data on packet
822 */
823
824 ip6_copy_metadata(frag, skb);
825 skb_reserve(frag, hroom);
826 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
827 skb_reset_network_header(frag);
828 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
829 frag->transport_header = (frag->network_header + hlen +
830 sizeof(struct frag_hdr));
831
832 /*
833 * Charge the memory for the fragment to any owner
834 * it might possess
835 */
836 if (skb->sk)
837 skb_set_owner_w(frag, skb->sk);
838
839 /*
840 * Copy the packet header into the new buffer.
841 */
842 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
843
844 /*
845 * Build fragment header.
846 */
847 fh->nexthdr = nexthdr;
848 fh->reserved = 0;
849 if (!frag_id) {
850 ipv6_select_ident(fh, rt);
851 frag_id = fh->identification;
852 } else
853 fh->identification = frag_id;
854
855 /*
856 * Copy a block of the IP datagram.
857 */
858 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
859 BUG();
860 left -= len;
861
862 fh->frag_off = htons(offset);
863 if (left > 0)
864 fh->frag_off |= htons(IP6_MF);
865 ipv6_hdr(frag)->payload_len = htons(frag->len -
866 sizeof(struct ipv6hdr));
867
868 ptr += len;
869 offset += len;
870
871 /*
872 * Put this fragment into the sending queue.
873 */
874 err = output(frag);
875 if (err)
876 goto fail;
877
878 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
879 IPSTATS_MIB_FRAGCREATES);
880 }
881 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
882 IPSTATS_MIB_FRAGOKS);
883 consume_skb(skb);
884 return err;
885
886 fail:
887 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
888 IPSTATS_MIB_FRAGFAILS);
889 kfree_skb(skb);
890 return err;
891 }
892
893 static inline int ip6_rt_check(const struct rt6key *rt_key,
894 const struct in6_addr *fl_addr,
895 const struct in6_addr *addr_cache)
896 {
897 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
898 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
899 }
900
901 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
902 struct dst_entry *dst,
903 const struct flowi6 *fl6)
904 {
905 struct ipv6_pinfo *np = inet6_sk(sk);
906 struct rt6_info *rt = (struct rt6_info *)dst;
907
908 if (!dst)
909 goto out;
910
911 /* Yes, checking route validity in the not-connected
912 * case is not very simple. Take into account
913 * that we do not support routing by source, TOS,
914 * and MSG_DONTROUTE --ANK (980726)
915 *
916 * 1. ip6_rt_check(): If the route was a host route,
917 * check that the cached destination is current.
918 * If it is a network route, we can still
919 * check its validity using a saved pointer
920 * to the last used address: daddr_cache.
921 * We do not want to save the whole address now
922 * (because the main consumer of this service
923 * is TCP, which does not have this problem),
924 * so this last trick works only on connected
925 * sockets.
926 * 2. The oif should also be the same.
927 */
928 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
929 #ifdef CONFIG_IPV6_SUBTREES
930 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
931 #endif
932 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
933 dst_release(dst);
934 dst = NULL;
935 }
936
937 out:
938 return dst;
939 }
940
941 static int ip6_dst_lookup_tail(struct sock *sk,
942 struct dst_entry **dst, struct flowi6 *fl6)
943 {
944 struct net *net = sock_net(sk);
945 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
946 struct neighbour *n;
947 #endif
948 int err;
949
950 if (*dst == NULL)
951 *dst = ip6_route_output(net, sk, fl6);
952
953 if ((err = (*dst)->error))
954 goto out_err_release;
955
956 if (ipv6_addr_any(&fl6->saddr)) {
957 struct rt6_info *rt = (struct rt6_info *) *dst;
958 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
959 sk ? inet6_sk(sk)->srcprefs : 0,
960 &fl6->saddr);
961 if (err)
962 goto out_err_release;
963 }
964
965 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
966 /*
967 * If the dst entry we've looked up
968 * has a neighbour entry that is not yet valid
969 * (e.g. INCOMPLETE) and the src address from the flow is
970 * marked as OPTIMISTIC, we release the found
971 * dst entry and replace it with the
972 * dst entry of the next-hop router
973 */
974 rcu_read_lock();
975 n = dst_get_neighbour_noref(*dst);
976 if (n && !(n->nud_state & NUD_VALID)) {
977 struct inet6_ifaddr *ifp;
978 struct flowi6 fl_gw6;
979 int redirect;
980
981 rcu_read_unlock();
982 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
983 (*dst)->dev, 1);
984
985 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
986 if (ifp)
987 in6_ifa_put(ifp);
988
989 if (redirect) {
990 /*
991 * We need to get the dst entry for the
992 * default router instead
993 */
994 dst_release(*dst);
995 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
996 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
997 *dst = ip6_route_output(net, sk, &fl_gw6);
998 if ((err = (*dst)->error))
999 goto out_err_release;
1000 }
1001 } else {
1002 rcu_read_unlock();
1003 }
1004 #endif
1005
1006 return 0;
1007
1008 out_err_release:
1009 if (err == -ENETUNREACH)
1010 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1011 dst_release(*dst);
1012 *dst = NULL;
1013 return err;
1014 }
1015
1016 /**
1017 * ip6_dst_lookup - perform route lookup on flow
1018 * @sk: socket which provides route info
1019 * @dst: pointer to dst_entry * for result
1020 * @fl6: flow to lookup
1021 *
1022 * This function performs a route lookup on the given flow.
1023 *
1024 * It returns zero on success, or a standard errno code on error.
1025 */
1026 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1027 {
1028 *dst = NULL;
1029 return ip6_dst_lookup_tail(sk, dst, fl6);
1030 }
1031 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1032
1033 /**
1034 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1035 * @sk: socket which provides route info
1036 * @fl6: flow to lookup
1037 * @final_dst: final destination address for ipsec lookup
1038 * @can_sleep: we are in a sleepable context
1039 *
1040 * This function performs a route lookup on the given flow.
1041 *
1042 * It returns a valid dst pointer on success, or a pointer encoded
1043 * error code.
1044 */
1045 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1046 const struct in6_addr *final_dst,
1047 bool can_sleep)
1048 {
1049 struct dst_entry *dst = NULL;
1050 int err;
1051
1052 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1053 if (err)
1054 return ERR_PTR(err);
1055 if (final_dst)
1056 fl6->daddr = *final_dst;
1057 if (can_sleep)
1058 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1059
1060 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1061 }
1062 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1063
1064 /**
1065 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1066 * @sk: socket which provides the dst cache and route info
1067 * @fl6: flow to lookup
1068 * @final_dst: final destination address for ipsec lookup
1069 * @can_sleep: we are in a sleepable context
1070 *
1071 * This function performs a route lookup on the given flow with the
1072 * possibility of using the cached route in the socket if it is valid.
1073 * It will take the socket dst lock when operating on the dst cache.
1074 * As a result, this function can only be used in process context.
1075 *
1076 * It returns a valid dst pointer on success, or a pointer encoded
1077 * error code.
1078 */
1079 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1080 const struct in6_addr *final_dst,
1081 bool can_sleep)
1082 {
1083 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1084 int err;
1085
1086 dst = ip6_sk_dst_check(sk, dst, fl6);
1087
1088 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1089 if (err)
1090 return ERR_PTR(err);
1091 if (final_dst)
1092 fl6->daddr = *final_dst;
1093 if (can_sleep)
1094 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1095
1096 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1097 }
1098 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1099
1100 static inline int ip6_ufo_append_data(struct sock *sk,
1101 int getfrag(void *from, char *to, int offset, int len,
1102 int odd, struct sk_buff *skb),
1103 void *from, int length, int hh_len, int fragheaderlen,
1104 int transhdrlen, int mtu,unsigned int flags,
1105 struct rt6_info *rt)
1106
1107 {
1108 struct sk_buff *skb;
1109 int err;
1110
1111 /* The network device supports UDP large send offload, so
1112 * create one single skb packet containing the complete
1113 * UDP datagram.
1114 */
1115 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1116 skb = sock_alloc_send_skb(sk,
1117 hh_len + fragheaderlen + transhdrlen + 20,
1118 (flags & MSG_DONTWAIT), &err);
1119 if (skb == NULL)
1120 return err;
1121
1122 /* reserve space for Hardware header */
1123 skb_reserve(skb, hh_len);
1124
1125 /* create space for UDP/IP header */
1126 skb_put(skb,fragheaderlen + transhdrlen);
1127
1128 /* initialize network header pointer */
1129 skb_reset_network_header(skb);
1130
1131 /* initialize protocol header pointer */
1132 skb->transport_header = skb->network_header + fragheaderlen;
1133
1134 skb->ip_summed = CHECKSUM_PARTIAL;
1135 skb->csum = 0;
1136 }
1137
1138 err = skb_append_datato_frags(sk,skb, getfrag, from,
1139 (length - transhdrlen));
1140 if (!err) {
1141 struct frag_hdr fhdr;
1142
1143 /* Specify the length of each IPv6 datagram fragment.
1144 * It has to be a multiple of 8.
1145 */
1146 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1147 sizeof(struct frag_hdr)) & ~7;
1148 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1149 ipv6_select_ident(&fhdr, rt);
1150 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1151 __skb_queue_tail(&sk->sk_write_queue, skb);
1152
1153 return 0;
1154 }
1155 /* There is not enough support to do UDP LSO,
1156 * so follow the normal path
1157 */
1158 kfree_skb(skb);
1159
1160 return err;
1161 }
1162
1163 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1164 gfp_t gfp)
1165 {
1166 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1167 }
1168
1169 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1170 gfp_t gfp)
1171 {
1172 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1173 }
1174
1175 static void ip6_append_data_mtu(int *mtu,
1176 int *maxfraglen,
1177 unsigned int fragheaderlen,
1178 struct sk_buff *skb,
1179 struct rt6_info *rt)
1180 {
1181 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1182 if (skb == NULL) {
1183 /* first fragment, reserve header_len */
1184 *mtu = *mtu - rt->dst.header_len;
1185
1186 } else {
1187 /*
1188 * this fragment is not the first; the header
1189 * space is regarded as data space.
1190 */
1191 *mtu = dst_mtu(rt->dst.path);
1192 }
1193 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1194 + fragheaderlen - sizeof(struct frag_hdr);
1195 }
1196 }
1197
1198 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1199 int offset, int len, int odd, struct sk_buff *skb),
1200 void *from, int length, int transhdrlen,
1201 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1202 struct rt6_info *rt, unsigned int flags, int dontfrag)
1203 {
1204 struct inet_sock *inet = inet_sk(sk);
1205 struct ipv6_pinfo *np = inet6_sk(sk);
1206 struct inet_cork *cork;
1207 struct sk_buff *skb, *skb_prev = NULL;
1208 unsigned int maxfraglen, fragheaderlen;
1209 int exthdrlen;
1210 int dst_exthdrlen;
1211 int hh_len;
1212 int mtu;
1213 int copy;
1214 int err;
1215 int offset = 0;
1216 __u8 tx_flags = 0;
1217
1218 if (flags&MSG_PROBE)
1219 return 0;
1220 cork = &inet->cork.base;
1221 if (skb_queue_empty(&sk->sk_write_queue)) {
1222 /*
1223 * setup for corking
1224 */
1225 if (opt) {
1226 if (WARN_ON(np->cork.opt))
1227 return -EINVAL;
1228
1229 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1230 if (unlikely(np->cork.opt == NULL))
1231 return -ENOBUFS;
1232
1233 np->cork.opt->tot_len = opt->tot_len;
1234 np->cork.opt->opt_flen = opt->opt_flen;
1235 np->cork.opt->opt_nflen = opt->opt_nflen;
1236
1237 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1238 sk->sk_allocation);
1239 if (opt->dst0opt && !np->cork.opt->dst0opt)
1240 return -ENOBUFS;
1241
1242 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1243 sk->sk_allocation);
1244 if (opt->dst1opt && !np->cork.opt->dst1opt)
1245 return -ENOBUFS;
1246
1247 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1248 sk->sk_allocation);
1249 if (opt->hopopt && !np->cork.opt->hopopt)
1250 return -ENOBUFS;
1251
1252 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1253 sk->sk_allocation);
1254 if (opt->srcrt && !np->cork.opt->srcrt)
1255 return -ENOBUFS;
1256
1257 /* need source address above miyazawa*/
1258 }
1259 dst_hold(&rt->dst);
1260 cork->dst = &rt->dst;
1261 inet->cork.fl.u.ip6 = *fl6;
1262 np->cork.hop_limit = hlimit;
1263 np->cork.tclass = tclass;
1264 if (rt->dst.flags & DST_XFRM_TUNNEL)
1265 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1266 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1267 else
1268 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1269 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1270 if (np->frag_size < mtu) {
1271 if (np->frag_size)
1272 mtu = np->frag_size;
1273 }
1274 cork->fragsize = mtu;
1275 if (dst_allfrag(rt->dst.path))
1276 cork->flags |= IPCORK_ALLFRAG;
1277 cork->length = 0;
1278 sk->sk_sndmsg_page = NULL;
1279 sk->sk_sndmsg_off = 0;
1280 exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1281 length += exthdrlen;
1282 transhdrlen += exthdrlen;
1283 dst_exthdrlen = rt->dst.header_len;
1284 } else {
1285 rt = (struct rt6_info *)cork->dst;
1286 fl6 = &inet->cork.fl.u.ip6;
1287 opt = np->cork.opt;
1288 transhdrlen = 0;
1289 exthdrlen = 0;
1290 dst_exthdrlen = 0;
1291 mtu = cork->fragsize;
1292 }
1293
1294 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1295
1296 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1297 (opt ? opt->opt_nflen : 0);
1298 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
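/* maxfraglen is the largest packet (IPv6 header + payload) that still
 * leaves room for the 8-byte fragment header and keeps the fragmentable
 * part a multiple of 8. E.g. with mtu = 1500 and fragheaderlen = 40:
 * ((1500 - 40) & ~7) + 40 - 8 = 1488, i.e. 1448 bytes of data per
 * fragment (illustrative numbers only).
 */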
1299
1300 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1301 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1302 ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1303 return -EMSGSIZE;
1304 }
1305 }
1306
1307 /* For UDP, check if TX timestamp is enabled */
1308 if (sk->sk_type == SOCK_DGRAM) {
1309 err = sock_tx_timestamp(sk, &tx_flags);
1310 if (err)
1311 goto error;
1312 }
1313
1314 /*
1315 * Let's try using as much space as possible.
1316 * Use MTU if total length of the message fits into the MTU.
1317 * Otherwise, we need to reserve fragment header and
1318 * fragment alignment (= 8-15 octets, in total).
1319 *
1320 * Note that we may need to "move" the data from the tail
1321 * of the buffer to the new fragment when we split
1322 * the message.
1323 *
1324 * FIXME: It may be fragmented into multiple chunks
1325 * at once if non-fragmentable extension headers
1326 * are too large.
1327 * --yoshfuji
1328 */
1329
1330 cork->length += length;
1331 if (length > mtu) {
1332 int proto = sk->sk_protocol;
1333 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1334 ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1335 return -EMSGSIZE;
1336 }
1337
1338 if (proto == IPPROTO_UDP &&
1339 (rt->dst.dev->features & NETIF_F_UFO)) {
1340
1341 err = ip6_ufo_append_data(sk, getfrag, from, length,
1342 hh_len, fragheaderlen,
1343 transhdrlen, mtu, flags, rt);
1344 if (err)
1345 goto error;
1346 return 0;
1347 }
1348 }
1349
1350 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1351 goto alloc_new_skb;
1352
1353 while (length > 0) {
1354 /* Check if the remaining data fits into current packet. */
1355 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1356 if (copy < length)
1357 copy = maxfraglen - skb->len;
1358
1359 if (copy <= 0) {
1360 char *data;
1361 unsigned int datalen;
1362 unsigned int fraglen;
1363 unsigned int fraggap;
1364 unsigned int alloclen;
1365 alloc_new_skb:
1366 /* There's no room in the current skb */
1367 if (skb)
1368 fraggap = skb->len - maxfraglen;
1369 else
1370 fraggap = 0;
1371 /* update mtu and maxfraglen if necessary */
1372 if (skb == NULL || skb_prev == NULL)
1373 ip6_append_data_mtu(&mtu, &maxfraglen,
1374 fragheaderlen, skb, rt);
1375
1376 skb_prev = skb;
1377
1378 /*
1379 * If remaining data exceeds the mtu,
1380 * we know we need more fragment(s).
1381 */
1382 datalen = length + fraggap;
1383
1384 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1385 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1386 if ((flags & MSG_MORE) &&
1387 !(rt->dst.dev->features&NETIF_F_SG))
1388 alloclen = mtu;
1389 else
1390 alloclen = datalen + fragheaderlen;
1391
1392 alloclen += dst_exthdrlen;
1393
1394 if (datalen != length + fraggap) {
1395 /*
1396 * this is not the last fragment, the trailer
1397 * space is regarded as data space.
1398 */
1399 datalen += rt->dst.trailer_len;
1400 }
1401
1402 alloclen += rt->dst.trailer_len;
1403 fraglen = datalen + fragheaderlen;
1404
1405 /*
1406 * We just reserve space for fragment header.
1407 * Note: this may be overallocation if the message
1408 * (without MSG_MORE) fits into the MTU.
1409 */
1410 alloclen += sizeof(struct frag_hdr);
1411
1412 if (transhdrlen) {
1413 skb = sock_alloc_send_skb(sk,
1414 alloclen + hh_len,
1415 (flags & MSG_DONTWAIT), &err);
1416 } else {
1417 skb = NULL;
1418 if (atomic_read(&sk->sk_wmem_alloc) <=
1419 2 * sk->sk_sndbuf)
1420 skb = sock_wmalloc(sk,
1421 alloclen + hh_len, 1,
1422 sk->sk_allocation);
1423 if (unlikely(skb == NULL))
1424 err = -ENOBUFS;
1425 else {
1426 /* Only the initial fragment
1427 * is time stamped.
1428 */
1429 tx_flags = 0;
1430 }
1431 }
1432 if (skb == NULL)
1433 goto error;
1434 /*
1435 * Fill in the control structures
1436 */
1437 skb->ip_summed = CHECKSUM_NONE;
1438 skb->csum = 0;
1439 /* reserve for fragmentation and ipsec header */
1440 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1441 dst_exthdrlen);
1442
1443 if (sk->sk_type == SOCK_DGRAM)
1444 skb_shinfo(skb)->tx_flags = tx_flags;
1445
1446 /*
1447 * Find where to start putting bytes
1448 */
1449 data = skb_put(skb, fraglen);
1450 skb_set_network_header(skb, exthdrlen);
1451 data += fragheaderlen;
1452 skb->transport_header = (skb->network_header +
1453 fragheaderlen);
1454 if (fraggap) {
1455 skb->csum = skb_copy_and_csum_bits(
1456 skb_prev, maxfraglen,
1457 data + transhdrlen, fraggap, 0);
1458 skb_prev->csum = csum_sub(skb_prev->csum,
1459 skb->csum);
1460 data += fraggap;
1461 pskb_trim_unique(skb_prev, maxfraglen);
1462 }
1463 copy = datalen - transhdrlen - fraggap;
1464
1465 if (copy < 0) {
1466 err = -EINVAL;
1467 kfree_skb(skb);
1468 goto error;
1469 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1470 err = -EFAULT;
1471 kfree_skb(skb);
1472 goto error;
1473 }
1474
1475 offset += copy;
1476 length -= datalen - fraggap;
1477 transhdrlen = 0;
1478 exthdrlen = 0;
1479 dst_exthdrlen = 0;
1480
1481 /*
1482 * Put the packet on the pending queue
1483 */
1484 __skb_queue_tail(&sk->sk_write_queue, skb);
1485 continue;
1486 }
1487
1488 if (copy > length)
1489 copy = length;
1490
1491 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1492 unsigned int off;
1493
1494 off = skb->len;
1495 if (getfrag(from, skb_put(skb, copy),
1496 offset, copy, off, skb) < 0) {
1497 __skb_trim(skb, off);
1498 err = -EFAULT;
1499 goto error;
1500 }
1501 } else {
1502 int i = skb_shinfo(skb)->nr_frags;
1503 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1504 struct page *page = sk->sk_sndmsg_page;
1505 int off = sk->sk_sndmsg_off;
1506 unsigned int left;
1507
1508 if (page && (left = PAGE_SIZE - off) > 0) {
1509 if (copy >= left)
1510 copy = left;
1511 if (page != skb_frag_page(frag)) {
1512 if (i == MAX_SKB_FRAGS) {
1513 err = -EMSGSIZE;
1514 goto error;
1515 }
1516 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1517 skb_frag_ref(skb, i);
1518 frag = &skb_shinfo(skb)->frags[i];
1519 }
1520 } else if(i < MAX_SKB_FRAGS) {
1521 if (copy > PAGE_SIZE)
1522 copy = PAGE_SIZE;
1523 page = alloc_pages(sk->sk_allocation, 0);
1524 if (page == NULL) {
1525 err = -ENOMEM;
1526 goto error;
1527 }
1528 sk->sk_sndmsg_page = page;
1529 sk->sk_sndmsg_off = 0;
1530
1531 skb_fill_page_desc(skb, i, page, 0, 0);
1532 frag = &skb_shinfo(skb)->frags[i];
1533 } else {
1534 err = -EMSGSIZE;
1535 goto error;
1536 }
1537 if (getfrag(from,
1538 skb_frag_address(frag) + skb_frag_size(frag),
1539 offset, copy, skb->len, skb) < 0) {
1540 err = -EFAULT;
1541 goto error;
1542 }
1543 sk->sk_sndmsg_off += copy;
1544 skb_frag_size_add(frag, copy);
1545 skb->len += copy;
1546 skb->data_len += copy;
1547 skb->truesize += copy;
1548 atomic_add(copy, &sk->sk_wmem_alloc);
1549 }
1550 offset += copy;
1551 length -= copy;
1552 }
1553 return 0;
1554 error:
1555 cork->length -= length;
1556 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1557 return err;
1558 }
1559 EXPORT_SYMBOL_GPL(ip6_append_data);
1560
1561 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1562 {
1563 if (np->cork.opt) {
1564 kfree(np->cork.opt->dst0opt);
1565 kfree(np->cork.opt->dst1opt);
1566 kfree(np->cork.opt->hopopt);
1567 kfree(np->cork.opt->srcrt);
1568 kfree(np->cork.opt);
1569 np->cork.opt = NULL;
1570 }
1571
1572 if (inet->cork.base.dst) {
1573 dst_release(inet->cork.base.dst);
1574 inet->cork.base.dst = NULL;
1575 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1576 }
1577 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1578 }
1579
1580 int ip6_push_pending_frames(struct sock *sk)
1581 {
1582 struct sk_buff *skb, *tmp_skb;
1583 struct sk_buff **tail_skb;
1584 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1585 struct inet_sock *inet = inet_sk(sk);
1586 struct ipv6_pinfo *np = inet6_sk(sk);
1587 struct net *net = sock_net(sk);
1588 struct ipv6hdr *hdr;
1589 struct ipv6_txoptions *opt = np->cork.opt;
1590 struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1591 struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1592 unsigned char proto = fl6->flowi6_proto;
1593 int err = 0;
1594
1595 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1596 goto out;
1597 tail_skb = &(skb_shinfo(skb)->frag_list);
1598
1599 /* move skb->data to ip header from ext header */
1600 if (skb->data < skb_network_header(skb))
1601 __skb_pull(skb, skb_network_offset(skb));
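/* Coalesce the rest of the write queue onto the first skb's
 * frag_list: each queued skb loses its per-packet headers and
 * becomes payload of the head skb, so ip6_fragment() can later
 * emit them as individual fragments without copying.
 */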
1602 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1603 __skb_pull(tmp_skb, skb_network_header_len(skb));
1604 *tail_skb = tmp_skb;
1605 tail_skb = &(tmp_skb->next);
1606 skb->len += tmp_skb->len;
1607 skb->data_len += tmp_skb->len;
1608 skb->truesize += tmp_skb->truesize;
1609 tmp_skb->destructor = NULL;
1610 tmp_skb->sk = NULL;
1611 }
1612
1613 /* Allow local fragmentation. */
1614 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1615 skb->local_df = 1;
1616
1617 *final_dst = fl6->daddr;
1618 __skb_pull(skb, skb_network_header_len(skb));
1619 if (opt && opt->opt_flen)
1620 ipv6_push_frag_opts(skb, opt, &proto);
1621 if (opt && opt->opt_nflen)
1622 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1623
1624 skb_push(skb, sizeof(struct ipv6hdr));
1625 skb_reset_network_header(skb);
1626 hdr = ipv6_hdr(skb);
1627
1628 *(__be32*)hdr = fl6->flowlabel |
1629 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1630
1631 hdr->hop_limit = np->cork.hop_limit;
1632 hdr->nexthdr = proto;
1633 hdr->saddr = fl6->saddr;
1634 hdr->daddr = *final_dst;
1635
1636 skb->priority = sk->sk_priority;
1637 skb->mark = sk->sk_mark;
1638
1639 skb_dst_set(skb, dst_clone(&rt->dst));
1640 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1641 if (proto == IPPROTO_ICMPV6) {
1642 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1643
1644 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1645 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1646 }
1647
1648 err = ip6_local_out(skb);
1649 if (err) {
1650 if (err > 0)
1651 err = net_xmit_errno(err);
1652 if (err)
1653 goto error;
1654 }
1655
1656 out:
1657 ip6_cork_release(inet, np);
1658 return err;
1659 error:
1660 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1661 goto out;
1662 }
1663 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1664
1665 void ip6_flush_pending_frames(struct sock *sk)
1666 {
1667 struct sk_buff *skb;
1668
1669 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1670 if (skb_dst(skb))
1671 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1672 IPSTATS_MIB_OUTDISCARDS);
1673 kfree_skb(skb);
1674 }
1675
1676 ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1677 }
1678 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);