net/ipv6/ip6_output.c
1 /*
2 * IPv6 output functions
3 * Linux INET6 implementation
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * Based on linux/net/ipv4/ip_output.c
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * Changes:
16 * A.N.Kuznetsov : arithmetics in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
20 * etc.
21 *
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
26 * for datagram xmit
27 */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
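/*
 * __ip6_local_out() fixes up the IPv6 payload length and runs the
 * NF_INET_LOCAL_OUT netfilter hook; nf_hook() returns 1 when the hooks
 * accept the packet without stealing it, which is why ip6_local_out()
 * below only calls dst_output() on that return value.
 */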
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63 int len;
64
65 len = skb->len - sizeof(struct ipv6hdr);
66 if (len > IPV6_MAXPLEN)
67 len = 0;
68 ipv6_hdr(skb)->payload_len = htons(len);
69
70 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 skb_dst(skb)->dev, dst_output);
72 }
73
74 int ip6_local_out(struct sk_buff *skb)
75 {
76 int err;
77
78 err = __ip6_local_out(skb);
79 if (likely(err == 1))
80 err = dst_output(skb);
81
82 return err;
83 }
84 EXPORT_SYMBOL_GPL(ip6_local_out);
85
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89 skb_reset_mac_header(newskb);
90 __skb_pull(newskb, skb_network_offset(newskb));
91 newskb->pkt_type = PACKET_LOOPBACK;
92 newskb->ip_summed = CHECKSUM_UNNECESSARY;
93 WARN_ON(!skb_dst(newskb));
94
95 netif_rx_ni(newskb);
96 return 0;
97 }
98
99 static int ip6_finish_output2(struct sk_buff *skb)
100 {
101 struct dst_entry *dst = skb_dst(skb);
102 struct net_device *dev = dst->dev;
103 struct neighbour *neigh;
104
105 skb->protocol = htons(ETH_P_IPV6);
106 skb->dev = dev;
107
108 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
109 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
110
111 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
112 ((mroute6_socket(dev_net(dev), skb) &&
113 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
114 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
115 &ipv6_hdr(skb)->saddr))) {
116 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
117
118 /* Do not check for IFF_ALLMULTI; multicast routing
119 is not supported in any case.
120 */
121 if (newskb)
122 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
123 newskb, NULL, newskb->dev,
124 ip6_dev_loopback_xmit);
125
126 if (ipv6_hdr(skb)->hop_limit == 0) {
127 IP6_INC_STATS(dev_net(dev), idev,
128 IPSTATS_MIB_OUTDISCARDS);
129 kfree_skb(skb);
130 return 0;
131 }
132 }
133
134 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
135 skb->len);
136 }
137
138 rcu_read_lock();
139 neigh = dst_get_neighbour_noref(dst);
140 if (neigh) {
141 int res = neigh_output(neigh, skb);
142
143 rcu_read_unlock();
144 return res;
145 }
146 rcu_read_unlock();
147 IP6_INC_STATS_BH(dev_net(dst->dev),
148 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
149 kfree_skb(skb);
150 return -EINVAL;
151 }
152
153 static int ip6_finish_output(struct sk_buff *skb)
154 {
155 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156 dst_allfrag(skb_dst(skb)))
157 return ip6_fragment(skb, ip6_finish_output2);
158 else
159 return ip6_finish_output2(skb);
160 }
161
162 int ip6_output(struct sk_buff *skb)
163 {
164 struct net_device *dev = skb_dst(skb)->dev;
165 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
166 if (unlikely(idev->cnf.disable_ipv6)) {
167 IP6_INC_STATS(dev_net(dev), idev,
168 IPSTATS_MIB_OUTDISCARDS);
169 kfree_skb(skb);
170 return 0;
171 }
172
173 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
174 ip6_finish_output,
175 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
176 }
177
178 /*
179 * xmit an sk_buff (used by TCP, SCTP and DCCP)
180 */
181
182 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
183 struct ipv6_txoptions *opt, int tclass)
184 {
185 struct net *net = sock_net(sk);
186 struct ipv6_pinfo *np = inet6_sk(sk);
187 struct in6_addr *first_hop = &fl6->daddr;
188 struct dst_entry *dst = skb_dst(skb);
189 struct ipv6hdr *hdr;
190 u8 proto = fl6->flowi6_proto;
191 int seg_len = skb->len;
192 int hlimit = -1;
193 u32 mtu;
194
195 if (opt) {
196 unsigned int head_room;
197
198 /* First: exthdrs may take lots of space (~8K for now)
199 MAX_HEADER is not enough.
200 */
201 head_room = opt->opt_nflen + opt->opt_flen;
202 seg_len += head_room;
203 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
204
205 if (skb_headroom(skb) < head_room) {
206 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
207 if (skb2 == NULL) {
208 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
209 IPSTATS_MIB_OUTDISCARDS);
210 kfree_skb(skb);
211 return -ENOBUFS;
212 }
213 consume_skb(skb);
214 skb = skb2;
215 skb_set_owner_w(skb, sk);
216 }
217 if (opt->opt_flen)
218 ipv6_push_frag_opts(skb, opt, &proto);
219 if (opt->opt_nflen)
220 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
221 }
222
223 skb_push(skb, sizeof(struct ipv6hdr));
224 skb_reset_network_header(skb);
225 hdr = ipv6_hdr(skb);
226
227 /*
228 * Fill in the IPv6 header
229 */
230 if (np)
231 hlimit = np->hop_limit;
232 if (hlimit < 0)
233 hlimit = ip6_dst_hoplimit(dst);
234
235 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
236
237 hdr->payload_len = htons(seg_len);
238 hdr->nexthdr = proto;
239 hdr->hop_limit = hlimit;
240
241 hdr->saddr = fl6->saddr;
242 hdr->daddr = *first_hop;
243
244 skb->priority = sk->sk_priority;
245 skb->mark = sk->sk_mark;
246
247 mtu = dst_mtu(dst);
248 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
249 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
250 IPSTATS_MIB_OUT, skb->len);
251 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
252 dst->dev, dst_output);
253 }
254
255 net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
256 skb->dev = dst->dev;
257 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
258 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
259 kfree_skb(skb);
260 return -EMSGSIZE;
261 }
262
263 EXPORT_SYMBOL(ip6_xmit);
264
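/*
 * Usage sketch (illustrative only, not part of this file): a connected
 * transport such as TCP builds the segment, makes sure a route is
 * attached to the skb (skb_dst_set()), fills a flowi6 describing the
 * socket's flow and then calls
 *
 *	err = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
 *
 * where np is the socket's ipv6_pinfo; the caller shown here is an
 * assumption, only the signature matches ip6_xmit() above.
 */
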
265 /*
266 * To avoid extra problems ND packets are sent through this
267 * routine. It's code duplication but I really want to avoid
268 * extra checks since ipv6_build_header is used by TCP (which
269 * is for us performance critical)
270 */
271
272 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
273 const struct in6_addr *saddr, const struct in6_addr *daddr,
274 int proto, int len)
275 {
276 struct ipv6_pinfo *np = inet6_sk(sk);
277 struct ipv6hdr *hdr;
278
279 skb->protocol = htons(ETH_P_IPV6);
280 skb->dev = dev;
281
282 skb_reset_network_header(skb);
283 skb_put(skb, sizeof(struct ipv6hdr));
284 hdr = ipv6_hdr(skb);
285
286 *(__be32*)hdr = htonl(0x60000000);
287
288 hdr->payload_len = htons(len);
289 hdr->nexthdr = proto;
290 hdr->hop_limit = np->hop_limit;
291
292 hdr->saddr = *saddr;
293 hdr->daddr = *daddr;
294
295 return 0;
296 }
297
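/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * that registered interest in this alert value (ip6_ra_chain); returns
 * 1 when the packet has been consumed by such a socket, 0 otherwise.
 */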
298 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
299 {
300 struct ip6_ra_chain *ra;
301 struct sock *last = NULL;
302
303 read_lock(&ip6_ra_lock);
304 for (ra = ip6_ra_chain; ra; ra = ra->next) {
305 struct sock *sk = ra->sk;
306 if (sk && ra->sel == sel &&
307 (!sk->sk_bound_dev_if ||
308 sk->sk_bound_dev_if == skb->dev->ifindex)) {
309 if (last) {
310 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
311 if (skb2)
312 rawv6_rcv(last, skb2);
313 }
314 last = sk;
315 }
316 }
317
318 if (last) {
319 rawv6_rcv(last, skb);
320 read_unlock(&ip6_ra_lock);
321 return 1;
322 }
323 read_unlock(&ip6_ra_lock);
324 return 0;
325 }
326
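/*
 * Return value: 1 means the packet is a neighbour discovery message for
 * the proxied address and must be delivered locally, 0 means forward it
 * as usual, -1 means drop it (link-local destination that we cannot
 * proxy).
 */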
327 static int ip6_forward_proxy_check(struct sk_buff *skb)
328 {
329 struct ipv6hdr *hdr = ipv6_hdr(skb);
330 u8 nexthdr = hdr->nexthdr;
331 __be16 frag_off;
332 int offset;
333
334 if (ipv6_ext_hdr(nexthdr)) {
335 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
336 if (offset < 0)
337 return 0;
338 } else
339 offset = sizeof(struct ipv6hdr);
340
341 if (nexthdr == IPPROTO_ICMPV6) {
342 struct icmp6hdr *icmp6;
343
344 if (!pskb_may_pull(skb, (skb_network_header(skb) +
345 offset + 1 - skb->data)))
346 return 0;
347
348 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
349
350 switch (icmp6->icmp6_type) {
351 case NDISC_ROUTER_SOLICITATION:
352 case NDISC_ROUTER_ADVERTISEMENT:
353 case NDISC_NEIGHBOUR_SOLICITATION:
354 case NDISC_NEIGHBOUR_ADVERTISEMENT:
355 case NDISC_REDIRECT:
356 /* A unicast neighbour discovery message destined
357 * to the proxied address is passed to the input
358 * function so that proxying can react to it.
359 */
360 return 1;
361 default:
362 break;
363 }
364 }
365
366 /*
367 * The proxying router can't forward traffic sent to a link-local
368 * address, so signal the sender and discard the packet. This
369 * behavior is clarified by the MIPv6 specification.
370 */
371 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
372 dst_link_failure(skb);
373 return -1;
374 }
375
376 return 0;
377 }
378
379 static inline int ip6_forward_finish(struct sk_buff *skb)
380 {
381 return dst_output(skb);
382 }
383
384 int ip6_forward(struct sk_buff *skb)
385 {
386 struct dst_entry *dst = skb_dst(skb);
387 struct ipv6hdr *hdr = ipv6_hdr(skb);
388 struct inet6_skb_parm *opt = IP6CB(skb);
389 struct net *net = dev_net(dst->dev);
390 u32 mtu;
391
392 if (net->ipv6.devconf_all->forwarding == 0)
393 goto error;
394
395 if (skb_warn_if_lro(skb))
396 goto drop;
397
398 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
399 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
400 goto drop;
401 }
402
403 if (skb->pkt_type != PACKET_HOST)
404 goto drop;
405
406 skb_forward_csum(skb);
407
408 /*
409 * We DO NOT make any processing on
410 * RA packets, pushing them to user level AS IS
411 * without any WARRANTY that the application will be able
412 * to interpret them. The reason is that we
413 * cannot make anything clever here.
414 *
415 * We are not end-node, so that if packet contains
416 * AH/ESP, we cannot make anything.
417 * Defragmentation also would be a mistake; RA packets
418 * cannot be fragmented, because there is no guarantee
419 * that different fragments will go along one path. --ANK
420 */
421 if (opt->ra) {
422 u8 *ptr = skb_network_header(skb) + opt->ra;
423 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
424 return 0;
425 }
426
427 /*
428 * check and decrement ttl
429 */
430 if (hdr->hop_limit <= 1) {
431 /* Force OUTPUT device used as source address */
432 skb->dev = dst->dev;
433 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
434 IP6_INC_STATS_BH(net,
435 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
436
437 kfree_skb(skb);
438 return -ETIMEDOUT;
439 }
440
441 /* XXX: idev->cnf.proxy_ndp? */
442 if (net->ipv6.devconf_all->proxy_ndp &&
443 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
444 int proxied = ip6_forward_proxy_check(skb);
445 if (proxied > 0)
446 return ip6_input(skb);
447 else if (proxied < 0) {
448 IP6_INC_STATS(net, ip6_dst_idev(dst),
449 IPSTATS_MIB_INDISCARDS);
450 goto drop;
451 }
452 }
453
454 if (!xfrm6_route_forward(skb)) {
455 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
456 goto drop;
457 }
458 dst = skb_dst(skb);
459
460 /* IPv6 specs say nothing about it, but it is clear that we cannot
461 send redirects to source routed frames.
462 We don't send redirects to frames decapsulated from IPsec.
463 */
464 if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
465 struct in6_addr *target = NULL;
466 struct inet_peer *peer;
467 struct rt6_info *rt;
468
469 /*
470 * incoming and outgoing devices are the same
471 * send a redirect.
472 */
473
474 rt = (struct rt6_info *) dst;
475 if (rt->rt6i_flags & RTF_GATEWAY)
476 target = &rt->rt6i_gateway;
477 else
478 target = &hdr->daddr;
479
480 peer = rt6_get_peer_create(rt);
481
482 /* Limit redirects both by destination (here)
483 and by source (inside ndisc_send_redirect)
484 */
485 if (inet_peer_xrlim_allow(peer, 1*HZ))
486 ndisc_send_redirect(skb, target);
487 } else {
488 int addrtype = ipv6_addr_type(&hdr->saddr);
489
490 /* This check is security critical. */
491 if (addrtype == IPV6_ADDR_ANY ||
492 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
493 goto error;
494 if (addrtype & IPV6_ADDR_LINKLOCAL) {
495 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
496 ICMPV6_NOT_NEIGHBOUR, 0);
497 goto error;
498 }
499 }
500
501 mtu = dst_mtu(dst);
502 if (mtu < IPV6_MIN_MTU)
503 mtu = IPV6_MIN_MTU;
504
505 if (skb->len > mtu && !skb_is_gso(skb)) {
506 /* Again, force OUTPUT device used as source address */
507 skb->dev = dst->dev;
508 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
509 IP6_INC_STATS_BH(net,
510 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
511 IP6_INC_STATS_BH(net,
512 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
513 kfree_skb(skb);
514 return -EMSGSIZE;
515 }
516
517 if (skb_cow(skb, dst->dev->hard_header_len)) {
518 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
519 goto drop;
520 }
521
522 hdr = ipv6_hdr(skb);
523
524 /* Mangling hops number delayed to point after skb COW */
525
526 hdr->hop_limit--;
527
528 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
529 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
530 ip6_forward_finish);
531
532 error:
533 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
534 drop:
535 kfree_skb(skb);
536 return -EINVAL;
537 }
538
539 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
540 {
541 to->pkt_type = from->pkt_type;
542 to->priority = from->priority;
543 to->protocol = from->protocol;
544 skb_dst_drop(to);
545 skb_dst_set(to, dst_clone(skb_dst(from)));
546 to->dev = from->dev;
547 to->mark = from->mark;
548
549 #ifdef CONFIG_NET_SCHED
550 to->tc_index = from->tc_index;
551 #endif
552 nf_copy(to, from);
553 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
554 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
555 to->nf_trace = from->nf_trace;
556 #endif
557 skb_copy_secmark(to, from);
558 }
559
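/*
 * Walk the unfragmentable part of the packet (hop-by-hop, routing and,
 * with MIPv6, a destination options header carrying a home address
 * option) and return the offset at which a fragment header has to be
 * inserted; *nexthdr is left pointing at the "next header" byte that
 * the caller patches to NEXTHDR_FRAGMENT.
 */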
560 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
561 {
562 u16 offset = sizeof(struct ipv6hdr);
563 struct ipv6_opt_hdr *exthdr =
564 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
565 unsigned int packet_len = skb->tail - skb->network_header;
566 int found_rhdr = 0;
567 *nexthdr = &ipv6_hdr(skb)->nexthdr;
568
569 while (offset + 1 <= packet_len) {
570
571 switch (**nexthdr) {
572
573 case NEXTHDR_HOP:
574 break;
575 case NEXTHDR_ROUTING:
576 found_rhdr = 1;
577 break;
578 case NEXTHDR_DEST:
579 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
580 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
581 break;
582 #endif
583 if (found_rhdr)
584 return offset;
585 break;
586 default :
587 return offset;
588 }
589
590 offset += ipv6_optlen(exthdr);
591 *nexthdr = &exthdr->nexthdr;
592 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
593 offset);
594 }
595
596 return offset;
597 }
598
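/*
 * Choose the identification value for a fragment header: prefer the per
 * destination counter kept in the inet_peer when the route has one,
 * otherwise fall back to a global counter that never hands out 0.
 */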
599 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
600 {
601 static atomic_t ipv6_fragmentation_id;
602 int old, new;
603
604 if (rt && !(rt->dst.flags & DST_NOPEER)) {
605 struct inet_peer *peer = rt6_get_peer_create(rt);
606
607 if (peer) {
608 fhdr->identification = htonl(inet_getid(peer, 0));
609 return;
610 }
611 }
612 do {
613 old = atomic_read(&ipv6_fragmentation_id);
614 new = old + 1;
615 if (!new)
616 new = 1;
617 } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
618 fhdr->identification = htonl(new);
619 }
620
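/*
 * Two strategies are used below: when the skb already carries a
 * frag_list whose members have suitable geometry, each member is turned
 * into a fragment in place (fast path); otherwise the payload is copied
 * piece by piece into freshly allocated skbs (slow path).
 */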
621 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
622 {
623 struct sk_buff *frag;
624 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
625 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
626 struct ipv6hdr *tmp_hdr;
627 struct frag_hdr *fh;
628 unsigned int mtu, hlen, left, len;
629 int hroom, troom;
630 __be32 frag_id = 0;
631 int ptr, offset = 0, err=0;
632 u8 *prevhdr, nexthdr = 0;
633 struct net *net = dev_net(skb_dst(skb)->dev);
634
635 hlen = ip6_find_1stfragopt(skb, &prevhdr);
636 nexthdr = *prevhdr;
637
638 mtu = ip6_skb_dst_mtu(skb);
639
640 /* We must not fragment if the socket is set to force MTU discovery
641 * or if the skb is not generated by a local socket.
642 */
643 if (unlikely(!skb->local_df && skb->len > mtu)) {
644 if (skb->sk && dst_allfrag(skb_dst(skb)))
645 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
646
647 skb->dev = skb_dst(skb)->dev;
648 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
649 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
650 IPSTATS_MIB_FRAGFAILS);
651 kfree_skb(skb);
652 return -EMSGSIZE;
653 }
654
655 if (np && np->frag_size < mtu) {
656 if (np->frag_size)
657 mtu = np->frag_size;
658 }
659 mtu -= hlen + sizeof(struct frag_hdr);
660
661 if (skb_has_frag_list(skb)) {
662 int first_len = skb_pagelen(skb);
663 struct sk_buff *frag2;
664
665 if (first_len - hlen > mtu ||
666 ((first_len - hlen) & 7) ||
667 skb_cloned(skb))
668 goto slow_path;
669
670 skb_walk_frags(skb, frag) {
671 /* Correct geometry. */
672 if (frag->len > mtu ||
673 ((frag->len & 7) && frag->next) ||
674 skb_headroom(frag) < hlen)
675 goto slow_path_clean;
676
677 /* Partially cloned skb? */
678 if (skb_shared(frag))
679 goto slow_path_clean;
680
681 BUG_ON(frag->sk);
682 if (skb->sk) {
683 frag->sk = skb->sk;
684 frag->destructor = sock_wfree;
685 }
686 skb->truesize -= frag->truesize;
687 }
688
689 err = 0;
690 offset = 0;
691 frag = skb_shinfo(skb)->frag_list;
692 skb_frag_list_init(skb);
693 /* BUILD HEADER */
694
695 *prevhdr = NEXTHDR_FRAGMENT;
696 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
697 if (!tmp_hdr) {
698 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
699 IPSTATS_MIB_FRAGFAILS);
700 return -ENOMEM;
701 }
702
703 __skb_pull(skb, hlen);
704 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
705 __skb_push(skb, hlen);
706 skb_reset_network_header(skb);
707 memcpy(skb_network_header(skb), tmp_hdr, hlen);
708
709 ipv6_select_ident(fh, rt);
710 fh->nexthdr = nexthdr;
711 fh->reserved = 0;
712 fh->frag_off = htons(IP6_MF);
713 frag_id = fh->identification;
714
715 first_len = skb_pagelen(skb);
716 skb->data_len = first_len - skb_headlen(skb);
717 skb->len = first_len;
718 ipv6_hdr(skb)->payload_len = htons(first_len -
719 sizeof(struct ipv6hdr));
720
721 dst_hold(&rt->dst);
722
723 for (;;) {
724 /* Prepare header of the next frame,
725 * before previous one went down. */
726 if (frag) {
727 frag->ip_summed = CHECKSUM_NONE;
728 skb_reset_transport_header(frag);
729 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
730 __skb_push(frag, hlen);
731 skb_reset_network_header(frag);
732 memcpy(skb_network_header(frag), tmp_hdr,
733 hlen);
734 offset += skb->len - hlen - sizeof(struct frag_hdr);
735 fh->nexthdr = nexthdr;
736 fh->reserved = 0;
737 fh->frag_off = htons(offset);
738 if (frag->next != NULL)
739 fh->frag_off |= htons(IP6_MF);
740 fh->identification = frag_id;
741 ipv6_hdr(frag)->payload_len =
742 htons(frag->len -
743 sizeof(struct ipv6hdr));
744 ip6_copy_metadata(frag, skb);
745 }
746
747 err = output(skb);
748 if (!err)
749 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
750 IPSTATS_MIB_FRAGCREATES);
751
752 if (err || !frag)
753 break;
754
755 skb = frag;
756 frag = skb->next;
757 skb->next = NULL;
758 }
759
760 kfree(tmp_hdr);
761
762 if (err == 0) {
763 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
764 IPSTATS_MIB_FRAGOKS);
765 dst_release(&rt->dst);
766 return 0;
767 }
768
769 while (frag) {
770 skb = frag->next;
771 kfree_skb(frag);
772 frag = skb;
773 }
774
775 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
776 IPSTATS_MIB_FRAGFAILS);
777 dst_release(&rt->dst);
778 return err;
779
780 slow_path_clean:
781 skb_walk_frags(skb, frag2) {
782 if (frag2 == frag)
783 break;
784 frag2->sk = NULL;
785 frag2->destructor = NULL;
786 skb->truesize += frag2->truesize;
787 }
788 }
789
790 slow_path:
791 if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
792 skb_checksum_help(skb))
793 goto fail;
794
795 left = skb->len - hlen; /* Space per frame */
796 ptr = hlen; /* Where to start from */
797
798 /*
799 * Fragment the datagram.
800 */
801
802 *prevhdr = NEXTHDR_FRAGMENT;
803 hroom = LL_RESERVED_SPACE(rt->dst.dev);
804 troom = rt->dst.dev->needed_tailroom;
805
806 /*
807 * Keep copying data until we run out.
808 */
809 while(left > 0) {
810 len = left;
811 /* IF: it doesn't fit, use 'mtu' - the data space left */
812 if (len > mtu)
813 len = mtu;
814 /* IF: we are not sending up to and including the packet end
815 then align the next start on an eight byte boundary */
816 if (len < left) {
817 len &= ~7;
818 }
819 /*
820 * Allocate buffer.
821 */
822
823 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
824 hroom + troom, GFP_ATOMIC)) == NULL) {
825 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
826 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
827 IPSTATS_MIB_FRAGFAILS);
828 err = -ENOMEM;
829 goto fail;
830 }
831
832 /*
833 * Set up data on packet
834 */
835
836 ip6_copy_metadata(frag, skb);
837 skb_reserve(frag, hroom);
838 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
839 skb_reset_network_header(frag);
840 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
841 frag->transport_header = (frag->network_header + hlen +
842 sizeof(struct frag_hdr));
843
844 /*
845 * Charge the memory for the fragment to any owner
846 * it might possess
847 */
848 if (skb->sk)
849 skb_set_owner_w(frag, skb->sk);
850
851 /*
852 * Copy the packet header into the new buffer.
853 */
854 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
855
856 /*
857 * Build fragment header.
858 */
859 fh->nexthdr = nexthdr;
860 fh->reserved = 0;
861 if (!frag_id) {
862 ipv6_select_ident(fh, rt);
863 frag_id = fh->identification;
864 } else
865 fh->identification = frag_id;
866
867 /*
868 * Copy a block of the IP datagram.
869 */
870 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
871 BUG();
872 left -= len;
873
874 fh->frag_off = htons(offset);
875 if (left > 0)
876 fh->frag_off |= htons(IP6_MF);
877 ipv6_hdr(frag)->payload_len = htons(frag->len -
878 sizeof(struct ipv6hdr));
879
880 ptr += len;
881 offset += len;
882
883 /*
884 * Put this fragment into the sending queue.
885 */
886 err = output(frag);
887 if (err)
888 goto fail;
889
890 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
891 IPSTATS_MIB_FRAGCREATES);
892 }
893 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
894 IPSTATS_MIB_FRAGOKS);
895 consume_skb(skb);
896 return err;
897
898 fail:
899 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
900 IPSTATS_MIB_FRAGFAILS);
901 kfree_skb(skb);
902 return err;
903 }
904
905 static inline int ip6_rt_check(const struct rt6key *rt_key,
906 const struct in6_addr *fl_addr,
907 const struct in6_addr *addr_cache)
908 {
909 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
910 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
911 }
912
913 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
914 struct dst_entry *dst,
915 const struct flowi6 *fl6)
916 {
917 struct ipv6_pinfo *np = inet6_sk(sk);
918 struct rt6_info *rt = (struct rt6_info *)dst;
919
920 if (!dst)
921 goto out;
922
923 /* Yes, checking route validity in the not-connected
924 * case is not very simple. Take into account
925 * that we do not support routing by source, TOS,
926 * and MSG_DONTROUTE --ANK (980726)
927 *
928 * 1. ip6_rt_check(): If route was host route,
929 * check that cached destination is current.
930 * If it is network route, we still may
931 * check its validity using saved pointer
932 * to the last used address: daddr_cache.
933 * We do not want to save whole address now,
934 * (because main consumer of this service
935 * is tcp, which does not have this problem),
936 * so that the last trick works only on connected
937 * sockets.
938 * 2. oif also should be the same.
939 */
940 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
941 #ifdef CONFIG_IPV6_SUBTREES
942 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
943 #endif
944 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
945 dst_release(dst);
946 dst = NULL;
947 }
948
949 out:
950 return dst;
951 }
952
953 static int ip6_dst_lookup_tail(struct sock *sk,
954 struct dst_entry **dst, struct flowi6 *fl6)
955 {
956 struct net *net = sock_net(sk);
957 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
958 struct neighbour *n;
959 #endif
960 int err;
961
962 if (*dst == NULL)
963 *dst = ip6_route_output(net, sk, fl6);
964
965 if ((err = (*dst)->error))
966 goto out_err_release;
967
968 if (ipv6_addr_any(&fl6->saddr)) {
969 struct rt6_info *rt = (struct rt6_info *) *dst;
970 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
971 sk ? inet6_sk(sk)->srcprefs : 0,
972 &fl6->saddr);
973 if (err)
974 goto out_err_release;
975 }
976
977 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
978 /*
979 * Here if the dst entry we've looked up
980 * has a neighbour entry that is in the INCOMPLETE
981 * state and the src address from the flow is
982 * marked as OPTIMISTIC, we release the found
983 * dst entry and replace it instead with the
984 * dst entry of the nexthop router
985 */
986 rcu_read_lock();
987 n = dst_get_neighbour_noref(*dst);
988 if (n && !(n->nud_state & NUD_VALID)) {
989 struct inet6_ifaddr *ifp;
990 struct flowi6 fl_gw6;
991 int redirect;
992
993 rcu_read_unlock();
994 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
995 (*dst)->dev, 1);
996
997 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
998 if (ifp)
999 in6_ifa_put(ifp);
1000
1001 if (redirect) {
1002 /*
1003 * We need to get the dst entry for the
1004 * default router instead
1005 */
1006 dst_release(*dst);
1007 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1008 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1009 *dst = ip6_route_output(net, sk, &fl_gw6);
1010 if ((err = (*dst)->error))
1011 goto out_err_release;
1012 }
1013 } else {
1014 rcu_read_unlock();
1015 }
1016 #endif
1017
1018 return 0;
1019
1020 out_err_release:
1021 if (err == -ENETUNREACH)
1022 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1023 dst_release(*dst);
1024 *dst = NULL;
1025 return err;
1026 }
1027
1028 /**
1029 * ip6_dst_lookup - perform route lookup on flow
1030 * @sk: socket which provides route info
1031 * @dst: pointer to dst_entry * for result
1032 * @fl6: flow to lookup
1033 *
1034 * This function performs a route lookup on the given flow.
1035 *
1036 * It returns zero on success, or a standard errno code on error.
1037 */
1038 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1039 {
1040 *dst = NULL;
1041 return ip6_dst_lookup_tail(sk, dst, fl6);
1042 }
1043 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1044
1045 /**
1046 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1047 * @sk: socket which provides route info
1048 * @fl6: flow to lookup
1049 * @final_dst: final destination address for ipsec lookup
1050 * @can_sleep: we are in a sleepable context
1051 *
1052 * This function performs a route lookup on the given flow.
1053 *
1054 * It returns a valid dst pointer on success, or a pointer encoded
1055 * error code.
1056 */
1057 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1058 const struct in6_addr *final_dst,
1059 bool can_sleep)
1060 {
1061 struct dst_entry *dst = NULL;
1062 int err;
1063
1064 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1065 if (err)
1066 return ERR_PTR(err);
1067 if (final_dst)
1068 fl6->daddr = *final_dst;
1069 if (can_sleep)
1070 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1071
1072 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1073 }
1074 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
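
/*
 * Usage sketch (illustrative only): because the function returns a
 * pointer-encoded error, callers typically do
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true);
 *	if (IS_ERR(dst)) {
 *		err = PTR_ERR(dst);
 *		goto failure;
 *	}
 *
 * where final_p and the failure label are assumptions for the example.
 */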
1075
1076 /**
1077 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1078 * @sk: socket which provides the dst cache and route info
1079 * @fl6: flow to lookup
1080 * @final_dst: final destination address for ipsec lookup
1081 * @can_sleep: we are in a sleepable context
1082 *
1083 * This function performs a route lookup on the given flow with the
1084 * possibility of using the cached route in the socket if it is valid.
1085 * It will take the socket dst lock when operating on the dst cache.
1086 * As a result, this function can only be used in process context.
1087 *
1088 * It returns a valid dst pointer on success, or a pointer encoded
1089 * error code.
1090 */
1091 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1092 const struct in6_addr *final_dst,
1093 bool can_sleep)
1094 {
1095 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1096 int err;
1097
1098 dst = ip6_sk_dst_check(sk, dst, fl6);
1099
1100 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1101 if (err)
1102 return ERR_PTR(err);
1103 if (final_dst)
1104 fl6->daddr = *final_dst;
1105 if (can_sleep)
1106 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1107
1108 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1109 }
1110 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1111
1112 static inline int ip6_ufo_append_data(struct sock *sk,
1113 int getfrag(void *from, char *to, int offset, int len,
1114 int odd, struct sk_buff *skb),
1115 void *from, int length, int hh_len, int fragheaderlen,
1116 int transhdrlen, int mtu, unsigned int flags,
1117 struct rt6_info *rt)
1118
1119 {
1120 struct sk_buff *skb;
1121 int err;
1122
1123 /* The network device supports UDP large send offload, so
1124 * create one single skb containing the complete UDP
1125 * datagram.
1126 */
1127 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1128 skb = sock_alloc_send_skb(sk,
1129 hh_len + fragheaderlen + transhdrlen + 20,
1130 (flags & MSG_DONTWAIT), &err);
1131 if (skb == NULL)
1132 return err;
1133
1134 /* reserve space for Hardware header */
1135 skb_reserve(skb, hh_len);
1136
1137 /* create space for UDP/IP header */
1138 skb_put(skb,fragheaderlen + transhdrlen);
1139
1140 /* initialize network header pointer */
1141 skb_reset_network_header(skb);
1142
1143 /* initialize protocol header pointer */
1144 skb->transport_header = skb->network_header + fragheaderlen;
1145
1146 skb->ip_summed = CHECKSUM_PARTIAL;
1147 skb->csum = 0;
1148 }
1149
1150 err = skb_append_datato_frags(sk,skb, getfrag, from,
1151 (length - transhdrlen));
1152 if (!err) {
1153 struct frag_hdr fhdr;
1154
1155 /* Specify the length of each IPv6 datagram fragment.
1156 * It has to be a multiple of 8.
1157 */
1158 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1159 sizeof(struct frag_hdr)) & ~7;
1160 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1161 ipv6_select_ident(&fhdr, rt);
1162 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1163 __skb_queue_tail(&sk->sk_write_queue, skb);
1164
1165 return 0;
1166 }
1167 /* There is not enough support to do UDP LSO,
1168 * so follow the normal path
1169 */
1170 kfree_skb(skb);
1171
1172 return err;
1173 }
1174
1175 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1176 gfp_t gfp)
1177 {
1178 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1179 }
1180
1181 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1182 gfp_t gfp)
1183 {
1184 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1185 }
1186
1187 static void ip6_append_data_mtu(int *mtu,
1188 int *maxfraglen,
1189 unsigned int fragheaderlen,
1190 struct sk_buff *skb,
1191 struct rt6_info *rt)
1192 {
1193 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1194 if (skb == NULL) {
1195 /* first fragment, reserve header_len */
1196 *mtu = *mtu - rt->dst.header_len;
1197
1198 } else {
1199 /*
1200 * this fragment is not the first one, so the header
1201 * space is regarded as data space.
1202 */
1203 *mtu = dst_mtu(rt->dst.path);
1204 }
1205 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1206 + fragheaderlen - sizeof(struct frag_hdr);
1207 }
1208 }
1209
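/*
 * Append data to the socket's pending write queue, splitting it into
 * skbs no larger than maxfraglen so that ip6_push_pending_frames()
 * can later build and transmit the packet (fragmenting if required).
 */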
1210 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1211 int offset, int len, int odd, struct sk_buff *skb),
1212 void *from, int length, int transhdrlen,
1213 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1214 struct rt6_info *rt, unsigned int flags, int dontfrag)
1215 {
1216 struct inet_sock *inet = inet_sk(sk);
1217 struct ipv6_pinfo *np = inet6_sk(sk);
1218 struct inet_cork *cork;
1219 struct sk_buff *skb, *skb_prev = NULL;
1220 unsigned int maxfraglen, fragheaderlen;
1221 int exthdrlen;
1222 int dst_exthdrlen;
1223 int hh_len;
1224 int mtu;
1225 int copy;
1226 int err;
1227 int offset = 0;
1228 __u8 tx_flags = 0;
1229
1230 if (flags&MSG_PROBE)
1231 return 0;
1232 cork = &inet->cork.base;
1233 if (skb_queue_empty(&sk->sk_write_queue)) {
1234 /*
1235 * setup for corking
1236 */
1237 if (opt) {
1238 if (WARN_ON(np->cork.opt))
1239 return -EINVAL;
1240
1241 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1242 if (unlikely(np->cork.opt == NULL))
1243 return -ENOBUFS;
1244
1245 np->cork.opt->tot_len = opt->tot_len;
1246 np->cork.opt->opt_flen = opt->opt_flen;
1247 np->cork.opt->opt_nflen = opt->opt_nflen;
1248
1249 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1250 sk->sk_allocation);
1251 if (opt->dst0opt && !np->cork.opt->dst0opt)
1252 return -ENOBUFS;
1253
1254 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1255 sk->sk_allocation);
1256 if (opt->dst1opt && !np->cork.opt->dst1opt)
1257 return -ENOBUFS;
1258
1259 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1260 sk->sk_allocation);
1261 if (opt->hopopt && !np->cork.opt->hopopt)
1262 return -ENOBUFS;
1263
1264 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1265 sk->sk_allocation);
1266 if (opt->srcrt && !np->cork.opt->srcrt)
1267 return -ENOBUFS;
1268
1269 /* need source address above miyazawa*/
1270 }
1271 dst_hold(&rt->dst);
1272 cork->dst = &rt->dst;
1273 inet->cork.fl.u.ip6 = *fl6;
1274 np->cork.hop_limit = hlimit;
1275 np->cork.tclass = tclass;
1276 if (rt->dst.flags & DST_XFRM_TUNNEL)
1277 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1278 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1279 else
1280 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1281 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1282 if (np->frag_size < mtu) {
1283 if (np->frag_size)
1284 mtu = np->frag_size;
1285 }
1286 cork->fragsize = mtu;
1287 if (dst_allfrag(rt->dst.path))
1288 cork->flags |= IPCORK_ALLFRAG;
1289 cork->length = 0;
1290 sk->sk_sndmsg_page = NULL;
1291 sk->sk_sndmsg_off = 0;
1292 exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1293 length += exthdrlen;
1294 transhdrlen += exthdrlen;
1295 dst_exthdrlen = rt->dst.header_len;
1296 } else {
1297 rt = (struct rt6_info *)cork->dst;
1298 fl6 = &inet->cork.fl.u.ip6;
1299 opt = np->cork.opt;
1300 transhdrlen = 0;
1301 exthdrlen = 0;
1302 dst_exthdrlen = 0;
1303 mtu = cork->fragsize;
1304 }
1305
1306 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1307
1308 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1309 (opt ? opt->opt_nflen : 0);
1310 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
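/* Worked example (illustrative, assuming mtu 1500 and a bare 40 byte
 * IPv6 header so fragheaderlen == 40): maxfraglen becomes
 * ((1500 - 40) & ~7) + 40 - 8 = 1488, which leaves room for the 8 byte
 * fragment header while keeping every fragment's payload a multiple
 * of 8.
 */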
1311
1312 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1313 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1314 ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1315 return -EMSGSIZE;
1316 }
1317 }
1318
1319 /* For UDP, check if TX timestamp is enabled */
1320 if (sk->sk_type == SOCK_DGRAM) {
1321 err = sock_tx_timestamp(sk, &tx_flags);
1322 if (err)
1323 goto error;
1324 }
1325
1326 /*
1327 * Let's try using as much space as possible.
1328 * Use MTU if total length of the message fits into the MTU.
1329 * Otherwise, we need to reserve fragment header and
1330 * fragment alignment (= 8-15 octets, in total).
1331 *
1332 * Note that we may need to "move" the data from the tail
1333 * of the buffer to the new fragment when we split
1334 * the message.
1335 *
1336 * FIXME: It may be fragmented into multiple chunks
1337 * at once if non-fragmentable extension headers
1338 * are too large.
1339 * --yoshfuji
1340 */
1341
1342 cork->length += length;
1343 if (length > mtu) {
1344 int proto = sk->sk_protocol;
1345 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1346 ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1347 return -EMSGSIZE;
1348 }
1349
1350 if (proto == IPPROTO_UDP &&
1351 (rt->dst.dev->features & NETIF_F_UFO)) {
1352
1353 err = ip6_ufo_append_data(sk, getfrag, from, length,
1354 hh_len, fragheaderlen,
1355 transhdrlen, mtu, flags, rt);
1356 if (err)
1357 goto error;
1358 return 0;
1359 }
1360 }
1361
1362 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1363 goto alloc_new_skb;
1364
1365 while (length > 0) {
1366 /* Check if the remaining data fits into current packet. */
1367 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1368 if (copy < length)
1369 copy = maxfraglen - skb->len;
1370
1371 if (copy <= 0) {
1372 char *data;
1373 unsigned int datalen;
1374 unsigned int fraglen;
1375 unsigned int fraggap;
1376 unsigned int alloclen;
1377 alloc_new_skb:
1378 /* There's no room in the current skb */
1379 if (skb)
1380 fraggap = skb->len - maxfraglen;
1381 else
1382 fraggap = 0;
1383 /* update mtu and maxfraglen if necessary */
1384 if (skb == NULL || skb_prev == NULL)
1385 ip6_append_data_mtu(&mtu, &maxfraglen,
1386 fragheaderlen, skb, rt);
1387
1388 skb_prev = skb;
1389
1390 /*
1391 * If remaining data exceeds the mtu,
1392 * we know we need more fragment(s).
1393 */
1394 datalen = length + fraggap;
1395
1396 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1397 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1398 if ((flags & MSG_MORE) &&
1399 !(rt->dst.dev->features&NETIF_F_SG))
1400 alloclen = mtu;
1401 else
1402 alloclen = datalen + fragheaderlen;
1403
1404 alloclen += dst_exthdrlen;
1405
1406 if (datalen != length + fraggap) {
1407 /*
1408 * this is not the last fragment, the trailer
1409 * space is regarded as data space.
1410 */
1411 datalen += rt->dst.trailer_len;
1412 }
1413
1414 alloclen += rt->dst.trailer_len;
1415 fraglen = datalen + fragheaderlen;
1416
1417 /*
1418 * We just reserve space for fragment header.
1419 * Note: this may be overallocation if the message
1420 * (without MSG_MORE) fits into the MTU.
1421 */
1422 alloclen += sizeof(struct frag_hdr);
1423
1424 if (transhdrlen) {
1425 skb = sock_alloc_send_skb(sk,
1426 alloclen + hh_len,
1427 (flags & MSG_DONTWAIT), &err);
1428 } else {
1429 skb = NULL;
1430 if (atomic_read(&sk->sk_wmem_alloc) <=
1431 2 * sk->sk_sndbuf)
1432 skb = sock_wmalloc(sk,
1433 alloclen + hh_len, 1,
1434 sk->sk_allocation);
1435 if (unlikely(skb == NULL))
1436 err = -ENOBUFS;
1437 else {
1438 /* Only the initial fragment
1439 * is time stamped.
1440 */
1441 tx_flags = 0;
1442 }
1443 }
1444 if (skb == NULL)
1445 goto error;
1446 /*
1447 * Fill in the control structures
1448 */
1449 skb->ip_summed = CHECKSUM_NONE;
1450 skb->csum = 0;
1451 /* reserve for fragmentation and ipsec header */
1452 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1453 dst_exthdrlen);
1454
1455 if (sk->sk_type == SOCK_DGRAM)
1456 skb_shinfo(skb)->tx_flags = tx_flags;
1457
1458 /*
1459 * Find where to start putting bytes
1460 */
1461 data = skb_put(skb, fraglen);
1462 skb_set_network_header(skb, exthdrlen);
1463 data += fragheaderlen;
1464 skb->transport_header = (skb->network_header +
1465 fragheaderlen);
1466 if (fraggap) {
1467 skb->csum = skb_copy_and_csum_bits(
1468 skb_prev, maxfraglen,
1469 data + transhdrlen, fraggap, 0);
1470 skb_prev->csum = csum_sub(skb_prev->csum,
1471 skb->csum);
1472 data += fraggap;
1473 pskb_trim_unique(skb_prev, maxfraglen);
1474 }
1475 copy = datalen - transhdrlen - fraggap;
1476
1477 if (copy < 0) {
1478 err = -EINVAL;
1479 kfree_skb(skb);
1480 goto error;
1481 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1482 err = -EFAULT;
1483 kfree_skb(skb);
1484 goto error;
1485 }
1486
1487 offset += copy;
1488 length -= datalen - fraggap;
1489 transhdrlen = 0;
1490 exthdrlen = 0;
1491 dst_exthdrlen = 0;
1492
1493 /*
1494 * Put the packet on the pending queue
1495 */
1496 __skb_queue_tail(&sk->sk_write_queue, skb);
1497 continue;
1498 }
1499
1500 if (copy > length)
1501 copy = length;
1502
1503 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1504 unsigned int off;
1505
1506 off = skb->len;
1507 if (getfrag(from, skb_put(skb, copy),
1508 offset, copy, off, skb) < 0) {
1509 __skb_trim(skb, off);
1510 err = -EFAULT;
1511 goto error;
1512 }
1513 } else {
1514 int i = skb_shinfo(skb)->nr_frags;
1515 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1516 struct page *page = sk->sk_sndmsg_page;
1517 int off = sk->sk_sndmsg_off;
1518 unsigned int left;
1519
1520 if (page && (left = PAGE_SIZE - off) > 0) {
1521 if (copy >= left)
1522 copy = left;
1523 if (page != skb_frag_page(frag)) {
1524 if (i == MAX_SKB_FRAGS) {
1525 err = -EMSGSIZE;
1526 goto error;
1527 }
1528 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1529 skb_frag_ref(skb, i);
1530 frag = &skb_shinfo(skb)->frags[i];
1531 }
1532 } else if(i < MAX_SKB_FRAGS) {
1533 if (copy > PAGE_SIZE)
1534 copy = PAGE_SIZE;
1535 page = alloc_pages(sk->sk_allocation, 0);
1536 if (page == NULL) {
1537 err = -ENOMEM;
1538 goto error;
1539 }
1540 sk->sk_sndmsg_page = page;
1541 sk->sk_sndmsg_off = 0;
1542
1543 skb_fill_page_desc(skb, i, page, 0, 0);
1544 frag = &skb_shinfo(skb)->frags[i];
1545 } else {
1546 err = -EMSGSIZE;
1547 goto error;
1548 }
1549 if (getfrag(from,
1550 skb_frag_address(frag) + skb_frag_size(frag),
1551 offset, copy, skb->len, skb) < 0) {
1552 err = -EFAULT;
1553 goto error;
1554 }
1555 sk->sk_sndmsg_off += copy;
1556 skb_frag_size_add(frag, copy);
1557 skb->len += copy;
1558 skb->data_len += copy;
1559 skb->truesize += copy;
1560 atomic_add(copy, &sk->sk_wmem_alloc);
1561 }
1562 offset += copy;
1563 length -= copy;
1564 }
1565 return 0;
1566 error:
1567 cork->length -= length;
1568 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1569 return err;
1570 }
1571 EXPORT_SYMBOL_GPL(ip6_append_data);
1572
1573 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1574 {
1575 if (np->cork.opt) {
1576 kfree(np->cork.opt->dst0opt);
1577 kfree(np->cork.opt->dst1opt);
1578 kfree(np->cork.opt->hopopt);
1579 kfree(np->cork.opt->srcrt);
1580 kfree(np->cork.opt);
1581 np->cork.opt = NULL;
1582 }
1583
1584 if (inet->cork.base.dst) {
1585 dst_release(inet->cork.base.dst);
1586 inet->cork.base.dst = NULL;
1587 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1588 }
1589 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1590 }
1591
1592 int ip6_push_pending_frames(struct sock *sk)
1593 {
1594 struct sk_buff *skb, *tmp_skb;
1595 struct sk_buff **tail_skb;
1596 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1597 struct inet_sock *inet = inet_sk(sk);
1598 struct ipv6_pinfo *np = inet6_sk(sk);
1599 struct net *net = sock_net(sk);
1600 struct ipv6hdr *hdr;
1601 struct ipv6_txoptions *opt = np->cork.opt;
1602 struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1603 struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1604 unsigned char proto = fl6->flowi6_proto;
1605 int err = 0;
1606
1607 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1608 goto out;
1609 tail_skb = &(skb_shinfo(skb)->frag_list);
1610
1611 /* move skb->data to ip header from ext header */
1612 if (skb->data < skb_network_header(skb))
1613 __skb_pull(skb, skb_network_offset(skb));
1614 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1615 __skb_pull(tmp_skb, skb_network_header_len(skb));
1616 *tail_skb = tmp_skb;
1617 tail_skb = &(tmp_skb->next);
1618 skb->len += tmp_skb->len;
1619 skb->data_len += tmp_skb->len;
1620 skb->truesize += tmp_skb->truesize;
1621 tmp_skb->destructor = NULL;
1622 tmp_skb->sk = NULL;
1623 }
1624
1625 /* Allow local fragmentation. */
1626 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1627 skb->local_df = 1;
1628
1629 *final_dst = fl6->daddr;
1630 __skb_pull(skb, skb_network_header_len(skb));
1631 if (opt && opt->opt_flen)
1632 ipv6_push_frag_opts(skb, opt, &proto);
1633 if (opt && opt->opt_nflen)
1634 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1635
1636 skb_push(skb, sizeof(struct ipv6hdr));
1637 skb_reset_network_header(skb);
1638 hdr = ipv6_hdr(skb);
1639
1640 *(__be32*)hdr = fl6->flowlabel |
1641 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1642
1643 hdr->hop_limit = np->cork.hop_limit;
1644 hdr->nexthdr = proto;
1645 hdr->saddr = fl6->saddr;
1646 hdr->daddr = *final_dst;
1647
1648 skb->priority = sk->sk_priority;
1649 skb->mark = sk->sk_mark;
1650
1651 skb_dst_set(skb, dst_clone(&rt->dst));
1652 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1653 if (proto == IPPROTO_ICMPV6) {
1654 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1655
1656 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1657 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1658 }
1659
1660 err = ip6_local_out(skb);
1661 if (err) {
1662 if (err > 0)
1663 err = net_xmit_errno(err);
1664 if (err)
1665 goto error;
1666 }
1667
1668 out:
1669 ip6_cork_release(inet, np);
1670 return err;
1671 error:
1672 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1673 goto out;
1674 }
1675 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1676
1677 void ip6_flush_pending_frames(struct sock *sk)
1678 {
1679 struct sk_buff *skb;
1680
1681 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1682 if (skb_dst(skb))
1683 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1684 IPSTATS_MIB_OUTDISCARDS);
1685 kfree_skb(skb);
1686 }
1687
1688 ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1689 }
1690 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);