net/ipv6/ip6_output.c  [mirror_ubuntu-zesty-kernel.git]
1 /*
2 * IPv6 output functions
3 * Linux INET6 implementation
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * Based on linux/net/ipv4/ip_output.c
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * Changes:
16 * A.N.Kuznetsov : arithmetic in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
20 * etc.
21 *
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
26 * for datagram xmit
27 */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
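/*
 * A note on the payload_len fixup below (illustrative): payload_len counts
 * everything after the fixed 40-byte IPv6 header, so a 1540-byte skb ends up
 * with payload_len = 1500. If the payload would exceed IPV6_MAXPLEN (65535),
 * the field is set to 0 instead, as a jumbogram carries its length in a
 * hop-by-hop option rather than here.
 */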
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63 int len;
64
65 len = skb->len - sizeof(struct ipv6hdr);
66 if (len > IPV6_MAXPLEN)
67 len = 0;
68 ipv6_hdr(skb)->payload_len = htons(len);
69
70 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 skb_dst(skb)->dev, dst_output);
72 }
73
74 int ip6_local_out(struct sk_buff *skb)
75 {
76 int err;
77
78 err = __ip6_local_out(skb);
79 if (likely(err == 1))
80 err = dst_output(skb);
81
82 return err;
83 }
84 EXPORT_SYMBOL_GPL(ip6_local_out);
85
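/*
 * Rough outline of ip6_finish_output2(): multicast packets may first be
 * looped back to local listeners via dev_loopback_xmit(), then the skb is
 * handed to the neighbour layer through dst_neigh_output(); if no neighbour
 * entry is attached to the route, the packet is dropped and OUTNOROUTES is
 * bumped.
 */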
86 static int ip6_finish_output2(struct sk_buff *skb)
87 {
88 struct dst_entry *dst = skb_dst(skb);
89 struct net_device *dev = dst->dev;
90 struct neighbour *neigh;
91 struct rt6_info *rt;
92
93 skb->protocol = htons(ETH_P_IPV6);
94 skb->dev = dev;
95
96 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
97 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
98
99 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
100 ((mroute6_socket(dev_net(dev), skb) &&
101 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
102 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
103 &ipv6_hdr(skb)->saddr))) {
104 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
105
106 /* Do not check for IFF_ALLMULTI; multicast routing
107 is not supported in any case.
108 */
109 if (newskb)
110 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
111 newskb, NULL, newskb->dev,
112 dev_loopback_xmit);
113
114 if (ipv6_hdr(skb)->hop_limit == 0) {
115 IP6_INC_STATS(dev_net(dev), idev,
116 IPSTATS_MIB_OUTDISCARDS);
117 kfree_skb(skb);
118 return 0;
119 }
120 }
121
122 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
123 skb->len);
124 }
125
126 rcu_read_lock();
127 rt = (struct rt6_info *) dst;
128 neigh = rt->n;
129 if (neigh) {
130 int res = dst_neigh_output(dst, neigh, skb);
131
132 rcu_read_unlock();
133 return res;
134 }
135 rcu_read_unlock();
136 IP6_INC_STATS_BH(dev_net(dst->dev),
137 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
138 kfree_skb(skb);
139 return -EINVAL;
140 }
141
142 static int ip6_finish_output(struct sk_buff *skb)
143 {
144 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
145 dst_allfrag(skb_dst(skb)))
146 return ip6_fragment(skb, ip6_finish_output2);
147 else
148 return ip6_finish_output2(skb);
149 }
150
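/*
 * ip6_output() serves, roughly, as the dst_output handler for IPv6: it
 * drops everything when IPv6 is administratively disabled on the device,
 * otherwise it runs the NF_INET_POST_ROUTING hook and continues in
 * ip6_finish_output(), which fragments packets that exceed the path MTU.
 */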
151 int ip6_output(struct sk_buff *skb)
152 {
153 struct net_device *dev = skb_dst(skb)->dev;
154 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
155 if (unlikely(idev->cnf.disable_ipv6)) {
156 IP6_INC_STATS(dev_net(dev), idev,
157 IPSTATS_MIB_OUTDISCARDS);
158 kfree_skb(skb);
159 return 0;
160 }
161
162 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
163 ip6_finish_output,
164 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
165 }
166
167 /*
168 * xmit an sk_buff (used by TCP, SCTP and DCCP)
169 */
170
171 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
172 struct ipv6_txoptions *opt, int tclass)
173 {
174 struct net *net = sock_net(sk);
175 struct ipv6_pinfo *np = inet6_sk(sk);
176 struct in6_addr *first_hop = &fl6->daddr;
177 struct dst_entry *dst = skb_dst(skb);
178 struct ipv6hdr *hdr;
179 u8 proto = fl6->flowi6_proto;
180 int seg_len = skb->len;
181 int hlimit = -1;
182 u32 mtu;
183
184 if (opt) {
185 unsigned int head_room;
186
187 /* First: exthdrs may take lots of space (~8K for now);
188 MAX_HEADER is not enough.
189 */
190 head_room = opt->opt_nflen + opt->opt_flen;
191 seg_len += head_room;
192 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
193
194 if (skb_headroom(skb) < head_room) {
195 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
196 if (skb2 == NULL) {
197 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
198 IPSTATS_MIB_OUTDISCARDS);
199 kfree_skb(skb);
200 return -ENOBUFS;
201 }
202 consume_skb(skb);
203 skb = skb2;
204 skb_set_owner_w(skb, sk);
205 }
206 if (opt->opt_flen)
207 ipv6_push_frag_opts(skb, opt, &proto);
208 if (opt->opt_nflen)
209 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
210 }
211
212 skb_push(skb, sizeof(struct ipv6hdr));
213 skb_reset_network_header(skb);
214 hdr = ipv6_hdr(skb);
215
216 /*
217 * Fill in the IPv6 header
218 */
219 if (np)
220 hlimit = np->hop_limit;
221 if (hlimit < 0)
222 hlimit = ip6_dst_hoplimit(dst);
223
224 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
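/* First 32-bit word of the header, for reference: version 6 in the top
 * 4 bits, traffic class in bits 20-27, and the flow label below that.
 * E.g. tclass = 0x10 with a zero flow label yields 0x61000000.
 */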
225
226 hdr->payload_len = htons(seg_len);
227 hdr->nexthdr = proto;
228 hdr->hop_limit = hlimit;
229
230 hdr->saddr = fl6->saddr;
231 hdr->daddr = *first_hop;
232
233 skb->priority = sk->sk_priority;
234 skb->mark = sk->sk_mark;
235
236 mtu = dst_mtu(dst);
237 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
238 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
239 IPSTATS_MIB_OUT, skb->len);
240 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
241 dst->dev, dst_output);
242 }
243
244 net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
245 skb->dev = dst->dev;
246 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
247 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
248 kfree_skb(skb);
249 return -EMSGSIZE;
250 }
251
252 EXPORT_SYMBOL(ip6_xmit);
253
254 /*
255 * To avoid extra problems ND packets are sent through this
256 * routine. It's code duplication but I really want to avoid
257 * extra checks since ipv6_build_header is used by TCP (which
258 * is performance critical for us)
259 */
260
261 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
262 const struct in6_addr *saddr, const struct in6_addr *daddr,
263 int proto, int len)
264 {
265 struct ipv6_pinfo *np = inet6_sk(sk);
266 struct ipv6hdr *hdr;
267
268 skb->protocol = htons(ETH_P_IPV6);
269 skb->dev = dev;
270
271 skb_reset_network_header(skb);
272 skb_put(skb, sizeof(struct ipv6hdr));
273 hdr = ipv6_hdr(skb);
274
275 *(__be32*)hdr = htonl(0x60000000);
276
277 hdr->payload_len = htons(len);
278 hdr->nexthdr = proto;
279 hdr->hop_limit = np->hop_limit;
280
281 hdr->saddr = *saddr;
282 hdr->daddr = *daddr;
283
284 return 0;
285 }
286
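/*
 * Delivery model of ip6_call_ra_chain(), in brief: every matching Router
 * Alert listener except the last one gets a clone of the skb, the last
 * matching socket receives the original, and a return value of 1 means the
 * packet has been consumed and must not be forwarded further.
 */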
287 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
288 {
289 struct ip6_ra_chain *ra;
290 struct sock *last = NULL;
291
292 read_lock(&ip6_ra_lock);
293 for (ra = ip6_ra_chain; ra; ra = ra->next) {
294 struct sock *sk = ra->sk;
295 if (sk && ra->sel == sel &&
296 (!sk->sk_bound_dev_if ||
297 sk->sk_bound_dev_if == skb->dev->ifindex)) {
298 if (last) {
299 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
300 if (skb2)
301 rawv6_rcv(last, skb2);
302 }
303 last = sk;
304 }
305 }
306
307 if (last) {
308 rawv6_rcv(last, skb);
309 read_unlock(&ip6_ra_lock);
310 return 1;
311 }
312 read_unlock(&ip6_ra_lock);
313 return 0;
314 }
315
316 static int ip6_forward_proxy_check(struct sk_buff *skb)
317 {
318 struct ipv6hdr *hdr = ipv6_hdr(skb);
319 u8 nexthdr = hdr->nexthdr;
320 __be16 frag_off;
321 int offset;
322
323 if (ipv6_ext_hdr(nexthdr)) {
324 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
325 if (offset < 0)
326 return 0;
327 } else
328 offset = sizeof(struct ipv6hdr);
329
330 if (nexthdr == IPPROTO_ICMPV6) {
331 struct icmp6hdr *icmp6;
332
333 if (!pskb_may_pull(skb, (skb_network_header(skb) +
334 offset + 1 - skb->data)))
335 return 0;
336
337 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
338
339 switch (icmp6->icmp6_type) {
340 case NDISC_ROUTER_SOLICITATION:
341 case NDISC_ROUTER_ADVERTISEMENT:
342 case NDISC_NEIGHBOUR_SOLICITATION:
343 case NDISC_NEIGHBOUR_ADVERTISEMENT:
344 case NDISC_REDIRECT:
345 /* For reactions involving a unicast neighbor discovery
346 * message destined to the proxied address, pass it to
347 * the input function.
348 */
349 return 1;
350 default:
351 break;
352 }
353 }
354
355 /*
356 * The proxying router can't forward traffic sent to a link-local
357 * address, so signal the sender and discard the packet. This
358 * behavior is clarified by the MIPv6 specification.
359 */
360 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
361 dst_link_failure(skb);
362 return -1;
363 }
364
365 return 0;
366 }
367
368 static inline int ip6_forward_finish(struct sk_buff *skb)
369 {
370 return dst_output(skb);
371 }
372
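/*
 * ip6_forward() in outline (informal): check that forwarding is enabled and
 * that the packet passes the XFRM policy, hand Router Alert packets to
 * ip6_call_ra_chain(), send ICMPV6_TIME_EXCEED when the hop limit would hit
 * zero, honour NDP proxying, emit redirects or packet-too-big errors where
 * appropriate, then decrement hop_limit and pass the skb to the
 * NF_INET_FORWARD hook.
 */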
373 int ip6_forward(struct sk_buff *skb)
374 {
375 struct dst_entry *dst = skb_dst(skb);
376 struct ipv6hdr *hdr = ipv6_hdr(skb);
377 struct inet6_skb_parm *opt = IP6CB(skb);
378 struct net *net = dev_net(dst->dev);
379 u32 mtu;
380
381 if (net->ipv6.devconf_all->forwarding == 0)
382 goto error;
383
384 if (skb_warn_if_lro(skb))
385 goto drop;
386
387 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
388 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
389 goto drop;
390 }
391
392 if (skb->pkt_type != PACKET_HOST)
393 goto drop;
394
395 skb_forward_csum(skb);
396
397 /*
398 * We do NOT do any processing on
399 * RA packets; we push them to user level AS IS
400 * without any warranty that the application will be able
401 * to interpret them. The reason is that we
402 * cannot do anything clever here.
403 *
404 * We are not the end node, so if the packet contains
405 * AH/ESP we cannot do anything with it.
406 * Defragmentation would also be a mistake: RA packets
407 * cannot be fragmented, because there is no guarantee
408 * that different fragments will follow the same path. --ANK
409 */
410 if (opt->ra) {
411 u8 *ptr = skb_network_header(skb) + opt->ra;
412 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
413 return 0;
414 }
415
416 /*
417 * check and decrement the hop limit
418 */
419 if (hdr->hop_limit <= 1) {
420 /* Force OUTPUT device used as source address */
421 skb->dev = dst->dev;
422 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
423 IP6_INC_STATS_BH(net,
424 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
425
426 kfree_skb(skb);
427 return -ETIMEDOUT;
428 }
429
430 /* XXX: idev->cnf.proxy_ndp? */
431 if (net->ipv6.devconf_all->proxy_ndp &&
432 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
433 int proxied = ip6_forward_proxy_check(skb);
434 if (proxied > 0)
435 return ip6_input(skb);
436 else if (proxied < 0) {
437 IP6_INC_STATS(net, ip6_dst_idev(dst),
438 IPSTATS_MIB_INDISCARDS);
439 goto drop;
440 }
441 }
442
443 if (!xfrm6_route_forward(skb)) {
444 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
445 goto drop;
446 }
447 dst = skb_dst(skb);
448
449 /* IPv6 specs say nothing about it, but it is clear that we cannot
450 send redirects to source routed frames.
451 We don't send redirects to frames decapsulated from IPsec.
452 */
453 if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
454 struct in6_addr *target = NULL;
455 struct inet_peer *peer;
456 struct rt6_info *rt;
457
458 /*
459 * incoming and outgoing devices are the same
460 * send a redirect.
461 */
462
463 rt = (struct rt6_info *) dst;
464 if (rt->rt6i_flags & RTF_GATEWAY)
465 target = &rt->rt6i_gateway;
466 else
467 target = &hdr->daddr;
468
469 peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
470
471 /* Limit redirects both by destination (here)
472 and by source (inside ndisc_send_redirect)
473 */
474 if (inet_peer_xrlim_allow(peer, 1*HZ))
475 ndisc_send_redirect(skb, target);
476 if (peer)
477 inet_putpeer(peer);
478 } else {
479 int addrtype = ipv6_addr_type(&hdr->saddr);
480
481 /* This check is security critical. */
482 if (addrtype == IPV6_ADDR_ANY ||
483 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
484 goto error;
485 if (addrtype & IPV6_ADDR_LINKLOCAL) {
486 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
487 ICMPV6_NOT_NEIGHBOUR, 0);
488 goto error;
489 }
490 }
491
492 mtu = dst_mtu(dst);
493 if (mtu < IPV6_MIN_MTU)
494 mtu = IPV6_MIN_MTU;
495
496 if (skb->len > mtu && !skb_is_gso(skb)) {
497 /* Again, force OUTPUT device used as source address */
498 skb->dev = dst->dev;
499 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
500 IP6_INC_STATS_BH(net,
501 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
502 IP6_INC_STATS_BH(net,
503 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
504 kfree_skb(skb);
505 return -EMSGSIZE;
506 }
507
508 if (skb_cow(skb, dst->dev->hard_header_len)) {
509 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
510 goto drop;
511 }
512
513 hdr = ipv6_hdr(skb);
514
515 /* Mangling hops number delayed to point after skb COW */
516
517 hdr->hop_limit--;
518
519 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
520 IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
521 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
522 ip6_forward_finish);
523
524 error:
525 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
526 drop:
527 kfree_skb(skb);
528 return -EINVAL;
529 }
530
531 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
532 {
533 to->pkt_type = from->pkt_type;
534 to->priority = from->priority;
535 to->protocol = from->protocol;
536 skb_dst_drop(to);
537 skb_dst_set(to, dst_clone(skb_dst(from)));
538 to->dev = from->dev;
539 to->mark = from->mark;
540
541 #ifdef CONFIG_NET_SCHED
542 to->tc_index = from->tc_index;
543 #endif
544 nf_copy(to, from);
545 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
546 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
547 to->nf_trace = from->nf_trace;
548 #endif
549 skb_copy_secmark(to, from);
550 }
551
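/*
 * Roughly, ip6_find_1stfragopt() walks the unfragmentable extension headers
 * (hop-by-hop, routing, and destination options that precede the routing
 * header or carry a Home Address option) and returns the offset at which a
 * fragment header would have to be inserted; *nexthdr is left pointing at
 * the "next header" byte that must then be rewritten to NEXTHDR_FRAGMENT.
 */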
552 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
553 {
554 u16 offset = sizeof(struct ipv6hdr);
555 struct ipv6_opt_hdr *exthdr =
556 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
557 unsigned int packet_len = skb->tail - skb->network_header;
558 int found_rhdr = 0;
559 *nexthdr = &ipv6_hdr(skb)->nexthdr;
560
561 while (offset + 1 <= packet_len) {
562
563 switch (**nexthdr) {
564
565 case NEXTHDR_HOP:
566 break;
567 case NEXTHDR_ROUTING:
568 found_rhdr = 1;
569 break;
570 case NEXTHDR_DEST:
571 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
572 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
573 break;
574 #endif
575 if (found_rhdr)
576 return offset;
577 break;
578 default :
579 return offset;
580 }
581
582 offset += ipv6_optlen(exthdr);
583 *nexthdr = &exthdr->nexthdr;
584 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
585 offset);
586 }
587
588 return offset;
589 }
590
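/*
 * Fragment ID selection, in short: when the route has an inetpeer entry the
 * ID comes from that peer's counter, giving per-destination sequences;
 * otherwise a single global atomic counter is used, skipping the value 0.
 */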
591 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
592 {
593 static atomic_t ipv6_fragmentation_id;
594 int old, new;
595
596 if (rt && !(rt->dst.flags & DST_NOPEER)) {
597 struct inet_peer *peer;
598 struct net *net;
599
600 net = dev_net(rt->dst.dev);
601 peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
602 if (peer) {
603 fhdr->identification = htonl(inet_getid(peer, 0));
604 inet_putpeer(peer);
605 return;
606 }
607 }
608 do {
609 old = atomic_read(&ipv6_fragmentation_id);
610 new = old + 1;
611 if (!new)
612 new = 1;
613 } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
614 fhdr->identification = htonl(new);
615 }
616
617 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
618 {
619 struct sk_buff *frag;
620 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
621 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
622 struct ipv6hdr *tmp_hdr;
623 struct frag_hdr *fh;
624 unsigned int mtu, hlen, left, len;
625 int hroom, troom;
626 __be32 frag_id = 0;
627 int ptr, offset = 0, err=0;
628 u8 *prevhdr, nexthdr = 0;
629 struct net *net = dev_net(skb_dst(skb)->dev);
630
631 hlen = ip6_find_1stfragopt(skb, &prevhdr);
632 nexthdr = *prevhdr;
633
634 mtu = ip6_skb_dst_mtu(skb);
635
636 /* We must not fragment if the socket is set to force MTU discovery
637 * or if the skb was not generated by a local socket.
638 */
639 if (unlikely(!skb->local_df && skb->len > mtu)) {
640 if (skb->sk && dst_allfrag(skb_dst(skb)))
641 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
642
643 skb->dev = skb_dst(skb)->dev;
644 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
645 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
646 IPSTATS_MIB_FRAGFAILS);
647 kfree_skb(skb);
648 return -EMSGSIZE;
649 }
650
651 if (np && np->frag_size < mtu) {
652 if (np->frag_size)
653 mtu = np->frag_size;
654 }
655 mtu -= hlen + sizeof(struct frag_hdr);
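/* Worked example (assuming a 1500-byte MTU and only the 40-byte base
 * header): hlen = 40, so mtu becomes 1500 - 40 - 8 = 1452 bytes of
 * fragmentable payload; the slow path below additionally rounds non-final
 * fragments down to a multiple of 8, i.e. 1448 bytes each.
 */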
656
657 if (skb_has_frag_list(skb)) {
658 int first_len = skb_pagelen(skb);
659 struct sk_buff *frag2;
660
661 if (first_len - hlen > mtu ||
662 ((first_len - hlen) & 7) ||
663 skb_cloned(skb))
664 goto slow_path;
665
666 skb_walk_frags(skb, frag) {
667 /* Correct geometry. */
668 if (frag->len > mtu ||
669 ((frag->len & 7) && frag->next) ||
670 skb_headroom(frag) < hlen)
671 goto slow_path_clean;
672
673 /* Partially cloned skb? */
674 if (skb_shared(frag))
675 goto slow_path_clean;
676
677 BUG_ON(frag->sk);
678 if (skb->sk) {
679 frag->sk = skb->sk;
680 frag->destructor = sock_wfree;
681 }
682 skb->truesize -= frag->truesize;
683 }
684
685 err = 0;
686 offset = 0;
687 frag = skb_shinfo(skb)->frag_list;
688 skb_frag_list_init(skb);
689 /* BUILD HEADER */
690
691 *prevhdr = NEXTHDR_FRAGMENT;
692 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
693 if (!tmp_hdr) {
694 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
695 IPSTATS_MIB_FRAGFAILS);
696 return -ENOMEM;
697 }
698
699 __skb_pull(skb, hlen);
700 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
701 __skb_push(skb, hlen);
702 skb_reset_network_header(skb);
703 memcpy(skb_network_header(skb), tmp_hdr, hlen);
704
705 ipv6_select_ident(fh, rt);
706 fh->nexthdr = nexthdr;
707 fh->reserved = 0;
708 fh->frag_off = htons(IP6_MF);
709 frag_id = fh->identification;
710
711 first_len = skb_pagelen(skb);
712 skb->data_len = first_len - skb_headlen(skb);
713 skb->len = first_len;
714 ipv6_hdr(skb)->payload_len = htons(first_len -
715 sizeof(struct ipv6hdr));
716
717 dst_hold(&rt->dst);
718
719 for (;;) {
720 /* Prepare the header of the next frame
721 * before the previous one goes out. */
722 if (frag) {
723 frag->ip_summed = CHECKSUM_NONE;
724 skb_reset_transport_header(frag);
725 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
726 __skb_push(frag, hlen);
727 skb_reset_network_header(frag);
728 memcpy(skb_network_header(frag), tmp_hdr,
729 hlen);
730 offset += skb->len - hlen - sizeof(struct frag_hdr);
731 fh->nexthdr = nexthdr;
732 fh->reserved = 0;
733 fh->frag_off = htons(offset);
734 if (frag->next != NULL)
735 fh->frag_off |= htons(IP6_MF);
736 fh->identification = frag_id;
737 ipv6_hdr(frag)->payload_len =
738 htons(frag->len -
739 sizeof(struct ipv6hdr));
740 ip6_copy_metadata(frag, skb);
741 }
742
743 err = output(skb);
744 if(!err)
745 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
746 IPSTATS_MIB_FRAGCREATES);
747
748 if (err || !frag)
749 break;
750
751 skb = frag;
752 frag = skb->next;
753 skb->next = NULL;
754 }
755
756 kfree(tmp_hdr);
757
758 if (err == 0) {
759 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
760 IPSTATS_MIB_FRAGOKS);
761 dst_release(&rt->dst);
762 return 0;
763 }
764
765 while (frag) {
766 skb = frag->next;
767 kfree_skb(frag);
768 frag = skb;
769 }
770
771 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
772 IPSTATS_MIB_FRAGFAILS);
773 dst_release(&rt->dst);
774 return err;
775
776 slow_path_clean:
777 skb_walk_frags(skb, frag2) {
778 if (frag2 == frag)
779 break;
780 frag2->sk = NULL;
781 frag2->destructor = NULL;
782 skb->truesize += frag2->truesize;
783 }
784 }
785
786 slow_path:
787 if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
788 skb_checksum_help(skb))
789 goto fail;
790
791 left = skb->len - hlen; /* Space per frame */
792 ptr = hlen; /* Where to start from */
793
794 /*
795 * Fragment the datagram.
796 */
797
798 *prevhdr = NEXTHDR_FRAGMENT;
799 hroom = LL_RESERVED_SPACE(rt->dst.dev);
800 troom = rt->dst.dev->needed_tailroom;
801
802 /*
803 * Keep copying data until we run out.
804 */
805 while(left > 0) {
806 len = left;
807 /* IF: it doesn't fit, use 'mtu' - the data space left */
808 if (len > mtu)
809 len = mtu;
810 /* IF: we are not sending up to and including the packet end
811 then align the next start on an eight byte boundary */
812 if (len < left) {
813 len &= ~7;
814 }
815 /*
816 * Allocate buffer.
817 */
818
819 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
820 hroom + troom, GFP_ATOMIC)) == NULL) {
821 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
822 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
823 IPSTATS_MIB_FRAGFAILS);
824 err = -ENOMEM;
825 goto fail;
826 }
827
828 /*
829 * Set up data on packet
830 */
831
832 ip6_copy_metadata(frag, skb);
833 skb_reserve(frag, hroom);
834 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
835 skb_reset_network_header(frag);
836 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
837 frag->transport_header = (frag->network_header + hlen +
838 sizeof(struct frag_hdr));
839
840 /*
841 * Charge the memory for the fragment to any owner
842 * it might possess
843 */
844 if (skb->sk)
845 skb_set_owner_w(frag, skb->sk);
846
847 /*
848 * Copy the packet header into the new buffer.
849 */
850 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
851
852 /*
853 * Build fragment header.
854 */
855 fh->nexthdr = nexthdr;
856 fh->reserved = 0;
857 if (!frag_id) {
858 ipv6_select_ident(fh, rt);
859 frag_id = fh->identification;
860 } else
861 fh->identification = frag_id;
862
863 /*
864 * Copy a block of the IP datagram.
865 */
866 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
867 BUG();
868 left -= len;
869
870 fh->frag_off = htons(offset);
871 if (left > 0)
872 fh->frag_off |= htons(IP6_MF);
873 ipv6_hdr(frag)->payload_len = htons(frag->len -
874 sizeof(struct ipv6hdr));
875
876 ptr += len;
877 offset += len;
878
879 /*
880 * Put this fragment into the sending queue.
881 */
882 err = output(frag);
883 if (err)
884 goto fail;
885
886 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
887 IPSTATS_MIB_FRAGCREATES);
888 }
889 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
890 IPSTATS_MIB_FRAGOKS);
891 consume_skb(skb);
892 return err;
893
894 fail:
895 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
896 IPSTATS_MIB_FRAGFAILS);
897 kfree_skb(skb);
898 return err;
899 }
900
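/*
 * Roughly, ip6_rt_check() reports the cached route as stale for fl_addr
 * unless it is a host (/128) route whose key matches fl_addr, or fl_addr
 * matches the address the socket last used (addr_cache);
 * ip6_sk_dst_check() uses this to decide whether to drop the socket's
 * cached dst and force a fresh lookup.
 */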
901 static inline int ip6_rt_check(const struct rt6key *rt_key,
902 const struct in6_addr *fl_addr,
903 const struct in6_addr *addr_cache)
904 {
905 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
906 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
907 }
908
909 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
910 struct dst_entry *dst,
911 const struct flowi6 *fl6)
912 {
913 struct ipv6_pinfo *np = inet6_sk(sk);
914 struct rt6_info *rt = (struct rt6_info *)dst;
915
916 if (!dst)
917 goto out;
918
919 /* Yes, checking route validity in the not-connected
920 * case is not very simple. Take into account
921 * that we do not support routing by source, TOS,
922 * and MSG_DONTROUTE --ANK (980726)
923 *
924 * 1. ip6_rt_check(): If the route was a host route,
925 * check that the cached destination is current.
926 * If it is a network route, we can still
927 * check its validity using the saved pointer
928 * to the last used address: daddr_cache.
929 * We do not want to save the whole address now
930 * (because the main consumer of this service
931 * is TCP, which does not have this problem),
932 * so this last trick works only on connected
933 * sockets.
934 * 2. oif should also be the same.
935 */
936 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
937 #ifdef CONFIG_IPV6_SUBTREES
938 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
939 #endif
940 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
941 dst_release(dst);
942 dst = NULL;
943 }
944
945 out:
946 return dst;
947 }
948
949 static int ip6_dst_lookup_tail(struct sock *sk,
950 struct dst_entry **dst, struct flowi6 *fl6)
951 {
952 struct net *net = sock_net(sk);
953 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
954 struct neighbour *n;
955 struct rt6_info *rt;
956 #endif
957 int err;
958
959 if (*dst == NULL)
960 *dst = ip6_route_output(net, sk, fl6);
961
962 if ((err = (*dst)->error))
963 goto out_err_release;
964
965 if (ipv6_addr_any(&fl6->saddr)) {
966 struct rt6_info *rt = (struct rt6_info *) *dst;
967 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
968 sk ? inet6_sk(sk)->srcprefs : 0,
969 &fl6->saddr);
970 if (err)
971 goto out_err_release;
972 }
973
974 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
975 /*
976 * If the dst entry we've looked up
977 * has a neighbour entry that is in the INCOMPLETE
978 * state and the source address from the flow is
979 * marked as OPTIMISTIC, we release the found
980 * dst entry and replace it with the
981 * dst entry of the next-hop router.
982 */
983 rcu_read_lock();
984 rt = (struct rt6_info *) *dst;
985 n = rt->n;
986 if (n && !(n->nud_state & NUD_VALID)) {
987 struct inet6_ifaddr *ifp;
988 struct flowi6 fl_gw6;
989 int redirect;
990
991 rcu_read_unlock();
992 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
993 (*dst)->dev, 1);
994
995 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
996 if (ifp)
997 in6_ifa_put(ifp);
998
999 if (redirect) {
1000 /*
1001 * We need to get the dst entry for the
1002 * default router instead
1003 */
1004 dst_release(*dst);
1005 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1006 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1007 *dst = ip6_route_output(net, sk, &fl_gw6);
1008 if ((err = (*dst)->error))
1009 goto out_err_release;
1010 }
1011 } else {
1012 rcu_read_unlock();
1013 }
1014 #endif
1015
1016 return 0;
1017
1018 out_err_release:
1019 if (err == -ENETUNREACH)
1020 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1021 dst_release(*dst);
1022 *dst = NULL;
1023 return err;
1024 }
1025
1026 /**
1027 * ip6_dst_lookup - perform route lookup on flow
1028 * @sk: socket which provides route info
1029 * @dst: pointer to dst_entry * for result
1030 * @fl6: flow to lookup
1031 *
1032 * This function performs a route lookup on the given flow.
1033 *
1034 * It returns zero on success, or a standard errno code on error.
1035 */
1036 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1037 {
1038 *dst = NULL;
1039 return ip6_dst_lookup_tail(sk, dst, fl6);
1040 }
1041 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1042
1043 /**
1044 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1045 * @sk: socket which provides route info
1046 * @fl6: flow to lookup
1047 * @final_dst: final destination address for ipsec lookup
1048 * @can_sleep: we are in a sleepable context
1049 *
1050 * This function performs a route lookup on the given flow.
1051 *
1052 * It returns a valid dst pointer on success, or a pointer encoded
1053 * error code.
1054 */
1055 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1056 const struct in6_addr *final_dst,
1057 bool can_sleep)
1058 {
1059 struct dst_entry *dst = NULL;
1060 int err;
1061
1062 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1063 if (err)
1064 return ERR_PTR(err);
1065 if (final_dst)
1066 fl6->daddr = *final_dst;
1067 if (can_sleep)
1068 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1069
1070 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1071 }
1072 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1073
1074 /**
1075 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1076 * @sk: socket which provides the dst cache and route info
1077 * @fl6: flow to lookup
1078 * @final_dst: final destination address for ipsec lookup
1079 * @can_sleep: we are in a sleepable context
1080 *
1081 * This function performs a route lookup on the given flow with the
1082 * possibility of using the cached route in the socket if it is valid.
1083 * It will take the socket dst lock when operating on the dst cache.
1084 * As a result, this function can only be used in process context.
1085 *
1086 * It returns a valid dst pointer on success, or a pointer encoded
1087 * error code.
1088 */
1089 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1090 const struct in6_addr *final_dst,
1091 bool can_sleep)
1092 {
1093 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1094 int err;
1095
1096 dst = ip6_sk_dst_check(sk, dst, fl6);
1097
1098 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1099 if (err)
1100 return ERR_PTR(err);
1101 if (final_dst)
1102 fl6->daddr = *final_dst;
1103 if (can_sleep)
1104 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1105
1106 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1107 }
1108 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1109
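/*
 * UFO path in brief: instead of building individual fragments, the data is
 * appended to one skb and gso_size is set to the largest multiple of 8 that
 * fits in the MTU after the fragment header, so the device (or the software
 * GSO fallback) can emit the fragments itself.
 */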
1110 static inline int ip6_ufo_append_data(struct sock *sk,
1111 int getfrag(void *from, char *to, int offset, int len,
1112 int odd, struct sk_buff *skb),
1113 void *from, int length, int hh_len, int fragheaderlen,
1114 int transhdrlen, int mtu,unsigned int flags,
1115 struct rt6_info *rt)
1116
1117 {
1118 struct sk_buff *skb;
1119 int err;
1120
1121 /* The network device supports UDP large send offload, so
1122 * create one single skb packet containing the complete
1123 * UDP datagram
1124 */
1125 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1126 skb = sock_alloc_send_skb(sk,
1127 hh_len + fragheaderlen + transhdrlen + 20,
1128 (flags & MSG_DONTWAIT), &err);
1129 if (skb == NULL)
1130 return err;
1131
1132 /* reserve space for Hardware header */
1133 skb_reserve(skb, hh_len);
1134
1135 /* create space for UDP/IP header */
1136 skb_put(skb,fragheaderlen + transhdrlen);
1137
1138 /* initialize network header pointer */
1139 skb_reset_network_header(skb);
1140
1141 /* initialize protocol header pointer */
1142 skb->transport_header = skb->network_header + fragheaderlen;
1143
1144 skb->ip_summed = CHECKSUM_PARTIAL;
1145 skb->csum = 0;
1146 }
1147
1148 err = skb_append_datato_frags(sk,skb, getfrag, from,
1149 (length - transhdrlen));
1150 if (!err) {
1151 struct frag_hdr fhdr;
1152
1153 /* Specify the length of each IPv6 datagram fragment.
1154 * It has to be a multiple of 8.
1155 */
1156 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1157 sizeof(struct frag_hdr)) & ~7;
1158 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1159 ipv6_select_ident(&fhdr, rt);
1160 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1161 __skb_queue_tail(&sk->sk_write_queue, skb);
1162
1163 return 0;
1164 }
1165 /* There is not enough support to do UDP LSO,
1166 * so follow the normal path
1167 */
1168 kfree_skb(skb);
1169
1170 return err;
1171 }
1172
1173 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1174 gfp_t gfp)
1175 {
1176 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1177 }
1178
1179 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1180 gfp_t gfp)
1181 {
1182 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1183 }
1184
1185 static void ip6_append_data_mtu(int *mtu,
1186 int *maxfraglen,
1187 unsigned int fragheaderlen,
1188 struct sk_buff *skb,
1189 struct rt6_info *rt)
1190 {
1191 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1192 if (skb == NULL) {
1193 /* first fragment, reserve header_len */
1194 *mtu = *mtu - rt->dst.header_len;
1195
1196 } else {
1197 /*
1198 * this fragment is not the first one; the header
1199 * space is regarded as data space.
1200 */
1201 *mtu = dst_mtu(rt->dst.path);
1202 }
1203 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1204 + fragheaderlen - sizeof(struct frag_hdr);
1205 }
1206 }
1207
1208 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1209 int offset, int len, int odd, struct sk_buff *skb),
1210 void *from, int length, int transhdrlen,
1211 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1212 struct rt6_info *rt, unsigned int flags, int dontfrag)
1213 {
1214 struct inet_sock *inet = inet_sk(sk);
1215 struct ipv6_pinfo *np = inet6_sk(sk);
1216 struct inet_cork *cork;
1217 struct sk_buff *skb, *skb_prev = NULL;
1218 unsigned int maxfraglen, fragheaderlen;
1219 int exthdrlen;
1220 int dst_exthdrlen;
1221 int hh_len;
1222 int mtu;
1223 int copy;
1224 int err;
1225 int offset = 0;
1226 __u8 tx_flags = 0;
1227
1228 if (flags&MSG_PROBE)
1229 return 0;
1230 cork = &inet->cork.base;
1231 if (skb_queue_empty(&sk->sk_write_queue)) {
1232 /*
1233 * setup for corking
1234 */
1235 if (opt) {
1236 if (WARN_ON(np->cork.opt))
1237 return -EINVAL;
1238
1239 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1240 if (unlikely(np->cork.opt == NULL))
1241 return -ENOBUFS;
1242
1243 np->cork.opt->tot_len = opt->tot_len;
1244 np->cork.opt->opt_flen = opt->opt_flen;
1245 np->cork.opt->opt_nflen = opt->opt_nflen;
1246
1247 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1248 sk->sk_allocation);
1249 if (opt->dst0opt && !np->cork.opt->dst0opt)
1250 return -ENOBUFS;
1251
1252 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1253 sk->sk_allocation);
1254 if (opt->dst1opt && !np->cork.opt->dst1opt)
1255 return -ENOBUFS;
1256
1257 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1258 sk->sk_allocation);
1259 if (opt->hopopt && !np->cork.opt->hopopt)
1260 return -ENOBUFS;
1261
1262 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1263 sk->sk_allocation);
1264 if (opt->srcrt && !np->cork.opt->srcrt)
1265 return -ENOBUFS;
1266
1267 /* need source address above miyazawa */
1268 }
1269 dst_hold(&rt->dst);
1270 cork->dst = &rt->dst;
1271 inet->cork.fl.u.ip6 = *fl6;
1272 np->cork.hop_limit = hlimit;
1273 np->cork.tclass = tclass;
1274 if (rt->dst.flags & DST_XFRM_TUNNEL)
1275 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1276 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1277 else
1278 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1279 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1280 if (np->frag_size < mtu) {
1281 if (np->frag_size)
1282 mtu = np->frag_size;
1283 }
1284 cork->fragsize = mtu;
1285 if (dst_allfrag(rt->dst.path))
1286 cork->flags |= IPCORK_ALLFRAG;
1287 cork->length = 0;
1288 sk->sk_sndmsg_page = NULL;
1289 sk->sk_sndmsg_off = 0;
1290 exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1291 length += exthdrlen;
1292 transhdrlen += exthdrlen;
1293 dst_exthdrlen = rt->dst.header_len;
1294 } else {
1295 rt = (struct rt6_info *)cork->dst;
1296 fl6 = &inet->cork.fl.u.ip6;
1297 opt = np->cork.opt;
1298 transhdrlen = 0;
1299 exthdrlen = 0;
1300 dst_exthdrlen = 0;
1301 mtu = cork->fragsize;
1302 }
1303
1304 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1305
1306 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1307 (opt ? opt->opt_nflen : 0);
1308 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
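/* For example (assuming mtu = 1500 and fragheaderlen = 40):
 * maxfraglen = ((1500 - 40) & ~7) + 40 - 8 = 1488, which leaves room for
 * the 8-byte fragment header that ip6_fragment() inserts later while
 * keeping each fragment's data length a multiple of 8.
 */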
1309
1310 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1311 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1312 ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1313 return -EMSGSIZE;
1314 }
1315 }
1316
1317 /* For UDP, check if TX timestamp is enabled */
1318 if (sk->sk_type == SOCK_DGRAM) {
1319 err = sock_tx_timestamp(sk, &tx_flags);
1320 if (err)
1321 goto error;
1322 }
1323
1324 /*
1325 * Let's try using as much space as possible.
1326 * Use MTU if total length of the message fits into the MTU.
1327 * Otherwise, we need to reserve fragment header and
1328 * fragment alignment (= 8-15 octets, in total).
1329 *
1330 * Note that we may need to "move" the data from the tail
1331 * of the buffer to the new fragment when we split
1332 * the message.
1333 *
1334 * FIXME: It may be fragmented into multiple chunks
1335 * at once if non-fragmentable extension headers
1336 * are too large.
1337 * --yoshfuji
1338 */
1339
1340 cork->length += length;
1341 if (length > mtu) {
1342 int proto = sk->sk_protocol;
1343 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1344 ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1345 return -EMSGSIZE;
1346 }
1347
1348 if (proto == IPPROTO_UDP &&
1349 (rt->dst.dev->features & NETIF_F_UFO)) {
1350
1351 err = ip6_ufo_append_data(sk, getfrag, from, length,
1352 hh_len, fragheaderlen,
1353 transhdrlen, mtu, flags, rt);
1354 if (err)
1355 goto error;
1356 return 0;
1357 }
1358 }
1359
1360 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1361 goto alloc_new_skb;
1362
1363 while (length > 0) {
1364 /* Check if the remaining data fits into current packet. */
1365 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1366 if (copy < length)
1367 copy = maxfraglen - skb->len;
1368
1369 if (copy <= 0) {
1370 char *data;
1371 unsigned int datalen;
1372 unsigned int fraglen;
1373 unsigned int fraggap;
1374 unsigned int alloclen;
1375 alloc_new_skb:
1376 /* There's no room in the current skb */
1377 if (skb)
1378 fraggap = skb->len - maxfraglen;
1379 else
1380 fraggap = 0;
1381 /* update mtu and maxfraglen if necessary */
1382 if (skb == NULL || skb_prev == NULL)
1383 ip6_append_data_mtu(&mtu, &maxfraglen,
1384 fragheaderlen, skb, rt);
1385
1386 skb_prev = skb;
1387
1388 /*
1389 * If remaining data exceeds the mtu,
1390 * we know we need more fragment(s).
1391 */
1392 datalen = length + fraggap;
1393
1394 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1395 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1396 if ((flags & MSG_MORE) &&
1397 !(rt->dst.dev->features&NETIF_F_SG))
1398 alloclen = mtu;
1399 else
1400 alloclen = datalen + fragheaderlen;
1401
1402 alloclen += dst_exthdrlen;
1403
1404 if (datalen != length + fraggap) {
1405 /*
1406 * this is not the last fragment; the trailer
1407 * space is regarded as data space.
1408 */
1409 datalen += rt->dst.trailer_len;
1410 }
1411
1412 alloclen += rt->dst.trailer_len;
1413 fraglen = datalen + fragheaderlen;
1414
1415 /*
1416 * We just reserve space for fragment header.
1417 * Note: this may be overallocation if the message
1418 * (without MSG_MORE) fits into the MTU.
1419 */
1420 alloclen += sizeof(struct frag_hdr);
1421
1422 if (transhdrlen) {
1423 skb = sock_alloc_send_skb(sk,
1424 alloclen + hh_len,
1425 (flags & MSG_DONTWAIT), &err);
1426 } else {
1427 skb = NULL;
1428 if (atomic_read(&sk->sk_wmem_alloc) <=
1429 2 * sk->sk_sndbuf)
1430 skb = sock_wmalloc(sk,
1431 alloclen + hh_len, 1,
1432 sk->sk_allocation);
1433 if (unlikely(skb == NULL))
1434 err = -ENOBUFS;
1435 else {
1436 /* Only the initial fragment
1437 * is time stamped.
1438 */
1439 tx_flags = 0;
1440 }
1441 }
1442 if (skb == NULL)
1443 goto error;
1444 /*
1445 * Fill in the control structures
1446 */
1447 skb->ip_summed = CHECKSUM_NONE;
1448 skb->csum = 0;
1449 /* reserve for fragmentation and ipsec header */
1450 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1451 dst_exthdrlen);
1452
1453 if (sk->sk_type == SOCK_DGRAM)
1454 skb_shinfo(skb)->tx_flags = tx_flags;
1455
1456 /*
1457 * Find where to start putting bytes
1458 */
1459 data = skb_put(skb, fraglen);
1460 skb_set_network_header(skb, exthdrlen);
1461 data += fragheaderlen;
1462 skb->transport_header = (skb->network_header +
1463 fragheaderlen);
1464 if (fraggap) {
1465 skb->csum = skb_copy_and_csum_bits(
1466 skb_prev, maxfraglen,
1467 data + transhdrlen, fraggap, 0);
1468 skb_prev->csum = csum_sub(skb_prev->csum,
1469 skb->csum);
1470 data += fraggap;
1471 pskb_trim_unique(skb_prev, maxfraglen);
1472 }
1473 copy = datalen - transhdrlen - fraggap;
1474
1475 if (copy < 0) {
1476 err = -EINVAL;
1477 kfree_skb(skb);
1478 goto error;
1479 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1480 err = -EFAULT;
1481 kfree_skb(skb);
1482 goto error;
1483 }
1484
1485 offset += copy;
1486 length -= datalen - fraggap;
1487 transhdrlen = 0;
1488 exthdrlen = 0;
1489 dst_exthdrlen = 0;
1490
1491 /*
1492 * Put the packet on the pending queue
1493 */
1494 __skb_queue_tail(&sk->sk_write_queue, skb);
1495 continue;
1496 }
1497
1498 if (copy > length)
1499 copy = length;
1500
1501 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1502 unsigned int off;
1503
1504 off = skb->len;
1505 if (getfrag(from, skb_put(skb, copy),
1506 offset, copy, off, skb) < 0) {
1507 __skb_trim(skb, off);
1508 err = -EFAULT;
1509 goto error;
1510 }
1511 } else {
1512 int i = skb_shinfo(skb)->nr_frags;
1513 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1514 struct page *page = sk->sk_sndmsg_page;
1515 int off = sk->sk_sndmsg_off;
1516 unsigned int left;
1517
1518 if (page && (left = PAGE_SIZE - off) > 0) {
1519 if (copy >= left)
1520 copy = left;
1521 if (page != skb_frag_page(frag)) {
1522 if (i == MAX_SKB_FRAGS) {
1523 err = -EMSGSIZE;
1524 goto error;
1525 }
1526 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1527 skb_frag_ref(skb, i);
1528 frag = &skb_shinfo(skb)->frags[i];
1529 }
1530 } else if(i < MAX_SKB_FRAGS) {
1531 if (copy > PAGE_SIZE)
1532 copy = PAGE_SIZE;
1533 page = alloc_pages(sk->sk_allocation, 0);
1534 if (page == NULL) {
1535 err = -ENOMEM;
1536 goto error;
1537 }
1538 sk->sk_sndmsg_page = page;
1539 sk->sk_sndmsg_off = 0;
1540
1541 skb_fill_page_desc(skb, i, page, 0, 0);
1542 frag = &skb_shinfo(skb)->frags[i];
1543 } else {
1544 err = -EMSGSIZE;
1545 goto error;
1546 }
1547 if (getfrag(from,
1548 skb_frag_address(frag) + skb_frag_size(frag),
1549 offset, copy, skb->len, skb) < 0) {
1550 err = -EFAULT;
1551 goto error;
1552 }
1553 sk->sk_sndmsg_off += copy;
1554 skb_frag_size_add(frag, copy);
1555 skb->len += copy;
1556 skb->data_len += copy;
1557 skb->truesize += copy;
1558 atomic_add(copy, &sk->sk_wmem_alloc);
1559 }
1560 offset += copy;
1561 length -= copy;
1562 }
1563 return 0;
1564 error:
1565 cork->length -= length;
1566 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1567 return err;
1568 }
1569 EXPORT_SYMBOL_GPL(ip6_append_data);
1570
1571 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1572 {
1573 if (np->cork.opt) {
1574 kfree(np->cork.opt->dst0opt);
1575 kfree(np->cork.opt->dst1opt);
1576 kfree(np->cork.opt->hopopt);
1577 kfree(np->cork.opt->srcrt);
1578 kfree(np->cork.opt);
1579 np->cork.opt = NULL;
1580 }
1581
1582 if (inet->cork.base.dst) {
1583 dst_release(inet->cork.base.dst);
1584 inet->cork.base.dst = NULL;
1585 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1586 }
1587 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1588 }
1589
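/*
 * ip6_push_pending_frames() overview (informal): the skbs queued by
 * ip6_append_data() are chained onto the first skb's frag_list, the IPv6
 * header is built from the corked flow information, per-protocol counters
 * are updated, and the resulting packet is handed to ip6_local_out().
 */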
1590 int ip6_push_pending_frames(struct sock *sk)
1591 {
1592 struct sk_buff *skb, *tmp_skb;
1593 struct sk_buff **tail_skb;
1594 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1595 struct inet_sock *inet = inet_sk(sk);
1596 struct ipv6_pinfo *np = inet6_sk(sk);
1597 struct net *net = sock_net(sk);
1598 struct ipv6hdr *hdr;
1599 struct ipv6_txoptions *opt = np->cork.opt;
1600 struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1601 struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1602 unsigned char proto = fl6->flowi6_proto;
1603 int err = 0;
1604
1605 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1606 goto out;
1607 tail_skb = &(skb_shinfo(skb)->frag_list);
1608
1609 /* move skb->data to ip header from ext header */
1610 if (skb->data < skb_network_header(skb))
1611 __skb_pull(skb, skb_network_offset(skb));
1612 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1613 __skb_pull(tmp_skb, skb_network_header_len(skb));
1614 *tail_skb = tmp_skb;
1615 tail_skb = &(tmp_skb->next);
1616 skb->len += tmp_skb->len;
1617 skb->data_len += tmp_skb->len;
1618 skb->truesize += tmp_skb->truesize;
1619 tmp_skb->destructor = NULL;
1620 tmp_skb->sk = NULL;
1621 }
1622
1623 /* Allow local fragmentation. */
1624 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1625 skb->local_df = 1;
1626
1627 *final_dst = fl6->daddr;
1628 __skb_pull(skb, skb_network_header_len(skb));
1629 if (opt && opt->opt_flen)
1630 ipv6_push_frag_opts(skb, opt, &proto);
1631 if (opt && opt->opt_nflen)
1632 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1633
1634 skb_push(skb, sizeof(struct ipv6hdr));
1635 skb_reset_network_header(skb);
1636 hdr = ipv6_hdr(skb);
1637
1638 *(__be32*)hdr = fl6->flowlabel |
1639 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1640
1641 hdr->hop_limit = np->cork.hop_limit;
1642 hdr->nexthdr = proto;
1643 hdr->saddr = fl6->saddr;
1644 hdr->daddr = *final_dst;
1645
1646 skb->priority = sk->sk_priority;
1647 skb->mark = sk->sk_mark;
1648
1649 skb_dst_set(skb, dst_clone(&rt->dst));
1650 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1651 if (proto == IPPROTO_ICMPV6) {
1652 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1653
1654 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1655 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1656 }
1657
1658 err = ip6_local_out(skb);
1659 if (err) {
1660 if (err > 0)
1661 err = net_xmit_errno(err);
1662 if (err)
1663 goto error;
1664 }
1665
1666 out:
1667 ip6_cork_release(inet, np);
1668 return err;
1669 error:
1670 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1671 goto out;
1672 }
1673 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1674
1675 void ip6_flush_pending_frames(struct sock *sk)
1676 {
1677 struct sk_buff *skb;
1678
1679 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1680 if (skb_dst(skb))
1681 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1682 IPSTATS_MIB_OUTDISCARDS);
1683 kfree_skb(skb);
1684 }
1685
1686 ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1687 }
1688 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);