1 /*
2 * IPv6 output functions
3 * Linux INET6 implementation
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * Based on linux/net/ipv4/ip_output.c
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * Changes:
16 * A.N.Kuznetsov : arithmetics in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
20 * etc.
21 *
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
26 * for datagram xmit
27 */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61
62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
63 {
64 struct dst_entry *dst = skb_dst(skb);
65 struct net_device *dev = dst->dev;
66 const struct in6_addr *nexthop;
67 struct neighbour *neigh;
68 int ret;
69
70 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72
73 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74 ((mroute6_socket(net, skb) &&
75 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77 &ipv6_hdr(skb)->saddr))) {
78 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79
80 /* Do not check for IFF_ALLMULTI; multicast routing
81 * is not supported in any case.
82 */
83 if (newskb)
84 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85 net, sk, newskb, NULL, newskb->dev,
86 dev_loopback_xmit);
87
88 if (ipv6_hdr(skb)->hop_limit == 0) {
89 IP6_INC_STATS(net, idev,
90 IPSTATS_MIB_OUTDISCARDS);
91 kfree_skb(skb);
92 return 0;
93 }
94 }
95
96 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
97
98 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
99 IPV6_ADDR_SCOPE_NODELOCAL &&
100 !(dev->flags & IFF_LOOPBACK)) {
101 kfree_skb(skb);
102 return 0;
103 }
104 }
105
106 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
107 int res = lwtunnel_xmit(skb);
108
109 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
110 return res;
111 }
112
113 rcu_read_lock_bh();
114 nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
115 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
116 if (unlikely(!neigh))
117 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
118 if (!IS_ERR(neigh)) {
119 sock_confirm_neigh(skb, neigh);
120 ret = neigh_output(neigh, skb);
121 rcu_read_unlock_bh();
122 return ret;
123 }
124 rcu_read_unlock_bh();
125
126 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
127 kfree_skb(skb);
128 return -EINVAL;
129 }
130
131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
132 {
133 int ret;
134
135 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
136 if (ret) {
137 kfree_skb(skb);
138 return ret;
139 }
140
141 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
142 /* Policy lookup after SNAT yielded a new policy */
143 if (skb_dst(skb)->xfrm) {
144 IPCB(skb)->flags |= IPSKB_REROUTED;
145 return dst_output(net, sk, skb);
146 }
147 #endif
148
149 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
150 dst_allfrag(skb_dst(skb)) ||
151 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
152 return ip6_fragment(net, sk, skb, ip6_finish_output2);
153 else
154 return ip6_finish_output2(net, sk, skb);
155 }
156
157 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
158 {
159 struct net_device *dev = skb_dst(skb)->dev;
160 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161
162 skb->protocol = htons(ETH_P_IPV6);
163 skb->dev = dev;
164
165 if (unlikely(idev->cnf.disable_ipv6)) {
166 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
167 kfree_skb(skb);
168 return 0;
169 }
170
171 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
172 net, sk, skb, NULL, dev,
173 ip6_finish_output,
174 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
175 }
176
177 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
178 {
179 if (!np->autoflowlabel_set)
180 return ip6_default_np_autolabel(net);
181 else
182 return np->autoflowlabel;
183 }
184
185 /*
186 * xmit an sk_buff (used by TCP, SCTP and DCCP)
187 * Note : socket lock is not held for SYNACK packets, but the socket
188 * might be modified by calls to skb_set_owner_w() and ipv6_local_error(),
189 * which use proper atomic operations or spinlocks.
190 */
191 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
192 __u32 mark, struct ipv6_txoptions *opt, int tclass)
193 {
194 struct net *net = sock_net(sk);
195 const struct ipv6_pinfo *np = inet6_sk(sk);
196 struct in6_addr *first_hop = &fl6->daddr;
197 struct dst_entry *dst = skb_dst(skb);
198 unsigned int head_room;
199 struct ipv6hdr *hdr;
200 u8 proto = fl6->flowi6_proto;
201 int seg_len = skb->len;
202 int hlimit = -1;
203 u32 mtu;
204
205 head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
206 if (opt)
207 head_room += opt->opt_nflen + opt->opt_flen;
208
209 if (unlikely(skb_headroom(skb) < head_room)) {
210 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
211 if (!skb2) {
212 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
213 IPSTATS_MIB_OUTDISCARDS);
214 kfree_skb(skb);
215 return -ENOBUFS;
216 }
217 if (skb->sk)
218 skb_set_owner_w(skb2, skb->sk);
219 consume_skb(skb);
220 skb = skb2;
221 }
222
223 if (opt) {
224 seg_len += opt->opt_nflen + opt->opt_flen;
225
226 if (opt->opt_flen)
227 ipv6_push_frag_opts(skb, opt, &proto);
228
229 if (opt->opt_nflen)
230 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
231 &fl6->saddr);
232 }
233
234 skb_push(skb, sizeof(struct ipv6hdr));
235 skb_reset_network_header(skb);
236 hdr = ipv6_hdr(skb);
237
238 /*
239 * Fill in the IPv6 header
240 */
241 if (np)
242 hlimit = np->hop_limit;
243 if (hlimit < 0)
244 hlimit = ip6_dst_hoplimit(dst);
245
246 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
247 ip6_autoflowlabel(net, np), fl6));
248
249 hdr->payload_len = htons(seg_len);
250 hdr->nexthdr = proto;
251 hdr->hop_limit = hlimit;
252
253 hdr->saddr = fl6->saddr;
254 hdr->daddr = *first_hop;
255
256 skb->protocol = htons(ETH_P_IPV6);
257 skb->priority = sk->sk_priority;
258 skb->mark = mark;
259
260 mtu = dst_mtu(dst);
261 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
262 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
263 IPSTATS_MIB_OUT, skb->len);
264
265 /* if egress device is enslaved to an L3 master device pass the
266 * skb to its handler for processing
267 */
268 skb = l3mdev_ip6_out((struct sock *)sk, skb);
269 if (unlikely(!skb))
270 return 0;
271
272 /* hooks should never assume socket lock is held.
273 * we promote our socket to non const
274 */
275 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
276 net, (struct sock *)sk, skb, NULL, dst->dev,
277 dst_output);
278 }
279
280 skb->dev = dst->dev;
281 /* ipv6_local_error() does not require socket lock,
282 * we promote our socket to non const
283 */
284 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
285
286 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
287 kfree_skb(skb);
288 return -EMSGSIZE;
289 }
290 EXPORT_SYMBOL(ip6_xmit);
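
/*
 * Illustrative helper, not part of the original file: the headroom
 * rule ip6_xmit() applies, factored out as a sketch.  Assumes @opt
 * follows the same ipv6_txoptions layout used above.  E.g. on a
 * typical Ethernet device a packet with no extension headers needs
 * sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev) = 40 + 16 = 56
 * bytes of headroom; skb_realloc_headroom() is only taken when the
 * skb was built with less than that.
 */
static inline unsigned int
ip6_xmit_head_room_sketch(const struct dst_entry *dst,
			  const struct ipv6_txoptions *opt)
{
	/* base IPv6 header plus the device's link-layer reservation */
	unsigned int head_room = sizeof(struct ipv6hdr) +
				 LL_RESERVED_SPACE(dst->dev);

	/* extension headers travel in the headroom too */
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;
	return head_room;
}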
291
292 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
293 {
294 struct ip6_ra_chain *ra;
295 struct sock *last = NULL;
296
297 read_lock(&ip6_ra_lock);
298 for (ra = ip6_ra_chain; ra; ra = ra->next) {
299 struct sock *sk = ra->sk;
300 if (sk && ra->sel == sel &&
301 (!sk->sk_bound_dev_if ||
302 sk->sk_bound_dev_if == skb->dev->ifindex)) {
303 if (last) {
304 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
305 if (skb2)
306 rawv6_rcv(last, skb2);
307 }
308 last = sk;
309 }
310 }
311
312 if (last) {
313 rawv6_rcv(last, skb);
314 read_unlock(&ip6_ra_lock);
315 return 1;
316 }
317 read_unlock(&ip6_ra_lock);
318 return 0;
319 }
320
321 static int ip6_forward_proxy_check(struct sk_buff *skb)
322 {
323 struct ipv6hdr *hdr = ipv6_hdr(skb);
324 u8 nexthdr = hdr->nexthdr;
325 __be16 frag_off;
326 int offset;
327
328 if (ipv6_ext_hdr(nexthdr)) {
329 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
330 if (offset < 0)
331 return 0;
332 } else
333 offset = sizeof(struct ipv6hdr);
334
335 if (nexthdr == IPPROTO_ICMPV6) {
336 struct icmp6hdr *icmp6;
337
338 if (!pskb_may_pull(skb, (skb_network_header(skb) +
339 offset + 1 - skb->data)))
340 return 0;
341
342 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
343
344 switch (icmp6->icmp6_type) {
345 case NDISC_ROUTER_SOLICITATION:
346 case NDISC_ROUTER_ADVERTISEMENT:
347 case NDISC_NEIGHBOUR_SOLICITATION:
348 case NDISC_NEIGHBOUR_ADVERTISEMENT:
349 case NDISC_REDIRECT:
350 /* For a reaction involving a unicast neighbour
351 * discovery message destined to the proxied address,
352 * pass it to the input function.
353 */
354 return 1;
355 default:
356 break;
357 }
358 }
359
360 /*
361 * The proxying router can't forward traffic sent to a link-local
362 * address, so signal the sender and discard the packet. This
363 * behavior is clarified by the MIPv6 specification.
364 */
365 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
366 dst_link_failure(skb);
367 return -1;
368 }
369
370 return 0;
371 }
372
373 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
374 struct sk_buff *skb)
375 {
376 struct dst_entry *dst = skb_dst(skb);
377
378 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
379 __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
380
381 skb->tstamp = 0;
382 return dst_output(net, sk, skb);
383 }
384
385 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
386 {
387 unsigned int mtu;
388 struct inet6_dev *idev;
389
390 if (dst_metric_locked(dst, RTAX_MTU)) {
391 mtu = dst_metric_raw(dst, RTAX_MTU);
392 if (mtu)
393 return mtu;
394 }
395
396 mtu = IPV6_MIN_MTU;
397 rcu_read_lock();
398 idev = __in6_dev_get(dst->dev);
399 if (idev)
400 mtu = idev->cnf.mtu6;
401 rcu_read_unlock();
402
403 return mtu;
404 }
405
406 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
407 {
408 if (skb->len <= mtu)
409 return false;
410
411 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
412 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
413 return true;
414
415 if (skb->ignore_df)
416 return false;
417
418 if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
419 return false;
420
421 return true;
422 }
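
/*
 * Worked example (editorial illustration): forwarding a 1400-byte skb
 * with mtu == 1280 makes ip6_pkt_too_big() return true, and
 * ip6_forward() answers with ICMPV6_PKT_TOOBIG, unless either the skb
 * is marked ignore_df with no recorded frag_max_size above the MTU
 * (the conntrack defrag case), or it is a GSO skb whose resulting
 * segments all fit the MTU according to skb_gso_validate_mtu().
 */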
423
424 int ip6_forward(struct sk_buff *skb)
425 {
426 struct dst_entry *dst = skb_dst(skb);
427 struct ipv6hdr *hdr = ipv6_hdr(skb);
428 struct inet6_skb_parm *opt = IP6CB(skb);
429 struct net *net = dev_net(dst->dev);
430 u32 mtu;
431
432 if (net->ipv6.devconf_all->forwarding == 0)
433 goto error;
434
435 if (skb->pkt_type != PACKET_HOST)
436 goto drop;
437
438 if (unlikely(skb->sk))
439 goto drop;
440
441 if (skb_warn_if_lro(skb))
442 goto drop;
443
444 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
445 __IP6_INC_STATS(net, ip6_dst_idev(dst),
446 IPSTATS_MIB_INDISCARDS);
447 goto drop;
448 }
449
450 skb_forward_csum(skb);
451
452 /*
453 * We do NOT make any processing on RA packets;
454 * we push them to user level AS IS, without any
455 * warranty that the application will be able
456 * to interpret them. The reason is that we
457 * cannot make anything clever here.
458 *
459 * We are not an end node, so if the packet contains
460 * AH/ESP we cannot do anything.
461 * Defragmentation would also be a mistake; RA packets
462 * cannot be fragmented, because there is no warranty
463 * that different fragments will go along one path. --ANK
464 */
465 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
466 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
467 return 0;
468 }
469
470 /*
471 * check and decrement ttl
472 */
473 if (hdr->hop_limit <= 1) {
474 /* Force OUTPUT device used as source address */
475 skb->dev = dst->dev;
476 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
477 __IP6_INC_STATS(net, ip6_dst_idev(dst),
478 IPSTATS_MIB_INHDRERRORS);
479
480 kfree_skb(skb);
481 return -ETIMEDOUT;
482 }
483
484 /* XXX: idev->cnf.proxy_ndp? */
485 if (net->ipv6.devconf_all->proxy_ndp &&
486 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
487 int proxied = ip6_forward_proxy_check(skb);
488 if (proxied > 0)
489 return ip6_input(skb);
490 else if (proxied < 0) {
491 __IP6_INC_STATS(net, ip6_dst_idev(dst),
492 IPSTATS_MIB_INDISCARDS);
493 goto drop;
494 }
495 }
496
497 if (!xfrm6_route_forward(skb)) {
498 __IP6_INC_STATS(net, ip6_dst_idev(dst),
499 IPSTATS_MIB_INDISCARDS);
500 goto drop;
501 }
502 dst = skb_dst(skb);
503
504 /* IPv6 specs say nothing about it, but it is clear that we cannot
505 * send redirects to source routed frames.
506 * We don't send redirects to frames decapsulated from IPsec.
507 */
508 if (IP6CB(skb)->iif == dst->dev->ifindex &&
509 opt->srcrt == 0 && !skb_sec_path(skb)) {
510 struct in6_addr *target = NULL;
511 struct inet_peer *peer;
512 struct rt6_info *rt;
513
514 /*
515 * incoming and outgoing devices are the same;
516 * send a redirect.
517 */
518
519 rt = (struct rt6_info *) dst;
520 if (rt->rt6i_flags & RTF_GATEWAY)
521 target = &rt->rt6i_gateway;
522 else
523 target = &hdr->daddr;
524
525 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
526
527 /* Limit redirects both by destination (here)
528 and by source (inside ndisc_send_redirect)
529 */
530 if (inet_peer_xrlim_allow(peer, 1*HZ))
531 ndisc_send_redirect(skb, target);
532 if (peer)
533 inet_putpeer(peer);
534 } else {
535 int addrtype = ipv6_addr_type(&hdr->saddr);
536
537 /* This check is security critical. */
538 if (addrtype == IPV6_ADDR_ANY ||
539 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
540 goto error;
541 if (addrtype & IPV6_ADDR_LINKLOCAL) {
542 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
543 ICMPV6_NOT_NEIGHBOUR, 0);
544 goto error;
545 }
546 }
547
548 mtu = ip6_dst_mtu_forward(dst);
549 if (mtu < IPV6_MIN_MTU)
550 mtu = IPV6_MIN_MTU;
551
552 if (ip6_pkt_too_big(skb, mtu)) {
553 /* Again, force OUTPUT device used as source address */
554 skb->dev = dst->dev;
555 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
556 __IP6_INC_STATS(net, ip6_dst_idev(dst),
557 IPSTATS_MIB_INTOOBIGERRORS);
558 __IP6_INC_STATS(net, ip6_dst_idev(dst),
559 IPSTATS_MIB_FRAGFAILS);
560 kfree_skb(skb);
561 return -EMSGSIZE;
562 }
563
564 if (skb_cow(skb, dst->dev->hard_header_len)) {
565 __IP6_INC_STATS(net, ip6_dst_idev(dst),
566 IPSTATS_MIB_OUTDISCARDS);
567 goto drop;
568 }
569
570 hdr = ipv6_hdr(skb);
571
572 /* Mangling hops number delayed to point after skb COW */
573
574 hdr->hop_limit--;
575
576 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
577 net, NULL, skb, skb->dev, dst->dev,
578 ip6_forward_finish);
579
580 error:
581 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
582 drop:
583 kfree_skb(skb);
584 return -EINVAL;
585 }
586
587 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
588 {
589 to->pkt_type = from->pkt_type;
590 to->priority = from->priority;
591 to->protocol = from->protocol;
592 skb_dst_drop(to);
593 skb_dst_set(to, dst_clone(skb_dst(from)));
594 to->dev = from->dev;
595 to->mark = from->mark;
596
597 skb_copy_hash(to, from);
598
599 #ifdef CONFIG_NET_SCHED
600 to->tc_index = from->tc_index;
601 #endif
602 nf_copy(to, from);
603 skb_copy_secmark(to, from);
604 }
605
606 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
607 int (*output)(struct net *, struct sock *, struct sk_buff *))
608 {
609 struct sk_buff *frag;
610 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
611 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
612 inet6_sk(skb->sk) : NULL;
613 struct ipv6hdr *tmp_hdr;
614 struct frag_hdr *fh;
615 unsigned int mtu, hlen, left, len, nexthdr_offset;
616 int hroom, troom;
617 __be32 frag_id;
618 int ptr, offset = 0, err = 0;
619 u8 *prevhdr, nexthdr = 0;
620
621 err = ip6_find_1stfragopt(skb, &prevhdr);
622 if (err < 0)
623 goto fail;
624 hlen = err;
625 nexthdr = *prevhdr;
626 nexthdr_offset = prevhdr - skb_network_header(skb);
627
628 mtu = ip6_skb_dst_mtu(skb);
629
630 /* We must not fragment if the socket is set to force MTU discovery
631 * or if the skb is not generated by a local socket.
632 */
633 if (unlikely(!skb->ignore_df && skb->len > mtu))
634 goto fail_toobig;
635
636 if (IP6CB(skb)->frag_max_size) {
637 if (IP6CB(skb)->frag_max_size > mtu)
638 goto fail_toobig;
639
640 /* don't send fragments larger than what we received */
641 mtu = IP6CB(skb)->frag_max_size;
642 if (mtu < IPV6_MIN_MTU)
643 mtu = IPV6_MIN_MTU;
644 }
645
646 if (np && np->frag_size < mtu) {
647 if (np->frag_size)
648 mtu = np->frag_size;
649 }
650 if (mtu < hlen + sizeof(struct frag_hdr) + 8)
651 goto fail_toobig;
652 mtu -= hlen + sizeof(struct frag_hdr);
653
654 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
655 &ipv6_hdr(skb)->saddr);
656
657 if (skb->ip_summed == CHECKSUM_PARTIAL &&
658 (err = skb_checksum_help(skb)))
659 goto fail;
660
661 prevhdr = skb_network_header(skb) + nexthdr_offset;
662 hroom = LL_RESERVED_SPACE(rt->dst.dev);
663 if (skb_has_frag_list(skb)) {
664 unsigned int first_len = skb_pagelen(skb);
665 struct sk_buff *frag2;
666
667 if (first_len - hlen > mtu ||
668 ((first_len - hlen) & 7) ||
669 skb_cloned(skb) ||
670 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
671 goto slow_path;
672
673 skb_walk_frags(skb, frag) {
674 /* Correct geometry. */
675 if (frag->len > mtu ||
676 ((frag->len & 7) && frag->next) ||
677 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
678 goto slow_path_clean;
679
680 /* Partially cloned skb? */
681 if (skb_shared(frag))
682 goto slow_path_clean;
683
684 BUG_ON(frag->sk);
685 if (skb->sk) {
686 frag->sk = skb->sk;
687 frag->destructor = sock_wfree;
688 }
689 skb->truesize -= frag->truesize;
690 }
691
692 err = 0;
693 offset = 0;
694 /* BUILD HEADER */
695
696 *prevhdr = NEXTHDR_FRAGMENT;
697 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
698 if (!tmp_hdr) {
699 err = -ENOMEM;
700 goto fail;
701 }
702 frag = skb_shinfo(skb)->frag_list;
703 skb_frag_list_init(skb);
704
705 __skb_pull(skb, hlen);
706 fh = __skb_push(skb, sizeof(struct frag_hdr));
707 __skb_push(skb, hlen);
708 skb_reset_network_header(skb);
709 memcpy(skb_network_header(skb), tmp_hdr, hlen);
710
711 fh->nexthdr = nexthdr;
712 fh->reserved = 0;
713 fh->frag_off = htons(IP6_MF);
714 fh->identification = frag_id;
715
716 first_len = skb_pagelen(skb);
717 skb->data_len = first_len - skb_headlen(skb);
718 skb->len = first_len;
719 ipv6_hdr(skb)->payload_len = htons(first_len -
720 sizeof(struct ipv6hdr));
721
722 for (;;) {
723 /* Prepare header of the next frame,
724 * before the previous one goes down. */
725 if (frag) {
726 frag->ip_summed = CHECKSUM_NONE;
727 skb_reset_transport_header(frag);
728 fh = __skb_push(frag, sizeof(struct frag_hdr));
729 __skb_push(frag, hlen);
730 skb_reset_network_header(frag);
731 memcpy(skb_network_header(frag), tmp_hdr,
732 hlen);
733 offset += skb->len - hlen - sizeof(struct frag_hdr);
734 fh->nexthdr = nexthdr;
735 fh->reserved = 0;
736 fh->frag_off = htons(offset);
737 if (frag->next)
738 fh->frag_off |= htons(IP6_MF);
739 fh->identification = frag_id;
740 ipv6_hdr(frag)->payload_len =
741 htons(frag->len -
742 sizeof(struct ipv6hdr));
743 ip6_copy_metadata(frag, skb);
744 }
745
746 err = output(net, sk, skb);
747 if (!err)
748 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
749 IPSTATS_MIB_FRAGCREATES);
750
751 if (err || !frag)
752 break;
753
754 skb = frag;
755 frag = skb->next;
756 skb->next = NULL;
757 }
758
759 kfree(tmp_hdr);
760
761 if (err == 0) {
762 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
763 IPSTATS_MIB_FRAGOKS);
764 return 0;
765 }
766
767 kfree_skb_list(frag);
768
769 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
770 IPSTATS_MIB_FRAGFAILS);
771 return err;
772
773 slow_path_clean:
774 skb_walk_frags(skb, frag2) {
775 if (frag2 == frag)
776 break;
777 frag2->sk = NULL;
778 frag2->destructor = NULL;
779 skb->truesize += frag2->truesize;
780 }
781 }
782
783 slow_path:
784 left = skb->len - hlen; /* Space per frame */
785 ptr = hlen; /* Where to start from */
786
787 /*
788 * Fragment the datagram.
789 */
790
791 troom = rt->dst.dev->needed_tailroom;
792
793 /*
794 * Keep copying data until we run out.
795 */
796 while (left > 0) {
797 u8 *fragnexthdr_offset;
798
799 len = left;
800 /* IF: it doesn't fit, use 'mtu' - the data space left */
801 if (len > mtu)
802 len = mtu;
803 /* IF: we are not sending up to and including the packet end,
804 * then align the next start on an eight byte boundary */
805 if (len < left) {
806 len &= ~7;
807 }
808
809 /* Allocate buffer */
810 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
811 hroom + troom, GFP_ATOMIC);
812 if (!frag) {
813 err = -ENOMEM;
814 goto fail;
815 }
816
817 /*
818 * Set up data on packet
819 */
820
821 ip6_copy_metadata(frag, skb);
822 skb_reserve(frag, hroom);
823 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
824 skb_reset_network_header(frag);
825 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
826 frag->transport_header = (frag->network_header + hlen +
827 sizeof(struct frag_hdr));
828
829 /*
830 * Charge the memory for the fragment to any owner
831 * it might possess
832 */
833 if (skb->sk)
834 skb_set_owner_w(frag, skb->sk);
835
836 /*
837 * Copy the packet header into the new buffer.
838 */
839 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
840
841 fragnexthdr_offset = skb_network_header(frag);
842 fragnexthdr_offset += prevhdr - skb_network_header(skb);
843 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
844
845 /*
846 * Build fragment header.
847 */
848 fh->nexthdr = nexthdr;
849 fh->reserved = 0;
850 fh->identification = frag_id;
851
852 /*
853 * Copy a block of the IP datagram.
854 */
855 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
856 len));
857 left -= len;
858
859 fh->frag_off = htons(offset);
860 if (left > 0)
861 fh->frag_off |= htons(IP6_MF);
862 ipv6_hdr(frag)->payload_len = htons(frag->len -
863 sizeof(struct ipv6hdr));
864
865 ptr += len;
866 offset += len;
867
868 /*
869 * Put this fragment into the sending queue.
870 */
871 err = output(net, sk, frag);
872 if (err)
873 goto fail;
874
875 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
876 IPSTATS_MIB_FRAGCREATES);
877 }
878 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
879 IPSTATS_MIB_FRAGOKS);
880 consume_skb(skb);
881 return err;
882
883 fail_toobig:
884 if (skb->sk && dst_allfrag(skb_dst(skb)))
885 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
886
887 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
888 err = -EMSGSIZE;
889
890 fail:
891 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
892 IPSTATS_MIB_FRAGFAILS);
893 kfree_skb(skb);
894 return err;
895 }
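
/*
 * Worked example (editorial illustration) for the slow path above:
 * with 3000 bytes of fragmentable data left and an effective
 * per-fragment budget of mtu == 1448 (already reduced by hlen and the
 * fragment header, and a multiple of 8), the loop emits fragments
 * carrying 1448, 1448 and 104 bytes at offsets 0, 1448 and 2896.  The
 * first two set IP6_MF in frag_off, the last one does not, and all
 * three share frag_id so the receiver can reassemble them.
 */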
896
897 static inline int ip6_rt_check(const struct rt6key *rt_key,
898 const struct in6_addr *fl_addr,
899 const struct in6_addr *addr_cache)
900 {
901 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
902 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
903 }
904
905 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
906 struct dst_entry *dst,
907 const struct flowi6 *fl6)
908 {
909 struct ipv6_pinfo *np = inet6_sk(sk);
910 struct rt6_info *rt;
911
912 if (!dst)
913 goto out;
914
915 if (dst->ops->family != AF_INET6) {
916 dst_release(dst);
917 return NULL;
918 }
919
920 rt = (struct rt6_info *)dst;
921 /* Yes, checking route validity in the non-connected
922 * case is not very simple. Take into account
923 * that we do not support routing by source, TOS,
924 * or MSG_DONTROUTE --ANK (980726)
925 *
926 * 1. ip6_rt_check(): If the route was a host route,
927 * check that the cached destination is current.
928 * If it is a network route, we still may
929 * check its validity using a saved pointer
930 * to the last used address: daddr_cache.
931 * We do not want to save the whole address now
932 * (because the main consumer of this service
933 * is TCP, which does not have this problem),
934 * so the last trick works only on connected
935 * sockets.
936 * 2. oif also should be the same.
937 */
938 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
939 #ifdef CONFIG_IPV6_SUBTREES
940 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
941 #endif
942 (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
943 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
944 dst_release(dst);
945 dst = NULL;
946 }
947
948 out:
949 return dst;
950 }
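
/*
 * Illustration, not part of the original file: ip6_rt_check() above
 * returns zero ("still usable") when either the cached route is a
 * host route (plen == 128) whose rt6i_dst equals the flow's daddr, or
 * the saved daddr_cache pointer still matches the flow's daddr --
 * which is why the daddr_cache trick only helps connected sockets.
 * A minimal sketch of reading its return value:
 */
static inline bool
ip6_cached_route_ok_sketch(const struct rt6_info *rt,
			   const struct flowi6 *fl6,
			   const struct in6_addr *daddr_cache)
{
	/* ip6_rt_check() returns non-zero when the cache must be dropped */
	return ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, daddr_cache) == 0;
}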
951
952 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
953 struct dst_entry **dst, struct flowi6 *fl6)
954 {
955 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
956 struct neighbour *n;
957 struct rt6_info *rt;
958 #endif
959 int err;
960 int flags = 0;
961
962 /* The correct way to handle this would be to do
963 * ip6_route_get_saddr, and then ip6_route_output; however,
964 * the route-specific preferred source forces the
965 * ip6_route_output call _before_ ip6_route_get_saddr.
966 *
967 * In source specific routing (no src=any default route),
968 * ip6_route_output will fail given a src=any saddr, which is
969 * why we try it again later.
970 */
971 if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
972 struct rt6_info *rt;
973 bool had_dst = *dst != NULL;
974
975 if (!had_dst)
976 *dst = ip6_route_output(net, sk, fl6);
977 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
978 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
979 sk ? inet6_sk(sk)->srcprefs : 0,
980 &fl6->saddr);
981 if (err)
982 goto out_err_release;
983
984 /* If we had an erroneous initial result, pretend it
985 * never existed and let the SA-enabled version take
986 * over.
987 */
988 if (!had_dst && (*dst)->error) {
989 dst_release(*dst);
990 *dst = NULL;
991 }
992
993 if (fl6->flowi6_oif)
994 flags |= RT6_LOOKUP_F_IFACE;
995 }
996
997 if (!*dst)
998 *dst = ip6_route_output_flags(net, sk, fl6, flags);
999
1000 err = (*dst)->error;
1001 if (err)
1002 goto out_err_release;
1003
1004 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1005 /*
1006 * Here, if the dst entry we've looked up
1007 * has a neighbour entry that is in the INCOMPLETE
1008 * state and the src address from the flow is
1009 * marked as OPTIMISTIC, we release the found
1010 * dst entry and replace it with the
1011 * dst entry of the nexthop router
1012 */
1013 rt = (struct rt6_info *) *dst;
1014 rcu_read_lock_bh();
1015 n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1016 rt6_nexthop(rt, &fl6->daddr));
1017 err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1018 rcu_read_unlock_bh();
1019
1020 if (err) {
1021 struct inet6_ifaddr *ifp;
1022 struct flowi6 fl_gw6;
1023 int redirect;
1024
1025 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1026 (*dst)->dev, 1);
1027
1028 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1029 if (ifp)
1030 in6_ifa_put(ifp);
1031
1032 if (redirect) {
1033 /*
1034 * We need to get the dst entry for the
1035 * default router instead
1036 */
1037 dst_release(*dst);
1038 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1039 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1040 *dst = ip6_route_output(net, sk, &fl_gw6);
1041 err = (*dst)->error;
1042 if (err)
1043 goto out_err_release;
1044 }
1045 }
1046 #endif
1047 if (ipv6_addr_v4mapped(&fl6->saddr) &&
1048 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1049 err = -EAFNOSUPPORT;
1050 goto out_err_release;
1051 }
1052
1053 return 0;
1054
1055 out_err_release:
1056 dst_release(*dst);
1057 *dst = NULL;
1058
1059 if (err == -ENETUNREACH)
1060 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1061 return err;
1062 }
1063
1064 /**
1065 * ip6_dst_lookup - perform route lookup on flow
1066 * @sk: socket which provides route info
1067 * @dst: pointer to dst_entry * for result
1068 * @fl6: flow to lookup
1069 *
1070 * This function performs a route lookup on the given flow.
1071 *
1072 * It returns zero on success, or a standard errno code on error.
1073 */
1074 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1075 struct flowi6 *fl6)
1076 {
1077 *dst = NULL;
1078 return ip6_dst_lookup_tail(net, sk, dst, fl6);
1079 }
1080 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1081
1082 /**
1083 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1084 * @sk: socket which provides route info
1085 * @fl6: flow to lookup
1086 * @final_dst: final destination address for ipsec lookup
1087 *
1088 * This function performs a route lookup on the given flow.
1089 *
1090 * It returns a valid dst pointer on success, or a pointer encoded
1091 * error code.
1092 */
1093 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1094 const struct in6_addr *final_dst)
1095 {
1096 struct dst_entry *dst = NULL;
1097 int err;
1098
1099 err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1100 if (err)
1101 return ERR_PTR(err);
1102 if (final_dst)
1103 fl6->daddr = *final_dst;
1104
1105 return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1106 }
1107 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1108
1109 /**
1110 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1111 * @sk: socket which provides the dst cache and route info
1112 * @fl6: flow to lookup
1113 * @final_dst: final destination address for ipsec lookup
1114 *
1115 * This function performs a route lookup on the given flow with the
1116 * possibility of using the cached route in the socket if it is valid.
1117 * It will take the socket dst lock when operating on the dst cache.
1118 * As a result, this function can only be used in process context.
1119 *
1120 * It returns a valid dst pointer on success, or a pointer encoded
1121 * error code.
1122 */
1123 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1124 const struct in6_addr *final_dst)
1125 {
1126 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1127
1128 dst = ip6_sk_dst_check(sk, dst, fl6);
1129 if (!dst)
1130 dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1131
1132 return dst;
1133 }
1134 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
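
/*
 * Usage sketch, not part of the original file: a typical connected
 * datagram sender pairs the helper above with a pre-filled flowi6.
 * Passing a NULL final destination skips the daddr rewrite that the
 * ipsec path needs.  Error handling is trimmed to the ERR_PTR check
 * the caller must perform.
 */
static inline struct dst_entry *
ip6_sender_route_sketch(struct sock *sk, struct flowi6 *fl6)
{
	/* reuses the socket's cached dst when ip6_sk_dst_check() accepts
	 * it, otherwise falls back to a fresh lookup plus xfrm routing
	 */
	return ip6_sk_dst_lookup_flow(sk, fl6, NULL);
}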
1135
1136 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1137 gfp_t gfp)
1138 {
1139 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1140 }
1141
1142 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1143 gfp_t gfp)
1144 {
1145 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1146 }
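
/*
 * Sizing note (editorial illustration) for the two dup helpers above:
 * per RFC 8200, the hdrlen field of an extension header counts 8-octet
 * units excluding the first 8 octets, so the full on-the-wire length
 * is (src->hdrlen + 1) * 8.  E.g. hdrlen == 0 means an 8-byte header
 * and hdrlen == 2 means 24 bytes, which is exactly what kmemdup()
 * copies.
 */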
1147
1148 static void ip6_append_data_mtu(unsigned int *mtu,
1149 int *maxfraglen,
1150 unsigned int fragheaderlen,
1151 struct sk_buff *skb,
1152 struct rt6_info *rt,
1153 unsigned int orig_mtu)
1154 {
1155 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1156 if (!skb) {
1157 /* first fragment, reserve header_len */
1158 *mtu = orig_mtu - rt->dst.header_len;
1159
1160 } else {
1161 /*
1162 * this fragment is not the first; the header
1163 * space is regarded as data space.
1164 */
1165 *mtu = orig_mtu;
1166 }
1167 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1168 + fragheaderlen - sizeof(struct frag_hdr);
1169 }
1170 }
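
/*
 * Worked example (editorial illustration) for the maxfraglen formula
 * above: with mtu == 1500 and fragheaderlen == 40 (a bare IPv6
 * header), ((1500 - 40) & ~7) + 40 - 8 == 1488.  A fragment of that
 * length carries 1448 bytes past the IPv6 header -- a multiple of 8,
 * as required for non-final fragments -- and still fits the MTU once
 * the 8-byte fragment header is pushed in front of the payload.
 */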
1171
1172 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1173 struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1174 struct rt6_info *rt, struct flowi6 *fl6)
1175 {
1176 struct ipv6_pinfo *np = inet6_sk(sk);
1177 unsigned int mtu;
1178 struct ipv6_txoptions *opt = ipc6->opt;
1179
1180 /*
1181 * setup for corking
1182 */
1183 if (opt) {
1184 if (WARN_ON(v6_cork->opt))
1185 return -EINVAL;
1186
1187 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1188 if (unlikely(!v6_cork->opt))
1189 return -ENOBUFS;
1190
1191 v6_cork->opt->tot_len = sizeof(*opt);
1192 v6_cork->opt->opt_flen = opt->opt_flen;
1193 v6_cork->opt->opt_nflen = opt->opt_nflen;
1194
1195 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1196 sk->sk_allocation);
1197 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1198 return -ENOBUFS;
1199
1200 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1201 sk->sk_allocation);
1202 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1203 return -ENOBUFS;
1204
1205 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1206 sk->sk_allocation);
1207 if (opt->hopopt && !v6_cork->opt->hopopt)
1208 return -ENOBUFS;
1209
1210 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1211 sk->sk_allocation);
1212 if (opt->srcrt && !v6_cork->opt->srcrt)
1213 return -ENOBUFS;
1214
1215 /* need source address above miyazawa */
1216 }
1217 dst_hold(&rt->dst);
1218 cork->base.dst = &rt->dst;
1219 cork->fl.u.ip6 = *fl6;
1220 v6_cork->hop_limit = ipc6->hlimit;
1221 v6_cork->tclass = ipc6->tclass;
1222 if (rt->dst.flags & DST_XFRM_TUNNEL)
1223 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1224 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1225 else
1226 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1227 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path);
1228 if (np->frag_size < mtu) {
1229 if (np->frag_size)
1230 mtu = np->frag_size;
1231 }
1232 if (mtu < IPV6_MIN_MTU)
1233 return -EINVAL;
1234 cork->base.fragsize = mtu;
1235 if (dst_allfrag(rt->dst.path))
1236 cork->base.flags |= IPCORK_ALLFRAG;
1237 cork->base.length = 0;
1238
1239 return 0;
1240 }
1241
1242 static int __ip6_append_data(struct sock *sk,
1243 struct flowi6 *fl6,
1244 struct sk_buff_head *queue,
1245 struct inet_cork *cork,
1246 struct inet6_cork *v6_cork,
1247 struct page_frag *pfrag,
1248 int getfrag(void *from, char *to, int offset,
1249 int len, int odd, struct sk_buff *skb),
1250 void *from, int length, int transhdrlen,
1251 unsigned int flags, struct ipcm6_cookie *ipc6,
1252 const struct sockcm_cookie *sockc)
1253 {
1254 struct sk_buff *skb, *skb_prev = NULL;
1255 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1256 int exthdrlen = 0;
1257 int dst_exthdrlen = 0;
1258 int hh_len;
1259 int copy;
1260 int err;
1261 int offset = 0;
1262 __u8 tx_flags = 0;
1263 u32 tskey = 0;
1264 struct rt6_info *rt = (struct rt6_info *)cork->dst;
1265 struct ipv6_txoptions *opt = v6_cork->opt;
1266 int csummode = CHECKSUM_NONE;
1267 unsigned int maxnonfragsize, headersize;
1268
1269 skb = skb_peek_tail(queue);
1270 if (!skb) {
1271 exthdrlen = opt ? opt->opt_flen : 0;
1272 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1273 }
1274
1275 mtu = cork->fragsize;
1276 orig_mtu = mtu;
1277
1278 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1279
1280 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1281 (opt ? opt->opt_nflen : 0);
1282 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1283 sizeof(struct frag_hdr);
1284
1285 headersize = sizeof(struct ipv6hdr) +
1286 (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1287 (dst_allfrag(&rt->dst) ?
1288 sizeof(struct frag_hdr) : 0) +
1289 rt->rt6i_nfheader_len;
1290
1291 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1292 * in the first fragment
1293 */
1294 if (headersize + transhdrlen > mtu)
1295 goto emsgsize;
1296
1297 if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1298 (sk->sk_protocol == IPPROTO_UDP ||
1299 sk->sk_protocol == IPPROTO_RAW)) {
1300 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1301 sizeof(struct ipv6hdr));
1302 goto emsgsize;
1303 }
1304
1305 if (ip6_sk_ignore_df(sk))
1306 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1307 else
1308 maxnonfragsize = mtu;
1309
1310 if (cork->length + length > maxnonfragsize - headersize) {
1311 emsgsize:
1312 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1313 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1314 return -EMSGSIZE;
1315 }
1316
1317 /* CHECKSUM_PARTIAL only with no extension headers and when
1318 * we are not going to fragment
1319 */
1320 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1321 headersize == sizeof(struct ipv6hdr) &&
1322 length <= mtu - headersize &&
1323 !(flags & MSG_MORE) &&
1324 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1325 csummode = CHECKSUM_PARTIAL;
1326
1327 if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1328 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1329 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1330 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1331 tskey = sk->sk_tskey++;
1332 }
1333
1334 /*
1335 * Let's try using as much space as possible.
1336 * Use MTU if total length of the message fits into the MTU.
1337 * Otherwise, we need to reserve fragment header and
1338 * fragment alignment (= 8-15 octets, in total).
1339 *
1340 * Note that we may need to "move" the data from the tail
1341 * of the buffer to the new fragment when we split
1342 * the message.
1343 *
1344 * FIXME: It may be fragmented into multiple chunks
1345 * at once if non-fragmentable extension headers
1346 * are too large.
1347 * --yoshfuji
1348 */
1349
1350 cork->length += length;
1351 if (!skb)
1352 goto alloc_new_skb;
1353
1354 while (length > 0) {
1355 /* Check if the remaining data fits into current packet. */
1356 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1357 if (copy < length)
1358 copy = maxfraglen - skb->len;
1359
1360 if (copy <= 0) {
1361 char *data;
1362 unsigned int datalen;
1363 unsigned int fraglen;
1364 unsigned int fraggap;
1365 unsigned int alloclen;
1366 alloc_new_skb:
1367 /* There's no room in the current skb */
1368 if (skb)
1369 fraggap = skb->len - maxfraglen;
1370 else
1371 fraggap = 0;
1372 /* update mtu and maxfraglen if necessary */
1373 if (!skb || !skb_prev)
1374 ip6_append_data_mtu(&mtu, &maxfraglen,
1375 fragheaderlen, skb, rt,
1376 orig_mtu);
1377
1378 skb_prev = skb;
1379
1380 /*
1381 * If remaining data exceeds the mtu,
1382 * we know we need more fragment(s).
1383 */
1384 datalen = length + fraggap;
1385
1386 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1387 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1388 if ((flags & MSG_MORE) &&
1389 !(rt->dst.dev->features&NETIF_F_SG))
1390 alloclen = mtu;
1391 else
1392 alloclen = datalen + fragheaderlen;
1393
1394 alloclen += dst_exthdrlen;
1395
1396 if (datalen != length + fraggap) {
1397 /*
1398 * this is not the last fragment; the trailer
1399 * space is regarded as data space.
1400 */
1401 datalen += rt->dst.trailer_len;
1402 }
1403
1404 alloclen += rt->dst.trailer_len;
1405 fraglen = datalen + fragheaderlen;
1406
1407 /*
1408 * We just reserve space for fragment header.
1409 * Note: this may be overallocation if the message
1410 * (without MSG_MORE) fits into the MTU.
1411 */
1412 alloclen += sizeof(struct frag_hdr);
1413
1414 copy = datalen - transhdrlen - fraggap;
1415 if (copy < 0) {
1416 err = -EINVAL;
1417 goto error;
1418 }
1419 if (transhdrlen) {
1420 skb = sock_alloc_send_skb(sk,
1421 alloclen + hh_len,
1422 (flags & MSG_DONTWAIT), &err);
1423 } else {
1424 skb = NULL;
1425 if (refcount_read(&sk->sk_wmem_alloc) <=
1426 2 * sk->sk_sndbuf)
1427 skb = sock_wmalloc(sk,
1428 alloclen + hh_len, 1,
1429 sk->sk_allocation);
1430 if (unlikely(!skb))
1431 err = -ENOBUFS;
1432 }
1433 if (!skb)
1434 goto error;
1435 /*
1436 * Fill in the control structures
1437 */
1438 skb->protocol = htons(ETH_P_IPV6);
1439 skb->ip_summed = csummode;
1440 skb->csum = 0;
1441 /* reserve for fragmentation and ipsec header */
1442 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1443 dst_exthdrlen);
1444
1445 /* Only the initial fragment is time stamped */
1446 skb_shinfo(skb)->tx_flags = tx_flags;
1447 tx_flags = 0;
1448 skb_shinfo(skb)->tskey = tskey;
1449 tskey = 0;
1450
1451 /*
1452 * Find where to start putting bytes
1453 */
1454 data = skb_put(skb, fraglen);
1455 skb_set_network_header(skb, exthdrlen);
1456 data += fragheaderlen;
1457 skb->transport_header = (skb->network_header +
1458 fragheaderlen);
1459 if (fraggap) {
1460 skb->csum = skb_copy_and_csum_bits(
1461 skb_prev, maxfraglen,
1462 data + transhdrlen, fraggap, 0);
1463 skb_prev->csum = csum_sub(skb_prev->csum,
1464 skb->csum);
1465 data += fraggap;
1466 pskb_trim_unique(skb_prev, maxfraglen);
1467 }
1468 if (copy > 0 &&
1469 getfrag(from, data + transhdrlen, offset,
1470 copy, fraggap, skb) < 0) {
1471 err = -EFAULT;
1472 kfree_skb(skb);
1473 goto error;
1474 }
1475
1476 offset += copy;
1477 length -= datalen - fraggap;
1478 transhdrlen = 0;
1479 exthdrlen = 0;
1480 dst_exthdrlen = 0;
1481
1482 if ((flags & MSG_CONFIRM) && !skb_prev)
1483 skb_set_dst_pending_confirm(skb, 1);
1484
1485 /*
1486 * Put the packet on the pending queue
1487 */
1488 __skb_queue_tail(queue, skb);
1489 continue;
1490 }
1491
1492 if (copy > length)
1493 copy = length;
1494
1495 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1496 skb_tailroom(skb) >= copy) {
1497 unsigned int off;
1498
1499 off = skb->len;
1500 if (getfrag(from, skb_put(skb, copy),
1501 offset, copy, off, skb) < 0) {
1502 __skb_trim(skb, off);
1503 err = -EFAULT;
1504 goto error;
1505 }
1506 } else {
1507 int i = skb_shinfo(skb)->nr_frags;
1508
1509 err = -ENOMEM;
1510 if (!sk_page_frag_refill(sk, pfrag))
1511 goto error;
1512
1513 if (!skb_can_coalesce(skb, i, pfrag->page,
1514 pfrag->offset)) {
1515 err = -EMSGSIZE;
1516 if (i == MAX_SKB_FRAGS)
1517 goto error;
1518
1519 __skb_fill_page_desc(skb, i, pfrag->page,
1520 pfrag->offset, 0);
1521 skb_shinfo(skb)->nr_frags = ++i;
1522 get_page(pfrag->page);
1523 }
1524 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1525 if (getfrag(from,
1526 page_address(pfrag->page) + pfrag->offset,
1527 offset, copy, skb->len, skb) < 0)
1528 goto error_efault;
1529
1530 pfrag->offset += copy;
1531 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1532 skb->len += copy;
1533 skb->data_len += copy;
1534 skb->truesize += copy;
1535 refcount_add(copy, &sk->sk_wmem_alloc);
1536 }
1537 offset += copy;
1538 length -= copy;
1539 }
1540
1541 return 0;
1542
1543 error_efault:
1544 err = -EFAULT;
1545 error:
1546 cork->length -= length;
1547 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1548 return err;
1549 }
1550
1551 int ip6_append_data(struct sock *sk,
1552 int getfrag(void *from, char *to, int offset, int len,
1553 int odd, struct sk_buff *skb),
1554 void *from, int length, int transhdrlen,
1555 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1556 struct rt6_info *rt, unsigned int flags,
1557 const struct sockcm_cookie *sockc)
1558 {
1559 struct inet_sock *inet = inet_sk(sk);
1560 struct ipv6_pinfo *np = inet6_sk(sk);
1561 int exthdrlen;
1562 int err;
1563
1564 if (flags&MSG_PROBE)
1565 return 0;
1566 if (skb_queue_empty(&sk->sk_write_queue)) {
1567 /*
1568 * setup for corking
1569 */
1570 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1571 ipc6, rt, fl6);
1572 if (err)
1573 return err;
1574
1575 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1576 length += exthdrlen;
1577 transhdrlen += exthdrlen;
1578 } else {
1579 fl6 = &inet->cork.fl.u.ip6;
1580 transhdrlen = 0;
1581 }
1582
1583 return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1584 &np->cork, sk_page_frag(sk), getfrag,
1585 from, length, transhdrlen, flags, ipc6, sockc);
1586 }
1587 EXPORT_SYMBOL_GPL(ip6_append_data);
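
/*
 * Usage sketch, not part of the original file: the corked-send pattern
 * built from the primitives above -- queue with ip6_append_data(),
 * then commit with ip6_push_pending_frames() or abort with
 * ip6_flush_pending_frames().  All arguments are assumed to be set up
 * the way a datagram sendmsg() path would set them up; transhdrlen is
 * left at 0 for brevity.
 */
static inline int
ip6_corked_send_sketch(struct sock *sk,
		       int getfrag(void *from, char *to, int offset,
				   int len, int odd, struct sk_buff *skb),
		       void *from, int length,
		       struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		       struct rt6_info *rt,
		       const struct sockcm_cookie *sockc)
{
	int err;

	lock_sock(sk);
	err = ip6_append_data(sk, getfrag, from, length, 0, ipc6, fl6,
			      rt, 0, sockc);
	if (err)
		ip6_flush_pending_frames(sk);	/* drop what was queued */
	else
		err = ip6_push_pending_frames(sk); /* build and send */
	release_sock(sk);
	return err;
}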
1588
1589 static void ip6_cork_release(struct inet_cork_full *cork,
1590 struct inet6_cork *v6_cork)
1591 {
1592 if (v6_cork->opt) {
1593 kfree(v6_cork->opt->dst0opt);
1594 kfree(v6_cork->opt->dst1opt);
1595 kfree(v6_cork->opt->hopopt);
1596 kfree(v6_cork->opt->srcrt);
1597 kfree(v6_cork->opt);
1598 v6_cork->opt = NULL;
1599 }
1600
1601 if (cork->base.dst) {
1602 dst_release(cork->base.dst);
1603 cork->base.dst = NULL;
1604 cork->base.flags &= ~IPCORK_ALLFRAG;
1605 }
1606 memset(&cork->fl, 0, sizeof(cork->fl));
1607 }
1608
1609 struct sk_buff *__ip6_make_skb(struct sock *sk,
1610 struct sk_buff_head *queue,
1611 struct inet_cork_full *cork,
1612 struct inet6_cork *v6_cork)
1613 {
1614 struct sk_buff *skb, *tmp_skb;
1615 struct sk_buff **tail_skb;
1616 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1617 struct ipv6_pinfo *np = inet6_sk(sk);
1618 struct net *net = sock_net(sk);
1619 struct ipv6hdr *hdr;
1620 struct ipv6_txoptions *opt = v6_cork->opt;
1621 struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1622 struct flowi6 *fl6 = &cork->fl.u.ip6;
1623 unsigned char proto = fl6->flowi6_proto;
1624
1625 skb = __skb_dequeue(queue);
1626 if (!skb)
1627 goto out;
1628 tail_skb = &(skb_shinfo(skb)->frag_list);
1629
1630 /* move skb->data to ip header from ext header */
1631 if (skb->data < skb_network_header(skb))
1632 __skb_pull(skb, skb_network_offset(skb));
1633 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1634 __skb_pull(tmp_skb, skb_network_header_len(skb));
1635 *tail_skb = tmp_skb;
1636 tail_skb = &(tmp_skb->next);
1637 skb->len += tmp_skb->len;
1638 skb->data_len += tmp_skb->len;
1639 skb->truesize += tmp_skb->truesize;
1640 tmp_skb->destructor = NULL;
1641 tmp_skb->sk = NULL;
1642 }
1643
1644 /* Allow local fragmentation. */
1645 skb->ignore_df = ip6_sk_ignore_df(sk);
1646
1647 *final_dst = fl6->daddr;
1648 __skb_pull(skb, skb_network_header_len(skb));
1649 if (opt && opt->opt_flen)
1650 ipv6_push_frag_opts(skb, opt, &proto);
1651 if (opt && opt->opt_nflen)
1652 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1653
1654 skb_push(skb, sizeof(struct ipv6hdr));
1655 skb_reset_network_header(skb);
1656 hdr = ipv6_hdr(skb);
1657
1658 ip6_flow_hdr(hdr, v6_cork->tclass,
1659 ip6_make_flowlabel(net, skb, fl6->flowlabel,
1660 ip6_autoflowlabel(net, np), fl6));
1661 hdr->hop_limit = v6_cork->hop_limit;
1662 hdr->nexthdr = proto;
1663 hdr->saddr = fl6->saddr;
1664 hdr->daddr = *final_dst;
1665
1666 skb->priority = sk->sk_priority;
1667 skb->mark = sk->sk_mark;
1668
1669 skb_dst_set(skb, dst_clone(&rt->dst));
1670 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1671 if (proto == IPPROTO_ICMPV6) {
1672 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1673
1674 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1675 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1676 }
1677
1678 ip6_cork_release(cork, v6_cork);
1679 out:
1680 return skb;
1681 }
1682
1683 int ip6_send_skb(struct sk_buff *skb)
1684 {
1685 struct net *net = sock_net(skb->sk);
1686 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1687 int err;
1688
1689 err = ip6_local_out(net, skb->sk, skb);
1690 if (err) {
1691 if (err > 0)
1692 err = net_xmit_errno(err);
1693 if (err)
1694 IP6_INC_STATS(net, rt->rt6i_idev,
1695 IPSTATS_MIB_OUTDISCARDS);
1696 }
1697
1698 return err;
1699 }
1700
1701 int ip6_push_pending_frames(struct sock *sk)
1702 {
1703 struct sk_buff *skb;
1704
1705 skb = ip6_finish_skb(sk);
1706 if (!skb)
1707 return 0;
1708
1709 return ip6_send_skb(skb);
1710 }
1711 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1712
1713 static void __ip6_flush_pending_frames(struct sock *sk,
1714 struct sk_buff_head *queue,
1715 struct inet_cork_full *cork,
1716 struct inet6_cork *v6_cork)
1717 {
1718 struct sk_buff *skb;
1719
1720 while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1721 if (skb_dst(skb))
1722 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1723 IPSTATS_MIB_OUTDISCARDS);
1724 kfree_skb(skb);
1725 }
1726
1727 ip6_cork_release(cork, v6_cork);
1728 }
1729
1730 void ip6_flush_pending_frames(struct sock *sk)
1731 {
1732 __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1733 &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1734 }
1735 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1736
1737 struct sk_buff *ip6_make_skb(struct sock *sk,
1738 int getfrag(void *from, char *to, int offset,
1739 int len, int odd, struct sk_buff *skb),
1740 void *from, int length, int transhdrlen,
1741 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1742 struct rt6_info *rt, unsigned int flags,
1743 const struct sockcm_cookie *sockc)
1744 {
1745 struct inet_cork_full cork;
1746 struct inet6_cork v6_cork;
1747 struct sk_buff_head queue;
1748 int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1749 int err;
1750
1751 if (flags & MSG_PROBE)
1752 return NULL;
1753
1754 __skb_queue_head_init(&queue);
1755
1756 cork.base.flags = 0;
1757 cork.base.addr = 0;
1758 cork.base.opt = NULL;
1759 cork.base.dst = NULL;
1760 v6_cork.opt = NULL;
1761 err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1762 if (err) {
1763 ip6_cork_release(&cork, &v6_cork);
1764 return ERR_PTR(err);
1765 }
1766 if (ipc6->dontfrag < 0)
1767 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1768
1769 err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1770 &current->task_frag, getfrag, from,
1771 length + exthdrlen, transhdrlen + exthdrlen,
1772 flags, ipc6, sockc);
1773 if (err) {
1774 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1775 return ERR_PTR(err);
1776 }
1777
1778 return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1779 }