1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * IPv6 output functions
4 * Linux INET6 implementation
5 *
6 * Authors:
7 * Pedro Roque <roque@di.fc.ul.pt>
8 *
9 * Based on linux/net/ipv4/ip_output.c
10 *
11 * Changes:
12 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13 * extension headers are implemented.
14 * route changes now work.
15 * ip6_forward does not confuse sniffers.
16 * etc.
17 *
18 * H. von Brand : Added missing #include <linux/string.h>
19 * Imran Patel : frag id should be in NBO
20 * Kazunori MIYAZAWA @USAGI
21 * : add ip6_append_data and related functions
22 * for datagram xmit
23 */
24
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41
42 #include <net/sock.h>
43 #include <net/snmp.h>
44
45 #include <net/ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
51 #include <net/icmp.h>
52 #include <net/xfrm.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
57
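/*
 * Final transmit step below the netfilter POST_ROUTING hook: for
 * multicast destinations, loop a clone back to local listeners when
 * required and enforce scope rules; honour lightweight-tunnel redirects
 * via lwtunnel_xmit(); then resolve (or create) the neighbour entry for
 * the next hop and hand the skb to neigh_output().
 */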
58 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
59 {
60 struct dst_entry *dst = skb_dst(skb);
61 struct net_device *dev = dst->dev;
62 struct neighbour *neigh;
63 struct in6_addr *nexthop;
64 int ret;
65
66 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
67 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
68
69 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
70 ((mroute6_is_socket(net, skb) &&
71 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
72 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
73 &ipv6_hdr(skb)->saddr))) {
74 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
75
76 /* Do not check for IFF_ALLMULTI; multicast routing
77 is not supported in any case.
78 */
79 if (newskb)
80 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
81 net, sk, newskb, NULL, newskb->dev,
82 dev_loopback_xmit);
83
84 if (ipv6_hdr(skb)->hop_limit == 0) {
85 IP6_INC_STATS(net, idev,
86 IPSTATS_MIB_OUTDISCARDS);
87 kfree_skb(skb);
88 return 0;
89 }
90 }
91
92 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
93
94 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
95 IPV6_ADDR_SCOPE_NODELOCAL &&
96 !(dev->flags & IFF_LOOPBACK)) {
97 kfree_skb(skb);
98 return 0;
99 }
100 }
101
102 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
103 int res = lwtunnel_xmit(skb);
104
105 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
106 return res;
107 }
108
109 rcu_read_lock_bh();
110 nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
111 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
112 if (unlikely(!neigh))
113 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
114 if (!IS_ERR(neigh)) {
115 sock_confirm_neigh(skb, neigh);
116 ret = neigh_output(neigh, skb, false);
117 rcu_read_unlock_bh();
118 return ret;
119 }
120 rcu_read_unlock_bh();
121
122 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
123 kfree_skb(skb);
124 return -EINVAL;
125 }
126
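/*
 * Runs once POST_ROUTING has accepted the packet.  The cgroup BPF
 * egress program may still drop it; if a policy lookup after SNAT
 * attached an XFRM bundle to the route, the skb is re-run through
 * dst_output() with IPSKB_REROUTED set.  Oversized non-GSO packets
 * are fragmented, everything else goes to ip6_finish_output2().
 */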
127 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
128 {
129 int ret;
130
131 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
132 if (ret) {
133 kfree_skb(skb);
134 return ret;
135 }
136
137 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
138 /* Policy lookup after SNAT yielded a new policy */
139 if (skb_dst(skb)->xfrm) {
140 IPCB(skb)->flags |= IPSKB_REROUTED;
141 return dst_output(net, sk, skb);
142 }
143 #endif
144
145 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
146 dst_allfrag(skb_dst(skb)) ||
147 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
148 return ip6_fragment(net, sk, skb, ip6_finish_output2);
149 else
150 return ip6_finish_output2(net, sk, skb);
151 }
152
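/*
 * dst_output() entry point for IPv6.  Stamps the protocol and egress
 * device, discards the packet when IPv6 is administratively disabled
 * on that device, and traverses NF_INET_POST_ROUTING (skipped when the
 * skb was already rerouted) on its way to ip6_finish_output().
 */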
153 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
154 {
155 struct net_device *dev = skb_dst(skb)->dev;
156 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
157
158 skb->protocol = htons(ETH_P_IPV6);
159 skb->dev = dev;
160
161 if (unlikely(idev->cnf.disable_ipv6)) {
162 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
163 kfree_skb(skb);
164 return 0;
165 }
166
167 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
168 net, sk, skb, NULL, dev,
169 ip6_finish_output,
170 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
171 }
172
173 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
174 {
175 if (!np->autoflowlabel_set)
176 return ip6_default_np_autolabel(net);
177 else
178 return np->autoflowlabel;
179 }
180
181 /*
182 * xmit an sk_buff (used by TCP, SCTP and DCCP)
183  * Note: the socket lock is not held for SYNACK packets, but the skb may
184  * still be modified by calls to skb_set_owner_w() and ipv6_local_error(),
185  * which use proper atomic operations or spinlocks.
186 */
187 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
188 __u32 mark, struct ipv6_txoptions *opt, int tclass)
189 {
190 struct net *net = sock_net(sk);
191 const struct ipv6_pinfo *np = inet6_sk(sk);
192 struct in6_addr *first_hop = &fl6->daddr;
193 struct dst_entry *dst = skb_dst(skb);
194 unsigned int head_room;
195 struct ipv6hdr *hdr;
196 u8 proto = fl6->flowi6_proto;
197 int seg_len = skb->len;
198 int hlimit = -1;
199 u32 mtu;
200
201 head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
202 if (opt)
203 head_room += opt->opt_nflen + opt->opt_flen;
204
205 if (unlikely(skb_headroom(skb) < head_room)) {
206 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
207 if (!skb2) {
208 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
209 IPSTATS_MIB_OUTDISCARDS);
210 kfree_skb(skb);
211 return -ENOBUFS;
212 }
213 if (skb->sk)
214 skb_set_owner_w(skb2, skb->sk);
215 consume_skb(skb);
216 skb = skb2;
217 }
218
219 if (opt) {
220 seg_len += opt->opt_nflen + opt->opt_flen;
221
222 if (opt->opt_flen)
223 ipv6_push_frag_opts(skb, opt, &proto);
224
225 if (opt->opt_nflen)
226 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
227 &fl6->saddr);
228 }
229
230 skb_push(skb, sizeof(struct ipv6hdr));
231 skb_reset_network_header(skb);
232 hdr = ipv6_hdr(skb);
233
234 /*
235 * Fill in the IPv6 header
236 */
237 if (np)
238 hlimit = np->hop_limit;
239 if (hlimit < 0)
240 hlimit = ip6_dst_hoplimit(dst);
241
242 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
243 ip6_autoflowlabel(net, np), fl6));
244
245 hdr->payload_len = htons(seg_len);
246 hdr->nexthdr = proto;
247 hdr->hop_limit = hlimit;
248
249 hdr->saddr = fl6->saddr;
250 hdr->daddr = *first_hop;
251
252 skb->protocol = htons(ETH_P_IPV6);
253 skb->priority = sk->sk_priority;
254 skb->mark = mark;
255
256 mtu = dst_mtu(dst);
257 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
258 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
259 IPSTATS_MIB_OUT, skb->len);
260
261 /* if egress device is enslaved to an L3 master device pass the
262 * skb to its handler for processing
263 */
264 skb = l3mdev_ip6_out((struct sock *)sk, skb);
265 if (unlikely(!skb))
266 return 0;
267
268 /* hooks should never assume socket lock is held.
269 * we promote our socket to non const
270 */
271 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
272 net, (struct sock *)sk, skb, NULL, dst->dev,
273 dst_output);
274 }
275
276 skb->dev = dst->dev;
277 /* ipv6_local_error() does not require socket lock,
278 * we promote our socket to non const
279 */
280 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
281
282 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
283 kfree_skb(skb);
284 return -EMSGSIZE;
285 }
286 EXPORT_SYMBOL(ip6_xmit);
287
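/*
 * Deliver a Router Alert packet to every raw socket that registered
 * for this alert value.  All matching sockets but the last receive a
 * clone; returns 1 when a listener consumed the original skb, 0 when
 * the caller still owns it.
 */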
288 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
289 {
290 struct ip6_ra_chain *ra;
291 struct sock *last = NULL;
292
293 read_lock(&ip6_ra_lock);
294 for (ra = ip6_ra_chain; ra; ra = ra->next) {
295 struct sock *sk = ra->sk;
296 if (sk && ra->sel == sel &&
297 (!sk->sk_bound_dev_if ||
298 sk->sk_bound_dev_if == skb->dev->ifindex)) {
299 struct ipv6_pinfo *np = inet6_sk(sk);
300
301 if (np && np->rtalert_isolate &&
302 !net_eq(sock_net(sk), dev_net(skb->dev))) {
303 continue;
304 }
305 if (last) {
306 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
307 if (skb2)
308 rawv6_rcv(last, skb2);
309 }
310 last = sk;
311 }
312 }
313
314 if (last) {
315 rawv6_rcv(last, skb);
316 read_unlock(&ip6_ra_lock);
317 return 1;
318 }
319 read_unlock(&ip6_ra_lock);
320 return 0;
321 }
322
323 static int ip6_forward_proxy_check(struct sk_buff *skb)
324 {
325 struct ipv6hdr *hdr = ipv6_hdr(skb);
326 u8 nexthdr = hdr->nexthdr;
327 __be16 frag_off;
328 int offset;
329
330 if (ipv6_ext_hdr(nexthdr)) {
331 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
332 if (offset < 0)
333 return 0;
334 } else
335 offset = sizeof(struct ipv6hdr);
336
337 if (nexthdr == IPPROTO_ICMPV6) {
338 struct icmp6hdr *icmp6;
339
340 if (!pskb_may_pull(skb, (skb_network_header(skb) +
341 offset + 1 - skb->data)))
342 return 0;
343
344 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
345
346 switch (icmp6->icmp6_type) {
347 case NDISC_ROUTER_SOLICITATION:
348 case NDISC_ROUTER_ADVERTISEMENT:
349 case NDISC_NEIGHBOUR_SOLICITATION:
350 case NDISC_NEIGHBOUR_ADVERTISEMENT:
351 case NDISC_REDIRECT:
352 			/* A unicast neighbour discovery message destined
353 			 * to the proxied address must be passed to the
354 			 * input function.
355 			 */
356 return 1;
357 default:
358 break;
359 }
360 }
361
362 /*
363 * The proxying router can't forward traffic sent to a link-local
364 * address, so signal the sender and discard the packet. This
365 * behavior is clarified by the MIPv6 specification.
366 */
367 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
368 dst_link_failure(skb);
369 return -1;
370 }
371
372 return 0;
373 }
374
375 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
376 struct sk_buff *skb)
377 {
378 struct dst_entry *dst = skb_dst(skb);
379
380 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
381 __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
382
383 #ifdef CONFIG_NET_SWITCHDEV
384 if (skb->offload_l3_fwd_mark) {
385 consume_skb(skb);
386 return 0;
387 }
388 #endif
389
390 skb->tstamp = 0;
391 return dst_output(net, sk, skb);
392 }
393
394 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
395 {
396 if (skb->len <= mtu)
397 return false;
398
399 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
400 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
401 return true;
402
403 if (skb->ignore_df)
404 return false;
405
406 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
407 return false;
408
409 return true;
410 }
411
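/*
 * The forwarding path: checks that forwarding is enabled and the hop
 * limit allows another hop (ICMPV6_TIME_EXCEED otherwise), consults
 * NDISC proxy state, sends a redirect when the packet would leave via
 * the interface it arrived on, enforces the path MTU
 * (ICMPV6_PKT_TOOBIG), and finally decrements the hop limit before
 * queueing the skb through the NF_INET_FORWARD hook.
 */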
412 int ip6_forward(struct sk_buff *skb)
413 {
414 struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
415 struct dst_entry *dst = skb_dst(skb);
416 struct ipv6hdr *hdr = ipv6_hdr(skb);
417 struct inet6_skb_parm *opt = IP6CB(skb);
418 struct net *net = dev_net(dst->dev);
419 u32 mtu;
420
421 if (net->ipv6.devconf_all->forwarding == 0)
422 goto error;
423
424 if (skb->pkt_type != PACKET_HOST)
425 goto drop;
426
427 if (unlikely(skb->sk))
428 goto drop;
429
430 if (skb_warn_if_lro(skb))
431 goto drop;
432
433 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
434 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
435 goto drop;
436 }
437
438 skb_forward_csum(skb);
439
440 	/*
441 	 *	We do NOT do any processing on Router Alert packets;
442 	 *	they are pushed to user level AS IS, without any
443 	 *	warranty that the application will be able to
444 	 *	interpret them. The reason is that we cannot do
445 	 *	anything clever here.
446 	 *
447 	 *	We are not an end node, so if the packet contains
448 	 *	AH/ESP we cannot do anything with it.
449 	 *	Defragmentation would also be a mistake; RA packets
450 	 *	must not be fragmented, because there is no warranty
451 	 *	that different fragments will go along one path. --ANK
452 	 */
453 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
454 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
455 return 0;
456 }
457
458 	/*
459 	 *	check and decrement hop limit
460 	 */
461 if (hdr->hop_limit <= 1) {
462 /* Force OUTPUT device used as source address */
463 skb->dev = dst->dev;
464 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
465 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
466
467 kfree_skb(skb);
468 return -ETIMEDOUT;
469 }
470
471 /* XXX: idev->cnf.proxy_ndp? */
472 if (net->ipv6.devconf_all->proxy_ndp &&
473 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
474 int proxied = ip6_forward_proxy_check(skb);
475 if (proxied > 0)
476 return ip6_input(skb);
477 else if (proxied < 0) {
478 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
479 goto drop;
480 }
481 }
482
483 if (!xfrm6_route_forward(skb)) {
484 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
485 goto drop;
486 }
487 dst = skb_dst(skb);
488
489 /* IPv6 specs say nothing about it, but it is clear that we cannot
490 send redirects to source routed frames.
491 We don't send redirects to frames decapsulated from IPsec.
492 */
493 if (IP6CB(skb)->iif == dst->dev->ifindex &&
494 opt->srcrt == 0 && !skb_sec_path(skb)) {
495 struct in6_addr *target = NULL;
496 struct inet_peer *peer;
497 struct rt6_info *rt;
498
499 /*
500 		 *	incoming and outgoing devices are the same;
501 		 *	send a redirect.
502 */
503
504 rt = (struct rt6_info *) dst;
505 if (rt->rt6i_flags & RTF_GATEWAY)
506 target = &rt->rt6i_gateway;
507 else
508 target = &hdr->daddr;
509
510 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
511
512 /* Limit redirects both by destination (here)
513 and by source (inside ndisc_send_redirect)
514 */
515 if (inet_peer_xrlim_allow(peer, 1*HZ))
516 ndisc_send_redirect(skb, target);
517 if (peer)
518 inet_putpeer(peer);
519 } else {
520 int addrtype = ipv6_addr_type(&hdr->saddr);
521
522 /* This check is security critical. */
523 if (addrtype == IPV6_ADDR_ANY ||
524 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
525 goto error;
526 if (addrtype & IPV6_ADDR_LINKLOCAL) {
527 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
528 ICMPV6_NOT_NEIGHBOUR, 0);
529 goto error;
530 }
531 }
532
533 mtu = ip6_dst_mtu_forward(dst);
534 if (mtu < IPV6_MIN_MTU)
535 mtu = IPV6_MIN_MTU;
536
537 if (ip6_pkt_too_big(skb, mtu)) {
538 /* Again, force OUTPUT device used as source address */
539 skb->dev = dst->dev;
540 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
541 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
542 __IP6_INC_STATS(net, ip6_dst_idev(dst),
543 IPSTATS_MIB_FRAGFAILS);
544 kfree_skb(skb);
545 return -EMSGSIZE;
546 }
547
548 if (skb_cow(skb, dst->dev->hard_header_len)) {
549 __IP6_INC_STATS(net, ip6_dst_idev(dst),
550 IPSTATS_MIB_OUTDISCARDS);
551 goto drop;
552 }
553
554 hdr = ipv6_hdr(skb);
555
556 	/* Decrementing the hop limit is delayed until after the skb COW */
557
558 hdr->hop_limit--;
559
560 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
561 net, NULL, skb, skb->dev, dst->dev,
562 ip6_forward_finish);
563
564 error:
565 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
566 drop:
567 kfree_skb(skb);
568 return -EINVAL;
569 }
570
571 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
572 {
573 to->pkt_type = from->pkt_type;
574 to->priority = from->priority;
575 to->protocol = from->protocol;
576 skb_dst_drop(to);
577 skb_dst_set(to, dst_clone(skb_dst(from)));
578 to->dev = from->dev;
579 to->mark = from->mark;
580
581 skb_copy_hash(to, from);
582
583 #ifdef CONFIG_NET_SCHED
584 to->tc_index = from->tc_index;
585 #endif
586 nf_copy(to, from);
587 skb_ext_copy(to, from);
588 skb_copy_secmark(to, from);
589 }
590
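/*
 * Fragment an oversized skb, feeding each fragment to @output.  The
 * fast path reuses an existing frag_list whose geometry already fits
 * the MTU, turning each element into a fragment in place; otherwise a
 * slow path allocates a fresh skb per fragment and copies the payload.
 * Both paths insert a fragment extension header carrying the
 * identification chosen by ipv6_select_ident().
 */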
591 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
592 int (*output)(struct net *, struct sock *, struct sk_buff *))
593 {
594 struct sk_buff *frag;
595 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
596 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
597 inet6_sk(skb->sk) : NULL;
598 struct ipv6hdr *tmp_hdr;
599 struct frag_hdr *fh;
600 unsigned int mtu, hlen, left, len, nexthdr_offset;
601 int hroom, troom;
602 __be32 frag_id;
603 int ptr, offset = 0, err = 0;
604 u8 *prevhdr, nexthdr = 0;
605
606 err = ip6_find_1stfragopt(skb, &prevhdr);
607 if (err < 0)
608 goto fail;
609 hlen = err;
610 nexthdr = *prevhdr;
611 nexthdr_offset = prevhdr - skb_network_header(skb);
612
613 mtu = ip6_skb_dst_mtu(skb);
614
615 /* We must not fragment if the socket is set to force MTU discovery
616 	 * or if the skb is not generated by a local socket.
617 */
618 if (unlikely(!skb->ignore_df && skb->len > mtu))
619 goto fail_toobig;
620
621 if (IP6CB(skb)->frag_max_size) {
622 if (IP6CB(skb)->frag_max_size > mtu)
623 goto fail_toobig;
624
625 /* don't send fragments larger than what we received */
626 mtu = IP6CB(skb)->frag_max_size;
627 if (mtu < IPV6_MIN_MTU)
628 mtu = IPV6_MIN_MTU;
629 }
630
631 if (np && np->frag_size < mtu) {
632 if (np->frag_size)
633 mtu = np->frag_size;
634 }
635 if (mtu < hlen + sizeof(struct frag_hdr) + 8)
636 goto fail_toobig;
637 mtu -= hlen + sizeof(struct frag_hdr);
638
639 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
640 &ipv6_hdr(skb)->saddr);
641
642 if (skb->ip_summed == CHECKSUM_PARTIAL &&
643 (err = skb_checksum_help(skb)))
644 goto fail;
645
646 prevhdr = skb_network_header(skb) + nexthdr_offset;
647 hroom = LL_RESERVED_SPACE(rt->dst.dev);
648 if (skb_has_frag_list(skb)) {
649 unsigned int first_len = skb_pagelen(skb);
650 struct sk_buff *frag2;
651
652 if (first_len - hlen > mtu ||
653 ((first_len - hlen) & 7) ||
654 skb_cloned(skb) ||
655 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
656 goto slow_path;
657
658 skb_walk_frags(skb, frag) {
659 /* Correct geometry. */
660 if (frag->len > mtu ||
661 ((frag->len & 7) && frag->next) ||
662 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
663 goto slow_path_clean;
664
665 /* Partially cloned skb? */
666 if (skb_shared(frag))
667 goto slow_path_clean;
668
669 BUG_ON(frag->sk);
670 if (skb->sk) {
671 frag->sk = skb->sk;
672 frag->destructor = sock_wfree;
673 }
674 skb->truesize -= frag->truesize;
675 }
676
677 err = 0;
678 offset = 0;
679 /* BUILD HEADER */
680
681 *prevhdr = NEXTHDR_FRAGMENT;
682 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
683 if (!tmp_hdr) {
684 err = -ENOMEM;
685 goto fail;
686 }
687 frag = skb_shinfo(skb)->frag_list;
688 skb_frag_list_init(skb);
689
690 __skb_pull(skb, hlen);
691 fh = __skb_push(skb, sizeof(struct frag_hdr));
692 __skb_push(skb, hlen);
693 skb_reset_network_header(skb);
694 memcpy(skb_network_header(skb), tmp_hdr, hlen);
695
696 fh->nexthdr = nexthdr;
697 fh->reserved = 0;
698 fh->frag_off = htons(IP6_MF);
699 fh->identification = frag_id;
700
701 first_len = skb_pagelen(skb);
702 skb->data_len = first_len - skb_headlen(skb);
703 skb->len = first_len;
704 ipv6_hdr(skb)->payload_len = htons(first_len -
705 sizeof(struct ipv6hdr));
706
707 for (;;) {
708 			/* Prepare the header of the next frame
709 			 * before the previous one goes down. */
710 if (frag) {
711 frag->ip_summed = CHECKSUM_NONE;
712 skb_reset_transport_header(frag);
713 fh = __skb_push(frag, sizeof(struct frag_hdr));
714 __skb_push(frag, hlen);
715 skb_reset_network_header(frag);
716 memcpy(skb_network_header(frag), tmp_hdr,
717 hlen);
718 offset += skb->len - hlen - sizeof(struct frag_hdr);
719 fh->nexthdr = nexthdr;
720 fh->reserved = 0;
721 fh->frag_off = htons(offset);
722 if (frag->next)
723 fh->frag_off |= htons(IP6_MF);
724 fh->identification = frag_id;
725 ipv6_hdr(frag)->payload_len =
726 htons(frag->len -
727 sizeof(struct ipv6hdr));
728 ip6_copy_metadata(frag, skb);
729 }
730
731 err = output(net, sk, skb);
732 if (!err)
733 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
734 IPSTATS_MIB_FRAGCREATES);
735
736 if (err || !frag)
737 break;
738
739 skb = frag;
740 frag = skb->next;
741 skb_mark_not_on_list(skb);
742 }
743
744 kfree(tmp_hdr);
745
746 if (err == 0) {
747 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
748 IPSTATS_MIB_FRAGOKS);
749 return 0;
750 }
751
752 kfree_skb_list(frag);
753
754 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
755 IPSTATS_MIB_FRAGFAILS);
756 return err;
757
758 slow_path_clean:
759 skb_walk_frags(skb, frag2) {
760 if (frag2 == frag)
761 break;
762 frag2->sk = NULL;
763 frag2->destructor = NULL;
764 skb->truesize += frag2->truesize;
765 }
766 }
767
768 slow_path:
769 left = skb->len - hlen; /* Space per frame */
770 ptr = hlen; /* Where to start from */
771
772 /*
773 * Fragment the datagram.
774 */
775
776 troom = rt->dst.dev->needed_tailroom;
777
778 /*
779 * Keep copying data until we run out.
780 */
781 while (left > 0) {
782 u8 *fragnexthdr_offset;
783
784 len = left;
785 /* IF: it doesn't fit, use 'mtu' - the data space left */
786 if (len > mtu)
787 len = mtu;
788 		/* IF: we are not sending up to and including the packet end,
789 		   then align the next start on an eight-byte boundary */
790 if (len < left) {
791 len &= ~7;
792 }
793
794 /* Allocate buffer */
795 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
796 hroom + troom, GFP_ATOMIC);
797 if (!frag) {
798 err = -ENOMEM;
799 goto fail;
800 }
801
802 /*
803 * Set up data on packet
804 */
805
806 ip6_copy_metadata(frag, skb);
807 skb_reserve(frag, hroom);
808 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
809 skb_reset_network_header(frag);
810 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
811 frag->transport_header = (frag->network_header + hlen +
812 sizeof(struct frag_hdr));
813
814 /*
815 * Charge the memory for the fragment to any owner
816 * it might possess
817 */
818 if (skb->sk)
819 skb_set_owner_w(frag, skb->sk);
820
821 /*
822 * Copy the packet header into the new buffer.
823 */
824 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
825
826 fragnexthdr_offset = skb_network_header(frag);
827 fragnexthdr_offset += prevhdr - skb_network_header(skb);
828 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
829
830 /*
831 * Build fragment header.
832 */
833 fh->nexthdr = nexthdr;
834 fh->reserved = 0;
835 fh->identification = frag_id;
836
837 /*
838 * Copy a block of the IP datagram.
839 */
840 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
841 len));
842 left -= len;
843
844 fh->frag_off = htons(offset);
845 if (left > 0)
846 fh->frag_off |= htons(IP6_MF);
847 ipv6_hdr(frag)->payload_len = htons(frag->len -
848 sizeof(struct ipv6hdr));
849
850 ptr += len;
851 offset += len;
852
853 /*
854 * Put this fragment into the sending queue.
855 */
856 err = output(net, sk, frag);
857 if (err)
858 goto fail;
859
860 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
861 IPSTATS_MIB_FRAGCREATES);
862 }
863 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
864 IPSTATS_MIB_FRAGOKS);
865 consume_skb(skb);
866 return err;
867
868 fail_toobig:
869 if (skb->sk && dst_allfrag(skb_dst(skb)))
870 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
871
872 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
873 err = -EMSGSIZE;
874
875 fail:
876 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
877 IPSTATS_MIB_FRAGFAILS);
878 kfree_skb(skb);
879 return err;
880 }
881
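/*
 * Helper for ip6_sk_dst_check(): a cached route stays usable if it is
 * a /128 host route to exactly this destination, or if it matches the
 * last address recorded in the socket (addr_cache).  A non-zero return
 * means the cached dst must be discarded.
 */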
882 static inline int ip6_rt_check(const struct rt6key *rt_key,
883 const struct in6_addr *fl_addr,
884 const struct in6_addr *addr_cache)
885 {
886 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
887 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
888 }
889
890 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
891 struct dst_entry *dst,
892 const struct flowi6 *fl6)
893 {
894 struct ipv6_pinfo *np = inet6_sk(sk);
895 struct rt6_info *rt;
896
897 if (!dst)
898 goto out;
899
900 if (dst->ops->family != AF_INET6) {
901 dst_release(dst);
902 return NULL;
903 }
904
905 rt = (struct rt6_info *)dst;
906 	/* Yes, checking route validity in the unconnected
907 	 * case is not very simple. Take into account
908 	 * that we do not support routing by source, TOS,
909 	 * and MSG_DONTROUTE		--ANK (980726)
910 	 *
911 	 * 1. ip6_rt_check(): If the route was a host route,
912 	 *    check that the cached destination is current.
913 	 *    If it is a network route, we can still
914 	 *    check its validity using a saved pointer
915 	 *    to the last used address: daddr_cache.
916 	 *    We do not want to save the whole address now
917 	 *    (because the main consumer of this service
918 	 *    is tcp, which does not have this problem),
919 	 *    so the last trick works only on connected
920 	 *    sockets.
921 	 * 2. oif should also be the same.
922 	 */
923 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
924 #ifdef CONFIG_IPV6_SUBTREES
925 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
926 #endif
927 (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
928 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
929 dst_release(dst);
930 dst = NULL;
931 }
932
933 out:
934 return dst;
935 }
936
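/*
 * Core of the ip6_dst_lookup*() family: resolves a source address via
 * ip6_route_get_saddr() when the flow left it unspecified, retries the
 * route lookup once a source is known (source-specific routing fails
 * on src=any), and, with CONFIG_IPV6_OPTIMISTIC_DAD, falls back to the
 * default router's dst while an optimistic source address still lacks
 * a valid neighbour entry.
 */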
937 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
938 struct dst_entry **dst, struct flowi6 *fl6)
939 {
940 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
941 struct neighbour *n;
942 struct rt6_info *rt;
943 #endif
944 int err;
945 int flags = 0;
946
947 /* The correct way to handle this would be to do
948 * ip6_route_get_saddr, and then ip6_route_output; however,
949 * the route-specific preferred source forces the
950 * ip6_route_output call _before_ ip6_route_get_saddr.
951 *
952 * In source specific routing (no src=any default route),
953 * ip6_route_output will fail given src=any saddr, though, so
954 * that's why we try it again later.
955 */
956 if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
957 struct fib6_info *from;
958 struct rt6_info *rt;
959 bool had_dst = *dst != NULL;
960
961 if (!had_dst)
962 *dst = ip6_route_output(net, sk, fl6);
963 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
964
965 rcu_read_lock();
966 from = rt ? rcu_dereference(rt->from) : NULL;
967 err = ip6_route_get_saddr(net, from, &fl6->daddr,
968 sk ? inet6_sk(sk)->srcprefs : 0,
969 &fl6->saddr);
970 rcu_read_unlock();
971
972 if (err)
973 goto out_err_release;
974
975 /* If we had an erroneous initial result, pretend it
976 * never existed and let the SA-enabled version take
977 * over.
978 */
979 if (!had_dst && (*dst)->error) {
980 dst_release(*dst);
981 *dst = NULL;
982 }
983
984 if (fl6->flowi6_oif)
985 flags |= RT6_LOOKUP_F_IFACE;
986 }
987
988 if (!*dst)
989 *dst = ip6_route_output_flags(net, sk, fl6, flags);
990
991 err = (*dst)->error;
992 if (err)
993 goto out_err_release;
994
995 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
996 /*
997 	 * Here, if the dst entry we've looked up
998 	 * has a neighbour entry that is in the INCOMPLETE
999 	 * state and the src address from the flow is
1000 	 * marked as OPTIMISTIC, we release the found
1001 	 * dst entry and replace it with the
1002 	 * dst entry of the nexthop router.
1003 	 */
1004 rt = (struct rt6_info *) *dst;
1005 rcu_read_lock_bh();
1006 n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1007 rt6_nexthop(rt, &fl6->daddr));
1008 err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1009 rcu_read_unlock_bh();
1010
1011 if (err) {
1012 struct inet6_ifaddr *ifp;
1013 struct flowi6 fl_gw6;
1014 int redirect;
1015
1016 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1017 (*dst)->dev, 1);
1018
1019 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1020 if (ifp)
1021 in6_ifa_put(ifp);
1022
1023 if (redirect) {
1024 /*
1025 * We need to get the dst entry for the
1026 * default router instead
1027 */
1028 dst_release(*dst);
1029 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1030 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1031 *dst = ip6_route_output(net, sk, &fl_gw6);
1032 err = (*dst)->error;
1033 if (err)
1034 goto out_err_release;
1035 }
1036 }
1037 #endif
1038 if (ipv6_addr_v4mapped(&fl6->saddr) &&
1039 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1040 err = -EAFNOSUPPORT;
1041 goto out_err_release;
1042 }
1043
1044 return 0;
1045
1046 out_err_release:
1047 dst_release(*dst);
1048 *dst = NULL;
1049
1050 if (err == -ENETUNREACH)
1051 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1052 return err;
1053 }
1054
1055 /**
1056  *	ip6_dst_lookup - perform route lookup on flow
1057  *	@net: network namespace to perform the lookup in
1058  *	@sk: socket which provides route info
1059  *	@dst: pointer to dst_entry * for result
1060  *	@fl6: flow to lookup
1061  *
1062  *	This function performs a route lookup on the given flow.
1063  *	It returns zero on success, or a standard errno code on error.
1064  */
1065 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1066 struct flowi6 *fl6)
1067 {
1068 *dst = NULL;
1069 return ip6_dst_lookup_tail(net, sk, dst, fl6);
1070 }
1071 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1072
1073 /**
1074 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1075 * @sk: socket which provides route info
1076 * @fl6: flow to lookup
1077 * @final_dst: final destination address for ipsec lookup
1078 *
1079 * This function performs a route lookup on the given flow.
1080 *
1081 * It returns a valid dst pointer on success, or a pointer encoded
1082 * error code.
1083 */
1084 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1085 const struct in6_addr *final_dst)
1086 {
1087 struct dst_entry *dst = NULL;
1088 int err;
1089
1090 err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1091 if (err)
1092 return ERR_PTR(err);
1093 if (final_dst)
1094 fl6->daddr = *final_dst;
1095
1096 return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1097 }
1098 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1099
1100 /**
1101 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1102 * @sk: socket which provides the dst cache and route info
1103 * @fl6: flow to lookup
1104 * @final_dst: final destination address for ipsec lookup
1105 * @connected: whether @sk is connected or not
1106 *
1107 * This function performs a route lookup on the given flow with the
1108 * possibility of using the cached route in the socket if it is valid.
1109 * It will take the socket dst lock when operating on the dst cache.
1110 * As a result, this function can only be used in process context.
1111 *
1112 * In addition, for a connected socket, cache the dst in the socket
1113 * if the current cache is not valid.
1114 *
1115 * It returns a valid dst pointer on success, or a pointer encoded
1116 * error code.
1117 */
1118 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1119 const struct in6_addr *final_dst,
1120 bool connected)
1121 {
1122 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1123
1124 dst = ip6_sk_dst_check(sk, dst, fl6);
1125 if (dst)
1126 return dst;
1127
1128 dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1129 if (connected && !IS_ERR(dst))
1130 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1131
1132 return dst;
1133 }
1134 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1135
1136 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1137 gfp_t gfp)
1138 {
1139 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1140 }
1141
1142 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1143 gfp_t gfp)
1144 {
1145 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1146 }
1147
1148 static void ip6_append_data_mtu(unsigned int *mtu,
1149 int *maxfraglen,
1150 unsigned int fragheaderlen,
1151 struct sk_buff *skb,
1152 struct rt6_info *rt,
1153 unsigned int orig_mtu)
1154 {
1155 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1156 if (!skb) {
1157 /* first fragment, reserve header_len */
1158 *mtu = orig_mtu - rt->dst.header_len;
1159
1160 } else {
1161 /*
1162 			 * this fragment is not the first; the header
1163 			 * space is regarded as data space.
1164 */
1165 *mtu = orig_mtu;
1166 }
1167 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1168 + fragheaderlen - sizeof(struct frag_hdr);
1169 }
1170 }
1171
1172 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1173 struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1174 struct rt6_info *rt, struct flowi6 *fl6)
1175 {
1176 struct ipv6_pinfo *np = inet6_sk(sk);
1177 unsigned int mtu;
1178 struct ipv6_txoptions *opt = ipc6->opt;
1179
1180 /*
1181 * setup for corking
1182 */
1183 if (opt) {
1184 if (WARN_ON(v6_cork->opt))
1185 return -EINVAL;
1186
1187 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1188 if (unlikely(!v6_cork->opt))
1189 return -ENOBUFS;
1190
1191 v6_cork->opt->tot_len = sizeof(*opt);
1192 v6_cork->opt->opt_flen = opt->opt_flen;
1193 v6_cork->opt->opt_nflen = opt->opt_nflen;
1194
1195 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1196 sk->sk_allocation);
1197 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1198 return -ENOBUFS;
1199
1200 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1201 sk->sk_allocation);
1202 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1203 return -ENOBUFS;
1204
1205 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1206 sk->sk_allocation);
1207 if (opt->hopopt && !v6_cork->opt->hopopt)
1208 return -ENOBUFS;
1209
1210 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1211 sk->sk_allocation);
1212 if (opt->srcrt && !v6_cork->opt->srcrt)
1213 return -ENOBUFS;
1214
1215 		/* need source address above --miyazawa */
1216 }
1217 dst_hold(&rt->dst);
1218 cork->base.dst = &rt->dst;
1219 cork->fl.u.ip6 = *fl6;
1220 v6_cork->hop_limit = ipc6->hlimit;
1221 v6_cork->tclass = ipc6->tclass;
1222 if (rt->dst.flags & DST_XFRM_TUNNEL)
1223 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1224 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1225 else
1226 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1227 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1228 if (np->frag_size < mtu) {
1229 if (np->frag_size)
1230 mtu = np->frag_size;
1231 }
1232 if (mtu < IPV6_MIN_MTU)
1233 return -EINVAL;
1234 cork->base.fragsize = mtu;
1235 cork->base.gso_size = ipc6->gso_size;
1236 cork->base.tx_flags = 0;
1237 sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1238
1239 if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1240 cork->base.flags |= IPCORK_ALLFRAG;
1241 cork->base.length = 0;
1242
1243 cork->base.transmit_time = ipc6->sockc.transmit_time;
1244
1245 return 0;
1246 }
1247
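/*
 * Workhorse behind ip6_append_data() and ip6_make_skb(): appends
 * @length bytes pulled in through @getfrag onto the cork queue,
 * growing the tail skb, coalescing into page frags or zerocopy pages,
 * and opening a new fragment-sized skb whenever the current one hits
 * maxfraglen.  __ip6_make_skb() later stitches the queue into one
 * datagram.
 */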
1248 static int __ip6_append_data(struct sock *sk,
1249 struct flowi6 *fl6,
1250 struct sk_buff_head *queue,
1251 struct inet_cork *cork,
1252 struct inet6_cork *v6_cork,
1253 struct page_frag *pfrag,
1254 int getfrag(void *from, char *to, int offset,
1255 int len, int odd, struct sk_buff *skb),
1256 void *from, int length, int transhdrlen,
1257 unsigned int flags, struct ipcm6_cookie *ipc6)
1258 {
1259 struct sk_buff *skb, *skb_prev = NULL;
1260 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1261 struct ubuf_info *uarg = NULL;
1262 int exthdrlen = 0;
1263 int dst_exthdrlen = 0;
1264 int hh_len;
1265 int copy;
1266 int err;
1267 int offset = 0;
1268 u32 tskey = 0;
1269 struct rt6_info *rt = (struct rt6_info *)cork->dst;
1270 struct ipv6_txoptions *opt = v6_cork->opt;
1271 int csummode = CHECKSUM_NONE;
1272 unsigned int maxnonfragsize, headersize;
1273 unsigned int wmem_alloc_delta = 0;
1274 bool paged, extra_uref = false;
1275
1276 skb = skb_peek_tail(queue);
1277 if (!skb) {
1278 exthdrlen = opt ? opt->opt_flen : 0;
1279 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1280 }
1281
1282 paged = !!cork->gso_size;
1283 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1284 orig_mtu = mtu;
1285
1286 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1287 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1288 tskey = sk->sk_tskey++;
1289
1290 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1291
1292 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1293 (opt ? opt->opt_nflen : 0);
1294 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1295 sizeof(struct frag_hdr);
1296
1297 headersize = sizeof(struct ipv6hdr) +
1298 (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1299 (dst_allfrag(&rt->dst) ?
1300 sizeof(struct frag_hdr) : 0) +
1301 rt->rt6i_nfheader_len;
1302
1303 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1304 	 * in the first fragment
1305 */
1306 if (headersize + transhdrlen > mtu)
1307 goto emsgsize;
1308
1309 if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1310 (sk->sk_protocol == IPPROTO_UDP ||
1311 sk->sk_protocol == IPPROTO_RAW)) {
1312 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1313 sizeof(struct ipv6hdr));
1314 goto emsgsize;
1315 }
1316
1317 if (ip6_sk_ignore_df(sk))
1318 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1319 else
1320 maxnonfragsize = mtu;
1321
1322 if (cork->length + length > maxnonfragsize - headersize) {
1323 emsgsize:
1324 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1325 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1326 return -EMSGSIZE;
1327 }
1328
1329 /* CHECKSUM_PARTIAL only with no extension headers and when
1330 * we are not going to fragment
1331 */
1332 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1333 headersize == sizeof(struct ipv6hdr) &&
1334 length <= mtu - headersize &&
1335 (!(flags & MSG_MORE) || cork->gso_size) &&
1336 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1337 csummode = CHECKSUM_PARTIAL;
1338
1339 if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1340 uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
1341 if (!uarg)
1342 return -ENOBUFS;
1343 extra_uref = !skb; /* only extra ref if !MSG_MORE */
1344 if (rt->dst.dev->features & NETIF_F_SG &&
1345 csummode == CHECKSUM_PARTIAL) {
1346 paged = true;
1347 } else {
1348 uarg->zerocopy = 0;
1349 skb_zcopy_set(skb, uarg, &extra_uref);
1350 }
1351 }
1352
1353 /*
1354 * Let's try using as much space as possible.
1355 * Use MTU if total length of the message fits into the MTU.
1356 	 * Otherwise, we need to reserve a fragment header and
1357 	 * fragment alignment (= 8-15 octets, in total).
1358 	 *
1359 	 * Note that we may need to "move" the data from the tail
1360 	 * of the buffer to the new fragment when we split
1361 	 * the message.
1362 *
1363 * FIXME: It may be fragmented into multiple chunks
1364 * at once if non-fragmentable extension headers
1365 * are too large.
1366 * --yoshfuji
1367 */
1368
1369 cork->length += length;
1370 if (!skb)
1371 goto alloc_new_skb;
1372
1373 while (length > 0) {
1374 /* Check if the remaining data fits into current packet. */
1375 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1376 if (copy < length)
1377 copy = maxfraglen - skb->len;
1378
1379 if (copy <= 0) {
1380 char *data;
1381 unsigned int datalen;
1382 unsigned int fraglen;
1383 unsigned int fraggap;
1384 unsigned int alloclen;
1385 unsigned int pagedlen;
1386 alloc_new_skb:
1387 /* There's no room in the current skb */
1388 if (skb)
1389 fraggap = skb->len - maxfraglen;
1390 else
1391 fraggap = 0;
1392 /* update mtu and maxfraglen if necessary */
1393 if (!skb || !skb_prev)
1394 ip6_append_data_mtu(&mtu, &maxfraglen,
1395 fragheaderlen, skb, rt,
1396 orig_mtu);
1397
1398 skb_prev = skb;
1399
1400 /*
1401 * If remaining data exceeds the mtu,
1402 * we know we need more fragment(s).
1403 */
1404 datalen = length + fraggap;
1405
1406 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1407 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1408 fraglen = datalen + fragheaderlen;
1409 pagedlen = 0;
1410
1411 if ((flags & MSG_MORE) &&
1412 !(rt->dst.dev->features&NETIF_F_SG))
1413 alloclen = mtu;
1414 else if (!paged)
1415 alloclen = fraglen;
1416 else {
1417 alloclen = min_t(int, fraglen, MAX_HEADER);
1418 pagedlen = fraglen - alloclen;
1419 }
1420
1421 alloclen += dst_exthdrlen;
1422
1423 if (datalen != length + fraggap) {
1424 /*
1425 				 * this is not the last fragment; the trailer
1426 * space is regarded as data space.
1427 */
1428 datalen += rt->dst.trailer_len;
1429 }
1430
1431 alloclen += rt->dst.trailer_len;
1432 fraglen = datalen + fragheaderlen;
1433
1434 /*
1435 * We just reserve space for fragment header.
1436 * Note: this may be overallocation if the message
1437 * (without MSG_MORE) fits into the MTU.
1438 */
1439 alloclen += sizeof(struct frag_hdr);
1440
1441 copy = datalen - transhdrlen - fraggap - pagedlen;
1442 if (copy < 0) {
1443 err = -EINVAL;
1444 goto error;
1445 }
1446 if (transhdrlen) {
1447 skb = sock_alloc_send_skb(sk,
1448 alloclen + hh_len,
1449 (flags & MSG_DONTWAIT), &err);
1450 } else {
1451 skb = NULL;
1452 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1453 2 * sk->sk_sndbuf)
1454 skb = alloc_skb(alloclen + hh_len,
1455 sk->sk_allocation);
1456 if (unlikely(!skb))
1457 err = -ENOBUFS;
1458 }
1459 if (!skb)
1460 goto error;
1461 /*
1462 * Fill in the control structures
1463 */
1464 skb->protocol = htons(ETH_P_IPV6);
1465 skb->ip_summed = csummode;
1466 skb->csum = 0;
1467 /* reserve for fragmentation and ipsec header */
1468 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1469 dst_exthdrlen);
1470
1471 /*
1472 * Find where to start putting bytes
1473 */
1474 data = skb_put(skb, fraglen - pagedlen);
1475 skb_set_network_header(skb, exthdrlen);
1476 data += fragheaderlen;
1477 skb->transport_header = (skb->network_header +
1478 fragheaderlen);
1479 if (fraggap) {
1480 skb->csum = skb_copy_and_csum_bits(
1481 skb_prev, maxfraglen,
1482 data + transhdrlen, fraggap, 0);
1483 skb_prev->csum = csum_sub(skb_prev->csum,
1484 skb->csum);
1485 data += fraggap;
1486 pskb_trim_unique(skb_prev, maxfraglen);
1487 }
1488 if (copy > 0 &&
1489 getfrag(from, data + transhdrlen, offset,
1490 copy, fraggap, skb) < 0) {
1491 err = -EFAULT;
1492 kfree_skb(skb);
1493 goto error;
1494 }
1495
1496 offset += copy;
1497 length -= copy + transhdrlen;
1498 transhdrlen = 0;
1499 exthdrlen = 0;
1500 dst_exthdrlen = 0;
1501
1502 /* Only the initial fragment is time stamped */
1503 skb_shinfo(skb)->tx_flags = cork->tx_flags;
1504 cork->tx_flags = 0;
1505 skb_shinfo(skb)->tskey = tskey;
1506 tskey = 0;
1507 skb_zcopy_set(skb, uarg, &extra_uref);
1508
1509 if ((flags & MSG_CONFIRM) && !skb_prev)
1510 skb_set_dst_pending_confirm(skb, 1);
1511
1512 /*
1513 * Put the packet on the pending queue
1514 */
1515 if (!skb->destructor) {
1516 skb->destructor = sock_wfree;
1517 skb->sk = sk;
1518 wmem_alloc_delta += skb->truesize;
1519 }
1520 __skb_queue_tail(queue, skb);
1521 continue;
1522 }
1523
1524 if (copy > length)
1525 copy = length;
1526
1527 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1528 skb_tailroom(skb) >= copy) {
1529 unsigned int off;
1530
1531 off = skb->len;
1532 if (getfrag(from, skb_put(skb, copy),
1533 offset, copy, off, skb) < 0) {
1534 __skb_trim(skb, off);
1535 err = -EFAULT;
1536 goto error;
1537 }
1538 } else if (!uarg || !uarg->zerocopy) {
1539 int i = skb_shinfo(skb)->nr_frags;
1540
1541 err = -ENOMEM;
1542 if (!sk_page_frag_refill(sk, pfrag))
1543 goto error;
1544
1545 if (!skb_can_coalesce(skb, i, pfrag->page,
1546 pfrag->offset)) {
1547 err = -EMSGSIZE;
1548 if (i == MAX_SKB_FRAGS)
1549 goto error;
1550
1551 __skb_fill_page_desc(skb, i, pfrag->page,
1552 pfrag->offset, 0);
1553 skb_shinfo(skb)->nr_frags = ++i;
1554 get_page(pfrag->page);
1555 }
1556 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1557 if (getfrag(from,
1558 page_address(pfrag->page) + pfrag->offset,
1559 offset, copy, skb->len, skb) < 0)
1560 goto error_efault;
1561
1562 pfrag->offset += copy;
1563 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1564 skb->len += copy;
1565 skb->data_len += copy;
1566 skb->truesize += copy;
1567 wmem_alloc_delta += copy;
1568 } else {
1569 err = skb_zerocopy_iter_dgram(skb, from, copy);
1570 if (err < 0)
1571 goto error;
1572 }
1573 offset += copy;
1574 length -= copy;
1575 }
1576
1577 if (wmem_alloc_delta)
1578 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1579 return 0;
1580
1581 error_efault:
1582 err = -EFAULT;
1583 error:
1584 if (uarg)
1585 sock_zerocopy_put_abort(uarg, extra_uref);
1586 cork->length -= length;
1587 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1588 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1589 return err;
1590 }
1591
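/*
 * Corking API used by datagram-style sockets (UDP, raw, ICMPv6): the
 * first call on an empty write queue sets up the cork (options, route,
 * MTU); later calls keep appending until ip6_push_pending_frames() or
 * ip6_flush_pending_frames() resolves the queue.
 */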
1592 int ip6_append_data(struct sock *sk,
1593 int getfrag(void *from, char *to, int offset, int len,
1594 int odd, struct sk_buff *skb),
1595 void *from, int length, int transhdrlen,
1596 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1597 struct rt6_info *rt, unsigned int flags)
1598 {
1599 struct inet_sock *inet = inet_sk(sk);
1600 struct ipv6_pinfo *np = inet6_sk(sk);
1601 int exthdrlen;
1602 int err;
1603
1604 if (flags&MSG_PROBE)
1605 return 0;
1606 if (skb_queue_empty(&sk->sk_write_queue)) {
1607 /*
1608 * setup for corking
1609 */
1610 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1611 ipc6, rt, fl6);
1612 if (err)
1613 return err;
1614
1615 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1616 length += exthdrlen;
1617 transhdrlen += exthdrlen;
1618 } else {
1619 fl6 = &inet->cork.fl.u.ip6;
1620 transhdrlen = 0;
1621 }
1622
1623 return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1624 &np->cork, sk_page_frag(sk), getfrag,
1625 from, length, transhdrlen, flags, ipc6);
1626 }
1627 EXPORT_SYMBOL_GPL(ip6_append_data);
1628
1629 static void ip6_cork_release(struct inet_cork_full *cork,
1630 struct inet6_cork *v6_cork)
1631 {
1632 if (v6_cork->opt) {
1633 kfree(v6_cork->opt->dst0opt);
1634 kfree(v6_cork->opt->dst1opt);
1635 kfree(v6_cork->opt->hopopt);
1636 kfree(v6_cork->opt->srcrt);
1637 kfree(v6_cork->opt);
1638 v6_cork->opt = NULL;
1639 }
1640
1641 if (cork->base.dst) {
1642 dst_release(cork->base.dst);
1643 cork->base.dst = NULL;
1644 cork->base.flags &= ~IPCORK_ALLFRAG;
1645 }
1646 memset(&cork->fl, 0, sizeof(cork->fl));
1647 }
1648
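/*
 * Collapse the queued cork fragments into a single skb (head plus
 * frag_list), push the accumulated extension headers, and fill in the
 * final IPv6 header from the cork state.  The cork is released before
 * returning; the caller owns the resulting skb.
 */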
1649 struct sk_buff *__ip6_make_skb(struct sock *sk,
1650 struct sk_buff_head *queue,
1651 struct inet_cork_full *cork,
1652 struct inet6_cork *v6_cork)
1653 {
1654 struct sk_buff *skb, *tmp_skb;
1655 struct sk_buff **tail_skb;
1656 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1657 struct ipv6_pinfo *np = inet6_sk(sk);
1658 struct net *net = sock_net(sk);
1659 struct ipv6hdr *hdr;
1660 struct ipv6_txoptions *opt = v6_cork->opt;
1661 struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1662 struct flowi6 *fl6 = &cork->fl.u.ip6;
1663 unsigned char proto = fl6->flowi6_proto;
1664
1665 skb = __skb_dequeue(queue);
1666 if (!skb)
1667 goto out;
1668 tail_skb = &(skb_shinfo(skb)->frag_list);
1669
1670 /* move skb->data to ip header from ext header */
1671 if (skb->data < skb_network_header(skb))
1672 __skb_pull(skb, skb_network_offset(skb));
1673 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1674 __skb_pull(tmp_skb, skb_network_header_len(skb));
1675 *tail_skb = tmp_skb;
1676 tail_skb = &(tmp_skb->next);
1677 skb->len += tmp_skb->len;
1678 skb->data_len += tmp_skb->len;
1679 skb->truesize += tmp_skb->truesize;
1680 tmp_skb->destructor = NULL;
1681 tmp_skb->sk = NULL;
1682 }
1683
1684 /* Allow local fragmentation. */
1685 skb->ignore_df = ip6_sk_ignore_df(sk);
1686
1687 *final_dst = fl6->daddr;
1688 __skb_pull(skb, skb_network_header_len(skb));
1689 if (opt && opt->opt_flen)
1690 ipv6_push_frag_opts(skb, opt, &proto);
1691 if (opt && opt->opt_nflen)
1692 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1693
1694 skb_push(skb, sizeof(struct ipv6hdr));
1695 skb_reset_network_header(skb);
1696 hdr = ipv6_hdr(skb);
1697
1698 ip6_flow_hdr(hdr, v6_cork->tclass,
1699 ip6_make_flowlabel(net, skb, fl6->flowlabel,
1700 ip6_autoflowlabel(net, np), fl6));
1701 hdr->hop_limit = v6_cork->hop_limit;
1702 hdr->nexthdr = proto;
1703 hdr->saddr = fl6->saddr;
1704 hdr->daddr = *final_dst;
1705
1706 skb->priority = sk->sk_priority;
1707 skb->mark = sk->sk_mark;
1708
1709 skb->tstamp = cork->base.transmit_time;
1710
1711 skb_dst_set(skb, dst_clone(&rt->dst));
1712 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1713 if (proto == IPPROTO_ICMPV6) {
1714 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1715
1716 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1717 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1718 }
1719
1720 ip6_cork_release(cork, v6_cork);
1721 out:
1722 return skb;
1723 }
1724
1725 int ip6_send_skb(struct sk_buff *skb)
1726 {
1727 struct net *net = sock_net(skb->sk);
1728 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1729 int err;
1730
1731 err = ip6_local_out(net, skb->sk, skb);
1732 if (err) {
1733 if (err > 0)
1734 err = net_xmit_errno(err);
1735 if (err)
1736 IP6_INC_STATS(net, rt->rt6i_idev,
1737 IPSTATS_MIB_OUTDISCARDS);
1738 }
1739
1740 return err;
1741 }
1742
1743 int ip6_push_pending_frames(struct sock *sk)
1744 {
1745 struct sk_buff *skb;
1746
1747 skb = ip6_finish_skb(sk);
1748 if (!skb)
1749 return 0;
1750
1751 return ip6_send_skb(skb);
1752 }
1753 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1754
1755 static void __ip6_flush_pending_frames(struct sock *sk,
1756 struct sk_buff_head *queue,
1757 struct inet_cork_full *cork,
1758 struct inet6_cork *v6_cork)
1759 {
1760 struct sk_buff *skb;
1761
1762 while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1763 if (skb_dst(skb))
1764 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1765 IPSTATS_MIB_OUTDISCARDS);
1766 kfree_skb(skb);
1767 }
1768
1769 ip6_cork_release(cork, v6_cork);
1770 }
1771
1772 void ip6_flush_pending_frames(struct sock *sk)
1773 {
1774 __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1775 &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1776 }
1777 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1778
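/*
 * One-shot variant of the corking sequence: builds a private queue on
 * the caller-supplied cork, appends the whole payload, and returns the
 * finished skb (or an ERR_PTR) without touching sk->sk_write_queue,
 * which lets paths such as UDP transmit without the socket lock.
 */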
1779 struct sk_buff *ip6_make_skb(struct sock *sk,
1780 int getfrag(void *from, char *to, int offset,
1781 int len, int odd, struct sk_buff *skb),
1782 void *from, int length, int transhdrlen,
1783 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1784 struct rt6_info *rt, unsigned int flags,
1785 struct inet_cork_full *cork)
1786 {
1787 struct inet6_cork v6_cork;
1788 struct sk_buff_head queue;
1789 int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1790 int err;
1791
1792 if (flags & MSG_PROBE)
1793 return NULL;
1794
1795 __skb_queue_head_init(&queue);
1796
1797 cork->base.flags = 0;
1798 cork->base.addr = 0;
1799 cork->base.opt = NULL;
1800 cork->base.dst = NULL;
1801 v6_cork.opt = NULL;
1802 err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1803 if (err) {
1804 ip6_cork_release(cork, &v6_cork);
1805 return ERR_PTR(err);
1806 }
1807 if (ipc6->dontfrag < 0)
1808 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1809
1810 err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1811 &current->task_frag, getfrag, from,
1812 length + exthdrlen, transhdrlen + exthdrlen,
1813 flags, ipc6);
1814 if (err) {
1815 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1816 return ERR_PTR(err);
1817 }
1818
1819 return __ip6_make_skb(sk, &queue, cork, &v6_cork);
1820 }