net/ipv6/route.c
/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <trace/events/fib6.h>

#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct rt6_info *rt);
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

static void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->dst.flags |= DST_NOCACHE;
	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

static void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(rt->dst.from);
}

static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (rt->rt6i_flags & RTF_PCPU)
		return rt6_pcpu_cow_metrics(rt);
	else if (rt->rt6i_flags & RTF_CACHE)
		return NULL;
	else
		return dst_cow_metrics_generic(dst, old);
}

static inline const void *choose_neigh_daddr(struct rt6_info *rt,
					     struct sk_buff *skb,
					     const void *daddr)
{
	struct in6_addr *p = &rt->rt6i_gateway;

	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct neighbour *n;

	daddr = choose_neigh_daddr(rt, skb, daddr);
	n = __ipv6_neigh_lookup(dst->dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dst->dev);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= ipv6_cow_metrics,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}
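
/*
 * Illustration (a sketch, not part of this file): the "mtu ? : ..." above
 * uses the GNU C conditional with an omitted middle operand; it is
 * shorthand for the plain ternary, as in this hypothetical helper:
 *
 *	static unsigned int pick_mtu(unsigned int metric_mtu,
 *				     unsigned int dev_mtu)
 *	{
 *		return metric_mtu ? metric_mtu : dev_mtu;
 *	}
 */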

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
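
/*
 * Illustration (layout sketch, not kernel code): rt6_info_init() relies on
 * the dst_entry being the first member of struct rt6_info, so that
 * "memset(dst + 1, ...)" zeroes exactly the members that follow it:
 *
 *	struct example_rt {
 *		struct dst_entry dst;	// must stay the first member
 *		int a, b;		// zeroed by memset(&r->dst + 1, 0,
 *	};				//   sizeof(*r) - sizeof(r->dst))
 */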

/* allocate dst with ip6_dst_ops */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					0, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt)
		rt6_info_init(rt);

	return rt;
}

struct rt6_info *ip6_dst_alloc(struct net *net,
			       struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);

	if (rt) {
		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
		if (rt->rt6i_pcpu) {
			int cpu;

			for_each_possible_cpu(cpu) {
				struct rt6_info **p;

				p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
				/* no one shares rt */
				*p = NULL;
			}
		} else {
			dst_destroy((struct dst_entry *)rt);
			return NULL;
		}
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	dst->from = NULL;
	dst_release(from);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (dev != loopback_dev) {
		if (idev && idev->dev == dev) {
			struct inet6_dev *loopback_idev =
				in6_dev_get(loopback_dev);
			if (loopback_idev) {
				rt->rt6i_idev = loopback_idev;
				in6_dev_put(idev);
			}
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->dst.from) {
		return rt6_check_expired((struct rt6_info *) rt->dst.from);
	}
	return false;
}

/* Multipath route selection:
 *   Hash-based function using packet header and flowlabel.
 * Adapted from fib_info_hashfn()
 */
static int rt6_info_hash_nhsfn(unsigned int candidate_count,
			       const struct flowi6 *fl6)
{
	return get_hash_from_flowi6(fl6) % candidate_count;
}

static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
					     struct flowi6 *fl6, int oif,
					     int strict)
{
	struct rt6_info *sibling, *next_sibling;
	int route_chosen;

	route_chosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
	/* Don't change the route if route_chosen == 0
	 * (the siblings list does not include ourselves)
	 */
	if (route_chosen)
		list_for_each_entry_safe(sibling, next_sibling,
				&match->rt6i_siblings, rt6i_siblings) {
			route_chosen--;
			if (route_chosen == 0) {
				if (rt6_score_route(sibling, oif, strict) < 0)
					break;
				match = sibling;
				break;
			}
		}
	return match;
}
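
/*
 * Illustration (hypothetical numbers): with a match that has three
 * siblings, rt6_info_hash_nhsfn() reduces the flow hash modulo four;
 * index 0 keeps the current match and 1..3 walk the sibling list, so all
 * packets of one flow hash identically and stick to the same nexthop:
 *
 *	idx = get_hash_from_flowi6(fl6) % 4;	// e.g. hash 7 -> idx 3
 */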

/*
 *	Route lookup. Any table->tb6_lock is implied.
 */

static inline struct rt6_info *rt6_device_match(struct net *net,
						struct rt6_info *rt,
						const struct in6_addr *saddr,
						int oif,
						int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Probing here may not look appropriate at first, but we do need
	 * to check whether the router is really still reachable; this is
	 * Router Reachability Probing.
	 *
	 * Router Reachability Probes MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
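
/*
 * Illustration (minimal sketch, not kernel code): rt6_probe() cannot send
 * the neighbour solicitation directly because it runs in atomic context,
 * so it uses the standard deferred-work pattern:
 *
 *	w = kmalloc(sizeof(*w), GFP_ATOMIC);	// atomic ctx, no sleeping
 *	INIT_WORK(&w->work, handler);		// handler kfree()s w
 *	schedule_work(&w->work);		// runs later in process ctx
 */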

/*
 *	Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
{
	struct net_device *dev = rt->dst.dev;
	if (!oif || dev->ifindex == oif)
		return 2;
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
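
/*
 * Worked example (hypothetical flags): rt6_score_route() packs the device
 * match into bits 0-1 and, with CONFIG_IPV6_ROUTER_PREF, the decoded
 * router preference into bits 2-3, so the preference outranks the device
 * match when both routes pass the device check:
 *
 *	m = 2 | (1 << 2) = 6	// exact oif match, decoded preference 1
 */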

static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rt->dst.rt6_next)
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
{
	struct rt6_info *match, *rt0;
	struct net *net;
	bool do_rr = false;

	rt0 = fn->rr_ptr;
	if (!rt0)
		fn->rr_ptr = rt0 = fn->leaf;

	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rt0->dst.rt6_next;

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = fn->leaf;

		if (next != rt0)
			fn->rr_ptr = next;
	}

	net = dev_net(rt0->dst.dev);
	return match ? match : net->ipv6.ip6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = fn->parent;
		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
			fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = fn->leaf;
	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	dst_use(&rt->dst, jiffies);
	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);

	return rt;

}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   int flags)
{
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
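
/*
 * Usage sketch (illustrative; error handling elided): rt6_lookup() returns
 * a held route or NULL, and callers drop the reference with ip6_rt_put():
 *
 *	struct rt6_info *rt = rt6_lookup(net, &daddr, NULL, 0, 0);
 *
 *	if (rt) {
 *		// ... use rt->dst.dev, rt->rt6i_gateway ...
 *		ip6_rt_put(rt);
 *	}
 */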

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is freed. In any case, if the caller does not hold a reference to
 * it, it may be destroyed.
 */

static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
			struct mx6_config *mxc,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
	write_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct rt6_info *rt)
{
	struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	return __ip6_ins_rt(rt, &info, &mxc, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);

	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt;

	pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
				  rt->dst.dev, rt->dst.flags);

	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with read_lock_bh(&tb6_lock) acquired */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt) {
		dst_hold(&pcpu_rt->dst);
		rt6_dst_from_metrics_check(pcpu_rt);
	}
	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct fib6_table *table = rt->rt6i_table;
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	read_lock_bh(&table->tb6_lock);
	if (rt->rt6i_pcpu) {
		p = this_cpu_ptr(rt->rt6i_pcpu);
		prev = cmpxchg(p, NULL, pcpu_rt);
		if (prev) {
			/* If someone did it before us, return prev instead */
			dst_destroy(&pcpu_rt->dst);
			pcpu_rt = prev;
		}
	} else {
		/* rt has been removed from the fib6 tree
		 * before we have a chance to acquire the read_lock.
		 * In this case, don't bother to create a pcpu rt
		 * since rt is going away anyway.  The next
		 * dst_check() will trigger a re-lookup.
		 */
		dst_destroy(&pcpu_rt->dst);
		pcpu_rt = rt;
	}
	dst_hold(&pcpu_rt->dst);
	rt6_dst_from_metrics_check(pcpu_rt);
	read_unlock_bh(&table->tb6_lock);
	return pcpu_rt;
}
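
/*
 * Illustration (generic sketch): the cmpxchg() above is the usual
 * lock-free publish-once idiom; the winner installs its copy, a loser
 * frees its own and adopts the winner's:
 *
 *	prev = cmpxchg(p, NULL, mine);	// returns the old *p
 *	if (prev) {			// somebody beat us to it
 *		free(mine);
 *		use(prev);
 *	}
 */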

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}


	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		rt6_dst_from_metrics_check(rt);

		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look up the route here.
		 */

		struct rt6_info *uncached_rt;

		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt)
			rt6_uncached_list_add(uncached_rt);
		else
			uncached_rt = net->ipv6.ip6_null_entry;

		dst_hold(&uncached_rt->dst);

		trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		rt->dst.lastuse = jiffies;
		rt->dst.__use++;
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (pcpu_rt) {
			read_unlock_bh(&table->tb6_lock);
		} else {
			/* We have to do the read_unlock first
			 * because rt6_make_pcpu_route() may trigger
			 * ip6_dst_gc() which will take the write_lock.
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			pcpu_rt = rt6_make_pcpu_route(rt);
			dst_release(&rt->dst);
		}

		trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
		return pcpu_rt;

	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6, int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
	skb_dst_drop(skb);
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
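
/*
 * Usage sketch (illustrative; assumes the ip6_route_output() inline
 * wrapper from include/net/ip6_route.h): output routing goes through the
 * function above, e.g.:
 *
 *	struct flowi6 fl6 = { .daddr = *daddr, .flowi6_oif = oif };
 *	struct dst_entry *dst = ip6_route_output(net, sk, &fl6);
 *
 *	if (dst->error) {
 *		err = dst->error;
 *		dst_release(dst);
 *		return err;
 *	}
 */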

struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
	if (rt) {
		rt6_info_init(rt);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);
		rt->rt6i_idev = ort->rt6i_idev;
		if (rt->rt6i_idev)
			in6_dev_hold(rt->rt6i_idev);

		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif

		dst_free(new);
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}

/*
 *	Destination cache support functions
 */

static void rt6_dst_from_metrics_check(struct rt6_info *rt)
{
	if (rt->dst.from &&
	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
}

static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
{
	if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
		return &rt->dst;
	else
		return NULL;
}

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *) dst;

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	rt6_dst_from_metrics_check(rt);

	if (rt->rt6i_flags & RTF_PCPU ||
	    (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
		return rt6_dst_from_check(rt, cookie);
	else
		return rt6_check(rt, cookie);
}

static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (rt6_check_expired(rt)) {
				ip6_del_rt(rt);
				dst = NULL;
			}
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			dst_hold(&rt->dst);
			ip6_del_rt(rt);
		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
			rt->rt6i_node->fn_sernum = -1;
		}
	}
}

static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	rt->rt6i_flags |= RTF_MODIFIED;
	rt->rt6i_pmtu = mtu;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
}

static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
	} else if (daddr) {
		struct rt6_info *nrt6;

		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);

			/* ip6_ins_rt(nrt6) will bump the
			 * rt6->rt6i_node->fn_sernum
			 * which will fail the next rt6_check() and
			 * invalidate the sk->sk_dst_cache.
			 */
			ip6_ins_rt(nrt6);
		}
	}
}

static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}

void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);

/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};

static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt6_check_expired(rt))
			continue;
		if (rt->dst.error)
			break;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
			continue;
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
			continue;
		break;
	}

	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	dst_hold(&rt->dst);

	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
	return rt;
};

static struct dst_entry *ip6_route_redirect(struct net *net,
					    const struct flowi6 *fl6,
					    const struct in6_addr *gateway)
{
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.fl6 = *fl6;
	rdfl.gateway = *gateway;

	return fib6_rule_lookup(net, &rdfl.fl6,
				flags, __ip6_route_redirect);
}

void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);

void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	fl6.saddr = iph->daddr;
	fl6.flowi6_uid = sock_net_uid(net, NULL);

	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}

void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);

static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;

	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}
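
/*
 * Worked example (hypothetical MTU): for a standard Ethernet MTU of 1500,
 *
 *	advmss = 1500 - sizeof(struct ipv6hdr) - sizeof(struct tcphdr)
 *	       = 1500 - 40 - 20 = 1440,
 *
 * clamped from below by ip6_rt_min_advmss and capped at IPV6_MAXPLEN.
 */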

static unsigned int ip6_mtu(const struct dst_entry *dst)
{
	const struct rt6_info *rt = (const struct rt6_info *)dst;
	unsigned int mtu = rt->rt6i_pmtu;
	struct inet6_dev *idev;

	if (mtu)
		goto out;

	mtu = dst_metric_raw(dst, RTAX_MTU);
	if (mtu)
		goto out;

	mtu = IPV6_MIN_MTU;

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

out:
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static struct dst_entry *icmp6_dst_gc_list;
static DEFINE_SPINLOCK(icmp6_dst_lock);

struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.output = ip6_output;
	atomic_set(&rt->dst.__refcnt, 1);
	rt->rt6i_gateway = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	spin_lock_bh(&icmp6_dst_lock);
	rt->dst.next = icmp6_dst_gc_list;
	icmp6_dst_gc_list = &rt->dst;
	spin_unlock_bh(&icmp6_dst_lock);

	fib6_force_start_gc(net);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}

int icmp6_dst_gc(void)
{
	struct dst_entry *dst, **pprev;
	int more = 0;

	spin_lock_bh(&icmp6_dst_lock);
	pprev = &icmp6_dst_gc_list;

	while ((dst = *pprev) != NULL) {
		if (!atomic_read(&dst->__refcnt)) {
			*pprev = dst->next;
			dst_free(dst);
		} else {
			pprev = &dst->next;
			++more;
		}
	}

	spin_unlock_bh(&icmp6_dst_lock);

	return more;
}

static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
			    void *arg)
{
	struct dst_entry *dst, **pprev;

	spin_lock_bh(&icmp6_dst_lock);
	pprev = &icmp6_dst_gc_list;
	while ((dst = *pprev) != NULL) {
		struct rt6_info *rt = (struct rt6_info *) dst;
		if (func(rt, arg)) {
			*pprev = dst->next;
			dst_free(dst);
		} else {
			pprev = &dst->next;
		}
	}
	spin_unlock_bh(&icmp6_dst_lock);
}

static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout >> 1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire >> rt_elasticity;
	return entries > rt_max_size;
}
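
/*
 * Worked example (assuming the default gc_elasticity of 9): the closing
 * statement decays the adaptive ip6_rt_gc_expire value by 1/512 of itself
 * on every invocation,
 *
 *	expire -= expire >> 9;		// i.e. expire *= 511/512
 *
 * while the "expire++" above nudges it up each time a forced GC pass runs.
 */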

static int ip6_convert_metrics(struct mx6_config *mxc,
			       const struct fib6_config *cfg)
{
	bool ecn_ca = false;
	struct nlattr *nla;
	int remaining;
	u32 *mp;

	if (!cfg->fc_mx)
		return 0;

	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
	if (unlikely(!mp))
		return -ENOMEM;

	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
		int type = nla_type(nla);
		u32 val;

		if (!type)
			continue;
		if (unlikely(type > RTAX_MAX))
			goto err;

		if (type == RTAX_CC_ALGO) {
			char tmp[TCP_CA_NAME_MAX];

			nla_strlcpy(tmp, nla, sizeof(tmp));
			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
			if (val == TCP_CA_UNSPEC)
				goto err;
		} else {
			val = nla_get_u32(nla);
		}
		if (type == RTAX_HOPLIMIT && val > 255)
			val = 255;
		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
			goto err;

		mp[type - 1] = val;
		__set_bit(type - 1, mxc->mx_valid);
	}

	if (ecn_ca) {
		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
	}

	mxc->mx = mp;
	return 0;
err:
	kfree(mp);
	return -EINVAL;
}
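
/*
 * Illustration (sketch of the netlink layout): cfg->fc_mx points at the
 * payload of an RTA_METRICS attribute, and nla_for_each_attr() walks the
 * nested RTAX_* attributes inside it; e.g. a hop limit of 64 arrives as
 *
 *	RTA_METRICS
 *	  RTAX_HOPLIMIT (u32) = 64	-> mp[RTAX_HOPLIMIT - 1] = 64
 */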

static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;
	int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;

	table = fib6_get_table(net, cfg->fc_table);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}

static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	rt = ip6_dst_alloc(net, NULL,
			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);

	if (!rt) {
		err = -ENOMEM;
		goto out;
	}

	if (cfg->fc_flags & RTF_EXPIRES)
		rt6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		rt6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate);
		if (err)
			goto out;
		rt->dst.lwtstate = lwtstate_get(lwtstate);
		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_output = rt->dst.output;
			rt->dst.output = lwtunnel_output;
		}
		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_input = rt->dst.input;
			rt->dst.input = lwtunnel_input;
		}
	}

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;

	/* We cannot add true routes via loopback here; they would
	 * result in kernel looping. Promote them to reject routes instead.
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		switch (cfg->fc_type) {
		case RTN_BLACKHOLE:
			rt->dst.error = -EINVAL;
			rt->dst.output = dst_discard_out;
			rt->dst.input = dst_discard;
			break;
		case RTN_PROHIBIT:
			rt->dst.error = -EACCES;
			rt->dst.output = ip6_pkt_prohibit_out;
			rt->dst.input = ip6_pkt_prohibit;
			break;
		case RTN_THROW:
		case RTN_UNREACHABLE:
		default:
			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
					: (cfg->fc_type == RTN_UNREACHABLE)
					? -EHOSTUNREACH : -ENETUNREACH;
			rt->dst.output = ip6_pkt_discard_out;
			rt->dst.input = ip6_pkt_discard;
			break;
		}
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		const struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;
		gwa_type = ipv6_addr_type(gw_addr);

		/* if gw_addr is local we will fail to detect this in case
		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
		 * will return already-added prefix route via interface that
		 * prefix route was assigned to, which might be non-loopback.
		 */
		err = -EINVAL;
		if (ipv6_chk_addr_and_flags(net, gw_addr,
					    gwa_type & IPV6_ADDR_LINKLOCAL ?
					    dev : NULL, 0, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}
		rt->rt6i_gateway = *gw_addr;

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt = NULL;

			/* IPv6 strictly forbids using non-link-local
			   addresses as nexthop addresses.
			   Otherwise, a router would not be able to send
			   redirects. That is mostly good, but in some (rare!)
			   circumstances (SIT, PtP, NBMA NOARP links) it is
			   handy to allow some exceptions. --ANK
			   We allow IPv4-mapped nexthops to support
			   RFC 4798-style addressing
			 */
2047 if (!(gwa_type & (IPV6_ADDR_UNICAST |
2048 IPV6_ADDR_MAPPED))) {
2049 NL_SET_ERR_MSG(extack,
2050 "Invalid gateway address");
2051 goto out;
2052 }
2053
2054 if (cfg->fc_table) {
2055 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2056
2057 if (grt) {
2058 if (grt->rt6i_flags & RTF_GATEWAY ||
2059 (dev && dev != grt->dst.dev)) {
2060 ip6_rt_put(grt);
2061 grt = NULL;
2062 }
2063 }
2064 }
2065
2066 if (!grt)
2067 grt = rt6_lookup(net, gw_addr, NULL,
2068 cfg->fc_ifindex, 1);
2069
2070 err = -EHOSTUNREACH;
2071 if (!grt)
2072 goto out;
2073 if (dev) {
2074 if (dev != grt->dst.dev) {
2075 ip6_rt_put(grt);
2076 goto out;
2077 }
2078 } else {
2079 dev = grt->dst.dev;
2080 idev = grt->rt6i_idev;
2081 dev_hold(dev);
2082 in6_dev_hold(grt->rt6i_idev);
2083 }
2084 if (!(grt->rt6i_flags & RTF_GATEWAY))
2085 err = 0;
2086 ip6_rt_put(grt);
2087
2088 if (err)
2089 goto out;
2090 }
2091 err = -EINVAL;
2092 if (!dev) {
2093 NL_SET_ERR_MSG(extack, "Egress device not specified");
2094 goto out;
2095 } else if (dev->flags & IFF_LOOPBACK) {
2096 NL_SET_ERR_MSG(extack,
2097 "Egress device can not be loopback device for this route");
2098 goto out;
2099 }
2100 }
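/* To summarize the gateway checks above: a non-link-local gateway must
 * resolve to a directly connected (non-RTF_GATEWAY) route, and any
 * gateway route needs an egress device that is neither missing nor
 * loopback.
 */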
2101
2102 err = -ENODEV;
2103 if (!dev)
2104 goto out;
2105
2106 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2107 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2108 NL_SET_ERR_MSG(extack, "Invalid source address");
2109 err = -EINVAL;
2110 goto out;
2111 }
2112 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2113 rt->rt6i_prefsrc.plen = 128;
2114 } else
2115 rt->rt6i_prefsrc.plen = 0;
2116
2117 rt->rt6i_flags = cfg->fc_flags;
2118
2119 install_route:
2120 rt->dst.dev = dev;
2121 rt->rt6i_idev = idev;
2122 rt->rt6i_table = table;
2123
2124 cfg->fc_nlinfo.nl_net = dev_net(dev);
2125
2126 return rt;
2127 out:
2128 if (dev)
2129 dev_put(dev);
2130 if (idev)
2131 in6_dev_put(idev);
2132 if (rt)
2133 dst_free(&rt->dst);
2134
2135 return ERR_PTR(err);
2136 }
2137
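/* ip6_route_add(): create the route with ip6_route_info_create(), convert
 * any user-supplied metrics into an mx6_config, then insert the result via
 * __ip6_ins_rt(). On failure the not-yet-inserted rt is released with
 * dst_free().
 */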
2138 int ip6_route_add(struct fib6_config *cfg,
2139 struct netlink_ext_ack *extack)
2140 {
2141 struct mx6_config mxc = { .mx = NULL, };
2142 struct rt6_info *rt;
2143 int err;
2144
2145 rt = ip6_route_info_create(cfg, extack);
2146 if (IS_ERR(rt)) {
2147 err = PTR_ERR(rt);
2148 rt = NULL;
2149 goto out;
2150 }
2151
2152 err = ip6_convert_metrics(&mxc, cfg);
2153 if (err)
2154 goto out;
2155
2156 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2157
2158 kfree(mxc.mx);
2159
2160 return err;
2161 out:
2162 if (rt)
2163 dst_free(&rt->dst);
2164
2165 return err;
2166 }
2167
2168 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2169 {
2170 int err;
2171 struct fib6_table *table;
2172 struct net *net = dev_net(rt->dst.dev);
2173
2174 if (rt == net->ipv6.ip6_null_entry ||
2175 rt->dst.flags & DST_NOCACHE) {
2176 err = -ENOENT;
2177 goto out;
2178 }
2179
2180 table = rt->rt6i_table;
2181 write_lock_bh(&table->tb6_lock);
2182 err = fib6_del(rt, info);
2183 write_unlock_bh(&table->tb6_lock);
2184
2185 out:
2186 ip6_rt_put(rt);
2187 return err;
2188 }
2189
2190 int ip6_del_rt(struct rt6_info *rt)
2191 {
2192 struct nl_info info = {
2193 .nl_net = dev_net(rt->dst.dev),
2194 };
2195 return __ip6_del_rt(rt, &info);
2196 }
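/* Note: __ip6_del_rt() and ip6_del_rt() consume the caller's reference on
 * rt (ip6_rt_put() is called on every path), so callers must hold a
 * reference before calling.
 */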
2197
2198 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2199 {
2200 struct nl_info *info = &cfg->fc_nlinfo;
2201 struct net *net = info->nl_net;
2202 struct sk_buff *skb = NULL;
2203 struct fib6_table *table;
2204 int err = -ENOENT;
2205
2206 if (rt == net->ipv6.ip6_null_entry)
2207 goto out_put;
2208 table = rt->rt6i_table;
2209 write_lock_bh(&table->tb6_lock);
2210
2211 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2212 struct rt6_info *sibling, *next_sibling;
2213
2214 /* prefer to send a single notification with all hops */
2215 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2216 if (skb) {
2217 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2218
2219 if (rt6_fill_node(net, skb, rt,
2220 NULL, NULL, 0, RTM_DELROUTE,
2221 info->portid, seq, 0) < 0) {
2222 kfree_skb(skb);
2223 skb = NULL;
2224 } else
2225 info->skip_notify = 1;
2226 }
2227
2228 list_for_each_entry_safe(sibling, next_sibling,
2229 &rt->rt6i_siblings,
2230 rt6i_siblings) {
2231 err = fib6_del(sibling, info);
2232 if (err)
2233 goto out_unlock;
2234 }
2235 }
2236
2237 err = fib6_del(rt, info);
2238 out_unlock:
2239 write_unlock_bh(&table->tb6_lock);
2240 out_put:
2241 ip6_rt_put(rt);
2242
2243 if (skb) {
2244 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2245 info->nlh, gfp_any());
2246 }
2247 return err;
2248 }
2249
2250 static int ip6_route_del(struct fib6_config *cfg,
2251 struct netlink_ext_ack *extack)
2252 {
2253 struct fib6_table *table;
2254 struct fib6_node *fn;
2255 struct rt6_info *rt;
2256 int err = -ESRCH;
2257
2258 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2259 if (!table) {
2260 NL_SET_ERR_MSG(extack, "FIB table does not exist");
2261 return err;
2262 }
2263
2264 read_lock_bh(&table->tb6_lock);
2265
2266 fn = fib6_locate(&table->tb6_root,
2267 &cfg->fc_dst, cfg->fc_dst_len,
2268 &cfg->fc_src, cfg->fc_src_len);
2269
2270 if (fn) {
2271 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2272 if ((rt->rt6i_flags & RTF_CACHE) &&
2273 !(cfg->fc_flags & RTF_CACHE))
2274 continue;
2275 if (cfg->fc_ifindex &&
2276 (!rt->dst.dev ||
2277 rt->dst.dev->ifindex != cfg->fc_ifindex))
2278 continue;
2279 if (cfg->fc_flags & RTF_GATEWAY &&
2280 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2281 continue;
2282 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2283 continue;
2284 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2285 continue;
2286 dst_hold(&rt->dst);
2287 read_unlock_bh(&table->tb6_lock);
2288
2289 /* if a gateway was specified, only delete that one hop */
2290 if (cfg->fc_flags & RTF_GATEWAY)
2291 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2292
2293 return __ip6_del_rt_siblings(rt, cfg);
2294 }
2295 }
2296 read_unlock_bh(&table->tb6_lock);
2297
2298 return err;
2299 }
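/* Deletion matching above: the prefix is located with fib6_locate(), then
 * a leaf must also match whichever of ifindex, gateway, metric and
 * protocol the request supplied; cached clones (RTF_CACHE) are skipped
 * unless the request itself carries RTF_CACHE.
 */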
2300
2301 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2302 {
2303 struct netevent_redirect netevent;
2304 struct rt6_info *rt, *nrt = NULL;
2305 struct ndisc_options ndopts;
2306 struct inet6_dev *in6_dev;
2307 struct neighbour *neigh;
2308 struct rd_msg *msg;
2309 int optlen, on_link;
2310 u8 *lladdr;
2311
2312 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2313 optlen -= sizeof(*msg);
2314
2315 if (optlen < 0) {
2316 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2317 return;
2318 }
2319
2320 msg = (struct rd_msg *)icmp6_hdr(skb);
2321
2322 if (ipv6_addr_is_multicast(&msg->dest)) {
2323 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2324 return;
2325 }
2326
2327 on_link = 0;
2328 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2329 on_link = 1;
2330 } else if (ipv6_addr_type(&msg->target) !=
2331 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2332 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2333 return;
2334 }
2335
2336 in6_dev = __in6_dev_get(skb->dev);
2337 if (!in6_dev)
2338 return;
2339 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2340 return;
2341
2342 /* RFC2461 8.1:
2343 * The IP source address of the Redirect MUST be the same as the current
2344 * first-hop router for the specified ICMP Destination Address.
2345 */
2346
2347 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2348 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2349 return;
2350 }
2351
2352 lladdr = NULL;
2353 if (ndopts.nd_opts_tgt_lladdr) {
2354 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2355 skb->dev);
2356 if (!lladdr) {
2357 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2358 return;
2359 }
2360 }
2361
2362 rt = (struct rt6_info *) dst;
2363 if (rt->rt6i_flags & RTF_REJECT) {
2364 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2365 return;
2366 }
2367
2368 /* Redirect received -> path was valid.
2369 * Redirects are sent only in response to data packets,
2370 * so this nexthop is apparently reachable. --ANK
2371 */
2372 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
2373
2374 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2375 if (!neigh)
2376 return;
2377
2378 /*
2379 * We have finally decided to accept it.
2380 */
2381
2382 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2383 NEIGH_UPDATE_F_WEAK_OVERRIDE|
2384 NEIGH_UPDATE_F_OVERRIDE|
2385 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2386 NEIGH_UPDATE_F_ISROUTER)),
2387 NDISC_REDIRECT, &ndopts);
2388
2389 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2390 if (!nrt)
2391 goto out;
2392
2393 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2394 if (on_link)
2395 nrt->rt6i_flags &= ~RTF_GATEWAY;
2396
2397 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2398
2399 if (ip6_ins_rt(nrt))
2400 goto out;
2401
2402 netevent.old = &rt->dst;
2403 netevent.new = &nrt->dst;
2404 netevent.daddr = &msg->dest;
2405 netevent.neigh = neigh;
2406 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2407
2408 if (rt->rt6i_flags & RTF_CACHE) {
2409 rt = (struct rt6_info *) dst_clone(&rt->dst);
2410 ip6_del_rt(rt);
2411 }
2412
2413 out:
2414 neigh_release(neigh);
2415 }
2416
2417 /*
2418 * Misc support functions
2419 */
2420
2421 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2422 {
2423 BUG_ON(from->dst.from);
2424
2425 rt->rt6i_flags &= ~RTF_EXPIRES;
2426 dst_hold(&from->dst);
2427 rt->dst.from = &from->dst;
2428 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2429 }
2430
2431 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2432 {
2433 rt->dst.input = ort->dst.input;
2434 rt->dst.output = ort->dst.output;
2435 rt->rt6i_dst = ort->rt6i_dst;
2436 rt->dst.error = ort->dst.error;
2437 rt->rt6i_idev = ort->rt6i_idev;
2438 if (rt->rt6i_idev)
2439 in6_dev_hold(rt->rt6i_idev);
2440 rt->dst.lastuse = jiffies;
2441 rt->rt6i_gateway = ort->rt6i_gateway;
2442 rt->rt6i_flags = ort->rt6i_flags;
2443 rt6_set_from(rt, ort);
2444 rt->rt6i_metric = ort->rt6i_metric;
2445 #ifdef CONFIG_IPV6_SUBTREES
2446 rt->rt6i_src = ort->rt6i_src;
2447 #endif
2448 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2449 rt->rt6i_table = ort->rt6i_table;
2450 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2451 }
2452
2453 #ifdef CONFIG_IPV6_ROUTE_INFO
2454 static struct rt6_info *rt6_get_route_info(struct net *net,
2455 const struct in6_addr *prefix, int prefixlen,
2456 const struct in6_addr *gwaddr,
2457 struct net_device *dev)
2458 {
2459 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
2460 int ifindex = dev->ifindex;
2461 struct fib6_node *fn;
2462 struct rt6_info *rt = NULL;
2463 struct fib6_table *table;
2464
2465 table = fib6_get_table(net, tb_id);
2466 if (!table)
2467 return NULL;
2468
2469 read_lock_bh(&table->tb6_lock);
2470 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2471 if (!fn)
2472 goto out;
2473
2474 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2475 if (rt->dst.dev->ifindex != ifindex)
2476 continue;
2477 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2478 continue;
2479 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2480 continue;
2481 dst_hold(&rt->dst);
2482 break;
2483 }
2484 out:
2485 read_unlock_bh(&table->tb6_lock);
2486 return rt;
2487 }
2488
2489 static struct rt6_info *rt6_add_route_info(struct net *net,
2490 const struct in6_addr *prefix, int prefixlen,
2491 const struct in6_addr *gwaddr,
2492 struct net_device *dev,
2493 unsigned int pref)
2494 {
2495 struct fib6_config cfg = {
2496 .fc_metric = IP6_RT_PRIO_USER,
2497 .fc_ifindex = dev->ifindex,
2498 .fc_dst_len = prefixlen,
2499 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2500 RTF_UP | RTF_PREF(pref),
2501 .fc_nlinfo.portid = 0,
2502 .fc_nlinfo.nlh = NULL,
2503 .fc_nlinfo.nl_net = net,
2504 };
2505
2506 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
2507 cfg.fc_dst = *prefix;
2508 cfg.fc_gateway = *gwaddr;
2509
2510 /* We should treat it as a default route if prefix length is 0. */
2511 if (!prefixlen)
2512 cfg.fc_flags |= RTF_DEFAULT;
2513
2514 ip6_route_add(&cfg, NULL);
2515
2516 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
2517 }
2518 #endif
2519
2520 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2521 {
2522 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
2523 struct rt6_info *rt;
2524 struct fib6_table *table;
2525
2526 table = fib6_get_table(dev_net(dev), tb_id);
2527 if (!table)
2528 return NULL;
2529
2530 read_lock_bh(&table->tb6_lock);
2531 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2532 if (dev == rt->dst.dev &&
2533 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2534 ipv6_addr_equal(&rt->rt6i_gateway, addr))
2535 break;
2536 }
2537 if (rt)
2538 dst_hold(&rt->dst);
2539 read_unlock_bh(&table->tb6_lock);
2540 return rt;
2541 }
2542
2543 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2544 struct net_device *dev,
2545 unsigned int pref)
2546 {
2547 struct fib6_config cfg = {
2548 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2549 .fc_metric = IP6_RT_PRIO_USER,
2550 .fc_ifindex = dev->ifindex,
2551 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2552 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2553 .fc_nlinfo.portid = 0,
2554 .fc_nlinfo.nlh = NULL,
2555 .fc_nlinfo.nl_net = dev_net(dev),
2556 };
2557
2558 cfg.fc_gateway = *gwaddr;
2559
2560 if (!ip6_route_add(&cfg, NULL)) {
2561 struct fib6_table *table;
2562
2563 table = fib6_get_table(dev_net(dev), cfg.fc_table);
2564 if (table)
2565 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
2566 }
2567
2568 return rt6_get_dflt_router(gwaddr, dev);
2569 }
2570
2571 static void __rt6_purge_dflt_routers(struct fib6_table *table)
2572 {
2573 struct rt6_info *rt;
2574
2575 restart:
2576 read_lock_bh(&table->tb6_lock);
2577 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2578 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2579 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2580 dst_hold(&rt->dst);
2581 read_unlock_bh(&table->tb6_lock);
2582 ip6_del_rt(rt);
2583 goto restart;
2584 }
2585 }
2586 read_unlock_bh(&table->tb6_lock);
2587
2588 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
2589 }
2590
2591 void rt6_purge_dflt_routers(struct net *net)
2592 {
2593 struct fib6_table *table;
2594 struct hlist_head *head;
2595 unsigned int h;
2596
2597 rcu_read_lock();
2598
2599 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2600 head = &net->ipv6.fib_table_hash[h];
2601 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2602 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
2603 __rt6_purge_dflt_routers(table);
2604 }
2605 }
2606
2607 rcu_read_unlock();
2608 }
2609
2610 static void rtmsg_to_fib6_config(struct net *net,
2611 struct in6_rtmsg *rtmsg,
2612 struct fib6_config *cfg)
2613 {
2614 memset(cfg, 0, sizeof(*cfg));
2615
2616 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2617 : RT6_TABLE_MAIN;
2618 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2619 cfg->fc_metric = rtmsg->rtmsg_metric;
2620 cfg->fc_expires = rtmsg->rtmsg_info;
2621 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2622 cfg->fc_src_len = rtmsg->rtmsg_src_len;
2623 cfg->fc_flags = rtmsg->rtmsg_flags;
2624
2625 cfg->fc_nlinfo.nl_net = net;
2626
2627 cfg->fc_dst = rtmsg->rtmsg_dst;
2628 cfg->fc_src = rtmsg->rtmsg_src;
2629 cfg->fc_gateway = rtmsg->rtmsg_gateway;
2630 }
2631
2632 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2633 {
2634 struct fib6_config cfg;
2635 struct in6_rtmsg rtmsg;
2636 int err;
2637
2638 switch (cmd) {
2639 case SIOCADDRT: /* Add a route */
2640 case SIOCDELRT: /* Delete a route */
2641 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2642 return -EPERM;
2643 err = copy_from_user(&rtmsg, arg,
2644 sizeof(struct in6_rtmsg));
2645 if (err)
2646 return -EFAULT;
2647
2648 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2649
2650 rtnl_lock();
2651 switch (cmd) {
2652 case SIOCADDRT:
2653 err = ip6_route_add(&cfg, NULL);
2654 break;
2655 case SIOCDELRT:
2656 err = ip6_route_del(&cfg, NULL);
2657 break;
2658 default:
2659 err = -EINVAL;
2660 }
2661 rtnl_unlock();
2662
2663 return err;
2664 }
2665
2666 return -EINVAL;
2667 }
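/* Illustrative userspace sketch (not part of this file): the legacy
 * SIOCADDRT/SIOCDELRT path above is reached via an ioctl on any AF_INET6
 * socket, e.g.:
 *
 *	struct in6_rtmsg rtmsg = {
 *		.rtmsg_dst_len	= 64,	// hypothetical prefix length
 *		.rtmsg_ifindex	= 2,	// hypothetical device index
 *		.rtmsg_flags	= RTF_UP,
 *	};
 *	int fd = socket(AF_INET6, SOCK_DGRAM, 0);
 *	ioctl(fd, SIOCADDRT, &rtmsg);	// lands in ip6_route_add()
 */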
2668
2669 /*
2670 * Drop the packet on the floor
2671 */
2672
2673 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2674 {
2675 int type;
2676 struct dst_entry *dst = skb_dst(skb);
2677 switch (ipstats_mib_noroutes) {
2678 case IPSTATS_MIB_INNOROUTES:
2679 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2680 if (type == IPV6_ADDR_ANY) {
2681 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2682 IPSTATS_MIB_INADDRERRORS);
2683 break;
2684 }
2685 /* FALLTHROUGH */
2686 case IPSTATS_MIB_OUTNOROUTES:
2687 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2688 ipstats_mib_noroutes);
2689 break;
2690 }
2691 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2692 kfree_skb(skb);
2693 return 0;
2694 }
2695
2696 static int ip6_pkt_discard(struct sk_buff *skb)
2697 {
2698 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2699 }
2700
2701 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2702 {
2703 skb->dev = skb_dst(skb)->dev;
2704 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2705 }
2706
2707 static int ip6_pkt_prohibit(struct sk_buff *skb)
2708 {
2709 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2710 }
2711
2712 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2713 {
2714 skb->dev = skb_dst(skb)->dev;
2715 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2716 }
2717
2718 /*
2719 * Allocate a dst for a local (unicast / anycast) address.
2720 */
2721
2722 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2723 const struct in6_addr *addr,
2724 bool anycast)
2725 {
2726 u32 tb_id;
2727 struct net *net = dev_net(idev->dev);
2728 struct net_device *dev = net->loopback_dev;
2729 struct rt6_info *rt;
2730
2731 /* Use the L3 master device in place of loopback for host routes if
2732 * the device is enslaved and the address is not link-local or multicast
2733 */
2734 if (!rt6_need_strict(addr))
2735 dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
2736
2737 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
2738 if (!rt)
2739 return ERR_PTR(-ENOMEM);
2740
2741 in6_dev_hold(idev);
2742
2743 rt->dst.flags |= DST_HOST;
2744 rt->dst.input = ip6_input;
2745 rt->dst.output = ip6_output;
2746 rt->rt6i_idev = idev;
2747
2748 rt->rt6i_protocol = RTPROT_KERNEL;
2749 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2750 if (anycast)
2751 rt->rt6i_flags |= RTF_ANYCAST;
2752 else
2753 rt->rt6i_flags |= RTF_LOCAL;
2754
2755 rt->rt6i_gateway = *addr;
2756 rt->rt6i_dst.addr = *addr;
2757 rt->rt6i_dst.plen = 128;
2758 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2759 rt->rt6i_table = fib6_get_table(net, tb_id);
2760 rt->dst.flags |= DST_NOCACHE;
2761
2762 atomic_set(&rt->dst.__refcnt, 1);
2763
2764 return rt;
2765 }
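/* DST_NOCACHE marks this dst as not owned by the FIB tree: __ip6_del_rt()
 * above returns -ENOENT for such entries and merely drops the reference,
 * so the final dst_release() is what frees the route.
 */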
2766
2767 /* Remove a deleted IP from prefsrc entries. */
2768 struct arg_dev_net_ip {
2769 struct net_device *dev;
2770 struct net *net;
2771 struct in6_addr *addr;
2772 };
2773
2774 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2775 {
2776 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2777 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2778 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2779
2780 if (((void *)rt->dst.dev == dev || !dev) &&
2781 rt != net->ipv6.ip6_null_entry &&
2782 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2783 /* remove prefsrc entry */
2784 rt->rt6i_prefsrc.plen = 0;
2785 }
2786 return 0;
2787 }
2788
2789 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2790 {
2791 struct net *net = dev_net(ifp->idev->dev);
2792 struct arg_dev_net_ip adni = {
2793 .dev = ifp->idev->dev,
2794 .net = net,
2795 .addr = &ifp->addr,
2796 };
2797 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2798 }
2799
2800 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2801 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
2802
2803 /* Remove routers and update dst entries when a gateway turns into a host. */
2804 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2805 {
2806 struct in6_addr *gateway = (struct in6_addr *)arg;
2807
2808 if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2809 ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2810 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2811 return -1;
2812 }
2813 return 0;
2814 }
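/* A non-zero return from a fib6_clean_all() callback (these helpers use
 * -1) tells the tree walker to delete the route being examined; returning
 * 0 keeps it.
 */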
2815
2816 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2817 {
2818 fib6_clean_all(net, fib6_clean_tohost, gateway);
2819 }
2820
2821 struct arg_dev_net {
2822 struct net_device *dev;
2823 struct net *net;
2824 };
2825
2826 /* called with the write lock held for the table containing rt */
2827 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2828 {
2829 const struct arg_dev_net *adn = arg;
2830 const struct net_device *dev = adn->dev;
2831
2832 if ((rt->dst.dev == dev || !dev) &&
2833 rt != adn->net->ipv6.ip6_null_entry &&
2834 (rt->rt6i_nsiblings == 0 ||
2835 !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
2836 return -1;
2837
2838 return 0;
2839 }
2840
2841 void rt6_ifdown(struct net *net, struct net_device *dev)
2842 {
2843 struct arg_dev_net adn = {
2844 .dev = dev,
2845 .net = net,
2846 };
2847
2848 fib6_clean_all(net, fib6_ifdown, &adn);
2849 icmp6_clean_all(fib6_ifdown, &adn);
2850 if (dev)
2851 rt6_uncached_list_flush_dev(net, dev);
2852 }
2853
2854 struct rt6_mtu_change_arg {
2855 struct net_device *dev;
2856 unsigned int mtu;
2857 };
2858
2859 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2860 {
2861 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2862 struct inet6_dev *idev;
2863
2864 /* In IPv6, PMTU discovery is not optional,
2865 so the RTAX_MTU lock cannot disable it.
2866 We still use this lock to block changes
2867 caused by addrconf/ndisc.
2868 */
2869
2870 idev = __in6_dev_get(arg->dev);
2871 if (!idev)
2872 return 0;
2873
2874 /* For an administrative MTU increase there is no way to discover
2875 the IPv6 PMTU increase, so the PMTU must be updated here.
2876 Since RFC 1981 doesn't cover administrative MTU increases,
2877 updating the PMTU on increase is a MUST (e.g. jumbo frames).
2878 */
2879 /*
2880 If the new MTU is less than the route PMTU, the new MTU will be
2881 the lowest MTU in the path; update the route PMTU to reflect the
2882 decrease. If the new MTU is greater than the route PMTU, and the
2883 old MTU was the lowest MTU in the path, update the route PMTU to
2884 reflect the increase. In that case, if another node's MTU is now
2885 the lowest in the path, a Packet Too Big message will trigger
2886 PMTU discovery again.
2887 */
2888 if (rt->dst.dev == arg->dev &&
2889 dst_metric_raw(&rt->dst, RTAX_MTU) &&
2890 !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2891 if (rt->rt6i_flags & RTF_CACHE) {
2892 /* For RTF_CACHE with rt6i_pmtu == 0
2893 * (i.e. a redirected route),
2894 * the metrics of its rt->dst.from have already
2895 * been updated.
2896 */
2897 if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2898 rt->rt6i_pmtu = arg->mtu;
2899 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2900 (dst_mtu(&rt->dst) < arg->mtu &&
2901 dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2902 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2903 }
2904 }
2905 return 0;
2906 }
2907
2908 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2909 {
2910 struct rt6_mtu_change_arg arg = {
2911 .dev = dev,
2912 .mtu = mtu,
2913 };
2914
2915 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2916 }
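/* rt6_mtu_change() runs when a device's MTU changes (addrconf invokes it
 * on NETDEV_CHANGEMTU) so rt6_mtu_change_route() above can revisit the
 * cached PMTU of every affected route.
 */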
2917
2918 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2919 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2920 [RTA_OIF] = { .type = NLA_U32 },
2921 [RTA_IIF] = { .type = NLA_U32 },
2922 [RTA_PRIORITY] = { .type = NLA_U32 },
2923 [RTA_METRICS] = { .type = NLA_NESTED },
2924 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
2925 [RTA_PREF] = { .type = NLA_U8 },
2926 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
2927 [RTA_ENCAP] = { .type = NLA_NESTED },
2928 [RTA_EXPIRES] = { .type = NLA_U32 },
2929 [RTA_UID] = { .type = NLA_U32 },
2930 [RTA_MARK] = { .type = NLA_U32 },
2931 };
2932
2933 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2934 struct fib6_config *cfg,
2935 struct netlink_ext_ack *extack)
2936 {
2937 struct rtmsg *rtm;
2938 struct nlattr *tb[RTA_MAX+1];
2939 unsigned int pref;
2940 int err;
2941
2942 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
2943 NULL);
2944 if (err < 0)
2945 goto errout;
2946
2947 err = -EINVAL;
2948 rtm = nlmsg_data(nlh);
2949 memset(cfg, 0, sizeof(*cfg));
2950
2951 cfg->fc_table = rtm->rtm_table;
2952 cfg->fc_dst_len = rtm->rtm_dst_len;
2953 cfg->fc_src_len = rtm->rtm_src_len;
2954 cfg->fc_flags = RTF_UP;
2955 cfg->fc_protocol = rtm->rtm_protocol;
2956 cfg->fc_type = rtm->rtm_type;
2957
2958 if (rtm->rtm_type == RTN_UNREACHABLE ||
2959 rtm->rtm_type == RTN_BLACKHOLE ||
2960 rtm->rtm_type == RTN_PROHIBIT ||
2961 rtm->rtm_type == RTN_THROW)
2962 cfg->fc_flags |= RTF_REJECT;
2963
2964 if (rtm->rtm_type == RTN_LOCAL)
2965 cfg->fc_flags |= RTF_LOCAL;
2966
2967 if (rtm->rtm_flags & RTM_F_CLONED)
2968 cfg->fc_flags |= RTF_CACHE;
2969
2970 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2971 cfg->fc_nlinfo.nlh = nlh;
2972 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2973
2974 if (tb[RTA_GATEWAY]) {
2975 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2976 cfg->fc_flags |= RTF_GATEWAY;
2977 }
2978
2979 if (tb[RTA_DST]) {
2980 int plen = (rtm->rtm_dst_len + 7) >> 3;
2981
2982 if (nla_len(tb[RTA_DST]) < plen)
2983 goto errout;
2984
2985 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2986 }
2987
2988 if (tb[RTA_SRC]) {
2989 int plen = (rtm->rtm_src_len + 7) >> 3;
2990
2991 if (nla_len(tb[RTA_SRC]) < plen)
2992 goto errout;
2993
2994 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2995 }
2996
2997 if (tb[RTA_PREFSRC])
2998 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2999
3000 if (tb[RTA_OIF])
3001 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3002
3003 if (tb[RTA_PRIORITY])
3004 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3005
3006 if (tb[RTA_METRICS]) {
3007 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3008 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
3009 }
3010
3011 if (tb[RTA_TABLE])
3012 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3013
3014 if (tb[RTA_MULTIPATH]) {
3015 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3016 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3017
3018 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3019 cfg->fc_mp_len, extack);
3020 if (err < 0)
3021 goto errout;
3022 }
3023
3024 if (tb[RTA_PREF]) {
3025 pref = nla_get_u8(tb[RTA_PREF]);
3026 if (pref != ICMPV6_ROUTER_PREF_LOW &&
3027 pref != ICMPV6_ROUTER_PREF_HIGH)
3028 pref = ICMPV6_ROUTER_PREF_MEDIUM;
3029 cfg->fc_flags |= RTF_PREF(pref);
3030 }
3031
3032 if (tb[RTA_ENCAP])
3033 cfg->fc_encap = tb[RTA_ENCAP];
3034
3035 if (tb[RTA_ENCAP_TYPE]) {
3036 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3037
3038 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3039 if (err < 0)
3040 goto errout;
3041 }
3042
3043 if (tb[RTA_EXPIRES]) {
3044 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3045
3046 if (addrconf_finite_timeout(timeout)) {
3047 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3048 cfg->fc_flags |= RTF_EXPIRES;
3049 }
3050 }
3051
3052 err = 0;
3053 errout:
3054 return err;
3055 }
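/* Illustrative mapping (assuming iproute2): "ip -6 route add 2001:db8::/64
 * via fe80::1 dev eth0" arrives here as RTM_NEWROUTE carrying RTA_DST
 * (-> fc_dst; the prefix length rides in rtm_dst_len), RTA_GATEWAY
 * (-> fc_gateway, which also sets RTF_GATEWAY) and RTA_OIF (-> fc_ifindex).
 */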
3056
3057 struct rt6_nh {
3058 struct rt6_info *rt6_info;
3059 struct fib6_config r_cfg;
3060 struct mx6_config mxc;
3061 struct list_head next;
3062 };
3063
3064 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3065 {
3066 struct rt6_nh *nh;
3067
3068 list_for_each_entry(nh, rt6_nh_list, next) {
3069 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3070 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3071 nh->r_cfg.fc_ifindex);
3072 }
3073 }
3074
3075 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3076 struct rt6_info *rt, struct fib6_config *r_cfg)
3077 {
3078 struct rt6_nh *nh;
3079 struct rt6_info *rtnh;
3080 int err = -EEXIST;
3081
3082 list_for_each_entry(nh, rt6_nh_list, next) {
3083 /* check if rt6_info already exists */
3084 rtnh = nh->rt6_info;
3085
3086 if (rtnh->dst.dev == rt->dst.dev &&
3087 rtnh->rt6i_idev == rt->rt6i_idev &&
3088 ipv6_addr_equal(&rtnh->rt6i_gateway,
3089 &rt->rt6i_gateway))
3090 return err;
3091 }
3092
3093 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3094 if (!nh)
3095 return -ENOMEM;
3096 nh->rt6_info = rt;
3097 err = ip6_convert_metrics(&nh->mxc, r_cfg);
3098 if (err) {
3099 kfree(nh);
3100 return err;
3101 }
3102 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3103 list_add_tail(&nh->next, rt6_nh_list);
3104
3105 return 0;
3106 }
3107
3108 static void ip6_route_mpath_notify(struct rt6_info *rt,
3109 struct rt6_info *rt_last,
3110 struct nl_info *info,
3111 __u16 nlflags)
3112 {
3113 /* if this is an APPEND route, then rt points to the first route
3114 * inserted and rt_last points to the last route inserted. Userspace
3115 * wants a consistent dump of the route which starts at the first
3116 * nexthop. Since sibling routes are always added at the end of
3117 * the list, find the first sibling of the last route appended.
3118 */
3119 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3120 rt = list_first_entry(&rt_last->rt6i_siblings,
3121 struct rt6_info,
3122 rt6i_siblings);
3123 }
3124
3125 if (rt)
3126 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3127 }
3128
3129 static int ip6_route_multipath_add(struct fib6_config *cfg,
3130 struct netlink_ext_ack *extack)
3131 {
3132 struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3133 struct nl_info *info = &cfg->fc_nlinfo;
3134 struct fib6_config r_cfg;
3135 struct rtnexthop *rtnh;
3136 struct rt6_info *rt;
3137 struct rt6_nh *err_nh;
3138 struct rt6_nh *nh, *nh_safe;
3139 __u16 nlflags;
3140 int remaining;
3141 int attrlen;
3142 int err = 1;
3143 int nhn = 0;
3144 int replace = (cfg->fc_nlinfo.nlh &&
3145 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3146 LIST_HEAD(rt6_nh_list);
3147
3148 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3149 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3150 nlflags |= NLM_F_APPEND;
3151
3152 remaining = cfg->fc_mp_len;
3153 rtnh = (struct rtnexthop *)cfg->fc_mp;
3154
3155 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
3156 * rt6_info structs per nexthop
3157 */
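/* The RTA_MULTIPATH payload is a sequence of struct rtnexthop headers,
 * each optionally followed by per-hop attributes (RTA_GATEWAY, RTA_ENCAP,
 * RTA_ENCAP_TYPE); rtnh_ok()/rtnh_next() walk that sequence below.
 */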
3158 while (rtnh_ok(rtnh, remaining)) {
3159 memcpy(&r_cfg, cfg, sizeof(*cfg));
3160 if (rtnh->rtnh_ifindex)
3161 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3162
3163 attrlen = rtnh_attrlen(rtnh);
3164 if (attrlen > 0) {
3165 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3166
3167 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3168 if (nla) {
3169 r_cfg.fc_gateway = nla_get_in6_addr(nla);
3170 r_cfg.fc_flags |= RTF_GATEWAY;
3171 }
3172 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3173 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3174 if (nla)
3175 r_cfg.fc_encap_type = nla_get_u16(nla);
3176 }
3177
3178 rt = ip6_route_info_create(&r_cfg, extack);
3179 if (IS_ERR(rt)) {
3180 err = PTR_ERR(rt);
3181 rt = NULL;
3182 goto cleanup;
3183 }
3184
3185 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3186 if (err) {
3187 dst_free(&rt->dst);
3188 goto cleanup;
3189 }
3190
3191 rtnh = rtnh_next(rtnh, &remaining);
3192 }
3193
3194 /* For add and replace, send one notification with all nexthops:
3195 * skip the per-route notification in fib6_add_rt2node() and send
3196 * one with the full route when done.
3197 */
3198 info->skip_notify = 1;
3199
3200 err_nh = NULL;
3201 list_for_each_entry(nh, &rt6_nh_list, next) {
3202 rt_last = nh->rt6_info;
3203 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3204 /* save reference to first route for notification */
3205 if (!rt_notif && !err)
3206 rt_notif = nh->rt6_info;
3207
3208 /* nh->rt6_info is used or freed at this point; reset it to NULL */
3209 nh->rt6_info = NULL;
3210 if (err) {
3211 if (replace && nhn)
3212 ip6_print_replace_route_err(&rt6_nh_list);
3213 err_nh = nh;
3214 goto add_errout;
3215 }
3216
3217 /* Because each route is added as if it were a single route, we
3218 * remove these flags after the first nexthop: if there is a
3219 * collision, we have already failed to add the first nexthop
3220 * (fib6_add_rt2node() has rejected it); when replacing, the old
3221 * nexthops have been replaced by the first new one, so the rest
3222 * should be appended to it.
3223 */
3224 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3225 NLM_F_REPLACE);
3226 nhn++;
3227 }
3228
3229 /* success ... tell user about new route */
3230 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3231 goto cleanup;
3232
3233 add_errout:
3234 /* send notification for routes that were added so that
3235 * the delete notifications sent by ip6_route_del are
3236 * coherent
3237 */
3238 if (rt_notif)
3239 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3240
3241 /* Delete routes that were already added */
3242 list_for_each_entry(nh, &rt6_nh_list, next) {
3243 if (err_nh == nh)
3244 break;
3245 ip6_route_del(&nh->r_cfg, extack);
3246 }
3247
3248 cleanup:
3249 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3250 if (nh->rt6_info)
3251 dst_free(&nh->rt6_info->dst);
3252 kfree(nh->mxc.mx);
3253 list_del(&nh->next);
3254 kfree(nh);
3255 }
3256
3257 return err;
3258 }
3259
3260 static int ip6_route_multipath_del(struct fib6_config *cfg,
3261 struct netlink_ext_ack *extack)
3262 {
3263 struct fib6_config r_cfg;
3264 struct rtnexthop *rtnh;
3265 int remaining;
3266 int attrlen;
3267 int err = 1, last_err = 0;
3268
3269 remaining = cfg->fc_mp_len;
3270 rtnh = (struct rtnexthop *)cfg->fc_mp;
3271
3272 /* Parse a Multipath Entry */
3273 while (rtnh_ok(rtnh, remaining)) {
3274 memcpy(&r_cfg, cfg, sizeof(*cfg));
3275 if (rtnh->rtnh_ifindex)
3276 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3277
3278 attrlen = rtnh_attrlen(rtnh);
3279 if (attrlen > 0) {
3280 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3281
3282 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3283 if (nla) {
3284 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3285 r_cfg.fc_flags |= RTF_GATEWAY;
3286 }
3287 }
3288 err = ip6_route_del(&r_cfg, extack);
3289 if (err)
3290 last_err = err;
3291
3292 rtnh = rtnh_next(rtnh, &remaining);
3293 }
3294
3295 return last_err;
3296 }
3297
3298 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3299 struct netlink_ext_ack *extack)
3300 {
3301 struct fib6_config cfg;
3302 int err;
3303
3304 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3305 if (err < 0)
3306 return err;
3307
3308 if (cfg.fc_mp)
3309 return ip6_route_multipath_del(&cfg, extack);
3310 else {
3311 cfg.fc_delete_all_nh = 1;
3312 return ip6_route_del(&cfg, extack);
3313 }
3314 }
3315
3316 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3317 struct netlink_ext_ack *extack)
3318 {
3319 struct fib6_config cfg;
3320 int err;
3321
3322 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3323 if (err < 0)
3324 return err;
3325
3326 if (cfg.fc_mp)
3327 return ip6_route_multipath_add(&cfg, extack);
3328 else
3329 return ip6_route_add(&cfg, extack);
3330 }
3331
3332 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3333 {
3334 int nexthop_len = 0;
3335
3336 if (rt->rt6i_nsiblings) {
3337 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
3338 + NLA_ALIGN(sizeof(struct rtnexthop))
3339 + nla_total_size(16) /* RTA_GATEWAY */
3340 + lwtunnel_get_encap_size(rt->dst.lwtstate);
3341
3342 nexthop_len *= rt->rt6i_nsiblings;
3343 }
3344
3345 return NLMSG_ALIGN(sizeof(struct rtmsg))
3346 + nla_total_size(16) /* RTA_SRC */
3347 + nla_total_size(16) /* RTA_DST */
3348 + nla_total_size(16) /* RTA_GATEWAY */
3349 + nla_total_size(16) /* RTA_PREFSRC */
3350 + nla_total_size(4) /* RTA_TABLE */
3351 + nla_total_size(4) /* RTA_IIF */
3352 + nla_total_size(4) /* RTA_OIF */
3353 + nla_total_size(4) /* RTA_PRIORITY */
3354 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3355 + nla_total_size(sizeof(struct rta_cacheinfo))
3356 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3357 + nla_total_size(1) /* RTA_PREF */
3358 + lwtunnel_get_encap_size(rt->dst.lwtstate)
3359 + nexthop_len;
3360 }
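/* rt6_nlmsg_size() is an upper bound used to size the skb handed to
 * rt6_fill_node(); if filling still overflows, the resulting -EMSGSIZE is
 * treated as a bug (see the WARN_ON in inet6_rt_notify()).
 */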
3361
3362 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3363 unsigned int *flags, bool skip_oif)
3364 {
3365 if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3366 *flags |= RTNH_F_LINKDOWN;
3367 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3368 *flags |= RTNH_F_DEAD;
3369 }
3370
3371 if (rt->rt6i_flags & RTF_GATEWAY) {
3372 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3373 goto nla_put_failure;
3374 }
3375
3376 /* not needed for multipath encoding because it has a rtnexthop struct */
3377 if (!skip_oif && rt->dst.dev &&
3378 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3379 goto nla_put_failure;
3380
3381 if (rt->dst.lwtstate &&
3382 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3383 goto nla_put_failure;
3384
3385 return 0;
3386
3387 nla_put_failure:
3388 return -EMSGSIZE;
3389 }
3390
3391 /* add multipath next hop */
3392 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
3393 {
3394 struct rtnexthop *rtnh;
3395 unsigned int flags = 0;
3396
3397 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
3398 if (!rtnh)
3399 goto nla_put_failure;
3400
3401 rtnh->rtnh_hops = 0;
3402 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
3403
3404 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
3405 goto nla_put_failure;
3406
3407 rtnh->rtnh_flags = flags;
3408
3409 /* length of rtnetlink header + attributes */
3410 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
3411
3412 return 0;
3413
3414 nla_put_failure:
3415 return -EMSGSIZE;
3416 }
3417
3418 static int rt6_fill_node(struct net *net,
3419 struct sk_buff *skb, struct rt6_info *rt,
3420 struct in6_addr *dst, struct in6_addr *src,
3421 int iif, int type, u32 portid, u32 seq,
3422 unsigned int flags)
3423 {
3424 u32 metrics[RTAX_MAX];
3425 struct rtmsg *rtm;
3426 struct nlmsghdr *nlh;
3427 long expires;
3428 u32 table;
3429
3430 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3431 if (!nlh)
3432 return -EMSGSIZE;
3433
3434 rtm = nlmsg_data(nlh);
3435 rtm->rtm_family = AF_INET6;
3436 rtm->rtm_dst_len = rt->rt6i_dst.plen;
3437 rtm->rtm_src_len = rt->rt6i_src.plen;
3438 rtm->rtm_tos = 0;
3439 if (rt->rt6i_table)
3440 table = rt->rt6i_table->tb6_id;
3441 else
3442 table = RT6_TABLE_UNSPEC;
3443 rtm->rtm_table = table;
3444 if (nla_put_u32(skb, RTA_TABLE, table))
3445 goto nla_put_failure;
3446 if (rt->rt6i_flags & RTF_REJECT) {
3447 switch (rt->dst.error) {
3448 case -EINVAL:
3449 rtm->rtm_type = RTN_BLACKHOLE;
3450 break;
3451 case -EACCES:
3452 rtm->rtm_type = RTN_PROHIBIT;
3453 break;
3454 case -EAGAIN:
3455 rtm->rtm_type = RTN_THROW;
3456 break;
3457 default:
3458 rtm->rtm_type = RTN_UNREACHABLE;
3459 break;
3460 }
3461 }
3462 else if (rt->rt6i_flags & RTF_LOCAL)
3463 rtm->rtm_type = RTN_LOCAL;
3464 else if (rt->rt6i_flags & RTF_ANYCAST)
3465 rtm->rtm_type = RTN_ANYCAST;
3466 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3467 rtm->rtm_type = RTN_LOCAL;
3468 else
3469 rtm->rtm_type = RTN_UNICAST;
3470 rtm->rtm_flags = 0;
3471 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3472 rtm->rtm_protocol = rt->rt6i_protocol;
3473 if (rt->rt6i_flags & RTF_DYNAMIC)
3474 rtm->rtm_protocol = RTPROT_REDIRECT;
3475 else if (rt->rt6i_flags & RTF_ADDRCONF) {
3476 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
3477 rtm->rtm_protocol = RTPROT_RA;
3478 else
3479 rtm->rtm_protocol = RTPROT_KERNEL;
3480 }
3481
3482 if (rt->rt6i_flags & RTF_CACHE)
3483 rtm->rtm_flags |= RTM_F_CLONED;
3484
3485 if (dst) {
3486 if (nla_put_in6_addr(skb, RTA_DST, dst))
3487 goto nla_put_failure;
3488 rtm->rtm_dst_len = 128;
3489 } else if (rtm->rtm_dst_len)
3490 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3491 goto nla_put_failure;
3492 #ifdef CONFIG_IPV6_SUBTREES
3493 if (src) {
3494 if (nla_put_in6_addr(skb, RTA_SRC, src))
3495 goto nla_put_failure;
3496 rtm->rtm_src_len = 128;
3497 } else if (rtm->rtm_src_len &&
3498 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3499 goto nla_put_failure;
3500 #endif
3501 if (iif) {
3502 #ifdef CONFIG_IPV6_MROUTE
3503 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3504 int err = ip6mr_get_route(net, skb, rtm, portid);
3505
3506 if (err == 0)
3507 return 0;
3508 if (err < 0)
3509 goto nla_put_failure;
3510 } else
3511 #endif
3512 if (nla_put_u32(skb, RTA_IIF, iif))
3513 goto nla_put_failure;
3514 } else if (dst) {
3515 struct in6_addr saddr_buf;
3516 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3517 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3518 goto nla_put_failure;
3519 }
3520
3521 if (rt->rt6i_prefsrc.plen) {
3522 struct in6_addr saddr_buf;
3523 saddr_buf = rt->rt6i_prefsrc.addr;
3524 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3525 goto nla_put_failure;
3526 }
3527
3528 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3529 if (rt->rt6i_pmtu)
3530 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3531 if (rtnetlink_put_metrics(skb, metrics) < 0)
3532 goto nla_put_failure;
3533
3534 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3535 goto nla_put_failure;
3536
3537 /* For multipath routes, walk the siblings list and add
3538 * each as a nexthop within RTA_MULTIPATH.
3539 */
3540 if (rt->rt6i_nsiblings) {
3541 struct rt6_info *sibling, *next_sibling;
3542 struct nlattr *mp;
3543
3544 mp = nla_nest_start(skb, RTA_MULTIPATH);
3545 if (!mp)
3546 goto nla_put_failure;
3547
3548 if (rt6_add_nexthop(skb, rt) < 0)
3549 goto nla_put_failure;
3550
3551 list_for_each_entry_safe(sibling, next_sibling,
3552 &rt->rt6i_siblings, rt6i_siblings) {
3553 if (rt6_add_nexthop(skb, sibling) < 0)
3554 goto nla_put_failure;
3555 }
3556
3557 nla_nest_end(skb, mp);
3558 } else {
3559 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
3560 goto nla_put_failure;
3561 }
3562
3563 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3564
3565 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3566 goto nla_put_failure;
3567
3568 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3569 goto nla_put_failure;
3570
3571
3572 nlmsg_end(skb, nlh);
3573 return 0;
3574
3575 nla_put_failure:
3576 nlmsg_cancel(skb, nlh);
3577 return -EMSGSIZE;
3578 }
3579
3580 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3581 {
3582 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3583 struct net *net = arg->net;
3584
3585 if (rt == net->ipv6.ip6_null_entry)
3586 return 0;
3587
3588 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3589 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3590
3591 /* user wants prefix routes only */
3592 if (rtm->rtm_flags & RTM_F_PREFIX &&
3593 !(rt->rt6i_flags & RTF_PREFIX_RT)) {
3594 /* success since this is not a prefix route */
3595 return 1;
3596 }
3597 }
3598
3599 return rt6_fill_node(net,
3600 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3601 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3602 NLM_F_MULTI);
3603 }
3604
3605 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3606 struct netlink_ext_ack *extack)
3607 {
3608 struct net *net = sock_net(in_skb->sk);
3609 struct nlattr *tb[RTA_MAX+1];
3610 int err, iif = 0, oif = 0;
3611 struct dst_entry *dst;
3612 struct rt6_info *rt;
3613 struct sk_buff *skb;
3614 struct rtmsg *rtm;
3615 struct flowi6 fl6;
3616 bool fibmatch;
3617
3618 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3619 extack);
3620 if (err < 0)
3621 goto errout;
3622
3623 err = -EINVAL;
3624 memset(&fl6, 0, sizeof(fl6));
3625 rtm = nlmsg_data(nlh);
3626 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
3627 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
3628
3629 if (tb[RTA_SRC]) {
3630 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3631 goto errout;
3632
3633 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3634 }
3635
3636 if (tb[RTA_DST]) {
3637 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3638 goto errout;
3639
3640 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3641 }
3642
3643 if (tb[RTA_IIF])
3644 iif = nla_get_u32(tb[RTA_IIF]);
3645
3646 if (tb[RTA_OIF])
3647 oif = nla_get_u32(tb[RTA_OIF]);
3648
3649 if (tb[RTA_MARK])
3650 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3651
3652 if (tb[RTA_UID])
3653 fl6.flowi6_uid = make_kuid(current_user_ns(),
3654 nla_get_u32(tb[RTA_UID]));
3655 else
3656 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
3657
3658 if (iif) {
3659 struct net_device *dev;
3660 int flags = 0;
3661
3662 dev = __dev_get_by_index(net, iif);
3663 if (!dev) {
3664 err = -ENODEV;
3665 goto errout;
3666 }
3667
3668 fl6.flowi6_iif = iif;
3669
3670 if (!ipv6_addr_any(&fl6.saddr))
3671 flags |= RT6_LOOKUP_F_HAS_SADDR;
3672
3673 if (!fibmatch)
3674 dst = ip6_route_input_lookup(net, dev, &fl6, flags);
3675 } else {
3676 fl6.flowi6_oif = oif;
3677
3678 if (!fibmatch)
3679 dst = ip6_route_output(net, NULL, &fl6);
3680 }
3681
3682 if (fibmatch)
3683 dst = ip6_route_lookup(net, &fl6, 0);
3684
3685 rt = container_of(dst, struct rt6_info, dst);
3686 if (rt->dst.error) {
3687 err = rt->dst.error;
3688 ip6_rt_put(rt);
3689 goto errout;
3690 }
3691
3692 if (rt == net->ipv6.ip6_null_entry) {
3693 err = rt->dst.error;
3694 ip6_rt_put(rt);
3695 goto errout;
3696 }
3697
3698 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3699 if (!skb) {
3700 ip6_rt_put(rt);
3701 err = -ENOBUFS;
3702 goto errout;
3703 }
3704
3705 skb_dst_set(skb, &rt->dst);
3706 if (fibmatch)
3707 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
3708 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3709 nlh->nlmsg_seq, 0);
3710 else
3711 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3712 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3713 nlh->nlmsg_seq, 0);
3714 if (err < 0) {
3715 kfree_skb(skb);
3716 goto errout;
3717 }
3718
3719 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3720 errout:
3721 return err;
3722 }
3723
3724 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3725 unsigned int nlm_flags)
3726 {
3727 struct sk_buff *skb;
3728 struct net *net = info->nl_net;
3729 u32 seq;
3730 int err;
3731
3732 err = -ENOBUFS;
3733 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3734
3735 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3736 if (!skb)
3737 goto errout;
3738
3739 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3740 event, info->portid, seq, nlm_flags);
3741 if (err < 0) {
3742 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3743 WARN_ON(err == -EMSGSIZE);
3744 kfree_skb(skb);
3745 goto errout;
3746 }
3747 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3748 info->nlh, gfp_any());
3749 return;
3750 errout:
3751 if (err < 0)
3752 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3753 }
3754
3755 static int ip6_route_dev_notify(struct notifier_block *this,
3756 unsigned long event, void *ptr)
3757 {
3758 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3759 struct net *net = dev_net(dev);
3760
3761 if (!(dev->flags & IFF_LOOPBACK))
3762 return NOTIFY_OK;
3763
3764 if (event == NETDEV_REGISTER) {
3765 net->ipv6.ip6_null_entry->dst.dev = dev;
3766 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3767 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3768 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3769 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3770 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3771 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3772 #endif
3773 } else if (event == NETDEV_UNREGISTER) {
3774 in6_dev_put(net->ipv6.ip6_null_entry->rt6i_idev);
3775 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3776 in6_dev_put(net->ipv6.ip6_prohibit_entry->rt6i_idev);
3777 in6_dev_put(net->ipv6.ip6_blk_hole_entry->rt6i_idev);
3778 #endif
3779 }
3780
3781 return NOTIFY_OK;
3782 }
3783
3784 /*
3785 * /proc
3786 */
3787
3788 #ifdef CONFIG_PROC_FS
3789
3790 static const struct file_operations ipv6_route_proc_fops = {
3791 .owner = THIS_MODULE,
3792 .open = ipv6_route_open,
3793 .read = seq_read,
3794 .llseek = seq_lseek,
3795 .release = seq_release_net,
3796 };
3797
3798 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3799 {
3800 struct net *net = (struct net *)seq->private;
3801 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3802 net->ipv6.rt6_stats->fib_nodes,
3803 net->ipv6.rt6_stats->fib_route_nodes,
3804 net->ipv6.rt6_stats->fib_rt_alloc,
3805 net->ipv6.rt6_stats->fib_rt_entries,
3806 net->ipv6.rt6_stats->fib_rt_cache,
3807 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3808 net->ipv6.rt6_stats->fib_discarded_routes);
3809
3810 return 0;
3811 }
3812
3813 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3814 {
3815 return single_open_net(inode, file, rt6_stats_seq_show);
3816 }
3817
3818 static const struct file_operations rt6_stats_seq_fops = {
3819 .owner = THIS_MODULE,
3820 .open = rt6_stats_seq_open,
3821 .read = seq_read,
3822 .llseek = seq_lseek,
3823 .release = single_release_net,
3824 };
3825 #endif /* CONFIG_PROC_FS */
3826
3827 #ifdef CONFIG_SYSCTL
3828
3829 static
3830 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3831 void __user *buffer, size_t *lenp, loff_t *ppos)
3832 {
3833 struct net *net;
3834 int delay;
3835 if (!write)
3836 return -EINVAL;
3837
3838 net = (struct net *)ctl->extra1;
3839 delay = net->ipv6.sysctl.flush_delay;
3840 proc_dointvec(ctl, write, buffer, lenp, ppos);
3841 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3842 return 0;
3843 }
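/* Writing to this handler (e.g. "sysctl -w net.ipv6.route.flush=1")
 * forces an immediate fib6_run_gc() pass over the routing tables.
 */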
3844
3845 struct ctl_table ipv6_route_table_template[] = {
3846 {
3847 .procname = "flush",
3848 .data = &init_net.ipv6.sysctl.flush_delay,
3849 .maxlen = sizeof(int),
3850 .mode = 0200,
3851 .proc_handler = ipv6_sysctl_rtcache_flush
3852 },
3853 {
3854 .procname = "gc_thresh",
3855 .data = &ip6_dst_ops_template.gc_thresh,
3856 .maxlen = sizeof(int),
3857 .mode = 0644,
3858 .proc_handler = proc_dointvec,
3859 },
3860 {
3861 .procname = "max_size",
3862 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
3863 .maxlen = sizeof(int),
3864 .mode = 0644,
3865 .proc_handler = proc_dointvec,
3866 },
3867 {
3868 .procname = "gc_min_interval",
3869 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3870 .maxlen = sizeof(int),
3871 .mode = 0644,
3872 .proc_handler = proc_dointvec_jiffies,
3873 },
3874 {
3875 .procname = "gc_timeout",
3876 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3877 .maxlen = sizeof(int),
3878 .mode = 0644,
3879 .proc_handler = proc_dointvec_jiffies,
3880 },
3881 {
3882 .procname = "gc_interval",
3883 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3884 .maxlen = sizeof(int),
3885 .mode = 0644,
3886 .proc_handler = proc_dointvec_jiffies,
3887 },
3888 {
3889 .procname = "gc_elasticity",
3890 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3891 .maxlen = sizeof(int),
3892 .mode = 0644,
3893 .proc_handler = proc_dointvec,
3894 },
3895 {
3896 .procname = "mtu_expires",
3897 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3898 .maxlen = sizeof(int),
3899 .mode = 0644,
3900 .proc_handler = proc_dointvec_jiffies,
3901 },
3902 {
3903 .procname = "min_adv_mss",
3904 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3905 .maxlen = sizeof(int),
3906 .mode = 0644,
3907 .proc_handler = proc_dointvec,
3908 },
3909 {
3910 .procname = "gc_min_interval_ms",
3911 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3912 .maxlen = sizeof(int),
3913 .mode = 0644,
3914 .proc_handler = proc_dointvec_ms_jiffies,
3915 },
3916 { }
3917 };
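/* These knobs appear under /proc/sys/net/ipv6/route/; the template is
 * duplicated per network namespace by ipv6_route_sysctl_init() below.
 */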
3918
3919 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3920 {
3921 struct ctl_table *table;
3922
3923 table = kmemdup(ipv6_route_table_template,
3924 sizeof(ipv6_route_table_template),
3925 GFP_KERNEL);
3926
3927 if (table) {
3928 table[0].data = &net->ipv6.sysctl.flush_delay;
3929 table[0].extra1 = net;
3930 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3931 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3932 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3933 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3934 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3935 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3936 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3937 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3938 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3939
3940 /* Don't export sysctls to unprivileged users */
3941 if (net->user_ns != &init_user_ns)
3942 table[0].procname = NULL;
3943 }
3944
3945 return table;
3946 }
3947 #endif
3948
3949 static int __net_init ip6_route_net_init(struct net *net)
3950 {
3951 int ret = -ENOMEM;
3952
3953 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3954 sizeof(net->ipv6.ip6_dst_ops));
3955
3956 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3957 goto out_ip6_dst_ops;
3958
3959 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3960 sizeof(*net->ipv6.ip6_null_entry),
3961 GFP_KERNEL);
3962 if (!net->ipv6.ip6_null_entry)
3963 goto out_ip6_dst_entries;
3964 net->ipv6.ip6_null_entry->dst.path =
3965 (struct dst_entry *)net->ipv6.ip6_null_entry;
3966 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3967 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3968 ip6_template_metrics, true);
3969
3970 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3971 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3972 sizeof(*net->ipv6.ip6_prohibit_entry),
3973 GFP_KERNEL);
3974 if (!net->ipv6.ip6_prohibit_entry)
3975 goto out_ip6_null_entry;
3976 net->ipv6.ip6_prohibit_entry->dst.path =
3977 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3978 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3979 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3980 ip6_template_metrics, true);
3981
3982 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3983 sizeof(*net->ipv6.ip6_blk_hole_entry),
3984 GFP_KERNEL);
3985 if (!net->ipv6.ip6_blk_hole_entry)
3986 goto out_ip6_prohibit_entry;
3987 net->ipv6.ip6_blk_hole_entry->dst.path =
3988 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3989 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3990 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3991 ip6_template_metrics, true);
3992 #endif
3993
3994 net->ipv6.sysctl.flush_delay = 0;
3995 net->ipv6.sysctl.ip6_rt_max_size = 4096;
3996 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3997 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3998 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3999 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4000 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
4001 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4002
4003 net->ipv6.ip6_rt_gc_expire = 30*HZ;
4004
4005 ret = 0;
4006 out:
4007 return ret;
4008
4009 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4010 out_ip6_prohibit_entry:
4011 kfree(net->ipv6.ip6_prohibit_entry);
4012 out_ip6_null_entry:
4013 kfree(net->ipv6.ip6_null_entry);
4014 #endif
4015 out_ip6_dst_entries:
4016 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4017 out_ip6_dst_ops:
4018 goto out;
4019 }
4020
4021 static void __net_exit ip6_route_net_exit(struct net *net)
4022 {
4023 kfree(net->ipv6.ip6_null_entry);
4024 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4025 kfree(net->ipv6.ip6_prohibit_entry);
4026 kfree(net->ipv6.ip6_blk_hole_entry);
4027 #endif
4028 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4029 }
4030
4031 static int __net_init ip6_route_net_init_late(struct net *net)
4032 {
4033 #ifdef CONFIG_PROC_FS
4034 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4035 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4036 #endif
4037 return 0;
4038 }
4039
4040 static void __net_exit ip6_route_net_exit_late(struct net *net)
4041 {
4042 #ifdef CONFIG_PROC_FS
4043 remove_proc_entry("ipv6_route", net->proc_net);
4044 remove_proc_entry("rt6_stats", net->proc_net);
4045 #endif
4046 }
4047
4048 static struct pernet_operations ip6_route_net_ops = {
4049 .init = ip6_route_net_init,
4050 .exit = ip6_route_net_exit,
4051 };
4052
4053 static int __net_init ipv6_inetpeer_init(struct net *net)
4054 {
4055 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4056
4057 if (!bp)
4058 return -ENOMEM;
4059 inet_peer_base_init(bp);
4060 net->ipv6.peers = bp;
4061 return 0;
4062 }
4063
4064 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4065 {
4066 struct inet_peer_base *bp = net->ipv6.peers;
4067
4068 net->ipv6.peers = NULL;
4069 inetpeer_invalidate_tree(bp);
4070 kfree(bp);
4071 }
4072
4073 static struct pernet_operations ipv6_inetpeer_ops = {
4074 .init = ipv6_inetpeer_init,
4075 .exit = ipv6_inetpeer_exit,
4076 };
4077
4078 static struct pernet_operations ip6_route_net_late_ops = {
4079 .init = ip6_route_net_init_late,
4080 .exit = ip6_route_net_exit_late,
4081 };
4082
4083 static struct notifier_block ip6_route_dev_notifier = {
4084 .notifier_call = ip6_route_dev_notify,
4085 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
4086 };
4087
4088 void __init ip6_route_init_special_entries(void)
4089 {
4090 /* The loopback device is registered before this portion of code runs,
4091 * so the loopback reference in rt6_info is not taken automatically;
4092 * take it manually for init_net. */
4093 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4094 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4095 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4096 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4097 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4098 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4099 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4100 #endif
4101 }
4102
4103 int __init ip6_route_init(void)
4104 {
4105 int ret;
4106 int cpu;
4107
4108 ret = -ENOMEM;
4109 ip6_dst_ops_template.kmem_cachep =
4110 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4111 SLAB_HWCACHE_ALIGN, NULL);
4112 if (!ip6_dst_ops_template.kmem_cachep)
4113 goto out;
4114
4115 ret = dst_entries_init(&ip6_dst_blackhole_ops);
4116 if (ret)
4117 goto out_kmem_cache;
4118
4119 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4120 if (ret)
4121 goto out_dst_entries;
4122
4123 ret = register_pernet_subsys(&ip6_route_net_ops);
4124 if (ret)
4125 goto out_register_inetpeer;
4126
4127 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4128
4129 ret = fib6_init();
4130 if (ret)
4131 goto out_register_subsys;
4132
4133 ret = xfrm6_init();
4134 if (ret)
4135 goto out_fib6_init;
4136
4137 ret = fib6_rules_init();
4138 if (ret)
4139 goto xfrm6_init;
4140
4141 ret = register_pernet_subsys(&ip6_route_net_late_ops);
4142 if (ret)
4143 goto fib6_rules_init;
4144
4145 ret = -ENOBUFS;
4146 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
4147 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
4148 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
4149 goto out_register_late_subsys;
4150
4151 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4152 if (ret)
4153 goto out_register_late_subsys;
4154
4155 for_each_possible_cpu(cpu) {
4156 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4157
4158 INIT_LIST_HEAD(&ul->head);
4159 spin_lock_init(&ul->lock);
4160 }
4161
4162 out:
4163 return ret;
4164
4165 out_register_late_subsys:
4166 unregister_pernet_subsys(&ip6_route_net_late_ops);
4167 fib6_rules_init:
4168 fib6_rules_cleanup();
4169 xfrm6_init:
4170 xfrm6_fini();
4171 out_fib6_init:
4172 fib6_gc_cleanup();
4173 out_register_subsys:
4174 unregister_pernet_subsys(&ip6_route_net_ops);
4175 out_register_inetpeer:
4176 unregister_pernet_subsys(&ipv6_inetpeer_ops);
4177 out_dst_entries:
4178 dst_entries_destroy(&ip6_dst_blackhole_ops);
4179 out_kmem_cache:
4180 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4181 goto out;
4182 }
4183
4184 void ip6_route_cleanup(void)
4185 {
4186 unregister_netdevice_notifier(&ip6_route_dev_notifier);
4187 unregister_pernet_subsys(&ip6_route_net_late_ops);
4188 fib6_rules_cleanup();
4189 xfrm6_fini();
4190 fib6_gc_cleanup();
4191 unregister_pernet_subsys(&ipv6_inetpeer_ops);
4192 unregister_pernet_subsys(&ip6_route_net_ops);
4193 dst_entries_destroy(&ip6_dst_blackhole_ops);
4194 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4195 }