/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/* Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable. otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * a case it will always be non-zero. Otherwise now is the time to
	 * do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
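
/* Editor's note (illustration, not from the original source): this is
 * hash-threshold multipath. Each sibling owns a slice of the 31-bit hash
 * space bounded by nh_upper_bound; e.g. two equal-weight nexthops split
 * the space as roughly [0, 0x3FFFFFFF] and [0x40000000, 0x7FFFFFFF], so
 * a flow with mp_hash == 0x50000000 selects the second sibling above.
 */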

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now; however, we need to check if it
	 * is really so, aka Router Reachability Probing.
	 *
	 * Router Reachability Probes MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, rt->last_probe +
			      idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		rt->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
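
/* Editor's sketch of the scoring above (an illustration, not upstream
 * documentation): the score is a small bitfield. rt6_check_dev()
 * contributes 2 when the route's device matches the requested oif (or no
 * oif was given), and with CONFIG_IPV6_ROUTER_PREF the decoded router
 * preference is OR-ed in at bits 2 and up, so a preferred router outranks
 * a mere interface match when scores are compared in find_match().
 */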

/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}

static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (this might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node).
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
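
/* Editor's note: fn->rr_ptr implements the round-robin described in the
 * header comment. When find_match() reports RT6_NUD_FAIL_DO_RR, the
 * pointer is advanced under tb6_lock to the next sibling of the same
 * metric, so subsequent lookups start from a different, hopefully
 * reachable, router.
 */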

static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * the device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}

/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
}

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
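
/* Editor's note: backtracking walks up toward the tree root; at each
 * parent that roots a source-address subtree (CONFIG_IPV6_SUBTREES), the
 * saddr is looked up again inside that subtree before falling back to
 * the parent itself. The walk stops at the first node carrying routes
 * (RTN_RTINFO) or at the table root (RTN_TL_ROOT, returning NULL).
 */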

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);
	else
		fib6_info_release(rt);

	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	return rt;
}
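
/* Editor's note on the fallback chain above: a lookup first walks the fib
 * tree (backtracking toward less-specific prefixes while only the null
 * entry matches), then prefers a cached exception route; failing that it
 * clones the fib6_info into a dst on the fly, and only if even that
 * allocation fails does the caller get the held ip6_null_entry.
 */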

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
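
/* Editor's sketch of a typical rt6_lookup() call (hypothetical caller,
 * for illustration only):
 *
 *	struct rt6_info *rt;
 *
 *	rt = rt6_lookup(net, &hdr->daddr, &hdr->saddr,
 *			skb->dev->ifindex, skb, 0);
 *	if (rt) {
 *		... use rt->dst ...
 *		ip6_rt_put(rt);
 *	}
 *
 * A non-NULL result is reference-counted and must be released with
 * ip6_rt_put(); on any lookup error the function returns NULL instead.
 */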

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
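
/* Editor's note (hedged): the cmpxchg() publishes the clone only if this
 * CPU's slot is still NULL. The caller runs under rcu_read_lock() with
 * BHs disabled, so nothing else should have filled the slot since
 * rt6_get_pcpu_route() saw it empty; BUG_ON(prev) documents that
 * assumption.
 */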

/* exception hash table implementation */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = rcu_dereference_protected(rt6_ex->rt6i->from,
					 lockdep_is_held(&rt6_exception_lock));
	rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
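
/* Editor's worked example (illustrative): with
 * FIB6_EXCEPTION_BUCKET_SIZE_SHIFT == 10 there are 1 << 10 == 1024
 * chains per fib6_info, and hash_32() folds the jhash value into that
 * range. The __rt6_find_exception_*() helpers below then advance the
 * bucket pointer by the hash, i.e. effectively
 * bucket = &base[rt6_exception_hash(daddr, saddr)].
 */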

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}

static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}
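
/* Editor's worked example of the rules above (illustrative): if the
 * local MTU drops from 1500 to 1280, a cached PMTU of 1400 is lowered to
 * 1280 (first test), while an entry already at 1200 is left alone. If
 * the link later returns to 1500, only entries whose PMTU equals the
 * local MTU are raised (second test); the 1200 entry learned from a
 * remote hop stays put, since PMTU discovery owns that path.
 */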

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still hold references to them, so that on the next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently of their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree. It is for the special case where
		 * the daddr in the skb during the neighbor look-up is
		 * different from the fl6->daddr used to look up the route
		 * here.
		 */
1880 struct rt6_info *uncached_rt;
1881
1882 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1883
1884 rcu_read_unlock();
1885
1886 if (uncached_rt) {
1887 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1888 * No need for another dst_hold()
1889 */
1890 rt6_uncached_list_add(uncached_rt);
1891 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1892 } else {
1893 uncached_rt = net->ipv6.ip6_null_entry;
1894 dst_hold(&uncached_rt->dst);
1895 }
1896
1897 return uncached_rt;
1898 } else {
1899 /* Get a percpu copy */
1900
1901 struct rt6_info *pcpu_rt;
1902
1903 local_bh_disable();
1904 pcpu_rt = rt6_get_pcpu_route(f6i);
1905
1906 if (!pcpu_rt)
1907 pcpu_rt = rt6_make_pcpu_route(net, f6i);
1908
1909 local_bh_enable();
1910 rcu_read_unlock();
1911
1912 return pcpu_rt;
1913 }
1914 }
1915 EXPORT_SYMBOL_GPL(ip6_pol_route);
1916
1917 static struct rt6_info *ip6_pol_route_input(struct net *net,
1918 struct fib6_table *table,
1919 struct flowi6 *fl6,
1920 const struct sk_buff *skb,
1921 int flags)
1922 {
1923 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1924 }
1925
1926 struct dst_entry *ip6_route_input_lookup(struct net *net,
1927 struct net_device *dev,
1928 struct flowi6 *fl6,
1929 const struct sk_buff *skb,
1930 int flags)
1931 {
1932 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1933 flags |= RT6_LOOKUP_F_IFACE;
1934
1935 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1936 }
1937 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1938
1939 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1940 struct flow_keys *keys,
1941 struct flow_keys *flkeys)
1942 {
1943 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1944 const struct ipv6hdr *key_iph = outer_iph;
1945 struct flow_keys *_flkeys = flkeys;
1946 const struct ipv6hdr *inner_iph;
1947 const struct icmp6hdr *icmph;
1948 struct ipv6hdr _inner_iph;
1949 struct icmp6hdr _icmph;
1950
1951 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1952 goto out;
1953
1954 icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1955 sizeof(_icmph), &_icmph);
1956 if (!icmph)
1957 goto out;
1958
1959 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1960 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1961 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1962 icmph->icmp6_type != ICMPV6_PARAMPROB)
1963 goto out;
1964
1965 inner_iph = skb_header_pointer(skb,
1966 skb_transport_offset(skb) + sizeof(*icmph),
1967 sizeof(_inner_iph), &_inner_iph);
1968 if (!inner_iph)
1969 goto out;
1970
1971 key_iph = inner_iph;
1972 _flkeys = NULL;
1973 out:
1974 if (_flkeys) {
1975 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1976 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1977 keys->tags.flow_label = _flkeys->tags.flow_label;
1978 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1979 } else {
1980 keys->addrs.v6addrs.src = key_iph->saddr;
1981 keys->addrs.v6addrs.dst = key_iph->daddr;
1982 keys->tags.flow_label = ip6_flowlabel(key_iph);
1983 keys->basic.ip_proto = key_iph->nexthdr;
1984 }
1985 }
1986
1987 /* if skb is set it will be used and fl6 can be NULL */
1988 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1989 const struct sk_buff *skb, struct flow_keys *flkeys)
1990 {
1991 struct flow_keys hash_keys;
1992 u32 mhash;
1993
1994 switch (ip6_multipath_hash_policy(net)) {
1995 case 0:
1996 memset(&hash_keys, 0, sizeof(hash_keys));
1997 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1998 if (skb) {
1999 ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2000 } else {
2001 hash_keys.addrs.v6addrs.src = fl6->saddr;
2002 hash_keys.addrs.v6addrs.dst = fl6->daddr;
2003 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2004 hash_keys.basic.ip_proto = fl6->flowi6_proto;
2005 }
2006 break;
2007 case 1:
2008 if (skb) {
2009 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2010 struct flow_keys keys;
2011
2012 /* short-circuit if we already have L4 hash present */
2013 if (skb->l4_hash)
2014 return skb_get_hash_raw(skb) >> 1;
2015
2016 memset(&hash_keys, 0, sizeof(hash_keys));
2017
2018 if (!flkeys) {
2019 skb_flow_dissect_flow_keys(skb, &keys, flag);
2020 flkeys = &keys;
2021 }
2022 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2023 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2024 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2025 hash_keys.ports.src = flkeys->ports.src;
2026 hash_keys.ports.dst = flkeys->ports.dst;
2027 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2028 } else {
2029 memset(&hash_keys, 0, sizeof(hash_keys));
2030 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2031 hash_keys.addrs.v6addrs.src = fl6->saddr;
2032 hash_keys.addrs.v6addrs.dst = fl6->daddr;
2033 hash_keys.ports.src = fl6->fl6_sport;
2034 hash_keys.ports.dst = fl6->fl6_dport;
2035 hash_keys.basic.ip_proto = fl6->flowi6_proto;
2036 }
2037 break;
2038 }
2039 mhash = flow_hash_from_keys(&hash_keys);
2040
2041 return mhash >> 1;
2042 }
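/* Illustrative usage note (assumed standard sysctl paths): the policy
 * switch above is driven by the fib_multipath_hash_policy sysctl:
 *
 *   sysctl -w net.ipv6.fib_multipath_hash_policy=0  # L3: addrs, flow label, proto
 *   sysctl -w net.ipv6.fib_multipath_hash_policy=1  # L4: addrs plus transport ports
 *
 * The final ">> 1" confines the result to 31 bits so it can be compared
 * against the signed per-nexthop upper bounds (-1 marks a dead nexthop).
 */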
2043
2044 void ip6_route_input(struct sk_buff *skb)
2045 {
2046 const struct ipv6hdr *iph = ipv6_hdr(skb);
2047 struct net *net = dev_net(skb->dev);
2048 int flags = RT6_LOOKUP_F_HAS_SADDR;
2049 struct ip_tunnel_info *tun_info;
2050 struct flowi6 fl6 = {
2051 .flowi6_iif = skb->dev->ifindex,
2052 .daddr = iph->daddr,
2053 .saddr = iph->saddr,
2054 .flowlabel = ip6_flowinfo(iph),
2055 .flowi6_mark = skb->mark,
2056 .flowi6_proto = iph->nexthdr,
2057 };
2058 struct flow_keys *flkeys = NULL, _flkeys;
2059
2060 tun_info = skb_tunnel_info(skb);
2061 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2062 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2063
2064 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2065 flkeys = &_flkeys;
2066
2067 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2068 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2069 skb_dst_drop(skb);
2070 skb_dst_set(skb,
2071 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2072 }
2073
2074 static struct rt6_info *ip6_pol_route_output(struct net *net,
2075 struct fib6_table *table,
2076 struct flowi6 *fl6,
2077 const struct sk_buff *skb,
2078 int flags)
2079 {
2080 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2081 }
2082
2083 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2084 struct flowi6 *fl6, int flags)
2085 {
2086 bool any_src;
2087
2088 if (ipv6_addr_type(&fl6->daddr) &
2089 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2090 struct dst_entry *dst;
2091
2092 dst = l3mdev_link_scope_lookup(net, fl6);
2093 if (dst)
2094 return dst;
2095 }
2096
2097 fl6->flowi6_iif = LOOPBACK_IFINDEX;
2098
2099 any_src = ipv6_addr_any(&fl6->saddr);
2100 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2101 (fl6->flowi6_oif && any_src))
2102 flags |= RT6_LOOKUP_F_IFACE;
2103
2104 if (!any_src)
2105 flags |= RT6_LOOKUP_F_HAS_SADDR;
2106 else if (sk)
2107 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2108
2109 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2110 }
2111 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2112
2113 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2114 {
2115 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2116 struct net_device *loopback_dev = net->loopback_dev;
2117 struct dst_entry *new = NULL;
2118
2119 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2120 DST_OBSOLETE_DEAD, 0);
2121 if (rt) {
2122 rt6_info_init(rt);
2123 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2124
2125 new = &rt->dst;
2126 new->__use = 1;
2127 new->input = dst_discard;
2128 new->output = dst_discard_out;
2129
2130 dst_copy_metrics(new, &ort->dst);
2131
2132 rt->rt6i_idev = in6_dev_get(loopback_dev);
2133 rt->rt6i_gateway = ort->rt6i_gateway;
2134 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2135
2136 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2137 #ifdef CONFIG_IPV6_SUBTREES
2138 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2139 #endif
2140 }
2141
2142 dst_release(dst_orig);
2143 return new ? new : ERR_PTR(-ENOMEM);
2144 }
2145
2146 /*
2147 * Destination cache support functions
2148 */
2149
2150 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2151 {
2152 u32 rt_cookie = 0;
2153
2154 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2155 return false;
2156
2157 if (fib6_check_expired(f6i))
2158 return false;
2159
2160 return true;
2161 }
2162
2163 static struct dst_entry *rt6_check(struct rt6_info *rt,
2164 struct fib6_info *from,
2165 u32 cookie)
2166 {
2167 u32 rt_cookie = 0;
2168
2169 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2170 rt_cookie != cookie)
2171 return NULL;
2172
2173 if (rt6_check_expired(rt))
2174 return NULL;
2175
2176 return &rt->dst;
2177 }
2178
2179 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2180 struct fib6_info *from,
2181 u32 cookie)
2182 {
2183 if (!__rt6_check_expired(rt) &&
2184 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2185 fib6_check(from, cookie))
2186 return &rt->dst;
2187 else
2188 return NULL;
2189 }
2190
2191 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2192 {
2193 struct dst_entry *dst_ret;
2194 struct fib6_info *from;
2195 struct rt6_info *rt;
2196
2197 rt = container_of(dst, struct rt6_info, dst);
2198
2199 rcu_read_lock();
2200
2201 /* All IPV6 dsts are created with ->obsolete set to the value
2202 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2203 * into this function on every use.
2204 */
2205
2206 from = rcu_dereference(rt->from);
2207
2208 if (from && (rt->rt6i_flags & RTF_PCPU ||
2209 unlikely(!list_empty(&rt->rt6i_uncached))))
2210 dst_ret = rt6_dst_from_check(rt, from, cookie);
2211 else
2212 dst_ret = rt6_check(rt, from, cookie);
2213
2214 rcu_read_unlock();
2215
2216 return dst_ret;
2217 }
2218
2219 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2220 {
2221 struct rt6_info *rt = (struct rt6_info *) dst;
2222
2223 if (rt) {
2224 if (rt->rt6i_flags & RTF_CACHE) {
2225 rcu_read_lock();
2226 if (rt6_check_expired(rt)) {
2227 rt6_remove_exception_rt(rt);
2228 dst = NULL;
2229 }
2230 rcu_read_unlock();
2231 } else {
2232 dst_release(dst);
2233 dst = NULL;
2234 }
2235 }
2236 return dst;
2237 }
2238
2239 static void ip6_link_failure(struct sk_buff *skb)
2240 {
2241 struct rt6_info *rt;
2242
2243 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2244
2245 rt = (struct rt6_info *) skb_dst(skb);
2246 if (rt) {
2247 rcu_read_lock();
2248 if (rt->rt6i_flags & RTF_CACHE) {
2249 rt6_remove_exception_rt(rt);
2250 } else {
2251 struct fib6_info *from;
2252 struct fib6_node *fn;
2253
2254 from = rcu_dereference(rt->from);
2255 if (from) {
2256 fn = rcu_dereference(from->fib6_node);
2257 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2258 fn->fn_sernum = -1;
2259 }
2260 }
2261 rcu_read_unlock();
2262 }
2263 }
2264
2265 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2266 {
2267 if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2268 struct fib6_info *from;
2269
2270 rcu_read_lock();
2271 from = rcu_dereference(rt0->from);
2272 if (from)
2273 rt0->dst.expires = from->expires;
2274 rcu_read_unlock();
2275 }
2276
2277 dst_set_expires(&rt0->dst, timeout);
2278 rt0->rt6i_flags |= RTF_EXPIRES;
2279 }
2280
2281 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2282 {
2283 struct net *net = dev_net(rt->dst.dev);
2284
2285 dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2286 rt->rt6i_flags |= RTF_MODIFIED;
2287 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2288 }
2289
2290 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2291 {
2292 return !(rt->rt6i_flags & RTF_CACHE) &&
2293 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2294 }
2295
2296 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2297 const struct ipv6hdr *iph, u32 mtu)
2298 {
2299 const struct in6_addr *daddr, *saddr;
2300 struct rt6_info *rt6 = (struct rt6_info *)dst;
2301
2302 if (dst_metric_locked(dst, RTAX_MTU))
2303 return;
2304
2305 if (iph) {
2306 daddr = &iph->daddr;
2307 saddr = &iph->saddr;
2308 } else if (sk) {
2309 daddr = &sk->sk_v6_daddr;
2310 saddr = &inet6_sk(sk)->saddr;
2311 } else {
2312 daddr = NULL;
2313 saddr = NULL;
2314 }
2315 dst_confirm_neigh(dst, daddr);
2316 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2317 if (mtu >= dst_mtu(dst))
2318 return;
2319
2320 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2321 rt6_do_update_pmtu(rt6, mtu);
2322 /* update rt6_ex->stamp for cache */
2323 if (rt6->rt6i_flags & RTF_CACHE)
2324 rt6_update_exception_stamp_rt(rt6);
2325 } else if (daddr) {
2326 struct fib6_info *from;
2327 struct rt6_info *nrt6;
2328
2329 rcu_read_lock();
2330 from = rcu_dereference(rt6->from);
2331 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2332 if (nrt6) {
2333 rt6_do_update_pmtu(nrt6, mtu);
2334 if (rt6_insert_exception(nrt6, from))
2335 dst_release_immediate(&nrt6->dst);
2336 }
2337 rcu_read_unlock();
2338 }
2339 }
2340
2341 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2342 struct sk_buff *skb, u32 mtu)
2343 {
2344 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2345 }
2346
2347 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2348 int oif, u32 mark, kuid_t uid)
2349 {
2350 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2351 struct dst_entry *dst;
2352 struct flowi6 fl6 = {
2353 .flowi6_oif = oif,
2354 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2355 .daddr = iph->daddr,
2356 .saddr = iph->saddr,
2357 .flowlabel = ip6_flowinfo(iph),
2358 .flowi6_uid = uid,
2359 };
2360
2361 dst = ip6_route_output(net, NULL, &fl6);
2362 if (!dst->error)
2363 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2364 dst_release(dst);
2365 }
2366 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
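/* Caller sketch (hedged): tunnel drivers and the ICMPv6 layer call
 * ip6_update_pmtu() when a Packet Too Big arrives for a packet they
 * sent, passing the mtu field from the ICMPv6 header. The helper
 * re-resolves the route for the embedded header and, via
 * __ip6_rt_update_pmtu(), records the reduced MTU as a cached exception
 * route with a bounded lifetime (ip6_rt_mtu_expires).
 */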
2367
2368 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2369 {
2370 int oif = sk->sk_bound_dev_if;
2371 struct dst_entry *dst;
2372
2373 if (!oif && skb->dev)
2374 oif = l3mdev_master_ifindex(skb->dev);
2375
2376 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2377
2378 dst = __sk_dst_get(sk);
2379 if (!dst || !dst->obsolete ||
2380 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2381 return;
2382
2383 bh_lock_sock(sk);
2384 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2385 ip6_datagram_dst_update(sk, false);
2386 bh_unlock_sock(sk);
2387 }
2388 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2389
2390 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2391 const struct flowi6 *fl6)
2392 {
2393 #ifdef CONFIG_IPV6_SUBTREES
2394 struct ipv6_pinfo *np = inet6_sk(sk);
2395 #endif
2396
2397 ip6_dst_store(sk, dst,
2398 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2399 &sk->sk_v6_daddr : NULL,
2400 #ifdef CONFIG_IPV6_SUBTREES
2401 ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2402 &np->saddr :
2403 #endif
2404 NULL);
2405 }
2406
2407 /* Handle redirects */
2408 struct ip6rd_flowi {
2409 struct flowi6 fl6;
2410 struct in6_addr gateway;
2411 };
2412
2413 static struct rt6_info *__ip6_route_redirect(struct net *net,
2414 struct fib6_table *table,
2415 struct flowi6 *fl6,
2416 const struct sk_buff *skb,
2417 int flags)
2418 {
2419 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2420 struct rt6_info *ret = NULL, *rt_cache;
2421 struct fib6_info *rt;
2422 struct fib6_node *fn;
2423
2424 /* Get the "current" route for this destination and
2425 * check if the redirect has come from an appropriate router.
2426 *
2427 * RFC 4861 specifies that redirects should only be
2428 * accepted if they come from the nexthop to the target.
2429 * Due to the way the routes are chosen, this notion
2430 * is a bit fuzzy and one might need to check all possible
2431 * routes.
2432 */
2433
2434 rcu_read_lock();
2435 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2436 restart:
2437 for_each_fib6_node_rt_rcu(fn) {
2438 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2439 continue;
2440 if (fib6_check_expired(rt))
2441 continue;
2442 if (rt->fib6_flags & RTF_REJECT)
2443 break;
2444 if (!(rt->fib6_flags & RTF_GATEWAY))
2445 continue;
2446 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2447 continue;
2448 /* rt_cache's gateway might be different from its 'parent'
2449 * in the case of an ip redirect.
2450 * So we keep searching in the exception table if the gateway
2451 * is different.
2452 */
2453 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2454 rt_cache = rt6_find_cached_rt(rt,
2455 &fl6->daddr,
2456 &fl6->saddr);
2457 if (rt_cache &&
2458 ipv6_addr_equal(&rdfl->gateway,
2459 &rt_cache->rt6i_gateway)) {
2460 ret = rt_cache;
2461 break;
2462 }
2463 continue;
2464 }
2465 break;
2466 }
2467
2468 if (!rt)
2469 rt = net->ipv6.fib6_null_entry;
2470 else if (rt->fib6_flags & RTF_REJECT) {
2471 ret = net->ipv6.ip6_null_entry;
2472 goto out;
2473 }
2474
2475 if (rt == net->ipv6.fib6_null_entry) {
2476 fn = fib6_backtrack(fn, &fl6->saddr);
2477 if (fn)
2478 goto restart;
2479 }
2480
2481 out:
2482 if (ret)
2483 ip6_hold_safe(net, &ret, true);
2484 else
2485 ret = ip6_create_rt_rcu(rt);
2486
2487 rcu_read_unlock();
2488
2489 trace_fib6_table_lookup(net, rt, table, fl6);
2490 return ret;
2491 }
2492
2493 static struct dst_entry *ip6_route_redirect(struct net *net,
2494 const struct flowi6 *fl6,
2495 const struct sk_buff *skb,
2496 const struct in6_addr *gateway)
2497 {
2498 int flags = RT6_LOOKUP_F_HAS_SADDR;
2499 struct ip6rd_flowi rdfl;
2500
2501 rdfl.fl6 = *fl6;
2502 rdfl.gateway = *gateway;
2503
2504 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2505 flags, __ip6_route_redirect);
2506 }
2507
2508 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2509 kuid_t uid)
2510 {
2511 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2512 struct dst_entry *dst;
2513 struct flowi6 fl6 = {
2514 .flowi6_iif = LOOPBACK_IFINDEX,
2515 .flowi6_oif = oif,
2516 .flowi6_mark = mark,
2517 .daddr = iph->daddr,
2518 .saddr = iph->saddr,
2519 .flowlabel = ip6_flowinfo(iph),
2520 .flowi6_uid = uid,
2521 };
2522
2523 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2524 rt6_do_redirect(dst, NULL, skb);
2525 dst_release(dst);
2526 }
2527 EXPORT_SYMBOL_GPL(ip6_redirect);
2528
2529 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2530 {
2531 const struct ipv6hdr *iph = ipv6_hdr(skb);
2532 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2533 struct dst_entry *dst;
2534 struct flowi6 fl6 = {
2535 .flowi6_iif = LOOPBACK_IFINDEX,
2536 .flowi6_oif = oif,
2537 .daddr = msg->dest,
2538 .saddr = iph->daddr,
2539 .flowi6_uid = sock_net_uid(net, NULL),
2540 };
2541
2542 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2543 rt6_do_redirect(dst, NULL, skb);
2544 dst_release(dst);
2545 }
2546
2547 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2548 {
2549 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2550 sk->sk_uid);
2551 }
2552 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2553
2554 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2555 {
2556 struct net_device *dev = dst->dev;
2557 unsigned int mtu = dst_mtu(dst);
2558 struct net *net = dev_net(dev);
2559
2560 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2561
2562 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2563 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2564
2565 /*
2566 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2567 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2568 * IPV6_MAXPLEN is also valid and means: "any MSS,
2569 * rely only on pmtu discovery"
2570 */
2571 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2572 mtu = IPV6_MAXPLEN;
2573 return mtu;
2574 }
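/* Worked example (illustrative): on a 1500-byte MTU link,
 * advmss = 1500 - sizeof(struct ipv6hdr) - sizeof(struct tcphdr)
 *        = 1500 - 40 - 20 = 1440,
 * the familiar default IPv6 TCP MSS on Ethernet.
 */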
2575
2576 static unsigned int ip6_mtu(const struct dst_entry *dst)
2577 {
2578 struct inet6_dev *idev;
2579 unsigned int mtu;
2580
2581 mtu = dst_metric_raw(dst, RTAX_MTU);
2582 if (mtu)
2583 goto out;
2584
2585 mtu = IPV6_MIN_MTU;
2586
2587 rcu_read_lock();
2588 idev = __in6_dev_get(dst->dev);
2589 if (idev)
2590 mtu = idev->cnf.mtu6;
2591 rcu_read_unlock();
2592
2593 out:
2594 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2595
2596 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2597 }
2598
2599 /* MTU selection:
2600 * 1. mtu on route is locked - use it
2601 * 2. mtu from nexthop exception
2602 * 3. mtu from egress device
2603 *
2604 * based on ip6_dst_mtu_forward and exception logic of
2605 * rt6_find_cached_rt; called with rcu_read_lock
2606 */
2607 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2608 struct in6_addr *saddr)
2609 {
2610 struct rt6_exception_bucket *bucket;
2611 struct rt6_exception *rt6_ex;
2612 struct in6_addr *src_key;
2613 struct inet6_dev *idev;
2614 u32 mtu = 0;
2615
2616 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2617 mtu = f6i->fib6_pmtu;
2618 if (mtu)
2619 goto out;
2620 }
2621
2622 src_key = NULL;
2623 #ifdef CONFIG_IPV6_SUBTREES
2624 if (f6i->fib6_src.plen)
2625 src_key = saddr;
2626 #endif
2627
2628 bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2629 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2630 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2631 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2632
2633 if (likely(!mtu)) {
2634 struct net_device *dev = fib6_info_nh_dev(f6i);
2635
2636 mtu = IPV6_MIN_MTU;
2637 idev = __in6_dev_get(dev);
2638 if (idev && idev->cnf.mtu6 > mtu)
2639 mtu = idev->cnf.mtu6;
2640 }
2641
2642 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2643 out:
2644 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2645 }
2646
2647 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2648 struct flowi6 *fl6)
2649 {
2650 struct dst_entry *dst;
2651 struct rt6_info *rt;
2652 struct inet6_dev *idev = in6_dev_get(dev);
2653 struct net *net = dev_net(dev);
2654
2655 if (unlikely(!idev))
2656 return ERR_PTR(-ENODEV);
2657
2658 rt = ip6_dst_alloc(net, dev, 0);
2659 if (unlikely(!rt)) {
2660 in6_dev_put(idev);
2661 dst = ERR_PTR(-ENOMEM);
2662 goto out;
2663 }
2664
2665 rt->dst.flags |= DST_HOST;
2666 rt->dst.input = ip6_input;
2667 rt->dst.output = ip6_output;
2668 rt->rt6i_gateway = fl6->daddr;
2669 rt->rt6i_dst.addr = fl6->daddr;
2670 rt->rt6i_dst.plen = 128;
2671 rt->rt6i_idev = idev;
2672 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2673
2674 /* Add this dst into uncached_list so that rt6_disable_ip() can
2675 * properly release the net_device.
2676 */
2677 rt6_uncached_list_add(rt);
2678 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2679
2680 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2681
2682 out:
2683 return dst;
2684 }
2685
2686 static int ip6_dst_gc(struct dst_ops *ops)
2687 {
2688 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2689 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2690 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2691 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2692 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2693 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2694 int entries;
2695
2696 entries = dst_entries_get_fast(ops);
2697 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2698 entries <= rt_max_size)
2699 goto out;
2700
2701 net->ipv6.ip6_rt_gc_expire++;
2702 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2703 entries = dst_entries_get_slow(ops);
2704 if (entries < ops->gc_thresh)
2705 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2706 out:
2707 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2708 return entries > rt_max_size;
2709 }
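/* Tuning sketch (values illustrative, not recommendations): the knobs
 * read above are exposed under /proc/sys/net/ipv6/route/, e.g.
 *
 *   sysctl -w net.ipv6.route.max_size=16384
 *   sysctl -w net.ipv6.route.gc_thresh=8192
 *
 * Raising them relaxes GC pressure on routers holding many dst entries;
 * gc_min_interval bounds how often this function does real work.
 */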
2710
2711 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2712 struct fib6_config *cfg,
2713 const struct in6_addr *gw_addr,
2714 u32 tbid, int flags)
2715 {
2716 struct flowi6 fl6 = {
2717 .flowi6_oif = cfg->fc_ifindex,
2718 .daddr = *gw_addr,
2719 .saddr = cfg->fc_prefsrc,
2720 };
2721 struct fib6_table *table;
2722 struct rt6_info *rt;
2723
2724 table = fib6_get_table(net, tbid);
2725 if (!table)
2726 return NULL;
2727
2728 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2729 flags |= RT6_LOOKUP_F_HAS_SADDR;
2730
2731 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2732 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2733
2734 /* if table lookup failed, fall back to full lookup */
2735 if (rt == net->ipv6.ip6_null_entry) {
2736 ip6_rt_put(rt);
2737 rt = NULL;
2738 }
2739
2740 return rt;
2741 }
2742
2743 static int ip6_route_check_nh_onlink(struct net *net,
2744 struct fib6_config *cfg,
2745 const struct net_device *dev,
2746 struct netlink_ext_ack *extack)
2747 {
2748 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2749 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2750 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2751 struct fib6_info *from;
2752 struct rt6_info *grt;
2753 int err;
2754
2755 err = 0;
2756 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2757 if (grt) {
2758 rcu_read_lock();
2759 from = rcu_dereference(grt->from);
2760 if (!grt->dst.error &&
2761 /* ignore match if it is the default route */
2762 from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2763 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2764 NL_SET_ERR_MSG(extack,
2765 "Nexthop has invalid gateway or device mismatch");
2766 err = -EINVAL;
2767 }
2768 rcu_read_unlock();
2769
2770 ip6_rt_put(grt);
2771 }
2772
2773 return err;
2774 }
2775
2776 static int ip6_route_check_nh(struct net *net,
2777 struct fib6_config *cfg,
2778 struct net_device **_dev,
2779 struct inet6_dev **idev)
2780 {
2781 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2782 struct net_device *dev = _dev ? *_dev : NULL;
2783 struct rt6_info *grt = NULL;
2784 int err = -EHOSTUNREACH;
2785
2786 if (cfg->fc_table) {
2787 int flags = RT6_LOOKUP_F_IFACE;
2788
2789 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2790 cfg->fc_table, flags);
2791 if (grt) {
2792 if (grt->rt6i_flags & RTF_GATEWAY ||
2793 (dev && dev != grt->dst.dev)) {
2794 ip6_rt_put(grt);
2795 grt = NULL;
2796 }
2797 }
2798 }
2799
2800 if (!grt)
2801 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2802
2803 if (!grt)
2804 goto out;
2805
2806 if (dev) {
2807 if (dev != grt->dst.dev) {
2808 ip6_rt_put(grt);
2809 goto out;
2810 }
2811 } else {
2812 *_dev = dev = grt->dst.dev;
2813 *idev = grt->rt6i_idev;
2814 dev_hold(dev);
2815 in6_dev_hold(grt->rt6i_idev);
2816 }
2817
2818 if (!(grt->rt6i_flags & RTF_GATEWAY))
2819 err = 0;
2820
2821 ip6_rt_put(grt);
2822
2823 out:
2824 return err;
2825 }
2826
2827 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2828 struct net_device **_dev, struct inet6_dev **idev,
2829 struct netlink_ext_ack *extack)
2830 {
2831 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2832 int gwa_type = ipv6_addr_type(gw_addr);
2833 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2834 const struct net_device *dev = *_dev;
2835 bool need_addr_check = !dev;
2836 int err = -EINVAL;
2837
2838 /* If gw_addr is local we can fail to detect this when the
2839 * address is still TENTATIVE (DAD in progress): rt6_lookup()
2840 * will return the already-added prefix route via the interface
2841 * the prefix route was assigned to, which might be non-loopback.
2842 */
2843 if (dev &&
2844 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2845 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2846 goto out;
2847 }
2848
2849 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2850 /* IPv6 normally requires the nexthop address to be
2851 * link-local; otherwise a router will not be able to
2852 * send redirects.
2853 * That is usually right, but in some (rare!) circumstances
2854 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2855 * some exceptions. --ANK
2856 * We allow IPv4-mapped nexthops to support RFC 4798-style
2857 * addressing.
2858 */
2859 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2860 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2861 goto out;
2862 }
2863
2864 if (cfg->fc_flags & RTNH_F_ONLINK)
2865 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2866 else
2867 err = ip6_route_check_nh(net, cfg, _dev, idev);
2868
2869 if (err)
2870 goto out;
2871 }
2872
2873 /* reload in case device was changed */
2874 dev = *_dev;
2875
2876 err = -EINVAL;
2877 if (!dev) {
2878 NL_SET_ERR_MSG(extack, "Egress device not specified");
2879 goto out;
2880 } else if (dev->flags & IFF_LOOPBACK) {
2881 NL_SET_ERR_MSG(extack,
2882 "Egress device can not be loopback device for this route");
2883 goto out;
2884 }
2885
2886 /* if we did not check gw_addr above, do so now that the
2887 * egress device has been resolved.
2888 */
2889 if (need_addr_check &&
2890 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2891 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2892 goto out;
2893 }
2894
2895 err = 0;
2896 out:
2897 return err;
2898 }
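/* Example (assuming iproute2): the RTNH_F_ONLINK branch above is what
 * admits a route whose gateway is not covered by any connected prefix:
 *
 *   ip -6 route add 2001:db8:1::/64 via 2001:db8:2::1 dev eth0 onlink
 *
 * Without "onlink", ip6_route_check_nh() must instead resolve the
 * gateway through an already-installed route.
 */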
2899
2900 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2901 gfp_t gfp_flags,
2902 struct netlink_ext_ack *extack)
2903 {
2904 struct net *net = cfg->fc_nlinfo.nl_net;
2905 struct fib6_info *rt = NULL;
2906 struct net_device *dev = NULL;
2907 struct inet6_dev *idev = NULL;
2908 struct fib6_table *table;
2909 int addr_type;
2910 int err = -EINVAL;
2911
2912 /* RTF_PCPU is an internal flag; can not be set by userspace */
2913 if (cfg->fc_flags & RTF_PCPU) {
2914 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2915 goto out;
2916 }
2917
2918 /* RTF_CACHE is an internal flag; can not be set by userspace */
2919 if (cfg->fc_flags & RTF_CACHE) {
2920 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2921 goto out;
2922 }
2923
2924 if (cfg->fc_type > RTN_MAX) {
2925 NL_SET_ERR_MSG(extack, "Invalid route type");
2926 goto out;
2927 }
2928
2929 if (cfg->fc_dst_len > 128) {
2930 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2931 goto out;
2932 }
2933 if (cfg->fc_src_len > 128) {
2934 NL_SET_ERR_MSG(extack, "Invalid source address length");
2935 goto out;
2936 }
2937 #ifndef CONFIG_IPV6_SUBTREES
2938 if (cfg->fc_src_len) {
2939 NL_SET_ERR_MSG(extack,
2940 "Specifying source address requires IPV6_SUBTREES to be enabled");
2941 goto out;
2942 }
2943 #endif
2944 if (cfg->fc_ifindex) {
2945 err = -ENODEV;
2946 dev = dev_get_by_index(net, cfg->fc_ifindex);
2947 if (!dev)
2948 goto out;
2949 idev = in6_dev_get(dev);
2950 if (!idev)
2951 goto out;
2952 }
2953
2954 if (cfg->fc_metric == 0)
2955 cfg->fc_metric = IP6_RT_PRIO_USER;
2956
2957 if (cfg->fc_flags & RTNH_F_ONLINK) {
2958 if (!dev) {
2959 NL_SET_ERR_MSG(extack,
2960 "Nexthop device required for onlink");
2961 err = -ENODEV;
2962 goto out;
2963 }
2964
2965 if (!(dev->flags & IFF_UP)) {
2966 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2967 err = -ENETDOWN;
2968 goto out;
2969 }
2970 }
2971
2972 err = -ENOBUFS;
2973 if (cfg->fc_nlinfo.nlh &&
2974 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2975 table = fib6_get_table(net, cfg->fc_table);
2976 if (!table) {
2977 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2978 table = fib6_new_table(net, cfg->fc_table);
2979 }
2980 } else {
2981 table = fib6_new_table(net, cfg->fc_table);
2982 }
2983
2984 if (!table)
2985 goto out;
2986
2987 err = -ENOMEM;
2988 rt = fib6_info_alloc(gfp_flags);
2989 if (!rt)
2990 goto out;
2991
2992 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
2993 extack);
2994 if (IS_ERR(rt->fib6_metrics)) {
2995 err = PTR_ERR(rt->fib6_metrics);
2996 /* Do not leave garbage there. */
2997 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
2998 goto out;
2999 }
3000
3001 if (cfg->fc_flags & RTF_ADDRCONF)
3002 rt->dst_nocount = true;
3003
3004 if (cfg->fc_flags & RTF_EXPIRES)
3005 fib6_set_expires(rt, jiffies +
3006 clock_t_to_jiffies(cfg->fc_expires));
3007 else
3008 fib6_clean_expires(rt);
3009
3010 if (cfg->fc_protocol == RTPROT_UNSPEC)
3011 cfg->fc_protocol = RTPROT_BOOT;
3012 rt->fib6_protocol = cfg->fc_protocol;
3013
3014 addr_type = ipv6_addr_type(&cfg->fc_dst);
3015
3016 if (cfg->fc_encap) {
3017 struct lwtunnel_state *lwtstate;
3018
3019 err = lwtunnel_build_state(cfg->fc_encap_type,
3020 cfg->fc_encap, AF_INET6, cfg,
3021 &lwtstate, extack);
3022 if (err)
3023 goto out;
3024 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3025 }
3026
3027 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3028 rt->fib6_dst.plen = cfg->fc_dst_len;
3029 if (rt->fib6_dst.plen == 128)
3030 rt->dst_host = true;
3031
3032 #ifdef CONFIG_IPV6_SUBTREES
3033 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3034 rt->fib6_src.plen = cfg->fc_src_len;
3035 #endif
3036
3037 rt->fib6_metric = cfg->fc_metric;
3038 rt->fib6_nh.nh_weight = 1;
3039
3040 rt->fib6_type = cfg->fc_type;
3041
3042 /* We cannot add true routes via loopback here, as
3043 they would result in kernel looping; promote them to reject routes
3044 */
3045 if ((cfg->fc_flags & RTF_REJECT) ||
3046 (dev && (dev->flags & IFF_LOOPBACK) &&
3047 !(addr_type & IPV6_ADDR_LOOPBACK) &&
3048 !(cfg->fc_flags & RTF_LOCAL))) {
3049 /* hold loopback dev/idev if we haven't done so. */
3050 if (dev != net->loopback_dev) {
3051 if (dev) {
3052 dev_put(dev);
3053 in6_dev_put(idev);
3054 }
3055 dev = net->loopback_dev;
3056 dev_hold(dev);
3057 idev = in6_dev_get(dev);
3058 if (!idev) {
3059 err = -ENODEV;
3060 goto out;
3061 }
3062 }
3063 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3064 goto install_route;
3065 }
3066
3067 if (cfg->fc_flags & RTF_GATEWAY) {
3068 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3069 if (err)
3070 goto out;
3071
3072 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3073 }
3074
3075 err = -ENODEV;
3076 if (!dev)
3077 goto out;
3078
3079 if (idev->cnf.disable_ipv6) {
3080 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3081 err = -EACCES;
3082 goto out;
3083 }
3084
3085 if (!(dev->flags & IFF_UP)) {
3086 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3087 err = -ENETDOWN;
3088 goto out;
3089 }
3090
3091 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3092 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3093 NL_SET_ERR_MSG(extack, "Invalid source address");
3094 err = -EINVAL;
3095 goto out;
3096 }
3097 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3098 rt->fib6_prefsrc.plen = 128;
3099 } else
3100 rt->fib6_prefsrc.plen = 0;
3101
3102 rt->fib6_flags = cfg->fc_flags;
3103
3104 install_route:
3105 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3106 !netif_carrier_ok(dev))
3107 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3108 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3109 rt->fib6_nh.nh_dev = dev;
3110 rt->fib6_table = table;
3111
3112 if (idev)
3113 in6_dev_put(idev);
3114
3115 return rt;
3116 out:
3117 if (dev)
3118 dev_put(dev);
3119 if (idev)
3120 in6_dev_put(idev);
3121
3122 fib6_info_release(rt);
3123 return ERR_PTR(err);
3124 }
3125
3126 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3127 struct netlink_ext_ack *extack)
3128 {
3129 struct fib6_info *rt;
3130 int err;
3131
3132 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3133 if (IS_ERR(rt))
3134 return PTR_ERR(rt);
3135
3136 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3137 fib6_info_release(rt);
3138
3139 return err;
3140 }
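/* Path sketch (hedged): "ip -6 route add ..." from iproute2 arrives as
 * an RTM_NEWROUTE netlink message, is translated by rtm_to_fib6_config()
 * (below) into the struct fib6_config consumed here, and the resulting
 * fib6_info is linked into the FIB tree by __ip6_ins_rt().
 */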
3141
3142 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3143 {
3144 struct net *net = info->nl_net;
3145 struct fib6_table *table;
3146 int err;
3147
3148 if (rt == net->ipv6.fib6_null_entry) {
3149 err = -ENOENT;
3150 goto out;
3151 }
3152
3153 table = rt->fib6_table;
3154 spin_lock_bh(&table->tb6_lock);
3155 err = fib6_del(rt, info);
3156 spin_unlock_bh(&table->tb6_lock);
3157
3158 out:
3159 fib6_info_release(rt);
3160 return err;
3161 }
3162
3163 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3164 {
3165 struct nl_info info = { .nl_net = net };
3166
3167 return __ip6_del_rt(rt, &info);
3168 }
3169
3170 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3171 {
3172 struct nl_info *info = &cfg->fc_nlinfo;
3173 struct net *net = info->nl_net;
3174 struct sk_buff *skb = NULL;
3175 struct fib6_table *table;
3176 int err = -ENOENT;
3177
3178 if (rt == net->ipv6.fib6_null_entry)
3179 goto out_put;
3180 table = rt->fib6_table;
3181 spin_lock_bh(&table->tb6_lock);
3182
3183 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3184 struct fib6_info *sibling, *next_sibling;
3185
3186 /* prefer to send a single notification with all hops */
3187 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3188 if (skb) {
3189 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3190
3191 if (rt6_fill_node(net, skb, rt, NULL,
3192 NULL, NULL, 0, RTM_DELROUTE,
3193 info->portid, seq, 0) < 0) {
3194 kfree_skb(skb);
3195 skb = NULL;
3196 } else
3197 info->skip_notify = 1;
3198 }
3199
3200 list_for_each_entry_safe(sibling, next_sibling,
3201 &rt->fib6_siblings,
3202 fib6_siblings) {
3203 err = fib6_del(sibling, info);
3204 if (err)
3205 goto out_unlock;
3206 }
3207 }
3208
3209 err = fib6_del(rt, info);
3210 out_unlock:
3211 spin_unlock_bh(&table->tb6_lock);
3212 out_put:
3213 fib6_info_release(rt);
3214
3215 if (skb) {
3216 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3217 info->nlh, gfp_any());
3218 }
3219 return err;
3220 }
3221
3222 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3223 {
3224 int rc = -ESRCH;
3225
3226 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3227 goto out;
3228
3229 if (cfg->fc_flags & RTF_GATEWAY &&
3230 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3231 goto out;
3232
3233 rc = rt6_remove_exception_rt(rt);
3234 out:
3235 return rc;
3236 }
3237
3238 static int ip6_route_del(struct fib6_config *cfg,
3239 struct netlink_ext_ack *extack)
3240 {
3241 struct rt6_info *rt_cache;
3242 struct fib6_table *table;
3243 struct fib6_info *rt;
3244 struct fib6_node *fn;
3245 int err = -ESRCH;
3246
3247 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3248 if (!table) {
3249 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3250 return err;
3251 }
3252
3253 rcu_read_lock();
3254
3255 fn = fib6_locate(&table->tb6_root,
3256 &cfg->fc_dst, cfg->fc_dst_len,
3257 &cfg->fc_src, cfg->fc_src_len,
3258 !(cfg->fc_flags & RTF_CACHE));
3259
3260 if (fn) {
3261 for_each_fib6_node_rt_rcu(fn) {
3262 if (cfg->fc_flags & RTF_CACHE) {
3263 int rc;
3264
3265 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3266 &cfg->fc_src);
3267 if (rt_cache) {
3268 rc = ip6_del_cached_rt(rt_cache, cfg);
3269 if (rc != -ESRCH) {
3270 rcu_read_unlock();
3271 return rc;
3272 }
3273 }
3274 continue;
3275 }
3276 if (cfg->fc_ifindex &&
3277 (!rt->fib6_nh.nh_dev ||
3278 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3279 continue;
3280 if (cfg->fc_flags & RTF_GATEWAY &&
3281 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3282 continue;
3283 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3284 continue;
3285 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3286 continue;
3287 if (!fib6_info_hold_safe(rt))
3288 continue;
3289 rcu_read_unlock();
3290
3291 /* if a gateway was specified, delete only that one hop */
3292 if (cfg->fc_flags & RTF_GATEWAY)
3293 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3294
3295 return __ip6_del_rt_siblings(rt, cfg);
3296 }
3297 }
3298 rcu_read_unlock();
3299
3300 return err;
3301 }
3302
3303 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3304 {
3305 struct netevent_redirect netevent;
3306 struct rt6_info *rt, *nrt = NULL;
3307 struct ndisc_options ndopts;
3308 struct inet6_dev *in6_dev;
3309 struct neighbour *neigh;
3310 struct fib6_info *from;
3311 struct rd_msg *msg;
3312 int optlen, on_link;
3313 u8 *lladdr;
3314
3315 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3316 optlen -= sizeof(*msg);
3317
3318 if (optlen < 0) {
3319 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3320 return;
3321 }
3322
3323 msg = (struct rd_msg *)icmp6_hdr(skb);
3324
3325 if (ipv6_addr_is_multicast(&msg->dest)) {
3326 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3327 return;
3328 }
3329
3330 on_link = 0;
3331 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3332 on_link = 1;
3333 } else if (ipv6_addr_type(&msg->target) !=
3334 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3335 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3336 return;
3337 }
3338
3339 in6_dev = __in6_dev_get(skb->dev);
3340 if (!in6_dev)
3341 return;
3342 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3343 return;
3344
3345 /* RFC2461 8.1:
3346 * The IP source address of the Redirect MUST be the same as the current
3347 * first-hop router for the specified ICMP Destination Address.
3348 */
3349
3350 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3351 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3352 return;
3353 }
3354
3355 lladdr = NULL;
3356 if (ndopts.nd_opts_tgt_lladdr) {
3357 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3358 skb->dev);
3359 if (!lladdr) {
3360 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3361 return;
3362 }
3363 }
3364
3365 rt = (struct rt6_info *) dst;
3366 if (rt->rt6i_flags & RTF_REJECT) {
3367 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3368 return;
3369 }
3370
3371 /* Redirect received -> path was valid.
3372 * Redirects are sent only in response to data packets,
3373 * so this nexthop is apparently reachable. --ANK
3374 */
3375 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3376
3377 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3378 if (!neigh)
3379 return;
3380
3381 /*
3382 * We have finally decided to accept it.
3383 */
3384
3385 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3386 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3387 NEIGH_UPDATE_F_OVERRIDE|
3388 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3389 NEIGH_UPDATE_F_ISROUTER)),
3390 NDISC_REDIRECT, &ndopts);
3391
3392 rcu_read_lock();
3393 from = rcu_dereference(rt->from);
3394 /* This fib6_info_hold() is safe here because we hold reference to rt
3395 * and rt already holds reference to fib6_info.
3396 */
3397 fib6_info_hold(from);
3398 rcu_read_unlock();
3399
3400 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3401 if (!nrt)
3402 goto out;
3403
3404 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3405 if (on_link)
3406 nrt->rt6i_flags &= ~RTF_GATEWAY;
3407
3408 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3409
3410 /* No need to remove rt from the exception table if rt is
3411 * a cached route because rt6_insert_exception()
3412 * takes care of it.
3413 */
3414 if (rt6_insert_exception(nrt, from)) {
3415 dst_release_immediate(&nrt->dst);
3416 goto out;
3417 }
3418
3419 netevent.old = &rt->dst;
3420 netevent.new = &nrt->dst;
3421 netevent.daddr = &msg->dest;
3422 netevent.neigh = neigh;
3423 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3424
3425 out:
3426 fib6_info_release(from);
3427 neigh_release(neigh);
3428 }
3429
3430 #ifdef CONFIG_IPV6_ROUTE_INFO
3431 static struct fib6_info *rt6_get_route_info(struct net *net,
3432 const struct in6_addr *prefix, int prefixlen,
3433 const struct in6_addr *gwaddr,
3434 struct net_device *dev)
3435 {
3436 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3437 int ifindex = dev->ifindex;
3438 struct fib6_node *fn;
3439 struct fib6_info *rt = NULL;
3440 struct fib6_table *table;
3441
3442 table = fib6_get_table(net, tb_id);
3443 if (!table)
3444 return NULL;
3445
3446 rcu_read_lock();
3447 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3448 if (!fn)
3449 goto out;
3450
3451 for_each_fib6_node_rt_rcu(fn) {
3452 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3453 continue;
3454 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3455 continue;
3456 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3457 continue;
3458 if (!fib6_info_hold_safe(rt))
3459 continue;
3460 break;
3461 }
3462 out:
3463 rcu_read_unlock();
3464 return rt;
3465 }
3466
3467 static struct fib6_info *rt6_add_route_info(struct net *net,
3468 const struct in6_addr *prefix, int prefixlen,
3469 const struct in6_addr *gwaddr,
3470 struct net_device *dev,
3471 unsigned int pref)
3472 {
3473 struct fib6_config cfg = {
3474 .fc_metric = IP6_RT_PRIO_USER,
3475 .fc_ifindex = dev->ifindex,
3476 .fc_dst_len = prefixlen,
3477 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3478 RTF_UP | RTF_PREF(pref),
3479 .fc_protocol = RTPROT_RA,
3480 .fc_type = RTN_UNICAST,
3481 .fc_nlinfo.portid = 0,
3482 .fc_nlinfo.nlh = NULL,
3483 .fc_nlinfo.nl_net = net,
3484 };
3485
3486 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3487 cfg.fc_dst = *prefix;
3488 cfg.fc_gateway = *gwaddr;
3489
3490 /* We should treat it as a default route if prefix length is 0. */
3491 if (!prefixlen)
3492 cfg.fc_flags |= RTF_DEFAULT;
3493
3494 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3495
3496 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3497 }
3498 #endif
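/* Editorial note: the two helpers above implement the Route Information
 * option from Router Advertisements (RFC 4191); rt6_add_route_info()
 * installs a more-specific route learned from an RA, and a zero prefix
 * length is promoted to a default route (RTF_DEFAULT).
 */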
3499
3500 struct fib6_info *rt6_get_dflt_router(struct net *net,
3501 const struct in6_addr *addr,
3502 struct net_device *dev)
3503 {
3504 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3505 struct fib6_info *rt;
3506 struct fib6_table *table;
3507
3508 table = fib6_get_table(net, tb_id);
3509 if (!table)
3510 return NULL;
3511
3512 rcu_read_lock();
3513 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3514 if (dev == rt->fib6_nh.nh_dev &&
3515 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3516 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3517 break;
3518 }
3519 if (rt && !fib6_info_hold_safe(rt))
3520 rt = NULL;
3521 rcu_read_unlock();
3522 return rt;
3523 }
3524
3525 struct fib6_info *rt6_add_dflt_router(struct net *net,
3526 const struct in6_addr *gwaddr,
3527 struct net_device *dev,
3528 unsigned int pref)
3529 {
3530 struct fib6_config cfg = {
3531 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3532 .fc_metric = IP6_RT_PRIO_USER,
3533 .fc_ifindex = dev->ifindex,
3534 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3535 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3536 .fc_protocol = RTPROT_RA,
3537 .fc_type = RTN_UNICAST,
3538 .fc_nlinfo.portid = 0,
3539 .fc_nlinfo.nlh = NULL,
3540 .fc_nlinfo.nl_net = net,
3541 };
3542
3543 cfg.fc_gateway = *gwaddr;
3544
3545 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3546 struct fib6_table *table;
3547
3548 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3549 if (table)
3550 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3551 }
3552
3553 return rt6_get_dflt_router(net, gwaddr, dev);
3554 }
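/* Illustrative result (assuming iproute2): an RA from fe80::1 on eth0
 * typically shows up as
 *
 *   default via fe80::1 dev eth0 proto ra metric 1024 expires 1784sec
 *
 * in "ip -6 route"; RTF_EXPIRES ties the entry's lifetime to the RA's
 * router lifetime field.
 */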
3555
3556 static void __rt6_purge_dflt_routers(struct net *net,
3557 struct fib6_table *table)
3558 {
3559 struct fib6_info *rt;
3560
3561 restart:
3562 rcu_read_lock();
3563 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3564 struct net_device *dev = fib6_info_nh_dev(rt);
3565 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3566
3567 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3568 (!idev || idev->cnf.accept_ra != 2) &&
3569 fib6_info_hold_safe(rt)) {
3570 rcu_read_unlock();
3571 ip6_del_rt(net, rt);
3572 goto restart;
3573 }
3574 }
3575 rcu_read_unlock();
3576
3577 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3578 }
3579
3580 void rt6_purge_dflt_routers(struct net *net)
3581 {
3582 struct fib6_table *table;
3583 struct hlist_head *head;
3584 unsigned int h;
3585
3586 rcu_read_lock();
3587
3588 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3589 head = &net->ipv6.fib_table_hash[h];
3590 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3591 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3592 __rt6_purge_dflt_routers(net, table);
3593 }
3594 }
3595
3596 rcu_read_unlock();
3597 }
3598
3599 static void rtmsg_to_fib6_config(struct net *net,
3600 struct in6_rtmsg *rtmsg,
3601 struct fib6_config *cfg)
3602 {
3603 *cfg = (struct fib6_config){
3604 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3605 : RT6_TABLE_MAIN,
3606 .fc_ifindex = rtmsg->rtmsg_ifindex,
3607 .fc_metric = rtmsg->rtmsg_metric,
3608 .fc_expires = rtmsg->rtmsg_info,
3609 .fc_dst_len = rtmsg->rtmsg_dst_len,
3610 .fc_src_len = rtmsg->rtmsg_src_len,
3611 .fc_flags = rtmsg->rtmsg_flags,
3612 .fc_type = rtmsg->rtmsg_type,
3613
3614 .fc_nlinfo.nl_net = net,
3615
3616 .fc_dst = rtmsg->rtmsg_dst,
3617 .fc_src = rtmsg->rtmsg_src,
3618 .fc_gateway = rtmsg->rtmsg_gateway,
3619 };
3620 }
3621
3622 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3623 {
3624 struct fib6_config cfg;
3625 struct in6_rtmsg rtmsg;
3626 int err;
3627
3628 switch (cmd) {
3629 case SIOCADDRT: /* Add a route */
3630 case SIOCDELRT: /* Delete a route */
3631 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3632 return -EPERM;
3633 err = copy_from_user(&rtmsg, arg,
3634 sizeof(struct in6_rtmsg));
3635 if (err)
3636 return -EFAULT;
3637
3638 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3639
3640 rtnl_lock();
3641 switch (cmd) {
3642 case SIOCADDRT:
3643 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3644 break;
3645 case SIOCDELRT:
3646 err = ip6_route_del(&cfg, NULL);
3647 break;
3648 default:
3649 err = -EINVAL;
3650 }
3651 rtnl_unlock();
3652
3653 return err;
3654 }
3655
3656 return -EINVAL;
3657 }
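/* Historical note with a hedged example: SIOCADDRT/SIOCDELRT are the
 * legacy ioctls used by net-tools, e.g.
 *
 *   route -A inet6 add 2001:db8::/64 gw 2001:db8::1
 *
 * Modern iproute2 skips this path entirely and speaks rtnetlink (see
 * rtm_to_fib6_config() below).
 */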
3658
3659 /*
3660 * Drop the packet on the floor
3661 */
3662
3663 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3664 {
3665 int type;
3666 struct dst_entry *dst = skb_dst(skb);
3667 switch (ipstats_mib_noroutes) {
3668 case IPSTATS_MIB_INNOROUTES:
3669 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3670 if (type == IPV6_ADDR_ANY) {
3671 IP6_INC_STATS(dev_net(dst->dev),
3672 __in6_dev_get_safely(skb->dev),
3673 IPSTATS_MIB_INADDRERRORS);
3674 break;
3675 }
3676 /* FALLTHROUGH */
3677 case IPSTATS_MIB_OUTNOROUTES:
3678 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3679 ipstats_mib_noroutes);
3680 break;
3681 }
3682 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3683 kfree_skb(skb);
3684 return 0;
3685 }
3686
3687 static int ip6_pkt_discard(struct sk_buff *skb)
3688 {
3689 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3690 }
3691
3692 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3693 {
3694 skb->dev = skb_dst(skb)->dev;
3695 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3696 }
3697
3698 static int ip6_pkt_prohibit(struct sk_buff *skb)
3699 {
3700 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3701 }
3702
3703 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3704 {
3705 skb->dev = skb_dst(skb)->dev;
3706 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3707 }
3708
3709 /*
3710 * Allocate a dst for local (unicast / anycast) address.
3711 */
3712
3713 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3714 struct inet6_dev *idev,
3715 const struct in6_addr *addr,
3716 bool anycast, gfp_t gfp_flags)
3717 {
3718 u32 tb_id;
3719 struct net_device *dev = idev->dev;
3720 struct fib6_info *f6i;
3721
3722 f6i = fib6_info_alloc(gfp_flags);
3723 if (!f6i)
3724 return ERR_PTR(-ENOMEM);
3725
3726 f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL);
3727 f6i->dst_nocount = true;
3728 f6i->dst_host = true;
3729 f6i->fib6_protocol = RTPROT_KERNEL;
3730 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3731 if (anycast) {
3732 f6i->fib6_type = RTN_ANYCAST;
3733 f6i->fib6_flags |= RTF_ANYCAST;
3734 } else {
3735 f6i->fib6_type = RTN_LOCAL;
3736 f6i->fib6_flags |= RTF_LOCAL;
3737 }
3738
3739 f6i->fib6_nh.nh_gw = *addr;
3740 dev_hold(dev);
3741 f6i->fib6_nh.nh_dev = dev;
3742 f6i->fib6_dst.addr = *addr;
3743 f6i->fib6_dst.plen = 128;
3744 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3745 f6i->fib6_table = fib6_get_table(net, tb_id);
3746
3747 return f6i;
3748 }
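/* Observation sketch (assuming iproute2): these host routes land in the
 * local table (RT6_TABLE_LOCAL unless an l3mdev overrides it), so
 *
 *   ip -6 route show table local
 *
 * lists "local <addr> ... proto kernel" entries for unicast addresses
 * and "anycast ..." entries for anycast ones.
 */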
3749
3750 /* remove deleted ip from prefsrc entries */
3751 struct arg_dev_net_ip {
3752 struct net_device *dev;
3753 struct net *net;
3754 struct in6_addr *addr;
3755 };
3756
3757 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3758 {
3759 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3760 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3761 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3762
3763 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3764 rt != net->ipv6.fib6_null_entry &&
3765 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3766 spin_lock_bh(&rt6_exception_lock);
3767 /* remove prefsrc entry */
3768 rt->fib6_prefsrc.plen = 0;
3769 spin_unlock_bh(&rt6_exception_lock);
3770 }
3771 return 0;
3772 }
3773
3774 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3775 {
3776 struct net *net = dev_net(ifp->idev->dev);
3777 struct arg_dev_net_ip adni = {
3778 .dev = ifp->idev->dev,
3779 .net = net,
3780 .addr = &ifp->addr,
3781 };
3782 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3783 }
3784
3785 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3786
3787 /* Remove routers and update dst entries when a gateway turns into a host. */
3788 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3789 {
3790 struct in6_addr *gateway = (struct in6_addr *)arg;
3791
3792 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3793 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3794 return -1;
3795 }
3796
3797 /* Further clean up cached routes in exception table.
3798 * This is needed because cached route may have a different
3799 * gateway than its 'parent' in the case of an ip redirect.
3800 */
3801 rt6_exceptions_clean_tohost(rt, gateway);
3802
3803 return 0;
3804 }
3805
3806 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3807 {
3808 fib6_clean_all(net, fib6_clean_tohost, gateway);
3809 }
3810
3811 struct arg_netdev_event {
3812 const struct net_device *dev;
3813 union {
3814 unsigned int nh_flags;
3815 unsigned long event;
3816 };
3817 };
3818
3819 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3820 {
3821 struct fib6_info *iter;
3822 struct fib6_node *fn;
3823
3824 fn = rcu_dereference_protected(rt->fib6_node,
3825 lockdep_is_held(&rt->fib6_table->tb6_lock));
3826 iter = rcu_dereference_protected(fn->leaf,
3827 lockdep_is_held(&rt->fib6_table->tb6_lock));
3828 while (iter) {
3829 if (iter->fib6_metric == rt->fib6_metric &&
3830 rt6_qualify_for_ecmp(iter))
3831 return iter;
3832 iter = rcu_dereference_protected(iter->fib6_next,
3833 lockdep_is_held(&rt->fib6_table->tb6_lock));
3834 }
3835
3836 return NULL;
3837 }
3838
3839 static bool rt6_is_dead(const struct fib6_info *rt)
3840 {
3841 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3842 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3843 fib6_ignore_linkdown(rt)))
3844 return true;
3845
3846 return false;
3847 }
3848
3849 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3850 {
3851 struct fib6_info *iter;
3852 int total = 0;
3853
3854 if (!rt6_is_dead(rt))
3855 total += rt->fib6_nh.nh_weight;
3856
3857 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3858 if (!rt6_is_dead(iter))
3859 total += iter->fib6_nh.nh_weight;
3860 }
3861
3862 return total;
3863 }
3864
3865 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3866 {
3867 int upper_bound = -1;
3868
3869 if (!rt6_is_dead(rt)) {
3870 *weight += rt->fib6_nh.nh_weight;
3871 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3872 total) - 1;
3873 }
3874 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3875 }
3876
3877 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3878 {
3879 struct fib6_info *iter;
3880 int weight = 0;
3881
3882 rt6_upper_bound_set(rt, &weight, total);
3883
3884 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3885 rt6_upper_bound_set(iter, &weight, total);
3886 }
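/* Worked example (illustrative): two live siblings with weights 1 and 2
 * give total = 3; the cumulative-weight formula above yields
 *
 *   nh0: DIV_ROUND_CLOSEST_ULL(1ULL << 31, 3) - 1 =  715827882
 *   nh1: DIV_ROUND_CLOSEST_ULL(3ULL << 31, 3) - 1 = 2147483647
 *
 * A 31-bit flow hash <= 715827882 selects nh0, anything larger selects
 * nh1 -- roughly a 1:2 split. Dead nexthops keep -1 and never match.
 */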
3887
3888 void rt6_multipath_rebalance(struct fib6_info *rt)
3889 {
3890 struct fib6_info *first;
3891 int total;
3892
3893 /* If the entire multipath route was marked for flushing,
3894 * there is no need to rebalance upon the removal of every
3895 * sibling route.
3896 */
3897 if (!rt->fib6_nsiblings || rt->should_flush)
3898 return;
3899
3900 /* During lookup routes are evaluated in order, so we need to
3901 * make sure upper bounds are assigned from the first sibling
3902 * onwards.
3903 */
3904 first = rt6_multipath_first_sibling(rt);
3905 if (WARN_ON_ONCE(!first))
3906 return;
3907
3908 total = rt6_multipath_total_weight(first);
3909 rt6_multipath_upper_bound_set(first, total);
3910 }
3911
3912 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3913 {
3914 const struct arg_netdev_event *arg = p_arg;
3915 struct net *net = dev_net(arg->dev);
3916
3917 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3918 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3919 fib6_update_sernum_upto_root(net, rt);
3920 rt6_multipath_rebalance(rt);
3921 }
3922
3923 return 0;
3924 }
3925
3926 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3927 {
3928 struct arg_netdev_event arg = {
3929 .dev = dev,
3930 {
3931 .nh_flags = nh_flags,
3932 },
3933 };
3934
3935 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3936 arg.nh_flags |= RTNH_F_LINKDOWN;
3937
3938 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3939 }
3940
3941 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3942 const struct net_device *dev)
3943 {
3944 struct fib6_info *iter;
3945
3946 if (rt->fib6_nh.nh_dev == dev)
3947 return true;
3948 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3949 if (iter->fib6_nh.nh_dev == dev)
3950 return true;
3951
3952 return false;
3953 }
3954
3955 static void rt6_multipath_flush(struct fib6_info *rt)
3956 {
3957 struct fib6_info *iter;
3958
3959 rt->should_flush = 1;
3960 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3961 iter->should_flush = 1;
3962 }
3963
3964 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3965 const struct net_device *down_dev)
3966 {
3967 struct fib6_info *iter;
3968 unsigned int dead = 0;
3969
3970 if (rt->fib6_nh.nh_dev == down_dev ||
3971 rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3972 dead++;
3973 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3974 if (iter->fib6_nh.nh_dev == down_dev ||
3975 iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3976 dead++;
3977
3978 return dead;
3979 }
3980
3981 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3982 const struct net_device *dev,
3983 unsigned int nh_flags)
3984 {
3985 struct fib6_info *iter;
3986
3987 if (rt->fib6_nh.nh_dev == dev)
3988 rt->fib6_nh.nh_flags |= nh_flags;
3989 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3990 if (iter->fib6_nh.nh_dev == dev)
3991 iter->fib6_nh.nh_flags |= nh_flags;
3992 }
3993
3994 /* called with write lock held for table with rt */
3995 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
3996 {
3997 const struct arg_netdev_event *arg = p_arg;
3998 const struct net_device *dev = arg->dev;
3999 struct net *net = dev_net(dev);
4000
4001 if (rt == net->ipv6.fib6_null_entry)
4002 return 0;
4003
4004 switch (arg->event) {
4005 case NETDEV_UNREGISTER:
4006 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4007 case NETDEV_DOWN:
4008 if (rt->should_flush)
4009 return -1;
4010 if (!rt->fib6_nsiblings)
4011 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4012 if (rt6_multipath_uses_dev(rt, dev)) {
4013 unsigned int count;
4014
4015 count = rt6_multipath_dead_count(rt, dev);
4016 if (rt->fib6_nsiblings + 1 == count) {
4017 rt6_multipath_flush(rt);
4018 return -1;
4019 }
4020 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4021 RTNH_F_LINKDOWN);
4022 fib6_update_sernum(net, rt);
4023 rt6_multipath_rebalance(rt);
4024 }
4025 return -2;
4026 case NETDEV_CHANGE:
4027 if (rt->fib6_nh.nh_dev != dev ||
4028 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4029 break;
4030 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4031 rt6_multipath_rebalance(rt);
4032 break;
4033 }
4034
4035 return 0;
4036 }
4037
4038 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4039 {
4040 struct arg_netdev_event arg = {
4041 .dev = dev,
4042 {
4043 .event = event,
4044 },
4045 };
4046 struct net *net = dev_net(dev);
4047
4048 if (net->ipv6.sysctl.skip_notify_on_dev_down)
4049 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4050 else
4051 fib6_clean_all(net, fib6_ifdown, &arg);
4052 }
4053
4054 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4055 {
4056 rt6_sync_down_dev(dev, event);
4057 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4058 neigh_ifdown(&nd_tbl, dev);
4059 }
4060
4061 struct rt6_mtu_change_arg {
4062 struct net_device *dev;
4063 unsigned int mtu;
4064 };
4065
4066 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4067 {
4068 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4069 struct inet6_dev *idev;
4070
4071 /* In IPv6 pmtu discovery is not optional,
4072 so the RTAX_MTU lock cannot disable it.
4073 We still use this lock to block changes
4074 caused by addrconf/ndisc.
4075 */
4076
4077 idev = __in6_dev_get(arg->dev);
4078 if (!idev)
4079 return 0;
4080
4081 /* For an administrative MTU increase, there is no way to discover
4082 an IPv6 PMTU increase, so the PMTU should be updated here.
4083 Since RFC 1981 doesn't cover administrative MTU increases,
4084 updating the PMTU after such an increase is a MUST (e.g. jumbo frames).
4085 */
4086 if (rt->fib6_nh.nh_dev == arg->dev &&
4087 !fib6_metric_locked(rt, RTAX_MTU)) {
4088 u32 mtu = rt->fib6_pmtu;
4089
4090 if (mtu >= arg->mtu ||
4091 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4092 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4093
4094 spin_lock_bh(&rt6_exception_lock);
4095 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4096 spin_unlock_bh(&rt6_exception_lock);
4097 }
4098 return 0;
4099 }
4100
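/* Walk every FIB entry and refresh MTU-derived state for @dev. This is
 * assumed to be driven from the netdev notifier on NETDEV_CHANGEMTU,
 * e.g. after (illustrative):
 *
 *	ip link set dev eth0 mtu 9000
 */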
4101 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4102 {
4103 struct rt6_mtu_change_arg arg = {
4104 .dev = dev,
4105 .mtu = mtu,
4106 };
4107
4108 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4109 }
4110
4111 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4112 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4113 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4114 [RTA_OIF] = { .type = NLA_U32 },
4115 [RTA_IIF] = { .type = NLA_U32 },
4116 [RTA_PRIORITY] = { .type = NLA_U32 },
4117 [RTA_METRICS] = { .type = NLA_NESTED },
4118 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4119 [RTA_PREF] = { .type = NLA_U8 },
4120 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4121 [RTA_ENCAP] = { .type = NLA_NESTED },
4122 [RTA_EXPIRES] = { .type = NLA_U32 },
4123 [RTA_UID] = { .type = NLA_U32 },
4124 [RTA_MARK] = { .type = NLA_U32 },
4125 [RTA_TABLE] = { .type = NLA_U32 },
4126 [RTA_IP_PROTO] = { .type = NLA_U8 },
4127 [RTA_SPORT] = { .type = NLA_U16 },
4128 [RTA_DPORT] = { .type = NLA_U16 },
4129 };
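/* This policy lets nlmsg_parse()/nlmsg_parse_strict() enforce type and
 * minimum length per attribute, so rtm_to_fib6_config() below can call
 * nla_get_*() on the listed attributes without re-checking sizes.
 */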
4130
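/* Sketch of the mapping performed below (illustrative, not exhaustive):
 *
 *	ip -6 route add 2001:db8::/64 via fe80::1 dev eth0
 *
 * arrives as RTM_NEWROUTE with rtm_dst_len = 64, RTA_DST holding the
 * prefix, RTA_GATEWAY setting fc_gateway plus RTF_GATEWAY, and RTA_OIF
 * supplying fc_ifindex.
 */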
4131 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4132 struct fib6_config *cfg,
4133 struct netlink_ext_ack *extack)
4134 {
4135 struct rtmsg *rtm;
4136 struct nlattr *tb[RTA_MAX+1];
4137 unsigned int pref;
4138 int err;
4139
4140 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4141 extack);
4142 if (err < 0)
4143 goto errout;
4144
4145 err = -EINVAL;
4146 rtm = nlmsg_data(nlh);
4147
4148 *cfg = (struct fib6_config){
4149 .fc_table = rtm->rtm_table,
4150 .fc_dst_len = rtm->rtm_dst_len,
4151 .fc_src_len = rtm->rtm_src_len,
4152 .fc_flags = RTF_UP,
4153 .fc_protocol = rtm->rtm_protocol,
4154 .fc_type = rtm->rtm_type,
4155
4156 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4157 .fc_nlinfo.nlh = nlh,
4158 .fc_nlinfo.nl_net = sock_net(skb->sk),
4159 };
4160
4161 if (rtm->rtm_type == RTN_UNREACHABLE ||
4162 rtm->rtm_type == RTN_BLACKHOLE ||
4163 rtm->rtm_type == RTN_PROHIBIT ||
4164 rtm->rtm_type == RTN_THROW)
4165 cfg->fc_flags |= RTF_REJECT;
4166
4167 if (rtm->rtm_type == RTN_LOCAL)
4168 cfg->fc_flags |= RTF_LOCAL;
4169
4170 if (rtm->rtm_flags & RTM_F_CLONED)
4171 cfg->fc_flags |= RTF_CACHE;
4172
4173 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4174
4175 if (tb[RTA_GATEWAY]) {
4176 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4177 cfg->fc_flags |= RTF_GATEWAY;
4178 }
4179 if (tb[RTA_VIA]) {
4180 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4181 goto errout;
4182 }
4183
4184 if (tb[RTA_DST]) {
4185 int plen = (rtm->rtm_dst_len + 7) >> 3;
4186
4187 if (nla_len(tb[RTA_DST]) < plen)
4188 goto errout;
4189
4190 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4191 }
4192
4193 if (tb[RTA_SRC]) {
4194 int plen = (rtm->rtm_src_len + 7) >> 3;
4195
4196 if (nla_len(tb[RTA_SRC]) < plen)
4197 goto errout;
4198
4199 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4200 }
4201
4202 if (tb[RTA_PREFSRC])
4203 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4204
4205 if (tb[RTA_OIF])
4206 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4207
4208 if (tb[RTA_PRIORITY])
4209 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4210
4211 if (tb[RTA_METRICS]) {
4212 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4213 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4214 }
4215
4216 if (tb[RTA_TABLE])
4217 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4218
4219 if (tb[RTA_MULTIPATH]) {
4220 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4221 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4222
4223 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4224 cfg->fc_mp_len, extack);
4225 if (err < 0)
4226 goto errout;
4227 }
4228
4229 if (tb[RTA_PREF]) {
4230 pref = nla_get_u8(tb[RTA_PREF]);
4231 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4232 pref != ICMPV6_ROUTER_PREF_HIGH)
4233 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4234 cfg->fc_flags |= RTF_PREF(pref);
4235 }
4236
4237 if (tb[RTA_ENCAP])
4238 cfg->fc_encap = tb[RTA_ENCAP];
4239
4240 if (tb[RTA_ENCAP_TYPE]) {
4241 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4242
4243 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4244 if (err < 0)
4245 goto errout;
4246 }
4247
4248 if (tb[RTA_EXPIRES]) {
4249 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4250
4251 if (addrconf_finite_timeout(timeout)) {
4252 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4253 cfg->fc_flags |= RTF_EXPIRES;
4254 }
4255 }
4256
4257 err = 0;
4258 errout:
4259 return err;
4260 }
4261
4262 struct rt6_nh {
4263 struct fib6_info *fib6_info;
4264 struct fib6_config r_cfg;
4265 struct list_head next;
4266 };
4267
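/* Queue one parsed nexthop on rt6_nh_list for ip6_route_multipath_add();
 * duplicates (as judged by rt6_duplicate_nexthop()) are rejected with
 * -EEXIST before anything is allocated.
 */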
4268 static int ip6_route_info_append(struct net *net,
4269 struct list_head *rt6_nh_list,
4270 struct fib6_info *rt,
4271 struct fib6_config *r_cfg)
4272 {
4273 struct rt6_nh *nh;
4274 int err = -EEXIST;
4275
4276 list_for_each_entry(nh, rt6_nh_list, next) {
4277 /* check if fib6_info already exists */
4278 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4279 return err;
4280 }
4281
4282 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4283 if (!nh)
4284 return -ENOMEM;
4285 nh->fib6_info = rt;
4286 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4287 list_add_tail(&nh->next, rt6_nh_list);
4288
4289 return 0;
4290 }
4291
4292 static void ip6_route_mpath_notify(struct fib6_info *rt,
4293 struct fib6_info *rt_last,
4294 struct nl_info *info,
4295 __u16 nlflags)
4296 {
4297 /* If this is an APPEND route, then rt points to the first route
4298 * inserted and rt_last points to the last route inserted. Userspace
4299 * wants a consistent dump of the route which starts at the first
4300 * nexthop. Since sibling routes are always added at the end of
4301 * the list, find the first sibling of the last route appended.
4302 */
4303 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4304 rt = list_first_entry(&rt_last->fib6_siblings,
4305 struct fib6_info,
4306 fib6_siblings);
4307 }
4308
4309 if (rt)
4310 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4311 }
4312
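/* A multipath add such as (illustrative):
 *
 *	ip -6 route add 2001:db8::/64 \
 *		nexthop via fe80::1 dev eth0 \
 *		nexthop via fe80::2 dev eth1
 *
 * carries a single RTA_MULTIPATH attribute containing a sequence of
 * struct rtnexthop entries, each optionally followed by nested
 * RTA_GATEWAY/RTA_ENCAP* attributes; the loop below builds one
 * fib6_info per entry.
 */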
4313 static int ip6_route_multipath_add(struct fib6_config *cfg,
4314 struct netlink_ext_ack *extack)
4315 {
4316 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4317 struct nl_info *info = &cfg->fc_nlinfo;
4318 struct fib6_config r_cfg;
4319 struct rtnexthop *rtnh;
4320 struct fib6_info *rt;
4321 struct rt6_nh *err_nh;
4322 struct rt6_nh *nh, *nh_safe;
4323 __u16 nlflags;
4324 int remaining;
4325 int attrlen;
4326 int err = 1;
4327 int nhn = 0;
4328 int replace = (cfg->fc_nlinfo.nlh &&
4329 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4330 LIST_HEAD(rt6_nh_list);
4331
4332 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4333 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4334 nlflags |= NLM_F_APPEND;
4335
4336 remaining = cfg->fc_mp_len;
4337 rtnh = (struct rtnexthop *)cfg->fc_mp;
4338
4339 /* Parse each multipath entry and build a list (rt6_nh_list) of
4340 * fib6_info structs, one per nexthop.
4341 */
4342 while (rtnh_ok(rtnh, remaining)) {
4343 memcpy(&r_cfg, cfg, sizeof(*cfg));
4344 if (rtnh->rtnh_ifindex)
4345 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4346
4347 attrlen = rtnh_attrlen(rtnh);
4348 if (attrlen > 0) {
4349 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4350
4351 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4352 if (nla) {
4353 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4354 r_cfg.fc_flags |= RTF_GATEWAY;
4355 }
4356 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4357 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4358 if (nla)
4359 r_cfg.fc_encap_type = nla_get_u16(nla);
4360 }
4361
4362 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4363 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4364 if (IS_ERR(rt)) {
4365 err = PTR_ERR(rt);
4366 rt = NULL;
4367 goto cleanup;
4368 }
4369 if (!rt6_qualify_for_ecmp(rt)) {
4370 err = -EINVAL;
4371 NL_SET_ERR_MSG(extack,
4372 "Device only routes can not be added for IPv6 using the multipath API.");
4373 fib6_info_release(rt);
4374 goto cleanup;
4375 }
4376
4377 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4378
4379 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4380 rt, &r_cfg);
4381 if (err) {
4382 fib6_info_release(rt);
4383 goto cleanup;
4384 }
4385
4386 rtnh = rtnh_next(rtnh, &remaining);
4387 }
4388
4389 /* For add and replace, send one notification with all nexthops.
4390 * Skip the notification in fib6_add_rt2node and send one with
4391 * the full route when done.
4392 */
4393 info->skip_notify = 1;
4394
4395 err_nh = NULL;
4396 list_for_each_entry(nh, &rt6_nh_list, next) {
4397 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4398 fib6_info_release(nh->fib6_info);
4399
4400 if (!err) {
4401 /* save reference to last route successfully inserted */
4402 rt_last = nh->fib6_info;
4403
4404 /* save reference to first route for notification */
4405 if (!rt_notif)
4406 rt_notif = nh->fib6_info;
4407 }
4408
4409 /* nh->fib6_info is used or freed at this point, reset to NULL */
4410 nh->fib6_info = NULL;
4411 if (err) {
4412 if (replace && nhn)
4413 NL_SET_ERR_MSG_MOD(extack,
4414 "multipath route replace failed (check consistency of installed routes)");
4415 err_nh = nh;
4416 goto add_errout;
4417 }
4418
4419 /* Because each route is added like a single route, we remove
4420 * these flags after the first nexthop: if there is a collision,
4421 * we have already failed to add the first nexthop, since
4422 * fib6_add_rt2node() has rejected it; when replacing, the old
4423 * nexthops have been replaced by the first new one, and the rest
4424 * should be appended to it.
4425 */
4426 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4427 NLM_F_REPLACE);
4428 nhn++;
4429 }
4430
4431 /* success ... tell user about new route */
4432 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4433 goto cleanup;
4434
4435 add_errout:
4436 /* send notification for routes that were added so that
4437 * the delete notifications sent by ip6_route_del are
4438 * coherent
4439 */
4440 if (rt_notif)
4441 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4442
4443 /* Delete routes that were already added */
4444 list_for_each_entry(nh, &rt6_nh_list, next) {
4445 if (err_nh == nh)
4446 break;
4447 ip6_route_del(&nh->r_cfg, extack);
4448 }
4449
4450 cleanup:
4451 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4452 if (nh->fib6_info)
4453 fib6_info_release(nh->fib6_info);
4454 list_del(&nh->next);
4455 kfree(nh);
4456 }
4457
4458 return err;
4459 }
4460
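/* Delete each nexthop of a multipath route independently: an error on
 * one entry does not stop the loop, and the last error seen (if any)
 * is what gets returned to userspace.
 */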
4461 static int ip6_route_multipath_del(struct fib6_config *cfg,
4462 struct netlink_ext_ack *extack)
4463 {
4464 struct fib6_config r_cfg;
4465 struct rtnexthop *rtnh;
4466 int remaining;
4467 int attrlen;
4468 int err = 1, last_err = 0;
4469
4470 remaining = cfg->fc_mp_len;
4471 rtnh = (struct rtnexthop *)cfg->fc_mp;
4472
4473 /* Parse each multipath entry */
4474 while (rtnh_ok(rtnh, remaining)) {
4475 memcpy(&r_cfg, cfg, sizeof(*cfg));
4476 if (rtnh->rtnh_ifindex)
4477 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4478
4479 attrlen = rtnh_attrlen(rtnh);
4480 if (attrlen > 0) {
4481 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4482
4483 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4484 if (nla) {
4485 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4486 r_cfg.fc_flags |= RTF_GATEWAY;
4487 }
4488 }
4489 err = ip6_route_del(&r_cfg, extack);
4490 if (err)
4491 last_err = err;
4492
4493 rtnh = rtnh_next(rtnh, &remaining);
4494 }
4495
4496 return last_err;
4497 }
4498
4499 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4500 struct netlink_ext_ack *extack)
4501 {
4502 struct fib6_config cfg;
4503 int err;
4504
4505 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4506 if (err < 0)
4507 return err;
4508
4509 if (cfg.fc_mp)
4510 return ip6_route_multipath_del(&cfg, extack);
4511 else {
4512 cfg.fc_delete_all_nh = 1;
4513 return ip6_route_del(&cfg, extack);
4514 }
4515 }
4516
4517 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4518 struct netlink_ext_ack *extack)
4519 {
4520 struct fib6_config cfg;
4521 int err;
4522
4523 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4524 if (err < 0)
4525 return err;
4526
4527 if (cfg.fc_mp)
4528 return ip6_route_multipath_add(&cfg, extack);
4529 else
4530 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4531 }
4532
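/* Worst-case payload estimate for one route message. It only needs to
 * be an upper bound: inet6_rt_notify() sizes its skb from this and
 * treats a later -EMSGSIZE from rt6_fill_node() as a bug (WARN_ON).
 */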
4533 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4534 {
4535 int nexthop_len = 0;
4536
4537 if (rt->fib6_nsiblings) {
4538 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4539 + NLA_ALIGN(sizeof(struct rtnexthop))
4540 + nla_total_size(16) /* RTA_GATEWAY */
4541 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4542
4543 nexthop_len *= rt->fib6_nsiblings;
4544 }
4545
4546 return NLMSG_ALIGN(sizeof(struct rtmsg))
4547 + nla_total_size(16) /* RTA_SRC */
4548 + nla_total_size(16) /* RTA_DST */
4549 + nla_total_size(16) /* RTA_GATEWAY */
4550 + nla_total_size(16) /* RTA_PREFSRC */
4551 + nla_total_size(4) /* RTA_TABLE */
4552 + nla_total_size(4) /* RTA_IIF */
4553 + nla_total_size(4) /* RTA_OIF */
4554 + nla_total_size(4) /* RTA_PRIORITY */
4555 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4556 + nla_total_size(sizeof(struct rta_cacheinfo))
4557 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4558 + nla_total_size(1) /* RTA_PREF */
4559 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4560 + nexthop_len;
4561 }
4562
4563 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4564 unsigned int *flags, bool skip_oif)
4565 {
4566 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4567 *flags |= RTNH_F_DEAD;
4568
4569 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4570 *flags |= RTNH_F_LINKDOWN;
4571
4572 rcu_read_lock();
4573 if (fib6_ignore_linkdown(rt))
4574 *flags |= RTNH_F_DEAD;
4575 rcu_read_unlock();
4576 }
4577
4578 if (rt->fib6_flags & RTF_GATEWAY) {
4579 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4580 goto nla_put_failure;
4581 }
4582
4583 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4584 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4585 *flags |= RTNH_F_OFFLOAD;
4586
4587 /* not needed for multipath encoding because it has a rtnexthop struct */
4588 if (!skip_oif && rt->fib6_nh.nh_dev &&
4589 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4590 goto nla_put_failure;
4591
4592 if (rt->fib6_nh.nh_lwtstate &&
4593 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4594 goto nla_put_failure;
4595
4596 return 0;
4597
4598 nla_put_failure:
4599 return -EMSGSIZE;
4600 }
4601
4602 /* add multipath next hop */
4603 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4604 {
4605 const struct net_device *dev = rt->fib6_nh.nh_dev;
4606 struct rtnexthop *rtnh;
4607 unsigned int flags = 0;
4608
4609 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4610 if (!rtnh)
4611 goto nla_put_failure;
4612
4613 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4614 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4615
4616 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4617 goto nla_put_failure;
4618
4619 rtnh->rtnh_flags = flags;
4620
4621 /* length of rtnetlink header + attributes */
4622 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4623
4624 return 0;
4625
4626 nla_put_failure:
4627 return -EMSGSIZE;
4628 }
4629
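/* Single encoder for route messages: serves RTM_NEWROUTE dumps
 * (rt6_dump_route), add/delete notifications (inet6_rt_notify) and
 * RTM_GETROUTE replies (inet6_rtm_getroute). @dst/@rt6 are only
 * non-NULL when a resolved dst clone, rather than a bare FIB entry,
 * is being reported.
 */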
4630 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4631 struct fib6_info *rt, struct dst_entry *dst,
4632 struct in6_addr *dest, struct in6_addr *src,
4633 int iif, int type, u32 portid, u32 seq,
4634 unsigned int flags)
4635 {
4636 struct rt6_info *rt6 = (struct rt6_info *)dst;
4637 struct rt6key *rt6_dst, *rt6_src;
4638 u32 *pmetrics, table, rt6_flags;
4639 struct nlmsghdr *nlh;
4640 struct rtmsg *rtm;
4641 long expires = 0;
4642
4643 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4644 if (!nlh)
4645 return -EMSGSIZE;
4646
4647 if (rt6) {
4648 rt6_dst = &rt6->rt6i_dst;
4649 rt6_src = &rt6->rt6i_src;
4650 rt6_flags = rt6->rt6i_flags;
4651 } else {
4652 rt6_dst = &rt->fib6_dst;
4653 rt6_src = &rt->fib6_src;
4654 rt6_flags = rt->fib6_flags;
4655 }
4656
4657 rtm = nlmsg_data(nlh);
4658 rtm->rtm_family = AF_INET6;
4659 rtm->rtm_dst_len = rt6_dst->plen;
4660 rtm->rtm_src_len = rt6_src->plen;
4661 rtm->rtm_tos = 0;
4662 if (rt->fib6_table)
4663 table = rt->fib6_table->tb6_id;
4664 else
4665 table = RT6_TABLE_UNSPEC;
4666 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4667 if (nla_put_u32(skb, RTA_TABLE, table))
4668 goto nla_put_failure;
4669
4670 rtm->rtm_type = rt->fib6_type;
4671 rtm->rtm_flags = 0;
4672 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4673 rtm->rtm_protocol = rt->fib6_protocol;
4674
4675 if (rt6_flags & RTF_CACHE)
4676 rtm->rtm_flags |= RTM_F_CLONED;
4677
4678 if (dest) {
4679 if (nla_put_in6_addr(skb, RTA_DST, dest))
4680 goto nla_put_failure;
4681 rtm->rtm_dst_len = 128;
4682 } else if (rtm->rtm_dst_len)
4683 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4684 goto nla_put_failure;
4685 #ifdef CONFIG_IPV6_SUBTREES
4686 if (src) {
4687 if (nla_put_in6_addr(skb, RTA_SRC, src))
4688 goto nla_put_failure;
4689 rtm->rtm_src_len = 128;
4690 } else if (rtm->rtm_src_len &&
4691 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4692 goto nla_put_failure;
4693 #endif
4694 if (iif) {
4695 #ifdef CONFIG_IPV6_MROUTE
4696 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4697 int err = ip6mr_get_route(net, skb, rtm, portid);
4698
4699 if (err == 0)
4700 return 0;
4701 if (err < 0)
4702 goto nla_put_failure;
4703 } else
4704 #endif
4705 if (nla_put_u32(skb, RTA_IIF, iif))
4706 goto nla_put_failure;
4707 } else if (dest) {
4708 struct in6_addr saddr_buf;
4709 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4710 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4711 goto nla_put_failure;
4712 }
4713
4714 if (rt->fib6_prefsrc.plen) {
4715 struct in6_addr saddr_buf;
4716 saddr_buf = rt->fib6_prefsrc.addr;
4717 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4718 goto nla_put_failure;
4719 }
4720
4721 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4722 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4723 goto nla_put_failure;
4724
4725 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4726 goto nla_put_failure;
4727
4728 /* For multipath routes, walk the siblings list and add
4729 * each as a nexthop within RTA_MULTIPATH.
4730 */
4731 if (rt6) {
4732 if (rt6_flags & RTF_GATEWAY &&
4733 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4734 goto nla_put_failure;
4735
4736 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4737 goto nla_put_failure;
4738 } else if (rt->fib6_nsiblings) {
4739 struct fib6_info *sibling, *next_sibling;
4740 struct nlattr *mp;
4741
4742 mp = nla_nest_start(skb, RTA_MULTIPATH);
4743 if (!mp)
4744 goto nla_put_failure;
4745
4746 if (rt6_add_nexthop(skb, rt) < 0)
4747 goto nla_put_failure;
4748
4749 list_for_each_entry_safe(sibling, next_sibling,
4750 &rt->fib6_siblings, fib6_siblings) {
4751 if (rt6_add_nexthop(skb, sibling) < 0)
4752 goto nla_put_failure;
4753 }
4754
4755 nla_nest_end(skb, mp);
4756 } else {
4757 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4758 goto nla_put_failure;
4759 }
4760
4761 if (rt6_flags & RTF_EXPIRES) {
4762 expires = dst ? dst->expires : rt->expires;
4763 expires -= jiffies;
4764 }
4765
4766 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4767 goto nla_put_failure;
4768
4769 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4770 goto nla_put_failure;
4771
4772
4773 nlmsg_end(skb, nlh);
4774 return 0;
4775
4776 nla_put_failure:
4777 nlmsg_cancel(skb, nlh);
4778 return -EMSGSIZE;
4779 }
4780
4781 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4782 const struct net_device *dev)
4783 {
4784 if (f6i->fib6_nh.nh_dev == dev)
4785 return true;
4786
4787 if (f6i->fib6_nsiblings) {
4788 struct fib6_info *sibling, *next_sibling;
4789
4790 list_for_each_entry_safe(sibling, next_sibling,
4791 &f6i->fib6_siblings, fib6_siblings) {
4792 if (sibling->fib6_nh.nh_dev == dev)
4793 return true;
4794 }
4795 }
4796
4797 return false;
4798 }
4799
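/* Per-route callback for RTM_GETROUTE dumps. A positive return skips
 * the route but keeps walking; a negative one (e.g. -EMSGSIZE from
 * rt6_fill_node()) tells the walker the skb is full, so the dump is
 * suspended and resumed later (see fib6_dump_node()).
 */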
4800 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4801 {
4802 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4803 struct fib_dump_filter *filter = &arg->filter;
4804 unsigned int flags = NLM_F_MULTI;
4805 struct net *net = arg->net;
4806
4807 if (rt == net->ipv6.fib6_null_entry)
4808 return 0;
4809
4810 if ((filter->flags & RTM_F_PREFIX) &&
4811 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4812 /* success since this is not a prefix route */
4813 return 1;
4814 }
4815 if (filter->filter_set) {
4816 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4817 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4818 (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4819 return 1;
4820 }
4821 flags |= NLM_F_DUMP_FILTERED;
4822 }
4823
4824 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4825 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4826 arg->cb->nlh->nlmsg_seq, flags);
4827 }
4828
4829 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
4830 const struct nlmsghdr *nlh,
4831 struct nlattr **tb,
4832 struct netlink_ext_ack *extack)
4833 {
4834 struct rtmsg *rtm;
4835 int i, err;
4836
4837 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
4838 NL_SET_ERR_MSG_MOD(extack,
4839 "Invalid header for get route request");
4840 return -EINVAL;
4841 }
4842
4843 if (!netlink_strict_get_check(skb))
4844 return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
4845 rtm_ipv6_policy, extack);
4846
4847 rtm = nlmsg_data(nlh);
4848 if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
4849 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
4850 rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
4851 rtm->rtm_type) {
4852 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
4853 return -EINVAL;
4854 }
4855 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
4856 NL_SET_ERR_MSG_MOD(extack,
4857 "Invalid flags for get route request");
4858 return -EINVAL;
4859 }
4860
4861 err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
4862 rtm_ipv6_policy, extack);
4863 if (err)
4864 return err;
4865
4866 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
4867 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
4868 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
4869 return -EINVAL;
4870 }
4871
4872 for (i = 0; i <= RTA_MAX; i++) {
4873 if (!tb[i])
4874 continue;
4875
4876 switch (i) {
4877 case RTA_SRC:
4878 case RTA_DST:
4879 case RTA_IIF:
4880 case RTA_OIF:
4881 case RTA_MARK:
4882 case RTA_UID:
4883 case RTA_SPORT:
4884 case RTA_DPORT:
4885 case RTA_IP_PROTO:
4886 break;
4887 default:
4888 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
4889 return -EINVAL;
4890 }
4891 }
4892
4893 return 0;
4894 }
4895
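/* RTM_GETROUTE handler, i.e. what (illustrative)
 *
 *	ip -6 route get 2001:db8::1
 *
 * ends up in. With RTM_F_FIB_MATCH set, the matching FIB entry is
 * reported instead of the resolved dst.
 */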
4896 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4897 struct netlink_ext_ack *extack)
4898 {
4899 struct net *net = sock_net(in_skb->sk);
4900 struct nlattr *tb[RTA_MAX+1];
4901 int err, iif = 0, oif = 0;
4902 struct fib6_info *from;
4903 struct dst_entry *dst;
4904 struct rt6_info *rt;
4905 struct sk_buff *skb;
4906 struct rtmsg *rtm;
4907 struct flowi6 fl6 = {};
4908 bool fibmatch;
4909
4910 err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4911 if (err < 0)
4912 goto errout;
4913
4914 err = -EINVAL;
4915 rtm = nlmsg_data(nlh);
4916 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4917 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4918
4919 if (tb[RTA_SRC]) {
4920 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4921 goto errout;
4922
4923 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4924 }
4925
4926 if (tb[RTA_DST]) {
4927 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4928 goto errout;
4929
4930 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4931 }
4932
4933 if (tb[RTA_IIF])
4934 iif = nla_get_u32(tb[RTA_IIF]);
4935
4936 if (tb[RTA_OIF])
4937 oif = nla_get_u32(tb[RTA_OIF]);
4938
4939 if (tb[RTA_MARK])
4940 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4941
4942 if (tb[RTA_UID])
4943 fl6.flowi6_uid = make_kuid(current_user_ns(),
4944 nla_get_u32(tb[RTA_UID]));
4945 else
4946 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4947
4948 if (tb[RTA_SPORT])
4949 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4950
4951 if (tb[RTA_DPORT])
4952 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4953
4954 if (tb[RTA_IP_PROTO]) {
4955 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4956 &fl6.flowi6_proto, AF_INET6,
4957 extack);
4958 if (err)
4959 goto errout;
4960 }
4961
4962 if (iif) {
4963 struct net_device *dev;
4964 int flags = 0;
4965
4966 rcu_read_lock();
4967
4968 dev = dev_get_by_index_rcu(net, iif);
4969 if (!dev) {
4970 rcu_read_unlock();
4971 err = -ENODEV;
4972 goto errout;
4973 }
4974
4975 fl6.flowi6_iif = iif;
4976
4977 if (!ipv6_addr_any(&fl6.saddr))
4978 flags |= RT6_LOOKUP_F_HAS_SADDR;
4979
4980 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4981
4982 rcu_read_unlock();
4983 } else {
4984 fl6.flowi6_oif = oif;
4985
4986 dst = ip6_route_output(net, NULL, &fl6);
4987 }
4988
4989
4990 rt = container_of(dst, struct rt6_info, dst);
4991 if (rt->dst.error) {
4992 err = rt->dst.error;
4993 ip6_rt_put(rt);
4994 goto errout;
4995 }
4996
4997 if (rt == net->ipv6.ip6_null_entry) {
4998 err = rt->dst.error;
4999 ip6_rt_put(rt);
5000 goto errout;
5001 }
5002
5003 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
5004 if (!skb) {
5005 ip6_rt_put(rt);
5006 err = -ENOBUFS;
5007 goto errout;
5008 }
5009
5010 skb_dst_set(skb, &rt->dst);
5011
5012 rcu_read_lock();
5013 from = rcu_dereference(rt->from);
5014
5015 if (fibmatch)
5016 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
5017 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
5018 nlh->nlmsg_seq, 0);
5019 else
5020 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5021 &fl6.saddr, iif, RTM_NEWROUTE,
5022 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
5023 0);
5024 rcu_read_unlock();
5025
5026 if (err < 0) {
5027 kfree_skb(skb);
5028 goto errout;
5029 }
5030
5031 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5032 errout:
5033 return err;
5034 }
5035
5036 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5037 unsigned int nlm_flags)
5038 {
5039 struct sk_buff *skb;
5040 struct net *net = info->nl_net;
5041 u32 seq;
5042 int err;
5043
5044 err = -ENOBUFS;
5045 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5046
5047 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5048 if (!skb)
5049 goto errout;
5050
5051 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5052 event, info->portid, seq, nlm_flags);
5053 if (err < 0) {
5054 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5055 WARN_ON(err == -EMSGSIZE);
5056 kfree_skb(skb);
5057 goto errout;
5058 }
5059 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5060 info->nlh, gfp_any());
5061 return;
5062 errout:
5063 if (err < 0)
5064 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5065 }
5066
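/* The null/prohibit/blackhole template entries need a real device:
 * bind them to the loopback device when it registers in a netns, and
 * drop the idev references again when it unregisters.
 */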
5067 static int ip6_route_dev_notify(struct notifier_block *this,
5068 unsigned long event, void *ptr)
5069 {
5070 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5071 struct net *net = dev_net(dev);
5072
5073 if (!(dev->flags & IFF_LOOPBACK))
5074 return NOTIFY_OK;
5075
5076 if (event == NETDEV_REGISTER) {
5077 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5078 net->ipv6.ip6_null_entry->dst.dev = dev;
5079 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5080 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5081 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5082 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5083 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5084 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5085 #endif
5086 } else if (event == NETDEV_UNREGISTER &&
5087 dev->reg_state != NETREG_UNREGISTERED) {
5088 /* NETDEV_UNREGISTER can be fired multiple times by
5089 * netdev_wait_allrefs(). Make sure we only do this once.
5090 */
5091 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5092 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5093 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5094 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5095 #endif
5096 }
5097
5098 return NOTIFY_OK;
5099 }
5100
5101 /*
5102 * /proc
5103 */
5104
5105 #ifdef CONFIG_PROC_FS
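/* Emits /proc/net/rt6_stats as seven hex fields, e.g. (made-up values):
 *
 *	0012 000a 0000 0004 0001 0003 0000
 *
 * in the order printed below: fib nodes, route nodes, rt allocs,
 * rt entries, cached rt entries, dst ops entries, discarded routes.
 */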
5106 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5107 {
5108 struct net *net = (struct net *)seq->private;
5109 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5110 net->ipv6.rt6_stats->fib_nodes,
5111 net->ipv6.rt6_stats->fib_route_nodes,
5112 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5113 net->ipv6.rt6_stats->fib_rt_entries,
5114 net->ipv6.rt6_stats->fib_rt_cache,
5115 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5116 net->ipv6.rt6_stats->fib_discarded_routes);
5117
5118 return 0;
5119 }
5120 #endif /* CONFIG_PROC_FS */
5121
5122 #ifdef CONFIG_SYSCTL
5123
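/* Write-only handler behind /proc/sys/net/ipv6/route/flush. Typical
 * usage (illustrative):
 *
 *	echo 0 > /proc/sys/net/ipv6/route/flush
 *
 * which kicks fib6_run_gc(); note that the delay used is the
 * flush_delay value stored before this write is parsed.
 */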
5124 static
5125 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5126 void __user *buffer, size_t *lenp, loff_t *ppos)
5127 {
5128 struct net *net;
5129 int delay;
5130 int ret;
5131 if (!write)
5132 return -EINVAL;
5133
5134 net = (struct net *)ctl->extra1;
5135 delay = net->ipv6.sysctl.flush_delay;
5136 ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5137 if (ret)
5138 return ret;
5139
5140 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5141 return 0;
5142 }
5143
5144 static int zero;
5145 static int one = 1;
5146
5147 static struct ctl_table ipv6_route_table_template[] = {
5148 {
5149 .procname = "flush",
5150 .data = &init_net.ipv6.sysctl.flush_delay,
5151 .maxlen = sizeof(int),
5152 .mode = 0200,
5153 .proc_handler = ipv6_sysctl_rtcache_flush
5154 },
5155 {
5156 .procname = "gc_thresh",
5157 .data = &ip6_dst_ops_template.gc_thresh,
5158 .maxlen = sizeof(int),
5159 .mode = 0644,
5160 .proc_handler = proc_dointvec,
5161 },
5162 {
5163 .procname = "max_size",
5164 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
5165 .maxlen = sizeof(int),
5166 .mode = 0644,
5167 .proc_handler = proc_dointvec,
5168 },
5169 {
5170 .procname = "gc_min_interval",
5171 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5172 .maxlen = sizeof(int),
5173 .mode = 0644,
5174 .proc_handler = proc_dointvec_jiffies,
5175 },
5176 {
5177 .procname = "gc_timeout",
5178 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5179 .maxlen = sizeof(int),
5180 .mode = 0644,
5181 .proc_handler = proc_dointvec_jiffies,
5182 },
5183 {
5184 .procname = "gc_interval",
5185 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5186 .maxlen = sizeof(int),
5187 .mode = 0644,
5188 .proc_handler = proc_dointvec_jiffies,
5189 },
5190 {
5191 .procname = "gc_elasticity",
5192 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5193 .maxlen = sizeof(int),
5194 .mode = 0644,
5195 .proc_handler = proc_dointvec,
5196 },
5197 {
5198 .procname = "mtu_expires",
5199 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5200 .maxlen = sizeof(int),
5201 .mode = 0644,
5202 .proc_handler = proc_dointvec_jiffies,
5203 },
5204 {
5205 .procname = "min_adv_mss",
5206 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5207 .maxlen = sizeof(int),
5208 .mode = 0644,
5209 .proc_handler = proc_dointvec,
5210 },
5211 {
5212 .procname = "gc_min_interval_ms",
5213 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5214 .maxlen = sizeof(int),
5215 .mode = 0644,
5216 .proc_handler = proc_dointvec_ms_jiffies,
5217 },
5218 {
5219 .procname = "skip_notify_on_dev_down",
5220 .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down,
5221 .maxlen = sizeof(int),
5222 .mode = 0644,
5223 .proc_handler = proc_dointvec,
5224 .extra1 = &zero,
5225 .extra2 = &one,
5226 },
5227 { }
5228 };
5229
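/* Clone the template for each netns and repoint .data at the per-netns
 * fields. The table[0..10] indices below are positional and must stay
 * in sync with the ordering of ipv6_route_table_template above.
 */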
5230 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5231 {
5232 struct ctl_table *table;
5233
5234 table = kmemdup(ipv6_route_table_template,
5235 sizeof(ipv6_route_table_template),
5236 GFP_KERNEL);
5237
5238 if (table) {
5239 table[0].data = &net->ipv6.sysctl.flush_delay;
5240 table[0].extra1 = net;
5241 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5242 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5243 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5244 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5245 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5246 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5247 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5248 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5249 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5250 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5251
5252 /* Don't export sysctls to unprivileged users */
5253 if (net->user_ns != &init_user_ns)
5254 table[0].procname = NULL;
5255 }
5256
5257 return table;
5258 }
5259 #endif
5260
5261 static int __net_init ip6_route_net_init(struct net *net)
5262 {
5263 int ret = -ENOMEM;
5264
5265 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5266 sizeof(net->ipv6.ip6_dst_ops));
5267
5268 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5269 goto out_ip6_dst_ops;
5270
5271 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5272 sizeof(*net->ipv6.fib6_null_entry),
5273 GFP_KERNEL);
5274 if (!net->ipv6.fib6_null_entry)
5275 goto out_ip6_dst_entries;
5276
5277 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5278 sizeof(*net->ipv6.ip6_null_entry),
5279 GFP_KERNEL);
5280 if (!net->ipv6.ip6_null_entry)
5281 goto out_fib6_null_entry;
5282 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5283 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5284 ip6_template_metrics, true);
5285
5286 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5287 net->ipv6.fib6_has_custom_rules = false;
5288 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5289 sizeof(*net->ipv6.ip6_prohibit_entry),
5290 GFP_KERNEL);
5291 if (!net->ipv6.ip6_prohibit_entry)
5292 goto out_ip6_null_entry;
5293 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5294 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5295 ip6_template_metrics, true);
5296
5297 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5298 sizeof(*net->ipv6.ip6_blk_hole_entry),
5299 GFP_KERNEL);
5300 if (!net->ipv6.ip6_blk_hole_entry)
5301 goto out_ip6_prohibit_entry;
5302 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5303 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5304 ip6_template_metrics, true);
5305 #endif
5306
5307 net->ipv6.sysctl.flush_delay = 0;
5308 net->ipv6.sysctl.ip6_rt_max_size = 4096;
5309 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5310 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5311 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5312 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5313 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5314 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5315 net->ipv6.sysctl.skip_notify_on_dev_down = 0;
5316
5317 net->ipv6.ip6_rt_gc_expire = 30*HZ;
5318
5319 ret = 0;
5320 out:
5321 return ret;
5322
5323 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5324 out_ip6_prohibit_entry:
5325 kfree(net->ipv6.ip6_prohibit_entry);
5326 out_ip6_null_entry:
5327 kfree(net->ipv6.ip6_null_entry);
5328 #endif
5329 out_fib6_null_entry:
5330 kfree(net->ipv6.fib6_null_entry);
5331 out_ip6_dst_entries:
5332 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5333 out_ip6_dst_ops:
5334 goto out;
5335 }
5336
5337 static void __net_exit ip6_route_net_exit(struct net *net)
5338 {
5339 kfree(net->ipv6.fib6_null_entry);
5340 kfree(net->ipv6.ip6_null_entry);
5341 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5342 kfree(net->ipv6.ip6_prohibit_entry);
5343 kfree(net->ipv6.ip6_blk_hole_entry);
5344 #endif
5345 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5346 }
5347
5348 static int __net_init ip6_route_net_init_late(struct net *net)
5349 {
5350 #ifdef CONFIG_PROC_FS
5351 proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5352 sizeof(struct ipv6_route_iter));
5353 proc_create_net_single("rt6_stats", 0444, net->proc_net,
5354 rt6_stats_seq_show, NULL);
5355 #endif
5356 return 0;
5357 }
5358
5359 static void __net_exit ip6_route_net_exit_late(struct net *net)
5360 {
5361 #ifdef CONFIG_PROC_FS
5362 remove_proc_entry("ipv6_route", net->proc_net);
5363 remove_proc_entry("rt6_stats", net->proc_net);
5364 #endif
5365 }
5366
5367 static struct pernet_operations ip6_route_net_ops = {
5368 .init = ip6_route_net_init,
5369 .exit = ip6_route_net_exit,
5370 };
5371
5372 static int __net_init ipv6_inetpeer_init(struct net *net)
5373 {
5374 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5375
5376 if (!bp)
5377 return -ENOMEM;
5378 inet_peer_base_init(bp);
5379 net->ipv6.peers = bp;
5380 return 0;
5381 }
5382
5383 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5384 {
5385 struct inet_peer_base *bp = net->ipv6.peers;
5386
5387 net->ipv6.peers = NULL;
5388 inetpeer_invalidate_tree(bp);
5389 kfree(bp);
5390 }
5391
5392 static struct pernet_operations ipv6_inetpeer_ops = {
5393 .init = ipv6_inetpeer_init,
5394 .exit = ipv6_inetpeer_exit,
5395 };
5396
5397 static struct pernet_operations ip6_route_net_late_ops = {
5398 .init = ip6_route_net_init_late,
5399 .exit = ip6_route_net_exit_late,
5400 };
5401
5402 static struct notifier_block ip6_route_dev_notifier = {
5403 .notifier_call = ip6_route_dev_notify,
5404 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5405 };
5406
5407 void __init ip6_route_init_special_entries(void)
5408 {
5409 /* Registration of the loopback device happens before this code runs,
5410 * so the loopback reference in rt6_info is not taken there; take it
5411 * manually for init_net. */
5412 init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5413 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5414 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5415 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5416 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5417 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5418 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5419 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5420 #endif
5421 }
5422
5423 int __init ip6_route_init(void)
5424 {
5425 int ret;
5426 int cpu;
5427
5428 ret = -ENOMEM;
5429 ip6_dst_ops_template.kmem_cachep =
5430 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5431 SLAB_HWCACHE_ALIGN, NULL);
5432 if (!ip6_dst_ops_template.kmem_cachep)
5433 goto out;
5434
5435 ret = dst_entries_init(&ip6_dst_blackhole_ops);
5436 if (ret)
5437 goto out_kmem_cache;
5438
5439 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5440 if (ret)
5441 goto out_dst_entries;
5442
5443 ret = register_pernet_subsys(&ip6_route_net_ops);
5444 if (ret)
5445 goto out_register_inetpeer;
5446
5447 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5448
5449 ret = fib6_init();
5450 if (ret)
5451 goto out_register_subsys;
5452
5453 ret = xfrm6_init();
5454 if (ret)
5455 goto out_fib6_init;
5456
5457 ret = fib6_rules_init();
5458 if (ret)
5459 goto xfrm6_init;
5460
5461 ret = register_pernet_subsys(&ip6_route_net_late_ops);
5462 if (ret)
5463 goto fib6_rules_init;
5464
5465 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5466 inet6_rtm_newroute, NULL, 0);
5467 if (ret < 0)
5468 goto out_register_late_subsys;
5469
5470 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5471 inet6_rtm_delroute, NULL, 0);
5472 if (ret < 0)
5473 goto out_register_late_subsys;
5474
5475 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5476 inet6_rtm_getroute, NULL,
5477 RTNL_FLAG_DOIT_UNLOCKED);
5478 if (ret < 0)
5479 goto out_register_late_subsys;
5480
5481 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5482 if (ret)
5483 goto out_register_late_subsys;
5484
5485 for_each_possible_cpu(cpu) {
5486 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5487
5488 INIT_LIST_HEAD(&ul->head);
5489 spin_lock_init(&ul->lock);
5490 }
5491
5492 out:
5493 return ret;
5494
5495 out_register_late_subsys:
5496 rtnl_unregister_all(PF_INET6);
5497 unregister_pernet_subsys(&ip6_route_net_late_ops);
5498 fib6_rules_init:
5499 fib6_rules_cleanup();
5500 xfrm6_init:
5501 xfrm6_fini();
5502 out_fib6_init:
5503 fib6_gc_cleanup();
5504 out_register_subsys:
5505 unregister_pernet_subsys(&ip6_route_net_ops);
5506 out_register_inetpeer:
5507 unregister_pernet_subsys(&ipv6_inetpeer_ops);
5508 out_dst_entries:
5509 dst_entries_destroy(&ip6_dst_blackhole_ops);
5510 out_kmem_cache:
5511 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5512 goto out;
5513 }
5514
5515 void ip6_route_cleanup(void)
5516 {
5517 unregister_netdevice_notifier(&ip6_route_dev_notifier);
5518 unregister_pernet_subsys(&ip6_route_net_late_ops);
5519 fib6_rules_cleanup();
5520 xfrm6_fini();
5521 fib6_gc_cleanup();
5522 unregister_pernet_subsys(&ipv6_inetpeer_ops);
5523 unregister_pernet_subsys(&ip6_route_net_ops);
5524 dst_entries_destroy(&ip6_dst_blackhole_ops);
5525 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5526 }