/*
 * Linux INET6 implementation
 * FIB front-end.
 *
 * Authors:
 * Pedro Roque <roque@di.fc.ul.pt>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

/* Changes:
 *
 * YOSHIFUJI Hideaki @USAGI
 * reworked default router selection.
 * - respect outgoing interface
 * - select from (probably) reachable routers (i.e.
 *   routers in REACHABLE, STALE, DELAY or PROBE states).
 * - always select the same router if it is (probably)
 *   reachable. otherwise, round-robin the list.
 * Ville Nuorvala
 * Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <trace/events/fib6.h>

#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};
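
/* These values rank a next hop's usability: rt6_check_neigh() maps the
 * neighbour (NUD) state onto this scale. find_match() treats
 * RT6_NUD_FAIL_DO_RR as a request to round-robin to a sibling route,
 * RT6_NUD_FAIL_HARD as a route that must not be used at all, while
 * RT6_NUD_FAIL_PROBE scores below every valid candidate.
 */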

static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static void rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct rt6_info *rt);
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t lock;
	struct list_head head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
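
/* Cached routes that are not owned by the fib6 tree live only on this
 * per-cpu list, so it is the one place they can be found when their device
 * goes away: each dst still pointing at the dying device is re-pointed at
 * the loopback device, letting the dst outlive the unregistering netdev
 * without holding its reference.
 */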

static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(&rt->from->dst);
}

static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (rt->rt6i_flags & RTF_PCPU)
		return rt6_pcpu_cow_metrics(rt);
	else if (rt->rt6i_flags & RTF_CACHE)
		return NULL;
	else
		return dst_cow_metrics_generic(dst, old);
}
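
/* Metrics copy-on-write policy by route type: per-cpu copies write through
 * to their parent route's metrics so all copies stay coherent, RTF_CACHE
 * clones refuse the write (a clone's PMTU is kept in rt6i_pmtu instead),
 * and plain routes fall back to the generic COW helper.
 */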

static inline const void *choose_neigh_daddr(struct rt6_info *rt,
					     struct sk_buff *skb,
					     const void *daddr)
{
	struct in6_addr *p = &rt->rt6i_gateway;

	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct neighbour *n;

	daddr = choose_neigh_daddr(rt, skb, daddr);
	n = __ipv6_neigh_lookup(dst->dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dst->dev);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= ipv6_cow_metrics,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}

struct rt6_info *ip6_dst_alloc(struct net *net,
			       struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);

	if (rt) {
		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
		if (!rt->rt6i_pcpu) {
			dst_release_immediate(&rt->dst);
			return NULL;
		}
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
	if (bucket) {
		rt->rt6i_exception_bucket = NULL;
		kfree(bucket);
	}

	rt->from = NULL;
	dst_release(&from->dst);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
		       rt6_check_expired(rt->from);
	}
	return false;
}

static struct rt6_info *rt6_multipath_select(const struct net *net,
					     struct rt6_info *match,
					     struct flowi6 *fl6, int oif,
					     const struct sk_buff *skb,
					     int strict)
{
	struct rt6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In
	 * such a case it will always be non-zero. Otherwise now is the
	 * time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
				 rt6i_siblings) {
		if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
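
/* Sibling selection is hash-threshold next-hop choice (cf. RFC 2992): each
 * sibling owns a slice of the 31-bit hash space bounded by
 * rt6i_nh_upper_bound, sized in proportion to its weight. For example, two
 * equal-weight next hops split the range in half, so every packet of a
 * flow hashes into the same slice and stays on one path.
 */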

/*
 * Route lookup. rcu_read_lock() should be held.
 */

static inline struct rt6_info *rt6_device_match(struct net *net,
						struct rt6_info *rt,
						const struct in6_addr *saddr,
						int oif,
						int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
		struct net_device *dev = sprt->dst.dev;

		if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}

	return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate for now; however,
	 * we need to check whether it really is so - aka Router
	 * Reachability Probing.
	 *
	 * Router Reachability Probes MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
{
	struct net_device *dev = rt->dst.dev;
	if (!oif || dev->ifindex == oif)
		return 2;
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
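
/* The score packs two signals into one non-negative integer: bits 0-1 come
 * from rt6_check_dev() (2 = device matches oif, 1 = loopback bound to oif,
 * 0 = no match) and, under CONFIG_IPV6_ROUTER_PREF, bits 2-3 carry the
 * decoded router preference from the RA, so a high-preference router on a
 * matching interface beats a medium-preference one. Negative returns are
 * the rt6_nud_state failure codes and bypass this encoding.
 */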

static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;

	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
		goto out;

	if (idev->cnf.ignore_routes_with_linkdown &&
	    rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *leaf,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct rt6_info *leaf = rcu_dereference(fn->leaf);
	struct rt6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.ip6_null_entry)
		return net->ipv6.ip6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->rt6i_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->rt6i_src.plen)
		key_plen = rt0->rt6i_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.ip6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rcu_dereference(rt0->rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->rt6i_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.ip6_null_entry;
}
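
/* Round-robin in action: when the best candidate asked for it
 * (RT6_NUD_FAIL_DO_RR), fn->rr_ptr is advanced to the next route of equal
 * metric, so successive look-ups rotate over equal-cost routes instead of
 * retrying one possibly dead router every time.
 */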

static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* ipv6_addr_prefix() copies only prefix_len bits, which
		 * the length checks above guarantee are present, so this
		 * is safe.
		 */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
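
/* fib6_backtrack() recovers from a dead-end look-up by walking back towards
 * the root: at each step it either descends into the parent's source-address
 * subtree (the CONFIG_IPV6_SUBTREES case) or moves to the parent itself, and
 * stops at the first node that actually carries routes (RTN_RTINFO) or at
 * the tree root, where NULL ends the search.
 */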

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = rcu_dereference(fn->leaf);
	if (!rt) {
		rt = net->ipv6.ip6_null_entry;
	} else {
		rt = rt6_device_match(net, rt, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
			rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif,
						  skb, flags);
	}
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	if (ip6_hold_safe(net, &rt, true))
		dst_use_noref(&rt->dst, jiffies);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * The caller must hold a dst reference before calling this.
 */

static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
			struct mx6_config *mxc,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct rt6_info *rt)
{
	struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	/* Hold dst to account for the reference from the fib6 tree */
	dst_hold(&rt->dst);
	return __ip6_ins_rt(rt, &info, &mxc, NULL);
}

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
{
	struct net_device *dev = rt->dst.dev;

	if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device itself if it is a master device, the master
		 * device if the device is enslaved, and the loopback
		 * as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->rt6i_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 * Clone the route.
	 */

	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = ort->from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
		rt6_dst_from_metrics_check(pcpu_rt);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	rt6_dst_from_metrics_check(pcpu_rt);
	return pcpu_rt;
}
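
/* The per-cpu slot is claimed with cmpxchg(): the caller runs with BHs
 * disabled on this CPU (see ip6_pol_route()), so nothing can have installed
 * a copy between the NULL check in rt6_get_pcpu_route() and here - a
 * non-NULL prev would mean that invariant broke, hence the BUG_ON().
 */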

/* Exception hash table implementation */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	rt6_ex->rt6i->rt6i_node = NULL;
	hlist_del_rcu(&rt6_ex->hlist);
	rt6_release(rt6_ex->rt6i);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
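
/* PMTU/redirect exceptions hang off their parent route in a hash table of
 * FIB6_EXCEPTION_BUCKET_SIZE buckets. The key is a jhash of the destination
 * (extended with the source address under CONFIG_IPV6_SUBTREES) folded to a
 * bucket index by hash_32(); the boot-time random seed keeps the bucket
 * distribution unpredictable to remote senders.
 */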

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
	struct net *net = dev_net(ort->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = ort->from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->rt6i_table->tb6_lock);
		fib6_update_sernum(ort);
		spin_unlock_bh(&ort->rt6i_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->rt6i_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}

static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}
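
/* Worked example: a route whose PMTU is 1400 on a link whose MTU was also
 * 1400 may be raised when the link MTU grows to 1500, since this hop was
 * the path bottleneck. If the 1400 limit came from a remote hop (link MTU
 * already 1500), the route PMTU is left alone; only decreases are applied
 * unconditionally.
 */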

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct rt6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->dst.from have already
			 * been updated.
			 */
			if (entry->rt6i_pmtu &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				entry->rt6i_pmtu = mtu;
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* We are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still hold references to them, so that on the next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. PMTU-generated ones - are pruned when
	 * expired, independently of their aging, as per RFC 8201 section 4.
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
		if (neigh) {
			neigh_flags = neigh->flags;
			neigh_release(neigh);
		}
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct rt6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock_bh(&rt6_exception_lock);
}

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt, *rt_cache;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(net, fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	if (rt == net->ipv6.ip6_null_entry) {
		rcu_read_unlock();
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (rt->rt6i_flags & RTF_CACHE) {
		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
			rt6_dst_from_metrics_check(rt);
		}
		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree. It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
		} else {
			rcu_read_unlock();
			uncached_rt = rt;
			goto uncached_rt_out;
		}
		rcu_read_unlock();

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

uncached_rt_out:
		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		dst_use_noref(&rt->dst, jiffies);
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (!pcpu_rt) {
			/* atomic_inc_not_zero() is needed when using rcu */
			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
				/* No dst_hold() on rt is needed because grabbing
				 * rt->rt6i_ref makes sure rt can't be released.
				 */
				pcpu_rt = rt6_make_pcpu_route(rt);
				rt6_release(rt);
			} else {
				/* rt is already removed from tree */
				pcpu_rt = net->ipv6.ip6_null_entry;
				dst_hold(&pcpu_rt->dst);
			}
		}
		local_bh_enable();
		rcu_read_unlock();
		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = icmp6_hdr(skb);
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowinfo(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
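
/* For ICMPv6 errors the flow keys are taken from the quoted inner header:
 * hashing the offending packet's own addresses steers the error onto the
 * same multipath leg as the flow that triggered it, which PMTU discovery
 * depends on. Pre-dissected flkeys are ignored in that case because they
 * describe the outer ICMP packet, not the flow being reported on.
 */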

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
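
/* Two hash policies are supported (the fib_multipath_hash_policy sysctl):
 * 0 hashes L3 fields only - addresses and flow label, using the inner
 * header for ICMP errors - while 1 folds in L4 ports, dissecting the skb
 * when no pre-computed keys were passed in. The final right shift keeps
 * the hash within 31 bits so it compares safely against the signed
 * rt6i_nh_upper_bound thresholds.
 */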
1923
1924 void ip6_route_input(struct sk_buff *skb)
1925 {
1926 const struct ipv6hdr *iph = ipv6_hdr(skb);
1927 struct net *net = dev_net(skb->dev);
1928 int flags = RT6_LOOKUP_F_HAS_SADDR;
1929 struct ip_tunnel_info *tun_info;
1930 struct flowi6 fl6 = {
1931 .flowi6_iif = skb->dev->ifindex,
1932 .daddr = iph->daddr,
1933 .saddr = iph->saddr,
1934 .flowlabel = ip6_flowinfo(iph),
1935 .flowi6_mark = skb->mark,
1936 .flowi6_proto = iph->nexthdr,
1937 };
1938 struct flow_keys *flkeys = NULL, _flkeys;
1939
1940 tun_info = skb_tunnel_info(skb);
1941 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1942 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1943
1944 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
1945 flkeys = &_flkeys;
1946
1947 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1948 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
1949 skb_dst_drop(skb);
1950 skb_dst_set(skb,
1951 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
1952 }
1953
1954 static struct rt6_info *ip6_pol_route_output(struct net *net,
1955 struct fib6_table *table,
1956 struct flowi6 *fl6,
1957 const struct sk_buff *skb,
1958 int flags)
1959 {
1960 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
1961 }
1962
1963 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1964 struct flowi6 *fl6, int flags)
1965 {
1966 bool any_src;
1967
1968 if (rt6_need_strict(&fl6->daddr)) {
1969 struct dst_entry *dst;
1970
1971 dst = l3mdev_link_scope_lookup(net, fl6);
1972 if (dst)
1973 return dst;
1974 }
1975
1976 fl6->flowi6_iif = LOOPBACK_IFINDEX;
1977
1978 any_src = ipv6_addr_any(&fl6->saddr);
1979 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1980 (fl6->flowi6_oif && any_src))
1981 flags |= RT6_LOOKUP_F_IFACE;
1982
1983 if (!any_src)
1984 flags |= RT6_LOOKUP_F_HAS_SADDR;
1985 else if (sk)
1986 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1987
1988 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
1989 }
1990 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1991
1992 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1993 {
1994 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1995 struct net_device *loopback_dev = net->loopback_dev;
1996 struct dst_entry *new = NULL;
1997
1998 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1999 DST_OBSOLETE_DEAD, 0);
2000 if (rt) {
2001 rt6_info_init(rt);
2002 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2003
2004 new = &rt->dst;
2005 new->__use = 1;
2006 new->input = dst_discard;
2007 new->output = dst_discard_out;
2008
2009 dst_copy_metrics(new, &ort->dst);
2010
2011 rt->rt6i_idev = in6_dev_get(loopback_dev);
2012 rt->rt6i_gateway = ort->rt6i_gateway;
2013 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2014 rt->rt6i_metric = 0;
2015
2016 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2017 #ifdef CONFIG_IPV6_SUBTREES
2018 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2019 #endif
2020 }
2021
2022 dst_release(dst_orig);
2023 return new ? new : ERR_PTR(-ENOMEM);
2024 }
2025
2026 /*
2027 * Destination cache support functions
2028 */
2029
2030 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
2031 {
2032 if (rt->from &&
2033 dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
2034 dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
2035 }
2036
2037 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
2038 {
2039 u32 rt_cookie = 0;
2040
2041 if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
2042 return NULL;
2043
2044 if (rt6_check_expired(rt))
2045 return NULL;
2046
2047 return &rt->dst;
2048 }
2049
2050 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
2051 {
2052 if (!__rt6_check_expired(rt) &&
2053 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2054 rt6_check(rt->from, cookie))
2055 return &rt->dst;
2056 else
2057 return NULL;
2058 }
2059
2060 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2061 {
2062 struct rt6_info *rt;
2063
2064 rt = (struct rt6_info *) dst;
2065
2066 /* All IPv6 dsts are created with ->obsolete set to the value
2067 * DST_OBSOLETE_FORCE_CHK, which always forces validation calls down
2068 * into this function.
2069 */
2070
2071 rt6_dst_from_metrics_check(rt);
2072
2073 if (rt->rt6i_flags & RTF_PCPU ||
2074 (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
2075 return rt6_dst_from_check(rt, cookie);
2076 else
2077 return rt6_check(rt, cookie);
2078 }
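/* An illustrative sketch (not from the kernel tree) of the cookie contract
 * used above: callers cache a dst together with the fn_sernum cookie they
 * saw at lookup time; any tree modification bumps the sernum, so
 * revalidation through ->check() fails and forces a fresh route lookup.
 * Names are hypothetical.
 */
struct cached_dst_sketch {
	void *dst;
	unsigned int cookie;		/* fn_sernum saved at lookup time */
};

static void *revalidate_sketch(struct cached_dst_sketch *c, unsigned int sernum)
{
	return c->cookie == sernum ? c->dst : NULL;	/* NULL: redo lookup */
}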
2079
2080 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2081 {
2082 struct rt6_info *rt = (struct rt6_info *) dst;
2083
2084 if (rt) {
2085 if (rt->rt6i_flags & RTF_CACHE) {
2086 if (rt6_check_expired(rt)) {
2087 ip6_del_rt(rt);
2088 dst = NULL;
2089 }
2090 } else {
2091 dst_release(dst);
2092 dst = NULL;
2093 }
2094 }
2095 return dst;
2096 }
2097
2098 static void ip6_link_failure(struct sk_buff *skb)
2099 {
2100 struct rt6_info *rt;
2101
2102 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2103
2104 rt = (struct rt6_info *) skb_dst(skb);
2105 if (rt) {
2106 if (rt->rt6i_flags & RTF_CACHE) {
2107 if (dst_hold_safe(&rt->dst))
2108 ip6_del_rt(rt);
2109 } else {
2110 struct fib6_node *fn;
2111
2112 rcu_read_lock();
2113 fn = rcu_dereference(rt->rt6i_node);
2114 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2115 fn->fn_sernum = -1;
2116 rcu_read_unlock();
2117 }
2118 }
2119 }
2120
2121 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2122 {
2123 struct net *net = dev_net(rt->dst.dev);
2124
2125 rt->rt6i_flags |= RTF_MODIFIED;
2126 rt->rt6i_pmtu = mtu;
2127 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2128 }
2129
2130 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2131 {
2132 return !(rt->rt6i_flags & RTF_CACHE) &&
2133 (rt->rt6i_flags & RTF_PCPU ||
2134 rcu_access_pointer(rt->rt6i_node));
2135 }
2136
2137 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2138 const struct ipv6hdr *iph, u32 mtu)
2139 {
2140 const struct in6_addr *daddr, *saddr;
2141 struct rt6_info *rt6 = (struct rt6_info *)dst;
2142
2143 if (rt6->rt6i_flags & RTF_LOCAL)
2144 return;
2145
2146 if (dst_metric_locked(dst, RTAX_MTU))
2147 return;
2148
2149 if (iph) {
2150 daddr = &iph->daddr;
2151 saddr = &iph->saddr;
2152 } else if (sk) {
2153 daddr = &sk->sk_v6_daddr;
2154 saddr = &inet6_sk(sk)->saddr;
2155 } else {
2156 daddr = NULL;
2157 saddr = NULL;
2158 }
2159 dst_confirm_neigh(dst, daddr);
2160 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2161 if (mtu >= dst_mtu(dst))
2162 return;
2163
2164 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2165 rt6_do_update_pmtu(rt6, mtu);
2166 /* update rt6_ex->stamp for cache */
2167 if (rt6->rt6i_flags & RTF_CACHE)
2168 rt6_update_exception_stamp_rt(rt6);
2169 } else if (daddr) {
2170 struct rt6_info *nrt6;
2171
2172 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2173 if (nrt6) {
2174 rt6_do_update_pmtu(nrt6, mtu);
2175 if (rt6_insert_exception(nrt6, rt6))
2176 dst_release_immediate(&nrt6->dst);
2177 }
2178 }
2179 }
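/* A minimal standalone sketch (not from the kernel tree) of the policy the
 * max_t() clamp above enforces: RFC 8201 forbids IPv6 path MTU estimates
 * below IPV6_MIN_MTU (1280), so a reported MTU of, say, 576 is raised to
 * 1280 before comparison, and an update may only ever shrink the estimate.
 */
static unsigned int pmtu_clamp_sketch(unsigned int reported, unsigned int cur)
{
	unsigned int mtu = reported < 1280 ? 1280 : reported; /* IPV6_MIN_MTU */

	return mtu < cur ? mtu : cur;	/* updates never grow the path MTU */
}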
2180
2181 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2182 struct sk_buff *skb, u32 mtu)
2183 {
2184 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2185 }
2186
2187 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2188 int oif, u32 mark, kuid_t uid)
2189 {
2190 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2191 struct dst_entry *dst;
2192 struct flowi6 fl6;
2193
2194 memset(&fl6, 0, sizeof(fl6));
2195 fl6.flowi6_oif = oif;
2196 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2197 fl6.daddr = iph->daddr;
2198 fl6.saddr = iph->saddr;
2199 fl6.flowlabel = ip6_flowinfo(iph);
2200 fl6.flowi6_uid = uid;
2201
2202 dst = ip6_route_output(net, NULL, &fl6);
2203 if (!dst->error)
2204 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2205 dst_release(dst);
2206 }
2207 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2208
2209 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2210 {
2211 struct dst_entry *dst;
2212
2213 ip6_update_pmtu(skb, sock_net(sk), mtu,
2214 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2215
2216 dst = __sk_dst_get(sk);
2217 if (!dst || !dst->obsolete ||
2218 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2219 return;
2220
2221 bh_lock_sock(sk);
2222 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2223 ip6_datagram_dst_update(sk, false);
2224 bh_unlock_sock(sk);
2225 }
2226 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2227
2228 /* Handle redirects */
2229 struct ip6rd_flowi {
2230 struct flowi6 fl6;
2231 struct in6_addr gateway;
2232 };
2233
2234 static struct rt6_info *__ip6_route_redirect(struct net *net,
2235 struct fib6_table *table,
2236 struct flowi6 *fl6,
2237 const struct sk_buff *skb,
2238 int flags)
2239 {
2240 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2241 struct rt6_info *rt, *rt_cache;
2242 struct fib6_node *fn;
2243
2244 /* Get the "current" route for this destination and
2245 * check if the redirect has come from the appropriate router.
2246 *
2247 * RFC 4861 specifies that redirects should only be
2248 * accepted if they come from the nexthop to the target.
2249 * Due to the way the routes are chosen, this notion
2250 * is a bit fuzzy and one might need to check all possible
2251 * routes.
2252 */
2253
2254 rcu_read_lock();
2255 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2256 restart:
2257 for_each_fib6_node_rt_rcu(fn) {
2258 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
2259 continue;
2260 if (rt6_check_expired(rt))
2261 continue;
2262 if (rt->dst.error)
2263 break;
2264 if (!(rt->rt6i_flags & RTF_GATEWAY))
2265 continue;
2266 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2267 continue;
2268 /* rt_cache's gateway might be different from its 'parent'
2269 * in the case of an ip redirect.
2270 * So we keep searching in the exception table if the gateway
2271 * is different.
2272 */
2273 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2274 rt_cache = rt6_find_cached_rt(rt,
2275 &fl6->daddr,
2276 &fl6->saddr);
2277 if (rt_cache &&
2278 ipv6_addr_equal(&rdfl->gateway,
2279 &rt_cache->rt6i_gateway)) {
2280 rt = rt_cache;
2281 break;
2282 }
2283 continue;
2284 }
2285 break;
2286 }
2287
2288 if (!rt)
2289 rt = net->ipv6.ip6_null_entry;
2290 else if (rt->dst.error) {
2291 rt = net->ipv6.ip6_null_entry;
2292 goto out;
2293 }
2294
2295 if (rt == net->ipv6.ip6_null_entry) {
2296 fn = fib6_backtrack(fn, &fl6->saddr);
2297 if (fn)
2298 goto restart;
2299 }
2300
2301 out:
2302 ip6_hold_safe(net, &rt, true);
2303
2304 rcu_read_unlock();
2305
2306 trace_fib6_table_lookup(net, rt, table, fl6);
2307 return rt;
2308 }
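/* An illustrative sketch (not from the kernel tree) of the RFC 4861 rule
 * enforced above: a redirect for destination D is honoured only when it was
 * sent by the gateway currently in use for D, whether that is the parent
 * route's gateway or a cached exception's.
 */
static int redirect_from_cur_gw_sketch(const unsigned char *src,
				       const unsigned char *cur_gw)
{
	int i;

	for (i = 0; i < 16; i++)	/* 16-byte IPv6 addresses */
		if (src[i] != cur_gw[i])
			return 0;	/* not our first hop: ignore it */
	return 1;
}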
2309
2310 static struct dst_entry *ip6_route_redirect(struct net *net,
2311 const struct flowi6 *fl6,
2312 const struct sk_buff *skb,
2313 const struct in6_addr *gateway)
2314 {
2315 int flags = RT6_LOOKUP_F_HAS_SADDR;
2316 struct ip6rd_flowi rdfl;
2317
2318 rdfl.fl6 = *fl6;
2319 rdfl.gateway = *gateway;
2320
2321 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2322 flags, __ip6_route_redirect);
2323 }
2324
2325 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2326 kuid_t uid)
2327 {
2328 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2329 struct dst_entry *dst;
2330 struct flowi6 fl6;
2331
2332 memset(&fl6, 0, sizeof(fl6));
2333 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2334 fl6.flowi6_oif = oif;
2335 fl6.flowi6_mark = mark;
2336 fl6.daddr = iph->daddr;
2337 fl6.saddr = iph->saddr;
2338 fl6.flowlabel = ip6_flowinfo(iph);
2339 fl6.flowi6_uid = uid;
2340
2341 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2342 rt6_do_redirect(dst, NULL, skb);
2343 dst_release(dst);
2344 }
2345 EXPORT_SYMBOL_GPL(ip6_redirect);
2346
2347 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2348 u32 mark)
2349 {
2350 const struct ipv6hdr *iph = ipv6_hdr(skb);
2351 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2352 struct dst_entry *dst;
2353 struct flowi6 fl6;
2354
2355 memset(&fl6, 0, sizeof(fl6));
2356 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2357 fl6.flowi6_oif = oif;
2358 fl6.flowi6_mark = mark;
2359 fl6.daddr = msg->dest;
2360 fl6.saddr = iph->daddr;
2361 fl6.flowi6_uid = sock_net_uid(net, NULL);
2362
2363 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2364 rt6_do_redirect(dst, NULL, skb);
2365 dst_release(dst);
2366 }
2367
2368 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2369 {
2370 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2371 sk->sk_uid);
2372 }
2373 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2374
2375 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2376 {
2377 struct net_device *dev = dst->dev;
2378 unsigned int mtu = dst_mtu(dst);
2379 struct net *net = dev_net(dev);
2380
2381 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2382
2383 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2384 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2385
2386 /*
2387 * The maximal non-jumbo IPv6 payload is IPV6_MAXPLEN, and the
2388 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2389 * IPV6_MAXPLEN is also valid and means: "any MSS,
2390 * rely only on pmtu discovery"
2391 */
2392 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2393 mtu = IPV6_MAXPLEN;
2394 return mtu;
2395 }
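/* A worked standalone example (not from the kernel tree) of the advmss
 * computation above: for a 1500-byte link MTU the advertised MSS is
 * 1500 - 40 (IPv6 header) - 20 (TCP header) = 1440; anything above
 * IPV6_MAXPLEN - 20 collapses to IPV6_MAXPLEN (65535), meaning "any MSS,
 * rely on PMTU discovery".
 */
static unsigned int advmss_sketch(unsigned int mtu, unsigned int min_advmss)
{
	unsigned int mss = mtu - 40 - 20;	/* ipv6hdr + tcphdr */

	if (mss < min_advmss)
		mss = min_advmss;
	if (mss > 65535 - 20)			/* IPV6_MAXPLEN - tcphdr */
		mss = 65535;			/* IPV6_MAXPLEN */
	return mss;
}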
2396
2397 static unsigned int ip6_mtu(const struct dst_entry *dst)
2398 {
2399 const struct rt6_info *rt = (const struct rt6_info *)dst;
2400 unsigned int mtu = rt->rt6i_pmtu;
2401 struct inet6_dev *idev;
2402
2403 if (mtu)
2404 goto out;
2405
2406 mtu = dst_metric_raw(dst, RTAX_MTU);
2407 if (mtu)
2408 goto out;
2409
2410 mtu = IPV6_MIN_MTU;
2411
2412 rcu_read_lock();
2413 idev = __in6_dev_get(dst->dev);
2414 if (idev)
2415 mtu = idev->cnf.mtu6;
2416 rcu_read_unlock();
2417
2418 out:
2419 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2420
2421 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2422 }
2423
2424 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2425 struct flowi6 *fl6)
2426 {
2427 struct dst_entry *dst;
2428 struct rt6_info *rt;
2429 struct inet6_dev *idev = in6_dev_get(dev);
2430 struct net *net = dev_net(dev);
2431
2432 if (unlikely(!idev))
2433 return ERR_PTR(-ENODEV);
2434
2435 rt = ip6_dst_alloc(net, dev, 0);
2436 if (unlikely(!rt)) {
2437 in6_dev_put(idev);
2438 dst = ERR_PTR(-ENOMEM);
2439 goto out;
2440 }
2441
2442 rt->dst.flags |= DST_HOST;
2443 rt->dst.input = ip6_input;
2444 rt->dst.output = ip6_output;
2445 rt->rt6i_gateway = fl6->daddr;
2446 rt->rt6i_dst.addr = fl6->daddr;
2447 rt->rt6i_dst.plen = 128;
2448 rt->rt6i_idev = idev;
2449 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2450
2451 /* Add this dst into uncached_list so that rt6_disable_ip() can
2452 * properly release the net_device
2453 */
2454 rt6_uncached_list_add(rt);
2455 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2456
2457 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2458
2459 out:
2460 return dst;
2461 }
2462
2463 static int ip6_dst_gc(struct dst_ops *ops)
2464 {
2465 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2466 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2467 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2468 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2469 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2470 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2471 int entries;
2472
2473 entries = dst_entries_get_fast(ops);
2474 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2475 entries <= rt_max_size)
2476 goto out;
2477
2478 net->ipv6.ip6_rt_gc_expire++;
2479 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2480 entries = dst_entries_get_slow(ops);
2481 if (entries < ops->gc_thresh)
2482 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2483 out:
2484 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2485 return entries > rt_max_size;
2486 }
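/* An illustrative sketch (not from the kernel tree) of the backoff above:
 * ip6_rt_gc_expire grows by one on every forced GC pass and decays
 * geometrically afterwards, so sustained pressure expires routes faster
 * while an idle system relaxes back. With elasticity = 3 (a 1/8th decay),
 * an expire of 64 steps 64 -> 56 -> 49 -> 43 on successive idle calls.
 */
static unsigned long gc_expire_decay_sketch(unsigned long expire,
					    int elasticity)
{
	return expire - (expire >> elasticity);
}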
2487
2488 static int ip6_convert_metrics(struct mx6_config *mxc,
2489 const struct fib6_config *cfg)
2490 {
2491 struct net *net = cfg->fc_nlinfo.nl_net;
2492 bool ecn_ca = false;
2493 struct nlattr *nla;
2494 int remaining;
2495 u32 *mp;
2496
2497 if (!cfg->fc_mx)
2498 return 0;
2499
2500 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2501 if (unlikely(!mp))
2502 return -ENOMEM;
2503
2504 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2505 int type = nla_type(nla);
2506 u32 val;
2507
2508 if (!type)
2509 continue;
2510 if (unlikely(type > RTAX_MAX))
2511 goto err;
2512
2513 if (type == RTAX_CC_ALGO) {
2514 char tmp[TCP_CA_NAME_MAX];
2515
2516 nla_strlcpy(tmp, nla, sizeof(tmp));
2517 val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2518 if (val == TCP_CA_UNSPEC)
2519 goto err;
2520 } else {
2521 val = nla_get_u32(nla);
2522 }
2523 if (type == RTAX_HOPLIMIT && val > 255)
2524 val = 255;
2525 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2526 goto err;
2527
2528 mp[type - 1] = val;
2529 __set_bit(type - 1, mxc->mx_valid);
2530 }
2531
2532 if (ecn_ca) {
2533 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2534 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2535 }
2536
2537 mxc->mx = mp;
2538 return 0;
2539 err:
2540 kfree(mp);
2541 return -EINVAL;
2542 }
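/* An illustrative sketch (not from the kernel tree) of the layout built
 * above: each nested netlink attribute's type is an RTAX_* index, stored
 * densely at mx[type - 1] with a matching validity bit, so the FIB can
 * tell "unset" from "explicitly zero". Sizes below are stand-ins.
 */
struct metrics_sketch {
	unsigned int mx[16];		/* stand-in for RTAX_MAX slots */
	unsigned long valid;		/* bit (type - 1) set when present */
};

static void set_metric_sketch(struct metrics_sketch *m, int type,
			      unsigned int val)
{
	if (type < 1 || type > 16)
		return;			/* mirrors the type > RTAX_MAX check */
	m->mx[type - 1] = val;
	m->valid |= 1UL << (type - 1);
}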
2543
2544 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2545 struct fib6_config *cfg,
2546 const struct in6_addr *gw_addr,
2547 u32 tbid, int flags)
2548 {
2549 struct flowi6 fl6 = {
2550 .flowi6_oif = cfg->fc_ifindex,
2551 .daddr = *gw_addr,
2552 .saddr = cfg->fc_prefsrc,
2553 };
2554 struct fib6_table *table;
2555 struct rt6_info *rt;
2556
2557 table = fib6_get_table(net, tbid);
2558 if (!table)
2559 return NULL;
2560
2561 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2562 flags |= RT6_LOOKUP_F_HAS_SADDR;
2563
2564 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2565 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2566
2567 /* if table lookup failed, fall back to full lookup */
2568 if (rt == net->ipv6.ip6_null_entry) {
2569 ip6_rt_put(rt);
2570 rt = NULL;
2571 }
2572
2573 return rt;
2574 }
2575
2576 static int ip6_route_check_nh_onlink(struct net *net,
2577 struct fib6_config *cfg,
2578 const struct net_device *dev,
2579 struct netlink_ext_ack *extack)
2580 {
2581 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2582 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2583 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2584 struct rt6_info *grt;
2585 int err;
2586
2587 err = 0;
2588 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2589 if (grt) {
2590 if (!grt->dst.error &&
2591 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2592 NL_SET_ERR_MSG(extack,
2593 "Nexthop has invalid gateway or device mismatch");
2594 err = -EINVAL;
2595 }
2596
2597 ip6_rt_put(grt);
2598 }
2599
2600 return err;
2601 }
2602
2603 static int ip6_route_check_nh(struct net *net,
2604 struct fib6_config *cfg,
2605 struct net_device **_dev,
2606 struct inet6_dev **idev)
2607 {
2608 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2609 struct net_device *dev = _dev ? *_dev : NULL;
2610 struct rt6_info *grt = NULL;
2611 int err = -EHOSTUNREACH;
2612
2613 if (cfg->fc_table) {
2614 int flags = RT6_LOOKUP_F_IFACE;
2615
2616 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2617 cfg->fc_table, flags);
2618 if (grt) {
2619 if (grt->rt6i_flags & RTF_GATEWAY ||
2620 (dev && dev != grt->dst.dev)) {
2621 ip6_rt_put(grt);
2622 grt = NULL;
2623 }
2624 }
2625 }
2626
2627 if (!grt)
2628 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2629
2630 if (!grt)
2631 goto out;
2632
2633 if (dev) {
2634 if (dev != grt->dst.dev) {
2635 ip6_rt_put(grt);
2636 goto out;
2637 }
2638 } else {
2639 *_dev = dev = grt->dst.dev;
2640 *idev = grt->rt6i_idev;
2641 dev_hold(dev);
2642 in6_dev_hold(grt->rt6i_idev);
2643 }
2644
2645 if (!(grt->rt6i_flags & RTF_GATEWAY))
2646 err = 0;
2647
2648 ip6_rt_put(grt);
2649
2650 out:
2651 return err;
2652 }
2653
2654 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2655 struct net_device **_dev, struct inet6_dev **idev,
2656 struct netlink_ext_ack *extack)
2657 {
2658 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2659 int gwa_type = ipv6_addr_type(gw_addr);
2660 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2661 const struct net_device *dev = *_dev;
2662 bool need_addr_check = !dev;
2663 int err = -EINVAL;
2664
2665 /* if gw_addr is local we will fail to detect this in case the
2666 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2667 * will return the already-added prefix route via the interface
2668 * that the prefix route was assigned to, which might be non-loopback.
2669 */
2670 if (dev &&
2671 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2672 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2673 goto out;
2674 }
2675
2676 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2677 /* IPv6 strictly prohibits using non-link-local
2678 * addresses as nexthop addresses.
2679 * Otherwise, the router will not be able to send redirects.
2680 * That is a good rule, but in some (rare!) circumstances
2681 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2682 * some exceptions. --ANK
2683 * We allow IPv4-mapped nexthops to support RFC4798-type
2684 * addressing
2685 */
2686 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2687 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2688 goto out;
2689 }
2690
2691 if (cfg->fc_flags & RTNH_F_ONLINK)
2692 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2693 else
2694 err = ip6_route_check_nh(net, cfg, _dev, idev);
2695
2696 if (err)
2697 goto out;
2698 }
2699
2700 /* reload in case device was changed */
2701 dev = *_dev;
2702
2703 err = -EINVAL;
2704 if (!dev) {
2705 NL_SET_ERR_MSG(extack, "Egress device not specified");
2706 goto out;
2707 } else if (dev->flags & IFF_LOOPBACK) {
2708 NL_SET_ERR_MSG(extack,
2709 "Egress device can not be loopback device for this route");
2710 goto out;
2711 }
2712
2713 /* if we did not check gw_addr above, do so now that the
2714 * egress device has been resolved.
2715 */
2716 if (need_addr_check &&
2717 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2718 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2719 goto out;
2720 }
2721
2722 err = 0;
2723 out:
2724 return err;
2725 }
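/* A minimal sketch (not from the kernel tree) of the net effect of the
 * validation above: a link-local gateway is always acceptable, while any
 * other gateway must be unicast or IPv4-mapped *and* resolve through an
 * existing non-gateway route, unless the caller forces it with
 * RTNH_F_ONLINK. The iproute2 spelling of that escape hatch is:
 *
 *   ip -6 route add 2001:db8:1::/64 via 2001:db8:ff::1 dev eth0 onlink
 *
 * The address-type rule, with hypothetical boolean flags:
 */
static int gw_type_ok_sketch(int link_local, int unicast, int v4mapped)
{
	return link_local || unicast || v4mapped;
}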
2726
2727 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2728 struct netlink_ext_ack *extack)
2729 {
2730 struct net *net = cfg->fc_nlinfo.nl_net;
2731 struct rt6_info *rt = NULL;
2732 struct net_device *dev = NULL;
2733 struct inet6_dev *idev = NULL;
2734 struct fib6_table *table;
2735 int addr_type;
2736 int err = -EINVAL;
2737
2738 /* RTF_PCPU is an internal flag; cannot be set by userspace */
2739 if (cfg->fc_flags & RTF_PCPU) {
2740 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2741 goto out;
2742 }
2743
2744 /* RTF_CACHE is an internal flag; cannot be set by userspace */
2745 if (cfg->fc_flags & RTF_CACHE) {
2746 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2747 goto out;
2748 }
2749
2750 if (cfg->fc_dst_len > 128) {
2751 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2752 goto out;
2753 }
2754 if (cfg->fc_src_len > 128) {
2755 NL_SET_ERR_MSG(extack, "Invalid source address length");
2756 goto out;
2757 }
2758 #ifndef CONFIG_IPV6_SUBTREES
2759 if (cfg->fc_src_len) {
2760 NL_SET_ERR_MSG(extack,
2761 "Specifying source address requires IPV6_SUBTREES to be enabled");
2762 goto out;
2763 }
2764 #endif
2765 if (cfg->fc_ifindex) {
2766 err = -ENODEV;
2767 dev = dev_get_by_index(net, cfg->fc_ifindex);
2768 if (!dev)
2769 goto out;
2770 idev = in6_dev_get(dev);
2771 if (!idev)
2772 goto out;
2773 }
2774
2775 if (cfg->fc_metric == 0)
2776 cfg->fc_metric = IP6_RT_PRIO_USER;
2777
2778 if (cfg->fc_flags & RTNH_F_ONLINK) {
2779 if (!dev) {
2780 NL_SET_ERR_MSG(extack,
2781 "Nexthop device required for onlink");
2782 err = -ENODEV;
2783 goto out;
2784 }
2785
2786 if (!(dev->flags & IFF_UP)) {
2787 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2788 err = -ENETDOWN;
2789 goto out;
2790 }
2791 }
2792
2793 err = -ENOBUFS;
2794 if (cfg->fc_nlinfo.nlh &&
2795 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2796 table = fib6_get_table(net, cfg->fc_table);
2797 if (!table) {
2798 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2799 table = fib6_new_table(net, cfg->fc_table);
2800 }
2801 } else {
2802 table = fib6_new_table(net, cfg->fc_table);
2803 }
2804
2805 if (!table)
2806 goto out;
2807
2808 rt = ip6_dst_alloc(net, NULL,
2809 (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2810
2811 if (!rt) {
2812 err = -ENOMEM;
2813 goto out;
2814 }
2815
2816 if (cfg->fc_flags & RTF_EXPIRES)
2817 rt6_set_expires(rt, jiffies +
2818 clock_t_to_jiffies(cfg->fc_expires));
2819 else
2820 rt6_clean_expires(rt);
2821
2822 if (cfg->fc_protocol == RTPROT_UNSPEC)
2823 cfg->fc_protocol = RTPROT_BOOT;
2824 rt->rt6i_protocol = cfg->fc_protocol;
2825
2826 addr_type = ipv6_addr_type(&cfg->fc_dst);
2827
2828 if (addr_type & IPV6_ADDR_MULTICAST)
2829 rt->dst.input = ip6_mc_input;
2830 else if (cfg->fc_flags & RTF_LOCAL)
2831 rt->dst.input = ip6_input;
2832 else
2833 rt->dst.input = ip6_forward;
2834
2835 rt->dst.output = ip6_output;
2836
2837 if (cfg->fc_encap) {
2838 struct lwtunnel_state *lwtstate;
2839
2840 err = lwtunnel_build_state(cfg->fc_encap_type,
2841 cfg->fc_encap, AF_INET6, cfg,
2842 &lwtstate, extack);
2843 if (err)
2844 goto out;
2845 rt->dst.lwtstate = lwtstate_get(lwtstate);
2846 lwtunnel_set_redirect(&rt->dst);
2847 }
2848
2849 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2850 rt->rt6i_dst.plen = cfg->fc_dst_len;
2851 if (rt->rt6i_dst.plen == 128)
2852 rt->dst.flags |= DST_HOST;
2853
2854 #ifdef CONFIG_IPV6_SUBTREES
2855 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2856 rt->rt6i_src.plen = cfg->fc_src_len;
2857 #endif
2858
2859 rt->rt6i_metric = cfg->fc_metric;
2860 rt->rt6i_nh_weight = 1;
2861
2862 /* We cannot add true routes via loopback here;
2863 they would result in the kernel looping, so promote them to reject routes
2864 */
2865 if ((cfg->fc_flags & RTF_REJECT) ||
2866 (dev && (dev->flags & IFF_LOOPBACK) &&
2867 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2868 !(cfg->fc_flags & RTF_LOCAL))) {
2869 /* hold loopback dev/idev if we haven't done so. */
2870 if (dev != net->loopback_dev) {
2871 if (dev) {
2872 dev_put(dev);
2873 in6_dev_put(idev);
2874 }
2875 dev = net->loopback_dev;
2876 dev_hold(dev);
2877 idev = in6_dev_get(dev);
2878 if (!idev) {
2879 err = -ENODEV;
2880 goto out;
2881 }
2882 }
2883 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2884 switch (cfg->fc_type) {
2885 case RTN_BLACKHOLE:
2886 rt->dst.error = -EINVAL;
2887 rt->dst.output = dst_discard_out;
2888 rt->dst.input = dst_discard;
2889 break;
2890 case RTN_PROHIBIT:
2891 rt->dst.error = -EACCES;
2892 rt->dst.output = ip6_pkt_prohibit_out;
2893 rt->dst.input = ip6_pkt_prohibit;
2894 break;
2895 case RTN_THROW:
2896 case RTN_UNREACHABLE:
2897 default:
2898 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2899 : (cfg->fc_type == RTN_UNREACHABLE)
2900 ? -EHOSTUNREACH : -ENETUNREACH;
2901 rt->dst.output = ip6_pkt_discard_out;
2902 rt->dst.input = ip6_pkt_discard;
2903 break;
2904 }
2905 goto install_route;
2906 }
2907
2908 if (cfg->fc_flags & RTF_GATEWAY) {
2909 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
2910 if (err)
2911 goto out;
2912
2913 rt->rt6i_gateway = cfg->fc_gateway;
2914 }
2915
2916 err = -ENODEV;
2917 if (!dev)
2918 goto out;
2919
2920 if (!(dev->flags & IFF_UP)) {
2921 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2922 err = -ENETDOWN;
2923 goto out;
2924 }
2925
2926 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2927 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2928 NL_SET_ERR_MSG(extack, "Invalid source address");
2929 err = -EINVAL;
2930 goto out;
2931 }
2932 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2933 rt->rt6i_prefsrc.plen = 128;
2934 } else
2935 rt->rt6i_prefsrc.plen = 0;
2936
2937 rt->rt6i_flags = cfg->fc_flags;
2938
2939 install_route:
2940 if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
2941 !netif_carrier_ok(dev))
2942 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
2943 rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
2944 rt->dst.dev = dev;
2945 rt->rt6i_idev = idev;
2946 rt->rt6i_table = table;
2947
2948 cfg->fc_nlinfo.nl_net = dev_net(dev);
2949
2950 return rt;
2951 out:
2952 if (dev)
2953 dev_put(dev);
2954 if (idev)
2955 in6_dev_put(idev);
2956 if (rt)
2957 dst_release_immediate(&rt->dst);
2958
2959 return ERR_PTR(err);
2960 }
2961
2962 int ip6_route_add(struct fib6_config *cfg,
2963 struct netlink_ext_ack *extack)
2964 {
2965 struct mx6_config mxc = { .mx = NULL, };
2966 struct rt6_info *rt;
2967 int err;
2968
2969 rt = ip6_route_info_create(cfg, extack);
2970 if (IS_ERR(rt)) {
2971 err = PTR_ERR(rt);
2972 rt = NULL;
2973 goto out;
2974 }
2975
2976 err = ip6_convert_metrics(&mxc, cfg);
2977 if (err)
2978 goto out;
2979
2980 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2981
2982 kfree(mxc.mx);
2983
2984 return err;
2985 out:
2986 if (rt)
2987 dst_release_immediate(&rt->dst);
2988
2989 return err;
2990 }
2991
2992 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2993 {
2994 int err;
2995 struct fib6_table *table;
2996 struct net *net = dev_net(rt->dst.dev);
2997
2998 if (rt == net->ipv6.ip6_null_entry) {
2999 err = -ENOENT;
3000 goto out;
3001 }
3002
3003 table = rt->rt6i_table;
3004 spin_lock_bh(&table->tb6_lock);
3005 err = fib6_del(rt, info);
3006 spin_unlock_bh(&table->tb6_lock);
3007
3008 out:
3009 ip6_rt_put(rt);
3010 return err;
3011 }
3012
3013 int ip6_del_rt(struct rt6_info *rt)
3014 {
3015 struct nl_info info = {
3016 .nl_net = dev_net(rt->dst.dev),
3017 };
3018 return __ip6_del_rt(rt, &info);
3019 }
3020
3021 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
3022 {
3023 struct nl_info *info = &cfg->fc_nlinfo;
3024 struct net *net = info->nl_net;
3025 struct sk_buff *skb = NULL;
3026 struct fib6_table *table;
3027 int err = -ENOENT;
3028
3029 if (rt == net->ipv6.ip6_null_entry)
3030 goto out_put;
3031 table = rt->rt6i_table;
3032 spin_lock_bh(&table->tb6_lock);
3033
3034 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
3035 struct rt6_info *sibling, *next_sibling;
3036
3037 /* prefer to send a single notification with all hops */
3038 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3039 if (skb) {
3040 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3041
3042 if (rt6_fill_node(net, skb, rt,
3043 NULL, NULL, 0, RTM_DELROUTE,
3044 info->portid, seq, 0) < 0) {
3045 kfree_skb(skb);
3046 skb = NULL;
3047 } else
3048 info->skip_notify = 1;
3049 }
3050
3051 list_for_each_entry_safe(sibling, next_sibling,
3052 &rt->rt6i_siblings,
3053 rt6i_siblings) {
3054 err = fib6_del(sibling, info);
3055 if (err)
3056 goto out_unlock;
3057 }
3058 }
3059
3060 err = fib6_del(rt, info);
3061 out_unlock:
3062 spin_unlock_bh(&table->tb6_lock);
3063 out_put:
3064 ip6_rt_put(rt);
3065
3066 if (skb) {
3067 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3068 info->nlh, gfp_any());
3069 }
3070 return err;
3071 }
3072
3073 static int ip6_route_del(struct fib6_config *cfg,
3074 struct netlink_ext_ack *extack)
3075 {
3076 struct rt6_info *rt, *rt_cache;
3077 struct fib6_table *table;
3078 struct fib6_node *fn;
3079 int err = -ESRCH;
3080
3081 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3082 if (!table) {
3083 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3084 return err;
3085 }
3086
3087 rcu_read_lock();
3088
3089 fn = fib6_locate(&table->tb6_root,
3090 &cfg->fc_dst, cfg->fc_dst_len,
3091 &cfg->fc_src, cfg->fc_src_len,
3092 !(cfg->fc_flags & RTF_CACHE));
3093
3094 if (fn) {
3095 for_each_fib6_node_rt_rcu(fn) {
3096 if (cfg->fc_flags & RTF_CACHE) {
3097 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3098 &cfg->fc_src);
3099 if (!rt_cache)
3100 continue;
3101 rt = rt_cache;
3102 }
3103 if (cfg->fc_ifindex &&
3104 (!rt->dst.dev ||
3105 rt->dst.dev->ifindex != cfg->fc_ifindex))
3106 continue;
3107 if (cfg->fc_flags & RTF_GATEWAY &&
3108 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3109 continue;
3110 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
3111 continue;
3112 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
3113 continue;
3114 if (!dst_hold_safe(&rt->dst))
3115 break;
3116 rcu_read_unlock();
3117
3118 /* if a gateway was specified, only delete the one hop */
3119 if (cfg->fc_flags & RTF_GATEWAY)
3120 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3121
3122 return __ip6_del_rt_siblings(rt, cfg);
3123 }
3124 }
3125 rcu_read_unlock();
3126
3127 return err;
3128 }
3129
3130 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3131 {
3132 struct netevent_redirect netevent;
3133 struct rt6_info *rt, *nrt = NULL;
3134 struct ndisc_options ndopts;
3135 struct inet6_dev *in6_dev;
3136 struct neighbour *neigh;
3137 struct rd_msg *msg;
3138 int optlen, on_link;
3139 u8 *lladdr;
3140
3141 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3142 optlen -= sizeof(*msg);
3143
3144 if (optlen < 0) {
3145 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3146 return;
3147 }
3148
3149 msg = (struct rd_msg *)icmp6_hdr(skb);
3150
3151 if (ipv6_addr_is_multicast(&msg->dest)) {
3152 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3153 return;
3154 }
3155
3156 on_link = 0;
3157 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3158 on_link = 1;
3159 } else if (ipv6_addr_type(&msg->target) !=
3160 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3161 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3162 return;
3163 }
3164
3165 in6_dev = __in6_dev_get(skb->dev);
3166 if (!in6_dev)
3167 return;
3168 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3169 return;
3170
3171 /* RFC2461 8.1:
3172 * The IP source address of the Redirect MUST be the same as the current
3173 * first-hop router for the specified ICMP Destination Address.
3174 */
3175
3176 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3177 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3178 return;
3179 }
3180
3181 lladdr = NULL;
3182 if (ndopts.nd_opts_tgt_lladdr) {
3183 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3184 skb->dev);
3185 if (!lladdr) {
3186 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3187 return;
3188 }
3189 }
3190
3191 rt = (struct rt6_info *) dst;
3192 if (rt->rt6i_flags & RTF_REJECT) {
3193 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3194 return;
3195 }
3196
3197 /* Redirect received -> path was valid.
3198 * Look, redirects are sent only in response to data packets,
3199 * so this nexthop is apparently reachable. --ANK
3200 */
3201 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3202
3203 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3204 if (!neigh)
3205 return;
3206
3207 /*
3208 * We have finally decided to accept it.
3209 */
3210
3211 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3212 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3213 NEIGH_UPDATE_F_OVERRIDE|
3214 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3215 NEIGH_UPDATE_F_ISROUTER)),
3216 NDISC_REDIRECT, &ndopts);
3217
3218 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
3219 if (!nrt)
3220 goto out;
3221
3222 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3223 if (on_link)
3224 nrt->rt6i_flags &= ~RTF_GATEWAY;
3225
3226 nrt->rt6i_protocol = RTPROT_REDIRECT;
3227 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3228
3229 /* No need to remove rt from the exception table if rt is
3230 * a cached route because rt6_insert_exception() will
3231 * take care of it
3232 */
3233 if (rt6_insert_exception(nrt, rt)) {
3234 dst_release_immediate(&nrt->dst);
3235 goto out;
3236 }
3237
3238 netevent.old = &rt->dst;
3239 netevent.new = &nrt->dst;
3240 netevent.daddr = &msg->dest;
3241 netevent.neigh = neigh;
3242 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3243
3244 out:
3245 neigh_release(neigh);
3246 }
3247
3248 /*
3249 * Misc support functions
3250 */
3251
3252 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3253 {
3254 BUG_ON(from->from);
3255
3256 rt->rt6i_flags &= ~RTF_EXPIRES;
3257 dst_hold(&from->dst);
3258 rt->from = from;
3259 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3260 }
3261
3262 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3263 {
3264 rt->dst.input = ort->dst.input;
3265 rt->dst.output = ort->dst.output;
3266 rt->rt6i_dst = ort->rt6i_dst;
3267 rt->dst.error = ort->dst.error;
3268 rt->rt6i_idev = ort->rt6i_idev;
3269 if (rt->rt6i_idev)
3270 in6_dev_hold(rt->rt6i_idev);
3271 rt->dst.lastuse = jiffies;
3272 rt->rt6i_gateway = ort->rt6i_gateway;
3273 rt->rt6i_flags = ort->rt6i_flags;
3274 rt6_set_from(rt, ort);
3275 rt->rt6i_metric = ort->rt6i_metric;
3276 #ifdef CONFIG_IPV6_SUBTREES
3277 rt->rt6i_src = ort->rt6i_src;
3278 #endif
3279 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3280 rt->rt6i_table = ort->rt6i_table;
3281 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3282 }
3283
3284 #ifdef CONFIG_IPV6_ROUTE_INFO
3285 static struct rt6_info *rt6_get_route_info(struct net *net,
3286 const struct in6_addr *prefix, int prefixlen,
3287 const struct in6_addr *gwaddr,
3288 struct net_device *dev)
3289 {
3290 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3291 int ifindex = dev->ifindex;
3292 struct fib6_node *fn;
3293 struct rt6_info *rt = NULL;
3294 struct fib6_table *table;
3295
3296 table = fib6_get_table(net, tb_id);
3297 if (!table)
3298 return NULL;
3299
3300 rcu_read_lock();
3301 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3302 if (!fn)
3303 goto out;
3304
3305 for_each_fib6_node_rt_rcu(fn) {
3306 if (rt->dst.dev->ifindex != ifindex)
3307 continue;
3308 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3309 continue;
3310 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3311 continue;
3312 ip6_hold_safe(NULL, &rt, false);
3313 break;
3314 }
3315 out:
3316 rcu_read_unlock();
3317 return rt;
3318 }
3319
3320 static struct rt6_info *rt6_add_route_info(struct net *net,
3321 const struct in6_addr *prefix, int prefixlen,
3322 const struct in6_addr *gwaddr,
3323 struct net_device *dev,
3324 unsigned int pref)
3325 {
3326 struct fib6_config cfg = {
3327 .fc_metric = IP6_RT_PRIO_USER,
3328 .fc_ifindex = dev->ifindex,
3329 .fc_dst_len = prefixlen,
3330 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3331 RTF_UP | RTF_PREF(pref),
3332 .fc_protocol = RTPROT_RA,
3333 .fc_nlinfo.portid = 0,
3334 .fc_nlinfo.nlh = NULL,
3335 .fc_nlinfo.nl_net = net,
3336 };
3337
3338 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3339 cfg.fc_dst = *prefix;
3340 cfg.fc_gateway = *gwaddr;
3341
3342 /* We should treat it as a default route if prefix length is 0. */
3343 if (!prefixlen)
3344 cfg.fc_flags |= RTF_DEFAULT;
3345
3346 ip6_route_add(&cfg, NULL);
3347
3348 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3349 }
3350 #endif
3351
3352 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3353 {
3354 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3355 struct rt6_info *rt;
3356 struct fib6_table *table;
3357
3358 table = fib6_get_table(dev_net(dev), tb_id);
3359 if (!table)
3360 return NULL;
3361
3362 rcu_read_lock();
3363 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3364 if (dev == rt->dst.dev &&
3365 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3366 ipv6_addr_equal(&rt->rt6i_gateway, addr))
3367 break;
3368 }
3369 if (rt)
3370 ip6_hold_safe(NULL, &rt, false);
3371 rcu_read_unlock();
3372 return rt;
3373 }
3374
3375 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3376 struct net_device *dev,
3377 unsigned int pref)
3378 {
3379 struct fib6_config cfg = {
3380 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3381 .fc_metric = IP6_RT_PRIO_USER,
3382 .fc_ifindex = dev->ifindex,
3383 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3384 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3385 .fc_protocol = RTPROT_RA,
3386 .fc_nlinfo.portid = 0,
3387 .fc_nlinfo.nlh = NULL,
3388 .fc_nlinfo.nl_net = dev_net(dev),
3389 };
3390
3391 cfg.fc_gateway = *gwaddr;
3392
3393 if (!ip6_route_add(&cfg, NULL)) {
3394 struct fib6_table *table;
3395
3396 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3397 if (table)
3398 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3399 }
3400
3401 return rt6_get_dflt_router(gwaddr, dev);
3402 }
3403
3404 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3405 {
3406 struct rt6_info *rt;
3407
3408 restart:
3409 rcu_read_lock();
3410 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3411 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3412 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3413 if (dst_hold_safe(&rt->dst)) {
3414 rcu_read_unlock();
3415 ip6_del_rt(rt);
3416 } else {
3417 rcu_read_unlock();
3418 }
3419 goto restart;
3420 }
3421 }
3422 rcu_read_unlock();
3423
3424 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3425 }
3426
3427 void rt6_purge_dflt_routers(struct net *net)
3428 {
3429 struct fib6_table *table;
3430 struct hlist_head *head;
3431 unsigned int h;
3432
3433 rcu_read_lock();
3434
3435 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3436 head = &net->ipv6.fib_table_hash[h];
3437 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3438 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3439 __rt6_purge_dflt_routers(table);
3440 }
3441 }
3442
3443 rcu_read_unlock();
3444 }
3445
3446 static void rtmsg_to_fib6_config(struct net *net,
3447 struct in6_rtmsg *rtmsg,
3448 struct fib6_config *cfg)
3449 {
3450 memset(cfg, 0, sizeof(*cfg));
3451
3452 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3453 : RT6_TABLE_MAIN;
3454 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3455 cfg->fc_metric = rtmsg->rtmsg_metric;
3456 cfg->fc_expires = rtmsg->rtmsg_info;
3457 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3458 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3459 cfg->fc_flags = rtmsg->rtmsg_flags;
3460
3461 cfg->fc_nlinfo.nl_net = net;
3462
3463 cfg->fc_dst = rtmsg->rtmsg_dst;
3464 cfg->fc_src = rtmsg->rtmsg_src;
3465 cfg->fc_gateway = rtmsg->rtmsg_gateway;
3466 }
3467
3468 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3469 {
3470 struct fib6_config cfg;
3471 struct in6_rtmsg rtmsg;
3472 int err;
3473
3474 switch (cmd) {
3475 case SIOCADDRT: /* Add a route */
3476 case SIOCDELRT: /* Delete a route */
3477 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3478 return -EPERM;
3479 err = copy_from_user(&rtmsg, arg,
3480 sizeof(struct in6_rtmsg));
3481 if (err)
3482 return -EFAULT;
3483
3484 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3485
3486 rtnl_lock();
3487 switch (cmd) {
3488 case SIOCADDRT:
3489 err = ip6_route_add(&cfg, NULL);
3490 break;
3491 case SIOCDELRT:
3492 err = ip6_route_del(&cfg, NULL);
3493 break;
3494 default:
3495 err = -EINVAL;
3496 }
3497 rtnl_unlock();
3498
3499 return err;
3500 }
3501
3502 return -EINVAL;
3503 }
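/* A userspace sketch of driving this ioctl, kept inside a comment since it
 * cannot compile in-kernel (not from the kernel tree; error handling is
 * elided and CAP_NET_ADMIN is required). It adds a route to 2001:db8::/32
 * out of the given interface:
 *
 *	#include <string.h>
 *	#include <arpa/inet.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *	#include <net/route.h>
 *	#include <linux/ipv6_route.h>
 *
 *	int add_route_example(int ifindex)
 *	{
 *		struct in6_rtmsg rt = { .rtmsg_dst_len = 32,
 *					.rtmsg_metric  = 1,
 *					.rtmsg_flags   = RTF_UP,
 *					.rtmsg_ifindex = ifindex };
 *		int fd = socket(AF_INET6, SOCK_DGRAM, 0);
 *
 *		inet_pton(AF_INET6, "2001:db8::", &rt.rtmsg_dst);
 *		return ioctl(fd, SIOCADDRT, &rt);
 *	}
 */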
3504
3505 /*
3506 * Drop the packet on the floor
3507 */
3508
3509 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3510 {
3511 int type;
3512 struct dst_entry *dst = skb_dst(skb);
3513 switch (ipstats_mib_noroutes) {
3514 case IPSTATS_MIB_INNOROUTES:
3515 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3516 if (type == IPV6_ADDR_ANY) {
3517 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3518 IPSTATS_MIB_INADDRERRORS);
3519 break;
3520 }
3521 /* FALLTHROUGH */
3522 case IPSTATS_MIB_OUTNOROUTES:
3523 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3524 ipstats_mib_noroutes);
3525 break;
3526 }
3527 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3528 kfree_skb(skb);
3529 return 0;
3530 }
3531
3532 static int ip6_pkt_discard(struct sk_buff *skb)
3533 {
3534 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3535 }
3536
3537 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3538 {
3539 skb->dev = skb_dst(skb)->dev;
3540 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3541 }
3542
3543 static int ip6_pkt_prohibit(struct sk_buff *skb)
3544 {
3545 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3546 }
3547
3548 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3549 {
3550 skb->dev = skb_dst(skb)->dev;
3551 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3552 }
3553
3554 /*
3555 * Allocate a dst for local (unicast / anycast) address.
3556 */
3557
3558 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3559 const struct in6_addr *addr,
3560 bool anycast)
3561 {
3562 u32 tb_id;
3563 struct net *net = dev_net(idev->dev);
3564 struct net_device *dev = idev->dev;
3565 struct rt6_info *rt;
3566
3567 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3568 if (!rt)
3569 return ERR_PTR(-ENOMEM);
3570
3571 in6_dev_hold(idev);
3572
3573 rt->dst.flags |= DST_HOST;
3574 rt->dst.input = ip6_input;
3575 rt->dst.output = ip6_output;
3576 rt->rt6i_idev = idev;
3577
3578 rt->rt6i_protocol = RTPROT_KERNEL;
3579 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3580 if (anycast)
3581 rt->rt6i_flags |= RTF_ANYCAST;
3582 else
3583 rt->rt6i_flags |= RTF_LOCAL;
3584
3585 rt->rt6i_gateway = *addr;
3586 rt->rt6i_dst.addr = *addr;
3587 rt->rt6i_dst.plen = 128;
3588 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3589 rt->rt6i_table = fib6_get_table(net, tb_id);
3590
3591 return rt;
3592 }
3593
3594 /* remove deleted ip from prefsrc entries */
3595 struct arg_dev_net_ip {
3596 struct net_device *dev;
3597 struct net *net;
3598 struct in6_addr *addr;
3599 };
3600
3601 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3602 {
3603 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3604 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3605 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3606
3607 if (((void *)rt->dst.dev == dev || !dev) &&
3608 rt != net->ipv6.ip6_null_entry &&
3609 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3610 spin_lock_bh(&rt6_exception_lock);
3611 /* remove prefsrc entry */
3612 rt->rt6i_prefsrc.plen = 0;
3613 /* need to update cache as well */
3614 rt6_exceptions_remove_prefsrc(rt);
3615 spin_unlock_bh(&rt6_exception_lock);
3616 }
3617 return 0;
3618 }
3619
3620 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3621 {
3622 struct net *net = dev_net(ifp->idev->dev);
3623 struct arg_dev_net_ip adni = {
3624 .dev = ifp->idev->dev,
3625 .net = net,
3626 .addr = &ifp->addr,
3627 };
3628 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3629 }
3630
3631 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3632
3633 /* Remove routers and update dst entries when a gateway turns into a host. */
3634 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3635 {
3636 struct in6_addr *gateway = (struct in6_addr *)arg;
3637
3638 if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3639 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3640 return -1;
3641 }
3642
3643 /* Further clean up cached routes in exception table.
3644 * This is needed because a cached route may have a different
3645 * gateway than its 'parent' in the case of an IP redirect.
3646 */
3647 rt6_exceptions_clean_tohost(rt, gateway);
3648
3649 return 0;
3650 }
3651
3652 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3653 {
3654 fib6_clean_all(net, fib6_clean_tohost, gateway);
3655 }
3656
3657 struct arg_netdev_event {
3658 const struct net_device *dev;
3659 union {
3660 unsigned int nh_flags;
3661 unsigned long event;
3662 };
3663 };
3664
3665 static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
3666 {
3667 struct rt6_info *iter;
3668 struct fib6_node *fn;
3669
3670 fn = rcu_dereference_protected(rt->rt6i_node,
3671 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3672 iter = rcu_dereference_protected(fn->leaf,
3673 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3674 while (iter) {
3675 if (iter->rt6i_metric == rt->rt6i_metric &&
3676 rt6_qualify_for_ecmp(iter))
3677 return iter;
3678 iter = rcu_dereference_protected(iter->rt6_next,
3679 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3680 }
3681
3682 return NULL;
3683 }
3684
3685 static bool rt6_is_dead(const struct rt6_info *rt)
3686 {
3687 if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
3688 (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
3689 rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3690 return true;
3691
3692 return false;
3693 }
3694
3695 static int rt6_multipath_total_weight(const struct rt6_info *rt)
3696 {
3697 struct rt6_info *iter;
3698 int total = 0;
3699
3700 if (!rt6_is_dead(rt))
3701 total += rt->rt6i_nh_weight;
3702
3703 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3704 if (!rt6_is_dead(iter))
3705 total += iter->rt6i_nh_weight;
3706 }
3707
3708 return total;
3709 }
3710
3711 static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
3712 {
3713 int upper_bound = -1;
3714
3715 if (!rt6_is_dead(rt)) {
3716 *weight += rt->rt6i_nh_weight;
3717 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3718 total) - 1;
3719 }
3720 atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
3721 }
3722
3723 static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
3724 {
3725 struct rt6_info *iter;
3726 int weight = 0;
3727
3728 rt6_upper_bound_set(rt, &weight, total);
3729
3730 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3731 rt6_upper_bound_set(iter, &weight, total);
3732 }
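/* A worked standalone example (not from the kernel tree) of the arithmetic
 * above: cumulative weights are scaled into the 31-bit hash space with
 * round-to-closest division. For two siblings with weights {1, 2} the
 * bounds come out as 0x2AAAAAAA and 0x7FFFFFFF, so roughly 1/3 of flow
 * hashes pick the first nexthop and 2/3 the second.
 */
static int ecmp_upper_bound_sketch(int cum_weight, int total)
{
	unsigned long long scaled = ((unsigned long long)cum_weight << 31)
				    + total / 2;	/* round to closest */

	return (int)(scaled / total) - 1;
}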
3733
3734 void rt6_multipath_rebalance(struct rt6_info *rt)
3735 {
3736 struct rt6_info *first;
3737 int total;
3738
3739 /* In case the entire multipath route was marked for flushing,
3740 * then there is no need to rebalance upon the removal of every
3741 * sibling route.
3742 */
3743 if (!rt->rt6i_nsiblings || rt->should_flush)
3744 return;
3745
3746 /* During lookup routes are evaluated in order, so we need to
3747 * make sure upper bounds are assigned from the first sibling
3748 * onwards.
3749 */
3750 first = rt6_multipath_first_sibling(rt);
3751 if (WARN_ON_ONCE(!first))
3752 return;
3753
3754 total = rt6_multipath_total_weight(first);
3755 rt6_multipath_upper_bound_set(first, total);
3756 }
3757
3758 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3759 {
3760 const struct arg_netdev_event *arg = p_arg;
3761 const struct net *net = dev_net(arg->dev);
3762
3763 if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
3764 rt->rt6i_nh_flags &= ~arg->nh_flags;
3765 fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
3766 rt6_multipath_rebalance(rt);
3767 }
3768
3769 return 0;
3770 }
3771
3772 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3773 {
3774 struct arg_netdev_event arg = {
3775 .dev = dev,
3776 {
3777 .nh_flags = nh_flags,
3778 },
3779 };
3780
3781 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3782 arg.nh_flags |= RTNH_F_LINKDOWN;
3783
3784 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3785 }
3786
3787 static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3788 const struct net_device *dev)
3789 {
3790 struct rt6_info *iter;
3791
3792 if (rt->dst.dev == dev)
3793 return true;
3794 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3795 if (iter->dst.dev == dev)
3796 return true;
3797
3798 return false;
3799 }
3800
3801 static void rt6_multipath_flush(struct rt6_info *rt)
3802 {
3803 struct rt6_info *iter;
3804
3805 rt->should_flush = 1;
3806 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3807 iter->should_flush = 1;
3808 }
3809
3810 static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3811 const struct net_device *down_dev)
3812 {
3813 struct rt6_info *iter;
3814 unsigned int dead = 0;
3815
3816 if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
3817 dead++;
3818 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3819 if (iter->dst.dev == down_dev ||
3820 iter->rt6i_nh_flags & RTNH_F_DEAD)
3821 dead++;
3822
3823 return dead;
3824 }
3825
3826 static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3827 const struct net_device *dev,
3828 unsigned int nh_flags)
3829 {
3830 struct rt6_info *iter;
3831
3832 if (rt->dst.dev == dev)
3833 rt->rt6i_nh_flags |= nh_flags;
3834 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3835 if (iter->dst.dev == dev)
3836 iter->rt6i_nh_flags |= nh_flags;
3837 }
3838
3839 /* called with the write lock held for the table containing rt */
3840 static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
3841 {
3842 const struct arg_netdev_event *arg = p_arg;
3843 const struct net_device *dev = arg->dev;
3844 const struct net *net = dev_net(dev);
3845
3846 if (rt == net->ipv6.ip6_null_entry)
3847 return 0;
3848
3849 switch (arg->event) {
3850 case NETDEV_UNREGISTER:
3851 return rt->dst.dev == dev ? -1 : 0;
3852 case NETDEV_DOWN:
3853 if (rt->should_flush)
3854 return -1;
3855 if (!rt->rt6i_nsiblings)
3856 return rt->dst.dev == dev ? -1 : 0;
3857 if (rt6_multipath_uses_dev(rt, dev)) {
3858 unsigned int count;
3859
3860 count = rt6_multipath_dead_count(rt, dev);
3861 if (rt->rt6i_nsiblings + 1 == count) {
3862 rt6_multipath_flush(rt);
3863 return -1;
3864 }
3865 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3866 RTNH_F_LINKDOWN);
3867 fib6_update_sernum(rt);
3868 rt6_multipath_rebalance(rt);
3869 }
3870 return -2;
3871 case NETDEV_CHANGE:
3872 if (rt->dst.dev != dev ||
3873 rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
3874 break;
3875 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
3876 rt6_multipath_rebalance(rt);
3877 break;
3878 }
3879
3880 return 0;
3881 }
3882
3883 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3884 {
3885 struct arg_netdev_event arg = {
3886 .dev = dev,
3887 {
3888 .event = event,
3889 },
3890 };
3891
3892 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3893 }
3894
3895 void rt6_disable_ip(struct net_device *dev, unsigned long event)
3896 {
3897 rt6_sync_down_dev(dev, event);
3898 rt6_uncached_list_flush_dev(dev_net(dev), dev);
3899 neigh_ifdown(&nd_tbl, dev);
3900 }
3901
3902 struct rt6_mtu_change_arg {
3903 struct net_device *dev;
3904 unsigned int mtu;
3905 };
3906
3907 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3908 {
3909 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3910 struct inet6_dev *idev;
3911
3912 /* In IPv6, pmtu discovery is not optional,
3913 so the RTAX_MTU lock cannot disable it.
3914 We still use this lock to block changes
3915 caused by addrconf/ndisc.
3916 */
3917
3918 idev = __in6_dev_get(arg->dev);
3919 if (!idev)
3920 return 0;
3921
3922 /* For an administrative MTU increase, there is no way to discover
3923 the corresponding IPv6 PMTU increase, so the PMTU should be updated here.
3924 Since RFC 1981 doesn't cover administrative MTU increases,
3925 updating the PMTU on such an increase (e.g. jumbo frames) is a MUST.
3926 */
3927 if (rt->dst.dev == arg->dev &&
3928 !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3929 spin_lock_bh(&rt6_exception_lock);
3930 if (dst_metric_raw(&rt->dst, RTAX_MTU) &&
3931 rt6_mtu_change_route_allowed(idev, rt, arg->mtu))
3932 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3933 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
3934 spin_unlock_bh(&rt6_exception_lock);
3935 }
3936 return 0;
3937 }
3938
3939 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3940 {
3941 struct rt6_mtu_change_arg arg = {
3942 .dev = dev,
3943 .mtu = mtu,
3944 };
3945
3946 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3947 }
3948
3949 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3950 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
3951 [RTA_OIF] = { .type = NLA_U32 },
3952 [RTA_IIF] = { .type = NLA_U32 },
3953 [RTA_PRIORITY] = { .type = NLA_U32 },
3954 [RTA_METRICS] = { .type = NLA_NESTED },
3955 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
3956 [RTA_PREF] = { .type = NLA_U8 },
3957 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
3958 [RTA_ENCAP] = { .type = NLA_NESTED },
3959 [RTA_EXPIRES] = { .type = NLA_U32 },
3960 [RTA_UID] = { .type = NLA_U32 },
3961 [RTA_MARK] = { .type = NLA_U32 },
3962 };
3963
3964 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3965 struct fib6_config *cfg,
3966 struct netlink_ext_ack *extack)
3967 {
3968 struct rtmsg *rtm;
3969 struct nlattr *tb[RTA_MAX+1];
3970 unsigned int pref;
3971 int err;
3972
3973 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3974 NULL);
3975 if (err < 0)
3976 goto errout;
3977
3978 err = -EINVAL;
3979 rtm = nlmsg_data(nlh);
3980 memset(cfg, 0, sizeof(*cfg));
3981
3982 cfg->fc_table = rtm->rtm_table;
3983 cfg->fc_dst_len = rtm->rtm_dst_len;
3984 cfg->fc_src_len = rtm->rtm_src_len;
3985 cfg->fc_flags = RTF_UP;
3986 cfg->fc_protocol = rtm->rtm_protocol;
3987 cfg->fc_type = rtm->rtm_type;
3988
3989 if (rtm->rtm_type == RTN_UNREACHABLE ||
3990 rtm->rtm_type == RTN_BLACKHOLE ||
3991 rtm->rtm_type == RTN_PROHIBIT ||
3992 rtm->rtm_type == RTN_THROW)
3993 cfg->fc_flags |= RTF_REJECT;
3994
3995 if (rtm->rtm_type == RTN_LOCAL)
3996 cfg->fc_flags |= RTF_LOCAL;
3997
3998 if (rtm->rtm_flags & RTM_F_CLONED)
3999 cfg->fc_flags |= RTF_CACHE;
4000
4001 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4002
4003 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4004 cfg->fc_nlinfo.nlh = nlh;
4005 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4006
4007 if (tb[RTA_GATEWAY]) {
4008 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4009 cfg->fc_flags |= RTF_GATEWAY;
4010 }
4011
4012 if (tb[RTA_DST]) {
4013 int plen = (rtm->rtm_dst_len + 7) >> 3;
4014
4015 if (nla_len(tb[RTA_DST]) < plen)
4016 goto errout;
4017
4018 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4019 }
4020
4021 if (tb[RTA_SRC]) {
4022 int plen = (rtm->rtm_src_len + 7) >> 3;
4023
4024 if (nla_len(tb[RTA_SRC]) < plen)
4025 goto errout;
4026
4027 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4028 }
4029
4030 if (tb[RTA_PREFSRC])
4031 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4032
4033 if (tb[RTA_OIF])
4034 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4035
4036 if (tb[RTA_PRIORITY])
4037 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4038
4039 if (tb[RTA_METRICS]) {
4040 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4041 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4042 }
4043
4044 if (tb[RTA_TABLE])
4045 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4046
4047 if (tb[RTA_MULTIPATH]) {
4048 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4049 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4050
4051 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4052 cfg->fc_mp_len, extack);
4053 if (err < 0)
4054 goto errout;
4055 }
4056
4057 if (tb[RTA_PREF]) {
4058 pref = nla_get_u8(tb[RTA_PREF]);
4059 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4060 pref != ICMPV6_ROUTER_PREF_HIGH)
4061 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4062 cfg->fc_flags |= RTF_PREF(pref);
4063 }
4064
4065 if (tb[RTA_ENCAP])
4066 cfg->fc_encap = tb[RTA_ENCAP];
4067
4068 if (tb[RTA_ENCAP_TYPE]) {
4069 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4070
4071 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4072 if (err < 0)
4073 goto errout;
4074 }
4075
4076 if (tb[RTA_EXPIRES]) {
4077 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4078
4079 if (addrconf_finite_timeout(timeout)) {
4080 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4081 cfg->fc_flags |= RTF_EXPIRES;
4082 }
4083 }
4084
4085 err = 0;
4086 errout:
4087 return err;
4088 }
4089
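/* Per-nexthop bookkeeping while a multipath route is being installed:
* the constructed route, the config it came from and its metrics.
*/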
4090 struct rt6_nh {
4091 struct rt6_info *rt6_info;
4092 struct fib6_config r_cfg;
4093 struct mx6_config mxc;
4094 struct list_head next;
4095 };
4096
4097 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4098 {
4099 struct rt6_nh *nh;
4100
4101 list_for_each_entry(nh, rt6_nh_list, next) {
4102 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4103 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4104 nh->r_cfg.fc_ifindex);
4105 }
4106 }
4107
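/* Queue rt on rt6_nh_list for later insertion; returns -EEXIST if an
* equivalent nexthop is already queued so the caller can drop it.
*/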
4108 static int ip6_route_info_append(struct list_head *rt6_nh_list,
4109 struct rt6_info *rt, struct fib6_config *r_cfg)
4110 {
4111 struct rt6_nh *nh;
4112 int err = -EEXIST;
4113
4114 list_for_each_entry(nh, rt6_nh_list, next) {
4115 /* check if rt6_info already exists */
4116 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
4117 return err;
4118 }
4119
4120 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4121 if (!nh)
4122 return -ENOMEM;
4123 nh->rt6_info = rt;
4124 err = ip6_convert_metrics(&nh->mxc, r_cfg);
4125 if (err) {
4126 kfree(nh);
4127 return err;
4128 }
4129 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4130 list_add_tail(&nh->next, rt6_nh_list);
4131
4132 return 0;
4133 }
4134
4135 static void ip6_route_mpath_notify(struct rt6_info *rt,
4136 struct rt6_info *rt_last,
4137 struct nl_info *info,
4138 __u16 nlflags)
4139 {
4140 /* If this is an APPEND route, then rt points to the first route
4141 * inserted and rt_last points to the last route inserted. Userspace
4142 * wants a consistent dump of the route which starts at the first
4143 * nexthop. Since sibling routes are always added at the end of
4144 * the list, find the first sibling of the last route appended.
4145 */
4146 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
4147 rt = list_first_entry(&rt_last->rt6i_siblings,
4148 struct rt6_info,
4149 rt6i_siblings);
4150 }
4151
4152 if (rt)
4153 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4154 }
4155
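/* Install a multipath route in three steps: build one rt6_info per
* rtnexthop block (rt6_nh_list), insert them one by one with
* per-route notifications suppressed, then send a single notification
* covering the whole route. On partial failure the nexthops already
* installed are deleted again so userspace keeps a coherent view.
*/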
4156 static int ip6_route_multipath_add(struct fib6_config *cfg,
4157 struct netlink_ext_ack *extack)
4158 {
4159 struct rt6_info *rt_notif = NULL, *rt_last = NULL;
4160 struct nl_info *info = &cfg->fc_nlinfo;
4161 struct fib6_config r_cfg;
4162 struct rtnexthop *rtnh;
4163 struct rt6_info *rt;
4164 struct rt6_nh *err_nh;
4165 struct rt6_nh *nh, *nh_safe;
4166 __u16 nlflags;
4167 int remaining;
4168 int attrlen;
4169 int err = 1;
4170 int nhn = 0;
4171 int replace = (cfg->fc_nlinfo.nlh &&
4172 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4173 LIST_HEAD(rt6_nh_list);
4174
4175 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4176 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4177 nlflags |= NLM_F_APPEND;
4178
4179 remaining = cfg->fc_mp_len;
4180 rtnh = (struct rtnexthop *)cfg->fc_mp;
4181
4182 /* Parse the multipath entries and build a list (rt6_nh_list) with
4183 * one rt6_info struct per nexthop.
4184 */
4185 while (rtnh_ok(rtnh, remaining)) {
4186 memcpy(&r_cfg, cfg, sizeof(*cfg));
4187 if (rtnh->rtnh_ifindex)
4188 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4189
4190 attrlen = rtnh_attrlen(rtnh);
4191 if (attrlen > 0) {
4192 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4193
4194 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4195 if (nla) {
4196 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4197 r_cfg.fc_flags |= RTF_GATEWAY;
4198 }
4199 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4200 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4201 if (nla)
4202 r_cfg.fc_encap_type = nla_get_u16(nla);
4203 }
4204
4205 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4206 rt = ip6_route_info_create(&r_cfg, extack);
4207 if (IS_ERR(rt)) {
4208 err = PTR_ERR(rt);
4209 rt = NULL;
4210 goto cleanup;
4211 }
4212
4213 rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
4214
4215 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
4216 if (err) {
4217 dst_release_immediate(&rt->dst);
4218 goto cleanup;
4219 }
4220
4221 rtnh = rtnh_next(rtnh, &remaining);
4222 }
4223
4224 /* For add and replace, send one notification with all nexthops.
4225 * Skip the notification in fib6_add_rt2node and send one with
4226 * the full route when done.
4227 */
4228 info->skip_notify = 1;
4229
4230 err_nh = NULL;
4231 list_for_each_entry(nh, &rt6_nh_list, next) {
4232 rt_last = nh->rt6_info;
4233 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
4234 /* save reference to first route for notification */
4235 if (!rt_notif && !err)
4236 rt_notif = nh->rt6_info;
4237
4238 /* nh->rt6_info is used or freed at this point, reset to NULL*/
4239 nh->rt6_info = NULL;
4240 if (err) {
4241 if (replace && nhn)
4242 ip6_print_replace_route_err(&rt6_nh_list);
4243 err_nh = nh;
4244 goto add_errout;
4245 }
4246
4247 /* Because each route is added like a single route, we remove
4248 * these flags after the first nexthop: if there is a collision,
4249 * we have already failed to add the first nexthop:
4250 * fib6_add_rt2node() has rejected it; when replacing, the old
4251 * nexthops have been replaced by the first new one, and the rest
4252 * should be added to it.
4253 */
4254 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4255 NLM_F_REPLACE);
4256 nhn++;
4257 }
4258
4259 /* success ... tell user about new route */
4260 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4261 goto cleanup;
4262
4263 add_errout:
4264 /* Send a notification for the routes that were added, so that
4265 * the delete notifications sent by ip6_route_del are
4266 * coherent.
4267 */
4268 if (rt_notif)
4269 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4270
4271 /* Delete routes that were already added */
4272 list_for_each_entry(nh, &rt6_nh_list, next) {
4273 if (err_nh == nh)
4274 break;
4275 ip6_route_del(&nh->r_cfg, extack);
4276 }
4277
4278 cleanup:
4279 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4280 if (nh->rt6_info)
4281 dst_release_immediate(&nh->rt6_info->dst);
4282 kfree(nh->mxc.mx);
4283 list_del(&nh->next);
4284 kfree(nh);
4285 }
4286
4287 return err;
4288 }
4289
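/* Walk the rtnexthop blocks of a multipath message and delete each
* nexthop as an individual route, returning the last error seen.
*/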
4290 static int ip6_route_multipath_del(struct fib6_config *cfg,
4291 struct netlink_ext_ack *extack)
4292 {
4293 struct fib6_config r_cfg;
4294 struct rtnexthop *rtnh;
4295 int remaining;
4296 int attrlen;
4297 int err = 1, last_err = 0;
4298
4299 remaining = cfg->fc_mp_len;
4300 rtnh = (struct rtnexthop *)cfg->fc_mp;
4301
4302 /* Parse each multipath entry */
4303 while (rtnh_ok(rtnh, remaining)) {
4304 memcpy(&r_cfg, cfg, sizeof(*cfg));
4305 if (rtnh->rtnh_ifindex)
4306 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4307
4308 attrlen = rtnh_attrlen(rtnh);
4309 if (attrlen > 0) {
4310 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4311
4312 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4313 if (nla) {
4314 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4315 r_cfg.fc_flags |= RTF_GATEWAY;
4316 }
4317 }
4318 err = ip6_route_del(&r_cfg, extack);
4319 if (err)
4320 last_err = err;
4321
4322 rtnh = rtnh_next(rtnh, &remaining);
4323 }
4324
4325 return last_err;
4326 }
4327
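/* doit handler for RTM_DELROUTE: delete each nexthop of a multipath
* message individually; otherwise delete the route with
* fc_delete_all_nh set so all sibling nexthops are removed with it.
*/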
4328 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4329 struct netlink_ext_ack *extack)
4330 {
4331 struct fib6_config cfg;
4332 int err;
4333
4334 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4335 if (err < 0)
4336 return err;
4337
4338 if (cfg.fc_mp)
4339 return ip6_route_multipath_del(&cfg, extack);
4340 else {
4341 cfg.fc_delete_all_nh = 1;
4342 return ip6_route_del(&cfg, extack);
4343 }
4344 }
4345
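/* doit handler for RTM_NEWROUTE: dispatch to the multipath path when
* RTA_MULTIPATH is present, otherwise add a single route.
*/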
4346 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4347 struct netlink_ext_ack *extack)
4348 {
4349 struct fib6_config cfg;
4350 int err;
4351
4352 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4353 if (err < 0)
4354 return err;
4355
4356 if (cfg.fc_mp)
4357 return ip6_route_multipath_add(&cfg, extack);
4358 else
4359 return ip6_route_add(&cfg, extack);
4360 }
4361
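/* Estimate the netlink message size needed to dump rt, including one
* rtnexthop block per sibling when the route is multipath.
*/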
4362 static size_t rt6_nlmsg_size(struct rt6_info *rt)
4363 {
4364 int nexthop_len = 0;
4365
4366 if (rt->rt6i_nsiblings) {
4367 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4368 + NLA_ALIGN(sizeof(struct rtnexthop))
4369 + nla_total_size(16) /* RTA_GATEWAY */
4370 + lwtunnel_get_encap_size(rt->dst.lwtstate);
4371
4372 nexthop_len *= rt->rt6i_nsiblings;
4373 }
4374
4375 return NLMSG_ALIGN(sizeof(struct rtmsg))
4376 + nla_total_size(16) /* RTA_SRC */
4377 + nla_total_size(16) /* RTA_DST */
4378 + nla_total_size(16) /* RTA_GATEWAY */
4379 + nla_total_size(16) /* RTA_PREFSRC */
4380 + nla_total_size(4) /* RTA_TABLE */
4381 + nla_total_size(4) /* RTA_IIF */
4382 + nla_total_size(4) /* RTA_OIF */
4383 + nla_total_size(4) /* RTA_PRIORITY */
4384 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4385 + nla_total_size(sizeof(struct rta_cacheinfo))
4386 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4387 + nla_total_size(1) /* RTA_PREF */
4388 + lwtunnel_get_encap_size(rt->dst.lwtstate)
4389 + nexthop_len;
4390 }
4391
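/* Emit the nexthop attributes of rt (gateway, oif, lwtunnel encap)
* and accumulate its RTNH_F_* flags. skip_oif is true for multipath
* encoding, where the ifindex lives in struct rtnexthop instead.
*/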
4392 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
4393 unsigned int *flags, bool skip_oif)
4394 {
4395 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
4396 *flags |= RTNH_F_DEAD;
4397
4398 if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
4399 *flags |= RTNH_F_LINKDOWN;
4400 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
4401 *flags |= RTNH_F_DEAD;
4402 }
4403
4404 if (rt->rt6i_flags & RTF_GATEWAY) {
4405 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4406 goto nla_put_failure;
4407 }
4408
4409 *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
4410 if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4411 *flags |= RTNH_F_OFFLOAD;
4412
4413 /* not needed for multipath encoding because it has an rtnexthop struct */
4414 if (!skip_oif && rt->dst.dev &&
4415 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4416 goto nla_put_failure;
4417
4418 if (rt->dst.lwtstate &&
4419 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4420 goto nla_put_failure;
4421
4422 return 0;
4423
4424 nla_put_failure:
4425 return -EMSGSIZE;
4426 }
4427
4428 /* add multipath next hop */
4429 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4430 {
4431 struct rtnexthop *rtnh;
4432 unsigned int flags = 0;
4433
4434 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4435 if (!rtnh)
4436 goto nla_put_failure;
4437
4438 rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
4439 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4440
4441 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4442 goto nla_put_failure;
4443
4444 rtnh->rtnh_flags = flags;
4445
4446 /* length of rtnetlink header + attributes */
4447 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4448
4449 return 0;
4450
4451 nla_put_failure:
4452 return -EMSGSIZE;
4453 }
4454
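/* Fill one route message for rt into skb. dst/src, when non-NULL,
* name the concrete flow a lookup answered (the prefix length is then
* reported as 128). Returns 0 on success or -EMSGSIZE if skb is full.
*/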
4455 static int rt6_fill_node(struct net *net,
4456 struct sk_buff *skb, struct rt6_info *rt,
4457 struct in6_addr *dst, struct in6_addr *src,
4458 int iif, int type, u32 portid, u32 seq,
4459 unsigned int flags)
4460 {
4461 u32 metrics[RTAX_MAX];
4462 struct rtmsg *rtm;
4463 struct nlmsghdr *nlh;
4464 long expires;
4465 u32 table;
4466
4467 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4468 if (!nlh)
4469 return -EMSGSIZE;
4470
4471 rtm = nlmsg_data(nlh);
4472 rtm->rtm_family = AF_INET6;
4473 rtm->rtm_dst_len = rt->rt6i_dst.plen;
4474 rtm->rtm_src_len = rt->rt6i_src.plen;
4475 rtm->rtm_tos = 0;
4476 if (rt->rt6i_table)
4477 table = rt->rt6i_table->tb6_id;
4478 else
4479 table = RT6_TABLE_UNSPEC;
4480 rtm->rtm_table = table;
4481 if (nla_put_u32(skb, RTA_TABLE, table))
4482 goto nla_put_failure;
4483 if (rt->rt6i_flags & RTF_REJECT) {
4484 switch (rt->dst.error) {
4485 case -EINVAL:
4486 rtm->rtm_type = RTN_BLACKHOLE;
4487 break;
4488 case -EACCES:
4489 rtm->rtm_type = RTN_PROHIBIT;
4490 break;
4491 case -EAGAIN:
4492 rtm->rtm_type = RTN_THROW;
4493 break;
4494 default:
4495 rtm->rtm_type = RTN_UNREACHABLE;
4496 break;
4497 }
4498 } else if (rt->rt6i_flags & RTF_LOCAL)
4500 rtm->rtm_type = RTN_LOCAL;
4501 else if (rt->rt6i_flags & RTF_ANYCAST)
4502 rtm->rtm_type = RTN_ANYCAST;
4503 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4504 rtm->rtm_type = RTN_LOCAL;
4505 else
4506 rtm->rtm_type = RTN_UNICAST;
4507 rtm->rtm_flags = 0;
4508 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4509 rtm->rtm_protocol = rt->rt6i_protocol;
4510
4511 if (rt->rt6i_flags & RTF_CACHE)
4512 rtm->rtm_flags |= RTM_F_CLONED;
4513
4514 if (dst) {
4515 if (nla_put_in6_addr(skb, RTA_DST, dst))
4516 goto nla_put_failure;
4517 rtm->rtm_dst_len = 128;
4518 } else if (rtm->rtm_dst_len)
4519 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4520 goto nla_put_failure;
4521 #ifdef CONFIG_IPV6_SUBTREES
4522 if (src) {
4523 if (nla_put_in6_addr(skb, RTA_SRC, src))
4524 goto nla_put_failure;
4525 rtm->rtm_src_len = 128;
4526 } else if (rtm->rtm_src_len &&
4527 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4528 goto nla_put_failure;
4529 #endif
4530 if (iif) {
4531 #ifdef CONFIG_IPV6_MROUTE
4532 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4533 int err = ip6mr_get_route(net, skb, rtm, portid);
4534
4535 if (err == 0)
4536 return 0;
4537 if (err < 0)
4538 goto nla_put_failure;
4539 } else
4540 #endif
4541 if (nla_put_u32(skb, RTA_IIF, iif))
4542 goto nla_put_failure;
4543 } else if (dst) {
4544 struct in6_addr saddr_buf;
4545 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4546 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4547 goto nla_put_failure;
4548 }
4549
4550 if (rt->rt6i_prefsrc.plen) {
4551 struct in6_addr saddr_buf;
4552 saddr_buf = rt->rt6i_prefsrc.addr;
4553 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4554 goto nla_put_failure;
4555 }
4556
4557 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4558 if (rt->rt6i_pmtu)
4559 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4560 if (rtnetlink_put_metrics(skb, metrics) < 0)
4561 goto nla_put_failure;
4562
4563 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4564 goto nla_put_failure;
4565
4566 /* For multipath routes, walk the siblings list and add
4567 * each as a nexthop within RTA_MULTIPATH.
4568 */
4569 if (rt->rt6i_nsiblings) {
4570 struct rt6_info *sibling, *next_sibling;
4571 struct nlattr *mp;
4572
4573 mp = nla_nest_start(skb, RTA_MULTIPATH);
4574 if (!mp)
4575 goto nla_put_failure;
4576
4577 if (rt6_add_nexthop(skb, rt) < 0)
4578 goto nla_put_failure;
4579
4580 list_for_each_entry_safe(sibling, next_sibling,
4581 &rt->rt6i_siblings, rt6i_siblings) {
4582 if (rt6_add_nexthop(skb, sibling) < 0)
4583 goto nla_put_failure;
4584 }
4585
4586 nla_nest_end(skb, mp);
4587 } else {
4588 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4589 goto nla_put_failure;
4590 }
4591
4592 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4593
4594 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4595 goto nla_put_failure;
4596
4597 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4598 goto nla_put_failure;
4599
4601 nlmsg_end(skb, nlh);
4602 return 0;
4603
4604 nla_put_failure:
4605 nlmsg_cancel(skb, nlh);
4606 return -EMSGSIZE;
4607 }
4608
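/* Per-entry callback for route dumps: skips the null entry and, when
* the request sets RTM_F_PREFIX, anything that is not a prefix route;
* everything else is emitted via rt6_fill_node().
*/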
4609 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4610 {
4611 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4612 struct net *net = arg->net;
4613
4614 if (rt == net->ipv6.ip6_null_entry)
4615 return 0;
4616
4617 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4618 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4619
4620 /* user wants prefix routes only */
4621 if (rtm->rtm_flags & RTM_F_PREFIX &&
4622 !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4623 /* success since this is not a prefix route */
4624 return 1;
4625 }
4626 }
4627
4628 return rt6_fill_node(net,
4629 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4630 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4631 NLM_F_MULTI);
4632 }
4633
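/* doit handler for RTM_GETROUTE: run an input or output route lookup
* for the requested flow (or report the matching FIB entry when
* RTM_F_FIB_MATCH is set) and unicast the result to the requester.
*/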
4634 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4635 struct netlink_ext_ack *extack)
4636 {
4637 struct net *net = sock_net(in_skb->sk);
4638 struct nlattr *tb[RTA_MAX+1];
4639 int err, iif = 0, oif = 0;
4640 struct dst_entry *dst;
4641 struct rt6_info *rt;
4642 struct sk_buff *skb;
4643 struct rtmsg *rtm;
4644 struct flowi6 fl6;
4645 bool fibmatch;
4646
4647 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4648 extack);
4649 if (err < 0)
4650 goto errout;
4651
4652 err = -EINVAL;
4653 memset(&fl6, 0, sizeof(fl6));
4654 rtm = nlmsg_data(nlh);
4655 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4656 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4657
4658 if (tb[RTA_SRC]) {
4659 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4660 goto errout;
4661
4662 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4663 }
4664
4665 if (tb[RTA_DST]) {
4666 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4667 goto errout;
4668
4669 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4670 }
4671
4672 if (tb[RTA_IIF])
4673 iif = nla_get_u32(tb[RTA_IIF]);
4674
4675 if (tb[RTA_OIF])
4676 oif = nla_get_u32(tb[RTA_OIF]);
4677
4678 if (tb[RTA_MARK])
4679 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4680
4681 if (tb[RTA_UID])
4682 fl6.flowi6_uid = make_kuid(current_user_ns(),
4683 nla_get_u32(tb[RTA_UID]));
4684 else
4685 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4686
4687 if (iif) {
4688 struct net_device *dev;
4689 int flags = 0;
4690
4691 rcu_read_lock();
4692
4693 dev = dev_get_by_index_rcu(net, iif);
4694 if (!dev) {
4695 rcu_read_unlock();
4696 err = -ENODEV;
4697 goto errout;
4698 }
4699
4700 fl6.flowi6_iif = iif;
4701
4702 if (!ipv6_addr_any(&fl6.saddr))
4703 flags |= RT6_LOOKUP_F_HAS_SADDR;
4704
4705 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4706
4707 rcu_read_unlock();
4708 } else {
4709 fl6.flowi6_oif = oif;
4710
4711 dst = ip6_route_output(net, NULL, &fl6);
4712 }
4713
4715 rt = container_of(dst, struct rt6_info, dst);
4716 if (rt->dst.error) {
4717 err = rt->dst.error;
4718 ip6_rt_put(rt);
4719 goto errout;
4720 }
4721
4722 if (rt == net->ipv6.ip6_null_entry) {
4723 err = rt->dst.error;
4724 ip6_rt_put(rt);
4725 goto errout;
4726 }
4727
4728 if (fibmatch && rt->from) {
4729 struct rt6_info *ort = rt->from;
4730
4731 dst_hold(&ort->dst);
4732 ip6_rt_put(rt);
4733 rt = ort;
4734 }
4735
4736 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4737 if (!skb) {
4738 ip6_rt_put(rt);
4739 err = -ENOBUFS;
4740 goto errout;
4741 }
4742
4743 skb_dst_set(skb, &rt->dst);
4744 if (fibmatch)
4745 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4746 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4747 nlh->nlmsg_seq, 0);
4748 else
4749 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4750 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4751 nlh->nlmsg_seq, 0);
4752 if (err < 0) {
4753 kfree_skb(skb);
4754 goto errout;
4755 }
4756
4757 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4758 errout:
4759 return err;
4760 }
4761
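/* Notify rtnetlink listeners (RTNLGRP_IPV6_ROUTE) about a route
* change; on allocation or fill failure the group error is recorded
* via rtnl_set_sk_err().
*/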
4762 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4763 unsigned int nlm_flags)
4764 {
4765 struct sk_buff *skb;
4766 struct net *net = info->nl_net;
4767 u32 seq;
4768 int err;
4769
4770 err = -ENOBUFS;
4771 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4772
4773 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4774 if (!skb)
4775 goto errout;
4776
4777 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4778 event, info->portid, seq, nlm_flags);
4779 if (err < 0) {
4780 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4781 WARN_ON(err == -EMSGSIZE);
4782 kfree_skb(skb);
4783 goto errout;
4784 }
4785 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4786 info->nlh, gfp_any());
4787 return;
4788 errout:
4789 if (err < 0)
4790 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4791 }
4792
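/* Device notifier: bind the special null/prohibit/blackhole entries
* to the loopback device on NETDEV_REGISTER and drop their idev
* references once on NETDEV_UNREGISTER.
*/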
4793 static int ip6_route_dev_notify(struct notifier_block *this,
4794 unsigned long event, void *ptr)
4795 {
4796 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4797 struct net *net = dev_net(dev);
4798
4799 if (!(dev->flags & IFF_LOOPBACK))
4800 return NOTIFY_OK;
4801
4802 if (event == NETDEV_REGISTER) {
4803 net->ipv6.ip6_null_entry->dst.dev = dev;
4804 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4805 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4806 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4807 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4808 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4809 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4810 #endif
4811 } else if (event == NETDEV_UNREGISTER &&
4812 dev->reg_state != NETREG_UNREGISTERED) {
4813 /* NETDEV_UNREGISTER can be fired multiple times by
4814 * netdev_wait_allrefs(). Make sure we only call this once.
4815 */
4816 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4817 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4818 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4819 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4820 #endif
4821 }
4822
4823 return NOTIFY_OK;
4824 }
4825
4826 /*
4827 * /proc
4828 */
4829
4830 #ifdef CONFIG_PROC_FS
4831
4832 static const struct file_operations ipv6_route_proc_fops = {
4833 .open = ipv6_route_open,
4834 .read = seq_read,
4835 .llseek = seq_lseek,
4836 .release = seq_release_net,
4837 };
4838
4839 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4840 {
4841 struct net *net = (struct net *)seq->private;
4842 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4843 net->ipv6.rt6_stats->fib_nodes,
4844 net->ipv6.rt6_stats->fib_route_nodes,
4845 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4846 net->ipv6.rt6_stats->fib_rt_entries,
4847 net->ipv6.rt6_stats->fib_rt_cache,
4848 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4849 net->ipv6.rt6_stats->fib_discarded_routes);
4850
4851 return 0;
4852 }
4853
4854 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4855 {
4856 return single_open_net(inode, file, rt6_stats_seq_show);
4857 }
4858
4859 static const struct file_operations rt6_stats_seq_fops = {
4860 .open = rt6_stats_seq_open,
4861 .read = seq_read,
4862 .llseek = seq_lseek,
4863 .release = single_release_net,
4864 };
4865 #endif /* CONFIG_PROC_FS */
4866
4867 #ifdef CONFIG_SYSCTL
4868
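/* Handler for the write-only "flush" sysctl (typically
* /proc/sys/net/ipv6/route/flush): writing to it forces an immediate
* garbage-collection pass over the FIB entries.
*/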
4869 static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4871 void __user *buffer, size_t *lenp, loff_t *ppos)
4872 {
4873 struct net *net;
4874 int delay;

4875 if (!write)
4876 return -EINVAL;
4877
4878 net = (struct net *)ctl->extra1;
4879 delay = net->ipv6.sysctl.flush_delay;
4880 proc_dointvec(ctl, write, buffer, lenp, ppos);
4881 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4882 return 0;
4883 }
4884
4885 struct ctl_table ipv6_route_table_template[] = {
4886 {
4887 .procname = "flush",
4888 .data = &init_net.ipv6.sysctl.flush_delay,
4889 .maxlen = sizeof(int),
4890 .mode = 0200,
4891 .proc_handler = ipv6_sysctl_rtcache_flush
4892 },
4893 {
4894 .procname = "gc_thresh",
4895 .data = &ip6_dst_ops_template.gc_thresh,
4896 .maxlen = sizeof(int),
4897 .mode = 0644,
4898 .proc_handler = proc_dointvec,
4899 },
4900 {
4901 .procname = "max_size",
4902 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
4903 .maxlen = sizeof(int),
4904 .mode = 0644,
4905 .proc_handler = proc_dointvec,
4906 },
4907 {
4908 .procname = "gc_min_interval",
4909 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4910 .maxlen = sizeof(int),
4911 .mode = 0644,
4912 .proc_handler = proc_dointvec_jiffies,
4913 },
4914 {
4915 .procname = "gc_timeout",
4916 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4917 .maxlen = sizeof(int),
4918 .mode = 0644,
4919 .proc_handler = proc_dointvec_jiffies,
4920 },
4921 {
4922 .procname = "gc_interval",
4923 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
4924 .maxlen = sizeof(int),
4925 .mode = 0644,
4926 .proc_handler = proc_dointvec_jiffies,
4927 },
4928 {
4929 .procname = "gc_elasticity",
4930 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4931 .maxlen = sizeof(int),
4932 .mode = 0644,
4933 .proc_handler = proc_dointvec,
4934 },
4935 {
4936 .procname = "mtu_expires",
4937 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4938 .maxlen = sizeof(int),
4939 .mode = 0644,
4940 .proc_handler = proc_dointvec_jiffies,
4941 },
4942 {
4943 .procname = "min_adv_mss",
4944 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
4945 .maxlen = sizeof(int),
4946 .mode = 0644,
4947 .proc_handler = proc_dointvec,
4948 },
4949 {
4950 .procname = "gc_min_interval_ms",
4951 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4952 .maxlen = sizeof(int),
4953 .mode = 0644,
4954 .proc_handler = proc_dointvec_ms_jiffies,
4955 },
4956 { }
4957 };
4958
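/* Clone the sysctl template for one namespace and repoint each entry
* at that namespace's own state; the indices must stay in template
* order.
*/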
4959 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4960 {
4961 struct ctl_table *table;
4962
4963 table = kmemdup(ipv6_route_table_template,
4964 sizeof(ipv6_route_table_template),
4965 GFP_KERNEL);
4966
4967 if (table) {
4968 table[0].data = &net->ipv6.sysctl.flush_delay;
4969 table[0].extra1 = net;
4970 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4971 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4972 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4973 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4974 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4975 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4976 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4977 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4978 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4979
4980 /* Don't export sysctls to unprivileged users */
4981 if (net->user_ns != &init_user_ns)
4982 table[0].procname = NULL;
4983 }
4984
4985 return table;
4986 }
4987 #endif
4988
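/* Per-namespace init: copy the dst_ops template, allocate the special
* null (and, with multiple tables, prohibit/blackhole) route entries
* and set the sysctl defaults.
*/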
4989 static int __net_init ip6_route_net_init(struct net *net)
4990 {
4991 int ret = -ENOMEM;
4992
4993 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4994 sizeof(net->ipv6.ip6_dst_ops));
4995
4996 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4997 goto out_ip6_dst_ops;
4998
4999 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5000 sizeof(*net->ipv6.ip6_null_entry),
5001 GFP_KERNEL);
5002 if (!net->ipv6.ip6_null_entry)
5003 goto out_ip6_dst_entries;
5004 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5005 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5006 ip6_template_metrics, true);
5007
5008 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5009 net->ipv6.fib6_has_custom_rules = false;
5010 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5011 sizeof(*net->ipv6.ip6_prohibit_entry),
5012 GFP_KERNEL);
5013 if (!net->ipv6.ip6_prohibit_entry)
5014 goto out_ip6_null_entry;
5015 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5016 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5017 ip6_template_metrics, true);
5018
5019 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5020 sizeof(*net->ipv6.ip6_blk_hole_entry),
5021 GFP_KERNEL);
5022 if (!net->ipv6.ip6_blk_hole_entry)
5023 goto out_ip6_prohibit_entry;
5024 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5025 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5026 ip6_template_metrics, true);
5027 #endif
5028
5029 net->ipv6.sysctl.flush_delay = 0;
5030 net->ipv6.sysctl.ip6_rt_max_size = 4096;
5031 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5032 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5033 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5034 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5035 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5036 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5037
5038 net->ipv6.ip6_rt_gc_expire = 30*HZ;
5039
5040 ret = 0;
5041 out:
5042 return ret;
5043
5044 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5045 out_ip6_prohibit_entry:
5046 kfree(net->ipv6.ip6_prohibit_entry);
5047 out_ip6_null_entry:
5048 kfree(net->ipv6.ip6_null_entry);
5049 #endif
5050 out_ip6_dst_entries:
5051 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5052 out_ip6_dst_ops:
5053 goto out;
5054 }
5055
5056 static void __net_exit ip6_route_net_exit(struct net *net)
5057 {
5058 kfree(net->ipv6.ip6_null_entry);
5059 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5060 kfree(net->ipv6.ip6_prohibit_entry);
5061 kfree(net->ipv6.ip6_blk_hole_entry);
5062 #endif
5063 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5064 }
5065
5066 static int __net_init ip6_route_net_init_late(struct net *net)
5067 {
5068 #ifdef CONFIG_PROC_FS
5069 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
5070 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
5071 #endif
5072 return 0;
5073 }
5074
5075 static void __net_exit ip6_route_net_exit_late(struct net *net)
5076 {
5077 #ifdef CONFIG_PROC_FS
5078 remove_proc_entry("ipv6_route", net->proc_net);
5079 remove_proc_entry("rt6_stats", net->proc_net);
5080 #endif
5081 }
5082
5083 static struct pernet_operations ip6_route_net_ops = {
5084 .init = ip6_route_net_init,
5085 .exit = ip6_route_net_exit,
5086 .async = true,
5087 };
5088
5089 static int __net_init ipv6_inetpeer_init(struct net *net)
5090 {
5091 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5092
5093 if (!bp)
5094 return -ENOMEM;
5095 inet_peer_base_init(bp);
5096 net->ipv6.peers = bp;
5097 return 0;
5098 }
5099
5100 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5101 {
5102 struct inet_peer_base *bp = net->ipv6.peers;
5103
5104 net->ipv6.peers = NULL;
5105 inetpeer_invalidate_tree(bp);
5106 kfree(bp);
5107 }
5108
5109 static struct pernet_operations ipv6_inetpeer_ops = {
5110 .init = ipv6_inetpeer_init,
5111 .exit = ipv6_inetpeer_exit,
5112 .async = true,
5113 };
5114
5115 static struct pernet_operations ip6_route_net_late_ops = {
5116 .init = ip6_route_net_init_late,
5117 .exit = ip6_route_net_exit_late,
5118 .async = true,
5119 };
5120
5121 static struct notifier_block ip6_route_dev_notifier = {
5122 .notifier_call = ip6_route_dev_notify,
5123 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5124 };
5125
5126 void __init ip6_route_init_special_entries(void)
5127 {
5128 /* The loopback device is registered before this code runs, so the
5129 * loopback reference in rt6_info is not taken automatically; take
5130 * it manually for init_net. */
5131 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5132 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5133 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5134 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5135 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5136 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5137 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5138 #endif
5139 }
5140
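/* Subsystem init: set up the dst kmem cache, pernet state, fib6,
* xfrm6 and policy rules, register the rtnetlink handlers and the
* netdev notifier, and initialize the per-cpu uncached route lists.
*/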
5141 int __init ip6_route_init(void)
5142 {
5143 int ret;
5144 int cpu;
5145
5146 ret = -ENOMEM;
5147 ip6_dst_ops_template.kmem_cachep =
5148 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5149 SLAB_HWCACHE_ALIGN, NULL);
5150 if (!ip6_dst_ops_template.kmem_cachep)
5151 goto out;
5152
5153 ret = dst_entries_init(&ip6_dst_blackhole_ops);
5154 if (ret)
5155 goto out_kmem_cache;
5156
5157 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5158 if (ret)
5159 goto out_dst_entries;
5160
5161 ret = register_pernet_subsys(&ip6_route_net_ops);
5162 if (ret)
5163 goto out_register_inetpeer;
5164
5165 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5166
5167 ret = fib6_init();
5168 if (ret)
5169 goto out_register_subsys;
5170
5171 ret = xfrm6_init();
5172 if (ret)
5173 goto out_fib6_init;
5174
5175 ret = fib6_rules_init();
5176 if (ret)
5177 goto xfrm6_init;
5178
5179 ret = register_pernet_subsys(&ip6_route_net_late_ops);
5180 if (ret)
5181 goto fib6_rules_init;
5182
5183 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5184 inet6_rtm_newroute, NULL, 0);
5185 if (ret < 0)
5186 goto out_register_late_subsys;
5187
5188 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5189 inet6_rtm_delroute, NULL, 0);
5190 if (ret < 0)
5191 goto out_register_late_subsys;
5192
5193 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5194 inet6_rtm_getroute, NULL,
5195 RTNL_FLAG_DOIT_UNLOCKED);
5196 if (ret < 0)
5197 goto out_register_late_subsys;
5198
5199 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5200 if (ret)
5201 goto out_register_late_subsys;
5202
5203 for_each_possible_cpu(cpu) {
5204 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5205
5206 INIT_LIST_HEAD(&ul->head);
5207 spin_lock_init(&ul->lock);
5208 }
5209
5210 out:
5211 return ret;
5212
5213 out_register_late_subsys:
5214 rtnl_unregister_all(PF_INET6);
5215 unregister_pernet_subsys(&ip6_route_net_late_ops);
5216 fib6_rules_init:
5217 fib6_rules_cleanup();
5218 xfrm6_init:
5219 xfrm6_fini();
5220 out_fib6_init:
5221 fib6_gc_cleanup();
5222 out_register_subsys:
5223 unregister_pernet_subsys(&ip6_route_net_ops);
5224 out_register_inetpeer:
5225 unregister_pernet_subsys(&ipv6_inetpeer_ops);
5226 out_dst_entries:
5227 dst_entries_destroy(&ip6_dst_blackhole_ops);
5228 out_kmem_cache:
5229 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5230 goto out;
5231 }
5232
5233 void ip6_route_cleanup(void)
5234 {
5235 unregister_netdevice_notifier(&ip6_route_dev_notifier);
5236 unregister_pernet_subsys(&ip6_route_net_late_ops);
5237 fib6_rules_cleanup();
5238 xfrm6_fini();
5239 fib6_gc_cleanup();
5240 unregister_pernet_subsys(&ipv6_inetpeer_ops);
5241 unregister_pernet_subsys(&ip6_route_net_ops);
5242 dst_entries_destroy(&ip6_dst_blackhole_ops);
5243 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5244 }