/* net/ipv6/route.c */

/*
 * Linux INET6 implementation
 * FIB front-end.
 *
 * Authors:
 * Pedro Roque <roque@di.fc.ul.pt>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

/* Changes:
 *
 * YOSHIFUJI Hideaki @USAGI
 *	reworked default router selection.
 *	- respect outgoing interface
 *	- select from (probably) reachable routers (i.e.
 *	  routers in REACHABLE, STALE, DELAY or PROBE states).
 *	- always select the same router if it is (probably)
 *	  reachable.  otherwise, round-robin the list.
 * Ville Nuorvala
 *	Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <trace/events/fib6.h>

#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static void rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct rt6_info *rt);
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t lock;
	struct list_head head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

static void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

static void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(rt->dst.from);
}

static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (rt->rt6i_flags & RTF_PCPU)
		return rt6_pcpu_cow_metrics(rt);
	else if (rt->rt6i_flags & RTF_CACHE)
		return NULL;
	else
		return dst_cow_metrics_generic(dst, old);
}

static inline const void *choose_neigh_daddr(struct rt6_info *rt,
					     struct sk_buff *skb,
					     const void *daddr)
{
	struct in6_addr *p = &rt->rt6i_gateway;

	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct neighbour *n;

	daddr = choose_neigh_daddr(rt, skb, daddr);
	n = __ipv6_neigh_lookup(dst->dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dst->dev);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= ipv6_cow_metrics,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt)
		rt6_info_init(rt);

	return rt;
}

struct rt6_info *ip6_dst_alloc(struct net *net,
			       struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);

	if (rt) {
		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
		if (rt->rt6i_pcpu) {
			int cpu;

			for_each_possible_cpu(cpu) {
				struct rt6_info **p;

				p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
				/* no one shares rt */
				*p = NULL;
			}
		} else {
			dst_release_immediate(&rt->dst);
			return NULL;
		}
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	dst->from = NULL;
	dst_release(from);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->dst.from) {
		return rt6_check_expired((struct rt6_info *) rt->dst.from);
	}
	return false;
}

/* Multipath route selection:
 * Hash based function using packet header and flowlabel.
 * Adapted from fib_info_hashfn()
 */
static int rt6_info_hash_nhsfn(unsigned int candidate_count,
			       const struct flowi6 *fl6)
{
	return get_hash_from_flowi6(fl6) % candidate_count;
}
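
/* Worked example (illustrative, not part of the original file): with one
 * nexthop and two siblings, candidate_count is 3.  A flow whose
 * get_hash_from_flowi6() value is, say, 2654435769 gets remainder
 * 2654435769 % 3 == 0 and keeps the current match in
 * rt6_multipath_select() below, while flows with remainder 1 or 2 walk
 * that many entries into the sibling list.  Since the hash covers (at
 * least) the addresses, protocol and flow label, all packets of a flow
 * stick to one path.
 */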

static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
					     struct flowi6 *fl6, int oif,
					     int strict)
{
	struct rt6_info *sibling, *next_sibling;
	int route_chosen;

	route_chosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
	/* Don't change the route if route_chosen == 0
	 * (the siblings list does not include ourself)
	 */
	if (route_chosen)
		list_for_each_entry_safe(sibling, next_sibling,
				&match->rt6i_siblings, rt6i_siblings) {
			route_chosen--;
			if (route_chosen == 0) {
				if (rt6_score_route(sibling, oif, strict) < 0)
					break;
				match = sibling;
				break;
			}
		}
	return match;
}

/*
 * Route lookup. Any table->tb6_lock is implied.
 */

static inline struct rt6_info *rt6_device_match(struct net *net,
						struct rt6_info *rt,
						const struct in6_addr *saddr,
						int oif,
						int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
{
	struct net_device *dev = rt->dst.dev;
	if (!oif || dev->ifindex == oif)
		return 2;
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
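
/* Score composition sketch (illustrative, not part of the original file):
 * rt6_check_dev() supplies the low two bits (2 for an exact oif match,
 * 1 for loopback with a matching idev, 0 otherwise), and with
 * CONFIG_IPV6_ROUTER_PREF the decoded RFC 4191 router preference is
 * shifted left by two so preference dominates.  Assuming the usual
 * decoding (high = 3, medium = 2, low = 1), a medium-preference router
 * on the requested oif scores (2 << 2) | 2 = 10, while a high-preference
 * router on another interface scores 3 << 2 = 12 and wins, unless
 * RT6_LOOKUP_F_IFACE already rejected it above.
 */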

static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rt->dst.rt6_next)
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
{
	struct rt6_info *match, *rt0;
	struct net *net;
	bool do_rr = false;

	rt0 = fn->rr_ptr;
	if (!rt0)
		fn->rr_ptr = rt0 = fn->leaf;

	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rt0->dst.rt6_next;

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = fn->leaf;

		if (next != rt0)
			fn->rr_ptr = next;
	}

	net = dev_net(rt0->dst.dev);
	return match ? match : net->ipv6.ip6_null_entry;
}
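
/* Round-robin sketch (illustrative, not part of the original file):
 * suppose fn's leaf chain holds three equal-metric routes A -> B -> C and
 * fn->rr_ptr currently points at A.  If A fails its neighbour check with
 * RT6_NUD_FAIL_DO_RR, find_rr_leaf() still returns the best-scoring
 * entry, but do_rr is set and rr_ptr is advanced to B, so the next
 * lookup starts its scan from B.  When the advanced pointer would leave
 * the equal-metric group, it wraps back to fn->leaf.
 */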

static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = fn->parent;
		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
			fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
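
/* Backtracking sketch (illustrative, not part of the original file): if a
 * lookup for 2001:db8:1::1 lands in a fib6_node for 2001:db8:1::/64 whose
 * routes all resolved to ip6_null_entry, fib6_backtrack() climbs to the
 * /48 parent, re-descending into a source-routing subtree when
 * CONFIG_IPV6_SUBTREES put one there, and returns the first ancestor that
 * actually carries routes (RTN_RTINFO); reaching the tree root
 * (RTN_TL_ROOT) ends the search with NULL.
 */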

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = fn->leaf;
	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	dst_use(&rt->dst, jiffies);
	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   int flags)
{
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
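
/* Usage sketch (illustrative, not from this file): a caller that only
 * needs a one-off route decision might do
 *
 *	struct rt6_info *rt = rt6_lookup(net, &daddr, NULL, 0, 0);
 *
 *	if (rt) {
 *		... inspect rt->dst.dev, rt->rt6i_gateway, etc. ...
 *		ip6_rt_put(rt);		(drops the lookup's reference)
 *	}
 *
 * A NULL return means the lookup resolved to an error route; there is no
 * reference to drop in that case.
 */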

/* ip6_ins_rt is called with the table->tb6_lock NOT held.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
			struct mx6_config *mxc,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
	write_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct rt6_info *rt)
{
	struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	/* Hold dst to account for the reference from the fib6 tree */
	dst_hold(&rt->dst);
	return __ip6_ins_rt(rt, &info, &mxc, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct rt6_info *rt;

	/*
	 * Clone the route.
	 */

	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);

	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt;

	pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
				  rt->dst.dev, rt->dst.flags);

	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with read_lock_bh(&tb6_lock) acquired */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt) {
		dst_hold(&pcpu_rt->dst);
		rt6_dst_from_metrics_check(pcpu_rt);
	}
	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct fib6_table *table = rt->rt6i_table;
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	read_lock_bh(&table->tb6_lock);
	if (rt->rt6i_pcpu) {
		p = this_cpu_ptr(rt->rt6i_pcpu);
		prev = cmpxchg(p, NULL, pcpu_rt);
		if (prev) {
			/* If someone did it before us, return prev instead */
			dst_release_immediate(&pcpu_rt->dst);
			pcpu_rt = prev;
		}
	} else {
		/* rt has been removed from the fib6 tree
		 * before we have a chance to acquire the read_lock.
		 * In this case, don't bother to create a pcpu rt
		 * since rt is going away anyway.  The next
		 * dst_check() will trigger a re-lookup.
		 */
		dst_release_immediate(&pcpu_rt->dst);
		pcpu_rt = rt;
	}
	dst_hold(&pcpu_rt->dst);
	rt6_dst_from_metrics_check(pcpu_rt);
	read_unlock_bh(&table->tb6_lock);
	return pcpu_rt;
}
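
/* Publication sketch (illustrative, not part of the original file): the
 * cmpxchg() above is the whole locking story for the per-cpu cache slot.
 * Two contexts on one CPU (say, process context and a softirq) can both
 * find *p == NULL and both allocate; whichever cmpxchg() runs second sees
 * a non-NULL slot, gets the winner back as 'prev', frees its own copy and
 * uses the winner.  The read_lock only guards against the fib6 entry
 * itself being unlinked while we publish.
 */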

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		rt6_dst_from_metrics_check(rt);

		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		rt->dst.lastuse = jiffies;
		rt->dst.__use++;
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (pcpu_rt) {
			read_unlock_bh(&table->tb6_lock);
		} else {
			/* We have to do the read_unlock first
			 * because rt6_make_pcpu_route() may trigger
			 * ip6_dst_gc() which will take the write_lock.
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			pcpu_rt = rt6_make_pcpu_route(rt);
			dst_release(&rt->dst);
		}

		trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6, int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
	skb_dst_drop(skb);
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
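
/* Usage sketch (illustrative, not from this file): output-path callers
 * typically go through the ip6_route_output() wrapper, along the lines of
 *
 *	struct flowi6 fl6 = {
 *		.flowi6_oif = sk ? sk->sk_bound_dev_if : 0,
 *		.daddr = *daddr,
 *	};
 *	struct dst_entry *dst = ip6_route_output(net, sk, &fl6);
 *
 *	if (dst->error) {
 *		int err = dst->error;
 *
 *		dst_release(dst);
 *		return err;
 *	}
 *
 * Note the contract: the function always returns a dst, never NULL;
 * failures are reported via dst->error, as ip6_update_pmtu() below relies
 * on.
 */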

struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_NONE, 0);
	if (rt) {
		rt6_info_init(rt);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}

/*
 * Destination cache support functions
 */

static void rt6_dst_from_metrics_check(struct rt6_info *rt)
{
	if (rt->dst.from &&
	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
}

static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
		return &rt->dst;
	else
		return NULL;
}

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *) dst;

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	rt6_dst_from_metrics_check(rt);

	if (rt->rt6i_flags & RTF_PCPU ||
	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
		return rt6_dst_from_check(rt, cookie);
	else
		return rt6_check(rt, cookie);
}
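
/* Cookie sketch (illustrative, not part of the original file): sockets
 * cache a dst together with the tree sernum cookie (see
 * inet6_sk(sk)->dst_cookie in ip6_sk_update_pmtu() below).  On every use,
 * dst->ops->check runs ip6_dst_check(); once a tree change bumps the
 * fib6 node's fn_sernum, rt6_get_cookie_safe() no longer matches the
 * stored cookie, rt6_check() returns NULL, and the socket falls back to
 * a fresh route lookup.
 */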

static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (rt6_check_expired(rt)) {
				ip6_del_rt(rt);
				dst = NULL;
			}
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (dst_hold_safe(&rt->dst))
				ip6_del_rt(rt);
		} else {
			struct fib6_node *fn;

			rcu_read_lock();
			fn = rcu_dereference(rt->rt6i_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
		}
	}
}

static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	rt->rt6i_flags |= RTF_MODIFIED;
	rt->rt6i_pmtu = mtu;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU ||
		 rcu_access_pointer(rt->rt6i_node));
}

static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
	} else if (daddr) {
		struct rt6_info *nrt6;

		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);

			/* ip6_ins_rt(nrt6) will bump the
			 * rt6->rt6i_node->fn_sernum
			 * which will fail the next rt6_check() and
			 * invalidate the sk->sk_dst_cache.
			 */
			ip6_ins_rt(nrt6);
			/* Release the reference taken in
			 * ip6_rt_cache_alloc()
			 */
			dst_release(&nrt6->dst);
		}
	}
}

static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}

void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);

/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};

static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt6_check_expired(rt))
			continue;
		if (rt->dst.error)
			break;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
			continue;
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
			continue;
		break;
	}

	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	dst_hold(&rt->dst);

	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
	return rt;
}

static struct dst_entry *ip6_route_redirect(struct net *net,
					    const struct flowi6 *fl6,
					    const struct in6_addr *gateway)
{
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.fl6 = *fl6;
	rdfl.gateway = *gateway;

	return fib6_rule_lookup(net, &rdfl.fl6,
				flags, __ip6_route_redirect);
}

void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);

void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	fl6.saddr = iph->daddr;
	fl6.flowi6_uid = sock_net_uid(net, NULL);

	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}

void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);

static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;

	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}
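
/* Worked example (illustrative, not part of the original file): on a
 * standard 1500-byte MTU link,
 *	advmss = 1500 - sizeof(struct ipv6hdr) - sizeof(struct tcphdr)
 *	       = 1500 - 40 - 20 = 1440.
 * Only when the route's MTU pushes the result past
 * IPV6_MAXPLEN - sizeof(struct tcphdr) is it clamped to IPV6_MAXPLEN,
 * which by the convention above means "any MSS, rely on PMTU discovery
 * alone".
 */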

static unsigned int ip6_mtu(const struct dst_entry *dst)
{
	const struct rt6_info *rt = (const struct rt6_info *)dst;
	unsigned int mtu = rt->rt6i_pmtu;
	struct inet6_dev *idev;

	if (mtu)
		goto out;

	mtu = dst_metric_raw(dst, RTAX_MTU);
	if (mtu)
		goto out;

	mtu = IPV6_MIN_MTU;

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

out:
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.output = ip6_output;
	rt->rt6i_gateway = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_ifdown() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}

static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout >> 1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire >> rt_elasticity;
	return entries > rt_max_size;
}
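
/* Backoff sketch (illustrative, not part of the original file):
 * ip6_rt_gc_expire is the idle timeout handed to fib6_run_gc(), so a
 * smaller value is a more aggressive collection.  Each call shaves off
 * expire >> elasticity, so sustained cache pressure keeps shortening the
 * timeout, while a pass that brings the entry count back under gc_thresh
 * resets it to the lenient rt_gc_timeout / 2 (about 30 seconds with the
 * conventional 60-second default).
 */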

static int ip6_convert_metrics(struct mx6_config *mxc,
			       const struct fib6_config *cfg)
{
	bool ecn_ca = false;
	struct nlattr *nla;
	int remaining;
	u32 *mp;

	if (!cfg->fc_mx)
		return 0;

	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
	if (unlikely(!mp))
		return -ENOMEM;

	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
		int type = nla_type(nla);
		u32 val;

		if (!type)
			continue;
		if (unlikely(type > RTAX_MAX))
			goto err;

		if (type == RTAX_CC_ALGO) {
			char tmp[TCP_CA_NAME_MAX];

			nla_strlcpy(tmp, nla, sizeof(tmp));
			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
			if (val == TCP_CA_UNSPEC)
				goto err;
		} else {
			val = nla_get_u32(nla);
		}
		if (type == RTAX_HOPLIMIT && val > 255)
			val = 255;
		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
			goto err;

		mp[type - 1] = val;
		__set_bit(type - 1, mxc->mx_valid);
	}

	if (ecn_ca) {
		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
	}

	mxc->mx = mp;
	return 0;
err:
	kfree(mp);
	return -EINVAL;
}
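
/* Shape sketch (illustrative, not part of the original file): for an
 * RTA_METRICS blob carrying { RTAX_MTU = 1400, RTAX_HOPLIMIT = 64 }, the
 * loop above leaves mp[RTAX_MTU - 1] == 1400 and
 * mp[RTAX_HOPLIMIT - 1] == 64 with the matching bits set in
 * mxc->mx_valid, and the fib6 insertion later copies only the valid
 * slots into the route's metrics.  The caller owns the array and must
 * kfree(mxc.mx) afterwards, as ip6_route_add() below does.
 */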

static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;
	int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;

	table = fib6_get_table(net, cfg->fc_table);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}

static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	rt = ip6_dst_alloc(net, NULL,
			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);

	if (!rt) {
		err = -ENOMEM;
		goto out;
	}

	if (cfg->fc_flags & RTF_EXPIRES)
		rt6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		rt6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->dst.lwtstate = lwtstate_get(lwtstate);
		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_output = rt->dst.output;
			rt->dst.output = lwtunnel_output;
		}
		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_input = rt->dst.input;
			rt->dst.input = lwtunnel_input;
		}
	}

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		switch (cfg->fc_type) {
		case RTN_BLACKHOLE:
			rt->dst.error = -EINVAL;
			rt->dst.output = dst_discard_out;
			rt->dst.input = dst_discard;
			break;
		case RTN_PROHIBIT:
			rt->dst.error = -EACCES;
			rt->dst.output = ip6_pkt_prohibit_out;
			rt->dst.input = ip6_pkt_prohibit;
			break;
		case RTN_THROW:
		case RTN_UNREACHABLE:
		default:
			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
					: (cfg->fc_type == RTN_UNREACHABLE)
					? -EHOSTUNREACH : -ENETUNREACH;
			rt->dst.output = ip6_pkt_discard_out;
			rt->dst.input = ip6_pkt_discard;
			break;
		}
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		const struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;
		gwa_type = ipv6_addr_type(gw_addr);

		/* if gw_addr is local we will fail to detect this in case
		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
		 * will return already-added prefix route via interface that
		 * prefix route was assigned to, which might be non-loopback.
		 */
		err = -EINVAL;
		if (ipv6_chk_addr_and_flags(net, gw_addr,
					    gwa_type & IPV6_ADDR_LINKLOCAL ?
					    dev : NULL, 0, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}
		rt->rt6i_gateway = *gw_addr;

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt = NULL;

			/* IPv6 strictly forbids using non-link-local
			   addresses as nexthop addresses.
			   Otherwise, the router will not be able to send
			   redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			   We allow IPv4-mapped nexthops to support RFC4798-type
			   addressing
			 */
2011 if (!(gwa_type & (IPV6_ADDR_UNICAST |
2012 IPV6_ADDR_MAPPED))) {
2013 NL_SET_ERR_MSG(extack,
2014 "Invalid gateway address");
2015 goto out;
2016 }
2017
2018 if (cfg->fc_table) {
2019 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2020
2021 if (grt) {
2022 if (grt->rt6i_flags & RTF_GATEWAY ||
2023 (dev && dev != grt->dst.dev)) {
2024 ip6_rt_put(grt);
2025 grt = NULL;
2026 }
2027 }
2028 }
2029
2030 if (!grt)
2031 grt = rt6_lookup(net, gw_addr, NULL,
2032 cfg->fc_ifindex, 1);
2033
2034 err = -EHOSTUNREACH;
2035 if (!grt)
2036 goto out;
2037 if (dev) {
2038 if (dev != grt->dst.dev) {
2039 ip6_rt_put(grt);
2040 goto out;
2041 }
2042 } else {
2043 dev = grt->dst.dev;
2044 idev = grt->rt6i_idev;
2045 dev_hold(dev);
2046 in6_dev_hold(grt->rt6i_idev);
2047 }
2048 if (!(grt->rt6i_flags & RTF_GATEWAY))
2049 err = 0;
2050 ip6_rt_put(grt);
2051
2052 if (err)
2053 goto out;
2054 }
2055 err = -EINVAL;
2056 if (!dev) {
2057 NL_SET_ERR_MSG(extack, "Egress device not specified");
2058 goto out;
2059 } else if (dev->flags & IFF_LOOPBACK) {
2060 NL_SET_ERR_MSG(extack,
2061 "Egress device can not be loopback device for this route");
2062 goto out;
2063 }
2064 }
2065
2066 err = -ENODEV;
2067 if (!dev)
2068 goto out;
2069
2070 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2071 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2072 NL_SET_ERR_MSG(extack, "Invalid source address");
2073 err = -EINVAL;
2074 goto out;
2075 }
2076 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2077 rt->rt6i_prefsrc.plen = 128;
2078 } else
2079 rt->rt6i_prefsrc.plen = 0;
2080
2081 rt->rt6i_flags = cfg->fc_flags;
2082
2083 install_route:
2084 rt->dst.dev = dev;
2085 rt->rt6i_idev = idev;
2086 rt->rt6i_table = table;
2087
2088 cfg->fc_nlinfo.nl_net = dev_net(dev);
2089
2090 return rt;
2091 out:
2092 if (dev)
2093 dev_put(dev);
2094 if (idev)
2095 in6_dev_put(idev);
2096 if (rt)
2097 dst_release_immediate(&rt->dst);
2098
2099 return ERR_PTR(err);
2100 }
2101
2102 int ip6_route_add(struct fib6_config *cfg,
2103 struct netlink_ext_ack *extack)
2104 {
2105 struct mx6_config mxc = { .mx = NULL, };
2106 struct rt6_info *rt;
2107 int err;
2108
2109 rt = ip6_route_info_create(cfg, extack);
2110 if (IS_ERR(rt)) {
2111 err = PTR_ERR(rt);
2112 rt = NULL;
2113 goto out;
2114 }
2115
2116 err = ip6_convert_metrics(&mxc, cfg);
2117 if (err)
2118 goto out;
2119
2120 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2121
2122 kfree(mxc.mx);
2123
2124 return err;
2125 out:
2126 if (rt)
2127 dst_release_immediate(&rt->dst);
2128
2129 return err;
2130 }
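/* Example (an illustrative sketch, not taken from a real caller): an
 * in-kernel user would fill a struct fib6_config much like the netlink
 * and ioctl front-ends later in this file do, e.g. for 2001:db8::/64
 * via a link-local gateway on some device (values hypothetical):
 *
 *	struct fib6_config cfg = {
 *		.fc_table	= RT6_TABLE_MAIN,
 *		.fc_ifindex	= dev->ifindex,
 *		.fc_dst_len	= 64,
 *		.fc_metric	= IP6_RT_PRIO_USER,
 *		.fc_flags	= RTF_UP | RTF_GATEWAY,
 *		.fc_nlinfo.nl_net = net,
 *	};
 *	cfg.fc_dst = prefix;		// 2001:db8::
 *	cfg.fc_gateway = gw;		// fe80::1
 *	err = ip6_route_add(&cfg, NULL);
 */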
2131
2132 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2133 {
2134 int err;
2135 struct fib6_table *table;
2136 struct net *net = dev_net(rt->dst.dev);
2137
2138 if (rt == net->ipv6.ip6_null_entry) {
2139 err = -ENOENT;
2140 goto out;
2141 }
2142
2143 table = rt->rt6i_table;
2144 write_lock_bh(&table->tb6_lock);
2145 err = fib6_del(rt, info);
2146 write_unlock_bh(&table->tb6_lock);
2147
2148 out:
2149 ip6_rt_put(rt);
2150 return err;
2151 }
2152
2153 int ip6_del_rt(struct rt6_info *rt)
2154 {
2155 struct nl_info info = {
2156 .nl_net = dev_net(rt->dst.dev),
2157 };
2158 return __ip6_del_rt(rt, &info);
2159 }
2160
2161 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2162 {
2163 struct nl_info *info = &cfg->fc_nlinfo;
2164 struct net *net = info->nl_net;
2165 struct sk_buff *skb = NULL;
2166 struct fib6_table *table;
2167 int err = -ENOENT;
2168
2169 if (rt == net->ipv6.ip6_null_entry)
2170 goto out_put;
2171 table = rt->rt6i_table;
2172 write_lock_bh(&table->tb6_lock);
2173
2174 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2175 struct rt6_info *sibling, *next_sibling;
2176
2177 /* prefer to send a single notification with all hops */
2178 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2179 if (skb) {
2180 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2181
2182 if (rt6_fill_node(net, skb, rt,
2183 NULL, NULL, 0, RTM_DELROUTE,
2184 info->portid, seq, 0) < 0) {
2185 kfree_skb(skb);
2186 skb = NULL;
2187 } else
2188 info->skip_notify = 1;
2189 }
2190
2191 list_for_each_entry_safe(sibling, next_sibling,
2192 &rt->rt6i_siblings,
2193 rt6i_siblings) {
2194 err = fib6_del(sibling, info);
2195 if (err)
2196 goto out_unlock;
2197 }
2198 }
2199
2200 err = fib6_del(rt, info);
2201 out_unlock:
2202 write_unlock_bh(&table->tb6_lock);
2203 out_put:
2204 ip6_rt_put(rt);
2205
2206 if (skb) {
2207 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2208 info->nlh, gfp_any());
2209 }
2210 return err;
2211 }
2212
2213 static int ip6_route_del(struct fib6_config *cfg,
2214 struct netlink_ext_ack *extack)
2215 {
2216 struct fib6_table *table;
2217 struct fib6_node *fn;
2218 struct rt6_info *rt;
2219 int err = -ESRCH;
2220
2221 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2222 if (!table) {
2223 NL_SET_ERR_MSG(extack, "FIB table does not exist");
2224 return err;
2225 }
2226
2227 read_lock_bh(&table->tb6_lock);
2228
2229 fn = fib6_locate(&table->tb6_root,
2230 &cfg->fc_dst, cfg->fc_dst_len,
2231 &cfg->fc_src, cfg->fc_src_len);
2232
2233 if (fn) {
2234 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2235 if ((rt->rt6i_flags & RTF_CACHE) &&
2236 !(cfg->fc_flags & RTF_CACHE))
2237 continue;
2238 if (cfg->fc_ifindex &&
2239 (!rt->dst.dev ||
2240 rt->dst.dev->ifindex != cfg->fc_ifindex))
2241 continue;
2242 if (cfg->fc_flags & RTF_GATEWAY &&
2243 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2244 continue;
2245 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2246 continue;
2247 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2248 continue;
2249 dst_hold(&rt->dst);
2250 read_unlock_bh(&table->tb6_lock);
2251
2252 /* if a gateway was specified, delete only that one nexthop */
2253 if (cfg->fc_flags & RTF_GATEWAY)
2254 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2255
2256 return __ip6_del_rt_siblings(rt, cfg);
2257 }
2258 }
2259 read_unlock_bh(&table->tb6_lock);
2260
2261 return err;
2262 }
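/* Illustrative mapping of the matching rules above (a sketch of the
 * expected behaviour, not a tested transcript): a request such as
 *	ip -6 route del 2001:db8::/64 via fe80::1 dev eth0 metric 1024
 * must match prefix, gateway, ifindex and metric before anything is
 * removed; with a gateway given only that single nexthop is deleted,
 * otherwise the whole sibling group goes via __ip6_del_rt_siblings().
 */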
2263
2264 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2265 {
2266 struct netevent_redirect netevent;
2267 struct rt6_info *rt, *nrt = NULL;
2268 struct ndisc_options ndopts;
2269 struct inet6_dev *in6_dev;
2270 struct neighbour *neigh;
2271 struct rd_msg *msg;
2272 int optlen, on_link;
2273 u8 *lladdr;
2274
2275 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2276 optlen -= sizeof(*msg);
2277
2278 if (optlen < 0) {
2279 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2280 return;
2281 }
2282
2283 msg = (struct rd_msg *)icmp6_hdr(skb);
2284
2285 if (ipv6_addr_is_multicast(&msg->dest)) {
2286 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2287 return;
2288 }
2289
2290 on_link = 0;
2291 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2292 on_link = 1;
2293 } else if (ipv6_addr_type(&msg->target) !=
2294 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2295 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2296 return;
2297 }
2298
2299 in6_dev = __in6_dev_get(skb->dev);
2300 if (!in6_dev)
2301 return;
2302 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2303 return;
2304
2305 /* RFC2461 8.1:
2306 * The IP source address of the Redirect MUST be the same as the current
2307 * first-hop router for the specified ICMP Destination Address.
2308 */
2309
2310 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2311 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2312 return;
2313 }
2314
2315 lladdr = NULL;
2316 if (ndopts.nd_opts_tgt_lladdr) {
2317 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2318 skb->dev);
2319 if (!lladdr) {
2320 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2321 return;
2322 }
2323 }
2324
2325 rt = (struct rt6_info *) dst;
2326 if (rt->rt6i_flags & RTF_REJECT) {
2327 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2328 return;
2329 }
2330
2331 /* Redirect received -> path was valid.
2332 * Redirects are sent only in response to data packets,
2333 * so this nexthop is apparently reachable. --ANK
2334 */
2335 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
2336
2337 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2338 if (!neigh)
2339 return;
2340
2341 /*
2342 * We have finally decided to accept it.
2343 */
2344
2345 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2346 NEIGH_UPDATE_F_WEAK_OVERRIDE|
2347 NEIGH_UPDATE_F_OVERRIDE|
2348 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2349 NEIGH_UPDATE_F_ISROUTER)),
2350 NDISC_REDIRECT, &ndopts);
2351
2352 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2353 if (!nrt)
2354 goto out;
2355
2356 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2357 if (on_link)
2358 nrt->rt6i_flags &= ~RTF_GATEWAY;
2359
2360 nrt->rt6i_protocol = RTPROT_REDIRECT;
2361 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2362
2363 if (ip6_ins_rt(nrt))
2364 goto out_release;
2365
2366 netevent.old = &rt->dst;
2367 netevent.new = &nrt->dst;
2368 netevent.daddr = &msg->dest;
2369 netevent.neigh = neigh;
2370 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2371
2372 if (rt->rt6i_flags & RTF_CACHE) {
2373 rt = (struct rt6_info *) dst_clone(&rt->dst);
2374 ip6_del_rt(rt);
2375 }
2376
2377 out_release:
2378 /* Release the reference taken in
2379 * ip6_rt_cache_alloc()
2380 */
2381 dst_release(&nrt->dst);
2382
2383 out:
2384 neigh_release(neigh);
2385 }
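/* Summary (descriptive, restating the checks above): a redirect is
 * accepted only if the message is not truncated, the destination is
 * not multicast, the target is link-local unicast (or equals the
 * destination for on-link redirects), the receiving interface accepts
 * redirects and is not forwarding, the ND options parse, and the
 * current route is not RTF_REJECT; the RTF_CACHE clone installed
 * above then overrides the old path for that destination.
 */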
2386
2387 /*
2388 * Misc support functions
2389 */
2390
2391 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2392 {
2393 BUG_ON(from->dst.from);
2394
2395 rt->rt6i_flags &= ~RTF_EXPIRES;
2396 dst_hold(&from->dst);
2397 rt->dst.from = &from->dst;
2398 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2399 }
2400
2401 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2402 {
2403 rt->dst.input = ort->dst.input;
2404 rt->dst.output = ort->dst.output;
2405 rt->rt6i_dst = ort->rt6i_dst;
2406 rt->dst.error = ort->dst.error;
2407 rt->rt6i_idev = ort->rt6i_idev;
2408 if (rt->rt6i_idev)
2409 in6_dev_hold(rt->rt6i_idev);
2410 rt->dst.lastuse = jiffies;
2411 rt->rt6i_gateway = ort->rt6i_gateway;
2412 rt->rt6i_flags = ort->rt6i_flags;
2413 rt6_set_from(rt, ort);
2414 rt->rt6i_metric = ort->rt6i_metric;
2415 #ifdef CONFIG_IPV6_SUBTREES
2416 rt->rt6i_src = ort->rt6i_src;
2417 #endif
2418 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2419 rt->rt6i_table = ort->rt6i_table;
2420 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2421 }
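/* Context (descriptive, inferred from the callers in this file):
 * ip6_rt_copy_init() seeds clones such as the RTF_CACHE entries made
 * by ip6_rt_cache_alloc(), so per-destination state (PMTU, redirect
 * gateway) can diverge from the parent FIB route while metrics stay
 * shared through rt6_set_from().
 */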
2422
2423 #ifdef CONFIG_IPV6_ROUTE_INFO
2424 static struct rt6_info *rt6_get_route_info(struct net *net,
2425 const struct in6_addr *prefix, int prefixlen,
2426 const struct in6_addr *gwaddr,
2427 struct net_device *dev)
2428 {
2429 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
2430 int ifindex = dev->ifindex;
2431 struct fib6_node *fn;
2432 struct rt6_info *rt = NULL;
2433 struct fib6_table *table;
2434
2435 table = fib6_get_table(net, tb_id);
2436 if (!table)
2437 return NULL;
2438
2439 read_lock_bh(&table->tb6_lock);
2440 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2441 if (!fn)
2442 goto out;
2443
2444 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2445 if (rt->dst.dev->ifindex != ifindex)
2446 continue;
2447 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2448 continue;
2449 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2450 continue;
2451 dst_hold(&rt->dst);
2452 break;
2453 }
2454 out:
2455 read_unlock_bh(&table->tb6_lock);
2456 return rt;
2457 }
2458
2459 static struct rt6_info *rt6_add_route_info(struct net *net,
2460 const struct in6_addr *prefix, int prefixlen,
2461 const struct in6_addr *gwaddr,
2462 struct net_device *dev,
2463 unsigned int pref)
2464 {
2465 struct fib6_config cfg = {
2466 .fc_metric = IP6_RT_PRIO_USER,
2467 .fc_ifindex = dev->ifindex,
2468 .fc_dst_len = prefixlen,
2469 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2470 RTF_UP | RTF_PREF(pref),
2471 .fc_protocol = RTPROT_RA,
2472 .fc_nlinfo.portid = 0,
2473 .fc_nlinfo.nlh = NULL,
2474 .fc_nlinfo.nl_net = net,
2475 };
2476
2477 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
2478 cfg.fc_dst = *prefix;
2479 cfg.fc_gateway = *gwaddr;
2480
2481 /* We should treat it as a default route if prefix length is 0. */
2482 if (!prefixlen)
2483 cfg.fc_flags |= RTF_DEFAULT;
2484
2485 ip6_route_add(&cfg, NULL);
2486
2487 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
2488 }
2489 #endif
2490
2491 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2492 {
2493 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
2494 struct rt6_info *rt;
2495 struct fib6_table *table;
2496
2497 table = fib6_get_table(dev_net(dev), tb_id);
2498 if (!table)
2499 return NULL;
2500
2501 read_lock_bh(&table->tb6_lock);
2502 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2503 if (dev == rt->dst.dev &&
2504 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2505 ipv6_addr_equal(&rt->rt6i_gateway, addr))
2506 break;
2507 }
2508 if (rt)
2509 dst_hold(&rt->dst);
2510 read_unlock_bh(&table->tb6_lock);
2511 return rt;
2512 }
2513
2514 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2515 struct net_device *dev,
2516 unsigned int pref)
2517 {
2518 struct fib6_config cfg = {
2519 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2520 .fc_metric = IP6_RT_PRIO_USER,
2521 .fc_ifindex = dev->ifindex,
2522 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2523 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2524 .fc_protocol = RTPROT_RA,
2525 .fc_nlinfo.portid = 0,
2526 .fc_nlinfo.nlh = NULL,
2527 .fc_nlinfo.nl_net = dev_net(dev),
2528 };
2529
2530 cfg.fc_gateway = *gwaddr;
2531
2532 if (!ip6_route_add(&cfg, NULL)) {
2533 struct fib6_table *table;
2534
2535 table = fib6_get_table(dev_net(dev), cfg.fc_table);
2536 if (table)
2537 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
2538 }
2539
2540 return rt6_get_dflt_router(gwaddr, dev);
2541 }
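/* Example (illustrative): these entries are created from Router
 * Advertisements by ndisc and appear to userspace roughly as
 *	default via fe80::1 dev eth0 proto ra metric 1024 \
 *		expires 1770sec pref medium
 * matching the RTPROT_RA, RTF_EXPIRES and RTF_PREF(pref) set above.
 */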
2542
2543 static void __rt6_purge_dflt_routers(struct fib6_table *table)
2544 {
2545 struct rt6_info *rt;
2546
2547 restart:
2548 read_lock_bh(&table->tb6_lock);
2549 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2550 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2551 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2552 dst_hold(&rt->dst);
2553 read_unlock_bh(&table->tb6_lock);
2554 ip6_del_rt(rt);
2555 goto restart;
2556 }
2557 }
2558 read_unlock_bh(&table->tb6_lock);
2559
2560 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
2561 }
2562
2563 void rt6_purge_dflt_routers(struct net *net)
2564 {
2565 struct fib6_table *table;
2566 struct hlist_head *head;
2567 unsigned int h;
2568
2569 rcu_read_lock();
2570
2571 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2572 head = &net->ipv6.fib_table_hash[h];
2573 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2574 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
2575 __rt6_purge_dflt_routers(table);
2576 }
2577 }
2578
2579 rcu_read_unlock();
2580 }
2581
2582 static void rtmsg_to_fib6_config(struct net *net,
2583 struct in6_rtmsg *rtmsg,
2584 struct fib6_config *cfg)
2585 {
2586 memset(cfg, 0, sizeof(*cfg));
2587
2588 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2589 : RT6_TABLE_MAIN;
2590 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2591 cfg->fc_metric = rtmsg->rtmsg_metric;
2592 cfg->fc_expires = rtmsg->rtmsg_info;
2593 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2594 cfg->fc_src_len = rtmsg->rtmsg_src_len;
2595 cfg->fc_flags = rtmsg->rtmsg_flags;
2596
2597 cfg->fc_nlinfo.nl_net = net;
2598
2599 cfg->fc_dst = rtmsg->rtmsg_dst;
2600 cfg->fc_src = rtmsg->rtmsg_src;
2601 cfg->fc_gateway = rtmsg->rtmsg_gateway;
2602 }
2603
2604 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2605 {
2606 struct fib6_config cfg;
2607 struct in6_rtmsg rtmsg;
2608 int err;
2609
2610 switch (cmd) {
2611 case SIOCADDRT: /* Add a route */
2612 case SIOCDELRT: /* Delete a route */
2613 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2614 return -EPERM;
2615 err = copy_from_user(&rtmsg, arg,
2616 sizeof(struct in6_rtmsg));
2617 if (err)
2618 return -EFAULT;
2619
2620 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2621
2622 rtnl_lock();
2623 switch (cmd) {
2624 case SIOCADDRT:
2625 err = ip6_route_add(&cfg, NULL);
2626 break;
2627 case SIOCDELRT:
2628 err = ip6_route_del(&cfg, NULL);
2629 break;
2630 default:
2631 err = -EINVAL;
2632 }
2633 rtnl_unlock();
2634
2635 return err;
2636 }
2637
2638 return -EINVAL;
2639 }
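/* Example (a hedged userspace sketch; header names approximate and
 * the modern interface is rtnetlink): the legacy SIOCADDRT path above
 * is driven like this:
 *
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *	#include <arpa/inet.h>
 *	#include <net/if.h>
 *	#include <net/route.h>		// RTF_UP
 *	#include <linux/ipv6_route.h>	// struct in6_rtmsg
 *	#include <string.h>
 *
 *	struct in6_rtmsg rt;
 *	int fd = socket(AF_INET6, SOCK_DGRAM, 0);
 *
 *	memset(&rt, 0, sizeof(rt));
 *	inet_pton(AF_INET6, "2001:db8::", &rt.rtmsg_dst);
 *	rt.rtmsg_dst_len = 64;
 *	rt.rtmsg_ifindex = if_nametoindex("eth0");
 *	rt.rtmsg_metric = 1;
 *	rt.rtmsg_flags = RTF_UP;
 *	ioctl(fd, SIOCADDRT, &rt);	// handled by ipv6_route_ioctl()
 */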
2640
2641 /*
2642 * Drop the packet on the floor
2643 */
2644
2645 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2646 {
2647 int type;
2648 struct dst_entry *dst = skb_dst(skb);
2649 switch (ipstats_mib_noroutes) {
2650 case IPSTATS_MIB_INNOROUTES:
2651 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2652 if (type == IPV6_ADDR_ANY) {
2653 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2654 IPSTATS_MIB_INADDRERRORS);
2655 break;
2656 }
2657 /* FALLTHROUGH */
2658 case IPSTATS_MIB_OUTNOROUTES:
2659 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2660 ipstats_mib_noroutes);
2661 break;
2662 }
2663 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2664 kfree_skb(skb);
2665 return 0;
2666 }
2667
2668 static int ip6_pkt_discard(struct sk_buff *skb)
2669 {
2670 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2671 }
2672
2673 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2674 {
2675 skb->dev = skb_dst(skb)->dev;
2676 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2677 }
2678
2679 static int ip6_pkt_prohibit(struct sk_buff *skb)
2680 {
2681 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2682 }
2683
2684 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2685 {
2686 skb->dev = skb_dst(skb)->dev;
2687 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2688 }
2689
2690 /*
2691 * Allocate a dst for local (unicast / anycast) address.
2692 */
2693
2694 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2695 const struct in6_addr *addr,
2696 bool anycast)
2697 {
2698 u32 tb_id;
2699 struct net *net = dev_net(idev->dev);
2700 struct net_device *dev = net->loopback_dev;
2701 struct rt6_info *rt;
2702
2703 /* use the L3 master device as the loopback for host routes if the
2704 * device is enslaved and the address is not link-local or multicast
2705 */
2706 if (!rt6_need_strict(addr))
2707 dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
2708
2709 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
2710 if (!rt)
2711 return ERR_PTR(-ENOMEM);
2712
2713 in6_dev_hold(idev);
2714
2715 rt->dst.flags |= DST_HOST;
2716 rt->dst.input = ip6_input;
2717 rt->dst.output = ip6_output;
2718 rt->rt6i_idev = idev;
2719
2720 rt->rt6i_protocol = RTPROT_KERNEL;
2721 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2722 if (anycast)
2723 rt->rt6i_flags |= RTF_ANYCAST;
2724 else
2725 rt->rt6i_flags |= RTF_LOCAL;
2726
2727 rt->rt6i_gateway = *addr;
2728 rt->rt6i_dst.addr = *addr;
2729 rt->rt6i_dst.plen = 128;
2730 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2731 rt->rt6i_table = fib6_get_table(net, tb_id);
2732
2733 return rt;
2734 }
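/* Example (illustrative): addrconf uses this for every configured
 * address; the result is the host route visible as e.g.
 *	local 2001:db8::1 dev lo table local proto kernel metric 0
 * (or an "anycast" entry when the anycast flag above is set).
 */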
2735
2736 /* remove a deleted IP from prefsrc entries */
2737 struct arg_dev_net_ip {
2738 struct net_device *dev;
2739 struct net *net;
2740 struct in6_addr *addr;
2741 };
2742
2743 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2744 {
2745 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2746 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2747 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2748
2749 if (((void *)rt->dst.dev == dev || !dev) &&
2750 rt != net->ipv6.ip6_null_entry &&
2751 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2752 /* remove prefsrc entry */
2753 rt->rt6i_prefsrc.plen = 0;
2754 }
2755 return 0;
2756 }
2757
2758 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2759 {
2760 struct net *net = dev_net(ifp->idev->dev);
2761 struct arg_dev_net_ip adni = {
2762 .dev = ifp->idev->dev,
2763 .net = net,
2764 .addr = &ifp->addr,
2765 };
2766 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2767 }
2768
2769 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2770 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
2771
2772 /* Remove routers and update dst entries when a gateway turns into a host. */
2773 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2774 {
2775 struct in6_addr *gateway = (struct in6_addr *)arg;
2776
2777 if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2778 ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2779 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2780 return -1;
2781 }
2782 return 0;
2783 }
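/* Note on the return value (inferred from fib6_clean_all()'s walker):
 * a non-zero return from one of these cleaners asks fib6_clean_node()
 * to fib6_del() the entry, while 0 keeps it; fib6_ifdown() below uses
 * the same convention.
 */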
2784
2785 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2786 {
2787 fib6_clean_all(net, fib6_clean_tohost, gateway);
2788 }
2789
2790 struct arg_dev_net {
2791 struct net_device *dev;
2792 struct net *net;
2793 };
2794
2795 /* called with the write lock held for the table containing rt */
2796 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2797 {
2798 const struct arg_dev_net *adn = arg;
2799 const struct net_device *dev = adn->dev;
2800
2801 if ((rt->dst.dev == dev || !dev) &&
2802 rt != adn->net->ipv6.ip6_null_entry &&
2803 (rt->rt6i_nsiblings == 0 ||
2804 (dev && netdev_unregistering(dev)) ||
2805 !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
2806 return -1;
2807
2808 return 0;
2809 }
2810
2811 void rt6_ifdown(struct net *net, struct net_device *dev)
2812 {
2813 struct arg_dev_net adn = {
2814 .dev = dev,
2815 .net = net,
2816 };
2817
2818 fib6_clean_all(net, fib6_ifdown, &adn);
2819 if (dev)
2820 rt6_uncached_list_flush_dev(net, dev);
2821 }
2822
2823 struct rt6_mtu_change_arg {
2824 struct net_device *dev;
2825 unsigned int mtu;
2826 };
2827
2828 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2829 {
2830 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2831 struct inet6_dev *idev;
2832
2833 /* In IPv6, PMTU discovery is not optional,
2834 so an RTAX_MTU lock cannot disable it.
2835 We still use this lock to block changes
2836 caused by addrconf/ndisc.
2837 */
2838
2839 idev = __in6_dev_get(arg->dev);
2840 if (!idev)
2841 return 0;
2842
2843 /* For an administrative MTU increase there is no way to discover
2844 an IPv6 PMTU increase, so the PMTU must be updated here.
2845 Since RFC 1981 doesn't cover administrative MTU increases,
2846 updating the PMTU on increase is a MUST (e.g. jumbo frames).
2847 */
2848 /*
2849 If the new MTU is less than the route PMTU, this new MTU will be
2850 the lowest MTU in the path; update the route PMTU to reflect the
2851 decrease. If the new MTU is greater than the route PMTU, and the
2852 old MTU was the lowest MTU in the path, update the route PMTU
2853 to reflect the increase; in that case, if another node in the
2854 path also has the lowest MTU, a PACKET TOO BIG message will
2855 trigger PMTU discovery again.
2856 */
2857 if (rt->dst.dev == arg->dev &&
2858 dst_metric_raw(&rt->dst, RTAX_MTU) &&
2859 !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2860 if (rt->rt6i_flags & RTF_CACHE) {
2861 /* For RTF_CACHE with rt6i_pmtu == 0
2862 * (i.e. a redirected route),
2863 * the metrics of its rt->dst.from have already
2864 * been updated.
2865 */
2866 if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2867 rt->rt6i_pmtu = arg->mtu;
2868 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2869 (dst_mtu(&rt->dst) < arg->mtu &&
2870 dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2871 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2872 }
2873 }
2874 return 0;
2875 }
2876
2877 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2878 {
2879 struct rt6_mtu_change_arg arg = {
2880 .dev = dev,
2881 .mtu = mtu,
2882 };
2883
2884 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2885 }
2886
2887 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2888 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2889 [RTA_OIF] = { .type = NLA_U32 },
2890 [RTA_IIF] = { .type = NLA_U32 },
2891 [RTA_PRIORITY] = { .type = NLA_U32 },
2892 [RTA_METRICS] = { .type = NLA_NESTED },
2893 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
2894 [RTA_PREF] = { .type = NLA_U8 },
2895 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
2896 [RTA_ENCAP] = { .type = NLA_NESTED },
2897 [RTA_EXPIRES] = { .type = NLA_U32 },
2898 [RTA_UID] = { .type = NLA_U32 },
2899 [RTA_MARK] = { .type = NLA_U32 },
2900 };
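/* Rough correspondence with iproute2 syntax (illustrative, not
 * exhaustive):
 *	RTA_GATEWAY   "via ADDR"	RTA_OIF      "dev IFACE"
 *	RTA_PRIORITY  "metric N"	RTA_METRICS  "mtu N" etc.
 *	RTA_MULTIPATH "nexthop ..."	RTA_PREF     "pref low|medium|high"
 *	RTA_EXPIRES   "expires N"
 */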
2901
2902 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2903 struct fib6_config *cfg,
2904 struct netlink_ext_ack *extack)
2905 {
2906 struct rtmsg *rtm;
2907 struct nlattr *tb[RTA_MAX+1];
2908 unsigned int pref;
2909 int err;
2910
2911 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
2912 NULL);
2913 if (err < 0)
2914 goto errout;
2915
2916 err = -EINVAL;
2917 rtm = nlmsg_data(nlh);
2918 memset(cfg, 0, sizeof(*cfg));
2919
2920 cfg->fc_table = rtm->rtm_table;
2921 cfg->fc_dst_len = rtm->rtm_dst_len;
2922 cfg->fc_src_len = rtm->rtm_src_len;
2923 cfg->fc_flags = RTF_UP;
2924 cfg->fc_protocol = rtm->rtm_protocol;
2925 cfg->fc_type = rtm->rtm_type;
2926
2927 if (rtm->rtm_type == RTN_UNREACHABLE ||
2928 rtm->rtm_type == RTN_BLACKHOLE ||
2929 rtm->rtm_type == RTN_PROHIBIT ||
2930 rtm->rtm_type == RTN_THROW)
2931 cfg->fc_flags |= RTF_REJECT;
2932
2933 if (rtm->rtm_type == RTN_LOCAL)
2934 cfg->fc_flags |= RTF_LOCAL;
2935
2936 if (rtm->rtm_flags & RTM_F_CLONED)
2937 cfg->fc_flags |= RTF_CACHE;
2938
2939 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2940 cfg->fc_nlinfo.nlh = nlh;
2941 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2942
2943 if (tb[RTA_GATEWAY]) {
2944 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2945 cfg->fc_flags |= RTF_GATEWAY;
2946 }
2947
2948 if (tb[RTA_DST]) {
2949 int plen = (rtm->rtm_dst_len + 7) >> 3;
2950
2951 if (nla_len(tb[RTA_DST]) < plen)
2952 goto errout;
2953
2954 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2955 }
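/* The (bits + 7) >> 3 above rounds a prefix length up to whole bytes
 * before copying; e.g. a /49 destination copies (49 + 7) >> 3 == 7
 * bytes of the address (worked example).
 */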
2956
2957 if (tb[RTA_SRC]) {
2958 int plen = (rtm->rtm_src_len + 7) >> 3;
2959
2960 if (nla_len(tb[RTA_SRC]) < plen)
2961 goto errout;
2962
2963 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2964 }
2965
2966 if (tb[RTA_PREFSRC])
2967 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2968
2969 if (tb[RTA_OIF])
2970 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2971
2972 if (tb[RTA_PRIORITY])
2973 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2974
2975 if (tb[RTA_METRICS]) {
2976 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2977 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2978 }
2979
2980 if (tb[RTA_TABLE])
2981 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2982
2983 if (tb[RTA_MULTIPATH]) {
2984 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2985 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2986
2987 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
2988 cfg->fc_mp_len, extack);
2989 if (err < 0)
2990 goto errout;
2991 }
2992
2993 if (tb[RTA_PREF]) {
2994 pref = nla_get_u8(tb[RTA_PREF]);
2995 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2996 pref != ICMPV6_ROUTER_PREF_HIGH)
2997 pref = ICMPV6_ROUTER_PREF_MEDIUM;
2998 cfg->fc_flags |= RTF_PREF(pref);
2999 }
3000
3001 if (tb[RTA_ENCAP])
3002 cfg->fc_encap = tb[RTA_ENCAP];
3003
3004 if (tb[RTA_ENCAP_TYPE]) {
3005 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3006
3007 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3008 if (err < 0)
3009 goto errout;
3010 }
3011
3012 if (tb[RTA_EXPIRES]) {
3013 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3014
3015 if (addrconf_finite_timeout(timeout)) {
3016 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3017 cfg->fc_flags |= RTF_EXPIRES;
3018 }
3019 }
3020
3021 err = 0;
3022 errout:
3023 return err;
3024 }
3025
3026 struct rt6_nh {
3027 struct rt6_info *rt6_info;
3028 struct fib6_config r_cfg;
3029 struct mx6_config mxc;
3030 struct list_head next;
3031 };
3032
3033 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3034 {
3035 struct rt6_nh *nh;
3036
3037 list_for_each_entry(nh, rt6_nh_list, next) {
3038 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3039 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3040 nh->r_cfg.fc_ifindex);
3041 }
3042 }
3043
3044 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3045 struct rt6_info *rt, struct fib6_config *r_cfg)
3046 {
3047 struct rt6_nh *nh;
3048 int err = -EEXIST;
3049
3050 list_for_each_entry(nh, rt6_nh_list, next) {
3051 /* check if rt6_info already exists */
3052 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3053 return err;
3054 }
3055
3056 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3057 if (!nh)
3058 return -ENOMEM;
3059 nh->rt6_info = rt;
3060 err = ip6_convert_metrics(&nh->mxc, r_cfg);
3061 if (err) {
3062 kfree(nh);
3063 return err;
3064 }
3065 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3066 list_add_tail(&nh->next, rt6_nh_list);
3067
3068 return 0;
3069 }
3070
3071 static void ip6_route_mpath_notify(struct rt6_info *rt,
3072 struct rt6_info *rt_last,
3073 struct nl_info *info,
3074 __u16 nlflags)
3075 {
3076 /* if this is an APPEND route, then rt points to the first route
3077 * inserted and rt_last points to the last route inserted. Userspace
3078 * wants a consistent dump of the route which starts at the first
3079 * nexthop. Since sibling routes are always added at the end of
3080 * the list, find the first sibling of the last route appended
3081 */
3082 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3083 rt = list_first_entry(&rt_last->rt6i_siblings,
3084 struct rt6_info,
3085 rt6i_siblings);
3086 }
3087
3088 if (rt)
3089 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3090 }
3091
3092 static int ip6_route_multipath_add(struct fib6_config *cfg,
3093 struct netlink_ext_ack *extack)
3094 {
3095 struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3096 struct nl_info *info = &cfg->fc_nlinfo;
3097 struct fib6_config r_cfg;
3098 struct rtnexthop *rtnh;
3099 struct rt6_info *rt;
3100 struct rt6_nh *err_nh;
3101 struct rt6_nh *nh, *nh_safe;
3102 __u16 nlflags;
3103 int remaining;
3104 int attrlen;
3105 int err = 1;
3106 int nhn = 0;
3107 int replace = (cfg->fc_nlinfo.nlh &&
3108 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3109 LIST_HEAD(rt6_nh_list);
3110
3111 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3112 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3113 nlflags |= NLM_F_APPEND;
3114
3115 remaining = cfg->fc_mp_len;
3116 rtnh = (struct rtnexthop *)cfg->fc_mp;
3117
3118 /* Parse the multipath entries and build a list (rt6_nh_list) with
3119 * one rt6_info struct per nexthop
3120 */
3121 while (rtnh_ok(rtnh, remaining)) {
3122 memcpy(&r_cfg, cfg, sizeof(*cfg));
3123 if (rtnh->rtnh_ifindex)
3124 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3125
3126 attrlen = rtnh_attrlen(rtnh);
3127 if (attrlen > 0) {
3128 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3129
3130 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3131 if (nla) {
3132 r_cfg.fc_gateway = nla_get_in6_addr(nla);
3133 r_cfg.fc_flags |= RTF_GATEWAY;
3134 }
3135 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3136 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3137 if (nla)
3138 r_cfg.fc_encap_type = nla_get_u16(nla);
3139 }
3140
3141 rt = ip6_route_info_create(&r_cfg, extack);
3142 if (IS_ERR(rt)) {
3143 err = PTR_ERR(rt);
3144 rt = NULL;
3145 goto cleanup;
3146 }
3147
3148 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3149 if (err) {
3150 dst_release_immediate(&rt->dst);
3151 goto cleanup;
3152 }
3153
3154 rtnh = rtnh_next(rtnh, &remaining);
3155 }
3156
3157 /* For add and replace, send one notification with all nexthops.
3158 * Skip the notification in fib6_add_rt2node and send one with
3159 * the full route when done
3160 */
3161 info->skip_notify = 1;
3162
3163 err_nh = NULL;
3164 list_for_each_entry(nh, &rt6_nh_list, next) {
3165 rt_last = nh->rt6_info;
3166 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3167 /* save reference to first route for notification */
3168 if (!rt_notif && !err)
3169 rt_notif = nh->rt6_info;
3170
3171 /* nh->rt6_info is used or freed at this point, reset to NULL */
3172 nh->rt6_info = NULL;
3173 if (err) {
3174 if (replace && nhn)
3175 ip6_print_replace_route_err(&rt6_nh_list);
3176 err_nh = nh;
3177 goto add_errout;
3178 }
3179
3180 /* Because each route is added like a single route, we remove
3181 * these flags after the first nexthop: if there is a collision,
3182 * we have already failed to add the first nexthop:
3183 * fib6_add_rt2node() has rejected it; when replacing, the old
3184 * nexthops have been replaced by the first new one, and the rest
3185 * should be appended to it.
3186 */
3187 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3188 NLM_F_REPLACE);
3189 nhn++;
3190 }
3191
3192 /* success ... tell user about new route */
3193 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3194 goto cleanup;
3195
3196 add_errout:
3197 /* send notification for routes that were added so that
3198 * the delete notifications sent by ip6_route_del are
3199 * coherent
3200 */
3201 if (rt_notif)
3202 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3203
3204 /* Delete routes that were already added */
3205 list_for_each_entry(nh, &rt6_nh_list, next) {
3206 if (err_nh == nh)
3207 break;
3208 ip6_route_del(&nh->r_cfg, extack);
3209 }
3210
3211 cleanup:
3212 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3213 if (nh->rt6_info)
3214 dst_release_immediate(&nh->rt6_info->dst);
3215 kfree(nh->mxc.mx);
3216 list_del(&nh->next);
3217 kfree(nh);
3218 }
3219
3220 return err;
3221 }
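/* Example (a sketch of expected behaviour, not a tested transcript):
 *	ip -6 route add 2001:db8::/64 \
 *		nexthop via fe80::1 dev eth0 \
 *		nexthop via fe80::2 dev eth1
 * arrives as a single RTM_NEWROUTE carrying RTA_MULTIPATH; each
 * rtnexthop in it becomes one entry on rt6_nh_list above, and all
 * hops are announced together via ip6_route_mpath_notify().
 */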
3222
3223 static int ip6_route_multipath_del(struct fib6_config *cfg,
3224 struct netlink_ext_ack *extack)
3225 {
3226 struct fib6_config r_cfg;
3227 struct rtnexthop *rtnh;
3228 int remaining;
3229 int attrlen;
3230 int err = 1, last_err = 0;
3231
3232 remaining = cfg->fc_mp_len;
3233 rtnh = (struct rtnexthop *)cfg->fc_mp;
3234
3235 /* Parse a Multipath Entry */
3236 while (rtnh_ok(rtnh, remaining)) {
3237 memcpy(&r_cfg, cfg, sizeof(*cfg));
3238 if (rtnh->rtnh_ifindex)
3239 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3240
3241 attrlen = rtnh_attrlen(rtnh);
3242 if (attrlen > 0) {
3243 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3244
3245 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3246 if (nla) {
3247 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3248 r_cfg.fc_flags |= RTF_GATEWAY;
3249 }
3250 }
3251 err = ip6_route_del(&r_cfg, extack);
3252 if (err)
3253 last_err = err;
3254
3255 rtnh = rtnh_next(rtnh, &remaining);
3256 }
3257
3258 return last_err;
3259 }
3260
3261 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3262 struct netlink_ext_ack *extack)
3263 {
3264 struct fib6_config cfg;
3265 int err;
3266
3267 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3268 if (err < 0)
3269 return err;
3270
3271 if (cfg.fc_mp)
3272 return ip6_route_multipath_del(&cfg, extack);
3273 else {
3274 cfg.fc_delete_all_nh = 1;
3275 return ip6_route_del(&cfg, extack);
3276 }
3277 }
3278
3279 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3280 struct netlink_ext_ack *extack)
3281 {
3282 struct fib6_config cfg;
3283 int err;
3284
3285 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3286 if (err < 0)
3287 return err;
3288
3289 if (cfg.fc_mp)
3290 return ip6_route_multipath_add(&cfg, extack);
3291 else
3292 return ip6_route_add(&cfg, extack);
3293 }
3294
3295 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3296 {
3297 int nexthop_len = 0;
3298
3299 if (rt->rt6i_nsiblings) {
3300 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
3301 + NLA_ALIGN(sizeof(struct rtnexthop))
3302 + nla_total_size(16) /* RTA_GATEWAY */
3303 + lwtunnel_get_encap_size(rt->dst.lwtstate);
3304
3305 nexthop_len *= rt->rt6i_nsiblings;
3306 }
3307
3308 return NLMSG_ALIGN(sizeof(struct rtmsg))
3309 + nla_total_size(16) /* RTA_SRC */
3310 + nla_total_size(16) /* RTA_DST */
3311 + nla_total_size(16) /* RTA_GATEWAY */
3312 + nla_total_size(16) /* RTA_PREFSRC */
3313 + nla_total_size(4) /* RTA_TABLE */
3314 + nla_total_size(4) /* RTA_IIF */
3315 + nla_total_size(4) /* RTA_OIF */
3316 + nla_total_size(4) /* RTA_PRIORITY */
3317 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3318 + nla_total_size(sizeof(struct rta_cacheinfo))
3319 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3320 + nla_total_size(1) /* RTA_PREF */
3321 + lwtunnel_get_encap_size(rt->dst.lwtstate)
3322 + nexthop_len;
3323 }
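/* Sizing note (inferred from rt6_fill_node() below): the base size
 * already reserves RTA_GATEWAY and encap space for rt itself, so the
 * per-nexthop block is multiplied only by rt6i_nsiblings; e.g. with
 * two siblings the rtnexthop + RTA_GATEWAY + encap estimate is added
 * twice on top of the base message (the RTA_MULTIPATH header appears
 * to be counted once per sibling, a harmless over-reserve).
 */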
3324
3325 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3326 unsigned int *flags, bool skip_oif)
3327 {
3328 if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3329 *flags |= RTNH_F_LINKDOWN;
3330 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3331 *flags |= RTNH_F_DEAD;
3332 }
3333
3334 if (rt->rt6i_flags & RTF_GATEWAY) {
3335 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3336 goto nla_put_failure;
3337 }
3338
3339 /* not needed for multipath encoding because it has a rtnexthop struct */
3340 if (!skip_oif && rt->dst.dev &&
3341 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3342 goto nla_put_failure;
3343
3344 if (rt->dst.lwtstate &&
3345 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3346 goto nla_put_failure;
3347
3348 return 0;
3349
3350 nla_put_failure:
3351 return -EMSGSIZE;
3352 }
3353
3354 /* add multipath next hop */
3355 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
3356 {
3357 struct rtnexthop *rtnh;
3358 unsigned int flags = 0;
3359
3360 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
3361 if (!rtnh)
3362 goto nla_put_failure;
3363
3364 rtnh->rtnh_hops = 0;
3365 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
3366
3367 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
3368 goto nla_put_failure;
3369
3370 rtnh->rtnh_flags = flags;
3371
3372 /* length of rtnetlink header + attributes */
3373 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
3374
3375 return 0;
3376
3377 nla_put_failure:
3378 return -EMSGSIZE;
3379 }
3380
3381 static int rt6_fill_node(struct net *net,
3382 struct sk_buff *skb, struct rt6_info *rt,
3383 struct in6_addr *dst, struct in6_addr *src,
3384 int iif, int type, u32 portid, u32 seq,
3385 unsigned int flags)
3386 {
3387 u32 metrics[RTAX_MAX];
3388 struct rtmsg *rtm;
3389 struct nlmsghdr *nlh;
3390 long expires;
3391 u32 table;
3392
3393 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3394 if (!nlh)
3395 return -EMSGSIZE;
3396
3397 rtm = nlmsg_data(nlh);
3398 rtm->rtm_family = AF_INET6;
3399 rtm->rtm_dst_len = rt->rt6i_dst.plen;
3400 rtm->rtm_src_len = rt->rt6i_src.plen;
3401 rtm->rtm_tos = 0;
3402 if (rt->rt6i_table)
3403 table = rt->rt6i_table->tb6_id;
3404 else
3405 table = RT6_TABLE_UNSPEC;
3406 rtm->rtm_table = table;
3407 if (nla_put_u32(skb, RTA_TABLE, table))
3408 goto nla_put_failure;
3409 if (rt->rt6i_flags & RTF_REJECT) {
3410 switch (rt->dst.error) {
3411 case -EINVAL:
3412 rtm->rtm_type = RTN_BLACKHOLE;
3413 break;
3414 case -EACCES:
3415 rtm->rtm_type = RTN_PROHIBIT;
3416 break;
3417 case -EAGAIN:
3418 rtm->rtm_type = RTN_THROW;
3419 break;
3420 default:
3421 rtm->rtm_type = RTN_UNREACHABLE;
3422 break;
3423 }
3424 }
3425 else if (rt->rt6i_flags & RTF_LOCAL)
3426 rtm->rtm_type = RTN_LOCAL;
3427 else if (rt->rt6i_flags & RTF_ANYCAST)
3428 rtm->rtm_type = RTN_ANYCAST;
3429 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3430 rtm->rtm_type = RTN_LOCAL;
3431 else
3432 rtm->rtm_type = RTN_UNICAST;
3433 rtm->rtm_flags = 0;
3434 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3435 rtm->rtm_protocol = rt->rt6i_protocol;
3436
3437 if (rt->rt6i_flags & RTF_CACHE)
3438 rtm->rtm_flags |= RTM_F_CLONED;
3439
3440 if (dst) {
3441 if (nla_put_in6_addr(skb, RTA_DST, dst))
3442 goto nla_put_failure;
3443 rtm->rtm_dst_len = 128;
3444 } else if (rtm->rtm_dst_len)
3445 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3446 goto nla_put_failure;
3447 #ifdef CONFIG_IPV6_SUBTREES
3448 if (src) {
3449 if (nla_put_in6_addr(skb, RTA_SRC, src))
3450 goto nla_put_failure;
3451 rtm->rtm_src_len = 128;
3452 } else if (rtm->rtm_src_len &&
3453 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3454 goto nla_put_failure;
3455 #endif
3456 if (iif) {
3457 #ifdef CONFIG_IPV6_MROUTE
3458 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3459 int err = ip6mr_get_route(net, skb, rtm, portid);
3460
3461 if (err == 0)
3462 return 0;
3463 if (err < 0)
3464 goto nla_put_failure;
3465 } else
3466 #endif
3467 if (nla_put_u32(skb, RTA_IIF, iif))
3468 goto nla_put_failure;
3469 } else if (dst) {
3470 struct in6_addr saddr_buf;
3471 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3472 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3473 goto nla_put_failure;
3474 }
3475
3476 if (rt->rt6i_prefsrc.plen) {
3477 struct in6_addr saddr_buf;
3478 saddr_buf = rt->rt6i_prefsrc.addr;
3479 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3480 goto nla_put_failure;
3481 }
3482
3483 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3484 if (rt->rt6i_pmtu)
3485 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3486 if (rtnetlink_put_metrics(skb, metrics) < 0)
3487 goto nla_put_failure;
3488
3489 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3490 goto nla_put_failure;
3491
3492 /* For multipath routes, walk the siblings list and add
3493 * each as a nexthop within RTA_MULTIPATH.
3494 */
3495 if (rt->rt6i_nsiblings) {
3496 struct rt6_info *sibling, *next_sibling;
3497 struct nlattr *mp;
3498
3499 mp = nla_nest_start(skb, RTA_MULTIPATH);
3500 if (!mp)
3501 goto nla_put_failure;
3502
3503 if (rt6_add_nexthop(skb, rt) < 0)
3504 goto nla_put_failure;
3505
3506 list_for_each_entry_safe(sibling, next_sibling,
3507 &rt->rt6i_siblings, rt6i_siblings) {
3508 if (rt6_add_nexthop(skb, sibling) < 0)
3509 goto nla_put_failure;
3510 }
3511
3512 nla_nest_end(skb, mp);
3513 } else {
3514 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
3515 goto nla_put_failure;
3516 }
3517
3518 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3519
3520 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3521 goto nla_put_failure;
3522
3523 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3524 goto nla_put_failure;
3525
3526
3527 nlmsg_end(skb, nlh);
3528 return 0;
3529
3530 nla_put_failure:
3531 nlmsg_cancel(skb, nlh);
3532 return -EMSGSIZE;
3533 }
3534
3535 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3536 {
3537 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3538 struct net *net = arg->net;
3539
3540 if (rt == net->ipv6.ip6_null_entry)
3541 return 0;
3542
3543 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3544 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3545
3546 /* user wants prefix routes only */
3547 if (rtm->rtm_flags & RTM_F_PREFIX &&
3548 !(rt->rt6i_flags & RTF_PREFIX_RT)) {
3549 /* success since this is not a prefix route */
3550 return 1;
3551 }
3552 }
3553
3554 return rt6_fill_node(net,
3555 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3556 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3557 NLM_F_MULTI);
3558 }
3559
3560 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3561 struct netlink_ext_ack *extack)
3562 {
3563 struct net *net = sock_net(in_skb->sk);
3564 struct nlattr *tb[RTA_MAX+1];
3565 int err, iif = 0, oif = 0;
3566 struct dst_entry *dst;
3567 struct rt6_info *rt;
3568 struct sk_buff *skb;
3569 struct rtmsg *rtm;
3570 struct flowi6 fl6;
3571 bool fibmatch;
3572
3573 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3574 extack);
3575 if (err < 0)
3576 goto errout;
3577
3578 err = -EINVAL;
3579 memset(&fl6, 0, sizeof(fl6));
3580 rtm = nlmsg_data(nlh);
3581 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
3582 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
3583
3584 if (tb[RTA_SRC]) {
3585 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3586 goto errout;
3587
3588 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3589 }
3590
3591 if (tb[RTA_DST]) {
3592 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3593 goto errout;
3594
3595 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3596 }
3597
3598 if (tb[RTA_IIF])
3599 iif = nla_get_u32(tb[RTA_IIF]);
3600
3601 if (tb[RTA_OIF])
3602 oif = nla_get_u32(tb[RTA_OIF]);
3603
3604 if (tb[RTA_MARK])
3605 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3606
3607 if (tb[RTA_UID])
3608 fl6.flowi6_uid = make_kuid(current_user_ns(),
3609 nla_get_u32(tb[RTA_UID]));
3610 else
3611 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
3612
3613 if (iif) {
3614 struct net_device *dev;
3615 int flags = 0;
3616
3617 dev = __dev_get_by_index(net, iif);
3618 if (!dev) {
3619 err = -ENODEV;
3620 goto errout;
3621 }
3622
3623 fl6.flowi6_iif = iif;
3624
3625 if (!ipv6_addr_any(&fl6.saddr))
3626 flags |= RT6_LOOKUP_F_HAS_SADDR;
3627
3628 if (!fibmatch)
3629 dst = ip6_route_input_lookup(net, dev, &fl6, flags);
3630 } else {
3631 fl6.flowi6_oif = oif;
3632
3633 if (!fibmatch)
3634 dst = ip6_route_output(net, NULL, &fl6);
3635 }
3636
3637 if (fibmatch)
3638 dst = ip6_route_lookup(net, &fl6, 0);
3639
3640 rt = container_of(dst, struct rt6_info, dst);
3641 if (rt->dst.error) {
3642 err = rt->dst.error;
3643 ip6_rt_put(rt);
3644 goto errout;
3645 }
3646
3647 if (rt == net->ipv6.ip6_null_entry) {
3648 err = rt->dst.error;
3649 ip6_rt_put(rt);
3650 goto errout;
3651 }
3652
3653 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3654 if (!skb) {
3655 ip6_rt_put(rt);
3656 err = -ENOBUFS;
3657 goto errout;
3658 }
3659
3660 skb_dst_set(skb, &rt->dst);
3661 if (fibmatch)
3662 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
3663 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3664 nlh->nlmsg_seq, 0);
3665 else
3666 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3667 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3668 nlh->nlmsg_seq, 0);
3669 if (err < 0) {
3670 kfree_skb(skb);
3671 goto errout;
3672 }
3673
3674 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3675 errout:
3676 return err;
3677 }
3678
3679 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3680 unsigned int nlm_flags)
3681 {
3682 struct sk_buff *skb;
3683 struct net *net = info->nl_net;
3684 u32 seq;
3685 int err;
3686
3687 err = -ENOBUFS;
3688 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3689
3690 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3691 if (!skb)
3692 goto errout;
3693
3694 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3695 event, info->portid, seq, nlm_flags);
3696 if (err < 0) {
3697 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3698 WARN_ON(err == -EMSGSIZE);
3699 kfree_skb(skb);
3700 goto errout;
3701 }
3702 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3703 info->nlh, gfp_any());
3704 return;
3705 errout:
3706 if (err < 0)
3707 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3708 }
3709
3710 static int ip6_route_dev_notify(struct notifier_block *this,
3711 unsigned long event, void *ptr)
3712 {
3713 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3714 struct net *net = dev_net(dev);
3715
3716 if (!(dev->flags & IFF_LOOPBACK))
3717 return NOTIFY_OK;
3718
3719 if (event == NETDEV_REGISTER) {
3720 net->ipv6.ip6_null_entry->dst.dev = dev;
3721 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3722 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3723 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3724 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3725 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3726 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3727 #endif
3728 } else if (event == NETDEV_UNREGISTER &&
3729 dev->reg_state != NETREG_UNREGISTERED) {
3730 /* NETDEV_UNREGISTER can be fired multiple times by
3731 * netdev_wait_allrefs(). Make sure we only call this once.
3732 */
3733 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
3734 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3735 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
3736 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
3737 #endif
3738 }
3739
3740 return NOTIFY_OK;
3741 }
3742
3743 /*
3744 * /proc
3745 */
3746
3747 #ifdef CONFIG_PROC_FS
3748
3749 static const struct file_operations ipv6_route_proc_fops = {
3750 .owner = THIS_MODULE,
3751 .open = ipv6_route_open,
3752 .read = seq_read,
3753 .llseek = seq_lseek,
3754 .release = seq_release_net,
3755 };
3756
3757 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3758 {
3759 struct net *net = (struct net *)seq->private;
3760 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3761 net->ipv6.rt6_stats->fib_nodes,
3762 net->ipv6.rt6_stats->fib_route_nodes,
3763 net->ipv6.rt6_stats->fib_rt_alloc,
3764 net->ipv6.rt6_stats->fib_rt_entries,
3765 net->ipv6.rt6_stats->fib_rt_cache,
3766 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3767 net->ipv6.rt6_stats->fib_discarded_routes);
3768
3769 return 0;
3770 }
3771
3772 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3773 {
3774 return single_open_net(inode, file, rt6_stats_seq_show);
3775 }
3776
3777 static const struct file_operations rt6_stats_seq_fops = {
3778 .owner = THIS_MODULE,
3779 .open = rt6_stats_seq_open,
3780 .read = seq_read,
3781 .llseek = seq_lseek,
3782 .release = single_release_net,
3783 };
3784 #endif /* CONFIG_PROC_FS */
3785
3786 #ifdef CONFIG_SYSCTL
3787
3788 static
3789 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3790 void __user *buffer, size_t *lenp, loff_t *ppos)
3791 {
3792 struct net *net;
3793 int delay;
3794 if (!write)
3795 return -EINVAL;
3796
3797 net = (struct net *)ctl->extra1;
3798 delay = net->ipv6.sysctl.flush_delay;
3799 proc_dointvec(ctl, write, buffer, lenp, ppos);
3800 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3801 return 0;
3802 }
3803
3804 struct ctl_table ipv6_route_table_template[] = {
3805 {
3806 .procname = "flush",
3807 .data = &init_net.ipv6.sysctl.flush_delay,
3808 .maxlen = sizeof(int),
3809 .mode = 0200,
3810 .proc_handler = ipv6_sysctl_rtcache_flush
3811 },
3812 {
3813 .procname = "gc_thresh",
3814 .data = &ip6_dst_ops_template.gc_thresh,
3815 .maxlen = sizeof(int),
3816 .mode = 0644,
3817 .proc_handler = proc_dointvec,
3818 },
3819 {
3820 .procname = "max_size",
3821 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
3822 .maxlen = sizeof(int),
3823 .mode = 0644,
3824 .proc_handler = proc_dointvec,
3825 },
3826 {
3827 .procname = "gc_min_interval",
3828 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3829 .maxlen = sizeof(int),
3830 .mode = 0644,
3831 .proc_handler = proc_dointvec_jiffies,
3832 },
3833 {
3834 .procname = "gc_timeout",
3835 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3836 .maxlen = sizeof(int),
3837 .mode = 0644,
3838 .proc_handler = proc_dointvec_jiffies,
3839 },
3840 {
3841 .procname = "gc_interval",
3842 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3843 .maxlen = sizeof(int),
3844 .mode = 0644,
3845 .proc_handler = proc_dointvec_jiffies,
3846 },
3847 {
3848 .procname = "gc_elasticity",
3849 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3850 .maxlen = sizeof(int),
3851 .mode = 0644,
3852 .proc_handler = proc_dointvec,
3853 },
3854 {
3855 .procname = "mtu_expires",
3856 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3857 .maxlen = sizeof(int),
3858 .mode = 0644,
3859 .proc_handler = proc_dointvec_jiffies,
3860 },
3861 {
3862 .procname = "min_adv_mss",
3863 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3864 .maxlen = sizeof(int),
3865 .mode = 0644,
3866 .proc_handler = proc_dointvec,
3867 },
3868 {
3869 .procname = "gc_min_interval_ms",
3870 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3871 .maxlen = sizeof(int),
3872 .mode = 0644,
3873 .proc_handler = proc_dointvec_ms_jiffies,
3874 },
3875 { }
3876 };
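/* Example (illustrative): these knobs surface under
 * /proc/sys/net/ipv6/route/, e.g.
 *	echo 1 > /proc/sys/net/ipv6/route/flush	# ipv6_sysctl_rtcache_flush()
 *	sysctl net.ipv6.route.gc_thresh=1024
 */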
3877
3878 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3879 {
3880 struct ctl_table *table;
3881
3882 table = kmemdup(ipv6_route_table_template,
3883 sizeof(ipv6_route_table_template),
3884 GFP_KERNEL);
3885
3886 if (table) {
3887 table[0].data = &net->ipv6.sysctl.flush_delay;
3888 table[0].extra1 = net;
3889 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3890 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3891 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3892 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3893 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3894 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3895 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3896 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3897 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3898
3899 /* Don't export sysctls to unprivileged users */
3900 if (net->user_ns != &init_user_ns)
3901 table[0].procname = NULL;
3902 }
3903
3904 return table;
3905 }
3906 #endif
3907
3908 static int __net_init ip6_route_net_init(struct net *net)
3909 {
3910 int ret = -ENOMEM;
3911
3912 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3913 sizeof(net->ipv6.ip6_dst_ops));
3914
3915 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3916 goto out_ip6_dst_ops;
3917
3918 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3919 sizeof(*net->ipv6.ip6_null_entry),
3920 GFP_KERNEL);
3921 if (!net->ipv6.ip6_null_entry)
3922 goto out_ip6_dst_entries;
3923 net->ipv6.ip6_null_entry->dst.path =
3924 (struct dst_entry *)net->ipv6.ip6_null_entry;
3925 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3926 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3927 ip6_template_metrics, true);
3928
3929 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3930 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3931 sizeof(*net->ipv6.ip6_prohibit_entry),
3932 GFP_KERNEL);
3933 if (!net->ipv6.ip6_prohibit_entry)
3934 goto out_ip6_null_entry;
3935 net->ipv6.ip6_prohibit_entry->dst.path =
3936 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3937 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3938 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3939 ip6_template_metrics, true);
3940
3941 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3942 sizeof(*net->ipv6.ip6_blk_hole_entry),
3943 GFP_KERNEL);
3944 if (!net->ipv6.ip6_blk_hole_entry)
3945 goto out_ip6_prohibit_entry;
3946 net->ipv6.ip6_blk_hole_entry->dst.path =
3947 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3948 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3949 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3950 ip6_template_metrics, true);
3951 #endif
3952
3953 net->ipv6.sysctl.flush_delay = 0;
3954 net->ipv6.sysctl.ip6_rt_max_size = 4096;
3955 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3956 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3957 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3958 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3959 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3960 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3961
3962 net->ipv6.ip6_rt_gc_expire = 30*HZ;
3963
3964 ret = 0;
3965 out:
3966 return ret;
3967
3968 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3969 out_ip6_prohibit_entry:
3970 kfree(net->ipv6.ip6_prohibit_entry);
3971 out_ip6_null_entry:
3972 kfree(net->ipv6.ip6_null_entry);
3973 #endif
3974 out_ip6_dst_entries:
3975 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3976 out_ip6_dst_ops:
3977 goto out;
3978 }
3979
3980 static void __net_exit ip6_route_net_exit(struct net *net)
3981 {
3982 kfree(net->ipv6.ip6_null_entry);
3983 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3984 kfree(net->ipv6.ip6_prohibit_entry);
3985 kfree(net->ipv6.ip6_blk_hole_entry);
3986 #endif
3987 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3988 }
3989
3990 static int __net_init ip6_route_net_init_late(struct net *net)
3991 {
3992 #ifdef CONFIG_PROC_FS
3993 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3994 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3995 #endif
3996 return 0;
3997 }
3998
3999 static void __net_exit ip6_route_net_exit_late(struct net *net)
4000 {
4001 #ifdef CONFIG_PROC_FS
4002 remove_proc_entry("ipv6_route", net->proc_net);
4003 remove_proc_entry("rt6_stats", net->proc_net);
4004 #endif
4005 }
4006
4007 static struct pernet_operations ip6_route_net_ops = {
4008 .init = ip6_route_net_init,
4009 .exit = ip6_route_net_exit,
4010 };
4011
4012 static int __net_init ipv6_inetpeer_init(struct net *net)
4013 {
4014 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4015
4016 if (!bp)
4017 return -ENOMEM;
4018 inet_peer_base_init(bp);
4019 net->ipv6.peers = bp;
4020 return 0;
4021 }
4022
4023 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4024 {
4025 struct inet_peer_base *bp = net->ipv6.peers;
4026
4027 net->ipv6.peers = NULL;
4028 inetpeer_invalidate_tree(bp);
4029 kfree(bp);
4030 }
4031
4032 static struct pernet_operations ipv6_inetpeer_ops = {
4033 .init = ipv6_inetpeer_init,
4034 .exit = ipv6_inetpeer_exit,
4035 };
4036
4037 static struct pernet_operations ip6_route_net_late_ops = {
4038 .init = ip6_route_net_init_late,
4039 .exit = ip6_route_net_exit_late,
4040 };
4041
4042 static struct notifier_block ip6_route_dev_notifier = {
4043 .notifier_call = ip6_route_dev_notify,
4044 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
4045 };
4046
4047 void __init ip6_route_init_special_entries(void)
4048 {
4049 /* Registration of the loopback device happens before this code runs,
4050 * so the loopback reference in rt6_info is not taken there; take it
4051 * manually for init_net */
4052 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4053 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4054 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4055 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4056 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4057 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4058 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4059 #endif
4060 }
4061
4062 int __init ip6_route_init(void)
4063 {
4064 int ret;
4065 int cpu;
4066
4067 ret = -ENOMEM;
4068 ip6_dst_ops_template.kmem_cachep =
4069 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4070 SLAB_HWCACHE_ALIGN, NULL);
4071 if (!ip6_dst_ops_template.kmem_cachep)
4072 goto out;
4073
4074 ret = dst_entries_init(&ip6_dst_blackhole_ops);
4075 if (ret)
4076 goto out_kmem_cache;
4077
4078 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4079 if (ret)
4080 goto out_dst_entries;
4081
4082 ret = register_pernet_subsys(&ip6_route_net_ops);
4083 if (ret)
4084 goto out_register_inetpeer;
4085
4086 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4087
4088 ret = fib6_init();
4089 if (ret)
4090 goto out_register_subsys;
4091
4092 ret = xfrm6_init();
4093 if (ret)
4094 goto out_fib6_init;
4095
4096 ret = fib6_rules_init();
4097 if (ret)
4098 goto xfrm6_init;
4099
4100 ret = register_pernet_subsys(&ip6_route_net_late_ops);
4101 if (ret)
4102 goto fib6_rules_init;
4103
4104 ret = -ENOBUFS;
4105 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
4106 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
4107 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
4108 goto out_register_late_subsys;
4109
4110 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4111 if (ret)
4112 goto out_register_late_subsys;
4113
4114 for_each_possible_cpu(cpu) {
4115 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4116
4117 INIT_LIST_HEAD(&ul->head);
4118 spin_lock_init(&ul->lock);
4119 }
4120
4121 out:
4122 return ret;
4123
4124 out_register_late_subsys:
4125 unregister_pernet_subsys(&ip6_route_net_late_ops);
4126 fib6_rules_init:
4127 fib6_rules_cleanup();
4128 xfrm6_init:
4129 xfrm6_fini();
4130 out_fib6_init:
4131 fib6_gc_cleanup();
4132 out_register_subsys:
4133 unregister_pernet_subsys(&ip6_route_net_ops);
4134 out_register_inetpeer:
4135 unregister_pernet_subsys(&ipv6_inetpeer_ops);
4136 out_dst_entries:
4137 dst_entries_destroy(&ip6_dst_blackhole_ops);
4138 out_kmem_cache:
4139 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4140 goto out;
4141 }
4142
4143 void ip6_route_cleanup(void)
4144 {
4145 unregister_netdevice_notifier(&ip6_route_dev_notifier);
4146 unregister_pernet_subsys(&ip6_route_net_late_ops);
4147 fib6_rules_cleanup();
4148 xfrm6_fini();
4149 fib6_gc_cleanup();
4150 unregister_pernet_subsys(&ipv6_inetpeer_ops);
4151 unregister_pernet_subsys(&ip6_route_net_ops);
4152 dst_entries_destroy(&ip6_dst_blackhole_ops);
4153 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4154 }