2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
55 #include <linux/rtnetlink.h>
57 #include <net/dst_metadata.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 #include <trace/events/fib6.h>
67 #include <linux/uaccess.h>
70 #include <linux/sysctl.h>
74 RT6_NUD_FAIL_HARD
= -3,
75 RT6_NUD_FAIL_PROBE
= -2,
76 RT6_NUD_FAIL_DO_RR
= -1,
80 static void ip6_rt_copy_init(struct rt6_info
*rt
, struct rt6_info
*ort
);
81 static struct dst_entry
*ip6_dst_check(struct dst_entry
*dst
, u32 cookie
);
82 static unsigned int ip6_default_advmss(const struct dst_entry
*dst
);
83 static unsigned int ip6_mtu(const struct dst_entry
*dst
);
84 static struct dst_entry
*ip6_negative_advice(struct dst_entry
*);
85 static void ip6_dst_destroy(struct dst_entry
*);
86 static void ip6_dst_ifdown(struct dst_entry
*,
87 struct net_device
*dev
, int how
);
88 static int ip6_dst_gc(struct dst_ops
*ops
);
90 static int ip6_pkt_discard(struct sk_buff
*skb
);
91 static int ip6_pkt_discard_out(struct net
*net
, struct sock
*sk
, struct sk_buff
*skb
);
92 static int ip6_pkt_prohibit(struct sk_buff
*skb
);
93 static int ip6_pkt_prohibit_out(struct net
*net
, struct sock
*sk
, struct sk_buff
*skb
);
94 static void ip6_link_failure(struct sk_buff
*skb
);
95 static void ip6_rt_update_pmtu(struct dst_entry
*dst
, struct sock
*sk
,
96 struct sk_buff
*skb
, u32 mtu
);
97 static void rt6_do_redirect(struct dst_entry
*dst
, struct sock
*sk
,
99 static void rt6_dst_from_metrics_check(struct rt6_info
*rt
);
100 static int rt6_score_route(struct rt6_info
*rt
, int oif
, int strict
);
101 static size_t rt6_nlmsg_size(struct rt6_info
*rt
);
102 static int rt6_fill_node(struct net
*net
,
103 struct sk_buff
*skb
, struct rt6_info
*rt
,
104 struct in6_addr
*dst
, struct in6_addr
*src
,
105 int iif
, int type
, u32 portid
, u32 seq
,
108 #ifdef CONFIG_IPV6_ROUTE_INFO
109 static struct rt6_info
*rt6_add_route_info(struct net
*net
,
110 const struct in6_addr
*prefix
, int prefixlen
,
111 const struct in6_addr
*gwaddr
,
112 struct net_device
*dev
,
114 static struct rt6_info
*rt6_get_route_info(struct net
*net
,
115 const struct in6_addr
*prefix
, int prefixlen
,
116 const struct in6_addr
*gwaddr
,
117 struct net_device
*dev
);
120 struct uncached_list
{
122 struct list_head head
;
125 static DEFINE_PER_CPU_ALIGNED(struct uncached_list
, rt6_uncached_list
);
127 static void rt6_uncached_list_add(struct rt6_info
*rt
)
129 struct uncached_list
*ul
= raw_cpu_ptr(&rt6_uncached_list
);
131 rt
->rt6i_uncached_list
= ul
;
133 spin_lock_bh(&ul
->lock
);
134 list_add_tail(&rt
->rt6i_uncached
, &ul
->head
);
135 spin_unlock_bh(&ul
->lock
);
138 static void rt6_uncached_list_del(struct rt6_info
*rt
)
140 if (!list_empty(&rt
->rt6i_uncached
)) {
141 struct uncached_list
*ul
= rt
->rt6i_uncached_list
;
143 spin_lock_bh(&ul
->lock
);
144 list_del(&rt
->rt6i_uncached
);
145 spin_unlock_bh(&ul
->lock
);
149 static void rt6_uncached_list_flush_dev(struct net
*net
, struct net_device
*dev
)
151 struct net_device
*loopback_dev
= net
->loopback_dev
;
154 if (dev
== loopback_dev
)
157 for_each_possible_cpu(cpu
) {
158 struct uncached_list
*ul
= per_cpu_ptr(&rt6_uncached_list
, cpu
);
161 spin_lock_bh(&ul
->lock
);
162 list_for_each_entry(rt
, &ul
->head
, rt6i_uncached
) {
163 struct inet6_dev
*rt_idev
= rt
->rt6i_idev
;
164 struct net_device
*rt_dev
= rt
->dst
.dev
;
166 if (rt_idev
->dev
== dev
) {
167 rt
->rt6i_idev
= in6_dev_get(loopback_dev
);
168 in6_dev_put(rt_idev
);
172 rt
->dst
.dev
= loopback_dev
;
173 dev_hold(rt
->dst
.dev
);
177 spin_unlock_bh(&ul
->lock
);
181 static u32
*rt6_pcpu_cow_metrics(struct rt6_info
*rt
)
183 return dst_metrics_write_ptr(rt
->dst
.from
);
186 static u32
*ipv6_cow_metrics(struct dst_entry
*dst
, unsigned long old
)
188 struct rt6_info
*rt
= (struct rt6_info
*)dst
;
190 if (rt
->rt6i_flags
& RTF_PCPU
)
191 return rt6_pcpu_cow_metrics(rt
);
192 else if (rt
->rt6i_flags
& RTF_CACHE
)
195 return dst_cow_metrics_generic(dst
, old
);
198 static inline const void *choose_neigh_daddr(struct rt6_info
*rt
,
202 struct in6_addr
*p
= &rt
->rt6i_gateway
;
204 if (!ipv6_addr_any(p
))
205 return (const void *) p
;
207 return &ipv6_hdr(skb
)->daddr
;
211 static struct neighbour
*ip6_neigh_lookup(const struct dst_entry
*dst
,
215 struct rt6_info
*rt
= (struct rt6_info
*) dst
;
218 daddr
= choose_neigh_daddr(rt
, skb
, daddr
);
219 n
= __ipv6_neigh_lookup(dst
->dev
, daddr
);
222 return neigh_create(&nd_tbl
, daddr
, dst
->dev
);
225 static void ip6_confirm_neigh(const struct dst_entry
*dst
, const void *daddr
)
227 struct net_device
*dev
= dst
->dev
;
228 struct rt6_info
*rt
= (struct rt6_info
*)dst
;
230 daddr
= choose_neigh_daddr(rt
, NULL
, daddr
);
233 if (dev
->flags
& (IFF_NOARP
| IFF_LOOPBACK
))
235 if (ipv6_addr_is_multicast((const struct in6_addr
*)daddr
))
237 __ipv6_confirm_neigh(dev
, daddr
);
240 static struct dst_ops ip6_dst_ops_template
= {
244 .check
= ip6_dst_check
,
245 .default_advmss
= ip6_default_advmss
,
247 .cow_metrics
= ipv6_cow_metrics
,
248 .destroy
= ip6_dst_destroy
,
249 .ifdown
= ip6_dst_ifdown
,
250 .negative_advice
= ip6_negative_advice
,
251 .link_failure
= ip6_link_failure
,
252 .update_pmtu
= ip6_rt_update_pmtu
,
253 .redirect
= rt6_do_redirect
,
254 .local_out
= __ip6_local_out
,
255 .neigh_lookup
= ip6_neigh_lookup
,
256 .confirm_neigh
= ip6_confirm_neigh
,
259 static unsigned int ip6_blackhole_mtu(const struct dst_entry
*dst
)
261 unsigned int mtu
= dst_metric_raw(dst
, RTAX_MTU
);
263 return mtu
? : dst
->dev
->mtu
;
266 static void ip6_rt_blackhole_update_pmtu(struct dst_entry
*dst
, struct sock
*sk
,
267 struct sk_buff
*skb
, u32 mtu
)
271 static void ip6_rt_blackhole_redirect(struct dst_entry
*dst
, struct sock
*sk
,
276 static struct dst_ops ip6_dst_blackhole_ops
= {
278 .destroy
= ip6_dst_destroy
,
279 .check
= ip6_dst_check
,
280 .mtu
= ip6_blackhole_mtu
,
281 .default_advmss
= ip6_default_advmss
,
282 .update_pmtu
= ip6_rt_blackhole_update_pmtu
,
283 .redirect
= ip6_rt_blackhole_redirect
,
284 .cow_metrics
= dst_cow_metrics_generic
,
285 .neigh_lookup
= ip6_neigh_lookup
,
288 static const u32 ip6_template_metrics
[RTAX_MAX
] = {
289 [RTAX_HOPLIMIT
- 1] = 0,
292 static const struct rt6_info ip6_null_entry_template
= {
294 .__refcnt
= ATOMIC_INIT(1),
296 .obsolete
= DST_OBSOLETE_FORCE_CHK
,
297 .error
= -ENETUNREACH
,
298 .input
= ip6_pkt_discard
,
299 .output
= ip6_pkt_discard_out
,
301 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
302 .rt6i_protocol
= RTPROT_KERNEL
,
303 .rt6i_metric
= ~(u32
) 0,
304 .rt6i_ref
= ATOMIC_INIT(1),
307 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
309 static const struct rt6_info ip6_prohibit_entry_template
= {
311 .__refcnt
= ATOMIC_INIT(1),
313 .obsolete
= DST_OBSOLETE_FORCE_CHK
,
315 .input
= ip6_pkt_prohibit
,
316 .output
= ip6_pkt_prohibit_out
,
318 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
319 .rt6i_protocol
= RTPROT_KERNEL
,
320 .rt6i_metric
= ~(u32
) 0,
321 .rt6i_ref
= ATOMIC_INIT(1),
324 static const struct rt6_info ip6_blk_hole_entry_template
= {
326 .__refcnt
= ATOMIC_INIT(1),
328 .obsolete
= DST_OBSOLETE_FORCE_CHK
,
330 .input
= dst_discard
,
331 .output
= dst_discard_out
,
333 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
334 .rt6i_protocol
= RTPROT_KERNEL
,
335 .rt6i_metric
= ~(u32
) 0,
336 .rt6i_ref
= ATOMIC_INIT(1),
341 static void rt6_info_init(struct rt6_info
*rt
)
343 struct dst_entry
*dst
= &rt
->dst
;
345 memset(dst
+ 1, 0, sizeof(*rt
) - sizeof(*dst
));
346 INIT_LIST_HEAD(&rt
->rt6i_siblings
);
347 INIT_LIST_HEAD(&rt
->rt6i_uncached
);
350 /* allocate dst with ip6_dst_ops */
351 static struct rt6_info
*__ip6_dst_alloc(struct net
*net
,
352 struct net_device
*dev
,
355 struct rt6_info
*rt
= dst_alloc(&net
->ipv6
.ip6_dst_ops
, dev
,
356 1, DST_OBSOLETE_FORCE_CHK
, flags
);
364 struct rt6_info
*ip6_dst_alloc(struct net
*net
,
365 struct net_device
*dev
,
368 struct rt6_info
*rt
= __ip6_dst_alloc(net
, dev
, flags
);
371 rt
->rt6i_pcpu
= alloc_percpu_gfp(struct rt6_info
*, GFP_ATOMIC
);
375 for_each_possible_cpu(cpu
) {
378 p
= per_cpu_ptr(rt
->rt6i_pcpu
, cpu
);
379 /* no one shares rt */
383 dst_release_immediate(&rt
->dst
);
390 EXPORT_SYMBOL(ip6_dst_alloc
);
392 static void ip6_dst_destroy(struct dst_entry
*dst
)
394 struct rt6_info
*rt
= (struct rt6_info
*)dst
;
395 struct dst_entry
*from
= dst
->from
;
396 struct inet6_dev
*idev
;
398 dst_destroy_metrics_generic(dst
);
399 free_percpu(rt
->rt6i_pcpu
);
400 rt6_uncached_list_del(rt
);
402 idev
= rt
->rt6i_idev
;
404 rt
->rt6i_idev
= NULL
;
412 static void ip6_dst_ifdown(struct dst_entry
*dst
, struct net_device
*dev
,
415 struct rt6_info
*rt
= (struct rt6_info
*)dst
;
416 struct inet6_dev
*idev
= rt
->rt6i_idev
;
417 struct net_device
*loopback_dev
=
418 dev_net(dev
)->loopback_dev
;
420 if (idev
&& idev
->dev
!= loopback_dev
) {
421 struct inet6_dev
*loopback_idev
= in6_dev_get(loopback_dev
);
423 rt
->rt6i_idev
= loopback_idev
;
429 static bool __rt6_check_expired(const struct rt6_info
*rt
)
431 if (rt
->rt6i_flags
& RTF_EXPIRES
)
432 return time_after(jiffies
, rt
->dst
.expires
);
437 static bool rt6_check_expired(const struct rt6_info
*rt
)
439 if (rt
->rt6i_flags
& RTF_EXPIRES
) {
440 if (time_after(jiffies
, rt
->dst
.expires
))
442 } else if (rt
->dst
.from
) {
443 return rt6_check_expired((struct rt6_info
*) rt
->dst
.from
);
448 /* Multipath route selection:
449 * Hash based function using packet header and flowlabel.
450 * Adapted from fib_info_hashfn()
/* Multipath next-hop selector: hash the flow and reduce modulo the
 * number of candidate routes.
 */
static int rt6_info_hash_nhsfn(unsigned int candidate_count,
			       const struct flowi6 *fl6)
{
	return get_hash_from_flowi6(fl6) % candidate_count;
}
458 static struct rt6_info
*rt6_multipath_select(struct rt6_info
*match
,
459 struct flowi6
*fl6
, int oif
,
462 struct rt6_info
*sibling
, *next_sibling
;
465 route_choosen
= rt6_info_hash_nhsfn(match
->rt6i_nsiblings
+ 1, fl6
);
466 /* Don't change the route, if route_choosen == 0
467 * (siblings does not include ourself)
470 list_for_each_entry_safe(sibling
, next_sibling
,
471 &match
->rt6i_siblings
, rt6i_siblings
) {
473 if (route_choosen
== 0) {
474 if (rt6_score_route(sibling
, oif
, strict
) < 0)
484 * Route lookup. Any table->tb6_lock is implied.
487 static inline struct rt6_info
*rt6_device_match(struct net
*net
,
489 const struct in6_addr
*saddr
,
493 struct rt6_info
*local
= NULL
;
494 struct rt6_info
*sprt
;
496 if (!oif
&& ipv6_addr_any(saddr
))
499 for (sprt
= rt
; sprt
; sprt
= sprt
->dst
.rt6_next
) {
500 struct net_device
*dev
= sprt
->dst
.dev
;
503 if (dev
->ifindex
== oif
)
505 if (dev
->flags
& IFF_LOOPBACK
) {
506 if (!sprt
->rt6i_idev
||
507 sprt
->rt6i_idev
->dev
->ifindex
!= oif
) {
508 if (flags
& RT6_LOOKUP_F_IFACE
)
511 local
->rt6i_idev
->dev
->ifindex
== oif
)
517 if (ipv6_chk_addr(net
, saddr
, dev
,
518 flags
& RT6_LOOKUP_F_IFACE
))
527 if (flags
& RT6_LOOKUP_F_IFACE
)
528 return net
->ipv6
.ip6_null_entry
;
534 #ifdef CONFIG_IPV6_ROUTER_PREF
535 struct __rt6_probe_work
{
536 struct work_struct work
;
537 struct in6_addr target
;
538 struct net_device
*dev
;
541 static void rt6_probe_deferred(struct work_struct
*w
)
543 struct in6_addr mcaddr
;
544 struct __rt6_probe_work
*work
=
545 container_of(w
, struct __rt6_probe_work
, work
);
547 addrconf_addr_solict_mult(&work
->target
, &mcaddr
);
548 ndisc_send_ns(work
->dev
, &work
->target
, &mcaddr
, NULL
, 0);
553 static void rt6_probe(struct rt6_info
*rt
)
555 struct __rt6_probe_work
*work
;
556 struct neighbour
*neigh
;
558 * Okay, this does not seem to be appropriate
559 * for now, however, we need to check if it
560 * is really so; aka Router Reachability Probing.
562 * Router Reachability Probe MUST be rate-limited
563 * to no more than one per minute.
565 if (!rt
|| !(rt
->rt6i_flags
& RTF_GATEWAY
))
568 neigh
= __ipv6_neigh_lookup_noref(rt
->dst
.dev
, &rt
->rt6i_gateway
);
570 if (neigh
->nud_state
& NUD_VALID
)
574 write_lock(&neigh
->lock
);
575 if (!(neigh
->nud_state
& NUD_VALID
) &&
578 rt
->rt6i_idev
->cnf
.rtr_probe_interval
)) {
579 work
= kmalloc(sizeof(*work
), GFP_ATOMIC
);
581 __neigh_set_probe_once(neigh
);
583 write_unlock(&neigh
->lock
);
585 work
= kmalloc(sizeof(*work
), GFP_ATOMIC
);
589 INIT_WORK(&work
->work
, rt6_probe_deferred
);
590 work
->target
= rt
->rt6i_gateway
;
591 dev_hold(rt
->dst
.dev
);
592 work
->dev
= rt
->dst
.dev
;
593 schedule_work(&work
->work
);
597 rcu_read_unlock_bh();
600 static inline void rt6_probe(struct rt6_info
*rt
)
606 * Default Router Selection (RFC 2461 6.3.6)
608 static inline int rt6_check_dev(struct rt6_info
*rt
, int oif
)
610 struct net_device
*dev
= rt
->dst
.dev
;
611 if (!oif
|| dev
->ifindex
== oif
)
613 if ((dev
->flags
& IFF_LOOPBACK
) &&
614 rt
->rt6i_idev
&& rt
->rt6i_idev
->dev
->ifindex
== oif
)
619 static inline enum rt6_nud_state
rt6_check_neigh(struct rt6_info
*rt
)
621 struct neighbour
*neigh
;
622 enum rt6_nud_state ret
= RT6_NUD_FAIL_HARD
;
624 if (rt
->rt6i_flags
& RTF_NONEXTHOP
||
625 !(rt
->rt6i_flags
& RTF_GATEWAY
))
626 return RT6_NUD_SUCCEED
;
629 neigh
= __ipv6_neigh_lookup_noref(rt
->dst
.dev
, &rt
->rt6i_gateway
);
631 read_lock(&neigh
->lock
);
632 if (neigh
->nud_state
& NUD_VALID
)
633 ret
= RT6_NUD_SUCCEED
;
634 #ifdef CONFIG_IPV6_ROUTER_PREF
635 else if (!(neigh
->nud_state
& NUD_FAILED
))
636 ret
= RT6_NUD_SUCCEED
;
638 ret
= RT6_NUD_FAIL_PROBE
;
640 read_unlock(&neigh
->lock
);
642 ret
= IS_ENABLED(CONFIG_IPV6_ROUTER_PREF
) ?
643 RT6_NUD_SUCCEED
: RT6_NUD_FAIL_DO_RR
;
645 rcu_read_unlock_bh();
650 static int rt6_score_route(struct rt6_info
*rt
, int oif
,
655 m
= rt6_check_dev(rt
, oif
);
656 if (!m
&& (strict
& RT6_LOOKUP_F_IFACE
))
657 return RT6_NUD_FAIL_HARD
;
658 #ifdef CONFIG_IPV6_ROUTER_PREF
659 m
|= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt
->rt6i_flags
)) << 2;
661 if (strict
& RT6_LOOKUP_F_REACHABLE
) {
662 int n
= rt6_check_neigh(rt
);
669 static struct rt6_info
*find_match(struct rt6_info
*rt
, int oif
, int strict
,
670 int *mpri
, struct rt6_info
*match
,
674 bool match_do_rr
= false;
675 struct inet6_dev
*idev
= rt
->rt6i_idev
;
676 struct net_device
*dev
= rt
->dst
.dev
;
678 if (dev
&& !netif_carrier_ok(dev
) &&
679 idev
->cnf
.ignore_routes_with_linkdown
&&
680 !(strict
& RT6_LOOKUP_F_IGNORE_LINKSTATE
))
683 if (rt6_check_expired(rt
))
686 m
= rt6_score_route(rt
, oif
, strict
);
687 if (m
== RT6_NUD_FAIL_DO_RR
) {
689 m
= 0; /* lowest valid score */
690 } else if (m
== RT6_NUD_FAIL_HARD
) {
694 if (strict
& RT6_LOOKUP_F_REACHABLE
)
697 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
699 *do_rr
= match_do_rr
;
707 static struct rt6_info
*find_rr_leaf(struct fib6_node
*fn
,
708 struct rt6_info
*rr_head
,
709 u32 metric
, int oif
, int strict
,
712 struct rt6_info
*rt
, *match
, *cont
;
717 for (rt
= rr_head
; rt
; rt
= rt
->dst
.rt6_next
) {
718 if (rt
->rt6i_metric
!= metric
) {
723 match
= find_match(rt
, oif
, strict
, &mpri
, match
, do_rr
);
726 for (rt
= fn
->leaf
; rt
&& rt
!= rr_head
; rt
= rt
->dst
.rt6_next
) {
727 if (rt
->rt6i_metric
!= metric
) {
732 match
= find_match(rt
, oif
, strict
, &mpri
, match
, do_rr
);
738 for (rt
= cont
; rt
; rt
= rt
->dst
.rt6_next
)
739 match
= find_match(rt
, oif
, strict
, &mpri
, match
, do_rr
);
744 static struct rt6_info
*rt6_select(struct fib6_node
*fn
, int oif
, int strict
)
746 struct rt6_info
*match
, *rt0
;
752 fn
->rr_ptr
= rt0
= fn
->leaf
;
754 match
= find_rr_leaf(fn
, rt0
, rt0
->rt6i_metric
, oif
, strict
,
758 struct rt6_info
*next
= rt0
->dst
.rt6_next
;
760 /* no entries matched; do round-robin */
761 if (!next
|| next
->rt6i_metric
!= rt0
->rt6i_metric
)
768 net
= dev_net(rt0
->dst
.dev
);
769 return match
? match
: net
->ipv6
.ip6_null_entry
;
772 static bool rt6_is_gw_or_nonexthop(const struct rt6_info
*rt
)
774 return (rt
->rt6i_flags
& (RTF_NONEXTHOP
| RTF_GATEWAY
));
777 #ifdef CONFIG_IPV6_ROUTE_INFO
778 int rt6_route_rcv(struct net_device
*dev
, u8
*opt
, int len
,
779 const struct in6_addr
*gwaddr
)
781 struct net
*net
= dev_net(dev
);
782 struct route_info
*rinfo
= (struct route_info
*) opt
;
783 struct in6_addr prefix_buf
, *prefix
;
785 unsigned long lifetime
;
788 if (len
< sizeof(struct route_info
)) {
792 /* Sanity check for prefix_len and length */
793 if (rinfo
->length
> 3) {
795 } else if (rinfo
->prefix_len
> 128) {
797 } else if (rinfo
->prefix_len
> 64) {
798 if (rinfo
->length
< 2) {
801 } else if (rinfo
->prefix_len
> 0) {
802 if (rinfo
->length
< 1) {
807 pref
= rinfo
->route_pref
;
808 if (pref
== ICMPV6_ROUTER_PREF_INVALID
)
811 lifetime
= addrconf_timeout_fixup(ntohl(rinfo
->lifetime
), HZ
);
813 if (rinfo
->length
== 3)
814 prefix
= (struct in6_addr
*)rinfo
->prefix
;
816 /* this function is safe */
817 ipv6_addr_prefix(&prefix_buf
,
818 (struct in6_addr
*)rinfo
->prefix
,
820 prefix
= &prefix_buf
;
823 if (rinfo
->prefix_len
== 0)
824 rt
= rt6_get_dflt_router(gwaddr
, dev
);
826 rt
= rt6_get_route_info(net
, prefix
, rinfo
->prefix_len
,
829 if (rt
&& !lifetime
) {
835 rt
= rt6_add_route_info(net
, prefix
, rinfo
->prefix_len
, gwaddr
,
838 rt
->rt6i_flags
= RTF_ROUTEINFO
|
839 (rt
->rt6i_flags
& ~RTF_PREF_MASK
) | RTF_PREF(pref
);
842 if (!addrconf_finite_timeout(lifetime
))
843 rt6_clean_expires(rt
);
845 rt6_set_expires(rt
, jiffies
+ HZ
* lifetime
);
853 static struct fib6_node
* fib6_backtrack(struct fib6_node
*fn
,
854 struct in6_addr
*saddr
)
856 struct fib6_node
*pn
;
858 if (fn
->fn_flags
& RTN_TL_ROOT
)
861 if (FIB6_SUBTREE(pn
) && FIB6_SUBTREE(pn
) != fn
)
862 fn
= fib6_lookup(FIB6_SUBTREE(pn
), NULL
, saddr
);
865 if (fn
->fn_flags
& RTN_RTINFO
)
870 static struct rt6_info
*ip6_pol_route_lookup(struct net
*net
,
871 struct fib6_table
*table
,
872 struct flowi6
*fl6
, int flags
)
874 struct fib6_node
*fn
;
877 read_lock_bh(&table
->tb6_lock
);
878 fn
= fib6_lookup(&table
->tb6_root
, &fl6
->daddr
, &fl6
->saddr
);
881 rt
= rt6_device_match(net
, rt
, &fl6
->saddr
, fl6
->flowi6_oif
, flags
);
882 if (rt
->rt6i_nsiblings
&& fl6
->flowi6_oif
== 0)
883 rt
= rt6_multipath_select(rt
, fl6
, fl6
->flowi6_oif
, flags
);
884 if (rt
== net
->ipv6
.ip6_null_entry
) {
885 fn
= fib6_backtrack(fn
, &fl6
->saddr
);
889 dst_use(&rt
->dst
, jiffies
);
890 read_unlock_bh(&table
->tb6_lock
);
892 trace_fib6_table_lookup(net
, rt
, table
->tb6_id
, fl6
);
898 struct dst_entry
*ip6_route_lookup(struct net
*net
, struct flowi6
*fl6
,
901 return fib6_rule_lookup(net
, fl6
, flags
, ip6_pol_route_lookup
);
903 EXPORT_SYMBOL_GPL(ip6_route_lookup
);
905 struct rt6_info
*rt6_lookup(struct net
*net
, const struct in6_addr
*daddr
,
906 const struct in6_addr
*saddr
, int oif
, int strict
)
908 struct flowi6 fl6
= {
912 struct dst_entry
*dst
;
913 int flags
= strict
? RT6_LOOKUP_F_IFACE
: 0;
916 memcpy(&fl6
.saddr
, saddr
, sizeof(*saddr
));
917 flags
|= RT6_LOOKUP_F_HAS_SADDR
;
920 dst
= fib6_rule_lookup(net
, &fl6
, flags
, ip6_pol_route_lookup
);
922 return (struct rt6_info
*) dst
;
928 EXPORT_SYMBOL(rt6_lookup
);
930 /* ip6_ins_rt is called with FREE table->tb6_lock.
931 * It takes new route entry, the addition fails by any reason the
933 * Caller must hold dst before calling it.
936 static int __ip6_ins_rt(struct rt6_info
*rt
, struct nl_info
*info
,
937 struct mx6_config
*mxc
,
938 struct netlink_ext_ack
*extack
)
941 struct fib6_table
*table
;
943 table
= rt
->rt6i_table
;
944 write_lock_bh(&table
->tb6_lock
);
945 err
= fib6_add(&table
->tb6_root
, rt
, info
, mxc
, extack
);
946 write_unlock_bh(&table
->tb6_lock
);
951 int ip6_ins_rt(struct rt6_info
*rt
)
953 struct nl_info info
= { .nl_net
= dev_net(rt
->dst
.dev
), };
954 struct mx6_config mxc
= { .mx
= NULL
, };
956 /* Hold dst to account for the reference from the fib6 tree */
958 return __ip6_ins_rt(rt
, &info
, &mxc
, NULL
);
961 static struct rt6_info
*ip6_rt_cache_alloc(struct rt6_info
*ort
,
962 const struct in6_addr
*daddr
,
963 const struct in6_addr
*saddr
)
971 if (ort
->rt6i_flags
& (RTF_CACHE
| RTF_PCPU
))
972 ort
= (struct rt6_info
*)ort
->dst
.from
;
974 rt
= __ip6_dst_alloc(dev_net(ort
->dst
.dev
), ort
->dst
.dev
, 0);
979 ip6_rt_copy_init(rt
, ort
);
980 rt
->rt6i_flags
|= RTF_CACHE
;
982 rt
->dst
.flags
|= DST_HOST
;
983 rt
->rt6i_dst
.addr
= *daddr
;
984 rt
->rt6i_dst
.plen
= 128;
986 if (!rt6_is_gw_or_nonexthop(ort
)) {
987 if (ort
->rt6i_dst
.plen
!= 128 &&
988 ipv6_addr_equal(&ort
->rt6i_dst
.addr
, daddr
))
989 rt
->rt6i_flags
|= RTF_ANYCAST
;
990 #ifdef CONFIG_IPV6_SUBTREES
991 if (rt
->rt6i_src
.plen
&& saddr
) {
992 rt
->rt6i_src
.addr
= *saddr
;
993 rt
->rt6i_src
.plen
= 128;
1001 static struct rt6_info
*ip6_rt_pcpu_alloc(struct rt6_info
*rt
)
1003 struct rt6_info
*pcpu_rt
;
1005 pcpu_rt
= __ip6_dst_alloc(dev_net(rt
->dst
.dev
),
1006 rt
->dst
.dev
, rt
->dst
.flags
);
1010 ip6_rt_copy_init(pcpu_rt
, rt
);
1011 pcpu_rt
->rt6i_protocol
= rt
->rt6i_protocol
;
1012 pcpu_rt
->rt6i_flags
|= RTF_PCPU
;
1016 /* It should be called with read_lock_bh(&tb6_lock) acquired */
1017 static struct rt6_info
*rt6_get_pcpu_route(struct rt6_info
*rt
)
1019 struct rt6_info
*pcpu_rt
, **p
;
1021 p
= this_cpu_ptr(rt
->rt6i_pcpu
);
1025 dst_hold(&pcpu_rt
->dst
);
1026 rt6_dst_from_metrics_check(pcpu_rt
);
1031 static struct rt6_info
*rt6_make_pcpu_route(struct rt6_info
*rt
)
1033 struct fib6_table
*table
= rt
->rt6i_table
;
1034 struct rt6_info
*pcpu_rt
, *prev
, **p
;
1036 pcpu_rt
= ip6_rt_pcpu_alloc(rt
);
1038 struct net
*net
= dev_net(rt
->dst
.dev
);
1040 dst_hold(&net
->ipv6
.ip6_null_entry
->dst
);
1041 return net
->ipv6
.ip6_null_entry
;
1044 read_lock_bh(&table
->tb6_lock
);
1045 if (rt
->rt6i_pcpu
) {
1046 p
= this_cpu_ptr(rt
->rt6i_pcpu
);
1047 prev
= cmpxchg(p
, NULL
, pcpu_rt
);
1049 /* If someone did it before us, return prev instead */
1050 dst_release_immediate(&pcpu_rt
->dst
);
1054 /* rt has been removed from the fib6 tree
1055 * before we have a chance to acquire the read_lock.
1056 * In this case, don't brother to create a pcpu rt
1057 * since rt is going away anyway. The next
1058 * dst_check() will trigger a re-lookup.
1060 dst_release_immediate(&pcpu_rt
->dst
);
1063 dst_hold(&pcpu_rt
->dst
);
1064 rt6_dst_from_metrics_check(pcpu_rt
);
1065 read_unlock_bh(&table
->tb6_lock
);
1069 struct rt6_info
*ip6_pol_route(struct net
*net
, struct fib6_table
*table
,
1070 int oif
, struct flowi6
*fl6
, int flags
)
1072 struct fib6_node
*fn
, *saved_fn
;
1073 struct rt6_info
*rt
;
1076 strict
|= flags
& RT6_LOOKUP_F_IFACE
;
1077 strict
|= flags
& RT6_LOOKUP_F_IGNORE_LINKSTATE
;
1078 if (net
->ipv6
.devconf_all
->forwarding
== 0)
1079 strict
|= RT6_LOOKUP_F_REACHABLE
;
1081 read_lock_bh(&table
->tb6_lock
);
1083 fn
= fib6_lookup(&table
->tb6_root
, &fl6
->daddr
, &fl6
->saddr
);
1086 if (fl6
->flowi6_flags
& FLOWI_FLAG_SKIP_NH_OIF
)
1090 rt
= rt6_select(fn
, oif
, strict
);
1091 if (rt
->rt6i_nsiblings
)
1092 rt
= rt6_multipath_select(rt
, fl6
, oif
, strict
);
1093 if (rt
== net
->ipv6
.ip6_null_entry
) {
1094 fn
= fib6_backtrack(fn
, &fl6
->saddr
);
1096 goto redo_rt6_select
;
1097 else if (strict
& RT6_LOOKUP_F_REACHABLE
) {
1098 /* also consider unreachable route */
1099 strict
&= ~RT6_LOOKUP_F_REACHABLE
;
1101 goto redo_rt6_select
;
1106 if (rt
== net
->ipv6
.ip6_null_entry
|| (rt
->rt6i_flags
& RTF_CACHE
)) {
1107 dst_use(&rt
->dst
, jiffies
);
1108 read_unlock_bh(&table
->tb6_lock
);
1110 rt6_dst_from_metrics_check(rt
);
1112 trace_fib6_table_lookup(net
, rt
, table
->tb6_id
, fl6
);
1114 } else if (unlikely((fl6
->flowi6_flags
& FLOWI_FLAG_KNOWN_NH
) &&
1115 !(rt
->rt6i_flags
& RTF_GATEWAY
))) {
1116 /* Create a RTF_CACHE clone which will not be
1117 * owned by the fib6 tree. It is for the special case where
1118 * the daddr in the skb during the neighbor look-up is different
1119 * from the fl6->daddr used to look-up route here.
1122 struct rt6_info
*uncached_rt
;
1124 dst_use(&rt
->dst
, jiffies
);
1125 read_unlock_bh(&table
->tb6_lock
);
1127 uncached_rt
= ip6_rt_cache_alloc(rt
, &fl6
->daddr
, NULL
);
1128 dst_release(&rt
->dst
);
1131 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1132 * No need for another dst_hold()
1134 rt6_uncached_list_add(uncached_rt
);
1136 uncached_rt
= net
->ipv6
.ip6_null_entry
;
1137 dst_hold(&uncached_rt
->dst
);
1140 trace_fib6_table_lookup(net
, uncached_rt
, table
->tb6_id
, fl6
);
1144 /* Get a percpu copy */
1146 struct rt6_info
*pcpu_rt
;
1148 rt
->dst
.lastuse
= jiffies
;
1150 pcpu_rt
= rt6_get_pcpu_route(rt
);
1153 read_unlock_bh(&table
->tb6_lock
);
1155 /* We have to do the read_unlock first
1156 * because rt6_make_pcpu_route() may trigger
1157 * ip6_dst_gc() which will take the write_lock.
1160 read_unlock_bh(&table
->tb6_lock
);
1161 pcpu_rt
= rt6_make_pcpu_route(rt
);
1162 dst_release(&rt
->dst
);
1165 trace_fib6_table_lookup(net
, pcpu_rt
, table
->tb6_id
, fl6
);
1170 EXPORT_SYMBOL_GPL(ip6_pol_route
);
1172 static struct rt6_info
*ip6_pol_route_input(struct net
*net
, struct fib6_table
*table
,
1173 struct flowi6
*fl6
, int flags
)
1175 return ip6_pol_route(net
, table
, fl6
->flowi6_iif
, fl6
, flags
);
1178 struct dst_entry
*ip6_route_input_lookup(struct net
*net
,
1179 struct net_device
*dev
,
1180 struct flowi6
*fl6
, int flags
)
1182 if (rt6_need_strict(&fl6
->daddr
) && dev
->type
!= ARPHRD_PIMREG
)
1183 flags
|= RT6_LOOKUP_F_IFACE
;
1185 return fib6_rule_lookup(net
, fl6
, flags
, ip6_pol_route_input
);
1187 EXPORT_SYMBOL_GPL(ip6_route_input_lookup
);
1189 void ip6_route_input(struct sk_buff
*skb
)
1191 const struct ipv6hdr
*iph
= ipv6_hdr(skb
);
1192 struct net
*net
= dev_net(skb
->dev
);
1193 int flags
= RT6_LOOKUP_F_HAS_SADDR
;
1194 struct ip_tunnel_info
*tun_info
;
1195 struct flowi6 fl6
= {
1196 .flowi6_iif
= skb
->dev
->ifindex
,
1197 .daddr
= iph
->daddr
,
1198 .saddr
= iph
->saddr
,
1199 .flowlabel
= ip6_flowinfo(iph
),
1200 .flowi6_mark
= skb
->mark
,
1201 .flowi6_proto
= iph
->nexthdr
,
1204 tun_info
= skb_tunnel_info(skb
);
1205 if (tun_info
&& !(tun_info
->mode
& IP_TUNNEL_INFO_TX
))
1206 fl6
.flowi6_tun_key
.tun_id
= tun_info
->key
.tun_id
;
1208 skb_dst_set(skb
, ip6_route_input_lookup(net
, skb
->dev
, &fl6
, flags
));
1211 static struct rt6_info
*ip6_pol_route_output(struct net
*net
, struct fib6_table
*table
,
1212 struct flowi6
*fl6
, int flags
)
1214 return ip6_pol_route(net
, table
, fl6
->flowi6_oif
, fl6
, flags
);
1217 struct dst_entry
*ip6_route_output_flags(struct net
*net
, const struct sock
*sk
,
1218 struct flowi6
*fl6
, int flags
)
1222 if (rt6_need_strict(&fl6
->daddr
)) {
1223 struct dst_entry
*dst
;
1225 dst
= l3mdev_link_scope_lookup(net
, fl6
);
1230 fl6
->flowi6_iif
= LOOPBACK_IFINDEX
;
1232 any_src
= ipv6_addr_any(&fl6
->saddr
);
1233 if ((sk
&& sk
->sk_bound_dev_if
) || rt6_need_strict(&fl6
->daddr
) ||
1234 (fl6
->flowi6_oif
&& any_src
))
1235 flags
|= RT6_LOOKUP_F_IFACE
;
1238 flags
|= RT6_LOOKUP_F_HAS_SADDR
;
1240 flags
|= rt6_srcprefs2flags(inet6_sk(sk
)->srcprefs
);
1242 return fib6_rule_lookup(net
, fl6
, flags
, ip6_pol_route_output
);
1244 EXPORT_SYMBOL_GPL(ip6_route_output_flags
);
1246 struct dst_entry
*ip6_blackhole_route(struct net
*net
, struct dst_entry
*dst_orig
)
1248 struct rt6_info
*rt
, *ort
= (struct rt6_info
*) dst_orig
;
1249 struct net_device
*loopback_dev
= net
->loopback_dev
;
1250 struct dst_entry
*new = NULL
;
1252 rt
= dst_alloc(&ip6_dst_blackhole_ops
, loopback_dev
, 1,
1253 DST_OBSOLETE_NONE
, 0);
1259 new->input
= dst_discard
;
1260 new->output
= dst_discard_out
;
1262 dst_copy_metrics(new, &ort
->dst
);
1264 rt
->rt6i_idev
= in6_dev_get(loopback_dev
);
1265 rt
->rt6i_gateway
= ort
->rt6i_gateway
;
1266 rt
->rt6i_flags
= ort
->rt6i_flags
& ~RTF_PCPU
;
1267 rt
->rt6i_metric
= 0;
1269 memcpy(&rt
->rt6i_dst
, &ort
->rt6i_dst
, sizeof(struct rt6key
));
1270 #ifdef CONFIG_IPV6_SUBTREES
1271 memcpy(&rt
->rt6i_src
, &ort
->rt6i_src
, sizeof(struct rt6key
));
1275 dst_release(dst_orig
);
1276 return new ? new : ERR_PTR(-ENOMEM
);
1280 * Destination cache support functions
1283 static void rt6_dst_from_metrics_check(struct rt6_info
*rt
)
1286 dst_metrics_ptr(&rt
->dst
) != dst_metrics_ptr(rt
->dst
.from
))
1287 dst_init_metrics(&rt
->dst
, dst_metrics_ptr(rt
->dst
.from
), true);
1290 static struct dst_entry
*rt6_check(struct rt6_info
*rt
, u32 cookie
)
1294 if (!rt6_get_cookie_safe(rt
, &rt_cookie
) || rt_cookie
!= cookie
)
1297 if (rt6_check_expired(rt
))
1303 static struct dst_entry
*rt6_dst_from_check(struct rt6_info
*rt
, u32 cookie
)
1305 if (!__rt6_check_expired(rt
) &&
1306 rt
->dst
.obsolete
== DST_OBSOLETE_FORCE_CHK
&&
1307 rt6_check((struct rt6_info
*)(rt
->dst
.from
), cookie
))
1313 static struct dst_entry
*ip6_dst_check(struct dst_entry
*dst
, u32 cookie
)
1315 struct rt6_info
*rt
;
1317 rt
= (struct rt6_info
*) dst
;
1319 /* All IPV6 dsts are created with ->obsolete set to the value
1320 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1321 * into this function always.
1324 rt6_dst_from_metrics_check(rt
);
1326 if (rt
->rt6i_flags
& RTF_PCPU
||
1327 (unlikely(!list_empty(&rt
->rt6i_uncached
)) && rt
->dst
.from
))
1328 return rt6_dst_from_check(rt
, cookie
);
1330 return rt6_check(rt
, cookie
);
1333 static struct dst_entry
*ip6_negative_advice(struct dst_entry
*dst
)
1335 struct rt6_info
*rt
= (struct rt6_info
*) dst
;
1338 if (rt
->rt6i_flags
& RTF_CACHE
) {
1339 if (rt6_check_expired(rt
)) {
1351 static void ip6_link_failure(struct sk_buff
*skb
)
1353 struct rt6_info
*rt
;
1355 icmpv6_send(skb
, ICMPV6_DEST_UNREACH
, ICMPV6_ADDR_UNREACH
, 0);
1357 rt
= (struct rt6_info
*) skb_dst(skb
);
1359 if (rt
->rt6i_flags
& RTF_CACHE
) {
1360 if (dst_hold_safe(&rt
->dst
))
1363 struct fib6_node
*fn
;
1366 fn
= rcu_dereference(rt
->rt6i_node
);
1367 if (fn
&& (rt
->rt6i_flags
& RTF_DEFAULT
))
1374 static void rt6_do_update_pmtu(struct rt6_info
*rt
, u32 mtu
)
1376 struct net
*net
= dev_net(rt
->dst
.dev
);
1378 rt
->rt6i_flags
|= RTF_MODIFIED
;
1379 rt
->rt6i_pmtu
= mtu
;
1380 rt6_update_expires(rt
, net
->ipv6
.sysctl
.ip6_rt_mtu_expires
);
1383 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info
*rt
)
1385 return !(rt
->rt6i_flags
& RTF_CACHE
) &&
1386 (rt
->rt6i_flags
& RTF_PCPU
||
1387 rcu_access_pointer(rt
->rt6i_node
));
1390 static void __ip6_rt_update_pmtu(struct dst_entry
*dst
, const struct sock
*sk
,
1391 const struct ipv6hdr
*iph
, u32 mtu
)
1393 const struct in6_addr
*daddr
, *saddr
;
1394 struct rt6_info
*rt6
= (struct rt6_info
*)dst
;
1396 if (rt6
->rt6i_flags
& RTF_LOCAL
)
1399 if (dst_metric_locked(dst
, RTAX_MTU
))
1403 daddr
= &iph
->daddr
;
1404 saddr
= &iph
->saddr
;
1406 daddr
= &sk
->sk_v6_daddr
;
1407 saddr
= &inet6_sk(sk
)->saddr
;
1412 dst_confirm_neigh(dst
, daddr
);
1413 mtu
= max_t(u32
, mtu
, IPV6_MIN_MTU
);
1414 if (mtu
>= dst_mtu(dst
))
1417 if (!rt6_cache_allowed_for_pmtu(rt6
)) {
1418 rt6_do_update_pmtu(rt6
, mtu
);
1420 struct rt6_info
*nrt6
;
1422 nrt6
= ip6_rt_cache_alloc(rt6
, daddr
, saddr
);
1424 rt6_do_update_pmtu(nrt6
, mtu
);
1426 /* ip6_ins_rt(nrt6) will bump the
1427 * rt6->rt6i_node->fn_sernum
1428 * which will fail the next rt6_check() and
1429 * invalidate the sk->sk_dst_cache.
1432 /* Release the reference taken in
1433 * ip6_rt_cache_alloc()
1435 dst_release(&nrt6
->dst
);
1440 static void ip6_rt_update_pmtu(struct dst_entry
*dst
, struct sock
*sk
,
1441 struct sk_buff
*skb
, u32 mtu
)
1443 __ip6_rt_update_pmtu(dst
, sk
, skb
? ipv6_hdr(skb
) : NULL
, mtu
);
1446 void ip6_update_pmtu(struct sk_buff
*skb
, struct net
*net
, __be32 mtu
,
1447 int oif
, u32 mark
, kuid_t uid
)
1449 const struct ipv6hdr
*iph
= (struct ipv6hdr
*) skb
->data
;
1450 struct dst_entry
*dst
;
1453 memset(&fl6
, 0, sizeof(fl6
));
1454 fl6
.flowi6_oif
= oif
;
1455 fl6
.flowi6_mark
= mark
? mark
: IP6_REPLY_MARK(net
, skb
->mark
);
1456 fl6
.daddr
= iph
->daddr
;
1457 fl6
.saddr
= iph
->saddr
;
1458 fl6
.flowlabel
= ip6_flowinfo(iph
);
1459 fl6
.flowi6_uid
= uid
;
1461 dst
= ip6_route_output(net
, NULL
, &fl6
);
1463 __ip6_rt_update_pmtu(dst
, NULL
, iph
, ntohl(mtu
));
1466 EXPORT_SYMBOL_GPL(ip6_update_pmtu
);
1468 void ip6_sk_update_pmtu(struct sk_buff
*skb
, struct sock
*sk
, __be32 mtu
)
1470 struct dst_entry
*dst
;
1472 ip6_update_pmtu(skb
, sock_net(sk
), mtu
,
1473 sk
->sk_bound_dev_if
, sk
->sk_mark
, sk
->sk_uid
);
1475 dst
= __sk_dst_get(sk
);
1476 if (!dst
|| !dst
->obsolete
||
1477 dst
->ops
->check(dst
, inet6_sk(sk
)->dst_cookie
))
1481 if (!sock_owned_by_user(sk
) && !ipv6_addr_v4mapped(&sk
->sk_v6_daddr
))
1482 ip6_datagram_dst_update(sk
, false);
1485 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu
);
1487 /* Handle redirects */
1488 struct ip6rd_flowi
{
1490 struct in6_addr gateway
;
1493 static struct rt6_info
*__ip6_route_redirect(struct net
*net
,
1494 struct fib6_table
*table
,
1498 struct ip6rd_flowi
*rdfl
= (struct ip6rd_flowi
*)fl6
;
1499 struct rt6_info
*rt
;
1500 struct fib6_node
*fn
;
1502 /* Get the "current" route for this destination and
1503 * check if the redirect has come from appropriate router.
1505 * RFC 4861 specifies that redirects should only be
1506 * accepted if they come from the nexthop to the target.
1507 * Due to the way the routes are chosen, this notion
1508 * is a bit fuzzy and one might need to check all possible
1512 read_lock_bh(&table
->tb6_lock
);
1513 fn
= fib6_lookup(&table
->tb6_root
, &fl6
->daddr
, &fl6
->saddr
);
1515 for (rt
= fn
->leaf
; rt
; rt
= rt
->dst
.rt6_next
) {
1516 if (rt6_check_expired(rt
))
1520 if (!(rt
->rt6i_flags
& RTF_GATEWAY
))
1522 if (fl6
->flowi6_oif
!= rt
->dst
.dev
->ifindex
)
1524 if (!ipv6_addr_equal(&rdfl
->gateway
, &rt
->rt6i_gateway
))
1530 rt
= net
->ipv6
.ip6_null_entry
;
1531 else if (rt
->dst
.error
) {
1532 rt
= net
->ipv6
.ip6_null_entry
;
1536 if (rt
== net
->ipv6
.ip6_null_entry
) {
1537 fn
= fib6_backtrack(fn
, &fl6
->saddr
);
1545 read_unlock_bh(&table
->tb6_lock
);
1547 trace_fib6_table_lookup(net
, rt
, table
->tb6_id
, fl6
);
1551 static struct dst_entry
*ip6_route_redirect(struct net
*net
,
1552 const struct flowi6
*fl6
,
1553 const struct in6_addr
*gateway
)
1555 int flags
= RT6_LOOKUP_F_HAS_SADDR
;
1556 struct ip6rd_flowi rdfl
;
1559 rdfl
.gateway
= *gateway
;
1561 return fib6_rule_lookup(net
, &rdfl
.fl6
,
1562 flags
, __ip6_route_redirect
);
1565 void ip6_redirect(struct sk_buff
*skb
, struct net
*net
, int oif
, u32 mark
,
1568 const struct ipv6hdr
*iph
= (struct ipv6hdr
*) skb
->data
;
1569 struct dst_entry
*dst
;
1572 memset(&fl6
, 0, sizeof(fl6
));
1573 fl6
.flowi6_iif
= LOOPBACK_IFINDEX
;
1574 fl6
.flowi6_oif
= oif
;
1575 fl6
.flowi6_mark
= mark
;
1576 fl6
.daddr
= iph
->daddr
;
1577 fl6
.saddr
= iph
->saddr
;
1578 fl6
.flowlabel
= ip6_flowinfo(iph
);
1579 fl6
.flowi6_uid
= uid
;
1581 dst
= ip6_route_redirect(net
, &fl6
, &ipv6_hdr(skb
)->saddr
);
1582 rt6_do_redirect(dst
, NULL
, skb
);
1585 EXPORT_SYMBOL_GPL(ip6_redirect
);
1587 void ip6_redirect_no_header(struct sk_buff
*skb
, struct net
*net
, int oif
,
1590 const struct ipv6hdr
*iph
= ipv6_hdr(skb
);
1591 const struct rd_msg
*msg
= (struct rd_msg
*)icmp6_hdr(skb
);
1592 struct dst_entry
*dst
;
1595 memset(&fl6
, 0, sizeof(fl6
));
1596 fl6
.flowi6_iif
= LOOPBACK_IFINDEX
;
1597 fl6
.flowi6_oif
= oif
;
1598 fl6
.flowi6_mark
= mark
;
1599 fl6
.daddr
= msg
->dest
;
1600 fl6
.saddr
= iph
->daddr
;
1601 fl6
.flowi6_uid
= sock_net_uid(net
, NULL
);
1603 dst
= ip6_route_redirect(net
, &fl6
, &iph
->saddr
);
1604 rt6_do_redirect(dst
, NULL
, skb
);
1608 void ip6_sk_redirect(struct sk_buff
*skb
, struct sock
*sk
)
1610 ip6_redirect(skb
, sock_net(sk
), sk
->sk_bound_dev_if
, sk
->sk_mark
,
1613 EXPORT_SYMBOL_GPL(ip6_sk_redirect
);
1615 static unsigned int ip6_default_advmss(const struct dst_entry
*dst
)
1617 struct net_device
*dev
= dst
->dev
;
1618 unsigned int mtu
= dst_mtu(dst
);
1619 struct net
*net
= dev_net(dev
);
1621 mtu
-= sizeof(struct ipv6hdr
) + sizeof(struct tcphdr
);
1623 if (mtu
< net
->ipv6
.sysctl
.ip6_rt_min_advmss
)
1624 mtu
= net
->ipv6
.sysctl
.ip6_rt_min_advmss
;
1627 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1628 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1629 * IPV6_MAXPLEN is also valid and means: "any MSS,
1630 * rely only on pmtu discovery"
1632 if (mtu
> IPV6_MAXPLEN
- sizeof(struct tcphdr
))
1637 static unsigned int ip6_mtu(const struct dst_entry
*dst
)
1639 const struct rt6_info
*rt
= (const struct rt6_info
*)dst
;
1640 unsigned int mtu
= rt
->rt6i_pmtu
;
1641 struct inet6_dev
*idev
;
1646 mtu
= dst_metric_raw(dst
, RTAX_MTU
);
1653 idev
= __in6_dev_get(dst
->dev
);
1655 mtu
= idev
->cnf
.mtu6
;
1659 mtu
= min_t(unsigned int, mtu
, IP6_MAX_MTU
);
1661 return mtu
- lwtunnel_headroom(dst
->lwtstate
, mtu
);
1664 struct dst_entry
*icmp6_dst_alloc(struct net_device
*dev
,
1667 struct dst_entry
*dst
;
1668 struct rt6_info
*rt
;
1669 struct inet6_dev
*idev
= in6_dev_get(dev
);
1670 struct net
*net
= dev_net(dev
);
1672 if (unlikely(!idev
))
1673 return ERR_PTR(-ENODEV
);
1675 rt
= ip6_dst_alloc(net
, dev
, 0);
1676 if (unlikely(!rt
)) {
1678 dst
= ERR_PTR(-ENOMEM
);
1682 rt
->dst
.flags
|= DST_HOST
;
1683 rt
->dst
.output
= ip6_output
;
1684 rt
->rt6i_gateway
= fl6
->daddr
;
1685 rt
->rt6i_dst
.addr
= fl6
->daddr
;
1686 rt
->rt6i_dst
.plen
= 128;
1687 rt
->rt6i_idev
= idev
;
1688 dst_metric_set(&rt
->dst
, RTAX_HOPLIMIT
, 0);
1690 /* Add this dst into uncached_list so that rt6_ifdown() can
1691 * do proper release of the net_device
1693 rt6_uncached_list_add(rt
);
1695 dst
= xfrm_lookup(net
, &rt
->dst
, flowi6_to_flowi(fl6
), NULL
, 0);
1701 static int ip6_dst_gc(struct dst_ops
*ops
)
1703 struct net
*net
= container_of(ops
, struct net
, ipv6
.ip6_dst_ops
);
1704 int rt_min_interval
= net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
1705 int rt_max_size
= net
->ipv6
.sysctl
.ip6_rt_max_size
;
1706 int rt_elasticity
= net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
1707 int rt_gc_timeout
= net
->ipv6
.sysctl
.ip6_rt_gc_timeout
;
1708 unsigned long rt_last_gc
= net
->ipv6
.ip6_rt_last_gc
;
1711 entries
= dst_entries_get_fast(ops
);
1712 if (time_after(rt_last_gc
+ rt_min_interval
, jiffies
) &&
1713 entries
<= rt_max_size
)
1716 net
->ipv6
.ip6_rt_gc_expire
++;
1717 fib6_run_gc(net
->ipv6
.ip6_rt_gc_expire
, net
, true);
1718 entries
= dst_entries_get_slow(ops
);
1719 if (entries
< ops
->gc_thresh
)
1720 net
->ipv6
.ip6_rt_gc_expire
= rt_gc_timeout
>>1;
1722 net
->ipv6
.ip6_rt_gc_expire
-= net
->ipv6
.ip6_rt_gc_expire
>>rt_elasticity
;
1723 return entries
> rt_max_size
;
1726 static int ip6_convert_metrics(struct mx6_config
*mxc
,
1727 const struct fib6_config
*cfg
)
1729 bool ecn_ca
= false;
1737 mp
= kzalloc(sizeof(u32
) * RTAX_MAX
, GFP_KERNEL
);
1741 nla_for_each_attr(nla
, cfg
->fc_mx
, cfg
->fc_mx_len
, remaining
) {
1742 int type
= nla_type(nla
);
1747 if (unlikely(type
> RTAX_MAX
))
1750 if (type
== RTAX_CC_ALGO
) {
1751 char tmp
[TCP_CA_NAME_MAX
];
1753 nla_strlcpy(tmp
, nla
, sizeof(tmp
));
1754 val
= tcp_ca_get_key_by_name(tmp
, &ecn_ca
);
1755 if (val
== TCP_CA_UNSPEC
)
1758 val
= nla_get_u32(nla
);
1760 if (type
== RTAX_HOPLIMIT
&& val
> 255)
1762 if (type
== RTAX_FEATURES
&& (val
& ~RTAX_FEATURE_MASK
))
1766 __set_bit(type
- 1, mxc
->mx_valid
);
1770 __set_bit(RTAX_FEATURES
- 1, mxc
->mx_valid
);
1771 mp
[RTAX_FEATURES
- 1] |= DST_FEATURE_ECN_CA
;
1781 static struct rt6_info
*ip6_nh_lookup_table(struct net
*net
,
1782 struct fib6_config
*cfg
,
1783 const struct in6_addr
*gw_addr
)
1785 struct flowi6 fl6
= {
1786 .flowi6_oif
= cfg
->fc_ifindex
,
1788 .saddr
= cfg
->fc_prefsrc
,
1790 struct fib6_table
*table
;
1791 struct rt6_info
*rt
;
1792 int flags
= RT6_LOOKUP_F_IFACE
| RT6_LOOKUP_F_IGNORE_LINKSTATE
;
1794 table
= fib6_get_table(net
, cfg
->fc_table
);
1798 if (!ipv6_addr_any(&cfg
->fc_prefsrc
))
1799 flags
|= RT6_LOOKUP_F_HAS_SADDR
;
1801 rt
= ip6_pol_route(net
, table
, cfg
->fc_ifindex
, &fl6
, flags
);
1803 /* if table lookup failed, fall back to full lookup */
1804 if (rt
== net
->ipv6
.ip6_null_entry
) {
1812 static struct rt6_info
*ip6_route_info_create(struct fib6_config
*cfg
,
1813 struct netlink_ext_ack
*extack
)
1815 struct net
*net
= cfg
->fc_nlinfo
.nl_net
;
1816 struct rt6_info
*rt
= NULL
;
1817 struct net_device
*dev
= NULL
;
1818 struct inet6_dev
*idev
= NULL
;
1819 struct fib6_table
*table
;
1823 /* RTF_PCPU is an internal flag; can not be set by userspace */
1824 if (cfg
->fc_flags
& RTF_PCPU
) {
1825 NL_SET_ERR_MSG(extack
, "Userspace can not set RTF_PCPU");
1829 if (cfg
->fc_dst_len
> 128) {
1830 NL_SET_ERR_MSG(extack
, "Invalid prefix length");
1833 if (cfg
->fc_src_len
> 128) {
1834 NL_SET_ERR_MSG(extack
, "Invalid source address length");
1837 #ifndef CONFIG_IPV6_SUBTREES
1838 if (cfg
->fc_src_len
) {
1839 NL_SET_ERR_MSG(extack
,
1840 "Specifying source address requires IPV6_SUBTREES to be enabled");
1844 if (cfg
->fc_ifindex
) {
1846 dev
= dev_get_by_index(net
, cfg
->fc_ifindex
);
1849 idev
= in6_dev_get(dev
);
1854 if (cfg
->fc_metric
== 0)
1855 cfg
->fc_metric
= IP6_RT_PRIO_USER
;
1858 if (cfg
->fc_nlinfo
.nlh
&&
1859 !(cfg
->fc_nlinfo
.nlh
->nlmsg_flags
& NLM_F_CREATE
)) {
1860 table
= fib6_get_table(net
, cfg
->fc_table
);
1862 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1863 table
= fib6_new_table(net
, cfg
->fc_table
);
1866 table
= fib6_new_table(net
, cfg
->fc_table
);
1872 rt
= ip6_dst_alloc(net
, NULL
,
1873 (cfg
->fc_flags
& RTF_ADDRCONF
) ? 0 : DST_NOCOUNT
);
1880 if (cfg
->fc_flags
& RTF_EXPIRES
)
1881 rt6_set_expires(rt
, jiffies
+
1882 clock_t_to_jiffies(cfg
->fc_expires
));
1884 rt6_clean_expires(rt
);
1886 if (cfg
->fc_protocol
== RTPROT_UNSPEC
)
1887 cfg
->fc_protocol
= RTPROT_BOOT
;
1888 rt
->rt6i_protocol
= cfg
->fc_protocol
;
1890 addr_type
= ipv6_addr_type(&cfg
->fc_dst
);
1892 if (addr_type
& IPV6_ADDR_MULTICAST
)
1893 rt
->dst
.input
= ip6_mc_input
;
1894 else if (cfg
->fc_flags
& RTF_LOCAL
)
1895 rt
->dst
.input
= ip6_input
;
1897 rt
->dst
.input
= ip6_forward
;
1899 rt
->dst
.output
= ip6_output
;
1901 if (cfg
->fc_encap
) {
1902 struct lwtunnel_state
*lwtstate
;
1904 err
= lwtunnel_build_state(cfg
->fc_encap_type
,
1905 cfg
->fc_encap
, AF_INET6
, cfg
,
1909 rt
->dst
.lwtstate
= lwtstate_get(lwtstate
);
1910 if (lwtunnel_output_redirect(rt
->dst
.lwtstate
)) {
1911 rt
->dst
.lwtstate
->orig_output
= rt
->dst
.output
;
1912 rt
->dst
.output
= lwtunnel_output
;
1914 if (lwtunnel_input_redirect(rt
->dst
.lwtstate
)) {
1915 rt
->dst
.lwtstate
->orig_input
= rt
->dst
.input
;
1916 rt
->dst
.input
= lwtunnel_input
;
1920 ipv6_addr_prefix(&rt
->rt6i_dst
.addr
, &cfg
->fc_dst
, cfg
->fc_dst_len
);
1921 rt
->rt6i_dst
.plen
= cfg
->fc_dst_len
;
1922 if (rt
->rt6i_dst
.plen
== 128)
1923 rt
->dst
.flags
|= DST_HOST
;
1925 #ifdef CONFIG_IPV6_SUBTREES
1926 ipv6_addr_prefix(&rt
->rt6i_src
.addr
, &cfg
->fc_src
, cfg
->fc_src_len
);
1927 rt
->rt6i_src
.plen
= cfg
->fc_src_len
;
1930 rt
->rt6i_metric
= cfg
->fc_metric
;
1932 /* We cannot add true routes via loopback here,
1933 they would result in kernel looping; promote them to reject routes
1935 if ((cfg
->fc_flags
& RTF_REJECT
) ||
1936 (dev
&& (dev
->flags
& IFF_LOOPBACK
) &&
1937 !(addr_type
& IPV6_ADDR_LOOPBACK
) &&
1938 !(cfg
->fc_flags
& RTF_LOCAL
))) {
1939 /* hold loopback dev/idev if we haven't done so. */
1940 if (dev
!= net
->loopback_dev
) {
1945 dev
= net
->loopback_dev
;
1947 idev
= in6_dev_get(dev
);
1953 rt
->rt6i_flags
= RTF_REJECT
|RTF_NONEXTHOP
;
1954 switch (cfg
->fc_type
) {
1956 rt
->dst
.error
= -EINVAL
;
1957 rt
->dst
.output
= dst_discard_out
;
1958 rt
->dst
.input
= dst_discard
;
1961 rt
->dst
.error
= -EACCES
;
1962 rt
->dst
.output
= ip6_pkt_prohibit_out
;
1963 rt
->dst
.input
= ip6_pkt_prohibit
;
1966 case RTN_UNREACHABLE
:
1968 rt
->dst
.error
= (cfg
->fc_type
== RTN_THROW
) ? -EAGAIN
1969 : (cfg
->fc_type
== RTN_UNREACHABLE
)
1970 ? -EHOSTUNREACH
: -ENETUNREACH
;
1971 rt
->dst
.output
= ip6_pkt_discard_out
;
1972 rt
->dst
.input
= ip6_pkt_discard
;
1978 if (cfg
->fc_flags
& RTF_GATEWAY
) {
1979 const struct in6_addr
*gw_addr
;
1982 gw_addr
= &cfg
->fc_gateway
;
1983 gwa_type
= ipv6_addr_type(gw_addr
);
1985 /* if gw_addr is local we will fail to detect this in case
1986 * address is still TENTATIVE (DAD in progress). rt6_lookup()
1987 * will return already-added prefix route via interface that
1988 * prefix route was assigned to, which might be non-loopback.
1991 if (ipv6_chk_addr_and_flags(net
, gw_addr
,
1992 gwa_type
& IPV6_ADDR_LINKLOCAL
?
1993 dev
: NULL
, 0, 0)) {
1994 NL_SET_ERR_MSG(extack
, "Invalid gateway address");
1997 rt
->rt6i_gateway
= *gw_addr
;
1999 if (gwa_type
!= (IPV6_ADDR_LINKLOCAL
|IPV6_ADDR_UNICAST
)) {
2000 struct rt6_info
*grt
= NULL
;
2002 /* IPv6 strictly inhibits using not link-local
2003 addresses as nexthop address.
2004 Otherwise, router will not able to send redirects.
2005 It is very good, but in some (rare!) circumstances
2006 (SIT, PtP, NBMA NOARP links) it is handy to allow
2007 some exceptions. --ANK
2008 We allow IPv4-mapped nexthops to support RFC4798-type
2011 if (!(gwa_type
& (IPV6_ADDR_UNICAST
|
2012 IPV6_ADDR_MAPPED
))) {
2013 NL_SET_ERR_MSG(extack
,
2014 "Invalid gateway address");
2018 if (cfg
->fc_table
) {
2019 grt
= ip6_nh_lookup_table(net
, cfg
, gw_addr
);
2022 if (grt
->rt6i_flags
& RTF_GATEWAY
||
2023 (dev
&& dev
!= grt
->dst
.dev
)) {
2031 grt
= rt6_lookup(net
, gw_addr
, NULL
,
2032 cfg
->fc_ifindex
, 1);
2034 err
= -EHOSTUNREACH
;
2038 if (dev
!= grt
->dst
.dev
) {
2044 idev
= grt
->rt6i_idev
;
2046 in6_dev_hold(grt
->rt6i_idev
);
2048 if (!(grt
->rt6i_flags
& RTF_GATEWAY
))
2057 NL_SET_ERR_MSG(extack
, "Egress device not specified");
2059 } else if (dev
->flags
& IFF_LOOPBACK
) {
2060 NL_SET_ERR_MSG(extack
,
2061 "Egress device can not be loopback device for this route");
2070 if (!ipv6_addr_any(&cfg
->fc_prefsrc
)) {
2071 if (!ipv6_chk_addr(net
, &cfg
->fc_prefsrc
, dev
, 0)) {
2072 NL_SET_ERR_MSG(extack
, "Invalid source address");
2076 rt
->rt6i_prefsrc
.addr
= cfg
->fc_prefsrc
;
2077 rt
->rt6i_prefsrc
.plen
= 128;
2079 rt
->rt6i_prefsrc
.plen
= 0;
2081 rt
->rt6i_flags
= cfg
->fc_flags
;
2085 rt
->rt6i_idev
= idev
;
2086 rt
->rt6i_table
= table
;
2088 cfg
->fc_nlinfo
.nl_net
= dev_net(dev
);
2097 dst_release_immediate(&rt
->dst
);
2099 return ERR_PTR(err
);
2102 int ip6_route_add(struct fib6_config
*cfg
,
2103 struct netlink_ext_ack
*extack
)
2105 struct mx6_config mxc
= { .mx
= NULL
, };
2106 struct rt6_info
*rt
;
2109 rt
= ip6_route_info_create(cfg
, extack
);
2116 err
= ip6_convert_metrics(&mxc
, cfg
);
2120 err
= __ip6_ins_rt(rt
, &cfg
->fc_nlinfo
, &mxc
, extack
);
2127 dst_release_immediate(&rt
->dst
);
2132 static int __ip6_del_rt(struct rt6_info
*rt
, struct nl_info
*info
)
2135 struct fib6_table
*table
;
2136 struct net
*net
= dev_net(rt
->dst
.dev
);
2138 if (rt
== net
->ipv6
.ip6_null_entry
) {
2143 table
= rt
->rt6i_table
;
2144 write_lock_bh(&table
->tb6_lock
);
2145 err
= fib6_del(rt
, info
);
2146 write_unlock_bh(&table
->tb6_lock
);
2153 int ip6_del_rt(struct rt6_info
*rt
)
2155 struct nl_info info
= {
2156 .nl_net
= dev_net(rt
->dst
.dev
),
2158 return __ip6_del_rt(rt
, &info
);
2161 static int __ip6_del_rt_siblings(struct rt6_info
*rt
, struct fib6_config
*cfg
)
2163 struct nl_info
*info
= &cfg
->fc_nlinfo
;
2164 struct net
*net
= info
->nl_net
;
2165 struct sk_buff
*skb
= NULL
;
2166 struct fib6_table
*table
;
2169 if (rt
== net
->ipv6
.ip6_null_entry
)
2171 table
= rt
->rt6i_table
;
2172 write_lock_bh(&table
->tb6_lock
);
2174 if (rt
->rt6i_nsiblings
&& cfg
->fc_delete_all_nh
) {
2175 struct rt6_info
*sibling
, *next_sibling
;
2177 /* prefer to send a single notification with all hops */
2178 skb
= nlmsg_new(rt6_nlmsg_size(rt
), gfp_any());
2180 u32 seq
= info
->nlh
? info
->nlh
->nlmsg_seq
: 0;
2182 if (rt6_fill_node(net
, skb
, rt
,
2183 NULL
, NULL
, 0, RTM_DELROUTE
,
2184 info
->portid
, seq
, 0) < 0) {
2188 info
->skip_notify
= 1;
2191 list_for_each_entry_safe(sibling
, next_sibling
,
2194 err
= fib6_del(sibling
, info
);
2200 err
= fib6_del(rt
, info
);
2202 write_unlock_bh(&table
->tb6_lock
);
2207 rtnl_notify(skb
, net
, info
->portid
, RTNLGRP_IPV6_ROUTE
,
2208 info
->nlh
, gfp_any());
2213 static int ip6_route_del(struct fib6_config
*cfg
,
2214 struct netlink_ext_ack
*extack
)
2216 struct fib6_table
*table
;
2217 struct fib6_node
*fn
;
2218 struct rt6_info
*rt
;
2221 table
= fib6_get_table(cfg
->fc_nlinfo
.nl_net
, cfg
->fc_table
);
2223 NL_SET_ERR_MSG(extack
, "FIB table does not exist");
2227 read_lock_bh(&table
->tb6_lock
);
2229 fn
= fib6_locate(&table
->tb6_root
,
2230 &cfg
->fc_dst
, cfg
->fc_dst_len
,
2231 &cfg
->fc_src
, cfg
->fc_src_len
);
2234 for (rt
= fn
->leaf
; rt
; rt
= rt
->dst
.rt6_next
) {
2235 if ((rt
->rt6i_flags
& RTF_CACHE
) &&
2236 !(cfg
->fc_flags
& RTF_CACHE
))
2238 if (cfg
->fc_ifindex
&&
2240 rt
->dst
.dev
->ifindex
!= cfg
->fc_ifindex
))
2242 if (cfg
->fc_flags
& RTF_GATEWAY
&&
2243 !ipv6_addr_equal(&cfg
->fc_gateway
, &rt
->rt6i_gateway
))
2245 if (cfg
->fc_metric
&& cfg
->fc_metric
!= rt
->rt6i_metric
)
2247 if (cfg
->fc_protocol
&& cfg
->fc_protocol
!= rt
->rt6i_protocol
)
2250 read_unlock_bh(&table
->tb6_lock
);
2252 /* if gateway was specified only delete the one hop */
2253 if (cfg
->fc_flags
& RTF_GATEWAY
)
2254 return __ip6_del_rt(rt
, &cfg
->fc_nlinfo
);
2256 return __ip6_del_rt_siblings(rt
, cfg
);
2259 read_unlock_bh(&table
->tb6_lock
);
2264 static void rt6_do_redirect(struct dst_entry
*dst
, struct sock
*sk
, struct sk_buff
*skb
)
2266 struct netevent_redirect netevent
;
2267 struct rt6_info
*rt
, *nrt
= NULL
;
2268 struct ndisc_options ndopts
;
2269 struct inet6_dev
*in6_dev
;
2270 struct neighbour
*neigh
;
2272 int optlen
, on_link
;
2275 optlen
= skb_tail_pointer(skb
) - skb_transport_header(skb
);
2276 optlen
-= sizeof(*msg
);
2279 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2283 msg
= (struct rd_msg
*)icmp6_hdr(skb
);
2285 if (ipv6_addr_is_multicast(&msg
->dest
)) {
2286 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2291 if (ipv6_addr_equal(&msg
->dest
, &msg
->target
)) {
2293 } else if (ipv6_addr_type(&msg
->target
) !=
2294 (IPV6_ADDR_UNICAST
|IPV6_ADDR_LINKLOCAL
)) {
2295 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2299 in6_dev
= __in6_dev_get(skb
->dev
);
2302 if (in6_dev
->cnf
.forwarding
|| !in6_dev
->cnf
.accept_redirects
)
2306 * The IP source address of the Redirect MUST be the same as the current
2307 * first-hop router for the specified ICMP Destination Address.
2310 if (!ndisc_parse_options(skb
->dev
, msg
->opt
, optlen
, &ndopts
)) {
2311 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2316 if (ndopts
.nd_opts_tgt_lladdr
) {
2317 lladdr
= ndisc_opt_addr_data(ndopts
.nd_opts_tgt_lladdr
,
2320 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2325 rt
= (struct rt6_info
*) dst
;
2326 if (rt
->rt6i_flags
& RTF_REJECT
) {
2327 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2331 /* Redirect received -> path was valid.
2332 * Look, redirects are sent only in response to data packets,
2333 * so that this nexthop apparently is reachable. --ANK
2335 dst_confirm_neigh(&rt
->dst
, &ipv6_hdr(skb
)->saddr
);
2337 neigh
= __neigh_lookup(&nd_tbl
, &msg
->target
, skb
->dev
, 1);
2342 * We have finally decided to accept it.
2345 ndisc_update(skb
->dev
, neigh
, lladdr
, NUD_STALE
,
2346 NEIGH_UPDATE_F_WEAK_OVERRIDE
|
2347 NEIGH_UPDATE_F_OVERRIDE
|
2348 (on_link
? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER
|
2349 NEIGH_UPDATE_F_ISROUTER
)),
2350 NDISC_REDIRECT
, &ndopts
);
2352 nrt
= ip6_rt_cache_alloc(rt
, &msg
->dest
, NULL
);
2356 nrt
->rt6i_flags
= RTF_GATEWAY
|RTF_UP
|RTF_DYNAMIC
|RTF_CACHE
;
2358 nrt
->rt6i_flags
&= ~RTF_GATEWAY
;
2360 nrt
->rt6i_protocol
= RTPROT_REDIRECT
;
2361 nrt
->rt6i_gateway
= *(struct in6_addr
*)neigh
->primary_key
;
2363 if (ip6_ins_rt(nrt
))
2366 netevent
.old
= &rt
->dst
;
2367 netevent
.new = &nrt
->dst
;
2368 netevent
.daddr
= &msg
->dest
;
2369 netevent
.neigh
= neigh
;
2370 call_netevent_notifiers(NETEVENT_REDIRECT
, &netevent
);
2372 if (rt
->rt6i_flags
& RTF_CACHE
) {
2373 rt
= (struct rt6_info
*) dst_clone(&rt
->dst
);
2378 /* Release the reference taken in
2379 * ip6_rt_cache_alloc()
2381 dst_release(&nrt
->dst
);
2384 neigh_release(neigh
);
2388 * Misc support functions
2391 static void rt6_set_from(struct rt6_info
*rt
, struct rt6_info
*from
)
2393 BUG_ON(from
->dst
.from
);
2395 rt
->rt6i_flags
&= ~RTF_EXPIRES
;
2396 dst_hold(&from
->dst
);
2397 rt
->dst
.from
= &from
->dst
;
2398 dst_init_metrics(&rt
->dst
, dst_metrics_ptr(&from
->dst
), true);
2401 static void ip6_rt_copy_init(struct rt6_info
*rt
, struct rt6_info
*ort
)
2403 rt
->dst
.input
= ort
->dst
.input
;
2404 rt
->dst
.output
= ort
->dst
.output
;
2405 rt
->rt6i_dst
= ort
->rt6i_dst
;
2406 rt
->dst
.error
= ort
->dst
.error
;
2407 rt
->rt6i_idev
= ort
->rt6i_idev
;
2409 in6_dev_hold(rt
->rt6i_idev
);
2410 rt
->dst
.lastuse
= jiffies
;
2411 rt
->rt6i_gateway
= ort
->rt6i_gateway
;
2412 rt
->rt6i_flags
= ort
->rt6i_flags
;
2413 rt6_set_from(rt
, ort
);
2414 rt
->rt6i_metric
= ort
->rt6i_metric
;
2415 #ifdef CONFIG_IPV6_SUBTREES
2416 rt
->rt6i_src
= ort
->rt6i_src
;
2418 rt
->rt6i_prefsrc
= ort
->rt6i_prefsrc
;
2419 rt
->rt6i_table
= ort
->rt6i_table
;
2420 rt
->dst
.lwtstate
= lwtstate_get(ort
->dst
.lwtstate
);
2423 #ifdef CONFIG_IPV6_ROUTE_INFO
2424 static struct rt6_info
*rt6_get_route_info(struct net
*net
,
2425 const struct in6_addr
*prefix
, int prefixlen
,
2426 const struct in6_addr
*gwaddr
,
2427 struct net_device
*dev
)
2429 u32 tb_id
= l3mdev_fib_table(dev
) ? : RT6_TABLE_INFO
;
2430 int ifindex
= dev
->ifindex
;
2431 struct fib6_node
*fn
;
2432 struct rt6_info
*rt
= NULL
;
2433 struct fib6_table
*table
;
2435 table
= fib6_get_table(net
, tb_id
);
2439 read_lock_bh(&table
->tb6_lock
);
2440 fn
= fib6_locate(&table
->tb6_root
, prefix
, prefixlen
, NULL
, 0);
2444 for (rt
= fn
->leaf
; rt
; rt
= rt
->dst
.rt6_next
) {
2445 if (rt
->dst
.dev
->ifindex
!= ifindex
)
2447 if ((rt
->rt6i_flags
& (RTF_ROUTEINFO
|RTF_GATEWAY
)) != (RTF_ROUTEINFO
|RTF_GATEWAY
))
2449 if (!ipv6_addr_equal(&rt
->rt6i_gateway
, gwaddr
))
2455 read_unlock_bh(&table
->tb6_lock
);
2459 static struct rt6_info
*rt6_add_route_info(struct net
*net
,
2460 const struct in6_addr
*prefix
, int prefixlen
,
2461 const struct in6_addr
*gwaddr
,
2462 struct net_device
*dev
,
2465 struct fib6_config cfg
= {
2466 .fc_metric
= IP6_RT_PRIO_USER
,
2467 .fc_ifindex
= dev
->ifindex
,
2468 .fc_dst_len
= prefixlen
,
2469 .fc_flags
= RTF_GATEWAY
| RTF_ADDRCONF
| RTF_ROUTEINFO
|
2470 RTF_UP
| RTF_PREF(pref
),
2471 .fc_protocol
= RTPROT_RA
,
2472 .fc_nlinfo
.portid
= 0,
2473 .fc_nlinfo
.nlh
= NULL
,
2474 .fc_nlinfo
.nl_net
= net
,
2477 cfg
.fc_table
= l3mdev_fib_table(dev
) ? : RT6_TABLE_INFO
,
2478 cfg
.fc_dst
= *prefix
;
2479 cfg
.fc_gateway
= *gwaddr
;
2481 /* We should treat it as a default route if prefix length is 0. */
2483 cfg
.fc_flags
|= RTF_DEFAULT
;
2485 ip6_route_add(&cfg
, NULL
);
2487 return rt6_get_route_info(net
, prefix
, prefixlen
, gwaddr
, dev
);
2491 struct rt6_info
*rt6_get_dflt_router(const struct in6_addr
*addr
, struct net_device
*dev
)
2493 u32 tb_id
= l3mdev_fib_table(dev
) ? : RT6_TABLE_DFLT
;
2494 struct rt6_info
*rt
;
2495 struct fib6_table
*table
;
2497 table
= fib6_get_table(dev_net(dev
), tb_id
);
2501 read_lock_bh(&table
->tb6_lock
);
2502 for (rt
= table
->tb6_root
.leaf
; rt
; rt
= rt
->dst
.rt6_next
) {
2503 if (dev
== rt
->dst
.dev
&&
2504 ((rt
->rt6i_flags
& (RTF_ADDRCONF
| RTF_DEFAULT
)) == (RTF_ADDRCONF
| RTF_DEFAULT
)) &&
2505 ipv6_addr_equal(&rt
->rt6i_gateway
, addr
))
2510 read_unlock_bh(&table
->tb6_lock
);
2514 struct rt6_info
*rt6_add_dflt_router(const struct in6_addr
*gwaddr
,
2515 struct net_device
*dev
,
2518 struct fib6_config cfg
= {
2519 .fc_table
= l3mdev_fib_table(dev
) ? : RT6_TABLE_DFLT
,
2520 .fc_metric
= IP6_RT_PRIO_USER
,
2521 .fc_ifindex
= dev
->ifindex
,
2522 .fc_flags
= RTF_GATEWAY
| RTF_ADDRCONF
| RTF_DEFAULT
|
2523 RTF_UP
| RTF_EXPIRES
| RTF_PREF(pref
),
2524 .fc_protocol
= RTPROT_RA
,
2525 .fc_nlinfo
.portid
= 0,
2526 .fc_nlinfo
.nlh
= NULL
,
2527 .fc_nlinfo
.nl_net
= dev_net(dev
),
2530 cfg
.fc_gateway
= *gwaddr
;
2532 if (!ip6_route_add(&cfg
, NULL
)) {
2533 struct fib6_table
*table
;
2535 table
= fib6_get_table(dev_net(dev
), cfg
.fc_table
);
2537 table
->flags
|= RT6_TABLE_HAS_DFLT_ROUTER
;
2540 return rt6_get_dflt_router(gwaddr
, dev
);
2543 static void __rt6_purge_dflt_routers(struct fib6_table
*table
)
2545 struct rt6_info
*rt
;
2548 read_lock_bh(&table
->tb6_lock
);
2549 for (rt
= table
->tb6_root
.leaf
; rt
; rt
= rt
->dst
.rt6_next
) {
2550 if (rt
->rt6i_flags
& (RTF_DEFAULT
| RTF_ADDRCONF
) &&
2551 (!rt
->rt6i_idev
|| rt
->rt6i_idev
->cnf
.accept_ra
!= 2)) {
2553 read_unlock_bh(&table
->tb6_lock
);
2558 read_unlock_bh(&table
->tb6_lock
);
2560 table
->flags
&= ~RT6_TABLE_HAS_DFLT_ROUTER
;
2563 void rt6_purge_dflt_routers(struct net
*net
)
2565 struct fib6_table
*table
;
2566 struct hlist_head
*head
;
2571 for (h
= 0; h
< FIB6_TABLE_HASHSZ
; h
++) {
2572 head
= &net
->ipv6
.fib_table_hash
[h
];
2573 hlist_for_each_entry_rcu(table
, head
, tb6_hlist
) {
2574 if (table
->flags
& RT6_TABLE_HAS_DFLT_ROUTER
)
2575 __rt6_purge_dflt_routers(table
);
2582 static void rtmsg_to_fib6_config(struct net
*net
,
2583 struct in6_rtmsg
*rtmsg
,
2584 struct fib6_config
*cfg
)
2586 memset(cfg
, 0, sizeof(*cfg
));
2588 cfg
->fc_table
= l3mdev_fib_table_by_index(net
, rtmsg
->rtmsg_ifindex
) ?
2590 cfg
->fc_ifindex
= rtmsg
->rtmsg_ifindex
;
2591 cfg
->fc_metric
= rtmsg
->rtmsg_metric
;
2592 cfg
->fc_expires
= rtmsg
->rtmsg_info
;
2593 cfg
->fc_dst_len
= rtmsg
->rtmsg_dst_len
;
2594 cfg
->fc_src_len
= rtmsg
->rtmsg_src_len
;
2595 cfg
->fc_flags
= rtmsg
->rtmsg_flags
;
2597 cfg
->fc_nlinfo
.nl_net
= net
;
2599 cfg
->fc_dst
= rtmsg
->rtmsg_dst
;
2600 cfg
->fc_src
= rtmsg
->rtmsg_src
;
2601 cfg
->fc_gateway
= rtmsg
->rtmsg_gateway
;
2604 int ipv6_route_ioctl(struct net
*net
, unsigned int cmd
, void __user
*arg
)
2606 struct fib6_config cfg
;
2607 struct in6_rtmsg rtmsg
;
2611 case SIOCADDRT
: /* Add a route */
2612 case SIOCDELRT
: /* Delete a route */
2613 if (!ns_capable(net
->user_ns
, CAP_NET_ADMIN
))
2615 err
= copy_from_user(&rtmsg
, arg
,
2616 sizeof(struct in6_rtmsg
));
2620 rtmsg_to_fib6_config(net
, &rtmsg
, &cfg
);
2625 err
= ip6_route_add(&cfg
, NULL
);
2628 err
= ip6_route_del(&cfg
, NULL
);
2642 * Drop the packet on the floor
2645 static int ip6_pkt_drop(struct sk_buff
*skb
, u8 code
, int ipstats_mib_noroutes
)
2648 struct dst_entry
*dst
= skb_dst(skb
);
2649 switch (ipstats_mib_noroutes
) {
2650 case IPSTATS_MIB_INNOROUTES
:
2651 type
= ipv6_addr_type(&ipv6_hdr(skb
)->daddr
);
2652 if (type
== IPV6_ADDR_ANY
) {
2653 IP6_INC_STATS(dev_net(dst
->dev
), ip6_dst_idev(dst
),
2654 IPSTATS_MIB_INADDRERRORS
);
2658 case IPSTATS_MIB_OUTNOROUTES
:
2659 IP6_INC_STATS(dev_net(dst
->dev
), ip6_dst_idev(dst
),
2660 ipstats_mib_noroutes
);
2663 icmpv6_send(skb
, ICMPV6_DEST_UNREACH
, code
, 0);
2668 static int ip6_pkt_discard(struct sk_buff
*skb
)
2670 return ip6_pkt_drop(skb
, ICMPV6_NOROUTE
, IPSTATS_MIB_INNOROUTES
);
2673 static int ip6_pkt_discard_out(struct net
*net
, struct sock
*sk
, struct sk_buff
*skb
)
2675 skb
->dev
= skb_dst(skb
)->dev
;
2676 return ip6_pkt_drop(skb
, ICMPV6_NOROUTE
, IPSTATS_MIB_OUTNOROUTES
);
2679 static int ip6_pkt_prohibit(struct sk_buff
*skb
)
2681 return ip6_pkt_drop(skb
, ICMPV6_ADM_PROHIBITED
, IPSTATS_MIB_INNOROUTES
);
2684 static int ip6_pkt_prohibit_out(struct net
*net
, struct sock
*sk
, struct sk_buff
*skb
)
2686 skb
->dev
= skb_dst(skb
)->dev
;
2687 return ip6_pkt_drop(skb
, ICMPV6_ADM_PROHIBITED
, IPSTATS_MIB_OUTNOROUTES
);
2691 * Allocate a dst for local (unicast / anycast) address.
2694 struct rt6_info
*addrconf_dst_alloc(struct inet6_dev
*idev
,
2695 const struct in6_addr
*addr
,
2699 struct net
*net
= dev_net(idev
->dev
);
2700 struct net_device
*dev
= net
->loopback_dev
;
2701 struct rt6_info
*rt
;
2703 /* use L3 Master device as loopback for host routes if device
2704 * is enslaved and address is not link local or multicast
2706 if (!rt6_need_strict(addr
))
2707 dev
= l3mdev_master_dev_rcu(idev
->dev
) ? : dev
;
2709 rt
= ip6_dst_alloc(net
, dev
, DST_NOCOUNT
);
2711 return ERR_PTR(-ENOMEM
);
2715 rt
->dst
.flags
|= DST_HOST
;
2716 rt
->dst
.input
= ip6_input
;
2717 rt
->dst
.output
= ip6_output
;
2718 rt
->rt6i_idev
= idev
;
2720 rt
->rt6i_protocol
= RTPROT_KERNEL
;
2721 rt
->rt6i_flags
= RTF_UP
| RTF_NONEXTHOP
;
2723 rt
->rt6i_flags
|= RTF_ANYCAST
;
2725 rt
->rt6i_flags
|= RTF_LOCAL
;
2727 rt
->rt6i_gateway
= *addr
;
2728 rt
->rt6i_dst
.addr
= *addr
;
2729 rt
->rt6i_dst
.plen
= 128;
2730 tb_id
= l3mdev_fib_table(idev
->dev
) ? : RT6_TABLE_LOCAL
;
2731 rt
->rt6i_table
= fib6_get_table(net
, tb_id
);
2736 /* remove deleted ip from prefsrc entries */
2737 struct arg_dev_net_ip
{
2738 struct net_device
*dev
;
2740 struct in6_addr
*addr
;
2743 static int fib6_remove_prefsrc(struct rt6_info
*rt
, void *arg
)
2745 struct net_device
*dev
= ((struct arg_dev_net_ip
*)arg
)->dev
;
2746 struct net
*net
= ((struct arg_dev_net_ip
*)arg
)->net
;
2747 struct in6_addr
*addr
= ((struct arg_dev_net_ip
*)arg
)->addr
;
2749 if (((void *)rt
->dst
.dev
== dev
|| !dev
) &&
2750 rt
!= net
->ipv6
.ip6_null_entry
&&
2751 ipv6_addr_equal(addr
, &rt
->rt6i_prefsrc
.addr
)) {
2752 /* remove prefsrc entry */
2753 rt
->rt6i_prefsrc
.plen
= 0;
2758 void rt6_remove_prefsrc(struct inet6_ifaddr
*ifp
)
2760 struct net
*net
= dev_net(ifp
->idev
->dev
);
2761 struct arg_dev_net_ip adni
= {
2762 .dev
= ifp
->idev
->dev
,
2766 fib6_clean_all(net
, fib6_remove_prefsrc
, &adni
);
2769 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2770 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
2772 /* Remove routers and update dst entries when gateway turn into host. */
2773 static int fib6_clean_tohost(struct rt6_info
*rt
, void *arg
)
2775 struct in6_addr
*gateway
= (struct in6_addr
*)arg
;
2777 if ((((rt
->rt6i_flags
& RTF_RA_ROUTER
) == RTF_RA_ROUTER
) ||
2778 ((rt
->rt6i_flags
& RTF_CACHE_GATEWAY
) == RTF_CACHE_GATEWAY
)) &&
2779 ipv6_addr_equal(gateway
, &rt
->rt6i_gateway
)) {
2785 void rt6_clean_tohost(struct net
*net
, struct in6_addr
*gateway
)
2787 fib6_clean_all(net
, fib6_clean_tohost
, gateway
);
2790 struct arg_dev_net
{
2791 struct net_device
*dev
;
2795 /* called with write lock held for table with rt */
2796 static int fib6_ifdown(struct rt6_info
*rt
, void *arg
)
2798 const struct arg_dev_net
*adn
= arg
;
2799 const struct net_device
*dev
= adn
->dev
;
2801 if ((rt
->dst
.dev
== dev
|| !dev
) &&
2802 rt
!= adn
->net
->ipv6
.ip6_null_entry
&&
2803 (rt
->rt6i_nsiblings
== 0 ||
2804 (dev
&& netdev_unregistering(dev
)) ||
2805 !rt
->rt6i_idev
->cnf
.ignore_routes_with_linkdown
))
2811 void rt6_ifdown(struct net
*net
, struct net_device
*dev
)
2813 struct arg_dev_net adn
= {
2818 fib6_clean_all(net
, fib6_ifdown
, &adn
);
2820 rt6_uncached_list_flush_dev(net
, dev
);
2823 struct rt6_mtu_change_arg
{
2824 struct net_device
*dev
;
2828 static int rt6_mtu_change_route(struct rt6_info
*rt
, void *p_arg
)
2830 struct rt6_mtu_change_arg
*arg
= (struct rt6_mtu_change_arg
*) p_arg
;
2831 struct inet6_dev
*idev
;
2833 /* In IPv6 pmtu discovery is not optional,
2834 so that RTAX_MTU lock cannot disable it.
2835 We still use this lock to block changes
2836 caused by addrconf/ndisc.
2839 idev
= __in6_dev_get(arg
->dev
);
2843 /* For administrative MTU increase, there is no way to discover
2844 IPv6 PMTU increase, so PMTU increase should be updated here.
2845 Since RFC 1981 doesn't include administrative MTU increase
2846 update PMTU increase is a MUST. (i.e. jumbo frame)
2849 If new MTU is less than route PMTU, this new MTU will be the
2850 lowest MTU in the path, update the route PMTU to reflect PMTU
2851 decreases; if new MTU is greater than route PMTU, and the
2852 old MTU is the lowest MTU in the path, update the route PMTU
2853 to reflect the increase. In this case if the other nodes' MTU
2854 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2857 if (rt
->dst
.dev
== arg
->dev
&&
2858 dst_metric_raw(&rt
->dst
, RTAX_MTU
) &&
2859 !dst_metric_locked(&rt
->dst
, RTAX_MTU
)) {
2860 if (rt
->rt6i_flags
& RTF_CACHE
) {
2861 /* For RTF_CACHE with rt6i_pmtu == 0
2862 * (i.e. a redirected route),
2863 * the metrics of its rt->dst.from has already
2866 if (rt
->rt6i_pmtu
&& rt
->rt6i_pmtu
> arg
->mtu
)
2867 rt
->rt6i_pmtu
= arg
->mtu
;
2868 } else if (dst_mtu(&rt
->dst
) >= arg
->mtu
||
2869 (dst_mtu(&rt
->dst
) < arg
->mtu
&&
2870 dst_mtu(&rt
->dst
) == idev
->cnf
.mtu6
)) {
2871 dst_metric_set(&rt
->dst
, RTAX_MTU
, arg
->mtu
);
2877 void rt6_mtu_change(struct net_device
*dev
, unsigned int mtu
)
2879 struct rt6_mtu_change_arg arg
= {
2884 fib6_clean_all(dev_net(dev
), rt6_mtu_change_route
, &arg
);
2887 static const struct nla_policy rtm_ipv6_policy
[RTA_MAX
+1] = {
2888 [RTA_GATEWAY
] = { .len
= sizeof(struct in6_addr
) },
2889 [RTA_OIF
] = { .type
= NLA_U32
},
2890 [RTA_IIF
] = { .type
= NLA_U32
},
2891 [RTA_PRIORITY
] = { .type
= NLA_U32
},
2892 [RTA_METRICS
] = { .type
= NLA_NESTED
},
2893 [RTA_MULTIPATH
] = { .len
= sizeof(struct rtnexthop
) },
2894 [RTA_PREF
] = { .type
= NLA_U8
},
2895 [RTA_ENCAP_TYPE
] = { .type
= NLA_U16
},
2896 [RTA_ENCAP
] = { .type
= NLA_NESTED
},
2897 [RTA_EXPIRES
] = { .type
= NLA_U32
},
2898 [RTA_UID
] = { .type
= NLA_U32
},
2899 [RTA_MARK
] = { .type
= NLA_U32
},
2902 static int rtm_to_fib6_config(struct sk_buff
*skb
, struct nlmsghdr
*nlh
,
2903 struct fib6_config
*cfg
,
2904 struct netlink_ext_ack
*extack
)
2907 struct nlattr
*tb
[RTA_MAX
+1];
2911 err
= nlmsg_parse(nlh
, sizeof(*rtm
), tb
, RTA_MAX
, rtm_ipv6_policy
,
2917 rtm
= nlmsg_data(nlh
);
2918 memset(cfg
, 0, sizeof(*cfg
));
2920 cfg
->fc_table
= rtm
->rtm_table
;
2921 cfg
->fc_dst_len
= rtm
->rtm_dst_len
;
2922 cfg
->fc_src_len
= rtm
->rtm_src_len
;
2923 cfg
->fc_flags
= RTF_UP
;
2924 cfg
->fc_protocol
= rtm
->rtm_protocol
;
2925 cfg
->fc_type
= rtm
->rtm_type
;
2927 if (rtm
->rtm_type
== RTN_UNREACHABLE
||
2928 rtm
->rtm_type
== RTN_BLACKHOLE
||
2929 rtm
->rtm_type
== RTN_PROHIBIT
||
2930 rtm
->rtm_type
== RTN_THROW
)
2931 cfg
->fc_flags
|= RTF_REJECT
;
2933 if (rtm
->rtm_type
== RTN_LOCAL
)
2934 cfg
->fc_flags
|= RTF_LOCAL
;
2936 if (rtm
->rtm_flags
& RTM_F_CLONED
)
2937 cfg
->fc_flags
|= RTF_CACHE
;
2939 cfg
->fc_nlinfo
.portid
= NETLINK_CB(skb
).portid
;
2940 cfg
->fc_nlinfo
.nlh
= nlh
;
2941 cfg
->fc_nlinfo
.nl_net
= sock_net(skb
->sk
);
2943 if (tb
[RTA_GATEWAY
]) {
2944 cfg
->fc_gateway
= nla_get_in6_addr(tb
[RTA_GATEWAY
]);
2945 cfg
->fc_flags
|= RTF_GATEWAY
;
2949 int plen
= (rtm
->rtm_dst_len
+ 7) >> 3;
2951 if (nla_len(tb
[RTA_DST
]) < plen
)
2954 nla_memcpy(&cfg
->fc_dst
, tb
[RTA_DST
], plen
);
2958 int plen
= (rtm
->rtm_src_len
+ 7) >> 3;
2960 if (nla_len(tb
[RTA_SRC
]) < plen
)
2963 nla_memcpy(&cfg
->fc_src
, tb
[RTA_SRC
], plen
);
2966 if (tb
[RTA_PREFSRC
])
2967 cfg
->fc_prefsrc
= nla_get_in6_addr(tb
[RTA_PREFSRC
]);
2970 cfg
->fc_ifindex
= nla_get_u32(tb
[RTA_OIF
]);
2972 if (tb
[RTA_PRIORITY
])
2973 cfg
->fc_metric
= nla_get_u32(tb
[RTA_PRIORITY
]);
2975 if (tb
[RTA_METRICS
]) {
2976 cfg
->fc_mx
= nla_data(tb
[RTA_METRICS
]);
2977 cfg
->fc_mx_len
= nla_len(tb
[RTA_METRICS
]);
2981 cfg
->fc_table
= nla_get_u32(tb
[RTA_TABLE
]);
2983 if (tb
[RTA_MULTIPATH
]) {
2984 cfg
->fc_mp
= nla_data(tb
[RTA_MULTIPATH
]);
2985 cfg
->fc_mp_len
= nla_len(tb
[RTA_MULTIPATH
]);
2987 err
= lwtunnel_valid_encap_type_attr(cfg
->fc_mp
,
2988 cfg
->fc_mp_len
, extack
);
2994 pref
= nla_get_u8(tb
[RTA_PREF
]);
2995 if (pref
!= ICMPV6_ROUTER_PREF_LOW
&&
2996 pref
!= ICMPV6_ROUTER_PREF_HIGH
)
2997 pref
= ICMPV6_ROUTER_PREF_MEDIUM
;
2998 cfg
->fc_flags
|= RTF_PREF(pref
);
3002 cfg
->fc_encap
= tb
[RTA_ENCAP
];
3004 if (tb
[RTA_ENCAP_TYPE
]) {
3005 cfg
->fc_encap_type
= nla_get_u16(tb
[RTA_ENCAP_TYPE
]);
3007 err
= lwtunnel_valid_encap_type(cfg
->fc_encap_type
, extack
);
3012 if (tb
[RTA_EXPIRES
]) {
3013 unsigned long timeout
= addrconf_timeout_fixup(nla_get_u32(tb
[RTA_EXPIRES
]), HZ
);
3015 if (addrconf_finite_timeout(timeout
)) {
3016 cfg
->fc_expires
= jiffies_to_clock_t(timeout
* HZ
);
3017 cfg
->fc_flags
|= RTF_EXPIRES
;
3027 struct rt6_info
*rt6_info
;
3028 struct fib6_config r_cfg
;
3029 struct mx6_config mxc
;
3030 struct list_head next
;
3033 static void ip6_print_replace_route_err(struct list_head
*rt6_nh_list
)
3037 list_for_each_entry(nh
, rt6_nh_list
, next
) {
3038 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3039 &nh
->r_cfg
.fc_dst
, &nh
->r_cfg
.fc_gateway
,
3040 nh
->r_cfg
.fc_ifindex
);
3044 static int ip6_route_info_append(struct list_head
*rt6_nh_list
,
3045 struct rt6_info
*rt
, struct fib6_config
*r_cfg
)
3050 list_for_each_entry(nh
, rt6_nh_list
, next
) {
3051 /* check if rt6_info already exists */
3052 if (rt6_duplicate_nexthop(nh
->rt6_info
, rt
))
3056 nh
= kzalloc(sizeof(*nh
), GFP_KERNEL
);
3060 err
= ip6_convert_metrics(&nh
->mxc
, r_cfg
);
3065 memcpy(&nh
->r_cfg
, r_cfg
, sizeof(*r_cfg
));
3066 list_add_tail(&nh
->next
, rt6_nh_list
);
3071 static void ip6_route_mpath_notify(struct rt6_info
*rt
,
3072 struct rt6_info
*rt_last
,
3073 struct nl_info
*info
,
3076 /* if this is an APPEND route, then rt points to the first route
3077 * inserted and rt_last points to last route inserted. Userspace
3078 * wants a consistent dump of the route which starts at the first
3079 * nexthop. Since sibling routes are always added at the end of
3080 * the list, find the first sibling of the last route appended
3082 if ((nlflags
& NLM_F_APPEND
) && rt_last
&& rt_last
->rt6i_nsiblings
) {
3083 rt
= list_first_entry(&rt_last
->rt6i_siblings
,
3089 inet6_rt_notify(RTM_NEWROUTE
, rt
, info
, nlflags
);
3092 static int ip6_route_multipath_add(struct fib6_config
*cfg
,
3093 struct netlink_ext_ack
*extack
)
3095 struct rt6_info
*rt_notif
= NULL
, *rt_last
= NULL
;
3096 struct nl_info
*info
= &cfg
->fc_nlinfo
;
3097 struct fib6_config r_cfg
;
3098 struct rtnexthop
*rtnh
;
3099 struct rt6_info
*rt
;
3100 struct rt6_nh
*err_nh
;
3101 struct rt6_nh
*nh
, *nh_safe
;
3107 int replace
= (cfg
->fc_nlinfo
.nlh
&&
3108 (cfg
->fc_nlinfo
.nlh
->nlmsg_flags
& NLM_F_REPLACE
));
3109 LIST_HEAD(rt6_nh_list
);
3111 nlflags
= replace
? NLM_F_REPLACE
: NLM_F_CREATE
;
3112 if (info
->nlh
&& info
->nlh
->nlmsg_flags
& NLM_F_APPEND
)
3113 nlflags
|= NLM_F_APPEND
;
3115 remaining
= cfg
->fc_mp_len
;
3116 rtnh
= (struct rtnexthop
*)cfg
->fc_mp
;
3118 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
3119 * rt6_info structs per nexthop
3121 while (rtnh_ok(rtnh
, remaining
)) {
3122 memcpy(&r_cfg
, cfg
, sizeof(*cfg
));
3123 if (rtnh
->rtnh_ifindex
)
3124 r_cfg
.fc_ifindex
= rtnh
->rtnh_ifindex
;
3126 attrlen
= rtnh_attrlen(rtnh
);
3128 struct nlattr
*nla
, *attrs
= rtnh_attrs(rtnh
);
3130 nla
= nla_find(attrs
, attrlen
, RTA_GATEWAY
);
3132 r_cfg
.fc_gateway
= nla_get_in6_addr(nla
);
3133 r_cfg
.fc_flags
|= RTF_GATEWAY
;
3135 r_cfg
.fc_encap
= nla_find(attrs
, attrlen
, RTA_ENCAP
);
3136 nla
= nla_find(attrs
, attrlen
, RTA_ENCAP_TYPE
);
3138 r_cfg
.fc_encap_type
= nla_get_u16(nla
);
3141 rt
= ip6_route_info_create(&r_cfg
, extack
);
3148 err
= ip6_route_info_append(&rt6_nh_list
, rt
, &r_cfg
);
3150 dst_release_immediate(&rt
->dst
);
3154 rtnh
= rtnh_next(rtnh
, &remaining
);
3157 /* for add and replace send one notification with all nexthops.
3158 * Skip the notification in fib6_add_rt2node and send one with
3159 * the full route when done
3161 info
->skip_notify
= 1;
3164 list_for_each_entry(nh
, &rt6_nh_list
, next
) {
3165 rt_last
= nh
->rt6_info
;
3166 err
= __ip6_ins_rt(nh
->rt6_info
, info
, &nh
->mxc
, extack
);
3167 /* save reference to first route for notification */
3168 if (!rt_notif
&& !err
)
3169 rt_notif
= nh
->rt6_info
;
3171 /* nh->rt6_info is used or freed at this point, reset to NULL*/
3172 nh
->rt6_info
= NULL
;
3175 ip6_print_replace_route_err(&rt6_nh_list
);
3180 /* Because each route is added like a single route we remove
3181 * these flags after the first nexthop: if there is a collision,
3182 * we have already failed to add the first nexthop:
3183 * fib6_add_rt2node() has rejected it; when replacing, old
3184 * nexthops have been replaced by first new, the rest should
3187 cfg
->fc_nlinfo
.nlh
->nlmsg_flags
&= ~(NLM_F_EXCL
|
3192 /* success ... tell user about new route */
3193 ip6_route_mpath_notify(rt_notif
, rt_last
, info
, nlflags
);
3197 /* send notification for routes that were added so that
3198 * the delete notifications sent by ip6_route_del are
3202 ip6_route_mpath_notify(rt_notif
, rt_last
, info
, nlflags
);
3204 /* Delete routes that were already added */
3205 list_for_each_entry(nh
, &rt6_nh_list
, next
) {
3208 ip6_route_del(&nh
->r_cfg
, extack
);
3212 list_for_each_entry_safe(nh
, nh_safe
, &rt6_nh_list
, next
) {
3214 dst_release_immediate(&nh
->rt6_info
->dst
);
3216 list_del(&nh
->next
);
3223 static int ip6_route_multipath_del(struct fib6_config
*cfg
,
3224 struct netlink_ext_ack
*extack
)
3226 struct fib6_config r_cfg
;
3227 struct rtnexthop
*rtnh
;
3230 int err
= 1, last_err
= 0;
3232 remaining
= cfg
->fc_mp_len
;
3233 rtnh
= (struct rtnexthop
*)cfg
->fc_mp
;
3235 /* Parse a Multipath Entry */
3236 while (rtnh_ok(rtnh
, remaining
)) {
3237 memcpy(&r_cfg
, cfg
, sizeof(*cfg
));
3238 if (rtnh
->rtnh_ifindex
)
3239 r_cfg
.fc_ifindex
= rtnh
->rtnh_ifindex
;
3241 attrlen
= rtnh_attrlen(rtnh
);
3243 struct nlattr
*nla
, *attrs
= rtnh_attrs(rtnh
);
3245 nla
= nla_find(attrs
, attrlen
, RTA_GATEWAY
);
3247 nla_memcpy(&r_cfg
.fc_gateway
, nla
, 16);
3248 r_cfg
.fc_flags
|= RTF_GATEWAY
;
3251 err
= ip6_route_del(&r_cfg
, extack
);
3255 rtnh
= rtnh_next(rtnh
, &remaining
);
3261 static int inet6_rtm_delroute(struct sk_buff
*skb
, struct nlmsghdr
*nlh
,
3262 struct netlink_ext_ack
*extack
)
3264 struct fib6_config cfg
;
3267 err
= rtm_to_fib6_config(skb
, nlh
, &cfg
, extack
);
3272 return ip6_route_multipath_del(&cfg
, extack
);
3274 cfg
.fc_delete_all_nh
= 1;
3275 return ip6_route_del(&cfg
, extack
);
3279 static int inet6_rtm_newroute(struct sk_buff
*skb
, struct nlmsghdr
*nlh
,
3280 struct netlink_ext_ack
*extack
)
3282 struct fib6_config cfg
;
3285 err
= rtm_to_fib6_config(skb
, nlh
, &cfg
, extack
);
3290 return ip6_route_multipath_add(&cfg
, extack
);
3292 return ip6_route_add(&cfg
, extack
);
3295 static size_t rt6_nlmsg_size(struct rt6_info
*rt
)
3297 int nexthop_len
= 0;
3299 if (rt
->rt6i_nsiblings
) {
3300 nexthop_len
= nla_total_size(0) /* RTA_MULTIPATH */
3301 + NLA_ALIGN(sizeof(struct rtnexthop
))
3302 + nla_total_size(16) /* RTA_GATEWAY */
3303 + lwtunnel_get_encap_size(rt
->dst
.lwtstate
);
3305 nexthop_len
*= rt
->rt6i_nsiblings
;
3308 return NLMSG_ALIGN(sizeof(struct rtmsg
))
3309 + nla_total_size(16) /* RTA_SRC */
3310 + nla_total_size(16) /* RTA_DST */
3311 + nla_total_size(16) /* RTA_GATEWAY */
3312 + nla_total_size(16) /* RTA_PREFSRC */
3313 + nla_total_size(4) /* RTA_TABLE */
3314 + nla_total_size(4) /* RTA_IIF */
3315 + nla_total_size(4) /* RTA_OIF */
3316 + nla_total_size(4) /* RTA_PRIORITY */
3317 + RTAX_MAX
* nla_total_size(4) /* RTA_METRICS */
3318 + nla_total_size(sizeof(struct rta_cacheinfo
))
3319 + nla_total_size(TCP_CA_NAME_MAX
) /* RTAX_CC_ALGO */
3320 + nla_total_size(1) /* RTA_PREF */
3321 + lwtunnel_get_encap_size(rt
->dst
.lwtstate
)
3325 static int rt6_nexthop_info(struct sk_buff
*skb
, struct rt6_info
*rt
,
3326 unsigned int *flags
, bool skip_oif
)
3328 if (!netif_running(rt
->dst
.dev
) || !netif_carrier_ok(rt
->dst
.dev
)) {
3329 *flags
|= RTNH_F_LINKDOWN
;
3330 if (rt
->rt6i_idev
->cnf
.ignore_routes_with_linkdown
)
3331 *flags
|= RTNH_F_DEAD
;
3334 if (rt
->rt6i_flags
& RTF_GATEWAY
) {
3335 if (nla_put_in6_addr(skb
, RTA_GATEWAY
, &rt
->rt6i_gateway
) < 0)
3336 goto nla_put_failure
;
3339 /* not needed for multipath encoding b/c it has a rtnexthop struct */
3340 if (!skip_oif
&& rt
->dst
.dev
&&
3341 nla_put_u32(skb
, RTA_OIF
, rt
->dst
.dev
->ifindex
))
3342 goto nla_put_failure
;
3344 if (rt
->dst
.lwtstate
&&
3345 lwtunnel_fill_encap(skb
, rt
->dst
.lwtstate
) < 0)
3346 goto nla_put_failure
;
3354 /* add multipath next hop */
3355 static int rt6_add_nexthop(struct sk_buff
*skb
, struct rt6_info
*rt
)
3357 struct rtnexthop
*rtnh
;
3358 unsigned int flags
= 0;
3360 rtnh
= nla_reserve_nohdr(skb
, sizeof(*rtnh
));
3362 goto nla_put_failure
;
3364 rtnh
->rtnh_hops
= 0;
3365 rtnh
->rtnh_ifindex
= rt
->dst
.dev
? rt
->dst
.dev
->ifindex
: 0;
3367 if (rt6_nexthop_info(skb
, rt
, &flags
, true) < 0)
3368 goto nla_put_failure
;
3370 rtnh
->rtnh_flags
= flags
;
3372 /* length of rtnetlink header + attributes */
3373 rtnh
->rtnh_len
= nlmsg_get_pos(skb
) - (void *)rtnh
;
3381 static int rt6_fill_node(struct net
*net
,
3382 struct sk_buff
*skb
, struct rt6_info
*rt
,
3383 struct in6_addr
*dst
, struct in6_addr
*src
,
3384 int iif
, int type
, u32 portid
, u32 seq
,
3387 u32 metrics
[RTAX_MAX
];
3389 struct nlmsghdr
*nlh
;
3393 nlh
= nlmsg_put(skb
, portid
, seq
, type
, sizeof(*rtm
), flags
);
3397 rtm
= nlmsg_data(nlh
);
3398 rtm
->rtm_family
= AF_INET6
;
3399 rtm
->rtm_dst_len
= rt
->rt6i_dst
.plen
;
3400 rtm
->rtm_src_len
= rt
->rt6i_src
.plen
;
3403 table
= rt
->rt6i_table
->tb6_id
;
3405 table
= RT6_TABLE_UNSPEC
;
3406 rtm
->rtm_table
= table
;
3407 if (nla_put_u32(skb
, RTA_TABLE
, table
))
3408 goto nla_put_failure
;
3409 if (rt
->rt6i_flags
& RTF_REJECT
) {
3410 switch (rt
->dst
.error
) {
3412 rtm
->rtm_type
= RTN_BLACKHOLE
;
3415 rtm
->rtm_type
= RTN_PROHIBIT
;
3418 rtm
->rtm_type
= RTN_THROW
;
3421 rtm
->rtm_type
= RTN_UNREACHABLE
;
3425 else if (rt
->rt6i_flags
& RTF_LOCAL
)
3426 rtm
->rtm_type
= RTN_LOCAL
;
3427 else if (rt
->rt6i_flags
& RTF_ANYCAST
)
3428 rtm
->rtm_type
= RTN_ANYCAST
;
3429 else if (rt
->dst
.dev
&& (rt
->dst
.dev
->flags
& IFF_LOOPBACK
))
3430 rtm
->rtm_type
= RTN_LOCAL
;
3432 rtm
->rtm_type
= RTN_UNICAST
;
3434 rtm
->rtm_scope
= RT_SCOPE_UNIVERSE
;
3435 rtm
->rtm_protocol
= rt
->rt6i_protocol
;
3437 if (rt
->rt6i_flags
& RTF_CACHE
)
3438 rtm
->rtm_flags
|= RTM_F_CLONED
;
3441 if (nla_put_in6_addr(skb
, RTA_DST
, dst
))
3442 goto nla_put_failure
;
3443 rtm
->rtm_dst_len
= 128;
3444 } else if (rtm
->rtm_dst_len
)
3445 if (nla_put_in6_addr(skb
, RTA_DST
, &rt
->rt6i_dst
.addr
))
3446 goto nla_put_failure
;
3447 #ifdef CONFIG_IPV6_SUBTREES
3449 if (nla_put_in6_addr(skb
, RTA_SRC
, src
))
3450 goto nla_put_failure
;
3451 rtm
->rtm_src_len
= 128;
3452 } else if (rtm
->rtm_src_len
&&
3453 nla_put_in6_addr(skb
, RTA_SRC
, &rt
->rt6i_src
.addr
))
3454 goto nla_put_failure
;
3457 #ifdef CONFIG_IPV6_MROUTE
3458 if (ipv6_addr_is_multicast(&rt
->rt6i_dst
.addr
)) {
3459 int err
= ip6mr_get_route(net
, skb
, rtm
, portid
);
3464 goto nla_put_failure
;
3467 if (nla_put_u32(skb
, RTA_IIF
, iif
))
3468 goto nla_put_failure
;
3470 struct in6_addr saddr_buf
;
3471 if (ip6_route_get_saddr(net
, rt
, dst
, 0, &saddr_buf
) == 0 &&
3472 nla_put_in6_addr(skb
, RTA_PREFSRC
, &saddr_buf
))
3473 goto nla_put_failure
;
3476 if (rt
->rt6i_prefsrc
.plen
) {
3477 struct in6_addr saddr_buf
;
3478 saddr_buf
= rt
->rt6i_prefsrc
.addr
;
3479 if (nla_put_in6_addr(skb
, RTA_PREFSRC
, &saddr_buf
))
3480 goto nla_put_failure
;
3483 memcpy(metrics
, dst_metrics_ptr(&rt
->dst
), sizeof(metrics
));
3485 metrics
[RTAX_MTU
- 1] = rt
->rt6i_pmtu
;
3486 if (rtnetlink_put_metrics(skb
, metrics
) < 0)
3487 goto nla_put_failure
;
3489 if (nla_put_u32(skb
, RTA_PRIORITY
, rt
->rt6i_metric
))
3490 goto nla_put_failure
;
3492 /* For multipath routes, walk the siblings list and add
3493 * each as a nexthop within RTA_MULTIPATH.
3495 if (rt
->rt6i_nsiblings
) {
3496 struct rt6_info
*sibling
, *next_sibling
;
3499 mp
= nla_nest_start(skb
, RTA_MULTIPATH
);
3501 goto nla_put_failure
;
3503 if (rt6_add_nexthop(skb
, rt
) < 0)
3504 goto nla_put_failure
;
3506 list_for_each_entry_safe(sibling
, next_sibling
,
3507 &rt
->rt6i_siblings
, rt6i_siblings
) {
3508 if (rt6_add_nexthop(skb
, sibling
) < 0)
3509 goto nla_put_failure
;
3512 nla_nest_end(skb
, mp
);
3514 if (rt6_nexthop_info(skb
, rt
, &rtm
->rtm_flags
, false) < 0)
3515 goto nla_put_failure
;
3518 expires
= (rt
->rt6i_flags
& RTF_EXPIRES
) ? rt
->dst
.expires
- jiffies
: 0;
3520 if (rtnl_put_cacheinfo(skb
, &rt
->dst
, 0, expires
, rt
->dst
.error
) < 0)
3521 goto nla_put_failure
;
3523 if (nla_put_u8(skb
, RTA_PREF
, IPV6_EXTRACT_PREF(rt
->rt6i_flags
)))
3524 goto nla_put_failure
;
3527 nlmsg_end(skb
, nlh
);
3531 nlmsg_cancel(skb
, nlh
);
3535 int rt6_dump_route(struct rt6_info
*rt
, void *p_arg
)
3537 struct rt6_rtnl_dump_arg
*arg
= (struct rt6_rtnl_dump_arg
*) p_arg
;
3538 struct net
*net
= arg
->net
;
3540 if (rt
== net
->ipv6
.ip6_null_entry
)
3543 if (nlmsg_len(arg
->cb
->nlh
) >= sizeof(struct rtmsg
)) {
3544 struct rtmsg
*rtm
= nlmsg_data(arg
->cb
->nlh
);
3546 /* user wants prefix routes only */
3547 if (rtm
->rtm_flags
& RTM_F_PREFIX
&&
3548 !(rt
->rt6i_flags
& RTF_PREFIX_RT
)) {
3549 /* success since this is not a prefix route */
3554 return rt6_fill_node(net
,
3555 arg
->skb
, rt
, NULL
, NULL
, 0, RTM_NEWROUTE
,
3556 NETLINK_CB(arg
->cb
->skb
).portid
, arg
->cb
->nlh
->nlmsg_seq
,
3560 static int inet6_rtm_getroute(struct sk_buff
*in_skb
, struct nlmsghdr
*nlh
,
3561 struct netlink_ext_ack
*extack
)
3563 struct net
*net
= sock_net(in_skb
->sk
);
3564 struct nlattr
*tb
[RTA_MAX
+1];
3565 int err
, iif
= 0, oif
= 0;
3566 struct dst_entry
*dst
;
3567 struct rt6_info
*rt
;
3568 struct sk_buff
*skb
;
3573 err
= nlmsg_parse(nlh
, sizeof(*rtm
), tb
, RTA_MAX
, rtm_ipv6_policy
,
3579 memset(&fl6
, 0, sizeof(fl6
));
3580 rtm
= nlmsg_data(nlh
);
3581 fl6
.flowlabel
= ip6_make_flowinfo(rtm
->rtm_tos
, 0);
3582 fibmatch
= !!(rtm
->rtm_flags
& RTM_F_FIB_MATCH
);
3585 if (nla_len(tb
[RTA_SRC
]) < sizeof(struct in6_addr
))
3588 fl6
.saddr
= *(struct in6_addr
*)nla_data(tb
[RTA_SRC
]);
3592 if (nla_len(tb
[RTA_DST
]) < sizeof(struct in6_addr
))
3595 fl6
.daddr
= *(struct in6_addr
*)nla_data(tb
[RTA_DST
]);
3599 iif
= nla_get_u32(tb
[RTA_IIF
]);
3602 oif
= nla_get_u32(tb
[RTA_OIF
]);
3605 fl6
.flowi6_mark
= nla_get_u32(tb
[RTA_MARK
]);
3608 fl6
.flowi6_uid
= make_kuid(current_user_ns(),
3609 nla_get_u32(tb
[RTA_UID
]));
3611 fl6
.flowi6_uid
= iif
? INVALID_UID
: current_uid();
3614 struct net_device
*dev
;
3617 dev
= __dev_get_by_index(net
, iif
);
3623 fl6
.flowi6_iif
= iif
;
3625 if (!ipv6_addr_any(&fl6
.saddr
))
3626 flags
|= RT6_LOOKUP_F_HAS_SADDR
;
3629 dst
= ip6_route_input_lookup(net
, dev
, &fl6
, flags
);
3631 fl6
.flowi6_oif
= oif
;
3634 dst
= ip6_route_output(net
, NULL
, &fl6
);
3638 dst
= ip6_route_lookup(net
, &fl6
, 0);
3640 rt
= container_of(dst
, struct rt6_info
, dst
);
3641 if (rt
->dst
.error
) {
3642 err
= rt
->dst
.error
;
3647 if (rt
== net
->ipv6
.ip6_null_entry
) {
3648 err
= rt
->dst
.error
;
3653 skb
= alloc_skb(NLMSG_GOODSIZE
, GFP_KERNEL
);
3660 skb_dst_set(skb
, &rt
->dst
);
3662 err
= rt6_fill_node(net
, skb
, rt
, NULL
, NULL
, iif
,
3663 RTM_NEWROUTE
, NETLINK_CB(in_skb
).portid
,
3666 err
= rt6_fill_node(net
, skb
, rt
, &fl6
.daddr
, &fl6
.saddr
, iif
,
3667 RTM_NEWROUTE
, NETLINK_CB(in_skb
).portid
,
3674 err
= rtnl_unicast(skb
, net
, NETLINK_CB(in_skb
).portid
);
3679 void inet6_rt_notify(int event
, struct rt6_info
*rt
, struct nl_info
*info
,
3680 unsigned int nlm_flags
)
3682 struct sk_buff
*skb
;
3683 struct net
*net
= info
->nl_net
;
3688 seq
= info
->nlh
? info
->nlh
->nlmsg_seq
: 0;
3690 skb
= nlmsg_new(rt6_nlmsg_size(rt
), gfp_any());
3694 err
= rt6_fill_node(net
, skb
, rt
, NULL
, NULL
, 0,
3695 event
, info
->portid
, seq
, nlm_flags
);
3697 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3698 WARN_ON(err
== -EMSGSIZE
);
3702 rtnl_notify(skb
, net
, info
->portid
, RTNLGRP_IPV6_ROUTE
,
3703 info
->nlh
, gfp_any());
3707 rtnl_set_sk_err(net
, RTNLGRP_IPV6_ROUTE
, err
);
3710 static int ip6_route_dev_notify(struct notifier_block
*this,
3711 unsigned long event
, void *ptr
)
3713 struct net_device
*dev
= netdev_notifier_info_to_dev(ptr
);
3714 struct net
*net
= dev_net(dev
);
3716 if (!(dev
->flags
& IFF_LOOPBACK
))
3719 if (event
== NETDEV_REGISTER
) {
3720 net
->ipv6
.ip6_null_entry
->dst
.dev
= dev
;
3721 net
->ipv6
.ip6_null_entry
->rt6i_idev
= in6_dev_get(dev
);
3722 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3723 net
->ipv6
.ip6_prohibit_entry
->dst
.dev
= dev
;
3724 net
->ipv6
.ip6_prohibit_entry
->rt6i_idev
= in6_dev_get(dev
);
3725 net
->ipv6
.ip6_blk_hole_entry
->dst
.dev
= dev
;
3726 net
->ipv6
.ip6_blk_hole_entry
->rt6i_idev
= in6_dev_get(dev
);
3728 } else if (event
== NETDEV_UNREGISTER
&&
3729 dev
->reg_state
!= NETREG_UNREGISTERED
) {
3730 /* NETDEV_UNREGISTER could be fired for multiple times by
3731 * netdev_wait_allrefs(). Make sure we only call this once.
3733 in6_dev_put_clear(&net
->ipv6
.ip6_null_entry
->rt6i_idev
);
3734 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3735 in6_dev_put_clear(&net
->ipv6
.ip6_prohibit_entry
->rt6i_idev
);
3736 in6_dev_put_clear(&net
->ipv6
.ip6_blk_hole_entry
->rt6i_idev
);
3747 #ifdef CONFIG_PROC_FS
3749 static const struct file_operations ipv6_route_proc_fops
= {
3750 .owner
= THIS_MODULE
,
3751 .open
= ipv6_route_open
,
3753 .llseek
= seq_lseek
,
3754 .release
= seq_release_net
,
3757 static int rt6_stats_seq_show(struct seq_file
*seq
, void *v
)
3759 struct net
*net
= (struct net
*)seq
->private;
3760 seq_printf(seq
, "%04x %04x %04x %04x %04x %04x %04x\n",
3761 net
->ipv6
.rt6_stats
->fib_nodes
,
3762 net
->ipv6
.rt6_stats
->fib_route_nodes
,
3763 net
->ipv6
.rt6_stats
->fib_rt_alloc
,
3764 net
->ipv6
.rt6_stats
->fib_rt_entries
,
3765 net
->ipv6
.rt6_stats
->fib_rt_cache
,
3766 dst_entries_get_slow(&net
->ipv6
.ip6_dst_ops
),
3767 net
->ipv6
.rt6_stats
->fib_discarded_routes
);
3772 static int rt6_stats_seq_open(struct inode
*inode
, struct file
*file
)
3774 return single_open_net(inode
, file
, rt6_stats_seq_show
);
3777 static const struct file_operations rt6_stats_seq_fops
= {
3778 .owner
= THIS_MODULE
,
3779 .open
= rt6_stats_seq_open
,
3781 .llseek
= seq_lseek
,
3782 .release
= single_release_net
,
3784 #endif /* CONFIG_PROC_FS */
3786 #ifdef CONFIG_SYSCTL
3789 int ipv6_sysctl_rtcache_flush(struct ctl_table
*ctl
, int write
,
3790 void __user
*buffer
, size_t *lenp
, loff_t
*ppos
)
3797 net
= (struct net
*)ctl
->extra1
;
3798 delay
= net
->ipv6
.sysctl
.flush_delay
;
3799 proc_dointvec(ctl
, write
, buffer
, lenp
, ppos
);
3800 fib6_run_gc(delay
<= 0 ? 0 : (unsigned long)delay
, net
, delay
> 0);
3804 struct ctl_table ipv6_route_table_template
[] = {
3806 .procname
= "flush",
3807 .data
= &init_net
.ipv6
.sysctl
.flush_delay
,
3808 .maxlen
= sizeof(int),
3810 .proc_handler
= ipv6_sysctl_rtcache_flush
3813 .procname
= "gc_thresh",
3814 .data
= &ip6_dst_ops_template
.gc_thresh
,
3815 .maxlen
= sizeof(int),
3817 .proc_handler
= proc_dointvec
,
3820 .procname
= "max_size",
3821 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_max_size
,
3822 .maxlen
= sizeof(int),
3824 .proc_handler
= proc_dointvec
,
3827 .procname
= "gc_min_interval",
3828 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_min_interval
,
3829 .maxlen
= sizeof(int),
3831 .proc_handler
= proc_dointvec_jiffies
,
3834 .procname
= "gc_timeout",
3835 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_timeout
,
3836 .maxlen
= sizeof(int),
3838 .proc_handler
= proc_dointvec_jiffies
,
3841 .procname
= "gc_interval",
3842 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_interval
,
3843 .maxlen
= sizeof(int),
3845 .proc_handler
= proc_dointvec_jiffies
,
3848 .procname
= "gc_elasticity",
3849 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_elasticity
,
3850 .maxlen
= sizeof(int),
3852 .proc_handler
= proc_dointvec
,
3855 .procname
= "mtu_expires",
3856 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_mtu_expires
,
3857 .maxlen
= sizeof(int),
3859 .proc_handler
= proc_dointvec_jiffies
,
3862 .procname
= "min_adv_mss",
3863 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_min_advmss
,
3864 .maxlen
= sizeof(int),
3866 .proc_handler
= proc_dointvec
,
3869 .procname
= "gc_min_interval_ms",
3870 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_min_interval
,
3871 .maxlen
= sizeof(int),
3873 .proc_handler
= proc_dointvec_ms_jiffies
,
3878 struct ctl_table
* __net_init
ipv6_route_sysctl_init(struct net
*net
)
3880 struct ctl_table
*table
;
3882 table
= kmemdup(ipv6_route_table_template
,
3883 sizeof(ipv6_route_table_template
),
3887 table
[0].data
= &net
->ipv6
.sysctl
.flush_delay
;
3888 table
[0].extra1
= net
;
3889 table
[1].data
= &net
->ipv6
.ip6_dst_ops
.gc_thresh
;
3890 table
[2].data
= &net
->ipv6
.sysctl
.ip6_rt_max_size
;
3891 table
[3].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
3892 table
[4].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_timeout
;
3893 table
[5].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_interval
;
3894 table
[6].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
3895 table
[7].data
= &net
->ipv6
.sysctl
.ip6_rt_mtu_expires
;
3896 table
[8].data
= &net
->ipv6
.sysctl
.ip6_rt_min_advmss
;
3897 table
[9].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
3899 /* Don't export sysctls to unprivileged users */
3900 if (net
->user_ns
!= &init_user_ns
)
3901 table
[0].procname
= NULL
;
3908 static int __net_init
ip6_route_net_init(struct net
*net
)
3912 memcpy(&net
->ipv6
.ip6_dst_ops
, &ip6_dst_ops_template
,
3913 sizeof(net
->ipv6
.ip6_dst_ops
));
3915 if (dst_entries_init(&net
->ipv6
.ip6_dst_ops
) < 0)
3916 goto out_ip6_dst_ops
;
3918 net
->ipv6
.ip6_null_entry
= kmemdup(&ip6_null_entry_template
,
3919 sizeof(*net
->ipv6
.ip6_null_entry
),
3921 if (!net
->ipv6
.ip6_null_entry
)
3922 goto out_ip6_dst_entries
;
3923 net
->ipv6
.ip6_null_entry
->dst
.path
=
3924 (struct dst_entry
*)net
->ipv6
.ip6_null_entry
;
3925 net
->ipv6
.ip6_null_entry
->dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
3926 dst_init_metrics(&net
->ipv6
.ip6_null_entry
->dst
,
3927 ip6_template_metrics
, true);
3929 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3930 net
->ipv6
.ip6_prohibit_entry
= kmemdup(&ip6_prohibit_entry_template
,
3931 sizeof(*net
->ipv6
.ip6_prohibit_entry
),
3933 if (!net
->ipv6
.ip6_prohibit_entry
)
3934 goto out_ip6_null_entry
;
3935 net
->ipv6
.ip6_prohibit_entry
->dst
.path
=
3936 (struct dst_entry
*)net
->ipv6
.ip6_prohibit_entry
;
3937 net
->ipv6
.ip6_prohibit_entry
->dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
3938 dst_init_metrics(&net
->ipv6
.ip6_prohibit_entry
->dst
,
3939 ip6_template_metrics
, true);
3941 net
->ipv6
.ip6_blk_hole_entry
= kmemdup(&ip6_blk_hole_entry_template
,
3942 sizeof(*net
->ipv6
.ip6_blk_hole_entry
),
3944 if (!net
->ipv6
.ip6_blk_hole_entry
)
3945 goto out_ip6_prohibit_entry
;
3946 net
->ipv6
.ip6_blk_hole_entry
->dst
.path
=
3947 (struct dst_entry
*)net
->ipv6
.ip6_blk_hole_entry
;
3948 net
->ipv6
.ip6_blk_hole_entry
->dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
3949 dst_init_metrics(&net
->ipv6
.ip6_blk_hole_entry
->dst
,
3950 ip6_template_metrics
, true);
3953 net
->ipv6
.sysctl
.flush_delay
= 0;
3954 net
->ipv6
.sysctl
.ip6_rt_max_size
= 4096;
3955 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
= HZ
/ 2;
3956 net
->ipv6
.sysctl
.ip6_rt_gc_timeout
= 60*HZ
;
3957 net
->ipv6
.sysctl
.ip6_rt_gc_interval
= 30*HZ
;
3958 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
= 9;
3959 net
->ipv6
.sysctl
.ip6_rt_mtu_expires
= 10*60*HZ
;
3960 net
->ipv6
.sysctl
.ip6_rt_min_advmss
= IPV6_MIN_MTU
- 20 - 40;
3962 net
->ipv6
.ip6_rt_gc_expire
= 30*HZ
;
3968 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3969 out_ip6_prohibit_entry
:
3970 kfree(net
->ipv6
.ip6_prohibit_entry
);
3972 kfree(net
->ipv6
.ip6_null_entry
);
3974 out_ip6_dst_entries
:
3975 dst_entries_destroy(&net
->ipv6
.ip6_dst_ops
);
3980 static void __net_exit
ip6_route_net_exit(struct net
*net
)
3982 kfree(net
->ipv6
.ip6_null_entry
);
3983 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3984 kfree(net
->ipv6
.ip6_prohibit_entry
);
3985 kfree(net
->ipv6
.ip6_blk_hole_entry
);
3987 dst_entries_destroy(&net
->ipv6
.ip6_dst_ops
);
3990 static int __net_init
ip6_route_net_init_late(struct net
*net
)
3992 #ifdef CONFIG_PROC_FS
3993 proc_create("ipv6_route", 0, net
->proc_net
, &ipv6_route_proc_fops
);
3994 proc_create("rt6_stats", S_IRUGO
, net
->proc_net
, &rt6_stats_seq_fops
);
3999 static void __net_exit
ip6_route_net_exit_late(struct net
*net
)
4001 #ifdef CONFIG_PROC_FS
4002 remove_proc_entry("ipv6_route", net
->proc_net
);
4003 remove_proc_entry("rt6_stats", net
->proc_net
);
4007 static struct pernet_operations ip6_route_net_ops
= {
4008 .init
= ip6_route_net_init
,
4009 .exit
= ip6_route_net_exit
,
4012 static int __net_init
ipv6_inetpeer_init(struct net
*net
)
4014 struct inet_peer_base
*bp
= kmalloc(sizeof(*bp
), GFP_KERNEL
);
4018 inet_peer_base_init(bp
);
4019 net
->ipv6
.peers
= bp
;
4023 static void __net_exit
ipv6_inetpeer_exit(struct net
*net
)
4025 struct inet_peer_base
*bp
= net
->ipv6
.peers
;
4027 net
->ipv6
.peers
= NULL
;
4028 inetpeer_invalidate_tree(bp
);
4032 static struct pernet_operations ipv6_inetpeer_ops
= {
4033 .init
= ipv6_inetpeer_init
,
4034 .exit
= ipv6_inetpeer_exit
,
4037 static struct pernet_operations ip6_route_net_late_ops
= {
4038 .init
= ip6_route_net_init_late
,
4039 .exit
= ip6_route_net_exit_late
,
4042 static struct notifier_block ip6_route_dev_notifier
= {
4043 .notifier_call
= ip6_route_dev_notify
,
4044 .priority
= ADDRCONF_NOTIFY_PRIORITY
- 10,
4047 void __init
ip6_route_init_special_entries(void)
4049 /* Registering of the loopback is done before this portion of code,
4050 * the loopback reference in rt6_info will not be taken, do it
4051 * manually for init_net */
4052 init_net
.ipv6
.ip6_null_entry
->dst
.dev
= init_net
.loopback_dev
;
4053 init_net
.ipv6
.ip6_null_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
4054 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4055 init_net
.ipv6
.ip6_prohibit_entry
->dst
.dev
= init_net
.loopback_dev
;
4056 init_net
.ipv6
.ip6_prohibit_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
4057 init_net
.ipv6
.ip6_blk_hole_entry
->dst
.dev
= init_net
.loopback_dev
;
4058 init_net
.ipv6
.ip6_blk_hole_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
4062 int __init
ip6_route_init(void)
4068 ip6_dst_ops_template
.kmem_cachep
=
4069 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info
), 0,
4070 SLAB_HWCACHE_ALIGN
, NULL
);
4071 if (!ip6_dst_ops_template
.kmem_cachep
)
4074 ret
= dst_entries_init(&ip6_dst_blackhole_ops
);
4076 goto out_kmem_cache
;
4078 ret
= register_pernet_subsys(&ipv6_inetpeer_ops
);
4080 goto out_dst_entries
;
4082 ret
= register_pernet_subsys(&ip6_route_net_ops
);
4084 goto out_register_inetpeer
;
4086 ip6_dst_blackhole_ops
.kmem_cachep
= ip6_dst_ops_template
.kmem_cachep
;
4090 goto out_register_subsys
;
4096 ret
= fib6_rules_init();
4100 ret
= register_pernet_subsys(&ip6_route_net_late_ops
);
4102 goto fib6_rules_init
;
4105 if (__rtnl_register(PF_INET6
, RTM_NEWROUTE
, inet6_rtm_newroute
, NULL
, NULL
) ||
4106 __rtnl_register(PF_INET6
, RTM_DELROUTE
, inet6_rtm_delroute
, NULL
, NULL
) ||
4107 __rtnl_register(PF_INET6
, RTM_GETROUTE
, inet6_rtm_getroute
, NULL
, NULL
))
4108 goto out_register_late_subsys
;
4110 ret
= register_netdevice_notifier(&ip6_route_dev_notifier
);
4112 goto out_register_late_subsys
;
4114 for_each_possible_cpu(cpu
) {
4115 struct uncached_list
*ul
= per_cpu_ptr(&rt6_uncached_list
, cpu
);
4117 INIT_LIST_HEAD(&ul
->head
);
4118 spin_lock_init(&ul
->lock
);
4124 out_register_late_subsys
:
4125 unregister_pernet_subsys(&ip6_route_net_late_ops
);
4127 fib6_rules_cleanup();
4132 out_register_subsys
:
4133 unregister_pernet_subsys(&ip6_route_net_ops
);
4134 out_register_inetpeer
:
4135 unregister_pernet_subsys(&ipv6_inetpeer_ops
);
4137 dst_entries_destroy(&ip6_dst_blackhole_ops
);
4139 kmem_cache_destroy(ip6_dst_ops_template
.kmem_cachep
);
4143 void ip6_route_cleanup(void)
4145 unregister_netdevice_notifier(&ip6_route_dev_notifier
);
4146 unregister_pernet_subsys(&ip6_route_net_late_ops
);
4147 fib6_rules_cleanup();
4150 unregister_pernet_subsys(&ipv6_inetpeer_ops
);
4151 unregister_pernet_subsys(&ip6_route_net_ops
);
4152 dst_entries_destroy(&ip6_dst_blackhole_ops
);
4153 kmem_cache_destroy(ip6_dst_ops_template
.kmem_cachep
);