2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #define pr_fmt(fmt) "IPv4: " fmt
67 #include <linux/module.h>
68 #include <linux/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
110 #include <linux/sysctl.h>
111 #include <linux/kmemleak.h>
113 #include <net/secure_seq.h>
114 #include <net/ip_tunnels.h>
115 #include <net/l3mdev.h>
117 #include "fib_lookup.h"
119 #define RT_FL_TOS(oldflp4) \
120 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
122 #define RT_GC_TIMEOUT (300*HZ)
/* Routing-cache and ICMP rate-limiting tunables (exported via sysctl
 * elsewhere).  Time values are in jiffies unless noted otherwise.
 */
static int ip_rt_max_size;
/* Stop sending ICMP redirects to a peer after this many went unheeded. */
static int ip_rt_redirect_number __read_mostly	= 9;
/* Base token cost of one redirect; doubles per ignored redirect (backoff). */
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
/* Quiet period after which the redirect backoff state is reset. */
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
/* Token cost of one ICMP error and the bucket depth for the error limiter. */
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
/* Lifetime of a learned PMTU exception entry. */
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
/* Floor for learned PMTU: 512 payload + 20 IP + 20 TCP header bytes. */
static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
/* Smallest advertised MSS we will compute for a route. */
static int ip_rt_min_advmss __read_mostly	= 256;

/* Expiry time for next-hop exception (fnhe) garbage collection. */
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;

/* Reject PMTU updates below the IPv4 minimum MTU (anti-spoof hardening). */
static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
139 * Interface to generic destination cache.
142 static struct dst_entry
*ipv4_dst_check(struct dst_entry
*dst
, u32 cookie
);
143 static unsigned int ipv4_default_advmss(const struct dst_entry
*dst
);
144 static unsigned int ipv4_mtu(const struct dst_entry
*dst
);
145 static struct dst_entry
*ipv4_negative_advice(struct dst_entry
*dst
);
146 static void ipv4_link_failure(struct sk_buff
*skb
);
147 static void ip_rt_update_pmtu(struct dst_entry
*dst
, struct sock
*sk
,
148 struct sk_buff
*skb
, u32 mtu
,
150 static void ip_do_redirect(struct dst_entry
*dst
, struct sock
*sk
,
151 struct sk_buff
*skb
);
152 static void ipv4_dst_destroy(struct dst_entry
*dst
);
154 static u32
*ipv4_cow_metrics(struct dst_entry
*dst
, unsigned long old
)
160 static struct neighbour
*ipv4_neigh_lookup(const struct dst_entry
*dst
,
163 static void ipv4_confirm_neigh(const struct dst_entry
*dst
, const void *daddr
);
165 static struct dst_ops ipv4_dst_ops
= {
167 .check
= ipv4_dst_check
,
168 .default_advmss
= ipv4_default_advmss
,
170 .cow_metrics
= ipv4_cow_metrics
,
171 .destroy
= ipv4_dst_destroy
,
172 .negative_advice
= ipv4_negative_advice
,
173 .link_failure
= ipv4_link_failure
,
174 .update_pmtu
= ip_rt_update_pmtu
,
175 .redirect
= ip_do_redirect
,
176 .local_out
= __ip_local_out
,
177 .neigh_lookup
= ipv4_neigh_lookup
,
178 .confirm_neigh
= ipv4_confirm_neigh
,
181 #define ECN_OR_COST(class) TC_PRIO_##class
183 const __u8 ip_tos2prio
[16] = {
185 ECN_OR_COST(BESTEFFORT
),
187 ECN_OR_COST(BESTEFFORT
),
193 ECN_OR_COST(INTERACTIVE
),
195 ECN_OR_COST(INTERACTIVE
),
196 TC_PRIO_INTERACTIVE_BULK
,
197 ECN_OR_COST(INTERACTIVE_BULK
),
198 TC_PRIO_INTERACTIVE_BULK
,
199 ECN_OR_COST(INTERACTIVE_BULK
)
201 EXPORT_SYMBOL(ip_tos2prio
);
203 static DEFINE_PER_CPU(struct rt_cache_stat
, rt_cache_stat
);
204 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
206 #ifdef CONFIG_PROC_FS
207 static void *rt_cache_seq_start(struct seq_file
*seq
, loff_t
*pos
)
211 return SEQ_START_TOKEN
;
214 static void *rt_cache_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
220 static void rt_cache_seq_stop(struct seq_file
*seq
, void *v
)
224 static int rt_cache_seq_show(struct seq_file
*seq
, void *v
)
226 if (v
== SEQ_START_TOKEN
)
227 seq_printf(seq
, "%-127s\n",
228 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
229 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
234 static const struct seq_operations rt_cache_seq_ops
= {
235 .start
= rt_cache_seq_start
,
236 .next
= rt_cache_seq_next
,
237 .stop
= rt_cache_seq_stop
,
238 .show
= rt_cache_seq_show
,
241 static int rt_cache_seq_open(struct inode
*inode
, struct file
*file
)
243 return seq_open(file
, &rt_cache_seq_ops
);
246 static const struct file_operations rt_cache_seq_fops
= {
247 .owner
= THIS_MODULE
,
248 .open
= rt_cache_seq_open
,
251 .release
= seq_release
,
255 static void *rt_cpu_seq_start(struct seq_file
*seq
, loff_t
*pos
)
260 return SEQ_START_TOKEN
;
262 for (cpu
= *pos
-1; cpu
< nr_cpu_ids
; ++cpu
) {
263 if (!cpu_possible(cpu
))
266 return &per_cpu(rt_cache_stat
, cpu
);
271 static void *rt_cpu_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
275 for (cpu
= *pos
; cpu
< nr_cpu_ids
; ++cpu
) {
276 if (!cpu_possible(cpu
))
279 return &per_cpu(rt_cache_stat
, cpu
);
285 static void rt_cpu_seq_stop(struct seq_file
*seq
, void *v
)
290 static int rt_cpu_seq_show(struct seq_file
*seq
, void *v
)
292 struct rt_cache_stat
*st
= v
;
294 if (v
== SEQ_START_TOKEN
) {
295 seq_printf(seq
, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
299 seq_printf(seq
,"%08x %08x %08x %08x %08x %08x %08x %08x "
300 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
301 dst_entries_get_slow(&ipv4_dst_ops
),
314 0, /* st->gc_total */
315 0, /* st->gc_ignored */
316 0, /* st->gc_goal_miss */
317 0, /* st->gc_dst_overflow */
318 0, /* st->in_hlist_search */
319 0 /* st->out_hlist_search */
324 static const struct seq_operations rt_cpu_seq_ops
= {
325 .start
= rt_cpu_seq_start
,
326 .next
= rt_cpu_seq_next
,
327 .stop
= rt_cpu_seq_stop
,
328 .show
= rt_cpu_seq_show
,
332 static int rt_cpu_seq_open(struct inode
*inode
, struct file
*file
)
334 return seq_open(file
, &rt_cpu_seq_ops
);
337 static const struct file_operations rt_cpu_seq_fops
= {
338 .owner
= THIS_MODULE
,
339 .open
= rt_cpu_seq_open
,
342 .release
= seq_release
,
345 #ifdef CONFIG_IP_ROUTE_CLASSID
346 static int rt_acct_proc_show(struct seq_file
*m
, void *v
)
348 struct ip_rt_acct
*dst
, *src
;
351 dst
= kcalloc(256, sizeof(struct ip_rt_acct
), GFP_KERNEL
);
355 for_each_possible_cpu(i
) {
356 src
= (struct ip_rt_acct
*)per_cpu_ptr(ip_rt_acct
, i
);
357 for (j
= 0; j
< 256; j
++) {
358 dst
[j
].o_bytes
+= src
[j
].o_bytes
;
359 dst
[j
].o_packets
+= src
[j
].o_packets
;
360 dst
[j
].i_bytes
+= src
[j
].i_bytes
;
361 dst
[j
].i_packets
+= src
[j
].i_packets
;
365 seq_write(m
, dst
, 256 * sizeof(struct ip_rt_acct
));
370 static int rt_acct_proc_open(struct inode
*inode
, struct file
*file
)
372 return single_open(file
, rt_acct_proc_show
, NULL
);
375 static const struct file_operations rt_acct_proc_fops
= {
376 .owner
= THIS_MODULE
,
377 .open
= rt_acct_proc_open
,
380 .release
= single_release
,
384 static int __net_init
ip_rt_do_proc_init(struct net
*net
)
386 struct proc_dir_entry
*pde
;
388 pde
= proc_create("rt_cache", S_IRUGO
, net
->proc_net
,
393 pde
= proc_create("rt_cache", S_IRUGO
,
394 net
->proc_net_stat
, &rt_cpu_seq_fops
);
398 #ifdef CONFIG_IP_ROUTE_CLASSID
399 pde
= proc_create("rt_acct", 0, net
->proc_net
, &rt_acct_proc_fops
);
405 #ifdef CONFIG_IP_ROUTE_CLASSID
407 remove_proc_entry("rt_cache", net
->proc_net_stat
);
410 remove_proc_entry("rt_cache", net
->proc_net
);
415 static void __net_exit
ip_rt_do_proc_exit(struct net
*net
)
417 remove_proc_entry("rt_cache", net
->proc_net_stat
);
418 remove_proc_entry("rt_cache", net
->proc_net
);
419 #ifdef CONFIG_IP_ROUTE_CLASSID
420 remove_proc_entry("rt_acct", net
->proc_net
);
424 static struct pernet_operations ip_rt_proc_ops __net_initdata
= {
425 .init
= ip_rt_do_proc_init
,
426 .exit
= ip_rt_do_proc_exit
,
429 static int __init
ip_rt_proc_init(void)
431 return register_pernet_subsys(&ip_rt_proc_ops
);
435 static inline int ip_rt_proc_init(void)
439 #endif /* CONFIG_PROC_FS */
441 static inline bool rt_is_expired(const struct rtable
*rth
)
443 return rth
->rt_genid
!= rt_genid_ipv4(dev_net(rth
->dst
.dev
));
/* Invalidate every cached IPv4 route in @net.  No table walk is needed:
 * bumping the namespace generation id makes rt_is_expired() true for all
 * existing entries, which are then lazily discarded on next use.
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
451 static struct neighbour
*ipv4_neigh_lookup(const struct dst_entry
*dst
,
455 struct net_device
*dev
= dst
->dev
;
456 const __be32
*pkey
= daddr
;
457 const struct rtable
*rt
;
460 rt
= (const struct rtable
*) dst
;
462 pkey
= (const __be32
*) &rt
->rt_gateway
;
464 pkey
= &ip_hdr(skb
)->daddr
;
466 n
= __ipv4_neigh_lookup(dev
, *(__force u32
*)pkey
);
469 return neigh_create(&arp_tbl
, pkey
, dev
);
472 static void ipv4_confirm_neigh(const struct dst_entry
*dst
, const void *daddr
)
474 struct net_device
*dev
= dst
->dev
;
475 const __be32
*pkey
= daddr
;
476 const struct rtable
*rt
;
478 rt
= (const struct rtable
*)dst
;
480 pkey
= (const __be32
*)&rt
->rt_gateway
;
483 (RTCF_MULTICAST
| RTCF_BROADCAST
| RTCF_LOCAL
)))
486 __ipv4_confirm_neigh(dev
, *(__force u32
*)pkey
);
489 #define IP_IDENTS_SZ 2048u
491 static atomic_t
*ip_idents __read_mostly
;
492 static u32
*ip_tstamps __read_mostly
;
494 /* In order to protect privacy, we add a perturbation to identifiers
495 * if one generator is seldom used. This makes hard for an attacker
496 * to infer how many packets were sent between two points in time.
498 u32
ip_idents_reserve(u32 hash
, int segs
)
500 u32
*p_tstamp
= ip_tstamps
+ hash
% IP_IDENTS_SZ
;
501 atomic_t
*p_id
= ip_idents
+ hash
% IP_IDENTS_SZ
;
502 u32 old
= READ_ONCE(*p_tstamp
);
503 u32 now
= (u32
)jiffies
;
506 if (old
!= now
&& cmpxchg(p_tstamp
, old
, now
) == old
)
507 delta
= prandom_u32_max(now
- old
);
509 /* Do not use atomic_add_return() as it makes UBSAN unhappy */
511 old
= (u32
)atomic_read(p_id
);
512 new = old
+ delta
+ segs
;
513 } while (atomic_cmpxchg(p_id
, old
, new) != old
);
517 EXPORT_SYMBOL(ip_idents_reserve
);
519 void __ip_select_ident(struct net
*net
, struct iphdr
*iph
, int segs
)
523 /* Note the following code is not safe, but this is okay. */
524 if (unlikely(siphash_key_is_zero(&net
->ipv4
.ip_id_key
)))
525 get_random_bytes(&net
->ipv4
.ip_id_key
,
526 sizeof(net
->ipv4
.ip_id_key
));
528 hash
= siphash_3u32((__force u32
)iph
->daddr
,
529 (__force u32
)iph
->saddr
,
531 &net
->ipv4
.ip_id_key
);
532 id
= ip_idents_reserve(hash
, segs
);
535 EXPORT_SYMBOL(__ip_select_ident
);
537 static void __build_flow_key(const struct net
*net
, struct flowi4
*fl4
,
538 const struct sock
*sk
,
539 const struct iphdr
*iph
,
541 u8 prot
, u32 mark
, int flow_flags
)
544 const struct inet_sock
*inet
= inet_sk(sk
);
546 oif
= sk
->sk_bound_dev_if
;
548 tos
= RT_CONN_FLAGS(sk
);
549 prot
= inet
->hdrincl
? IPPROTO_RAW
: sk
->sk_protocol
;
551 flowi4_init_output(fl4
, oif
, mark
, tos
,
552 RT_SCOPE_UNIVERSE
, prot
,
554 iph
->daddr
, iph
->saddr
, 0, 0,
555 sock_net_uid(net
, sk
));
558 static void build_skb_flow_key(struct flowi4
*fl4
, const struct sk_buff
*skb
,
559 const struct sock
*sk
)
561 const struct net
*net
= dev_net(skb
->dev
);
562 const struct iphdr
*iph
= ip_hdr(skb
);
563 int oif
= skb
->dev
->ifindex
;
564 u8 tos
= RT_TOS(iph
->tos
);
565 u8 prot
= iph
->protocol
;
566 u32 mark
= skb
->mark
;
568 __build_flow_key(net
, fl4
, sk
, iph
, oif
, tos
, prot
, mark
, 0);
571 static void build_sk_flow_key(struct flowi4
*fl4
, const struct sock
*sk
)
573 const struct inet_sock
*inet
= inet_sk(sk
);
574 const struct ip_options_rcu
*inet_opt
;
575 __be32 daddr
= inet
->inet_daddr
;
578 inet_opt
= rcu_dereference(inet
->inet_opt
);
579 if (inet_opt
&& inet_opt
->opt
.srr
)
580 daddr
= inet_opt
->opt
.faddr
;
581 flowi4_init_output(fl4
, sk
->sk_bound_dev_if
, sk
->sk_mark
,
582 RT_CONN_FLAGS(sk
), RT_SCOPE_UNIVERSE
,
583 inet
->hdrincl
? IPPROTO_RAW
: sk
->sk_protocol
,
584 inet_sk_flowi_flags(sk
),
585 daddr
, inet
->inet_saddr
, 0, 0, sk
->sk_uid
);
589 static void ip_rt_build_flow_key(struct flowi4
*fl4
, const struct sock
*sk
,
590 const struct sk_buff
*skb
)
593 build_skb_flow_key(fl4
, skb
, sk
);
595 build_sk_flow_key(fl4
, sk
);
598 static DEFINE_SPINLOCK(fnhe_lock
);
600 static void fnhe_flush_routes(struct fib_nh_exception
*fnhe
)
604 rt
= rcu_dereference(fnhe
->fnhe_rth_input
);
606 RCU_INIT_POINTER(fnhe
->fnhe_rth_input
, NULL
);
607 dst_dev_put(&rt
->dst
);
608 dst_release(&rt
->dst
);
610 rt
= rcu_dereference(fnhe
->fnhe_rth_output
);
612 RCU_INIT_POINTER(fnhe
->fnhe_rth_output
, NULL
);
613 dst_dev_put(&rt
->dst
);
614 dst_release(&rt
->dst
);
618 static struct fib_nh_exception
*fnhe_oldest(struct fnhe_hash_bucket
*hash
)
620 struct fib_nh_exception
*fnhe
, *oldest
;
622 oldest
= rcu_dereference(hash
->chain
);
623 for (fnhe
= rcu_dereference(oldest
->fnhe_next
); fnhe
;
624 fnhe
= rcu_dereference(fnhe
->fnhe_next
)) {
625 if (time_before(fnhe
->fnhe_stamp
, oldest
->fnhe_stamp
))
628 fnhe_flush_routes(oldest
);
632 static inline u32
fnhe_hashfun(__be32 daddr
)
634 static u32 fnhe_hashrnd __read_mostly
;
637 net_get_random_once(&fnhe_hashrnd
, sizeof(fnhe_hashrnd
));
638 hval
= jhash_1word((__force u32
) daddr
, fnhe_hashrnd
);
639 return hash_32(hval
, FNHE_HASH_SHIFT
);
642 static void fill_route_from_fnhe(struct rtable
*rt
, struct fib_nh_exception
*fnhe
)
644 rt
->rt_pmtu
= fnhe
->fnhe_pmtu
;
645 rt
->rt_mtu_locked
= fnhe
->fnhe_mtu_locked
;
646 rt
->dst
.expires
= fnhe
->fnhe_expires
;
649 rt
->rt_flags
|= RTCF_REDIRECTED
;
650 rt
->rt_gateway
= fnhe
->fnhe_gw
;
651 rt
->rt_uses_gateway
= 1;
655 static void update_or_create_fnhe(struct fib_nh
*nh
, __be32 daddr
, __be32 gw
,
656 u32 pmtu
, bool lock
, unsigned long expires
)
658 struct fnhe_hash_bucket
*hash
;
659 struct fib_nh_exception
*fnhe
;
665 genid
= fnhe_genid(dev_net(nh
->nh_dev
));
666 hval
= fnhe_hashfun(daddr
);
668 spin_lock_bh(&fnhe_lock
);
670 hash
= rcu_dereference(nh
->nh_exceptions
);
672 hash
= kzalloc(FNHE_HASH_SIZE
* sizeof(*hash
), GFP_ATOMIC
);
675 rcu_assign_pointer(nh
->nh_exceptions
, hash
);
681 for (fnhe
= rcu_dereference(hash
->chain
); fnhe
;
682 fnhe
= rcu_dereference(fnhe
->fnhe_next
)) {
683 if (fnhe
->fnhe_daddr
== daddr
)
689 if (fnhe
->fnhe_genid
!= genid
)
690 fnhe
->fnhe_genid
= genid
;
694 fnhe
->fnhe_pmtu
= pmtu
;
695 fnhe
->fnhe_mtu_locked
= lock
;
697 fnhe
->fnhe_expires
= max(1UL, expires
);
698 /* Update all cached dsts too */
699 rt
= rcu_dereference(fnhe
->fnhe_rth_input
);
701 fill_route_from_fnhe(rt
, fnhe
);
702 rt
= rcu_dereference(fnhe
->fnhe_rth_output
);
704 fill_route_from_fnhe(rt
, fnhe
);
706 if (depth
> FNHE_RECLAIM_DEPTH
)
707 fnhe
= fnhe_oldest(hash
);
709 fnhe
= kzalloc(sizeof(*fnhe
), GFP_ATOMIC
);
713 fnhe
->fnhe_next
= hash
->chain
;
714 rcu_assign_pointer(hash
->chain
, fnhe
);
716 fnhe
->fnhe_genid
= genid
;
717 fnhe
->fnhe_daddr
= daddr
;
719 fnhe
->fnhe_pmtu
= pmtu
;
720 fnhe
->fnhe_mtu_locked
= lock
;
721 fnhe
->fnhe_expires
= max(1UL, expires
);
723 /* Exception created; mark the cached routes for the nexthop
724 * stale, so anyone caching it rechecks if this exception
727 rt
= rcu_dereference(nh
->nh_rth_input
);
729 rt
->dst
.obsolete
= DST_OBSOLETE_KILL
;
731 for_each_possible_cpu(i
) {
732 struct rtable __rcu
**prt
;
733 prt
= per_cpu_ptr(nh
->nh_pcpu_rth_output
, i
);
734 rt
= rcu_dereference(*prt
);
736 rt
->dst
.obsolete
= DST_OBSOLETE_KILL
;
740 fnhe
->fnhe_stamp
= jiffies
;
743 spin_unlock_bh(&fnhe_lock
);
746 static void __ip_do_redirect(struct rtable
*rt
, struct sk_buff
*skb
, struct flowi4
*fl4
,
749 __be32 new_gw
= icmp_hdr(skb
)->un
.gateway
;
750 __be32 old_gw
= ip_hdr(skb
)->saddr
;
751 struct net_device
*dev
= skb
->dev
;
752 struct in_device
*in_dev
;
753 struct fib_result res
;
757 switch (icmp_hdr(skb
)->code
& 7) {
759 case ICMP_REDIR_NETTOS
:
760 case ICMP_REDIR_HOST
:
761 case ICMP_REDIR_HOSTTOS
:
768 if (rt
->rt_gateway
!= old_gw
)
771 in_dev
= __in_dev_get_rcu(dev
);
776 if (new_gw
== old_gw
|| !IN_DEV_RX_REDIRECTS(in_dev
) ||
777 ipv4_is_multicast(new_gw
) || ipv4_is_lbcast(new_gw
) ||
778 ipv4_is_zeronet(new_gw
))
779 goto reject_redirect
;
781 if (!IN_DEV_SHARED_MEDIA(in_dev
)) {
782 if (!inet_addr_onlink(in_dev
, new_gw
, old_gw
))
783 goto reject_redirect
;
784 if (IN_DEV_SEC_REDIRECTS(in_dev
) && ip_fib_check_default(new_gw
, dev
))
785 goto reject_redirect
;
787 if (inet_addr_type(net
, new_gw
) != RTN_UNICAST
)
788 goto reject_redirect
;
791 n
= __ipv4_neigh_lookup(rt
->dst
.dev
, new_gw
);
793 n
= neigh_create(&arp_tbl
, &new_gw
, rt
->dst
.dev
);
795 if (!(n
->nud_state
& NUD_VALID
)) {
796 neigh_event_send(n
, NULL
);
798 if (fib_lookup(net
, fl4
, &res
, 0) == 0) {
799 struct fib_nh
*nh
= &FIB_RES_NH(res
);
801 update_or_create_fnhe(nh
, fl4
->daddr
, new_gw
,
803 jiffies
+ ip_rt_gc_timeout
);
806 rt
->dst
.obsolete
= DST_OBSOLETE_KILL
;
807 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE
, n
);
814 #ifdef CONFIG_IP_ROUTE_VERBOSE
815 if (IN_DEV_LOG_MARTIANS(in_dev
)) {
816 const struct iphdr
*iph
= (const struct iphdr
*) skb
->data
;
817 __be32 daddr
= iph
->daddr
;
818 __be32 saddr
= iph
->saddr
;
820 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
821 " Advised path = %pI4 -> %pI4\n",
822 &old_gw
, dev
->name
, &new_gw
,
829 static void ip_do_redirect(struct dst_entry
*dst
, struct sock
*sk
, struct sk_buff
*skb
)
833 const struct iphdr
*iph
= (const struct iphdr
*) skb
->data
;
834 struct net
*net
= dev_net(skb
->dev
);
835 int oif
= skb
->dev
->ifindex
;
836 u8 tos
= RT_TOS(iph
->tos
);
837 u8 prot
= iph
->protocol
;
838 u32 mark
= skb
->mark
;
840 rt
= (struct rtable
*) dst
;
842 __build_flow_key(net
, &fl4
, sk
, iph
, oif
, tos
, prot
, mark
, 0);
843 __ip_do_redirect(rt
, skb
, &fl4
, true);
846 static struct dst_entry
*ipv4_negative_advice(struct dst_entry
*dst
)
848 struct rtable
*rt
= (struct rtable
*)dst
;
849 struct dst_entry
*ret
= dst
;
852 if (dst
->obsolete
> 0) {
855 } else if ((rt
->rt_flags
& RTCF_REDIRECTED
) ||
866 * 1. The first ip_rt_redirect_number redirects are sent
867 * with exponential backoff, then we stop sending them at all,
868 * assuming that the host ignores our redirects.
869 * 2. If we did not see packets requiring redirects
870 * during ip_rt_redirect_silence, we assume that the host
871 * forgot redirected route and start to send redirects again.
873 * This algorithm is much cheaper and more intelligent than dumb load limiting
876 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
877 * and "frag. need" (breaks PMTU discovery) in icmp.c.
880 void ip_rt_send_redirect(struct sk_buff
*skb
)
882 struct rtable
*rt
= skb_rtable(skb
);
883 struct in_device
*in_dev
;
884 struct inet_peer
*peer
;
890 in_dev
= __in_dev_get_rcu(rt
->dst
.dev
);
891 if (!in_dev
|| !IN_DEV_TX_REDIRECTS(in_dev
)) {
895 log_martians
= IN_DEV_LOG_MARTIANS(in_dev
);
896 vif
= l3mdev_master_ifindex_rcu(rt
->dst
.dev
);
899 net
= dev_net(rt
->dst
.dev
);
900 peer
= inet_getpeer_v4(net
->ipv4
.peers
, ip_hdr(skb
)->saddr
, vif
, 1);
902 icmp_send(skb
, ICMP_REDIRECT
, ICMP_REDIR_HOST
,
903 rt_nexthop(rt
, ip_hdr(skb
)->daddr
));
907 /* No redirected packets during ip_rt_redirect_silence;
908 * reset the algorithm.
910 if (time_after(jiffies
, peer
->rate_last
+ ip_rt_redirect_silence
)) {
911 peer
->rate_tokens
= 0;
912 peer
->n_redirects
= 0;
915 /* Too many ignored redirects; do not send anything
916 * set dst.rate_last to the last seen redirected packet.
918 if (peer
->n_redirects
>= ip_rt_redirect_number
) {
919 peer
->rate_last
= jiffies
;
923 /* Check for load limit; set rate_last to the latest sent
926 if (peer
->rate_tokens
== 0 ||
929 (ip_rt_redirect_load
<< peer
->n_redirects
)))) {
930 __be32 gw
= rt_nexthop(rt
, ip_hdr(skb
)->daddr
);
932 icmp_send(skb
, ICMP_REDIRECT
, ICMP_REDIR_HOST
, gw
);
933 peer
->rate_last
= jiffies
;
935 #ifdef CONFIG_IP_ROUTE_VERBOSE
937 peer
->n_redirects
== ip_rt_redirect_number
)
938 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
939 &ip_hdr(skb
)->saddr
, inet_iif(skb
),
940 &ip_hdr(skb
)->daddr
, &gw
);
947 static int ip_error(struct sk_buff
*skb
)
949 struct in_device
*in_dev
= __in_dev_get_rcu(skb
->dev
);
950 struct rtable
*rt
= skb_rtable(skb
);
951 struct inet_peer
*peer
;
957 /* IP on this device is disabled. */
961 net
= dev_net(rt
->dst
.dev
);
962 if (!IN_DEV_FORWARD(in_dev
)) {
963 switch (rt
->dst
.error
) {
965 __IP_INC_STATS(net
, IPSTATS_MIB_INADDRERRORS
);
969 __IP_INC_STATS(net
, IPSTATS_MIB_INNOROUTES
);
975 switch (rt
->dst
.error
) {
980 code
= ICMP_HOST_UNREACH
;
983 code
= ICMP_NET_UNREACH
;
984 __IP_INC_STATS(net
, IPSTATS_MIB_INNOROUTES
);
987 code
= ICMP_PKT_FILTERED
;
991 peer
= inet_getpeer_v4(net
->ipv4
.peers
, ip_hdr(skb
)->saddr
,
992 l3mdev_master_ifindex(skb
->dev
), 1);
997 peer
->rate_tokens
+= now
- peer
->rate_last
;
998 if (peer
->rate_tokens
> ip_rt_error_burst
)
999 peer
->rate_tokens
= ip_rt_error_burst
;
1000 peer
->rate_last
= now
;
1001 if (peer
->rate_tokens
>= ip_rt_error_cost
)
1002 peer
->rate_tokens
-= ip_rt_error_cost
;
1008 icmp_send(skb
, ICMP_DEST_UNREACH
, code
, 0);
1010 out
: kfree_skb(skb
);
1014 static void __ip_rt_update_pmtu(struct rtable
*rt
, struct flowi4
*fl4
, u32 mtu
)
1016 struct dst_entry
*dst
= &rt
->dst
;
1017 u32 old_mtu
= ipv4_mtu(dst
);
1018 struct fib_result res
;
1021 if (ip_mtu_locked(dst
))
1027 if (mtu
< ip_rt_min_pmtu
) {
1029 mtu
= min(old_mtu
, ip_rt_min_pmtu
);
1032 if (rt
->rt_pmtu
== mtu
&& !lock
&&
1033 time_before(jiffies
, dst
->expires
- ip_rt_mtu_expires
/ 2))
1037 if (fib_lookup(dev_net(dst
->dev
), fl4
, &res
, 0) == 0) {
1038 struct fib_nh
*nh
= &FIB_RES_NH(res
);
1040 update_or_create_fnhe(nh
, fl4
->daddr
, 0, mtu
, lock
,
1041 jiffies
+ ip_rt_mtu_expires
);
1046 static void ip_rt_update_pmtu(struct dst_entry
*dst
, struct sock
*sk
,
1047 struct sk_buff
*skb
, u32 mtu
,
1050 struct rtable
*rt
= (struct rtable
*) dst
;
1053 ip_rt_build_flow_key(&fl4
, sk
, skb
);
1054 __ip_rt_update_pmtu(rt
, &fl4
, mtu
);
1057 void ipv4_update_pmtu(struct sk_buff
*skb
, struct net
*net
, u32 mtu
,
1058 int oif
, u32 mark
, u8 protocol
, int flow_flags
)
1060 const struct iphdr
*iph
= (const struct iphdr
*) skb
->data
;
1065 mark
= IP4_REPLY_MARK(net
, skb
->mark
);
1067 __build_flow_key(net
, &fl4
, NULL
, iph
, oif
,
1068 RT_TOS(iph
->tos
), protocol
, mark
, flow_flags
);
1069 rt
= __ip_route_output_key(net
, &fl4
);
1071 __ip_rt_update_pmtu(rt
, &fl4
, mtu
);
1075 EXPORT_SYMBOL_GPL(ipv4_update_pmtu
);
1077 static void __ipv4_sk_update_pmtu(struct sk_buff
*skb
, struct sock
*sk
, u32 mtu
)
1079 const struct iphdr
*iph
= (const struct iphdr
*) skb
->data
;
1083 __build_flow_key(sock_net(sk
), &fl4
, sk
, iph
, 0, 0, 0, 0, 0);
1085 if (!fl4
.flowi4_mark
)
1086 fl4
.flowi4_mark
= IP4_REPLY_MARK(sock_net(sk
), skb
->mark
);
1088 rt
= __ip_route_output_key(sock_net(sk
), &fl4
);
1090 __ip_rt_update_pmtu(rt
, &fl4
, mtu
);
1095 void ipv4_sk_update_pmtu(struct sk_buff
*skb
, struct sock
*sk
, u32 mtu
)
1097 const struct iphdr
*iph
= (const struct iphdr
*) skb
->data
;
1100 struct dst_entry
*odst
= NULL
;
1102 struct net
*net
= sock_net(sk
);
1106 if (!ip_sk_accept_pmtu(sk
))
1109 odst
= sk_dst_get(sk
);
1111 if (sock_owned_by_user(sk
) || !odst
) {
1112 __ipv4_sk_update_pmtu(skb
, sk
, mtu
);
1116 __build_flow_key(net
, &fl4
, sk
, iph
, 0, 0, 0, 0, 0);
1118 rt
= (struct rtable
*)odst
;
1119 if (odst
->obsolete
&& !odst
->ops
->check(odst
, 0)) {
1120 rt
= ip_route_output_flow(sock_net(sk
), &fl4
, sk
);
1127 __ip_rt_update_pmtu((struct rtable
*) rt
->dst
.path
, &fl4
, mtu
);
1129 if (!dst_check(&rt
->dst
, 0)) {
1131 dst_release(&rt
->dst
);
1133 rt
= ip_route_output_flow(sock_net(sk
), &fl4
, sk
);
1141 sk_dst_set(sk
, &rt
->dst
);
1147 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu
);
1149 void ipv4_redirect(struct sk_buff
*skb
, struct net
*net
,
1150 int oif
, u32 mark
, u8 protocol
, int flow_flags
)
1152 const struct iphdr
*iph
= (const struct iphdr
*) skb
->data
;
1156 __build_flow_key(net
, &fl4
, NULL
, iph
, oif
,
1157 RT_TOS(iph
->tos
), protocol
, mark
, flow_flags
);
1158 rt
= __ip_route_output_key(net
, &fl4
);
1160 __ip_do_redirect(rt
, skb
, &fl4
, false);
1164 EXPORT_SYMBOL_GPL(ipv4_redirect
);
1166 void ipv4_sk_redirect(struct sk_buff
*skb
, struct sock
*sk
)
1168 const struct iphdr
*iph
= (const struct iphdr
*) skb
->data
;
1171 struct net
*net
= sock_net(sk
);
1173 __build_flow_key(net
, &fl4
, sk
, iph
, 0, 0, 0, 0, 0);
1174 rt
= __ip_route_output_key(net
, &fl4
);
1176 __ip_do_redirect(rt
, skb
, &fl4
, false);
1180 EXPORT_SYMBOL_GPL(ipv4_sk_redirect
);
1182 static struct dst_entry
*ipv4_dst_check(struct dst_entry
*dst
, u32 cookie
)
1184 struct rtable
*rt
= (struct rtable
*) dst
;
1186 /* All IPV4 dsts are created with ->obsolete set to the value
1187 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1188 * into this function always.
1190 * When a PMTU/redirect information update invalidates a route,
1191 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1192 * DST_OBSOLETE_DEAD by dst_free().
1194 if (dst
->obsolete
!= DST_OBSOLETE_FORCE_CHK
|| rt_is_expired(rt
))
1199 static void ipv4_send_dest_unreach(struct sk_buff
*skb
)
1201 struct ip_options opt
;
1204 /* Recompile ip options since IPCB may not be valid anymore.
1205 * Also check we have a reasonable ipv4 header.
1207 if (!pskb_network_may_pull(skb
, sizeof(struct iphdr
)) ||
1208 ip_hdr(skb
)->version
!= 4 || ip_hdr(skb
)->ihl
< 5)
1211 memset(&opt
, 0, sizeof(opt
));
1212 if (ip_hdr(skb
)->ihl
> 5) {
1213 if (!pskb_network_may_pull(skb
, ip_hdr(skb
)->ihl
* 4))
1215 opt
.optlen
= ip_hdr(skb
)->ihl
* 4 - sizeof(struct iphdr
);
1218 res
= __ip_options_compile(dev_net(skb
->dev
), &opt
, skb
, NULL
);
1224 __icmp_send(skb
, ICMP_DEST_UNREACH
, ICMP_HOST_UNREACH
, 0, &opt
);
1227 static void ipv4_link_failure(struct sk_buff
*skb
)
1231 ipv4_send_dest_unreach(skb
);
1233 rt
= skb_rtable(skb
);
1235 dst_set_expires(&rt
->dst
, 0);
1238 static int ip_rt_bug(struct net
*net
, struct sock
*sk
, struct sk_buff
*skb
)
1240 pr_debug("%s: %pI4 -> %pI4, %s\n",
1241 __func__
, &ip_hdr(skb
)->saddr
, &ip_hdr(skb
)->daddr
,
1242 skb
->dev
? skb
->dev
->name
: "?");
1249 We do not cache source address of outgoing interface,
1250 because it is used only by IP RR, TS and SRR options,
1251 so that it out of fast path.
1253 BTW remember: "addr" is allowed to be not aligned
1257 void ip_rt_get_source(u8
*addr
, struct sk_buff
*skb
, struct rtable
*rt
)
1261 if (rt_is_output_route(rt
))
1262 src
= ip_hdr(skb
)->saddr
;
1264 struct fib_result res
;
1270 memset(&fl4
, 0, sizeof(fl4
));
1271 fl4
.daddr
= iph
->daddr
;
1272 fl4
.saddr
= iph
->saddr
;
1273 fl4
.flowi4_tos
= RT_TOS(iph
->tos
);
1274 fl4
.flowi4_oif
= rt
->dst
.dev
->ifindex
;
1275 fl4
.flowi4_iif
= skb
->dev
->ifindex
;
1276 fl4
.flowi4_mark
= skb
->mark
;
1279 if (fib_lookup(dev_net(rt
->dst
.dev
), &fl4
, &res
, 0) == 0)
1280 src
= FIB_RES_PREFSRC(dev_net(rt
->dst
.dev
), res
);
1282 src
= inet_select_addr(rt
->dst
.dev
,
1283 rt_nexthop(rt
, iph
->daddr
),
1287 memcpy(addr
, &src
, 4);
1290 #ifdef CONFIG_IP_ROUTE_CLASSID
1291 static void set_class_tag(struct rtable
*rt
, u32 tag
)
1293 if (!(rt
->dst
.tclassid
& 0xFFFF))
1294 rt
->dst
.tclassid
|= tag
& 0xFFFF;
1295 if (!(rt
->dst
.tclassid
& 0xFFFF0000))
1296 rt
->dst
.tclassid
|= tag
& 0xFFFF0000;
1300 static unsigned int ipv4_default_advmss(const struct dst_entry
*dst
)
1302 unsigned int header_size
= sizeof(struct tcphdr
) + sizeof(struct iphdr
);
1303 unsigned int advmss
= max_t(unsigned int, ipv4_mtu(dst
) - header_size
,
1306 return min(advmss
, IPV4_MAX_PMTU
- header_size
);
1309 static unsigned int ipv4_mtu(const struct dst_entry
*dst
)
1311 const struct rtable
*rt
= (const struct rtable
*) dst
;
1312 unsigned int mtu
= rt
->rt_pmtu
;
1314 if (!mtu
|| time_after_eq(jiffies
, rt
->dst
.expires
))
1315 mtu
= dst_metric_raw(dst
, RTAX_MTU
);
1320 mtu
= READ_ONCE(dst
->dev
->mtu
);
1322 if (unlikely(ip_mtu_locked(dst
))) {
1323 if (rt
->rt_uses_gateway
&& mtu
> 576)
1327 mtu
= min_t(unsigned int, mtu
, IP_MAX_MTU
);
1329 return mtu
- lwtunnel_headroom(dst
->lwtstate
, mtu
);
1332 static void ip_del_fnhe(struct fib_nh
*nh
, __be32 daddr
)
1334 struct fnhe_hash_bucket
*hash
;
1335 struct fib_nh_exception
*fnhe
, __rcu
**fnhe_p
;
1336 u32 hval
= fnhe_hashfun(daddr
);
1338 spin_lock_bh(&fnhe_lock
);
1340 hash
= rcu_dereference_protected(nh
->nh_exceptions
,
1341 lockdep_is_held(&fnhe_lock
));
1344 fnhe_p
= &hash
->chain
;
1345 fnhe
= rcu_dereference_protected(*fnhe_p
, lockdep_is_held(&fnhe_lock
));
1347 if (fnhe
->fnhe_daddr
== daddr
) {
1348 rcu_assign_pointer(*fnhe_p
, rcu_dereference_protected(
1349 fnhe
->fnhe_next
, lockdep_is_held(&fnhe_lock
)));
1350 /* set fnhe_daddr to 0 to ensure it won't bind with
1351 * new dsts in rt_bind_exception().
1353 fnhe
->fnhe_daddr
= 0;
1354 fnhe_flush_routes(fnhe
);
1355 kfree_rcu(fnhe
, rcu
);
1358 fnhe_p
= &fnhe
->fnhe_next
;
1359 fnhe
= rcu_dereference_protected(fnhe
->fnhe_next
,
1360 lockdep_is_held(&fnhe_lock
));
1363 spin_unlock_bh(&fnhe_lock
);
1366 static struct fib_nh_exception
*find_exception(struct fib_nh
*nh
, __be32 daddr
)
1368 struct fnhe_hash_bucket
*hash
= rcu_dereference(nh
->nh_exceptions
);
1369 struct fib_nh_exception
*fnhe
;
1375 hval
= fnhe_hashfun(daddr
);
1377 for (fnhe
= rcu_dereference(hash
[hval
].chain
); fnhe
;
1378 fnhe
= rcu_dereference(fnhe
->fnhe_next
)) {
1379 if (fnhe
->fnhe_daddr
== daddr
) {
1380 if (fnhe
->fnhe_expires
&&
1381 time_after(jiffies
, fnhe
->fnhe_expires
)) {
1382 ip_del_fnhe(nh
, daddr
);
1391 static bool rt_bind_exception(struct rtable
*rt
, struct fib_nh_exception
*fnhe
,
1392 __be32 daddr
, const bool do_cache
)
1396 spin_lock_bh(&fnhe_lock
);
1398 if (daddr
== fnhe
->fnhe_daddr
) {
1399 struct rtable __rcu
**porig
;
1400 struct rtable
*orig
;
1401 int genid
= fnhe_genid(dev_net(rt
->dst
.dev
));
1403 if (rt_is_input_route(rt
))
1404 porig
= &fnhe
->fnhe_rth_input
;
1406 porig
= &fnhe
->fnhe_rth_output
;
1407 orig
= rcu_dereference(*porig
);
1409 if (fnhe
->fnhe_genid
!= genid
) {
1410 fnhe
->fnhe_genid
= genid
;
1412 fnhe
->fnhe_pmtu
= 0;
1413 fnhe
->fnhe_expires
= 0;
1414 fnhe_flush_routes(fnhe
);
1417 fill_route_from_fnhe(rt
, fnhe
);
1418 if (!rt
->rt_gateway
)
1419 rt
->rt_gateway
= daddr
;
1423 rcu_assign_pointer(*porig
, rt
);
1425 dst_dev_put(&orig
->dst
);
1426 dst_release(&orig
->dst
);
1431 fnhe
->fnhe_stamp
= jiffies
;
1433 spin_unlock_bh(&fnhe_lock
);
1438 static bool rt_cache_route(struct fib_nh
*nh
, struct rtable
*rt
)
1440 struct rtable
*orig
, *prev
, **p
;
1443 if (rt_is_input_route(rt
)) {
1444 p
= (struct rtable
**)&nh
->nh_rth_input
;
1446 p
= (struct rtable
**)raw_cpu_ptr(nh
->nh_pcpu_rth_output
);
1450 /* hold dst before doing cmpxchg() to avoid race condition
1454 prev
= cmpxchg(p
, orig
, rt
);
1457 rt_add_uncached_list(orig
);
1458 dst_release(&orig
->dst
);
1461 dst_release(&rt
->dst
);
1468 struct uncached_list
{
1470 struct list_head head
;
1473 static DEFINE_PER_CPU_ALIGNED(struct uncached_list
, rt_uncached_list
);
1475 void rt_add_uncached_list(struct rtable
*rt
)
1477 struct uncached_list
*ul
= raw_cpu_ptr(&rt_uncached_list
);
1479 rt
->rt_uncached_list
= ul
;
1481 spin_lock_bh(&ul
->lock
);
1482 list_add_tail(&rt
->rt_uncached
, &ul
->head
);
1483 spin_unlock_bh(&ul
->lock
);
1486 void rt_del_uncached_list(struct rtable
*rt
)
1488 if (!list_empty(&rt
->rt_uncached
)) {
1489 struct uncached_list
*ul
= rt
->rt_uncached_list
;
1491 spin_lock_bh(&ul
->lock
);
1492 list_del(&rt
->rt_uncached
);
1493 spin_unlock_bh(&ul
->lock
);
1497 static void ipv4_dst_destroy(struct dst_entry
*dst
)
1499 struct dst_metrics
*p
= (struct dst_metrics
*)DST_METRICS_PTR(dst
);
1500 struct rtable
*rt
= (struct rtable
*)dst
;
1502 if (p
!= &dst_default_metrics
&& refcount_dec_and_test(&p
->refcnt
))
1505 rt_del_uncached_list(rt
);
1508 void rt_flush_dev(struct net_device
*dev
)
1510 struct net
*net
= dev_net(dev
);
1514 for_each_possible_cpu(cpu
) {
1515 struct uncached_list
*ul
= &per_cpu(rt_uncached_list
, cpu
);
1517 spin_lock_bh(&ul
->lock
);
1518 list_for_each_entry(rt
, &ul
->head
, rt_uncached
) {
1519 if (rt
->dst
.dev
!= dev
)
1521 rt
->dst
.dev
= net
->loopback_dev
;
1522 dev_hold(rt
->dst
.dev
);
1525 spin_unlock_bh(&ul
->lock
);
1529 static bool rt_cache_valid(const struct rtable
*rt
)
1532 rt
->dst
.obsolete
== DST_OBSOLETE_FORCE_CHK
&&
1536 static void rt_set_nexthop(struct rtable
*rt
, __be32 daddr
,
1537 const struct fib_result
*res
,
1538 struct fib_nh_exception
*fnhe
,
1539 struct fib_info
*fi
, u16 type
, u32 itag
,
1540 const bool do_cache
)
1542 bool cached
= false;
1545 struct fib_nh
*nh
= &FIB_RES_NH(*res
);
1547 if (nh
->nh_gw
&& nh
->nh_scope
== RT_SCOPE_LINK
) {
1548 rt
->rt_gateway
= nh
->nh_gw
;
1549 rt
->rt_uses_gateway
= 1;
1551 dst_init_metrics(&rt
->dst
, fi
->fib_metrics
->metrics
, true);
1552 if (fi
->fib_metrics
!= &dst_default_metrics
) {
1553 rt
->dst
._metrics
|= DST_METRICS_REFCOUNTED
;
1554 refcount_inc(&fi
->fib_metrics
->refcnt
);
1556 #ifdef CONFIG_IP_ROUTE_CLASSID
1557 rt
->dst
.tclassid
= nh
->nh_tclassid
;
1559 rt
->dst
.lwtstate
= lwtstate_get(nh
->nh_lwtstate
);
1561 cached
= rt_bind_exception(rt
, fnhe
, daddr
, do_cache
);
1563 cached
= rt_cache_route(nh
, rt
);
1564 if (unlikely(!cached
)) {
1565 /* Routes we intend to cache in nexthop exception or
1566 * FIB nexthop have the DST_NOCACHE bit clear.
1567 * However, if we are unsuccessful at storing this
1568 * route into the cache we really need to set it.
1570 if (!rt
->rt_gateway
)
1571 rt
->rt_gateway
= daddr
;
1572 rt_add_uncached_list(rt
);
1575 rt_add_uncached_list(rt
);
1577 #ifdef CONFIG_IP_ROUTE_CLASSID
1578 #ifdef CONFIG_IP_MULTIPLE_TABLES
1579 set_class_tag(rt
, res
->tclassid
);
1581 set_class_tag(rt
, itag
);
1585 struct rtable
*rt_dst_alloc(struct net_device
*dev
,
1586 unsigned int flags
, u16 type
,
1587 bool nopolicy
, bool noxfrm
, bool will_cache
)
1591 rt
= dst_alloc(&ipv4_dst_ops
, dev
, 1, DST_OBSOLETE_FORCE_CHK
,
1592 (will_cache
? 0 : DST_HOST
) |
1593 (nopolicy
? DST_NOPOLICY
: 0) |
1594 (noxfrm
? DST_NOXFRM
: 0));
1597 rt
->rt_genid
= rt_genid_ipv4(dev_net(dev
));
1598 rt
->rt_flags
= flags
;
1600 rt
->rt_is_input
= 0;
1603 rt
->rt_mtu_locked
= 0;
1605 rt
->rt_uses_gateway
= 0;
1606 rt
->rt_table_id
= 0;
1607 INIT_LIST_HEAD(&rt
->rt_uncached
);
1609 rt
->dst
.output
= ip_output
;
1610 if (flags
& RTCF_LOCAL
)
1611 rt
->dst
.input
= ip_local_deliver
;
1616 EXPORT_SYMBOL(rt_dst_alloc
);
1618 /* called in rcu_read_lock() section */
1619 int ip_mc_validate_source(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
1620 u8 tos
, struct net_device
*dev
,
1621 struct in_device
*in_dev
, u32
*itag
)
1625 /* Primary sanity checks. */
1629 if (ipv4_is_multicast(saddr
) || ipv4_is_lbcast(saddr
) ||
1630 skb
->protocol
!= htons(ETH_P_IP
))
1633 if (ipv4_is_loopback(saddr
) && !IN_DEV_ROUTE_LOCALNET(in_dev
))
1636 if (ipv4_is_zeronet(saddr
)) {
1637 if (!ipv4_is_local_multicast(daddr
))
1640 err
= fib_validate_source(skb
, saddr
, 0, tos
, 0, dev
,
1648 /* called in rcu_read_lock() section */
1649 static int ip_route_input_mc(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
1650 u8 tos
, struct net_device
*dev
, int our
)
1652 struct in_device
*in_dev
= __in_dev_get_rcu(dev
);
1653 unsigned int flags
= RTCF_MULTICAST
;
1658 err
= ip_mc_validate_source(skb
, daddr
, saddr
, tos
, dev
, in_dev
, &itag
);
1663 flags
|= RTCF_LOCAL
;
1665 rth
= rt_dst_alloc(dev_net(dev
)->loopback_dev
, flags
, RTN_MULTICAST
,
1666 IN_DEV_CONF_GET(in_dev
, NOPOLICY
), false, false);
1670 #ifdef CONFIG_IP_ROUTE_CLASSID
1671 rth
->dst
.tclassid
= itag
;
1673 rth
->dst
.output
= ip_rt_bug
;
1674 rth
->rt_is_input
= 1;
1676 #ifdef CONFIG_IP_MROUTE
1677 if (!ipv4_is_local_multicast(daddr
) && IN_DEV_MFORWARD(in_dev
))
1678 rth
->dst
.input
= ip_mr_input
;
1680 RT_CACHE_STAT_INC(in_slow_mc
);
1682 skb_dst_set(skb
, &rth
->dst
);
1687 static void ip_handle_martian_source(struct net_device
*dev
,
1688 struct in_device
*in_dev
,
1689 struct sk_buff
*skb
,
1693 RT_CACHE_STAT_INC(in_martian_src
);
1694 #ifdef CONFIG_IP_ROUTE_VERBOSE
1695 if (IN_DEV_LOG_MARTIANS(in_dev
) && net_ratelimit()) {
1697 * RFC1812 recommendation, if source is martian,
1698 * the only hint is MAC header.
1700 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1701 &daddr
, &saddr
, dev
->name
);
1702 if (dev
->hard_header_len
&& skb_mac_header_was_set(skb
)) {
1703 print_hex_dump(KERN_WARNING
, "ll header: ",
1704 DUMP_PREFIX_OFFSET
, 16, 1,
1705 skb_mac_header(skb
),
1706 dev
->hard_header_len
, true);
1712 static void set_lwt_redirect(struct rtable
*rth
)
1714 if (lwtunnel_output_redirect(rth
->dst
.lwtstate
)) {
1715 rth
->dst
.lwtstate
->orig_output
= rth
->dst
.output
;
1716 rth
->dst
.output
= lwtunnel_output
;
1719 if (lwtunnel_input_redirect(rth
->dst
.lwtstate
)) {
1720 rth
->dst
.lwtstate
->orig_input
= rth
->dst
.input
;
1721 rth
->dst
.input
= lwtunnel_input
;
1725 /* called in rcu_read_lock() section */
1726 static int __mkroute_input(struct sk_buff
*skb
,
1727 const struct fib_result
*res
,
1728 struct in_device
*in_dev
,
1729 __be32 daddr
, __be32 saddr
, u32 tos
)
1731 struct fib_nh_exception
*fnhe
;
1734 struct in_device
*out_dev
;
1738 /* get a working reference to the output device */
1739 out_dev
= __in_dev_get_rcu(FIB_RES_DEV(*res
));
1741 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1745 err
= fib_validate_source(skb
, saddr
, daddr
, tos
, FIB_RES_OIF(*res
),
1746 in_dev
->dev
, in_dev
, &itag
);
1748 ip_handle_martian_source(in_dev
->dev
, in_dev
, skb
, daddr
,
1754 do_cache
= res
->fi
&& !itag
;
1755 if (out_dev
== in_dev
&& err
&& IN_DEV_TX_REDIRECTS(out_dev
) &&
1756 skb
->protocol
== htons(ETH_P_IP
) &&
1757 (IN_DEV_SHARED_MEDIA(out_dev
) ||
1758 inet_addr_onlink(out_dev
, saddr
, FIB_RES_GW(*res
))))
1759 IPCB(skb
)->flags
|= IPSKB_DOREDIRECT
;
1761 if (skb
->protocol
!= htons(ETH_P_IP
)) {
1762 /* Not IP (i.e. ARP). Do not create route, if it is
1763 * invalid for proxy arp. DNAT routes are always valid.
1765 * Proxy arp feature have been extended to allow, ARP
1766 * replies back to the same interface, to support
1767 * Private VLAN switch technologies. See arp.c.
1769 if (out_dev
== in_dev
&&
1770 IN_DEV_PROXY_ARP_PVLAN(in_dev
) == 0) {
1776 fnhe
= find_exception(&FIB_RES_NH(*res
), daddr
);
1779 rth
= rcu_dereference(fnhe
->fnhe_rth_input
);
1781 rth
= rcu_dereference(FIB_RES_NH(*res
).nh_rth_input
);
1782 if (rt_cache_valid(rth
)) {
1783 skb_dst_set_noref(skb
, &rth
->dst
);
1788 rth
= rt_dst_alloc(out_dev
->dev
, 0, res
->type
,
1789 IN_DEV_CONF_GET(in_dev
, NOPOLICY
),
1790 IN_DEV_CONF_GET(out_dev
, NOXFRM
), do_cache
);
1796 rth
->rt_is_input
= 1;
1798 rth
->rt_table_id
= res
->table
->tb_id
;
1799 RT_CACHE_STAT_INC(in_slow_tot
);
1801 rth
->dst
.input
= ip_forward
;
1803 rt_set_nexthop(rth
, daddr
, res
, fnhe
, res
->fi
, res
->type
, itag
,
1805 set_lwt_redirect(rth
);
1806 skb_dst_set(skb
, &rth
->dst
);
1813 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1814 /* To make ICMP packets follow the right flow, the multipath hash is
1815 * calculated from the inner IP addresses.
1817 static void ip_multipath_l3_keys(const struct sk_buff
*skb
,
1818 struct flow_keys
*hash_keys
)
1820 const struct iphdr
*outer_iph
= ip_hdr(skb
);
1821 const struct iphdr
*inner_iph
;
1822 const struct icmphdr
*icmph
;
1823 struct iphdr _inner_iph
;
1824 struct icmphdr _icmph
;
1826 hash_keys
->addrs
.v4addrs
.src
= outer_iph
->saddr
;
1827 hash_keys
->addrs
.v4addrs
.dst
= outer_iph
->daddr
;
1828 if (likely(outer_iph
->protocol
!= IPPROTO_ICMP
))
1831 if (unlikely((outer_iph
->frag_off
& htons(IP_OFFSET
)) != 0))
1834 icmph
= skb_header_pointer(skb
, outer_iph
->ihl
* 4, sizeof(_icmph
),
1839 if (icmph
->type
!= ICMP_DEST_UNREACH
&&
1840 icmph
->type
!= ICMP_REDIRECT
&&
1841 icmph
->type
!= ICMP_TIME_EXCEEDED
&&
1842 icmph
->type
!= ICMP_PARAMETERPROB
)
1845 inner_iph
= skb_header_pointer(skb
,
1846 outer_iph
->ihl
* 4 + sizeof(_icmph
),
1847 sizeof(_inner_iph
), &_inner_iph
);
1850 hash_keys
->addrs
.v4addrs
.src
= inner_iph
->saddr
;
1851 hash_keys
->addrs
.v4addrs
.dst
= inner_iph
->daddr
;
1854 /* if skb is set it will be used and fl4 can be NULL */
1855 int fib_multipath_hash(const struct fib_info
*fi
, const struct flowi4
*fl4
,
1856 const struct sk_buff
*skb
)
1858 struct net
*net
= fi
->fib_net
;
1859 struct flow_keys hash_keys
;
1862 switch (net
->ipv4
.sysctl_fib_multipath_hash_policy
) {
1864 memset(&hash_keys
, 0, sizeof(hash_keys
));
1865 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV4_ADDRS
;
1867 ip_multipath_l3_keys(skb
, &hash_keys
);
1869 hash_keys
.addrs
.v4addrs
.src
= fl4
->saddr
;
1870 hash_keys
.addrs
.v4addrs
.dst
= fl4
->daddr
;
1874 /* skb is currently provided only when forwarding */
1876 unsigned int flag
= FLOW_DISSECTOR_F_STOP_AT_ENCAP
;
1877 struct flow_keys keys
;
1879 /* short-circuit if we already have L4 hash present */
1881 return skb_get_hash_raw(skb
) >> 1;
1882 memset(&hash_keys
, 0, sizeof(hash_keys
));
1883 skb_flow_dissect_flow_keys(skb
, &keys
, flag
);
1885 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV4_ADDRS
;
1886 hash_keys
.addrs
.v4addrs
.src
= keys
.addrs
.v4addrs
.src
;
1887 hash_keys
.addrs
.v4addrs
.dst
= keys
.addrs
.v4addrs
.dst
;
1888 hash_keys
.ports
.src
= keys
.ports
.src
;
1889 hash_keys
.ports
.dst
= keys
.ports
.dst
;
1890 hash_keys
.basic
.ip_proto
= keys
.basic
.ip_proto
;
1892 memset(&hash_keys
, 0, sizeof(hash_keys
));
1893 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV4_ADDRS
;
1894 hash_keys
.addrs
.v4addrs
.src
= fl4
->saddr
;
1895 hash_keys
.addrs
.v4addrs
.dst
= fl4
->daddr
;
1896 hash_keys
.ports
.src
= fl4
->fl4_sport
;
1897 hash_keys
.ports
.dst
= fl4
->fl4_dport
;
1898 hash_keys
.basic
.ip_proto
= fl4
->flowi4_proto
;
1902 mhash
= flow_hash_from_keys(&hash_keys
);
1906 EXPORT_SYMBOL_GPL(fib_multipath_hash
);
1907 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1909 static int ip_mkroute_input(struct sk_buff
*skb
,
1910 struct fib_result
*res
,
1911 struct in_device
*in_dev
,
1912 __be32 daddr
, __be32 saddr
, u32 tos
)
1914 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1915 if (res
->fi
&& res
->fi
->fib_nhs
> 1) {
1916 int h
= fib_multipath_hash(res
->fi
, NULL
, skb
);
1918 fib_select_multipath(res
, h
);
1922 /* create a routing cache entry */
1923 return __mkroute_input(skb
, res
, in_dev
, daddr
, saddr
, tos
);
1927 * NOTE. We drop all the packets that has local source
1928 * addresses, because every properly looped back packet
1929 * must have correct destination already attached by output routine.
1931 * Such approach solves two big problems:
1932 * 1. Not simplex devices are handled properly.
1933 * 2. IP spoofing attempts are filtered with 100% of guarantee.
1934 * called with rcu_read_lock()
1937 static int ip_route_input_slow(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
1938 u8 tos
, struct net_device
*dev
,
1939 struct fib_result
*res
)
1941 struct in_device
*in_dev
= __in_dev_get_rcu(dev
);
1942 struct ip_tunnel_info
*tun_info
;
1944 unsigned int flags
= 0;
1948 struct net
*net
= dev_net(dev
);
1951 /* IP on this device is disabled. */
1956 /* Check for the most weird martians, which can be not detected
1960 tun_info
= skb_tunnel_info(skb
);
1961 if (tun_info
&& !(tun_info
->mode
& IP_TUNNEL_INFO_TX
))
1962 fl4
.flowi4_tun_key
.tun_id
= tun_info
->key
.tun_id
;
1964 fl4
.flowi4_tun_key
.tun_id
= 0;
1967 if (ipv4_is_multicast(saddr
) || ipv4_is_lbcast(saddr
))
1968 goto martian_source
;
1972 if (ipv4_is_lbcast(daddr
) || (saddr
== 0 && daddr
== 0))
1975 /* Accept zero addresses only to limited broadcast;
1976 * I even do not know to fix it or not. Waiting for complains :-)
1978 if (ipv4_is_zeronet(saddr
))
1979 goto martian_source
;
1981 if (ipv4_is_zeronet(daddr
))
1982 goto martian_destination
;
1984 /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1985 * and call it once if daddr or/and saddr are loopback addresses
1987 if (ipv4_is_loopback(daddr
)) {
1988 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev
, net
))
1989 goto martian_destination
;
1990 } else if (ipv4_is_loopback(saddr
)) {
1991 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev
, net
))
1992 goto martian_source
;
1996 * Now we are ready to route packet.
1999 fl4
.flowi4_iif
= dev
->ifindex
;
2000 fl4
.flowi4_mark
= skb
->mark
;
2001 fl4
.flowi4_tos
= tos
;
2002 fl4
.flowi4_scope
= RT_SCOPE_UNIVERSE
;
2003 fl4
.flowi4_flags
= 0;
2006 fl4
.flowi4_uid
= sock_net_uid(net
, NULL
);
2007 err
= fib_lookup(net
, &fl4
, res
, 0);
2009 if (!IN_DEV_FORWARD(in_dev
))
2010 err
= -EHOSTUNREACH
;
2014 if (res
->type
== RTN_BROADCAST
)
2017 if (res
->type
== RTN_LOCAL
) {
2018 err
= fib_validate_source(skb
, saddr
, daddr
, tos
,
2019 0, dev
, in_dev
, &itag
);
2021 goto martian_source
;
2025 if (!IN_DEV_FORWARD(in_dev
)) {
2026 err
= -EHOSTUNREACH
;
2029 if (res
->type
!= RTN_UNICAST
)
2030 goto martian_destination
;
2032 err
= ip_mkroute_input(skb
, res
, in_dev
, daddr
, saddr
, tos
);
2036 if (skb
->protocol
!= htons(ETH_P_IP
))
2039 if (!ipv4_is_zeronet(saddr
)) {
2040 err
= fib_validate_source(skb
, saddr
, 0, tos
, 0, dev
,
2043 goto martian_source
;
2045 flags
|= RTCF_BROADCAST
;
2046 res
->type
= RTN_BROADCAST
;
2047 RT_CACHE_STAT_INC(in_brd
);
2053 rth
= rcu_dereference(FIB_RES_NH(*res
).nh_rth_input
);
2054 if (rt_cache_valid(rth
)) {
2055 skb_dst_set_noref(skb
, &rth
->dst
);
2063 rth
= rt_dst_alloc(l3mdev_master_dev_rcu(dev
) ? : net
->loopback_dev
,
2064 flags
| RTCF_LOCAL
, res
->type
,
2065 IN_DEV_CONF_GET(in_dev
, NOPOLICY
), false, do_cache
);
2069 rth
->dst
.output
= ip_rt_bug
;
2070 #ifdef CONFIG_IP_ROUTE_CLASSID
2071 rth
->dst
.tclassid
= itag
;
2073 rth
->rt_is_input
= 1;
2075 rth
->rt_table_id
= res
->table
->tb_id
;
2077 RT_CACHE_STAT_INC(in_slow_tot
);
2078 if (res
->type
== RTN_UNREACHABLE
) {
2079 rth
->dst
.input
= ip_error
;
2080 rth
->dst
.error
= -err
;
2081 rth
->rt_flags
&= ~RTCF_LOCAL
;
2085 struct fib_nh
*nh
= &FIB_RES_NH(*res
);
2087 rth
->dst
.lwtstate
= lwtstate_get(nh
->nh_lwtstate
);
2088 if (lwtunnel_input_redirect(rth
->dst
.lwtstate
)) {
2089 WARN_ON(rth
->dst
.input
== lwtunnel_input
);
2090 rth
->dst
.lwtstate
->orig_input
= rth
->dst
.input
;
2091 rth
->dst
.input
= lwtunnel_input
;
2094 if (unlikely(!rt_cache_route(nh
, rth
)))
2095 rt_add_uncached_list(rth
);
2097 skb_dst_set(skb
, &rth
->dst
);
2102 RT_CACHE_STAT_INC(in_no_route
);
2103 res
->type
= RTN_UNREACHABLE
;
2109 * Do not cache martian addresses: they should be logged (RFC1812)
2111 martian_destination
:
2112 RT_CACHE_STAT_INC(in_martian_dst
);
2113 #ifdef CONFIG_IP_ROUTE_VERBOSE
2114 if (IN_DEV_LOG_MARTIANS(in_dev
))
2115 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2116 &daddr
, &saddr
, dev
->name
);
2128 ip_handle_martian_source(dev
, in_dev
, skb
, daddr
, saddr
);
2132 int ip_route_input_noref(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
2133 u8 tos
, struct net_device
*dev
)
2135 struct fib_result res
;
2138 tos
&= IPTOS_RT_MASK
;
2140 err
= ip_route_input_rcu(skb
, daddr
, saddr
, tos
, dev
, &res
);
2145 EXPORT_SYMBOL(ip_route_input_noref
);
2147 /* called with rcu_read_lock held */
2148 int ip_route_input_rcu(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
2149 u8 tos
, struct net_device
*dev
, struct fib_result
*res
)
2151 /* Multicast recognition logic is moved from route cache to here.
2152 The problem was that too many Ethernet cards have broken/missing
2153 hardware multicast filters :-( As result the host on multicasting
2154 network acquires a lot of useless route cache entries, sort of
2155 SDR messages from all the world. Now we try to get rid of them.
2156 Really, provided software IP multicast filter is organized
2157 reasonably (at least, hashed), it does not result in a slowdown
2158 comparing with route cache reject entries.
2159 Note, that multicast routers are not affected, because
2160 route cache entry is created eventually.
2162 if (ipv4_is_multicast(daddr
)) {
2163 struct in_device
*in_dev
= __in_dev_get_rcu(dev
);
2169 our
= ip_check_mc_rcu(in_dev
, daddr
, saddr
,
2170 ip_hdr(skb
)->protocol
);
2172 /* check l3 master if no match yet */
2173 if (!our
&& netif_is_l3_slave(dev
)) {
2174 struct in_device
*l3_in_dev
;
2176 l3_in_dev
= __in_dev_get_rcu(skb
->dev
);
2178 our
= ip_check_mc_rcu(l3_in_dev
, daddr
, saddr
,
2179 ip_hdr(skb
)->protocol
);
2183 #ifdef CONFIG_IP_MROUTE
2185 (!ipv4_is_local_multicast(daddr
) &&
2186 IN_DEV_MFORWARD(in_dev
))
2189 err
= ip_route_input_mc(skb
, daddr
, saddr
,
2195 return ip_route_input_slow(skb
, daddr
, saddr
, tos
, dev
, res
);
2198 /* called with rcu_read_lock() */
2199 static struct rtable
*__mkroute_output(const struct fib_result
*res
,
2200 const struct flowi4
*fl4
, int orig_oif
,
2201 struct net_device
*dev_out
,
2204 struct fib_info
*fi
= res
->fi
;
2205 struct fib_nh_exception
*fnhe
;
2206 struct in_device
*in_dev
;
2207 u16 type
= res
->type
;
2211 in_dev
= __in_dev_get_rcu(dev_out
);
2213 return ERR_PTR(-EINVAL
);
2215 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev
)))
2216 if (ipv4_is_loopback(fl4
->saddr
) &&
2217 !(dev_out
->flags
& IFF_LOOPBACK
) &&
2218 !netif_is_l3_master(dev_out
))
2219 return ERR_PTR(-EINVAL
);
2221 if (ipv4_is_lbcast(fl4
->daddr
))
2222 type
= RTN_BROADCAST
;
2223 else if (ipv4_is_multicast(fl4
->daddr
))
2224 type
= RTN_MULTICAST
;
2225 else if (ipv4_is_zeronet(fl4
->daddr
))
2226 return ERR_PTR(-EINVAL
);
2228 if (dev_out
->flags
& IFF_LOOPBACK
)
2229 flags
|= RTCF_LOCAL
;
2232 if (type
== RTN_BROADCAST
) {
2233 flags
|= RTCF_BROADCAST
| RTCF_LOCAL
;
2235 } else if (type
== RTN_MULTICAST
) {
2236 flags
|= RTCF_MULTICAST
| RTCF_LOCAL
;
2237 if (!ip_check_mc_rcu(in_dev
, fl4
->daddr
, fl4
->saddr
,
2239 flags
&= ~RTCF_LOCAL
;
2242 /* If multicast route do not exist use
2243 * default one, but do not gateway in this case.
2246 if (fi
&& res
->prefixlen
< 4)
2248 } else if ((type
== RTN_LOCAL
) && (orig_oif
!= 0) &&
2249 (orig_oif
!= dev_out
->ifindex
)) {
2250 /* For local routes that require a particular output interface
2251 * we do not want to cache the result. Caching the result
2252 * causes incorrect behaviour when there are multiple source
2253 * addresses on the interface, the end result being that if the
2254 * intended recipient is waiting on that interface for the
2255 * packet he won't receive it because it will be delivered on
2256 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2257 * be set to the loopback interface as well.
2263 do_cache
&= fi
!= NULL
;
2265 struct rtable __rcu
**prth
;
2266 struct fib_nh
*nh
= &FIB_RES_NH(*res
);
2268 fnhe
= find_exception(nh
, fl4
->daddr
);
2272 prth
= &fnhe
->fnhe_rth_output
;
2274 if (unlikely(fl4
->flowi4_flags
&
2275 FLOWI_FLAG_KNOWN_NH
&&
2277 nh
->nh_scope
== RT_SCOPE_LINK
))) {
2281 prth
= raw_cpu_ptr(nh
->nh_pcpu_rth_output
);
2283 rth
= rcu_dereference(*prth
);
2284 if (rt_cache_valid(rth
) && dst_hold_safe(&rth
->dst
))
2289 rth
= rt_dst_alloc(dev_out
, flags
, type
,
2290 IN_DEV_CONF_GET(in_dev
, NOPOLICY
),
2291 IN_DEV_CONF_GET(in_dev
, NOXFRM
),
2294 return ERR_PTR(-ENOBUFS
);
2296 rth
->rt_iif
= orig_oif
;
2298 rth
->rt_table_id
= res
->table
->tb_id
;
2300 RT_CACHE_STAT_INC(out_slow_tot
);
2302 if (flags
& (RTCF_BROADCAST
| RTCF_MULTICAST
)) {
2303 if (flags
& RTCF_LOCAL
&&
2304 !(dev_out
->flags
& IFF_LOOPBACK
)) {
2305 rth
->dst
.output
= ip_mc_output
;
2306 RT_CACHE_STAT_INC(out_slow_mc
);
2308 #ifdef CONFIG_IP_MROUTE
2309 if (type
== RTN_MULTICAST
) {
2310 if (IN_DEV_MFORWARD(in_dev
) &&
2311 !ipv4_is_local_multicast(fl4
->daddr
)) {
2312 rth
->dst
.input
= ip_mr_input
;
2313 rth
->dst
.output
= ip_mc_output
;
2319 rt_set_nexthop(rth
, fl4
->daddr
, res
, fnhe
, fi
, type
, 0, do_cache
);
2320 set_lwt_redirect(rth
);
2326 * Major route resolver routine.
2329 struct rtable
*ip_route_output_key_hash(struct net
*net
, struct flowi4
*fl4
,
2330 const struct sk_buff
*skb
)
2332 __u8 tos
= RT_FL_TOS(fl4
);
2333 struct fib_result res
= {
2341 fl4
->flowi4_iif
= LOOPBACK_IFINDEX
;
2342 fl4
->flowi4_tos
= tos
& IPTOS_RT_MASK
;
2343 fl4
->flowi4_scope
= ((tos
& RTO_ONLINK
) ?
2344 RT_SCOPE_LINK
: RT_SCOPE_UNIVERSE
);
2347 rth
= ip_route_output_key_hash_rcu(net
, fl4
, &res
, skb
);
2352 EXPORT_SYMBOL_GPL(ip_route_output_key_hash
);
2354 struct rtable
*ip_route_output_key_hash_rcu(struct net
*net
, struct flowi4
*fl4
,
2355 struct fib_result
*res
,
2356 const struct sk_buff
*skb
)
2358 struct net_device
*dev_out
= NULL
;
2359 int orig_oif
= fl4
->flowi4_oif
;
2360 unsigned int flags
= 0;
2365 if (ipv4_is_multicast(fl4
->saddr
) ||
2366 ipv4_is_lbcast(fl4
->saddr
) ||
2367 ipv4_is_zeronet(fl4
->saddr
)) {
2368 rth
= ERR_PTR(-EINVAL
);
2372 rth
= ERR_PTR(-ENETUNREACH
);
2374 /* I removed check for oif == dev_out->oif here.
2375 It was wrong for two reasons:
2376 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2377 is assigned to multiple interfaces.
2378 2. Moreover, we are allowed to send packets with saddr
2379 of another iface. --ANK
2382 if (fl4
->flowi4_oif
== 0 &&
2383 (ipv4_is_multicast(fl4
->daddr
) ||
2384 ipv4_is_lbcast(fl4
->daddr
))) {
2385 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2386 dev_out
= __ip_dev_find(net
, fl4
->saddr
, false);
2390 /* Special hack: user can direct multicasts
2391 and limited broadcast via necessary interface
2392 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2393 This hack is not just for fun, it allows
2394 vic,vat and friends to work.
2395 They bind socket to loopback, set ttl to zero
2396 and expect that it will work.
2397 From the viewpoint of routing cache they are broken,
2398 because we are not allowed to build multicast path
2399 with loopback source addr (look, routing cache
2400 cannot know, that ttl is zero, so that packet
2401 will not leave this host and route is valid).
2402 Luckily, this hack is good workaround.
2405 fl4
->flowi4_oif
= dev_out
->ifindex
;
2409 if (!(fl4
->flowi4_flags
& FLOWI_FLAG_ANYSRC
)) {
2410 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2411 if (!__ip_dev_find(net
, fl4
->saddr
, false))
2417 if (fl4
->flowi4_oif
) {
2418 dev_out
= dev_get_by_index_rcu(net
, fl4
->flowi4_oif
);
2419 rth
= ERR_PTR(-ENODEV
);
2423 /* RACE: Check return value of inet_select_addr instead. */
2424 if (!(dev_out
->flags
& IFF_UP
) || !__in_dev_get_rcu(dev_out
)) {
2425 rth
= ERR_PTR(-ENETUNREACH
);
2428 if (ipv4_is_local_multicast(fl4
->daddr
) ||
2429 ipv4_is_lbcast(fl4
->daddr
) ||
2430 fl4
->flowi4_proto
== IPPROTO_IGMP
) {
2432 fl4
->saddr
= inet_select_addr(dev_out
, 0,
2437 if (ipv4_is_multicast(fl4
->daddr
))
2438 fl4
->saddr
= inet_select_addr(dev_out
, 0,
2440 else if (!fl4
->daddr
)
2441 fl4
->saddr
= inet_select_addr(dev_out
, 0,
2447 fl4
->daddr
= fl4
->saddr
;
2449 fl4
->daddr
= fl4
->saddr
= htonl(INADDR_LOOPBACK
);
2450 dev_out
= net
->loopback_dev
;
2451 fl4
->flowi4_oif
= LOOPBACK_IFINDEX
;
2452 res
->type
= RTN_LOCAL
;
2453 flags
|= RTCF_LOCAL
;
2457 err
= fib_lookup(net
, fl4
, res
, 0);
2461 if (fl4
->flowi4_oif
&&
2462 (ipv4_is_multicast(fl4
->daddr
) ||
2463 !netif_index_is_l3_master(net
, fl4
->flowi4_oif
))) {
2464 /* Apparently, routing tables are wrong. Assume,
2465 that the destination is on link.
2468 Because we are allowed to send to iface
2469 even if it has NO routes and NO assigned
2470 addresses. When oif is specified, routing
2471 tables are looked up with only one purpose:
2472 to catch if destination is gatewayed, rather than
2473 direct. Moreover, if MSG_DONTROUTE is set,
2474 we send packet, ignoring both routing tables
2475 and ifaddr state. --ANK
2478 We could make it even if oif is unknown,
2479 likely IPv6, but we do not.
2482 if (fl4
->saddr
== 0)
2483 fl4
->saddr
= inet_select_addr(dev_out
, 0,
2485 res
->type
= RTN_UNICAST
;
2492 if (res
->type
== RTN_LOCAL
) {
2494 if (res
->fi
->fib_prefsrc
)
2495 fl4
->saddr
= res
->fi
->fib_prefsrc
;
2497 fl4
->saddr
= fl4
->daddr
;
2500 /* L3 master device is the loopback for that domain */
2501 dev_out
= l3mdev_master_dev_rcu(FIB_RES_DEV(*res
)) ? :
2504 /* make sure orig_oif points to fib result device even
2505 * though packet rx/tx happens over loopback or l3mdev
2507 orig_oif
= FIB_RES_OIF(*res
);
2509 fl4
->flowi4_oif
= dev_out
->ifindex
;
2510 flags
|= RTCF_LOCAL
;
2514 fib_select_path(net
, res
, fl4
, skb
);
2516 dev_out
= FIB_RES_DEV(*res
);
2517 fl4
->flowi4_oif
= dev_out
->ifindex
;
2521 rth
= __mkroute_output(res
, fl4
, orig_oif
, dev_out
, flags
);
2527 static struct dst_entry
*ipv4_blackhole_dst_check(struct dst_entry
*dst
, u32 cookie
)
2532 static unsigned int ipv4_blackhole_mtu(const struct dst_entry
*dst
)
2534 unsigned int mtu
= dst_metric_raw(dst
, RTAX_MTU
);
2536 return mtu
? : dst
->dev
->mtu
;
2539 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry
*dst
, struct sock
*sk
,
2540 struct sk_buff
*skb
, u32 mtu
,
/* Blackhole routes ignore ICMP redirects. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}
2550 static u32
*ipv4_rt_blackhole_cow_metrics(struct dst_entry
*dst
,
2556 static struct dst_ops ipv4_dst_blackhole_ops
= {
2558 .check
= ipv4_blackhole_dst_check
,
2559 .mtu
= ipv4_blackhole_mtu
,
2560 .default_advmss
= ipv4_default_advmss
,
2561 .update_pmtu
= ipv4_rt_blackhole_update_pmtu
,
2562 .redirect
= ipv4_rt_blackhole_redirect
,
2563 .cow_metrics
= ipv4_rt_blackhole_cow_metrics
,
2564 .neigh_lookup
= ipv4_neigh_lookup
,
2567 struct dst_entry
*ipv4_blackhole_route(struct net
*net
, struct dst_entry
*dst_orig
)
2569 struct rtable
*ort
= (struct rtable
*) dst_orig
;
2572 rt
= dst_alloc(&ipv4_dst_blackhole_ops
, NULL
, 1, DST_OBSOLETE_DEAD
, 0);
2574 struct dst_entry
*new = &rt
->dst
;
2577 new->input
= dst_discard
;
2578 new->output
= dst_discard_out
;
2580 new->dev
= net
->loopback_dev
;
2584 rt
->rt_is_input
= ort
->rt_is_input
;
2585 rt
->rt_iif
= ort
->rt_iif
;
2586 rt
->rt_pmtu
= ort
->rt_pmtu
;
2587 rt
->rt_mtu_locked
= ort
->rt_mtu_locked
;
2589 rt
->rt_genid
= rt_genid_ipv4(net
);
2590 rt
->rt_flags
= ort
->rt_flags
;
2591 rt
->rt_type
= ort
->rt_type
;
2592 rt
->rt_gateway
= ort
->rt_gateway
;
2593 rt
->rt_uses_gateway
= ort
->rt_uses_gateway
;
2595 INIT_LIST_HEAD(&rt
->rt_uncached
);
2598 dst_release(dst_orig
);
2600 return rt
? &rt
->dst
: ERR_PTR(-ENOMEM
);
2603 struct rtable
*ip_route_output_flow(struct net
*net
, struct flowi4
*flp4
,
2604 const struct sock
*sk
)
2606 struct rtable
*rt
= __ip_route_output_key(net
, flp4
);
2611 if (flp4
->flowi4_proto
)
2612 rt
= (struct rtable
*)xfrm_lookup_route(net
, &rt
->dst
,
2613 flowi4_to_flowi(flp4
),
2618 EXPORT_SYMBOL_GPL(ip_route_output_flow
);
2620 /* called with rcu_read_lock held */
2621 static int rt_fill_info(struct net
*net
, __be32 dst
, __be32 src
, u32 table_id
,
2622 struct flowi4
*fl4
, struct sk_buff
*skb
, u32 portid
,
2625 struct rtable
*rt
= skb_rtable(skb
);
2627 struct nlmsghdr
*nlh
;
2628 unsigned long expires
= 0;
2630 u32 metrics
[RTAX_MAX
];
2632 nlh
= nlmsg_put(skb
, portid
, seq
, RTM_NEWROUTE
, sizeof(*r
), 0);
2636 r
= nlmsg_data(nlh
);
2637 r
->rtm_family
= AF_INET
;
2638 r
->rtm_dst_len
= 32;
2640 r
->rtm_tos
= fl4
->flowi4_tos
;
2641 r
->rtm_table
= table_id
< 256 ? table_id
: RT_TABLE_COMPAT
;
2642 if (nla_put_u32(skb
, RTA_TABLE
, table_id
))
2643 goto nla_put_failure
;
2644 r
->rtm_type
= rt
->rt_type
;
2645 r
->rtm_scope
= RT_SCOPE_UNIVERSE
;
2646 r
->rtm_protocol
= RTPROT_UNSPEC
;
2647 r
->rtm_flags
= (rt
->rt_flags
& ~0xFFFF) | RTM_F_CLONED
;
2648 if (rt
->rt_flags
& RTCF_NOTIFY
)
2649 r
->rtm_flags
|= RTM_F_NOTIFY
;
2650 if (IPCB(skb
)->flags
& IPSKB_DOREDIRECT
)
2651 r
->rtm_flags
|= RTCF_DOREDIRECT
;
2653 if (nla_put_in_addr(skb
, RTA_DST
, dst
))
2654 goto nla_put_failure
;
2656 r
->rtm_src_len
= 32;
2657 if (nla_put_in_addr(skb
, RTA_SRC
, src
))
2658 goto nla_put_failure
;
2661 nla_put_u32(skb
, RTA_OIF
, rt
->dst
.dev
->ifindex
))
2662 goto nla_put_failure
;
2663 #ifdef CONFIG_IP_ROUTE_CLASSID
2664 if (rt
->dst
.tclassid
&&
2665 nla_put_u32(skb
, RTA_FLOW
, rt
->dst
.tclassid
))
2666 goto nla_put_failure
;
2668 if (!rt_is_input_route(rt
) &&
2669 fl4
->saddr
!= src
) {
2670 if (nla_put_in_addr(skb
, RTA_PREFSRC
, fl4
->saddr
))
2671 goto nla_put_failure
;
2673 if (rt
->rt_uses_gateway
&&
2674 nla_put_in_addr(skb
, RTA_GATEWAY
, rt
->rt_gateway
))
2675 goto nla_put_failure
;
2677 expires
= rt
->dst
.expires
;
2679 unsigned long now
= jiffies
;
2681 if (time_before(now
, expires
))
2687 memcpy(metrics
, dst_metrics_ptr(&rt
->dst
), sizeof(metrics
));
2688 if (rt
->rt_pmtu
&& expires
)
2689 metrics
[RTAX_MTU
- 1] = rt
->rt_pmtu
;
2690 if (rt
->rt_mtu_locked
&& expires
)
2691 metrics
[RTAX_LOCK
- 1] |= BIT(RTAX_MTU
);
2692 if (rtnetlink_put_metrics(skb
, metrics
) < 0)
2693 goto nla_put_failure
;
2695 if (fl4
->flowi4_mark
&&
2696 nla_put_u32(skb
, RTA_MARK
, fl4
->flowi4_mark
))
2697 goto nla_put_failure
;
2699 if (!uid_eq(fl4
->flowi4_uid
, INVALID_UID
) &&
2700 nla_put_u32(skb
, RTA_UID
,
2701 from_kuid_munged(current_user_ns(), fl4
->flowi4_uid
)))
2702 goto nla_put_failure
;
2704 error
= rt
->dst
.error
;
2706 if (rt_is_input_route(rt
)) {
2707 #ifdef CONFIG_IP_MROUTE
2708 if (ipv4_is_multicast(dst
) && !ipv4_is_local_multicast(dst
) &&
2709 IPV4_DEVCONF_ALL(net
, MC_FORWARDING
)) {
2710 int err
= ipmr_get_route(net
, skb
,
2711 fl4
->saddr
, fl4
->daddr
,
2717 goto nla_put_failure
;
2721 if (nla_put_u32(skb
, RTA_IIF
, skb
->dev
->ifindex
))
2722 goto nla_put_failure
;
2725 if (rtnl_put_cacheinfo(skb
, &rt
->dst
, 0, expires
, error
) < 0)
2726 goto nla_put_failure
;
2728 nlmsg_end(skb
, nlh
);
2732 nlmsg_cancel(skb
, nlh
);
/* inet_rtm_getroute - RTM_GETROUTE netlink handler.
 *
 * Resolves one route lookup described by @nlh: the input path is used when
 * RTA_IIF is supplied, otherwise the output path.  The resolved route is
 * encoded as an RTM_NEWROUTE message and unicast back to the requester.
 * NOTE(review): interior lines are missing from this extract (error checks,
 * braces, unlock paths); comments below describe only the visible code.
 */
2736 static int inet_rtm_getroute(struct sk_buff
*in_skb
, struct nlmsghdr
*nlh
,
2737 struct netlink_ext_ack
*extack
)
2739 struct net
*net
= sock_net(in_skb
->sk
);
2741 struct nlattr
*tb
[RTA_MAX
+1];
2742 struct fib_result res
= {};
2743 struct rtable
*rt
= NULL
;
2750 struct sk_buff
*skb
;
2751 u32 table_id
= RT_TABLE_MAIN
;
/* Validate the request against rtm_ipv4_policy and parse into tb[]. */
2754 err
= nlmsg_parse(nlh
, sizeof(*rtm
), tb
, RTA_MAX
, rtm_ipv4_policy
,
2759 rtm
= nlmsg_data(nlh
);
/* Dummy skb that carries the lookup through the routing engine and is
 * then reused as the reply message.
 */
2761 skb
= alloc_skb(NLMSG_GOODSIZE
, GFP_KERNEL
);
2767 /* Reserve room for dummy headers, this skb can pass
2768 through good chunk of routing engine.
 */
2770 skb_reset_mac_header(skb
);
2771 skb_reset_network_header(skb
);
/* Lookup keys; every attribute is optional and defaults to 0. */
2773 src
= tb
[RTA_SRC
] ? nla_get_in_addr(tb
[RTA_SRC
]) : 0;
2774 dst
= tb
[RTA_DST
] ? nla_get_in_addr(tb
[RTA_DST
]) : 0;
2775 iif
= tb
[RTA_IIF
] ? nla_get_u32(tb
[RTA_IIF
]) : 0;
2776 mark
= tb
[RTA_MARK
] ? nla_get_u32(tb
[RTA_MARK
]) : 0;
/* RTA_UID lets the caller ask on behalf of another uid; without it,
 * output lookups use the current uid and input lookups INVALID_UID.
 */
2778 uid
= make_kuid(current_user_ns(), nla_get_u32(tb
[RTA_UID
]));
2780 uid
= (iif
? INVALID_UID
: current_uid());
2782 /* Bugfix: need to give ip_route_input enough of an IP header to
 */
2785 ip_hdr(skb
)->protocol
= IPPROTO_UDP
;
2786 ip_hdr(skb
)->saddr
= src
;
2787 ip_hdr(skb
)->daddr
= dst
;
2789 skb_reserve(skb
, MAX_HEADER
+ sizeof(struct iphdr
));
/* Flow key for the output-path lookup and for encoding the reply. */
2791 memset(&fl4
, 0, sizeof(fl4
));
2794 fl4
.flowi4_tos
= rtm
->rtm_tos
;
2795 fl4
.flowi4_oif
= tb
[RTA_OIF
] ? nla_get_u32(tb
[RTA_OIF
]) : 0;
2796 fl4
.flowi4_mark
= mark
;
2797 fl4
.flowi4_uid
= uid
;
/* Input-path lookup: pretend the packet arrived on device iif. */
2802 struct net_device
*dev
;
2804 dev
= dev_get_by_index_rcu(net
, iif
);
2810 skb
->protocol
= htons(ETH_P_IP
);
2813 err
= ip_route_input_rcu(skb
, dst
, src
, rtm
->rtm_tos
,
2816 rt
= skb_rtable(skb
);
/* The lookup may succeed yet attach a dst carrying an error code. */
2817 if (err
== 0 && rt
->dst
.error
)
2818 err
= -rt
->dst
.error
;
/* Output-path lookup (no RTA_IIF supplied). */
2820 fl4
.flowi4_iif
= LOOPBACK_IFINDEX
;
2821 rt
= ip_route_output_key_hash_rcu(net
, &fl4
, &res
, skb
);
2826 skb_dst_set(skb
, &rt
->dst
);
/* Honour request flags that shape the reply. */
2832 if (rtm
->rtm_flags
& RTM_F_NOTIFY
)
2833 rt
->rt_flags
|= RTCF_NOTIFY
;
2835 if (rtm
->rtm_flags
& RTM_F_LOOKUP_TABLE
)
2836 table_id
= rt
->rt_table_id
;
/* RTM_F_FIB_MATCH: report the FIB entry that matched rather than the
 * fully resolved (cloned) route.
 */
2838 if (rtm
->rtm_flags
& RTM_F_FIB_MATCH
) {
2840 err
= fib_props
[res
.type
].error
;
2842 err
= -EHOSTUNREACH
;
2845 err
= fib_dump_info(skb
, NETLINK_CB(in_skb
).portid
,
2846 nlh
->nlmsg_seq
, RTM_NEWROUTE
, table_id
,
2847 rt
->rt_type
, res
.prefix
, res
.prefixlen
,
2848 fl4
.flowi4_tos
, res
.fi
, 0);
2850 err
= rt_fill_info(net
, dst
, src
, table_id
, &fl4
, skb
,
2851 NETLINK_CB(in_skb
).portid
, nlh
->nlmsg_seq
);
/* Deliver the reply to the requesting socket only. */
2858 err
= rtnl_unicast(skb
, net
, NETLINK_CB(in_skb
).portid
);
2868 void ip_rt_multicast_event(struct in_device
*in_dev
)
2870 rt_cache_flush(dev_net(in_dev
->dev
));
2873 #ifdef CONFIG_SYSCTL
/* Garbage-collection tunables referenced by ipv4_route_table below. */
2874 static int ip_rt_gc_interval __read_mostly
= 60 * HZ
;
2875 static int ip_rt_gc_min_interval __read_mostly
= HZ
/ 2;
2876 static int ip_rt_gc_elasticity __read_mostly
= 8;
/* Sysctl handler for net.ipv4.route.flush: a write invalidates the
 * per-namespace routing cache and bumps the fnhe generation id.
 * ->extra1 holds the struct net this table instance was registered for
 * (set in sysctl_route_net_init()).
 */
2878 static int ipv4_sysctl_rtcache_flush(struct ctl_table
*__ctl
, int write
,
2879 void __user
*buffer
,
2880 size_t *lenp
, loff_t
*ppos
)
2882 struct net
*net
= (struct net
*)__ctl
->extra1
;
2885 rt_cache_flush(net
);
2886 fnhe_genid_bump(net
);
/* Global tunables under /proc/sys/net/ipv4/route/.  The per-namespace
 * "flush" entry lives in ipv4_route_flush_table below.
 */
2893 static struct ctl_table ipv4_route_table
[] = {
2895 .procname
= "gc_thresh",
2896 .data
= &ipv4_dst_ops
.gc_thresh
,
2897 .maxlen
= sizeof(int),
2899 .proc_handler
= proc_dointvec
,
2902 .procname
= "max_size",
2903 .data
= &ip_rt_max_size
,
2904 .maxlen
= sizeof(int),
2906 .proc_handler
= proc_dointvec
,
2909 /* Deprecated. Use gc_min_interval_ms */
2911 .procname
= "gc_min_interval",
2912 .data
= &ip_rt_gc_min_interval
,
2913 .maxlen
= sizeof(int),
2915 .proc_handler
= proc_dointvec_jiffies
,
/* Same variable as gc_min_interval, exposed in milliseconds. */
2918 .procname
= "gc_min_interval_ms",
2919 .data
= &ip_rt_gc_min_interval
,
2920 .maxlen
= sizeof(int),
2922 .proc_handler
= proc_dointvec_ms_jiffies
,
2925 .procname
= "gc_timeout",
2926 .data
= &ip_rt_gc_timeout
,
2927 .maxlen
= sizeof(int),
2929 .proc_handler
= proc_dointvec_jiffies
,
2932 .procname
= "gc_interval",
2933 .data
= &ip_rt_gc_interval
,
2934 .maxlen
= sizeof(int),
2936 .proc_handler
= proc_dointvec_jiffies
,
/* ICMP redirect rate-limiting knobs. */
2939 .procname
= "redirect_load",
2940 .data
= &ip_rt_redirect_load
,
2941 .maxlen
= sizeof(int),
2943 .proc_handler
= proc_dointvec
,
2946 .procname
= "redirect_number",
2947 .data
= &ip_rt_redirect_number
,
2948 .maxlen
= sizeof(int),
2950 .proc_handler
= proc_dointvec
,
2953 .procname
= "redirect_silence",
2954 .data
= &ip_rt_redirect_silence
,
2955 .maxlen
= sizeof(int),
2957 .proc_handler
= proc_dointvec
,
/* ICMP error cost/burst token-bucket parameters. */
2960 .procname
= "error_cost",
2961 .data
= &ip_rt_error_cost
,
2962 .maxlen
= sizeof(int),
2964 .proc_handler
= proc_dointvec
,
2967 .procname
= "error_burst",
2968 .data
= &ip_rt_error_burst
,
2969 .maxlen
= sizeof(int),
2971 .proc_handler
= proc_dointvec
,
2974 .procname
= "gc_elasticity",
2975 .data
= &ip_rt_gc_elasticity
,
2976 .maxlen
= sizeof(int),
2978 .proc_handler
= proc_dointvec
,
/* PMTU handling knobs. */
2981 .procname
= "mtu_expires",
2982 .data
= &ip_rt_mtu_expires
,
2983 .maxlen
= sizeof(int),
2985 .proc_handler
= proc_dointvec_jiffies
,
/* min_pmtu is clamped from below via ->extra1 (ip_min_valid_pmtu). */
2988 .procname
= "min_pmtu",
2989 .data
= &ip_rt_min_pmtu
,
2990 .maxlen
= sizeof(int),
2992 .proc_handler
= proc_dointvec_minmax
,
2993 .extra1
= &ip_min_valid_pmtu
,
2996 .procname
= "min_adv_mss",
2997 .data
= &ip_rt_min_advmss
,
2998 .maxlen
= sizeof(int),
3000 .proc_handler
= proc_dointvec
,
/* Per-namespace table: writing to "flush" triggers
 * ipv4_sysctl_rtcache_flush().  Entry [0].extra1 is filled with the
 * owning struct net at registration time.
 */
3005 static struct ctl_table ipv4_route_flush_table
[] = {
3007 .procname
= "flush",
3008 .maxlen
= sizeof(int),
3010 .proc_handler
= ipv4_sysctl_rtcache_flush
,
/* Register the "net/ipv4/route" sysctl directory for @net.  Namespaces
 * other than init_net get a private copy of the flush table so that
 * ->extra1 can point at their own struct net.
 */
3015 static __net_init
int sysctl_route_net_init(struct net
*net
)
3017 struct ctl_table
*tbl
;
3019 tbl
= ipv4_route_flush_table
;
3020 if (!net_eq(net
, &init_net
)) {
3021 tbl
= kmemdup(tbl
, sizeof(ipv4_route_flush_table
), GFP_KERNEL
);
3025 /* Don't export sysctls to unprivileged users */
3026 if (net
->user_ns
!= &init_user_ns
)
3027 tbl
[0].procname
= NULL
;
/* Tell the flush handler which namespace to operate on. */
3029 tbl
[0].extra1
= net
;
3031 net
->ipv4
.route_hdr
= register_net_sysctl(net
, "net/ipv4/route", tbl
);
3032 if (!net
->ipv4
.route_hdr
)
/* Only a kmemdup()ed copy may be freed; never the static table. */
3037 if (tbl
!= ipv4_route_flush_table
)
/* Unregister @net's route sysctls.  The BUG_ON asserts we never reach
 * here holding the static init_net table, which must not be freed.
 */
3043 static __net_exit
void sysctl_route_net_exit(struct net
*net
)
3045 struct ctl_table
*tbl
;
3047 tbl
= net
->ipv4
.route_hdr
->ctl_table_arg
;
3048 unregister_net_sysctl_table(net
->ipv4
.route_hdr
);
3049 BUG_ON(tbl
== ipv4_route_flush_table
);
/* Netns lifetime hooks for the per-namespace route sysctls. */
3053 static __net_initdata
struct pernet_operations sysctl_route_ops
= {
3054 .init
= sysctl_route_net_init
,
3055 .exit
= sysctl_route_net_exit
,
/* Initialise @net's route and fnhe generation counters to zero;
 * dev_addr_genid is seeded with a random value.
 */
3059 static __net_init
int rt_genid_init(struct net
*net
)
3061 atomic_set(&net
->ipv4
.rt_genid
, 0);
3062 atomic_set(&net
->fnhe_genid
, 0);
3063 atomic_set(&net
->ipv4
.dev_addr_genid
, get_random_int());
/* Run rt_genid_init() for every new network namespace. */
3067 static __net_initdata
struct pernet_operations rt_genid_ops
= {
3068 .init
= rt_genid_init
,
/* Allocate and initialise @net's inet_peer base. */
3071 static int __net_init
ipv4_inetpeer_init(struct net
*net
)
3073 struct inet_peer_base
*bp
= kmalloc(sizeof(*bp
), GFP_KERNEL
);
3077 inet_peer_base_init(bp
);
3078 net
->ipv4
.peers
= bp
;
/* Detach and destroy @net's inet_peer tree on namespace teardown. */
3082 static void __net_exit
ipv4_inetpeer_exit(struct net
*net
)
3084 struct inet_peer_base
*bp
= net
->ipv4
.peers
;
/* Clear the published pointer before invalidating the tree. */
3086 net
->ipv4
.peers
= NULL
;
3087 inetpeer_invalidate_tree(bp
);
/* Netns lifetime hooks for the inetpeer storage. */
3091 static __net_initdata
struct pernet_operations ipv4_inetpeer_ops
= {
3092 .init
= ipv4_inetpeer_init
,
3093 .exit
= ipv4_inetpeer_exit
,
3096 #ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu route accounting buckets; allocated in ip_rt_init(). */
3097 struct ip_rt_acct __percpu
*ip_rt_acct __read_mostly
;
3098 #endif /* CONFIG_IP_ROUTE_CLASSID */
/* One-time boot initialisation of the IPv4 routing layer: IP-id state,
 * per-cpu uncached route lists, dst slab caches, proc files, the
 * RTM_GETROUTE handler and the pernet subsystems.  Allocation failures
 * at this stage panic the kernel.
 */
3100 int __init
ip_rt_init(void)
/* IP identification generator state, seeded with random bytes. */
3104 ip_idents
= kmalloc(IP_IDENTS_SZ
* sizeof(*ip_idents
), GFP_KERNEL
);
3106 panic("IP: failed to allocate ip_idents\n");
3108 prandom_bytes(ip_idents
, IP_IDENTS_SZ
* sizeof(*ip_idents
));
3110 ip_tstamps
= kcalloc(IP_IDENTS_SZ
, sizeof(*ip_tstamps
), GFP_KERNEL
);
3112 panic("IP: failed to allocate ip_tstamps\n");
/* Per-cpu lists for routes that live outside the cache. */
3114 for_each_possible_cpu(cpu
) {
3115 struct uncached_list
*ul
= &per_cpu(rt_uncached_list
, cpu
);
3117 INIT_LIST_HEAD(&ul
->head
);
3118 spin_lock_init(&ul
->lock
);
3120 #ifdef CONFIG_IP_ROUTE_CLASSID
/* 256 accounting buckets per cpu. */
3121 ip_rt_acct
= __alloc_percpu(256 * sizeof(struct ip_rt_acct
), __alignof__(struct ip_rt_acct
));
3123 panic("IP: failed to allocate ip_rt_acct\n");
/* One slab cache shared by the normal and blackhole dst ops. */
3126 ipv4_dst_ops
.kmem_cachep
=
3127 kmem_cache_create("ip_dst_cache", sizeof(struct rtable
), 0,
3128 SLAB_HWCACHE_ALIGN
|SLAB_PANIC
, NULL
);
3130 ipv4_dst_blackhole_ops
.kmem_cachep
= ipv4_dst_ops
.kmem_cachep
;
3132 if (dst_entries_init(&ipv4_dst_ops
) < 0)
3133 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3135 if (dst_entries_init(&ipv4_dst_blackhole_ops
) < 0)
3136 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
/* ~0 / INT_MAX effectively disable dst GC pressure limits. */
3138 ipv4_dst_ops
.gc_thresh
= ~0;
3139 ip_rt_max_size
= INT_MAX
;
3144 if (ip_rt_proc_init())
3145 pr_err("Unable to create route proc files\n");
/* RTM_GETROUTE is served without taking the rtnl lock. */
3150 rtnl_register(PF_INET
, RTM_GETROUTE
, inet_rtm_getroute
, NULL
,
3151 RTNL_FLAG_DOIT_UNLOCKED
);
3153 #ifdef CONFIG_SYSCTL
3154 register_pernet_subsys(&sysctl_route_ops
);
3156 register_pernet_subsys(&rt_genid_ops
);
3157 register_pernet_subsys(&ipv4_inetpeer_ops
);
3161 #ifdef CONFIG_SYSCTL
3163 * We really need to sanitize the damn ipv4 init order, then all
3164 * this nonsense will go away.
3166 void __init
ip_static_sysctl_init(void)
3168 register_net_sysctl(&init_net
, "net/ipv4/route", ipv4_route_table
);