2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #define pr_fmt(fmt) "IPv4: " fmt
67 #include <linux/module.h>
68 #include <linux/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
110 #include <linux/sysctl.h>
111 #include <linux/kmemleak.h>
113 #include <net/secure_seq.h>
114 #include <net/ip_tunnels.h>
115 #include <net/l3mdev.h>
117 #include "fib_lookup.h"
119 #define RT_FL_TOS(oldflp4) \
120 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
122 #define RT_GC_TIMEOUT (300*HZ)
124 static int ip_rt_max_size
;
125 static int ip_rt_redirect_number __read_mostly
= 9;
126 static int ip_rt_redirect_load __read_mostly
= HZ
/ 50;
127 static int ip_rt_redirect_silence __read_mostly
= ((HZ
/ 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly
= HZ
;
129 static int ip_rt_error_burst __read_mostly
= 5 * HZ
;
130 static int ip_rt_mtu_expires __read_mostly
= 10 * 60 * HZ
;
131 static u32 ip_rt_min_pmtu __read_mostly
= 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly
= 256;
134 static int ip_rt_gc_timeout __read_mostly
= RT_GC_TIMEOUT
;
136 static int ip_min_valid_pmtu __read_mostly
= IPV4_MIN_MTU
;
139 * Interface to generic destination cache.
142 static struct dst_entry
*ipv4_dst_check(struct dst_entry
*dst
, u32 cookie
);
143 static unsigned int ipv4_default_advmss(const struct dst_entry
*dst
);
144 static unsigned int ipv4_mtu(const struct dst_entry
*dst
);
145 static struct dst_entry
*ipv4_negative_advice(struct dst_entry
*dst
);
146 static void ipv4_link_failure(struct sk_buff
*skb
);
147 static void ip_rt_update_pmtu(struct dst_entry
*dst
, struct sock
*sk
,
148 struct sk_buff
*skb
, u32 mtu
);
149 static void ip_do_redirect(struct dst_entry
*dst
, struct sock
*sk
,
150 struct sk_buff
*skb
);
151 static void ipv4_dst_destroy(struct dst_entry
*dst
);
153 static u32
*ipv4_cow_metrics(struct dst_entry
*dst
, unsigned long old
)
159 static struct neighbour
*ipv4_neigh_lookup(const struct dst_entry
*dst
,
162 static void ipv4_confirm_neigh(const struct dst_entry
*dst
, const void *daddr
);
164 static struct dst_ops ipv4_dst_ops
= {
166 .check
= ipv4_dst_check
,
167 .default_advmss
= ipv4_default_advmss
,
169 .cow_metrics
= ipv4_cow_metrics
,
170 .destroy
= ipv4_dst_destroy
,
171 .negative_advice
= ipv4_negative_advice
,
172 .link_failure
= ipv4_link_failure
,
173 .update_pmtu
= ip_rt_update_pmtu
,
174 .redirect
= ip_do_redirect
,
175 .local_out
= __ip_local_out
,
176 .neigh_lookup
= ipv4_neigh_lookup
,
177 .confirm_neigh
= ipv4_confirm_neigh
,
180 #define ECN_OR_COST(class) TC_PRIO_##class
182 const __u8 ip_tos2prio
[16] = {
184 ECN_OR_COST(BESTEFFORT
),
186 ECN_OR_COST(BESTEFFORT
),
192 ECN_OR_COST(INTERACTIVE
),
194 ECN_OR_COST(INTERACTIVE
),
195 TC_PRIO_INTERACTIVE_BULK
,
196 ECN_OR_COST(INTERACTIVE_BULK
),
197 TC_PRIO_INTERACTIVE_BULK
,
198 ECN_OR_COST(INTERACTIVE_BULK
)
200 EXPORT_SYMBOL(ip_tos2prio
);
202 static DEFINE_PER_CPU(struct rt_cache_stat
, rt_cache_stat
);
203 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
205 #ifdef CONFIG_PROC_FS
206 static void *rt_cache_seq_start(struct seq_file
*seq
, loff_t
*pos
)
210 return SEQ_START_TOKEN
;
213 static void *rt_cache_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
219 static void rt_cache_seq_stop(struct seq_file
*seq
, void *v
)
223 static int rt_cache_seq_show(struct seq_file
*seq
, void *v
)
225 if (v
== SEQ_START_TOKEN
)
226 seq_printf(seq
, "%-127s\n",
227 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
228 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
233 static const struct seq_operations rt_cache_seq_ops
= {
234 .start
= rt_cache_seq_start
,
235 .next
= rt_cache_seq_next
,
236 .stop
= rt_cache_seq_stop
,
237 .show
= rt_cache_seq_show
,
240 static int rt_cache_seq_open(struct inode
*inode
, struct file
*file
)
242 return seq_open(file
, &rt_cache_seq_ops
);
245 static const struct file_operations rt_cache_seq_fops
= {
246 .owner
= THIS_MODULE
,
247 .open
= rt_cache_seq_open
,
250 .release
= seq_release
,
254 static void *rt_cpu_seq_start(struct seq_file
*seq
, loff_t
*pos
)
259 return SEQ_START_TOKEN
;
261 for (cpu
= *pos
-1; cpu
< nr_cpu_ids
; ++cpu
) {
262 if (!cpu_possible(cpu
))
265 return &per_cpu(rt_cache_stat
, cpu
);
270 static void *rt_cpu_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
274 for (cpu
= *pos
; cpu
< nr_cpu_ids
; ++cpu
) {
275 if (!cpu_possible(cpu
))
278 return &per_cpu(rt_cache_stat
, cpu
);
284 static void rt_cpu_seq_stop(struct seq_file
*seq
, void *v
)
289 static int rt_cpu_seq_show(struct seq_file
*seq
, void *v
)
291 struct rt_cache_stat
*st
= v
;
293 if (v
== SEQ_START_TOKEN
) {
294 seq_printf(seq
, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
298 seq_printf(seq
,"%08x %08x %08x %08x %08x %08x %08x %08x "
299 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
300 dst_entries_get_slow(&ipv4_dst_ops
),
313 0, /* st->gc_total */
314 0, /* st->gc_ignored */
315 0, /* st->gc_goal_miss */
316 0, /* st->gc_dst_overflow */
317 0, /* st->in_hlist_search */
318 0 /* st->out_hlist_search */
323 static const struct seq_operations rt_cpu_seq_ops
= {
324 .start
= rt_cpu_seq_start
,
325 .next
= rt_cpu_seq_next
,
326 .stop
= rt_cpu_seq_stop
,
327 .show
= rt_cpu_seq_show
,
331 static int rt_cpu_seq_open(struct inode
*inode
, struct file
*file
)
333 return seq_open(file
, &rt_cpu_seq_ops
);
336 static const struct file_operations rt_cpu_seq_fops
= {
337 .owner
= THIS_MODULE
,
338 .open
= rt_cpu_seq_open
,
341 .release
= seq_release
,
344 #ifdef CONFIG_IP_ROUTE_CLASSID
345 static int rt_acct_proc_show(struct seq_file
*m
, void *v
)
347 struct ip_rt_acct
*dst
, *src
;
350 dst
= kcalloc(256, sizeof(struct ip_rt_acct
), GFP_KERNEL
);
354 for_each_possible_cpu(i
) {
355 src
= (struct ip_rt_acct
*)per_cpu_ptr(ip_rt_acct
, i
);
356 for (j
= 0; j
< 256; j
++) {
357 dst
[j
].o_bytes
+= src
[j
].o_bytes
;
358 dst
[j
].o_packets
+= src
[j
].o_packets
;
359 dst
[j
].i_bytes
+= src
[j
].i_bytes
;
360 dst
[j
].i_packets
+= src
[j
].i_packets
;
364 seq_write(m
, dst
, 256 * sizeof(struct ip_rt_acct
));
369 static int rt_acct_proc_open(struct inode
*inode
, struct file
*file
)
371 return single_open(file
, rt_acct_proc_show
, NULL
);
374 static const struct file_operations rt_acct_proc_fops
= {
375 .owner
= THIS_MODULE
,
376 .open
= rt_acct_proc_open
,
379 .release
= single_release
,
383 static int __net_init
ip_rt_do_proc_init(struct net
*net
)
385 struct proc_dir_entry
*pde
;
387 pde
= proc_create("rt_cache", S_IRUGO
, net
->proc_net
,
392 pde
= proc_create("rt_cache", S_IRUGO
,
393 net
->proc_net_stat
, &rt_cpu_seq_fops
);
397 #ifdef CONFIG_IP_ROUTE_CLASSID
398 pde
= proc_create("rt_acct", 0, net
->proc_net
, &rt_acct_proc_fops
);
404 #ifdef CONFIG_IP_ROUTE_CLASSID
406 remove_proc_entry("rt_cache", net
->proc_net_stat
);
409 remove_proc_entry("rt_cache", net
->proc_net
);
414 static void __net_exit
ip_rt_do_proc_exit(struct net
*net
)
416 remove_proc_entry("rt_cache", net
->proc_net_stat
);
417 remove_proc_entry("rt_cache", net
->proc_net
);
418 #ifdef CONFIG_IP_ROUTE_CLASSID
419 remove_proc_entry("rt_acct", net
->proc_net
);
423 static struct pernet_operations ip_rt_proc_ops __net_initdata
= {
424 .init
= ip_rt_do_proc_init
,
425 .exit
= ip_rt_do_proc_exit
,
428 static int __init
ip_rt_proc_init(void)
430 return register_pernet_subsys(&ip_rt_proc_ops
);
434 static inline int ip_rt_proc_init(void)
438 #endif /* CONFIG_PROC_FS */
440 static inline bool rt_is_expired(const struct rtable
*rth
)
442 return rth
->rt_genid
!= rt_genid_ipv4(dev_net(rth
->dst
.dev
));
445 void rt_cache_flush(struct net
*net
)
447 rt_genid_bump_ipv4(net
);
450 static struct neighbour
*ipv4_neigh_lookup(const struct dst_entry
*dst
,
454 struct net_device
*dev
= dst
->dev
;
455 const __be32
*pkey
= daddr
;
456 const struct rtable
*rt
;
459 rt
= (const struct rtable
*) dst
;
461 pkey
= (const __be32
*) &rt
->rt_gateway
;
463 pkey
= &ip_hdr(skb
)->daddr
;
465 n
= __ipv4_neigh_lookup(dev
, *(__force u32
*)pkey
);
468 return neigh_create(&arp_tbl
, pkey
, dev
);
471 static void ipv4_confirm_neigh(const struct dst_entry
*dst
, const void *daddr
)
473 struct net_device
*dev
= dst
->dev
;
474 const __be32
*pkey
= daddr
;
475 const struct rtable
*rt
;
477 rt
= (const struct rtable
*)dst
;
479 pkey
= (const __be32
*)&rt
->rt_gateway
;
482 (RTCF_MULTICAST
| RTCF_BROADCAST
| RTCF_LOCAL
)))
485 __ipv4_confirm_neigh(dev
, *(__force u32
*)pkey
);
488 #define IP_IDENTS_SZ 2048u
490 static atomic_t
*ip_idents __read_mostly
;
491 static u32
*ip_tstamps __read_mostly
;
493 /* In order to protect privacy, we add a perturbation to identifiers
494 * if one generator is seldom used. This makes it hard for an attacker
495 * to infer how many packets were sent between two points in time.
497 u32
ip_idents_reserve(u32 hash
, int segs
)
499 u32
*p_tstamp
= ip_tstamps
+ hash
% IP_IDENTS_SZ
;
500 atomic_t
*p_id
= ip_idents
+ hash
% IP_IDENTS_SZ
;
501 u32 old
= READ_ONCE(*p_tstamp
);
502 u32 now
= (u32
)jiffies
;
505 if (old
!= now
&& cmpxchg(p_tstamp
, old
, now
) == old
)
506 delta
= prandom_u32_max(now
- old
);
508 /* Do not use atomic_add_return() as it makes UBSAN unhappy */
510 old
= (u32
)atomic_read(p_id
);
511 new = old
+ delta
+ segs
;
512 } while (atomic_cmpxchg(p_id
, old
, new) != old
);
516 EXPORT_SYMBOL(ip_idents_reserve
);
518 void __ip_select_ident(struct net
*net
, struct iphdr
*iph
, int segs
)
520 static u32 ip_idents_hashrnd __read_mostly
;
523 net_get_random_once(&ip_idents_hashrnd
, sizeof(ip_idents_hashrnd
));
525 hash
= jhash_3words((__force u32
)iph
->daddr
,
526 (__force u32
)iph
->saddr
,
527 iph
->protocol
^ net_hash_mix(net
),
529 id
= ip_idents_reserve(hash
, segs
);
532 EXPORT_SYMBOL(__ip_select_ident
);
534 static void __build_flow_key(const struct net
*net
, struct flowi4
*fl4
,
535 const struct sock
*sk
,
536 const struct iphdr
*iph
,
538 u8 prot
, u32 mark
, int flow_flags
)
541 const struct inet_sock
*inet
= inet_sk(sk
);
543 oif
= sk
->sk_bound_dev_if
;
545 tos
= RT_CONN_FLAGS(sk
);
546 prot
= inet
->hdrincl
? IPPROTO_RAW
: sk
->sk_protocol
;
548 flowi4_init_output(fl4
, oif
, mark
, tos
,
549 RT_SCOPE_UNIVERSE
, prot
,
551 iph
->daddr
, iph
->saddr
, 0, 0,
552 sock_net_uid(net
, sk
));
555 static void build_skb_flow_key(struct flowi4
*fl4
, const struct sk_buff
*skb
,
556 const struct sock
*sk
)
558 const struct net
*net
= dev_net(skb
->dev
);
559 const struct iphdr
*iph
= ip_hdr(skb
);
560 int oif
= skb
->dev
->ifindex
;
561 u8 tos
= RT_TOS(iph
->tos
);
562 u8 prot
= iph
->protocol
;
563 u32 mark
= skb
->mark
;
565 __build_flow_key(net
, fl4
, sk
, iph
, oif
, tos
, prot
, mark
, 0);
568 static void build_sk_flow_key(struct flowi4
*fl4
, const struct sock
*sk
)
570 const struct inet_sock
*inet
= inet_sk(sk
);
571 const struct ip_options_rcu
*inet_opt
;
572 __be32 daddr
= inet
->inet_daddr
;
575 inet_opt
= rcu_dereference(inet
->inet_opt
);
576 if (inet_opt
&& inet_opt
->opt
.srr
)
577 daddr
= inet_opt
->opt
.faddr
;
578 flowi4_init_output(fl4
, sk
->sk_bound_dev_if
, sk
->sk_mark
,
579 RT_CONN_FLAGS(sk
), RT_SCOPE_UNIVERSE
,
580 inet
->hdrincl
? IPPROTO_RAW
: sk
->sk_protocol
,
581 inet_sk_flowi_flags(sk
),
582 daddr
, inet
->inet_saddr
, 0, 0, sk
->sk_uid
);
586 static void ip_rt_build_flow_key(struct flowi4
*fl4
, const struct sock
*sk
,
587 const struct sk_buff
*skb
)
590 build_skb_flow_key(fl4
, skb
, sk
);
592 build_sk_flow_key(fl4
, sk
);
595 static DEFINE_SPINLOCK(fnhe_lock
);
597 static void fnhe_flush_routes(struct fib_nh_exception
*fnhe
)
601 rt
= rcu_dereference(fnhe
->fnhe_rth_input
);
603 RCU_INIT_POINTER(fnhe
->fnhe_rth_input
, NULL
);
604 dst_dev_put(&rt
->dst
);
605 dst_release(&rt
->dst
);
607 rt
= rcu_dereference(fnhe
->fnhe_rth_output
);
609 RCU_INIT_POINTER(fnhe
->fnhe_rth_output
, NULL
);
610 dst_dev_put(&rt
->dst
);
611 dst_release(&rt
->dst
);
615 static struct fib_nh_exception
*fnhe_oldest(struct fnhe_hash_bucket
*hash
)
617 struct fib_nh_exception
*fnhe
, *oldest
;
619 oldest
= rcu_dereference(hash
->chain
);
620 for (fnhe
= rcu_dereference(oldest
->fnhe_next
); fnhe
;
621 fnhe
= rcu_dereference(fnhe
->fnhe_next
)) {
622 if (time_before(fnhe
->fnhe_stamp
, oldest
->fnhe_stamp
))
625 fnhe_flush_routes(oldest
);
629 static inline u32
fnhe_hashfun(__be32 daddr
)
631 static u32 fnhe_hashrnd __read_mostly
;
634 net_get_random_once(&fnhe_hashrnd
, sizeof(fnhe_hashrnd
));
635 hval
= jhash_1word((__force u32
) daddr
, fnhe_hashrnd
);
636 return hash_32(hval
, FNHE_HASH_SHIFT
);
639 static void fill_route_from_fnhe(struct rtable
*rt
, struct fib_nh_exception
*fnhe
)
641 rt
->rt_pmtu
= fnhe
->fnhe_pmtu
;
642 rt
->dst
.expires
= fnhe
->fnhe_expires
;
645 rt
->rt_flags
|= RTCF_REDIRECTED
;
646 rt
->rt_gateway
= fnhe
->fnhe_gw
;
647 rt
->rt_uses_gateway
= 1;
651 static void update_or_create_fnhe(struct fib_nh
*nh
, __be32 daddr
, __be32 gw
,
652 u32 pmtu
, unsigned long expires
)
654 struct fnhe_hash_bucket
*hash
;
655 struct fib_nh_exception
*fnhe
;
661 genid
= fnhe_genid(dev_net(nh
->nh_dev
));
662 hval
= fnhe_hashfun(daddr
);
664 spin_lock_bh(&fnhe_lock
);
666 hash
= rcu_dereference(nh
->nh_exceptions
);
668 hash
= kzalloc(FNHE_HASH_SIZE
* sizeof(*hash
), GFP_ATOMIC
);
671 rcu_assign_pointer(nh
->nh_exceptions
, hash
);
677 for (fnhe
= rcu_dereference(hash
->chain
); fnhe
;
678 fnhe
= rcu_dereference(fnhe
->fnhe_next
)) {
679 if (fnhe
->fnhe_daddr
== daddr
)
685 if (fnhe
->fnhe_genid
!= genid
)
686 fnhe
->fnhe_genid
= genid
;
690 fnhe
->fnhe_pmtu
= pmtu
;
691 fnhe
->fnhe_expires
= max(1UL, expires
);
692 /* Update all cached dsts too */
693 rt
= rcu_dereference(fnhe
->fnhe_rth_input
);
695 fill_route_from_fnhe(rt
, fnhe
);
696 rt
= rcu_dereference(fnhe
->fnhe_rth_output
);
698 fill_route_from_fnhe(rt
, fnhe
);
700 if (depth
> FNHE_RECLAIM_DEPTH
)
701 fnhe
= fnhe_oldest(hash
);
703 fnhe
= kzalloc(sizeof(*fnhe
), GFP_ATOMIC
);
707 fnhe
->fnhe_next
= hash
->chain
;
708 rcu_assign_pointer(hash
->chain
, fnhe
);
710 fnhe
->fnhe_genid
= genid
;
711 fnhe
->fnhe_daddr
= daddr
;
713 fnhe
->fnhe_pmtu
= pmtu
;
714 fnhe
->fnhe_expires
= expires
;
716 /* Exception created; mark the cached routes for the nexthop
717 * stale, so anyone caching it rechecks if this exception
720 rt
= rcu_dereference(nh
->nh_rth_input
);
722 rt
->dst
.obsolete
= DST_OBSOLETE_KILL
;
724 for_each_possible_cpu(i
) {
725 struct rtable __rcu
**prt
;
726 prt
= per_cpu_ptr(nh
->nh_pcpu_rth_output
, i
);
727 rt
= rcu_dereference(*prt
);
729 rt
->dst
.obsolete
= DST_OBSOLETE_KILL
;
733 fnhe
->fnhe_stamp
= jiffies
;
736 spin_unlock_bh(&fnhe_lock
);
739 static void __ip_do_redirect(struct rtable
*rt
, struct sk_buff
*skb
, struct flowi4
*fl4
,
742 __be32 new_gw
= icmp_hdr(skb
)->un
.gateway
;
743 __be32 old_gw
= ip_hdr(skb
)->saddr
;
744 struct net_device
*dev
= skb
->dev
;
745 struct in_device
*in_dev
;
746 struct fib_result res
;
750 switch (icmp_hdr(skb
)->code
& 7) {
752 case ICMP_REDIR_NETTOS
:
753 case ICMP_REDIR_HOST
:
754 case ICMP_REDIR_HOSTTOS
:
761 if (rt
->rt_gateway
!= old_gw
)
764 in_dev
= __in_dev_get_rcu(dev
);
769 if (new_gw
== old_gw
|| !IN_DEV_RX_REDIRECTS(in_dev
) ||
770 ipv4_is_multicast(new_gw
) || ipv4_is_lbcast(new_gw
) ||
771 ipv4_is_zeronet(new_gw
))
772 goto reject_redirect
;
774 if (!IN_DEV_SHARED_MEDIA(in_dev
)) {
775 if (!inet_addr_onlink(in_dev
, new_gw
, old_gw
))
776 goto reject_redirect
;
777 if (IN_DEV_SEC_REDIRECTS(in_dev
) && ip_fib_check_default(new_gw
, dev
))
778 goto reject_redirect
;
780 if (inet_addr_type(net
, new_gw
) != RTN_UNICAST
)
781 goto reject_redirect
;
784 n
= __ipv4_neigh_lookup(rt
->dst
.dev
, new_gw
);
786 n
= neigh_create(&arp_tbl
, &new_gw
, rt
->dst
.dev
);
788 if (!(n
->nud_state
& NUD_VALID
)) {
789 neigh_event_send(n
, NULL
);
791 if (fib_lookup(net
, fl4
, &res
, 0) == 0) {
792 struct fib_nh
*nh
= &FIB_RES_NH(res
);
794 update_or_create_fnhe(nh
, fl4
->daddr
, new_gw
,
795 0, jiffies
+ ip_rt_gc_timeout
);
798 rt
->dst
.obsolete
= DST_OBSOLETE_KILL
;
799 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE
, n
);
806 #ifdef CONFIG_IP_ROUTE_VERBOSE
807 if (IN_DEV_LOG_MARTIANS(in_dev
)) {
808 const struct iphdr
*iph
= (const struct iphdr
*) skb
->data
;
809 __be32 daddr
= iph
->daddr
;
810 __be32 saddr
= iph
->saddr
;
812 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
813 " Advised path = %pI4 -> %pI4\n",
814 &old_gw
, dev
->name
, &new_gw
,
821 static void ip_do_redirect(struct dst_entry
*dst
, struct sock
*sk
, struct sk_buff
*skb
)
825 const struct iphdr
*iph
= (const struct iphdr
*) skb
->data
;
826 struct net
*net
= dev_net(skb
->dev
);
827 int oif
= skb
->dev
->ifindex
;
828 u8 tos
= RT_TOS(iph
->tos
);
829 u8 prot
= iph
->protocol
;
830 u32 mark
= skb
->mark
;
832 rt
= (struct rtable
*) dst
;
834 __build_flow_key(net
, &fl4
, sk
, iph
, oif
, tos
, prot
, mark
, 0);
835 __ip_do_redirect(rt
, skb
, &fl4
, true);
838 static struct dst_entry
*ipv4_negative_advice(struct dst_entry
*dst
)
840 struct rtable
*rt
= (struct rtable
*)dst
;
841 struct dst_entry
*ret
= dst
;
844 if (dst
->obsolete
> 0) {
847 } else if ((rt
->rt_flags
& RTCF_REDIRECTED
) ||
858 * 1. The first ip_rt_redirect_number redirects are sent
859 * with exponential backoff, then we stop sending them at all,
860 * assuming that the host ignores our redirects.
861 * 2. If we did not see packets requiring redirects
862 * during ip_rt_redirect_silence, we assume that the host
863 * forgot redirected route and start to send redirects again.
865 * This algorithm is much cheaper and more intelligent than dumb load limiting
868 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
869 * and "frag. need" (breaks PMTU discovery) in icmp.c.
872 void ip_rt_send_redirect(struct sk_buff
*skb
)
874 struct rtable
*rt
= skb_rtable(skb
);
875 struct in_device
*in_dev
;
876 struct inet_peer
*peer
;
882 in_dev
= __in_dev_get_rcu(rt
->dst
.dev
);
883 if (!in_dev
|| !IN_DEV_TX_REDIRECTS(in_dev
)) {
887 log_martians
= IN_DEV_LOG_MARTIANS(in_dev
);
888 vif
= l3mdev_master_ifindex_rcu(rt
->dst
.dev
);
891 net
= dev_net(rt
->dst
.dev
);
892 peer
= inet_getpeer_v4(net
->ipv4
.peers
, ip_hdr(skb
)->saddr
, vif
, 1);
894 icmp_send(skb
, ICMP_REDIRECT
, ICMP_REDIR_HOST
,
895 rt_nexthop(rt
, ip_hdr(skb
)->daddr
));
899 /* No redirected packets during ip_rt_redirect_silence;
900 * reset the algorithm.
902 if (time_after(jiffies
, peer
->rate_last
+ ip_rt_redirect_silence
))
903 peer
->rate_tokens
= 0;
905 /* Too many ignored redirects; do not send anything
906 * set dst.rate_last to the last seen redirected packet.
908 if (peer
->rate_tokens
>= ip_rt_redirect_number
) {
909 peer
->rate_last
= jiffies
;
913 /* Check for load limit; set rate_last to the latest sent
916 if (peer
->rate_tokens
== 0 ||
919 (ip_rt_redirect_load
<< peer
->rate_tokens
)))) {
920 __be32 gw
= rt_nexthop(rt
, ip_hdr(skb
)->daddr
);
922 icmp_send(skb
, ICMP_REDIRECT
, ICMP_REDIR_HOST
, gw
);
923 peer
->rate_last
= jiffies
;
925 #ifdef CONFIG_IP_ROUTE_VERBOSE
927 peer
->rate_tokens
== ip_rt_redirect_number
)
928 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
929 &ip_hdr(skb
)->saddr
, inet_iif(skb
),
930 &ip_hdr(skb
)->daddr
, &gw
);
937 static int ip_error(struct sk_buff
*skb
)
939 struct in_device
*in_dev
= __in_dev_get_rcu(skb
->dev
);
940 struct rtable
*rt
= skb_rtable(skb
);
941 struct inet_peer
*peer
;
947 /* IP on this device is disabled. */
951 net
= dev_net(rt
->dst
.dev
);
952 if (!IN_DEV_FORWARD(in_dev
)) {
953 switch (rt
->dst
.error
) {
955 __IP_INC_STATS(net
, IPSTATS_MIB_INADDRERRORS
);
959 __IP_INC_STATS(net
, IPSTATS_MIB_INNOROUTES
);
965 switch (rt
->dst
.error
) {
970 code
= ICMP_HOST_UNREACH
;
973 code
= ICMP_NET_UNREACH
;
974 __IP_INC_STATS(net
, IPSTATS_MIB_INNOROUTES
);
977 code
= ICMP_PKT_FILTERED
;
981 peer
= inet_getpeer_v4(net
->ipv4
.peers
, ip_hdr(skb
)->saddr
,
982 l3mdev_master_ifindex(skb
->dev
), 1);
987 peer
->rate_tokens
+= now
- peer
->rate_last
;
988 if (peer
->rate_tokens
> ip_rt_error_burst
)
989 peer
->rate_tokens
= ip_rt_error_burst
;
990 peer
->rate_last
= now
;
991 if (peer
->rate_tokens
>= ip_rt_error_cost
)
992 peer
->rate_tokens
-= ip_rt_error_cost
;
998 icmp_send(skb
, ICMP_DEST_UNREACH
, code
, 0);
1000 out
: kfree_skb(skb
);
1004 static void __ip_rt_update_pmtu(struct rtable
*rt
, struct flowi4
*fl4
, u32 mtu
)
1006 struct dst_entry
*dst
= &rt
->dst
;
1007 struct fib_result res
;
1009 if (dst_metric_locked(dst
, RTAX_MTU
))
1012 if (ipv4_mtu(dst
) < mtu
)
1015 if (mtu
< ip_rt_min_pmtu
)
1016 mtu
= ip_rt_min_pmtu
;
1018 if (rt
->rt_pmtu
== mtu
&&
1019 time_before(jiffies
, dst
->expires
- ip_rt_mtu_expires
/ 2))
1023 if (fib_lookup(dev_net(dst
->dev
), fl4
, &res
, 0) == 0) {
1024 struct fib_nh
*nh
= &FIB_RES_NH(res
);
1026 update_or_create_fnhe(nh
, fl4
->daddr
, 0, mtu
,
1027 jiffies
+ ip_rt_mtu_expires
);
1032 static void ip_rt_update_pmtu(struct dst_entry
*dst
, struct sock
*sk
,
1033 struct sk_buff
*skb
, u32 mtu
)
1035 struct rtable
*rt
= (struct rtable
*) dst
;
1038 ip_rt_build_flow_key(&fl4
, sk
, skb
);
1039 __ip_rt_update_pmtu(rt
, &fl4
, mtu
);
1042 void ipv4_update_pmtu(struct sk_buff
*skb
, struct net
*net
, u32 mtu
,
1043 int oif
, u32 mark
, u8 protocol
, int flow_flags
)
1045 const struct iphdr
*iph
= (const struct iphdr
*) skb
->data
;
1050 mark
= IP4_REPLY_MARK(net
, skb
->mark
);
1052 __build_flow_key(net
, &fl4
, NULL
, iph
, oif
,
1053 RT_TOS(iph
->tos
), protocol
, mark
, flow_flags
);
1054 rt
= __ip_route_output_key(net
, &fl4
);
1056 __ip_rt_update_pmtu(rt
, &fl4
, mtu
);
1060 EXPORT_SYMBOL_GPL(ipv4_update_pmtu
);
1062 static void __ipv4_sk_update_pmtu(struct sk_buff
*skb
, struct sock
*sk
, u32 mtu
)
1064 const struct iphdr
*iph
= (const struct iphdr
*) skb
->data
;
1068 __build_flow_key(sock_net(sk
), &fl4
, sk
, iph
, 0, 0, 0, 0, 0);
1070 if (!fl4
.flowi4_mark
)
1071 fl4
.flowi4_mark
= IP4_REPLY_MARK(sock_net(sk
), skb
->mark
);
1073 rt
= __ip_route_output_key(sock_net(sk
), &fl4
);
1075 __ip_rt_update_pmtu(rt
, &fl4
, mtu
);
1080 void ipv4_sk_update_pmtu(struct sk_buff
*skb
, struct sock
*sk
, u32 mtu
)
1082 const struct iphdr
*iph
= (const struct iphdr
*) skb
->data
;
1085 struct dst_entry
*odst
= NULL
;
1087 struct net
*net
= sock_net(sk
);
1091 if (!ip_sk_accept_pmtu(sk
))
1094 odst
= sk_dst_get(sk
);
1096 if (sock_owned_by_user(sk
) || !odst
) {
1097 __ipv4_sk_update_pmtu(skb
, sk
, mtu
);
1101 __build_flow_key(net
, &fl4
, sk
, iph
, 0, 0, 0, 0, 0);
1103 rt
= (struct rtable
*)odst
;
1104 if (odst
->obsolete
&& !odst
->ops
->check(odst
, 0)) {
1105 rt
= ip_route_output_flow(sock_net(sk
), &fl4
, sk
);
1112 __ip_rt_update_pmtu((struct rtable
*) rt
->dst
.path
, &fl4
, mtu
);
1114 if (!dst_check(&rt
->dst
, 0)) {
1116 dst_release(&rt
->dst
);
1118 rt
= ip_route_output_flow(sock_net(sk
), &fl4
, sk
);
1126 sk_dst_set(sk
, &rt
->dst
);
1132 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu
);
1134 void ipv4_redirect(struct sk_buff
*skb
, struct net
*net
,
1135 int oif
, u32 mark
, u8 protocol
, int flow_flags
)
1137 const struct iphdr
*iph
= (const struct iphdr
*) skb
->data
;
1141 __build_flow_key(net
, &fl4
, NULL
, iph
, oif
,
1142 RT_TOS(iph
->tos
), protocol
, mark
, flow_flags
);
1143 rt
= __ip_route_output_key(net
, &fl4
);
1145 __ip_do_redirect(rt
, skb
, &fl4
, false);
1149 EXPORT_SYMBOL_GPL(ipv4_redirect
);
1151 void ipv4_sk_redirect(struct sk_buff
*skb
, struct sock
*sk
)
1153 const struct iphdr
*iph
= (const struct iphdr
*) skb
->data
;
1156 struct net
*net
= sock_net(sk
);
1158 __build_flow_key(net
, &fl4
, sk
, iph
, 0, 0, 0, 0, 0);
1159 rt
= __ip_route_output_key(net
, &fl4
);
1161 __ip_do_redirect(rt
, skb
, &fl4
, false);
1165 EXPORT_SYMBOL_GPL(ipv4_sk_redirect
);
1167 static struct dst_entry
*ipv4_dst_check(struct dst_entry
*dst
, u32 cookie
)
1169 struct rtable
*rt
= (struct rtable
*) dst
;
1171 /* All IPV4 dsts are created with ->obsolete set to the value
1172 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1173 * into this function always.
1175 * When a PMTU/redirect information update invalidates a route,
1176 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1177 * DST_OBSOLETE_DEAD by dst_free().
1179 if (dst
->obsolete
!= DST_OBSOLETE_FORCE_CHK
|| rt_is_expired(rt
))
1184 static void ipv4_link_failure(struct sk_buff
*skb
)
1188 icmp_send(skb
, ICMP_DEST_UNREACH
, ICMP_HOST_UNREACH
, 0);
1190 rt
= skb_rtable(skb
);
1192 dst_set_expires(&rt
->dst
, 0);
1195 static int ip_rt_bug(struct net
*net
, struct sock
*sk
, struct sk_buff
*skb
)
1197 pr_debug("%s: %pI4 -> %pI4, %s\n",
1198 __func__
, &ip_hdr(skb
)->saddr
, &ip_hdr(skb
)->daddr
,
1199 skb
->dev
? skb
->dev
->name
: "?");
1206 We do not cache source address of outgoing interface,
1207 because it is used only by IP RR, TS and SRR options,
1208 so that it is out of the fast path.
1210 BTW remember: "addr" is allowed to be not aligned
1214 void ip_rt_get_source(u8
*addr
, struct sk_buff
*skb
, struct rtable
*rt
)
1218 if (rt_is_output_route(rt
))
1219 src
= ip_hdr(skb
)->saddr
;
1221 struct fib_result res
;
1227 memset(&fl4
, 0, sizeof(fl4
));
1228 fl4
.daddr
= iph
->daddr
;
1229 fl4
.saddr
= iph
->saddr
;
1230 fl4
.flowi4_tos
= RT_TOS(iph
->tos
);
1231 fl4
.flowi4_oif
= rt
->dst
.dev
->ifindex
;
1232 fl4
.flowi4_iif
= skb
->dev
->ifindex
;
1233 fl4
.flowi4_mark
= skb
->mark
;
1236 if (fib_lookup(dev_net(rt
->dst
.dev
), &fl4
, &res
, 0) == 0)
1237 src
= FIB_RES_PREFSRC(dev_net(rt
->dst
.dev
), res
);
1239 src
= inet_select_addr(rt
->dst
.dev
,
1240 rt_nexthop(rt
, iph
->daddr
),
1244 memcpy(addr
, &src
, 4);
1247 #ifdef CONFIG_IP_ROUTE_CLASSID
1248 static void set_class_tag(struct rtable
*rt
, u32 tag
)
1250 if (!(rt
->dst
.tclassid
& 0xFFFF))
1251 rt
->dst
.tclassid
|= tag
& 0xFFFF;
1252 if (!(rt
->dst
.tclassid
& 0xFFFF0000))
1253 rt
->dst
.tclassid
|= tag
& 0xFFFF0000;
1257 static unsigned int ipv4_default_advmss(const struct dst_entry
*dst
)
1259 unsigned int header_size
= sizeof(struct tcphdr
) + sizeof(struct iphdr
);
1260 unsigned int advmss
= max_t(unsigned int, ipv4_mtu(dst
) - header_size
,
1263 return min(advmss
, IPV4_MAX_PMTU
- header_size
);
1266 static unsigned int ipv4_mtu(const struct dst_entry
*dst
)
1268 const struct rtable
*rt
= (const struct rtable
*) dst
;
1269 unsigned int mtu
= rt
->rt_pmtu
;
1271 if (!mtu
|| time_after_eq(jiffies
, rt
->dst
.expires
))
1272 mtu
= dst_metric_raw(dst
, RTAX_MTU
);
1277 mtu
= READ_ONCE(dst
->dev
->mtu
);
1279 if (unlikely(dst_metric_locked(dst
, RTAX_MTU
))) {
1280 if (rt
->rt_uses_gateway
&& mtu
> 576)
1284 mtu
= min_t(unsigned int, mtu
, IP_MAX_MTU
);
1286 return mtu
- lwtunnel_headroom(dst
->lwtstate
, mtu
);
1289 static struct fib_nh_exception
*find_exception(struct fib_nh
*nh
, __be32 daddr
)
1291 struct fnhe_hash_bucket
*hash
= rcu_dereference(nh
->nh_exceptions
);
1292 struct fib_nh_exception
*fnhe
;
1298 hval
= fnhe_hashfun(daddr
);
1300 for (fnhe
= rcu_dereference(hash
[hval
].chain
); fnhe
;
1301 fnhe
= rcu_dereference(fnhe
->fnhe_next
)) {
1302 if (fnhe
->fnhe_daddr
== daddr
)
1308 static bool rt_bind_exception(struct rtable
*rt
, struct fib_nh_exception
*fnhe
,
1309 __be32 daddr
, const bool do_cache
)
1313 spin_lock_bh(&fnhe_lock
);
1315 if (daddr
== fnhe
->fnhe_daddr
) {
1316 struct rtable __rcu
**porig
;
1317 struct rtable
*orig
;
1318 int genid
= fnhe_genid(dev_net(rt
->dst
.dev
));
1320 if (rt_is_input_route(rt
))
1321 porig
= &fnhe
->fnhe_rth_input
;
1323 porig
= &fnhe
->fnhe_rth_output
;
1324 orig
= rcu_dereference(*porig
);
1326 if (fnhe
->fnhe_genid
!= genid
) {
1327 fnhe
->fnhe_genid
= genid
;
1329 fnhe
->fnhe_pmtu
= 0;
1330 fnhe
->fnhe_expires
= 0;
1331 fnhe_flush_routes(fnhe
);
1334 fill_route_from_fnhe(rt
, fnhe
);
1335 if (!rt
->rt_gateway
)
1336 rt
->rt_gateway
= daddr
;
1340 rcu_assign_pointer(*porig
, rt
);
1342 dst_dev_put(&orig
->dst
);
1343 dst_release(&orig
->dst
);
1348 fnhe
->fnhe_stamp
= jiffies
;
1350 spin_unlock_bh(&fnhe_lock
);
1355 static bool rt_cache_route(struct fib_nh
*nh
, struct rtable
*rt
)
1357 struct rtable
*orig
, *prev
, **p
;
1360 if (rt_is_input_route(rt
)) {
1361 p
= (struct rtable
**)&nh
->nh_rth_input
;
1363 p
= (struct rtable
**)raw_cpu_ptr(nh
->nh_pcpu_rth_output
);
1367 /* hold dst before doing cmpxchg() to avoid race condition
1371 prev
= cmpxchg(p
, orig
, rt
);
1374 dst_dev_put(&orig
->dst
);
1375 dst_release(&orig
->dst
);
1378 dst_release(&rt
->dst
);
1385 struct uncached_list
{
1387 struct list_head head
;
1390 static DEFINE_PER_CPU_ALIGNED(struct uncached_list
, rt_uncached_list
);
1392 static void rt_add_uncached_list(struct rtable
*rt
)
1394 struct uncached_list
*ul
= raw_cpu_ptr(&rt_uncached_list
);
1396 rt
->rt_uncached_list
= ul
;
1398 spin_lock_bh(&ul
->lock
);
1399 list_add_tail(&rt
->rt_uncached
, &ul
->head
);
1400 spin_unlock_bh(&ul
->lock
);
1403 static void ipv4_dst_destroy(struct dst_entry
*dst
)
1405 struct dst_metrics
*p
= (struct dst_metrics
*)DST_METRICS_PTR(dst
);
1406 struct rtable
*rt
= (struct rtable
*) dst
;
1408 if (p
!= &dst_default_metrics
&& refcount_dec_and_test(&p
->refcnt
))
1411 if (!list_empty(&rt
->rt_uncached
)) {
1412 struct uncached_list
*ul
= rt
->rt_uncached_list
;
1414 spin_lock_bh(&ul
->lock
);
1415 list_del(&rt
->rt_uncached
);
1416 spin_unlock_bh(&ul
->lock
);
1420 void rt_flush_dev(struct net_device
*dev
)
1422 struct net
*net
= dev_net(dev
);
1426 for_each_possible_cpu(cpu
) {
1427 struct uncached_list
*ul
= &per_cpu(rt_uncached_list
, cpu
);
1429 spin_lock_bh(&ul
->lock
);
1430 list_for_each_entry(rt
, &ul
->head
, rt_uncached
) {
1431 if (rt
->dst
.dev
!= dev
)
1433 rt
->dst
.dev
= net
->loopback_dev
;
1434 dev_hold(rt
->dst
.dev
);
1437 spin_unlock_bh(&ul
->lock
);
1441 static bool rt_cache_valid(const struct rtable
*rt
)
1444 rt
->dst
.obsolete
== DST_OBSOLETE_FORCE_CHK
&&
1448 static void rt_set_nexthop(struct rtable
*rt
, __be32 daddr
,
1449 const struct fib_result
*res
,
1450 struct fib_nh_exception
*fnhe
,
1451 struct fib_info
*fi
, u16 type
, u32 itag
,
1452 const bool do_cache
)
1454 bool cached
= false;
1457 struct fib_nh
*nh
= &FIB_RES_NH(*res
);
1459 if (nh
->nh_gw
&& nh
->nh_scope
== RT_SCOPE_LINK
) {
1460 rt
->rt_gateway
= nh
->nh_gw
;
1461 rt
->rt_uses_gateway
= 1;
1463 dst_init_metrics(&rt
->dst
, fi
->fib_metrics
->metrics
, true);
1464 if (fi
->fib_metrics
!= &dst_default_metrics
) {
1465 rt
->dst
._metrics
|= DST_METRICS_REFCOUNTED
;
1466 refcount_inc(&fi
->fib_metrics
->refcnt
);
1468 #ifdef CONFIG_IP_ROUTE_CLASSID
1469 rt
->dst
.tclassid
= nh
->nh_tclassid
;
1471 rt
->dst
.lwtstate
= lwtstate_get(nh
->nh_lwtstate
);
1473 cached
= rt_bind_exception(rt
, fnhe
, daddr
, do_cache
);
1475 cached
= rt_cache_route(nh
, rt
);
1476 if (unlikely(!cached
)) {
1477 /* Routes we intend to cache in nexthop exception or
1478 * FIB nexthop have the DST_NOCACHE bit clear.
1479 * However, if we are unsuccessful at storing this
1480 * route into the cache we really need to set it.
1482 if (!rt
->rt_gateway
)
1483 rt
->rt_gateway
= daddr
;
1484 rt_add_uncached_list(rt
);
1487 rt_add_uncached_list(rt
);
1489 #ifdef CONFIG_IP_ROUTE_CLASSID
1490 #ifdef CONFIG_IP_MULTIPLE_TABLES
1491 set_class_tag(rt
, res
->tclassid
);
1493 set_class_tag(rt
, itag
);
1497 struct rtable
*rt_dst_alloc(struct net_device
*dev
,
1498 unsigned int flags
, u16 type
,
1499 bool nopolicy
, bool noxfrm
, bool will_cache
)
1503 rt
= dst_alloc(&ipv4_dst_ops
, dev
, 1, DST_OBSOLETE_FORCE_CHK
,
1504 (will_cache
? 0 : DST_HOST
) |
1505 (nopolicy
? DST_NOPOLICY
: 0) |
1506 (noxfrm
? DST_NOXFRM
: 0));
1509 rt
->rt_genid
= rt_genid_ipv4(dev_net(dev
));
1510 rt
->rt_flags
= flags
;
1512 rt
->rt_is_input
= 0;
1516 rt
->rt_uses_gateway
= 0;
1517 rt
->rt_table_id
= 0;
1518 INIT_LIST_HEAD(&rt
->rt_uncached
);
1520 rt
->dst
.output
= ip_output
;
1521 if (flags
& RTCF_LOCAL
)
1522 rt
->dst
.input
= ip_local_deliver
;
1527 EXPORT_SYMBOL(rt_dst_alloc
);
1529 /* called in rcu_read_lock() section */
1530 int ip_mc_validate_source(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
1531 u8 tos
, struct net_device
*dev
,
1532 struct in_device
*in_dev
, u32
*itag
)
1536 /* Primary sanity checks. */
1540 if (ipv4_is_multicast(saddr
) || ipv4_is_lbcast(saddr
) ||
1541 skb
->protocol
!= htons(ETH_P_IP
))
1544 if (ipv4_is_loopback(saddr
) && !IN_DEV_ROUTE_LOCALNET(in_dev
))
1547 if (ipv4_is_zeronet(saddr
)) {
1548 if (!ipv4_is_local_multicast(daddr
))
1551 err
= fib_validate_source(skb
, saddr
, 0, tos
, 0, dev
,
1559 /* called in rcu_read_lock() section */
1560 static int ip_route_input_mc(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
1561 u8 tos
, struct net_device
*dev
, int our
)
1563 struct in_device
*in_dev
= __in_dev_get_rcu(dev
);
1564 unsigned int flags
= RTCF_MULTICAST
;
1569 err
= ip_mc_validate_source(skb
, daddr
, saddr
, tos
, dev
, in_dev
, &itag
);
1574 flags
|= RTCF_LOCAL
;
1576 rth
= rt_dst_alloc(dev_net(dev
)->loopback_dev
, flags
, RTN_MULTICAST
,
1577 IN_DEV_CONF_GET(in_dev
, NOPOLICY
), false, false);
1581 #ifdef CONFIG_IP_ROUTE_CLASSID
1582 rth
->dst
.tclassid
= itag
;
1584 rth
->dst
.output
= ip_rt_bug
;
1585 rth
->rt_is_input
= 1;
1587 #ifdef CONFIG_IP_MROUTE
1588 if (!ipv4_is_local_multicast(daddr
) && IN_DEV_MFORWARD(in_dev
))
1589 rth
->dst
.input
= ip_mr_input
;
1591 RT_CACHE_STAT_INC(in_slow_mc
);
1593 skb_dst_set(skb
, &rth
->dst
);
1598 static void ip_handle_martian_source(struct net_device
*dev
,
1599 struct in_device
*in_dev
,
1600 struct sk_buff
*skb
,
1604 RT_CACHE_STAT_INC(in_martian_src
);
1605 #ifdef CONFIG_IP_ROUTE_VERBOSE
1606 if (IN_DEV_LOG_MARTIANS(in_dev
) && net_ratelimit()) {
1608 * RFC1812 recommendation, if source is martian,
1609 * the only hint is MAC header.
1611 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1612 &daddr
, &saddr
, dev
->name
);
1613 if (dev
->hard_header_len
&& skb_mac_header_was_set(skb
)) {
1614 print_hex_dump(KERN_WARNING
, "ll header: ",
1615 DUMP_PREFIX_OFFSET
, 16, 1,
1616 skb_mac_header(skb
),
1617 dev
->hard_header_len
, true);
1623 static void ip_del_fnhe(struct fib_nh
*nh
, __be32 daddr
)
1625 struct fnhe_hash_bucket
*hash
;
1626 struct fib_nh_exception
*fnhe
, __rcu
**fnhe_p
;
1627 u32 hval
= fnhe_hashfun(daddr
);
1629 spin_lock_bh(&fnhe_lock
);
1631 hash
= rcu_dereference_protected(nh
->nh_exceptions
,
1632 lockdep_is_held(&fnhe_lock
));
1635 fnhe_p
= &hash
->chain
;
1636 fnhe
= rcu_dereference_protected(*fnhe_p
, lockdep_is_held(&fnhe_lock
));
1638 if (fnhe
->fnhe_daddr
== daddr
) {
1639 rcu_assign_pointer(*fnhe_p
, rcu_dereference_protected(
1640 fnhe
->fnhe_next
, lockdep_is_held(&fnhe_lock
)));
1641 fnhe_flush_routes(fnhe
);
1642 kfree_rcu(fnhe
, rcu
);
1645 fnhe_p
= &fnhe
->fnhe_next
;
1646 fnhe
= rcu_dereference_protected(fnhe
->fnhe_next
,
1647 lockdep_is_held(&fnhe_lock
));
1650 spin_unlock_bh(&fnhe_lock
);
1653 static void set_lwt_redirect(struct rtable
*rth
)
1655 if (lwtunnel_output_redirect(rth
->dst
.lwtstate
)) {
1656 rth
->dst
.lwtstate
->orig_output
= rth
->dst
.output
;
1657 rth
->dst
.output
= lwtunnel_output
;
1660 if (lwtunnel_input_redirect(rth
->dst
.lwtstate
)) {
1661 rth
->dst
.lwtstate
->orig_input
= rth
->dst
.input
;
1662 rth
->dst
.input
= lwtunnel_input
;
1666 /* called in rcu_read_lock() section */
1667 static int __mkroute_input(struct sk_buff
*skb
,
1668 const struct fib_result
*res
,
1669 struct in_device
*in_dev
,
1670 __be32 daddr
, __be32 saddr
, u32 tos
)
1672 struct fib_nh_exception
*fnhe
;
1675 struct in_device
*out_dev
;
1679 /* get a working reference to the output device */
1680 out_dev
= __in_dev_get_rcu(FIB_RES_DEV(*res
));
1682 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1686 err
= fib_validate_source(skb
, saddr
, daddr
, tos
, FIB_RES_OIF(*res
),
1687 in_dev
->dev
, in_dev
, &itag
);
1689 ip_handle_martian_source(in_dev
->dev
, in_dev
, skb
, daddr
,
1695 do_cache
= res
->fi
&& !itag
;
1696 if (out_dev
== in_dev
&& err
&& IN_DEV_TX_REDIRECTS(out_dev
) &&
1697 skb
->protocol
== htons(ETH_P_IP
) &&
1698 (IN_DEV_SHARED_MEDIA(out_dev
) ||
1699 inet_addr_onlink(out_dev
, saddr
, FIB_RES_GW(*res
))))
1700 IPCB(skb
)->flags
|= IPSKB_DOREDIRECT
;
1702 if (skb
->protocol
!= htons(ETH_P_IP
)) {
1703 /* Not IP (i.e. ARP). Do not create route, if it is
1704 * invalid for proxy arp. DNAT routes are always valid.
1706 * Proxy arp feature have been extended to allow, ARP
1707 * replies back to the same interface, to support
1708 * Private VLAN switch technologies. See arp.c.
1710 if (out_dev
== in_dev
&&
1711 IN_DEV_PROXY_ARP_PVLAN(in_dev
) == 0) {
1717 fnhe
= find_exception(&FIB_RES_NH(*res
), daddr
);
1720 rth
= rcu_dereference(fnhe
->fnhe_rth_input
);
1721 if (rth
&& rth
->dst
.expires
&&
1722 time_after(jiffies
, rth
->dst
.expires
)) {
1723 ip_del_fnhe(&FIB_RES_NH(*res
), daddr
);
1730 rth
= rcu_dereference(FIB_RES_NH(*res
).nh_rth_input
);
1733 if (rt_cache_valid(rth
)) {
1734 skb_dst_set_noref(skb
, &rth
->dst
);
1739 rth
= rt_dst_alloc(out_dev
->dev
, 0, res
->type
,
1740 IN_DEV_CONF_GET(in_dev
, NOPOLICY
),
1741 IN_DEV_CONF_GET(out_dev
, NOXFRM
), do_cache
);
1747 rth
->rt_is_input
= 1;
1749 rth
->rt_table_id
= res
->table
->tb_id
;
1750 RT_CACHE_STAT_INC(in_slow_tot
);
1752 rth
->dst
.input
= ip_forward
;
1754 rt_set_nexthop(rth
, daddr
, res
, fnhe
, res
->fi
, res
->type
, itag
,
1756 set_lwt_redirect(rth
);
1757 skb_dst_set(skb
, &rth
->dst
);
1764 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1765 /* To make ICMP packets follow the right flow, the multipath hash is
1766 * calculated from the inner IP addresses.
1768 static void ip_multipath_l3_keys(const struct sk_buff
*skb
,
1769 struct flow_keys
*hash_keys
)
1771 const struct iphdr
*outer_iph
= ip_hdr(skb
);
1772 const struct iphdr
*inner_iph
;
1773 const struct icmphdr
*icmph
;
1774 struct iphdr _inner_iph
;
1775 struct icmphdr _icmph
;
1777 hash_keys
->addrs
.v4addrs
.src
= outer_iph
->saddr
;
1778 hash_keys
->addrs
.v4addrs
.dst
= outer_iph
->daddr
;
1779 if (likely(outer_iph
->protocol
!= IPPROTO_ICMP
))
1782 if (unlikely((outer_iph
->frag_off
& htons(IP_OFFSET
)) != 0))
1785 icmph
= skb_header_pointer(skb
, outer_iph
->ihl
* 4, sizeof(_icmph
),
1790 if (icmph
->type
!= ICMP_DEST_UNREACH
&&
1791 icmph
->type
!= ICMP_REDIRECT
&&
1792 icmph
->type
!= ICMP_TIME_EXCEEDED
&&
1793 icmph
->type
!= ICMP_PARAMETERPROB
)
1796 inner_iph
= skb_header_pointer(skb
,
1797 outer_iph
->ihl
* 4 + sizeof(_icmph
),
1798 sizeof(_inner_iph
), &_inner_iph
);
1801 hash_keys
->addrs
.v4addrs
.src
= inner_iph
->saddr
;
1802 hash_keys
->addrs
.v4addrs
.dst
= inner_iph
->daddr
;
1805 /* if skb is set it will be used and fl4 can be NULL */
1806 int fib_multipath_hash(const struct fib_info
*fi
, const struct flowi4
*fl4
,
1807 const struct sk_buff
*skb
)
1809 struct net
*net
= fi
->fib_net
;
1810 struct flow_keys hash_keys
;
1813 switch (net
->ipv4
.sysctl_fib_multipath_hash_policy
) {
1815 memset(&hash_keys
, 0, sizeof(hash_keys
));
1816 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV4_ADDRS
;
1818 ip_multipath_l3_keys(skb
, &hash_keys
);
1820 hash_keys
.addrs
.v4addrs
.src
= fl4
->saddr
;
1821 hash_keys
.addrs
.v4addrs
.dst
= fl4
->daddr
;
1825 /* skb is currently provided only when forwarding */
1827 unsigned int flag
= FLOW_DISSECTOR_F_STOP_AT_ENCAP
;
1828 struct flow_keys keys
;
1830 /* short-circuit if we already have L4 hash present */
1832 return skb_get_hash_raw(skb
) >> 1;
1833 memset(&hash_keys
, 0, sizeof(hash_keys
));
1834 skb_flow_dissect_flow_keys(skb
, &keys
, flag
);
1835 hash_keys
.addrs
.v4addrs
.src
= keys
.addrs
.v4addrs
.src
;
1836 hash_keys
.addrs
.v4addrs
.dst
= keys
.addrs
.v4addrs
.dst
;
1837 hash_keys
.ports
.src
= keys
.ports
.src
;
1838 hash_keys
.ports
.dst
= keys
.ports
.dst
;
1839 hash_keys
.basic
.ip_proto
= keys
.basic
.ip_proto
;
1841 memset(&hash_keys
, 0, sizeof(hash_keys
));
1842 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV4_ADDRS
;
1843 hash_keys
.addrs
.v4addrs
.src
= fl4
->saddr
;
1844 hash_keys
.addrs
.v4addrs
.dst
= fl4
->daddr
;
1845 hash_keys
.ports
.src
= fl4
->fl4_sport
;
1846 hash_keys
.ports
.dst
= fl4
->fl4_dport
;
1847 hash_keys
.basic
.ip_proto
= fl4
->flowi4_proto
;
1851 mhash
= flow_hash_from_keys(&hash_keys
);
1855 EXPORT_SYMBOL_GPL(fib_multipath_hash
);
1856 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1858 static int ip_mkroute_input(struct sk_buff
*skb
,
1859 struct fib_result
*res
,
1860 struct in_device
*in_dev
,
1861 __be32 daddr
, __be32 saddr
, u32 tos
)
1863 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1864 if (res
->fi
&& res
->fi
->fib_nhs
> 1) {
1865 int h
= fib_multipath_hash(res
->fi
, NULL
, skb
);
1867 fib_select_multipath(res
, h
);
1871 /* create a routing cache entry */
1872 return __mkroute_input(skb
, res
, in_dev
, daddr
, saddr
, tos
);
1876 * NOTE. We drop all the packets that has local source
1877 * addresses, because every properly looped back packet
1878 * must have correct destination already attached by output routine.
1880 * Such approach solves two big problems:
1881 * 1. Not simplex devices are handled properly.
1882 * 2. IP spoofing attempts are filtered with 100% of guarantee.
1883 * called with rcu_read_lock()
1886 static int ip_route_input_slow(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
1887 u8 tos
, struct net_device
*dev
,
1888 struct fib_result
*res
)
1890 struct in_device
*in_dev
= __in_dev_get_rcu(dev
);
1891 struct ip_tunnel_info
*tun_info
;
1893 unsigned int flags
= 0;
1897 struct net
*net
= dev_net(dev
);
1900 /* IP on this device is disabled. */
1905 /* Check for the most weird martians, which can be not detected
1909 tun_info
= skb_tunnel_info(skb
);
1910 if (tun_info
&& !(tun_info
->mode
& IP_TUNNEL_INFO_TX
))
1911 fl4
.flowi4_tun_key
.tun_id
= tun_info
->key
.tun_id
;
1913 fl4
.flowi4_tun_key
.tun_id
= 0;
1916 if (ipv4_is_multicast(saddr
) || ipv4_is_lbcast(saddr
))
1917 goto martian_source
;
1921 if (ipv4_is_lbcast(daddr
) || (saddr
== 0 && daddr
== 0))
1924 /* Accept zero addresses only to limited broadcast;
1925 * I even do not know to fix it or not. Waiting for complains :-)
1927 if (ipv4_is_zeronet(saddr
))
1928 goto martian_source
;
1930 if (ipv4_is_zeronet(daddr
))
1931 goto martian_destination
;
1933 /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1934 * and call it once if daddr or/and saddr are loopback addresses
1936 if (ipv4_is_loopback(daddr
)) {
1937 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev
, net
))
1938 goto martian_destination
;
1939 } else if (ipv4_is_loopback(saddr
)) {
1940 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev
, net
))
1941 goto martian_source
;
1945 * Now we are ready to route packet.
1948 fl4
.flowi4_iif
= dev
->ifindex
;
1949 fl4
.flowi4_mark
= skb
->mark
;
1950 fl4
.flowi4_tos
= tos
;
1951 fl4
.flowi4_scope
= RT_SCOPE_UNIVERSE
;
1952 fl4
.flowi4_flags
= 0;
1955 fl4
.flowi4_uid
= sock_net_uid(net
, NULL
);
1956 err
= fib_lookup(net
, &fl4
, res
, 0);
1958 if (!IN_DEV_FORWARD(in_dev
))
1959 err
= -EHOSTUNREACH
;
1963 if (res
->type
== RTN_BROADCAST
)
1966 if (res
->type
== RTN_LOCAL
) {
1967 err
= fib_validate_source(skb
, saddr
, daddr
, tos
,
1968 0, dev
, in_dev
, &itag
);
1970 goto martian_source
;
1974 if (!IN_DEV_FORWARD(in_dev
)) {
1975 err
= -EHOSTUNREACH
;
1978 if (res
->type
!= RTN_UNICAST
)
1979 goto martian_destination
;
1981 err
= ip_mkroute_input(skb
, res
, in_dev
, daddr
, saddr
, tos
);
1985 if (skb
->protocol
!= htons(ETH_P_IP
))
1988 if (!ipv4_is_zeronet(saddr
)) {
1989 err
= fib_validate_source(skb
, saddr
, 0, tos
, 0, dev
,
1992 goto martian_source
;
1994 flags
|= RTCF_BROADCAST
;
1995 res
->type
= RTN_BROADCAST
;
1996 RT_CACHE_STAT_INC(in_brd
);
2002 rth
= rcu_dereference(FIB_RES_NH(*res
).nh_rth_input
);
2003 if (rt_cache_valid(rth
)) {
2004 skb_dst_set_noref(skb
, &rth
->dst
);
2012 rth
= rt_dst_alloc(l3mdev_master_dev_rcu(dev
) ? : net
->loopback_dev
,
2013 flags
| RTCF_LOCAL
, res
->type
,
2014 IN_DEV_CONF_GET(in_dev
, NOPOLICY
), false, do_cache
);
2018 rth
->dst
.output
= ip_rt_bug
;
2019 #ifdef CONFIG_IP_ROUTE_CLASSID
2020 rth
->dst
.tclassid
= itag
;
2022 rth
->rt_is_input
= 1;
2024 rth
->rt_table_id
= res
->table
->tb_id
;
2026 RT_CACHE_STAT_INC(in_slow_tot
);
2027 if (res
->type
== RTN_UNREACHABLE
) {
2028 rth
->dst
.input
= ip_error
;
2029 rth
->dst
.error
= -err
;
2030 rth
->rt_flags
&= ~RTCF_LOCAL
;
2034 struct fib_nh
*nh
= &FIB_RES_NH(*res
);
2036 rth
->dst
.lwtstate
= lwtstate_get(nh
->nh_lwtstate
);
2037 if (lwtunnel_input_redirect(rth
->dst
.lwtstate
)) {
2038 WARN_ON(rth
->dst
.input
== lwtunnel_input
);
2039 rth
->dst
.lwtstate
->orig_input
= rth
->dst
.input
;
2040 rth
->dst
.input
= lwtunnel_input
;
2043 if (unlikely(!rt_cache_route(nh
, rth
)))
2044 rt_add_uncached_list(rth
);
2046 skb_dst_set(skb
, &rth
->dst
);
2051 RT_CACHE_STAT_INC(in_no_route
);
2052 res
->type
= RTN_UNREACHABLE
;
2058 * Do not cache martian addresses: they should be logged (RFC1812)
2060 martian_destination
:
2061 RT_CACHE_STAT_INC(in_martian_dst
);
2062 #ifdef CONFIG_IP_ROUTE_VERBOSE
2063 if (IN_DEV_LOG_MARTIANS(in_dev
))
2064 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2065 &daddr
, &saddr
, dev
->name
);
2077 ip_handle_martian_source(dev
, in_dev
, skb
, daddr
, saddr
);
2081 int ip_route_input_noref(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
2082 u8 tos
, struct net_device
*dev
)
2084 struct fib_result res
;
2087 tos
&= IPTOS_RT_MASK
;
2089 err
= ip_route_input_rcu(skb
, daddr
, saddr
, tos
, dev
, &res
);
2094 EXPORT_SYMBOL(ip_route_input_noref
);
2096 /* called with rcu_read_lock held */
2097 int ip_route_input_rcu(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
2098 u8 tos
, struct net_device
*dev
, struct fib_result
*res
)
2100 /* Multicast recognition logic is moved from route cache to here.
2101 The problem was that too many Ethernet cards have broken/missing
2102 hardware multicast filters :-( As result the host on multicasting
2103 network acquires a lot of useless route cache entries, sort of
2104 SDR messages from all the world. Now we try to get rid of them.
2105 Really, provided software IP multicast filter is organized
2106 reasonably (at least, hashed), it does not result in a slowdown
2107 comparing with route cache reject entries.
2108 Note, that multicast routers are not affected, because
2109 route cache entry is created eventually.
2111 if (ipv4_is_multicast(daddr
)) {
2112 struct in_device
*in_dev
= __in_dev_get_rcu(dev
);
2117 our
= ip_check_mc_rcu(in_dev
, daddr
, saddr
,
2118 ip_hdr(skb
)->protocol
);
2120 /* check l3 master if no match yet */
2121 if ((!in_dev
|| !our
) && netif_is_l3_slave(dev
)) {
2122 struct in_device
*l3_in_dev
;
2124 l3_in_dev
= __in_dev_get_rcu(skb
->dev
);
2126 our
= ip_check_mc_rcu(l3_in_dev
, daddr
, saddr
,
2127 ip_hdr(skb
)->protocol
);
2131 #ifdef CONFIG_IP_MROUTE
2133 (!ipv4_is_local_multicast(daddr
) &&
2134 IN_DEV_MFORWARD(in_dev
))
2137 err
= ip_route_input_mc(skb
, daddr
, saddr
,
2143 return ip_route_input_slow(skb
, daddr
, saddr
, tos
, dev
, res
);
2146 /* called with rcu_read_lock() */
2147 static struct rtable
*__mkroute_output(const struct fib_result
*res
,
2148 const struct flowi4
*fl4
, int orig_oif
,
2149 struct net_device
*dev_out
,
2152 struct fib_info
*fi
= res
->fi
;
2153 struct fib_nh_exception
*fnhe
;
2154 struct in_device
*in_dev
;
2155 u16 type
= res
->type
;
2159 in_dev
= __in_dev_get_rcu(dev_out
);
2161 return ERR_PTR(-EINVAL
);
2163 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev
)))
2164 if (ipv4_is_loopback(fl4
->saddr
) &&
2165 !(dev_out
->flags
& IFF_LOOPBACK
) &&
2166 !netif_is_l3_master(dev_out
))
2167 return ERR_PTR(-EINVAL
);
2169 if (ipv4_is_lbcast(fl4
->daddr
))
2170 type
= RTN_BROADCAST
;
2171 else if (ipv4_is_multicast(fl4
->daddr
))
2172 type
= RTN_MULTICAST
;
2173 else if (ipv4_is_zeronet(fl4
->daddr
))
2174 return ERR_PTR(-EINVAL
);
2176 if (dev_out
->flags
& IFF_LOOPBACK
)
2177 flags
|= RTCF_LOCAL
;
2180 if (type
== RTN_BROADCAST
) {
2181 flags
|= RTCF_BROADCAST
| RTCF_LOCAL
;
2183 } else if (type
== RTN_MULTICAST
) {
2184 flags
|= RTCF_MULTICAST
| RTCF_LOCAL
;
2185 if (!ip_check_mc_rcu(in_dev
, fl4
->daddr
, fl4
->saddr
,
2187 flags
&= ~RTCF_LOCAL
;
2190 /* If multicast route do not exist use
2191 * default one, but do not gateway in this case.
2194 if (fi
&& res
->prefixlen
< 4)
2196 } else if ((type
== RTN_LOCAL
) && (orig_oif
!= 0) &&
2197 (orig_oif
!= dev_out
->ifindex
)) {
2198 /* For local routes that require a particular output interface
2199 * we do not want to cache the result. Caching the result
2200 * causes incorrect behaviour when there are multiple source
2201 * addresses on the interface, the end result being that if the
2202 * intended recipient is waiting on that interface for the
2203 * packet he won't receive it because it will be delivered on
2204 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2205 * be set to the loopback interface as well.
2211 do_cache
&= fi
!= NULL
;
2213 struct rtable __rcu
**prth
;
2214 struct fib_nh
*nh
= &FIB_RES_NH(*res
);
2216 fnhe
= find_exception(nh
, fl4
->daddr
);
2218 prth
= &fnhe
->fnhe_rth_output
;
2219 rth
= rcu_dereference(*prth
);
2220 if (rth
&& rth
->dst
.expires
&&
2221 time_after(jiffies
, rth
->dst
.expires
)) {
2222 ip_del_fnhe(nh
, fl4
->daddr
);
2229 if (unlikely(fl4
->flowi4_flags
&
2230 FLOWI_FLAG_KNOWN_NH
&&
2232 nh
->nh_scope
== RT_SCOPE_LINK
))) {
2236 prth
= raw_cpu_ptr(nh
->nh_pcpu_rth_output
);
2237 rth
= rcu_dereference(*prth
);
2240 if (rt_cache_valid(rth
) && dst_hold_safe(&rth
->dst
))
2245 rth
= rt_dst_alloc(dev_out
, flags
, type
,
2246 IN_DEV_CONF_GET(in_dev
, NOPOLICY
),
2247 IN_DEV_CONF_GET(in_dev
, NOXFRM
),
2250 return ERR_PTR(-ENOBUFS
);
2252 rth
->rt_iif
= orig_oif
;
2254 rth
->rt_table_id
= res
->table
->tb_id
;
2256 RT_CACHE_STAT_INC(out_slow_tot
);
2258 if (flags
& (RTCF_BROADCAST
| RTCF_MULTICAST
)) {
2259 if (flags
& RTCF_LOCAL
&&
2260 !(dev_out
->flags
& IFF_LOOPBACK
)) {
2261 rth
->dst
.output
= ip_mc_output
;
2262 RT_CACHE_STAT_INC(out_slow_mc
);
2264 #ifdef CONFIG_IP_MROUTE
2265 if (type
== RTN_MULTICAST
) {
2266 if (IN_DEV_MFORWARD(in_dev
) &&
2267 !ipv4_is_local_multicast(fl4
->daddr
)) {
2268 rth
->dst
.input
= ip_mr_input
;
2269 rth
->dst
.output
= ip_mc_output
;
2275 rt_set_nexthop(rth
, fl4
->daddr
, res
, fnhe
, fi
, type
, 0, do_cache
);
2276 set_lwt_redirect(rth
);
2282 * Major route resolver routine.
2285 struct rtable
*ip_route_output_key_hash(struct net
*net
, struct flowi4
*fl4
,
2286 const struct sk_buff
*skb
)
2288 __u8 tos
= RT_FL_TOS(fl4
);
2289 struct fib_result res
;
2296 fl4
->flowi4_iif
= LOOPBACK_IFINDEX
;
2297 fl4
->flowi4_tos
= tos
& IPTOS_RT_MASK
;
2298 fl4
->flowi4_scope
= ((tos
& RTO_ONLINK
) ?
2299 RT_SCOPE_LINK
: RT_SCOPE_UNIVERSE
);
2302 rth
= ip_route_output_key_hash_rcu(net
, fl4
, &res
, skb
);
2307 EXPORT_SYMBOL_GPL(ip_route_output_key_hash
);
2309 struct rtable
*ip_route_output_key_hash_rcu(struct net
*net
, struct flowi4
*fl4
,
2310 struct fib_result
*res
,
2311 const struct sk_buff
*skb
)
2313 struct net_device
*dev_out
= NULL
;
2314 int orig_oif
= fl4
->flowi4_oif
;
2315 unsigned int flags
= 0;
2317 int err
= -ENETUNREACH
;
2320 rth
= ERR_PTR(-EINVAL
);
2321 if (ipv4_is_multicast(fl4
->saddr
) ||
2322 ipv4_is_lbcast(fl4
->saddr
) ||
2323 ipv4_is_zeronet(fl4
->saddr
))
2326 /* I removed check for oif == dev_out->oif here.
2327 It was wrong for two reasons:
2328 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2329 is assigned to multiple interfaces.
2330 2. Moreover, we are allowed to send packets with saddr
2331 of another iface. --ANK
2334 if (fl4
->flowi4_oif
== 0 &&
2335 (ipv4_is_multicast(fl4
->daddr
) ||
2336 ipv4_is_lbcast(fl4
->daddr
))) {
2337 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2338 dev_out
= __ip_dev_find(net
, fl4
->saddr
, false);
2342 /* Special hack: user can direct multicasts
2343 and limited broadcast via necessary interface
2344 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2345 This hack is not just for fun, it allows
2346 vic,vat and friends to work.
2347 They bind socket to loopback, set ttl to zero
2348 and expect that it will work.
2349 From the viewpoint of routing cache they are broken,
2350 because we are not allowed to build multicast path
2351 with loopback source addr (look, routing cache
2352 cannot know, that ttl is zero, so that packet
2353 will not leave this host and route is valid).
2354 Luckily, this hack is good workaround.
2357 fl4
->flowi4_oif
= dev_out
->ifindex
;
2361 if (!(fl4
->flowi4_flags
& FLOWI_FLAG_ANYSRC
)) {
2362 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2363 if (!__ip_dev_find(net
, fl4
->saddr
, false))
2369 if (fl4
->flowi4_oif
) {
2370 dev_out
= dev_get_by_index_rcu(net
, fl4
->flowi4_oif
);
2371 rth
= ERR_PTR(-ENODEV
);
2375 /* RACE: Check return value of inet_select_addr instead. */
2376 if (!(dev_out
->flags
& IFF_UP
) || !__in_dev_get_rcu(dev_out
)) {
2377 rth
= ERR_PTR(-ENETUNREACH
);
2380 if (ipv4_is_local_multicast(fl4
->daddr
) ||
2381 ipv4_is_lbcast(fl4
->daddr
) ||
2382 fl4
->flowi4_proto
== IPPROTO_IGMP
) {
2384 fl4
->saddr
= inet_select_addr(dev_out
, 0,
2389 if (ipv4_is_multicast(fl4
->daddr
))
2390 fl4
->saddr
= inet_select_addr(dev_out
, 0,
2392 else if (!fl4
->daddr
)
2393 fl4
->saddr
= inet_select_addr(dev_out
, 0,
2399 fl4
->daddr
= fl4
->saddr
;
2401 fl4
->daddr
= fl4
->saddr
= htonl(INADDR_LOOPBACK
);
2402 dev_out
= net
->loopback_dev
;
2403 fl4
->flowi4_oif
= LOOPBACK_IFINDEX
;
2404 res
->type
= RTN_LOCAL
;
2405 flags
|= RTCF_LOCAL
;
2409 err
= fib_lookup(net
, fl4
, res
, 0);
2413 if (fl4
->flowi4_oif
&&
2414 (ipv4_is_multicast(fl4
->daddr
) ||
2415 !netif_index_is_l3_master(net
, fl4
->flowi4_oif
))) {
2416 /* Apparently, routing tables are wrong. Assume,
2417 that the destination is on link.
2420 Because we are allowed to send to iface
2421 even if it has NO routes and NO assigned
2422 addresses. When oif is specified, routing
2423 tables are looked up with only one purpose:
2424 to catch if destination is gatewayed, rather than
2425 direct. Moreover, if MSG_DONTROUTE is set,
2426 we send packet, ignoring both routing tables
2427 and ifaddr state. --ANK
2430 We could make it even if oif is unknown,
2431 likely IPv6, but we do not.
2434 if (fl4
->saddr
== 0)
2435 fl4
->saddr
= inet_select_addr(dev_out
, 0,
2437 res
->type
= RTN_UNICAST
;
2444 if (res
->type
== RTN_LOCAL
) {
2446 if (res
->fi
->fib_prefsrc
)
2447 fl4
->saddr
= res
->fi
->fib_prefsrc
;
2449 fl4
->saddr
= fl4
->daddr
;
2452 /* L3 master device is the loopback for that domain */
2453 dev_out
= l3mdev_master_dev_rcu(FIB_RES_DEV(*res
)) ? :
2456 /* make sure orig_oif points to fib result device even
2457 * though packet rx/tx happens over loopback or l3mdev
2459 orig_oif
= FIB_RES_OIF(*res
);
2461 fl4
->flowi4_oif
= dev_out
->ifindex
;
2462 flags
|= RTCF_LOCAL
;
2466 fib_select_path(net
, res
, fl4
, skb
);
2468 dev_out
= FIB_RES_DEV(*res
);
2469 fl4
->flowi4_oif
= dev_out
->ifindex
;
2473 rth
= __mkroute_output(res
, fl4
, orig_oif
, dev_out
, flags
);
2479 static struct dst_entry
*ipv4_blackhole_dst_check(struct dst_entry
*dst
, u32 cookie
)
2484 static unsigned int ipv4_blackhole_mtu(const struct dst_entry
*dst
)
2486 unsigned int mtu
= dst_metric_raw(dst
, RTAX_MTU
);
2488 return mtu
? : dst
->dev
->mtu
;
2491 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry
*dst
, struct sock
*sk
,
2492 struct sk_buff
*skb
, u32 mtu
)
2496 static void ipv4_rt_blackhole_redirect(struct dst_entry
*dst
, struct sock
*sk
,
2497 struct sk_buff
*skb
)
2501 static u32
*ipv4_rt_blackhole_cow_metrics(struct dst_entry
*dst
,
2507 static struct dst_ops ipv4_dst_blackhole_ops
= {
2509 .check
= ipv4_blackhole_dst_check
,
2510 .mtu
= ipv4_blackhole_mtu
,
2511 .default_advmss
= ipv4_default_advmss
,
2512 .update_pmtu
= ipv4_rt_blackhole_update_pmtu
,
2513 .redirect
= ipv4_rt_blackhole_redirect
,
2514 .cow_metrics
= ipv4_rt_blackhole_cow_metrics
,
2515 .neigh_lookup
= ipv4_neigh_lookup
,
2518 struct dst_entry
*ipv4_blackhole_route(struct net
*net
, struct dst_entry
*dst_orig
)
2520 struct rtable
*ort
= (struct rtable
*) dst_orig
;
2523 rt
= dst_alloc(&ipv4_dst_blackhole_ops
, NULL
, 1, DST_OBSOLETE_DEAD
, 0);
2525 struct dst_entry
*new = &rt
->dst
;
2528 new->input
= dst_discard
;
2529 new->output
= dst_discard_out
;
2531 new->dev
= net
->loopback_dev
;
2535 rt
->rt_is_input
= ort
->rt_is_input
;
2536 rt
->rt_iif
= ort
->rt_iif
;
2537 rt
->rt_pmtu
= ort
->rt_pmtu
;
2539 rt
->rt_genid
= rt_genid_ipv4(net
);
2540 rt
->rt_flags
= ort
->rt_flags
;
2541 rt
->rt_type
= ort
->rt_type
;
2542 rt
->rt_gateway
= ort
->rt_gateway
;
2543 rt
->rt_uses_gateway
= ort
->rt_uses_gateway
;
2545 INIT_LIST_HEAD(&rt
->rt_uncached
);
2548 dst_release(dst_orig
);
2550 return rt
? &rt
->dst
: ERR_PTR(-ENOMEM
);
2553 struct rtable
*ip_route_output_flow(struct net
*net
, struct flowi4
*flp4
,
2554 const struct sock
*sk
)
2556 struct rtable
*rt
= __ip_route_output_key(net
, flp4
);
2561 if (flp4
->flowi4_proto
)
2562 rt
= (struct rtable
*)xfrm_lookup_route(net
, &rt
->dst
,
2563 flowi4_to_flowi(flp4
),
2568 EXPORT_SYMBOL_GPL(ip_route_output_flow
);
2570 /* called with rcu_read_lock held */
2571 static int rt_fill_info(struct net
*net
, __be32 dst
, __be32 src
, u32 table_id
,
2572 struct flowi4
*fl4
, struct sk_buff
*skb
, u32 portid
,
2575 struct rtable
*rt
= skb_rtable(skb
);
2577 struct nlmsghdr
*nlh
;
2578 unsigned long expires
= 0;
2580 u32 metrics
[RTAX_MAX
];
2582 nlh
= nlmsg_put(skb
, portid
, seq
, RTM_NEWROUTE
, sizeof(*r
), 0);
2586 r
= nlmsg_data(nlh
);
2587 r
->rtm_family
= AF_INET
;
2588 r
->rtm_dst_len
= 32;
2590 r
->rtm_tos
= fl4
->flowi4_tos
;
2591 r
->rtm_table
= table_id
< 256 ? table_id
: RT_TABLE_COMPAT
;
2592 if (nla_put_u32(skb
, RTA_TABLE
, table_id
))
2593 goto nla_put_failure
;
2594 r
->rtm_type
= rt
->rt_type
;
2595 r
->rtm_scope
= RT_SCOPE_UNIVERSE
;
2596 r
->rtm_protocol
= RTPROT_UNSPEC
;
2597 r
->rtm_flags
= (rt
->rt_flags
& ~0xFFFF) | RTM_F_CLONED
;
2598 if (rt
->rt_flags
& RTCF_NOTIFY
)
2599 r
->rtm_flags
|= RTM_F_NOTIFY
;
2600 if (IPCB(skb
)->flags
& IPSKB_DOREDIRECT
)
2601 r
->rtm_flags
|= RTCF_DOREDIRECT
;
2603 if (nla_put_in_addr(skb
, RTA_DST
, dst
))
2604 goto nla_put_failure
;
2606 r
->rtm_src_len
= 32;
2607 if (nla_put_in_addr(skb
, RTA_SRC
, src
))
2608 goto nla_put_failure
;
2611 nla_put_u32(skb
, RTA_OIF
, rt
->dst
.dev
->ifindex
))
2612 goto nla_put_failure
;
2613 #ifdef CONFIG_IP_ROUTE_CLASSID
2614 if (rt
->dst
.tclassid
&&
2615 nla_put_u32(skb
, RTA_FLOW
, rt
->dst
.tclassid
))
2616 goto nla_put_failure
;
2618 if (!rt_is_input_route(rt
) &&
2619 fl4
->saddr
!= src
) {
2620 if (nla_put_in_addr(skb
, RTA_PREFSRC
, fl4
->saddr
))
2621 goto nla_put_failure
;
2623 if (rt
->rt_uses_gateway
&&
2624 nla_put_in_addr(skb
, RTA_GATEWAY
, rt
->rt_gateway
))
2625 goto nla_put_failure
;
2627 expires
= rt
->dst
.expires
;
2629 unsigned long now
= jiffies
;
2631 if (time_before(now
, expires
))
2637 memcpy(metrics
, dst_metrics_ptr(&rt
->dst
), sizeof(metrics
));
2638 if (rt
->rt_pmtu
&& expires
)
2639 metrics
[RTAX_MTU
- 1] = rt
->rt_pmtu
;
2640 if (rtnetlink_put_metrics(skb
, metrics
) < 0)
2641 goto nla_put_failure
;
2643 if (fl4
->flowi4_mark
&&
2644 nla_put_u32(skb
, RTA_MARK
, fl4
->flowi4_mark
))
2645 goto nla_put_failure
;
2647 if (!uid_eq(fl4
->flowi4_uid
, INVALID_UID
) &&
2648 nla_put_u32(skb
, RTA_UID
,
2649 from_kuid_munged(current_user_ns(), fl4
->flowi4_uid
)))
2650 goto nla_put_failure
;
2652 error
= rt
->dst
.error
;
2654 if (rt_is_input_route(rt
)) {
2655 #ifdef CONFIG_IP_MROUTE
2656 if (ipv4_is_multicast(dst
) && !ipv4_is_local_multicast(dst
) &&
2657 IPV4_DEVCONF_ALL(net
, MC_FORWARDING
)) {
2658 int err
= ipmr_get_route(net
, skb
,
2659 fl4
->saddr
, fl4
->daddr
,
2665 goto nla_put_failure
;
2669 if (nla_put_u32(skb
, RTA_IIF
, skb
->dev
->ifindex
))
2670 goto nla_put_failure
;
2673 if (rtnl_put_cacheinfo(skb
, &rt
->dst
, 0, expires
, error
) < 0)
2674 goto nla_put_failure
;
2676 nlmsg_end(skb
, nlh
);
2680 nlmsg_cancel(skb
, nlh
);
/*
 * inet_rtm_getroute() - doit handler for RTM_GETROUTE netlink requests
 * (registered for PF_INET in ip_rt_init()).
 *
 * Parses the RTA_* attributes of the request, builds a dummy skb with a
 * minimal IPv4 header, performs an input-path route lookup when RTA_IIF
 * is supplied or an output-path lookup otherwise, and unicasts the
 * resulting route -- or, for RTM_F_FIB_MATCH, the matching FIB entry --
 * back to the requesting socket.
 *
 * NOTE(review): this extract is incomplete.  The declarations of err,
 * rtm, src, dst, iif, mark, uid and fl4, the rcu_read_lock/unlock pair,
 * most error checks and the errout/return paths are not visible; the
 * comments below describe only the visible lines.
 */
2684 static int inet_rtm_getroute(struct sk_buff
*in_skb
, struct nlmsghdr
*nlh
,
2685 struct netlink_ext_ack
*extack
)
2687 struct net
*net
= sock_net(in_skb
->sk
);
2689 struct nlattr
*tb
[RTA_MAX
+1];
2690 struct fib_result res
= {};
2691 struct rtable
*rt
= NULL
;
2698 struct sk_buff
*skb
;
2699 u32 table_id
= RT_TABLE_MAIN
;
/* Validate and extract the RTA_* attributes per rtm_ipv4_policy. */
2702 err
= nlmsg_parse(nlh
, sizeof(*rtm
), tb
, RTA_MAX
, rtm_ipv4_policy
,
2707 rtm
= nlmsg_data(nlh
);
/* Dummy skb that carries the synthesized IP header through the routing
 * engine (the allocation-failure check is not visible in this extract).
 */
2709 skb
= alloc_skb(NLMSG_GOODSIZE
, GFP_KERNEL
);
2715 /* Reserve room for dummy headers, this skb can pass
2716 through good chunk of routing engine.
 */
2718 skb_reset_mac_header(skb
);
2719 skb_reset_network_header(skb
);
/* Optional attributes default to 0 when absent. */
2721 src
= tb
[RTA_SRC
] ? nla_get_in_addr(tb
[RTA_SRC
]) : 0;
2722 dst
= tb
[RTA_DST
] ? nla_get_in_addr(tb
[RTA_DST
]) : 0;
2723 iif
= tb
[RTA_IIF
] ? nla_get_u32(tb
[RTA_IIF
]) : 0;
2724 mark
= tb
[RTA_MARK
] ? nla_get_u32(tb
[RTA_MARK
]) : 0;
/* uid used in the flow key: taken from RTA_UID when supplied (the
 * guarding `if (tb[RTA_UID])` is not visible in this extract),
 * otherwise INVALID_UID for input lookups / the current uid for
 * output lookups.
 */
2726 uid
= make_kuid(current_user_ns(), nla_get_u32(tb
[RTA_UID
]));
2728 uid
= (iif
? INVALID_UID
: current_uid());
2730 /* Bugfix: need to give ip_route_input enough of an IP header to
 * do its job -- protocol/saddr/daddr are filled in below.
 */
2733 ip_hdr(skb
)->protocol
= IPPROTO_UDP
;
2734 ip_hdr(skb
)->saddr
= src
;
2735 ip_hdr(skb
)->daddr
= dst
;
2737 skb_reserve(skb
, MAX_HEADER
+ sizeof(struct iphdr
));
/* Build the flow key for the lookup from the request attributes. */
2739 memset(&fl4
, 0, sizeof(fl4
));
2742 fl4
.flowi4_tos
= rtm
->rtm_tos
;
2743 fl4
.flowi4_oif
= tb
[RTA_OIF
] ? nla_get_u32(tb
[RTA_OIF
]) : 0;
2744 fl4
.flowi4_mark
= mark
;
2745 fl4
.flowi4_uid
= uid
;
/* Input-path lookup (RTA_IIF given): resolve the ingress device and run
 * the dummy skb through ip_route_input_rcu().  The surrounding
 * `if (iif)`, the RCU locking and the device-not-found handling are not
 * visible in this extract.
 */
2750 struct net_device
*dev
;
2752 dev
= dev_get_by_index_rcu(net
, iif
);
2758 skb
->protocol
= htons(ETH_P_IP
);
2761 err
= ip_route_input_rcu(skb
, dst
, src
, rtm
->rtm_tos
,
/* The input lookup attaches the route to the skb; surface any error
 * recorded in the dst.
 */
2764 rt
= skb_rtable(skb
);
2765 if (err
== 0 && rt
->dst
.error
)
2766 err
= -rt
->dst
.error
;
/* Output-path lookup: the flow is treated as locally originated. */
2768 fl4
.flowi4_iif
= LOOPBACK_IFINDEX
;
2769 rt
= ip_route_output_key_hash_rcu(net
, &fl4
, &res
, skb
);
2774 skb_dst_set(skb
, &rt
->dst
);
2780 if (rtm
->rtm_flags
& RTM_F_NOTIFY
)
2781 rt
->rt_flags
|= RTCF_NOTIFY
;
/* Report the table the route actually resolved in, if requested. */
2783 if (rtm
->rtm_flags
& RTM_F_LOOKUP_TABLE
)
2784 table_id
= rt
->rt_table_id
;
/* RTM_F_FIB_MATCH: dump the matching FIB entry instead of the dst.
 * The visible fallbacks map the result type through fib_props[] or to
 * -EHOSTUNREACH; the conditions guarding these branches are not
 * visible in this extract.
 */
2786 if (rtm
->rtm_flags
& RTM_F_FIB_MATCH
) {
2788 err
= fib_props
[res
.type
].error
;
2790 err
= -EHOSTUNREACH
;
2793 err
= fib_dump_info(skb
, NETLINK_CB(in_skb
).portid
,
2794 nlh
->nlmsg_seq
, RTM_NEWROUTE
, table_id
,
2795 rt
->rt_type
, res
.prefix
, res
.prefixlen
,
2796 fl4
.flowi4_tos
, res
.fi
, 0);
/* Default: serialize the looked-up route itself via rt_fill_info(). */
2798 err
= rt_fill_info(net
, dst
, src
, table_id
, &fl4
, skb
,
2799 NETLINK_CB(in_skb
).portid
, nlh
->nlmsg_seq
);
/* Unicast the reply skb back to the requester's netlink portid. */
2806 err
= rtnl_unicast(skb
, net
, NETLINK_CB(in_skb
).portid
);
2816 void ip_rt_multicast_event(struct in_device
*in_dev
)
2818 rt_cache_flush(dev_net(in_dev
->dev
));
2821 #ifdef CONFIG_SYSCTL
/* Garbage-collection tunables exposed through the sysctl table below. */
/* gc_interval: interval value in jiffies (60 seconds). */
2822 static int ip_rt_gc_interval __read_mostly
= 60 * HZ
;
/* gc_min_interval: lower bound in jiffies; exposed both as
 * gc_min_interval (jiffies) and gc_min_interval_ms (milliseconds).
 */
2823 static int ip_rt_gc_min_interval __read_mostly
= HZ
/ 2;
/* gc_elasticity: tunable exposed via the gc_elasticity sysctl entry. */
2824 static int ip_rt_gc_elasticity __read_mostly
= 8;
/*
 * ipv4_sysctl_rtcache_flush() - proc handler for net/ipv4/route/flush.
 * Flushes the owning namespace's routing cache and bumps its fnhe genid
 * so cached next-hop exception state is invalidated as well.
 * NOTE(review): the write-only check and the return statements are not
 * visible in this extract.
 */
2826 static int ipv4_sysctl_rtcache_flush(struct ctl_table
*__ctl
, int write
,
2827 void __user
*buffer
,
2828 size_t *lenp
, loff_t
*ppos
)
/* The owning namespace is stashed in ->extra1 by sysctl_route_net_init(). */
2830 struct net
*net
= (struct net
*)__ctl
->extra1
;
2833 rt_cache_flush(net
);
2834 fnhe_genid_bump(net
);
/*
 * ipv4_route_table[] - template for the global /proc/sys/net/ipv4/route/*
 * tunables registered by ip_static_sysctl_init().
 * NOTE(review): the per-entry braces, the .mode fields and the
 * terminating empty entry are not visible in this extract.
 */
2841 static struct ctl_table ipv4_route_table
[] = {
2843 .procname
= "gc_thresh",
2844 .data
= &ipv4_dst_ops
.gc_thresh
,
2845 .maxlen
= sizeof(int),
2847 .proc_handler
= proc_dointvec
,
2850 .procname
= "max_size",
2851 .data
= &ip_rt_max_size
,
2852 .maxlen
= sizeof(int),
2854 .proc_handler
= proc_dointvec
,
2857 /* Deprecated. Use gc_min_interval_ms */
2859 .procname
= "gc_min_interval",
2860 .data
= &ip_rt_gc_min_interval
,
2861 .maxlen
= sizeof(int),
2863 .proc_handler
= proc_dointvec_jiffies
,
/* Same backing variable as gc_min_interval, converted via the
 * millisecond-jiffies handler.
 */
2866 .procname
= "gc_min_interval_ms",
2867 .data
= &ip_rt_gc_min_interval
,
2868 .maxlen
= sizeof(int),
2870 .proc_handler
= proc_dointvec_ms_jiffies
,
2873 .procname
= "gc_timeout",
2874 .data
= &ip_rt_gc_timeout
,
2875 .maxlen
= sizeof(int),
2877 .proc_handler
= proc_dointvec_jiffies
,
2880 .procname
= "gc_interval",
2881 .data
= &ip_rt_gc_interval
,
2882 .maxlen
= sizeof(int),
2884 .proc_handler
= proc_dointvec_jiffies
,
2887 .procname
= "redirect_load",
2888 .data
= &ip_rt_redirect_load
,
2889 .maxlen
= sizeof(int),
2891 .proc_handler
= proc_dointvec
,
2894 .procname
= "redirect_number",
2895 .data
= &ip_rt_redirect_number
,
2896 .maxlen
= sizeof(int),
2898 .proc_handler
= proc_dointvec
,
2901 .procname
= "redirect_silence",
2902 .data
= &ip_rt_redirect_silence
,
2903 .maxlen
= sizeof(int),
2905 .proc_handler
= proc_dointvec
,
2908 .procname
= "error_cost",
2909 .data
= &ip_rt_error_cost
,
2910 .maxlen
= sizeof(int),
2912 .proc_handler
= proc_dointvec
,
2915 .procname
= "error_burst",
2916 .data
= &ip_rt_error_burst
,
2917 .maxlen
= sizeof(int),
2919 .proc_handler
= proc_dointvec
,
2922 .procname
= "gc_elasticity",
2923 .data
= &ip_rt_gc_elasticity
,
2924 .maxlen
= sizeof(int),
2926 .proc_handler
= proc_dointvec
,
2929 .procname
= "mtu_expires",
2930 .data
= &ip_rt_mtu_expires
,
2931 .maxlen
= sizeof(int),
2933 .proc_handler
= proc_dointvec_jiffies
,
/* min_pmtu is bounded from below via .extra1 (proc_dointvec_minmax). */
2936 .procname
= "min_pmtu",
2937 .data
= &ip_rt_min_pmtu
,
2938 .maxlen
= sizeof(int),
2940 .proc_handler
= proc_dointvec_minmax
,
2941 .extra1
= &ip_min_valid_pmtu
,
2944 .procname
= "min_adv_mss",
2945 .data
= &ip_rt_min_advmss
,
2946 .maxlen
= sizeof(int),
2948 .proc_handler
= proc_dointvec
,
/*
 * ipv4_route_flush_table[] - template for the per-namespace
 * net/ipv4/route/flush entry; .extra1 is pointed at the owning
 * struct net by sysctl_route_net_init().
 * NOTE(review): braces, .mode and the terminating empty entry are not
 * visible in this extract.
 */
2953 static struct ctl_table ipv4_route_flush_table
[] = {
2955 .procname
= "flush",
2956 .maxlen
= sizeof(int),
2958 .proc_handler
= ipv4_sysctl_rtcache_flush
,
/*
 * sysctl_route_net_init() - per-namespace registration of the
 * net/ipv4/route "flush" sysctl.
 * Non-initial namespaces get a private copy of the template table, and
 * the entry is hidden from namespaces not owned by init_user_ns.  The
 * owning struct net is stored in ->extra1 for use by
 * ipv4_sysctl_rtcache_flush().
 * NOTE(review): the kmemdup() failure check, the error labels and the
 * return statements are not visible in this extract.
 */
2963 static __net_init
int sysctl_route_net_init(struct net
*net
)
2965 struct ctl_table
*tbl
;
2967 tbl
= ipv4_route_flush_table
;
2968 if (!net_eq(net
, &init_net
)) {
2969 tbl
= kmemdup(tbl
, sizeof(ipv4_route_flush_table
), GFP_KERNEL
);
2973 /* Don't export sysctls to unprivileged users */
2974 if (net
->user_ns
!= &init_user_ns
)
2975 tbl
[0].procname
= NULL
;
2977 tbl
[0].extra1
= net
;
2979 net
->ipv4
.route_hdr
= register_net_sysctl(net
, "net/ipv4/route", tbl
);
2980 if (!net
->ipv4
.route_hdr
)
/* Error path: a duplicated table is freed here (the free call itself
 * is not visible in this extract).
 */
2985 if (tbl
!= ipv4_route_flush_table
)
/*
 * sysctl_route_net_exit() - per-namespace teardown: unregister the
 * sysctl header installed by sysctl_route_net_init().
 * The BUG_ON guards against this path ever seeing the shared template
 * table, which must never be freed.
 * NOTE(review): the kfree() of the duplicated table is not visible in
 * this extract.
 */
2991 static __net_exit
void sysctl_route_net_exit(struct net
*net
)
2993 struct ctl_table
*tbl
;
2995 tbl
= net
->ipv4
.route_hdr
->ctl_table_arg
;
2996 unregister_net_sysctl_table(net
->ipv4
.route_hdr
);
2997 BUG_ON(tbl
== ipv4_route_flush_table
);
/* Pernet operations tying the route sysctls to namespace lifetime. */
3001 static __net_initdata
struct pernet_operations sysctl_route_ops
= {
3002 .init
= sysctl_route_net_init
,
3003 .exit
= sysctl_route_net_exit
,
/*
 * rt_genid_init() - per-namespace initialization of the route and fnhe
 * generation counters; dev_addr_genid starts at a random value.
 * NOTE(review): the return statement is not visible in this extract.
 */
3007 static __net_init
int rt_genid_init(struct net
*net
)
3009 atomic_set(&net
->ipv4
.rt_genid
, 0);
3010 atomic_set(&net
->fnhe_genid
, 0);
3011 atomic_set(&net
->ipv4
.dev_addr_genid
, get_random_int());
/* Pernet operations for generation-counter setup; no exit needed. */
3015 static __net_initdata
struct pernet_operations rt_genid_ops
= {
3016 .init
= rt_genid_init
,
/*
 * ipv4_inetpeer_init() - allocate and attach the namespace's inetpeer
 * base.
 * NOTE(review): the allocation-failure check and the return statements
 * are not visible in this extract.
 */
3019 static int __net_init
ipv4_inetpeer_init(struct net
*net
)
3021 struct inet_peer_base
*bp
= kmalloc(sizeof(*bp
), GFP_KERNEL
);
3025 inet_peer_base_init(bp
);
3026 net
->ipv4
.peers
= bp
;
/*
 * ipv4_inetpeer_exit() - detach the namespace's inetpeer base and
 * invalidate its tree.
 * NOTE(review): the kfree() of @bp is not visible in this extract.
 */
3030 static void __net_exit
ipv4_inetpeer_exit(struct net
*net
)
3032 struct inet_peer_base
*bp
= net
->ipv4
.peers
;
/* Detach first so no new lookups find the base while it is torn down. */
3034 net
->ipv4
.peers
= NULL
;
3035 inetpeer_invalidate_tree(bp
);
/* Pernet operations tying inetpeer storage to namespace lifetime. */
3039 static __net_initdata
struct pernet_operations ipv4_inetpeer_ops
= {
3040 .init
= ipv4_inetpeer_init
,
3041 .exit
= ipv4_inetpeer_exit
,
3044 #ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu route-classid accounting buckets; allocated in ip_rt_init(). */
3045 struct ip_rt_acct __percpu
*ip_rt_acct __read_mostly
;
3046 #endif /* CONFIG_IP_ROUTE_CLASSID */
/*
 * ip_rt_init() - boot-time initialization of the IPv4 routing subsystem:
 * IP-ID generator arrays, per-cpu uncached-route lists, the dst slab
 * caches and entry counters, /proc files, the RTM_GETROUTE handler and
 * the pernet subsystems.
 * NOTE(review): this extract is missing lines (the `cpu` loop variable,
 * the failure checks guarding the panic() calls, and the final return);
 * comments describe only the visible lines.
 */
3048 int __init
ip_rt_init(void)
/* IP-ID generator state, randomly seeded below; allocation failure is
 * fatal (the guarding check is not visible in this extract).
 */
3052 ip_idents
= kmalloc(IP_IDENTS_SZ
* sizeof(*ip_idents
), GFP_KERNEL
);
3054 panic("IP: failed to allocate ip_idents\n");
3056 prandom_bytes(ip_idents
, IP_IDENTS_SZ
* sizeof(*ip_idents
));
3058 ip_tstamps
= kcalloc(IP_IDENTS_SZ
, sizeof(*ip_tstamps
), GFP_KERNEL
);
3060 panic("IP: failed to allocate ip_tstamps\n");
/* Per-cpu lists of uncached routes, each with its own spinlock. */
3062 for_each_possible_cpu(cpu
) {
3063 struct uncached_list
*ul
= &per_cpu(rt_uncached_list
, cpu
);
3065 INIT_LIST_HEAD(&ul
->head
);
3066 spin_lock_init(&ul
->lock
);
3068 #ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu classid accounting area (256 buckets per cpu). */
3069 ip_rt_acct
= __alloc_percpu(256 * sizeof(struct ip_rt_acct
), __alignof__(struct ip_rt_acct
));
3071 panic("IP: failed to allocate ip_rt_acct\n");
/* Slab cache for struct rtable, shared with the blackhole dst ops. */
3074 ipv4_dst_ops
.kmem_cachep
=
3075 kmem_cache_create("ip_dst_cache", sizeof(struct rtable
), 0,
3076 SLAB_HWCACHE_ALIGN
|SLAB_PANIC
, NULL
);
3078 ipv4_dst_blackhole_ops
.kmem_cachep
= ipv4_dst_ops
.kmem_cachep
;
3080 if (dst_entries_init(&ipv4_dst_ops
) < 0)
3081 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3083 if (dst_entries_init(&ipv4_dst_blackhole_ops
) < 0)
3084 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
/* Effectively remove the dst-entry pressure limits. */
3086 ipv4_dst_ops
.gc_thresh
= ~0;
3087 ip_rt_max_size
= INT_MAX
;
/* Missing proc files are logged, not fatal. */
3092 if (ip_rt_proc_init())
3093 pr_err("Unable to create route proc files\n");
/* RTM_GETROUTE is served by inet_rtm_getroute() without holding the
 * rtnl lock (RTNL_FLAG_DOIT_UNLOCKED).
 */
3098 rtnl_register(PF_INET
, RTM_GETROUTE
, inet_rtm_getroute
, NULL
,
3099 RTNL_FLAG_DOIT_UNLOCKED
);
3101 #ifdef CONFIG_SYSCTL
3102 register_pernet_subsys(&sysctl_route_ops
);
3104 register_pernet_subsys(&rt_genid_ops
);
3105 register_pernet_subsys(&ipv4_inetpeer_ops
);
3109 #ifdef CONFIG_SYSCTL
3111 * We really need to sanitize the damn ipv4 init order, then all
3112 * this nonsense will go away.
3114 void __init
ip_static_sysctl_init(void)
3116 register_net_sysctl(&init_net
, "net/ipv4/route", ipv4_route_table
);