/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)
static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	peer = rt_get_peer_create(rt, rt->rt_dst);
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    rcu lock held.
 */

struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};
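
#if 0
/*
 * Illustrative sketch only (not part of the original file): the reader
 * side of the scheme above walks a bucket chain under the BH-disabled
 * RCU read lock.  A real lookup must take a reference (dst_use()) on a
 * matching entry before rcu_read_unlock_bh(), as ip_route_input_common()
 * below does; the writer side instead takes the bucket spinlock and
 * publishes updates with rcu_assign_pointer().
 */
static struct rtable *example_reader(struct rt_hash_bucket *b)
{
	struct rtable *r;

	rcu_read_lock_bh();
	for (r = rcu_dereference_bh(b->chain); r;
	     r = rcu_dereference_bh(r->dst.rt_next)) {
		/* compare keys here and bump the refcount on a match */
	}
	rcu_read_unlock_bh();
	return NULL;
}
#endif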
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned int		rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}
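
/*
 * Note (illustrative restatement): the bucket index mixes both addresses,
 * the interface index and the per-namespace generation id, so bumping
 * rt_genid (see rt_cache_invalidate()) makes every cached entry miss on
 * lookup without walking the table.  Since the table size is a power of
 * two (2^rt_hash_log), the jhash result is reduced to a bucket index by a
 * single AND with rt_hash_mask.
 */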
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			   r->dst.dev ? r->dst.dev->name : "*",
			   (__force u32)r->rt_dst,
			   (__force u32)r->rt_gateway,
			   r->rt_flags, atomic_read(&r->dst.__refcnt),
			   r->dst.__use, 0, (__force u32)r->rt_src,
			   dst_metric_advmss(&r->dst) + 40,
			   dst_metric(&r->dst, RTAX_WINDOW),
			   (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
				 dst_metric(&r->dst, RTAX_RTTVAR)),
			   r->rt_key_tos,
			   -1, 0, 0, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,
		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,
		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rt_has_peer(rth) && rt_peer_ptr(rth)->pmtu_expires);
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:
	return ret;
}
/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
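
/*
 * Worked example (illustrative only): an unreferenced route last used 100
 * jiffies ago starts from score = ~100 with bits 30 and 31 cleared; bit 31
 * is then set if the entry is valuable (redirect/notify/PMTU state) and
 * bit 30 if it is an output or plain unicast route.  rt_intern_hash()
 * evicts the candidate with the *lowest* score, so old, unvaluable
 * broadcast/multicast input routes are reclaimed first.
 */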
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}
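
/*
 * Note on the comparison style above: ORing the XORs of all key fields
 * yields zero if and only if every field matches -- any mismatching field
 * contributes a non-zero term.  This trades several conditional branches
 * for straight-line integer operations on the hot lookup path.
 */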
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This to have an estimation of rt_chain_length_max
 *	rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
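
/*
 * Worked example of the fixed-point arithmetic: with FRACT_BITS = 3,
 * ONE == 8, so has_noalias() below contributes 8 per distinct entry.
 * A bucket of three entries with distinct hash inputs yields
 * length = 24 (i.e. 3.0 in eighths).  If avg = 24 and sd = 4 across the
 * sampled buckets, the new limit is (24 + 4*4) >> 3 = 5 chain entries,
 * unless ip_rt_gc_elasticity is larger.
 */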
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif)
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth) ||
			    rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}

			/* We only count entries on a chain with equal
			 * hash inputs once so that entries for
			 * different QOS levels, and other non-hash
			 * input attributes don't unfairly skew the
			 * length computation
			 */
			tmo >>= 1;
			rthp = &rth->dst.rt_next;
			length += has_noalias(rt_hash_table[i].chain, rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					ip_rt_gc_elasticity,
					(avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
/*
 * rt_worker_func() is run in process context.
 * we call rt_check_expire() to scan part of the hash table
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
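/*
 * Arithmetic behind the comment above: each invalidation adds a random
 * value in [1, 256] to rt_genid, so after N flushes the counter has
 * advanced by between N and 256*N.  A 32-bit genid therefore needs at
 * least 2^32 / 256 = 2^24 invalidations before it can possibly wrap onto
 * a recently used value.
 */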
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	inetpeer_invalidate_family(AF_INET);
}

/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	net_warn_ratelimited("Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   Current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expire is large enough to keep enough warm entries,
   and when load increases it reduces to limit cache size.
 */
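
/*
 * Sketch of the feedback loop (illustrative restatement of the code
 * below): every pass that misses its goal halves "expire", making
 * eviction more aggressive; once the goal is met, the work_done path
 * grows "expire" again by ip_rt_gc_min_interval, capped at
 * ip_rt_gc_timeout.  An idle system thus converges to the five-minute
 * RT_GC_TIMEOUT, a loaded one to small values.
 */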
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - if expire reduced to zero. Otherwise, expire is halved.
		   - if table is not full.
		   - if we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		     We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	net_warn_ratelimited("dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:
	return 0;
}
/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}

static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);

	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}
static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it.
		 *
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				net_warn_ratelimited("Neighbour table failure & not caching routes\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			net_warn_ratelimited("Neighbour table overflow\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}

void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer_base *base;
	struct inet_peer *peer;

	base = inetpeer_base_ptr(rt->_peer);
	if (!base)
		return;

	peer = inet_getpeer_v4(base, daddr, create);
	if (peer) {
		if (!rt_set_peer(rt, peer))
			inet_putpeer(peer);
		else
			rt->rt_peer_genid = rt_peer_genid();
	}
}
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chance to
 * select an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt && !(rt->dst.flags & DST_NOPEER)) {
		struct inet_peer *peer = rt_get_peer_create(rt, rt->rt_dst);

		/* If peer is attached to destination, it is never detached,
		   so that we need not to grab a lock to dereference it.
		 */
		if (peer) {
			iph->id = htons(inet_getid(peer, more));
			return;
		}
	} else if (!rt)
		pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
static void rt_del(unsigned int hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n)) {
		rt->rt_gateway = orig_gw;
		return;
	}
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!(n->nud_state & NUD_VALID)) {
		neigh_event_send(n, NULL);
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
}
/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	__be32 skeys[2] = { saddr, 0 };
	int    ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				peer = rt_get_peer_create(rt, rt->rt_dst);
				if (peer) {
					if (peer->redirect_learned.a4 != new_gw) {
						peer->redirect_learned.a4 = new_gw;
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
#endif
	;
}
static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt_has_peer(rt)) {
			struct inet_peer *peer = rt_peer_ptr(rt);
			if (peer_pmtu_expired(peer))
				dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
		}
	}
	return ret;
}
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
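
/*
 * Worked example, assuming HZ = 1000: ip_rt_redirect_load is HZ/50 = 20
 * jiffies, so successive redirects are spaced 20 << 1, 20 << 2, ...
 * jiffies apart until ip_rt_redirect_number (9) have been sent.
 * ip_rt_redirect_silence is (HZ/50) << 10 = 20480 jiffies (~20s); if no
 * redirect was needed for that long, rate_tokens is reset and the cycle
 * restarts.
 */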
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	peer = rt_get_peer_create(rt, rt->rt_dst);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, rt->rt_iif,
					     &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
static int ip_error(struct sk_buff *skb)
{
	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = rt_get_peer_create(rt, rt->rt_dst);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
*dst
, struct inet_peer
*peer
)
1661 unsigned long expires
= ACCESS_ONCE(peer
->pmtu_expires
);
1665 if (time_before(jiffies
, expires
)) {
1666 u32 orig_dst_mtu
= dst_mtu(dst
);
1667 if (peer
->pmtu_learned
< orig_dst_mtu
) {
1668 if (!peer
->pmtu_orig
)
1669 peer
->pmtu_orig
= dst_metric_raw(dst
, RTAX_MTU
);
1670 dst_metric_set(dst
, RTAX_MTU
, peer
->pmtu_learned
);
1672 } else if (cmpxchg(&peer
->pmtu_expires
, expires
, 0) == expires
)
1673 dst_metric_set(dst
, RTAX_MTU
, peer
->pmtu_orig
);
1676 static void ip_rt_update_pmtu(struct dst_entry
*dst
, u32 mtu
)
1678 struct rtable
*rt
= (struct rtable
*) dst
;
1679 struct inet_peer
*peer
;
1683 peer
= rt_get_peer_create(rt
, rt
->rt_dst
);
1685 unsigned long pmtu_expires
= ACCESS_ONCE(peer
->pmtu_expires
);
1687 if (mtu
< ip_rt_min_pmtu
)
1688 mtu
= ip_rt_min_pmtu
;
1689 if (!pmtu_expires
|| mtu
< peer
->pmtu_learned
) {
1691 pmtu_expires
= jiffies
+ ip_rt_mtu_expires
;
1695 peer
->pmtu_learned
= mtu
;
1696 peer
->pmtu_expires
= pmtu_expires
;
1698 atomic_inc(&__rt_peer_genid
);
1699 rt
->rt_peer_genid
= rt_peer_genid();
1701 check_peer_pmtu(dst
, peer
);
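
/*
 * Example: a bogus ICMP "fragmentation needed" advertising MTU 400 is
 * clamped to ip_rt_min_pmtu = 512 + 20 + 20 = 552 bytes, and the learned
 * value expires after ip_rt_mtu_expires (10 minutes) unless refreshed.
 */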
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
			   protocol, flow_flags | FLOWI_FLAG_PRECOW_METRICS,
			   iph->daddr, iph->saddr, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		ip_rt_update_pmtu(&rt->dst, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct inet_sock *inet = inet_sk(sk);

	return ipv4_update_pmtu(skb, sock_net(sk), mtu,
				sk->sk_bound_dev_if, sk->sk_mark,
				inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
				inet_sk_flowi_flags(sk));
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
static void ipv4_validate_peer(struct rtable *rt)
{
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer = rt_get_peer(rt, rt->rt_dst);

		if (peer) {
			check_peer_pmtu(&rt->dst, peer);

			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway)
				check_peer_redir(&rt->dst, peer);
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
		return NULL;
	ipv4_validate_peer(rt);
	return dst;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (rt_has_peer(rt)) {
		struct inet_peer *peer = rt_peer_ptr(rt);
		inet_putpeer(peer);
	}
}


static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt && rt_has_peer(rt)) {
		struct inet_peer *peer = rt_peer_ptr(rt);
		if (peer_pmtu_cleaned(peer))
			dst_metric_set(&rt->dst, RTAX_MTU, peer->pmtu_orig);
	}
}

static int ip_rt_bug(struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}
/*
   We do not cache the source address of outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}
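
/*
 * Example: with no ADVMSS metric set, a device with a 1500 byte MTU
 * advertises an MSS of 1500 - 40 = 1460 (MTU minus 20 byte IP and 20 byte
 * TCP headers), floored at ip_rt_min_advmss and capped at 65535 - 40.
 */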
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu && rt_is_output_route(rt))
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {

		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}

static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer_base *base;
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
		create = 1;

	base = inetpeer_base_ptr(rt->_peer);
	if (!base)
		return;

	peer = inet_getpeer_v4(base, rt->rt_dst, create);
	if (peer) {
		__rt_set_peer(rt, peer);
		rt->rt_peer_genid = rt_peer_genid();
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		check_peer_pmtu(&rt->dst, peer);

		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		}
	} else {
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}
static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
			 DST_HOST |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}
/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(saddr))
			goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_peer_genid = 0;
	rt_init_peer(rth, dev_net(dev)->ipv4.peers);
	rth->fi = NULL;
	if (our) {
		rth->dst.input = ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif 	= in_dev->dev->ifindex;
	rth->rt_oif 	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_peer_genid = 0;
	rt_init_peer(rth, &res->table->tb_peers);
	rth->fi = NULL;

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned int hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
		       rt_genid(dev_net(rth->dst.dev)));
	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
	if (IS_ERR(rth))
		return PTR_ERR(rth);
	return 0;
}
/*
 *	NOTE. We drop all the packets that have local source
 *	addresses, because every properly looped back packet
 *	must have the correct destination already attached by the output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned int	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	unsigned int	hash;
	int		err = -EINVAL;
	struct net    *net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
		if (ipv4_is_loopback(daddr))
			goto martian_destination;

		if (ipv4_is_loopback(saddr))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0)
		goto no_route;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.input = ip_local_deliver;
	rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(net);
	rth->rt_flags 	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_peer_genid = 0;
	rt_init_peer(rth, net->ipv4.peers);
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable *rth;
	unsigned int hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_route_iif ^ iif) |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
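	/* Commentary (added, not in the original): the cache key compare
	 * above folds four equality tests into a single branch.  For any
	 * two words a and b, (a ^ b) is zero iff a == b, so OR-ing the
	 * XORs of all key fields yields zero only when every field
	 * matches:
	 *
	 *	((a ^ b) | (c ^ d) | ...) == 0  <=>  a == b && c == d && ...
	 *
	 * One compare-and-branch instead of four.
	 */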
	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a
	   multicasting network acquires a lot of useless route cache
	   entries, sort of SDR messages from all over the world. Now we try
	   to get rid of them. Really, provided the software IP multicast
	   filter is organized reasonably (at least, hashed), it does not
	   result in a slowdown compared with route cache reject entries.
	   Note that multicast routers are not affected, because the
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
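/* Note (added, not in the original): when noref is true the dst is attached
 * with skb_dst_set_noref(), i.e. without taking a reference.  That is only
 * safe because the receive path calling this runs entirely under
 * rcu_read_lock(); a dst attached this way must not outlive that read-side
 * section.  Callers that keep the route longer pass noref == false and get
 * a real reference via dst_use()/skb_dst_set().
 */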
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, __u8 orig_rtos,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist, use the default one,
		 * but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_key_dst	= orig_daddr;
	rth->rt_key_src	= orig_saddr;
	rth->rt_genid	= rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_key_tos	= orig_rtos;
	rth->rt_dst	= fl4->daddr;
	rth->rt_src	= fl4->saddr;
	rth->rt_route_iif = 0;
	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
	rth->rt_oif	= orig_oif;
	rth->rt_mark	= fl4->flowi4_mark;
	rth->rt_gateway	= fl4->daddr;
	rth->rt_peer_genid = 0;
	rt_init_peer(rth, (res->table ?
			   &res->table->tb_peers :
			   dev_net(dev_out)->ipv4.peers));
	rth->fi = NULL;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL)
		rth->dst.input = ip_local_deliver;
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
		rth->dst.flags |= DST_NOCACHE;

	return rth;
}
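/* Note (added, not in the original): DST_NOCACHE marks the dst so that it is
 * not kept alive by the routing cache; once the last reference is dropped,
 * the entry is freed rather than retained for later cache hits.
 */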
/*
 * Major route resolver routine.
 * Takes and releases rcu_read_lock() internally.
 */
static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	__u8 tos = RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	__be32 orig_daddr;
	__be32 orig_saddr;
	int orig_oif;

	res.fi		= NULL;
	res.table	= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	orig_daddr = fl4->daddr;
	orig_saddr = fl4->saddr;
	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = net->loopback_dev->ifindex;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface if
		      saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with the saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind the socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source addr (look,
			   the routing cache cannot know that ttl is zero, so
			   the packet will not leave this host and the route
			   is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		res.table = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, routing tables are wrong. Assume
			   that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if the destination is gatewayed, rather
			   than direct. Moreover, if MSG_DONTROUTE is set,
			   we send the packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;


make_route:
	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
			       tos, dev_out, flags);
	if (!IS_ERR(rth)) {
		unsigned int hash;

		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
			       rt_genid(dev_net(dev_out)));
		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
	}

out:
	rcu_read_unlock();
	return rth;
}
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
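/* Usage sketch (added commentary, not in the original; error handling
 * trimmed): a typical in-kernel caller fills a struct flowi4 key and lets
 * the resolver pick the output device and source address:
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.flowi4_tos	= RT_TOS(tos),
 *		.flowi4_oif	= 0,	// let the FIB choose the device
 *	};
 *	struct rtable *rt = __ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	// ... use rt; fl4.saddr now holds the selected source ...
 *	ip_rt_put(rt);
 */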
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt_transfer_peer(rt, ort);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
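/* Note (added, not in the original): this blackhole dst is what xfrm_lookup()
 * hands back to non-blocking callers while an IPsec SA is still being
 * negotiated: it copies the flow keys of the original route but discards
 * every packet (dst_discard), so the socket can retry later instead of
 * stalling.
 */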
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
						   flowi4_to_flowi(flp4),
						   sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->rt_key_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
		goto nla_put_failure;
	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (!rt_is_input_route(rt) &&
	    rt->rt_src != rt->rt_key_src) {
		if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
			goto nla_put_failure;
	}
	if (rt->rt_dst != rt->rt_gateway &&
	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark &&
	    nla_put_be32(skb, RTA_MARK, rt->rt_mark))
		goto nla_put_failure;

	error = rt->dst.error;
	if (rt_has_peer(rt)) {
		const struct inet_peer *peer = rt_peer_ptr(rt);
		inet_peer_refcheck(peer);
		id = atomic_read(&peer->ip_id_count) & 0xffff;
		if (peer->tcp_ts_stamp) {
			ts = peer->tcp_ts;
			tsage = get_seconds() - peer->tcp_ts_stamp;
		}
		expires = ACCESS_ONCE(peer->pmtu_expires);
		if (expires) {
			if (time_before(jiffies, expires))
				expires -= jiffies;
			else
				expires = 0;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
				goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
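/* Usage sketch (added commentary, not in the original): this handler is what
 * userspace reaches via an RTM_GETROUTE netlink request, e.g.
 *
 *	$ ip route get 192.0.2.1
 *	$ ip route get 192.0.2.1 from 198.51.100.7 iif eth0
 *
 * The first form exercises the output path (ip_route_output_key), the
 * second the input path (ip_route_input), and the resolved route is echoed
 * back through rt_fill_info().
 */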
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
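/* Usage sketch (added commentary, not in the original): the route cache can
 * be flushed from userspace by writing an integer flush delay (in seconds)
 * to the per-netns file registered below, e.g. an immediate flush:
 *
 *	# echo 0 > /proc/sys/net/ipv4/route/flush
 */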
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
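/* Note (added, not in the original): the table above is registered once for
 * the init netns under /proc/sys/net/ipv4/route/ (see
 * ip_static_sysctl_init() at the bottom of this file), e.g.
 * /proc/sys/net/ipv4/route/gc_thresh; only the "flush" entry below is
 * duplicated per network namespace.
 */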
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}
static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
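/* Note (added, not in the original): rt_genid is a per-netns generation
 * counter stamped into every cached route (rt->rt_genid).  Flushing the
 * cache just bumps the counter; stale entries are then skipped by
 * rt_is_expired() and reclaimed lazily, which is much cheaper than walking
 * the whole hash table synchronously.
 */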
static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	ssize_t ret;

	if (!str)
		return 0;

	ret = kstrtoul(str, 0, &rhash_entries);
	if (ret)
		return 0;

	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
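/* Note (added, not in the original): the route cache hash size is normally
 * derived from available memory, but it can be pinned at boot with the
 * kernel command line parameter registered above, e.g.
 *
 *	rhash_entries=2048
 */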
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					0,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif