/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;
static int redirect_genid;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

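/*
 * Copy-on-write of dst metrics: the first write moves the metrics into the
 * route's inet_peer, so they outlive the cached route itself.  If a
 * concurrent writer wins the cmpxchg() on dst->_metrics we return the
 * winner's array instead (or NULL if that array is read-only); on success
 * we can drop rt->fi, since the metrics no longer borrow from the fib_info.
 */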
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

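/*
 * Map from the four RFC 1349 TOS bits to a traffic-control priority band;
 * the table is indexed by TOS >> 1 (see rt_tos2priority() in <net/route.h>).
 * The odd slots are the values whose low bit (historically "minimise
 * monetary cost", reused by ECN) is set - hence ECN_OR_COST.
 */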
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};

/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}
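
/*
 * Because the namespace generation id is one of the jhash inputs, bumping
 * rt_genid reshuffles every cached route to a new bucket; stale entries
 * left behind fail the rt_is_expired() check and are reaped lazily.
 */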
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len, HHUptod;

		rcu_read_lock();
		n = dst_get_neighbour_noref(&r->dst);
		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
		rcu_read_unlock();

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->rt_key_tos,
			-1,
			HHUptod,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	  = THIS_MODULE,
	.open	  = rt_acct_proc_open,
	.read	  = seq_read,
	.llseek	  = seq_lseek,
	.release  = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

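/*
 * Eviction priority used by rt_intern_hash(): among unreferenced entries in
 * an overlong chain, the one with the numerically lowest score is dropped
 * first, so stale broadcast/multicast input routes go before warm unicast
 * output routes.
 */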
/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}

static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

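/*
 * The two key comparisons below are branch-free: XOR each pair of fields
 * and OR the results together; the whole expression is zero iff every
 * field matches.
 */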
static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

/*
 * Perform a full scan of the hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to be rescheduled if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}

/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This gives an estimation of rt_chain_length_max:
 *   rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

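/*
 * Worked example: with FRACT_BITS == 3, ONE == 8, so an average chain
 * length of 2.5 entries is accumulated as 2.5 * 8 == 20; the final
 * ">> FRACT_BITS" in rt_check_expire() converts back to whole entries.
 */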
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif)
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}

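/*
 * rt_check_expire() walks a slice of the hash table sized so that, at the
 * rate rt_worker_func() reschedules it, the whole table is covered roughly
 * once per ip_rt_gc_timeout:
 *   goal = elapsed_jiffies * 2^rt_hash_log / ip_rt_gc_timeout buckets.
 */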
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    ip_rt_gc_elasticity,
					    (avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}

/*
 * rt_worker_func() is run in process context.
 * We call rt_check_expire() to scan part of the hash table
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	redirect_genid++;
	inetpeer_invalidate_tree(AF_INET);
}
943
29e75252
ED
944/*
945 * delay < 0 : invalidate cache (fast : entries will be deleted later)
946 * delay >= 0 : invalidate & flush cache (can be long)
947 */
76e6ebfb 948void rt_cache_flush(struct net *net, int delay)
1da177e4 949{
86c657f6 950 rt_cache_invalidate(net);
29e75252 951 if (delay >= 0)
6561a3b1 952 rt_do_flush(net, !in_softirq());
1da177e4
LT
953}
954
a5ee1551 955/* Flush previous cache invalidated entries from the cache */
6561a3b1 956void rt_cache_flush_batch(struct net *net)
a5ee1551 957{
6561a3b1 958 rt_do_flush(net, !in_softirq());
a5ee1551
EB
959}
960
1080d709
NH
961static void rt_emergency_hash_rebuild(struct net *net)
962{
3ee94372 963 if (net_ratelimit())
1080d709 964 printk(KERN_WARNING "Route hash chain too long!\n");
3ee94372 965 rt_cache_invalidate(net);
1080d709
NH
966}
967
/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to newly generated ones.

   Current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expire is large enough to keep enough warm entries,
   and when load increases it reduces to limit cache size.
 */

static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - expire reduced to zero. Otherwise, expire is halved.
		   - table is not full.
		   - we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		     We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}

/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	static const __be32 inaddr_any = 0;
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	struct neighbour *n;

	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		pkey = &inaddr_any;

	n = __ipv4_neigh_lookup(&arp_tbl, dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}

static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}

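/*
 * Insert a freshly created rtable into the cache.  If an equivalent entry
 * already exists it is moved to the front of its chain and returned in
 * place of the new one; otherwise the new entry is linked in, chains that
 * grow past the elasticity limits are pruned, and repeated pathological
 * growth triggers an emergency hash rebuild.
 */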
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}

static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}

void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chances to
 * select ID being unique in a reasonable period of time.
 * But broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt && !(rt->dst.flags & DST_NOPEER)) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* If peer is attached to destination, it is never detached,
		   so that we need not to grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else if (!rt)
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

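/*
 * Unlink rt from its hash chain; expired entries encountered during the
 * walk are freed as a side effect.
 */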
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

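/*
 * Switch a cached route over to the gateway learned from an ICMP redirect:
 * bind a neighbour entry for the new gateway, reverting to the original
 * gateway if neighbour creation fails, and kick off address resolution if
 * the new neighbour is not yet valid.
 */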
static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n)) {
		rt->rt_gateway = orig_gw;
		return;
	}
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!(n->nud_state & NUD_VALID)) {
		neigh_event_send(n, NULL);
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
}

/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	__be32 skeys[2] = { saddr, 0 };
	int    ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				if (!rt->peer)
					rt_bind_peer(rt, rt->rt_dst, 1);

				peer = rt->peer;
				if (peer) {
					if (peer->redirect_learned.a4 != new_gw ||
					    peer->redirect_genid != redirect_genid) {
						peer->redirect_learned.a4 = new_gw;
						peer->redirect_genid = redirect_genid;
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}

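/*
 * Both helpers below use cmpxchg() so that exactly one caller "claims" a
 * pending PMTU override on the peer; the winner is the one that restores
 * the route's original MTU from peer->pmtu_orig.
 */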
static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

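/*
 * Concretely: the k-th consecutive ignored redirect is sent only after
 * ip_rt_redirect_load << k jiffies have passed since the previous one.
 * With HZ == 1000 that is 20ms, 40ms, 80ms, ...; after
 * ip_rt_redirect_number (9) ignored redirects we go silent until
 * ip_rt_redirect_silence (~20s) has elapsed.
 */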
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
			       &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}

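/*
 * The ICMP errors below are rate limited with a classic token bucket on
 * the inet_peer: one token accrues per jiffy since rate_last, capped at
 * ip_rt_error_burst, and each error sent costs ip_rt_error_cost (HZ)
 * tokens - roughly one error per second per peer in steady state.
 */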
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

/*
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}

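/*
 * Example: a broken router that reports a next-hop MTU of 0 for a
 * 1500-byte packet makes guess_mtu(1500) return 1492, the next plateau
 * below the failed size (the RFC 1191 plateau table).
 */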
unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
	if (peer) {
		unsigned short mtu = new_mtu;

		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
			atomic_inc(&__rt_peer_genid);
		}

		inet_putpeer(peer);
	}
	return est_mtu ? : new_mtu;
}

static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}

1da177e4
LT
1753static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1754{
2c8cec5c
DM
1755 struct rtable *rt = (struct rtable *) dst;
1756 struct inet_peer *peer;
1757
1758 dst_confirm(dst);
1759
1760 if (!rt->peer)
a48eff12 1761 rt_bind_peer(rt, rt->rt_dst, 1);
2c8cec5c
DM
1762 peer = rt->peer;
1763 if (peer) {
fe6fe792
ED
1764 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1765
2c8cec5c 1766 if (mtu < ip_rt_min_pmtu)
1da177e4 1767 mtu = ip_rt_min_pmtu;
fe6fe792 1768 if (!pmtu_expires || mtu < peer->pmtu_learned) {
46af3180
HS
1769
1770 pmtu_expires = jiffies + ip_rt_mtu_expires;
1771 if (!pmtu_expires)
1772 pmtu_expires = 1UL;
1773
2c8cec5c 1774 peer->pmtu_learned = mtu;
46af3180 1775 peer->pmtu_expires = pmtu_expires;
2c8cec5c
DM
1776
1777 atomic_inc(&__rt_peer_genid);
1778 rt->rt_peer_genid = rt_peer_genid();
1da177e4 1779 }
46af3180 1780 check_peer_pmtu(dst, peer);
1da177e4
LT
1781 }
1782}
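
/*
 * Illustrative sketch, hypothetical helper: both PMTU learners above
 * reserve pmtu_expires == 0 as the "no pending expiry" sentinel, so a
 * deadline that happens to compute to 0 as jiffies wraps is nudged to
 * 1UL; the later comparison must use time_before() so wrapped values
 * still order correctly.
 */
static unsigned long __maybe_unused example_expiry_deadline(unsigned long timeout)
{
	unsigned long deadline = jiffies + timeout;

	if (!deadline)		/* 0 means "disarmed", keep it nonzero */
		deadline = 1UL;
	return deadline;
}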

static void ipv4_validate_peer(struct rtable *rt)
{
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer;

		if (!rt->peer)
			rt_bind_peer(rt, rt->rt_dst, 0);

		peer = rt->peer;
		if (peer) {
			check_peer_pmtu(&rt->dst, peer);

			if (peer->redirect_genid != redirect_genid)
				peer->redirect_learned.a4 = 0;
			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway)
				check_peer_redir(&rt->dst, peer);
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
		return NULL;
	ipv4_validate_peer(rt);
	return dst;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by the IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

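/*
 * Worked example, illustrative: set_class_tag() treats tclassid as two
 * independent 16-bit realm slots, each written at most once. Tagging an
 * empty tclassid first with 0x00010002 and then with 0x00030004 leaves
 * 0x00010002, because the first tag already claimed both halves.
 */
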
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}

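/*
 * Worked example, illustrative: with no advmss metric set, a standard
 * 1500-byte Ethernet device advertises 1500 - 40 = 1460, leaving room
 * for 20 bytes of IPv4 header plus 20 bytes of TCP header; the result
 * is clamped to 65535 - 40 so the 16-bit total length field cannot
 * overflow.
 */
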
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu && rt_is_output_route(rt))
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}

static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
		create = 1;

	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
	if (peer) {
		rt->rt_peer_genid = rt_peer_genid();
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		check_peer_pmtu(&rt->dst, peer);
		if (peer->redirect_genid != redirect_genid)
			peer->redirect_learned.a4 = 0;
		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		}
	} else {
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}

static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
			 DST_HOST |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(init_net.loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst = daddr;
	rth->rt_key_src = saddr;
	rth->rt_genid = rt_genid(dev_net(dev));
	rth->rt_flags = RTCF_MULTICAST;
	rth->rt_type = RTN_MULTICAST;
	rth->rt_key_tos = tos;
	rth->rt_dst = daddr;
	rth->rt_src = saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif = dev->ifindex;
	rth->rt_oif = 0;
	rth->rt_mark = skb->mark;
	rth->rt_gateway = daddr;
	rth->rt_spec_dst = spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (our) {
		rth->dst.input = ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}

static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * RFC1812 recommendation: if the source is martian,
		 * the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);

			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input"
					 "_slow(). Please, report\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy ARP. DNAT routes are always valid.
		 *
		 * The proxy ARP feature has been extended to allow ARP
		 * replies back out the same interface, to support
		 * private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_key_dst = daddr;
	rth->rt_key_src = saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos = tos;
	rth->rt_dst = daddr;
	rth->rt_src = saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif = in_dev->dev->ifindex;
	rth->rt_oif = 0;
	rth->rt_mark = skb->mark;
	rth->rt_gateway = daddr;
	rth->rt_spec_dst = spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned int hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
		       rt_genid(dev_net(rth->dst.dev)));
	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
	if (IS_ERR(rth))
		return PTR_ERR(rth);
	return 0;
}

/*
 * NOTE. We drop all packets that have a local source
 * address, because every properly looped-back packet
 * must already have the correct destination attached by the output routine.
 *
 * This approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 * called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4 fl4;
	unsigned int flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	unsigned int hash;
	__be32 spec_dst;
	int err = -EINVAL;
	struct net *net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be
	 * detected by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 * Now we are ready to route the packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.input = ip_local_deliver;
	rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_key_dst = daddr;
	rth->rt_key_src = saddr;
	rth->rt_genid = rt_genid(net);
	rth->rt_flags = flags|RTCF_LOCAL;
	rth->rt_type = res.type;
	rth->rt_key_tos = tos;
	rth->rt_dst = daddr;
	rth->rt_src = saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif = dev->ifindex;
	rth->rt_oif = 0;
	rth->rt_mark = skb->mark;
	rth->rt_gateway = daddr;
	rth->rt_spec_dst = spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 * Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}

int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable *rth;
	unsigned int hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_route_iif ^ iif) |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic was moved from the route cache to here.
	 * The problem was that too many Ethernet cards have broken/missing
	 * hardware multicast filters :-( As a result the host on a multicasting
	 * network acquires a lot of useless route cache entries, sort of
	 * SDR messages from all the world. Now we try to get rid of them.
	 * Really, provided the software IP multicast filter is organized
	 * reasonably (at least, hashed), it does not result in a slowdown
	 * compared with route cache reject entries.
	 * Note that multicast routers are not affected, because a
	 * route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
			    ||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);

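/*
 * Illustrative sketch, hypothetical names: the cache walk above folds
 * all key comparisons into a single branch by OR-ing together XOR
 * differences; the result is zero only when every field matches. The
 * same trick stand-alone:
 */
struct example_rt_key {
	__be32	dst;
	__be32	src;
	int	iif;
	u32	tos;
};

static bool __maybe_unused example_keys_match(const struct example_rt_key *a,
					      const struct example_rt_key *b)
{
	return ((((__force u32)a->dst ^ (__force u32)b->dst) |
		 ((__force u32)a->src ^ (__force u32)b->src) |
		 ((u32)(a->iif ^ b->iif)) |
		 (a->tos ^ b->tos)) == 0);
}
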
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, __u8 orig_rtos,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
		return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If the multicast route does not exist, use the
		 * default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_key_dst = orig_daddr;
	rth->rt_key_src = orig_saddr;
	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags = flags;
	rth->rt_type = type;
	rth->rt_key_tos = orig_rtos;
	rth->rt_dst = fl4->daddr;
	rth->rt_src = fl4->saddr;
	rth->rt_route_iif = 0;
	rth->rt_iif = orig_oif ? : dev_out->ifindex;
	rth->rt_oif = orig_oif;
	rth->rt_mark = fl4->flowi4_mark;
	rth->rt_gateway = fl4->daddr;
	rth->rt_spec_dst = fl4->saddr;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl4->daddr;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl4->saddr;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	return rth;
}

/*
 * Major route resolver routine.
 * called with rcu_read_lock();
 */

static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	__u8 tos = RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	__be32 orig_daddr;
	__be32 orig_saddr;
	int orig_oif;

	res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	orig_daddr = fl4->daddr;
	orig_saddr = fl4->saddr;
	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = net->loopback_dev->ifindex;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			     RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		 * It was wrong for two reasons:
		 * 1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
		 *    is assigned to multiple interfaces.
		 * 2. Moreover, we are allowed to send packets with saddr
		 *    of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			 * and limited broadcast via the necessary interface
			 * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			 * This hack is not just for fun, it allows
			 * vic, vat and friends to work.
			 * They bind the socket to loopback, set ttl to zero
			 * and expect that it will work.
			 * From the viewpoint of the routing cache they are broken,
			 * because we are not allowed to build a multicast path
			 * with a loopback source address (look, the routing cache
			 * cannot know that ttl is zero, so the packet
			 * will not leave this host and the route is valid).
			 * Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, routing tables are wrong. Assume
			 * that the destination is on link.
			 *
			 * WHY? DW.
			 * Because we are allowed to send to an iface
			 * even if it has NO routes and NO assigned
			 * addresses. When oif is specified, routing
			 * tables are looked up with only one purpose:
			 * to catch if destination is gatewayed, rather than
			 * direct. Moreover, if MSG_DONTROUTE is set,
			 * we send a packet, ignoring both routing tables
			 * and ifaddr state. --ANK
			 *
			 * We could make it even if oif is unknown,
			 * likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;


make_route:
	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
			       tos, dev_out, flags);
	if (!IS_ERR(rth)) {
		unsigned int hash;

		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
			       rt_genid(dev_net(dev_out)));
		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
	}

out:
	rcu_read_unlock();
	return rth;
}

struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
		      (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);

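/*
 * Usage sketch, illustrative with a made-up destination: a caller
 * zeroes a struct flowi4, fills in what it knows, and lets the
 * resolver pick the source address and device; the result is a
 * struct rtable or an ERR_PTR() value.
 */
static struct rtable *__maybe_unused example_output_lookup(struct net *net)
{
	struct flowi4 fl4;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = htonl(0xc0000201);	/* 192.0.2.1, documentation range */
	fl4.flowi4_oif = 0;		/* 0: let routing choose the device */

	return __ip_route_output_key(net, &fl4);
}
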
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			= AF_INET,
	.protocol		= cpu_to_be16(ETH_P_IP),
	.destroy		= ipv4_dst_destroy,
	.check			= ipv4_blackhole_dst_check,
	.mtu			= ipv4_blackhole_mtu,
	.default_advmss		= ipv4_default_advmss,
	.update_pmtu		= ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		= ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		= ipv4_neigh_lookup,
};

struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}

struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
						   flowi4_to_flowi(flp4),
						   sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);

static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	const struct inet_peer *peer = rt->peer;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family = AF_INET;
	r->rtm_dst_len = 32;
	r->rtm_src_len = 0;
	r->rtm_tos = rt->rt_key_tos;
	r->rtm_table = RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type = rt->rt_type;
	r->rtm_scope = RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->rt_key_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);

	error = rt->dst.error;
	if (peer) {
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&peer->ip_id_count) & 0xffff;
		if (peer->tcp_ts_stamp) {
			ts = peer->tcp_ts;
			tsage = get_seconds() - peer->tcp_ts_stamp;
		}
		expires = ACCESS_ONCE(peer->pmtu_expires);
		if (expires) {
			if (time_before(jiffies, expires))
				expires -= jiffies;
			else
				expires = 0;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

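/*
 * Illustrative note: the NLA_PUT_*() macros above expand to a checked
 * nla_put() that jumps to the nla_put_failure label when the skb runs
 * out of tail room, which is why rt_fill_info() needs that label. The
 * same attribute written long-hand (hypothetical helper):
 */
static int __maybe_unused example_fill_one_attr(struct sk_buff *skb, u32 oif)
{
	if (nla_put_u32(skb, RTA_OIF, oif) < 0)
		return -EMSGSIZE;	/* message buffer exhausted */
	return 0;
}
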
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	 * through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol = htons(ETH_P_IP);
		skb->dev = dev;
		skb->mark = mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}

int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}

#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}

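/*
 * Usage sketch, illustrative: an integer written to
 * /proc/sys/net/ipv4/route/flush reaches the handler above and becomes
 * the delay passed to rt_cache_flush(). A minimal userspace caller
 * (not kernel code, shown under #if 0):
 */
#if 0
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/route/flush", "w");

	if (!f)
		return 1;
	fputs("0\n", f);	/* delay 0: flush the cache now */
	return fclose(f) ? 1 : 0;
}
#endif
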
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};

static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};


#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);

int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif