/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp4) \
	((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_default_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.default_mtu =		ipv4_default_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};


/*
 * Route cache.
 */

/* The locking scheme is rather straight forward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
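
/*
 * Editorial note (added in this revision, not from the original source):
 * rt_hash_lock_addr() masks the bucket index with RT_HASH_LOCK_SZ - 1, so
 * many hash buckets share each spinlock. For example, with lockdep off and
 * NR_CPUS = 8, RT_HASH_LOCK_SZ is 1024 and bucket 5000 maps to lock
 * 5000 & 1023 == 904.
 */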

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}
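
/*
 * Editorial note (added in this revision): because the per-namespace genid
 * is folded into the jhash above, bumping rt_genid in rt_cache_invalidate()
 * changes every lookup key at once, invalidating the whole cache without
 * walking a single chain.
 */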

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			   r->dst.dev ? r->dst.dev->name : "*",
			   (__force u32)r->rt_dst,
			   (__force u32)r->rt_gateway,
			   r->rt_flags, atomic_read(&r->dst.__refcnt),
			   r->dst.__use, 0, (__force u32)r->rt_src,
			   dst_metric_advmss(&r->dst) + 40,
			   dst_metric(&r->dst, RTAX_WINDOW),
			   (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
				 dst_metric(&r->dst, RTAX_RTTVAR)),
			   r->rt_key_tos,
			   r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
			   r->dst.hh ? (r->dst.hh->hh_output ==
					dev_queue_xmit) : 0,
			   r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	  = THIS_MODULE,
	.open	  = rt_acct_proc_open,
	.read	  = seq_read,
	.llseek	  = seq_lseek,
	.release  = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
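
/*
 * Editorial worked example (added in this revision): an entry used this
 * jiffy (age 0) starts from ~0 & ~(3<<30) == 0x3fffffff; if it is also
 * valuable and an output route, bits 31 and 30 are set as well, giving
 * 0xffffffff. rt_intern_hash() evicts the candidate with the *lowest*
 * score, so old, unreferenced input broadcast/multicast entries go first.
 */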
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_iif ^ rt2->rt_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_oif ^ rt2->rt_oif) |
		(rt1->rt_iif ^ rt2->rt_iif)) == 0;
}
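
/*
 * Editorial note (added in this revision): both helpers above OR together
 * the XORs of the key fields and compare the result with zero. The OR is
 * zero iff every pair of fields matched, so a whole key is compared
 * without one conditional branch per field.
 */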

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to be rescheduled if necessary
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_dereference_raw(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}

/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This to have an estimation of rt_chain_length_max
 *   rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif)
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}

/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}

/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}

/*
   Short description of GC goals.

   We want to build algorithm, which will keep routing cache
   at some equilibrium point, when number of aged off entries
   is kept approximately equal to newly generated ones.

   Current expiration strength is variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expires is large enough to keep enough of warm entries,
   and when load increases it reduces to limit cache size.
 */

static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
				    !rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - if expire reduced to zero. Otherwise, expire is halved.
		   - if table is not full.
		   - if we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		     We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}

/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}
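
/*
 * Editorial worked example (added in this revision): with FRACT_BITS = 3,
 * ONE == 8 stands for 1.0 in fixed point. has_noalias() contributes ONE
 * for each entry whose hash inputs are unique within the chain, so
 * "length >> FRACT_BITS" in slow_chain_length() converts the accumulated
 * fixed-point value back to an integer count of distinct entries.
 */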

static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting for a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = arp_bind_neighbour(&rt->dst);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = arp_bind_neighbour(&rt->dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}

static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}

void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chances to
 * select ID being unique in a reasonable period of time.
 * But broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* If peer is attached to destination, it is never detached,
		   so that we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	peer = inet_getpeer_v4(daddr, 1);
	if (peer) {
		peer->redirect_learned.a4 = new_gw;

		inet_putpeer(peer);

		atomic_inc(&__rt_peer_genid);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}

static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}
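
/*
 * Editorial note (added in this revision): the cmpxchg() in the two
 * helpers above zeroes pmtu_expires atomically, so when several CPUs race
 * to expire the same learned PMTU, exactly one caller sees "true" and
 * restores pmtu_orig; the others back off.
 */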

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
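
/*
 * Editorial worked example (added in this revision), using the defaults
 * declared at the top of this file with HZ = 1000: ip_rt_redirect_load is
 * HZ/50 = 20 jiffies, so the k-th redirect requires a gap of 20 << k
 * jiffies (20 ms, 40 ms, ..., ~10 s for k = 9). After
 * ip_rt_redirect_number = 9 redirects we go silent, and
 * ip_rt_redirect_silence = 20 << 10 = 20480 jiffies (~20 s) of quiet
 * resets rate_tokens to zero.
 */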

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
			       &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
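
/*
 * Editorial note (added in this revision): the peer block above is a
 * token bucket measured in jiffies. Tokens accrue with elapsed time, are
 * capped at ip_rt_error_burst (5 * HZ by default), and each ICMP error
 * spends ip_rt_error_cost (HZ) tokens, so a peer sustains roughly one
 * ICMP_DEST_UNREACH per second with a burst of five.
 */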

/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
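
/*
 * Editorial worked example (added in this revision): a broken ICMP
 * frag-needed message quoting old_mtu = 1500 walks the plateau table
 * until 1500 > 1492 and returns 1492, the next lower plateau; anything
 * at or below 128 falls through to the IPv4 minimum of 68.
 */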

unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
	if (peer) {
		unsigned short mtu = new_mtu;

		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
		}

		inet_putpeer(peer);

		atomic_inc(&__rt_peer_genid);
	}
	return est_mtu ? : new_mtu;
}

static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}

static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;

	dst_confirm(&rt->dst);

	neigh_release(rt->dst.neighbour);
	rt->dst.neighbour = NULL;

	rt->rt_gateway = peer->redirect_learned.a4;
	if (arp_bind_neighbour(&rt->dst) ||
	    !(rt->dst.neighbour->nud_state & NUD_VALID)) {
		if (rt->dst.neighbour)
			neigh_event_send(rt->dst.neighbour, NULL);
		rt->rt_gateway = orig_gw;
		return -EAGAIN;
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
					rt->dst.neighbour);
	}
	return 0;
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
		return NULL;
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer;

		if (!rt->peer)
			rt_bind_peer(rt, rt->rt_dst, 0);

		peer = rt->peer;
		if (peer) {
			check_peer_pmtu(dst, peer);

			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway) {
				if (check_peer_redir(dst, peer))
					return NULL;
			}
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
	return dst;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}
}


static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
   We do not cache source address of outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so that it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = iph->tos;
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}
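
/*
 * Editorial worked example (added in this revision): with no ADVMSS
 * metric configured, a standard 1500-byte Ethernet MTU advertises an MSS
 * of 1500 - 40 = 1460 (room for a 20-byte IPv4 header plus a 20-byte TCP
 * header), floored at ip_rt_min_advmss and capped at 65535 - 40.
 */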

static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		const struct rtable *rt = (const struct rtable *) dst;

		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}

static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
		create = 1;

	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
	if (peer) {
		rt->rt_peer_genid = rt_peer_genid();
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		check_peer_pmtu(&rt->dst, peer);
		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		}
	} else {
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}

static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
			 DST_HOST |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(init_net.loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst	= daddr;
LT
1845
1846 /* Primary sanity checks. */
1847
1848 if (in_dev == NULL)
1849 return -EINVAL;
1850
1e637c74 1851 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
f97c1e0c 1852 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1da177e4
LT
1853 goto e_inval;
1854
f97c1e0c
JP
1855 if (ipv4_is_zeronet(saddr)) {
1856 if (!ipv4_is_local_multicast(daddr))
1da177e4
LT
1857 goto e_inval;
1858 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
b5f7e755 1859 } else {
5c04c819
MS
1860 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1861 &itag);
b5f7e755
ED
1862 if (err < 0)
1863 goto e_err;
1864 }
5c1e6aa3
DM
1865 rth = rt_dst_alloc(init_net.loopback_dev,
1866 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1da177e4
LT
1867 if (!rth)
1868 goto e_nobufs;
1869
cf911662
DM
1870#ifdef CONFIG_IP_ROUTE_CLASSID
1871 rth->dst.tclassid = itag;
1872#endif
d8d1f30b 1873 rth->dst.output = ip_rt_bug;
1da177e4 1874
5e2b61f7 1875 rth->rt_key_dst = daddr;
5e2b61f7 1876 rth->rt_key_src = saddr;
cf911662
DM
1877 rth->rt_genid = rt_genid(dev_net(dev));
1878 rth->rt_flags = RTCF_MULTICAST;
1879 rth->rt_type = RTN_MULTICAST;
475949d8 1880 rth->rt_key_tos = tos;
cf911662 1881 rth->rt_dst = daddr;
1da177e4 1882 rth->rt_src = saddr;
1b86a58f 1883 rth->rt_route_iif = dev->ifindex;
5e2b61f7 1884 rth->rt_iif = dev->ifindex;
5e2b61f7 1885 rth->rt_oif = 0;
cf911662 1886 rth->rt_mark = skb->mark;
1da177e4
LT
1887 rth->rt_gateway = daddr;
1888 rth->rt_spec_dst= spec_dst;
cf911662
DM
1889 rth->rt_peer_genid = 0;
1890 rth->peer = NULL;
1891 rth->fi = NULL;
1da177e4 1892 if (our) {
d8d1f30b 1893 rth->dst.input= ip_local_deliver;
1da177e4
LT
1894 rth->rt_flags |= RTCF_LOCAL;
1895 }
1896
1897#ifdef CONFIG_IP_MROUTE
f97c1e0c 1898 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
d8d1f30b 1899 rth->dst.input = ip_mr_input;
1da177e4
LT
1900#endif
1901 RT_CACHE_STAT_INC(in_slow_mc);
1902
e84f84f2 1903 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
b23dd4fe
DM
1904 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1905 err = 0;
1906 if (IS_ERR(rth))
1907 err = PTR_ERR(rth);
1da177e4
LT
1908 return err;
1909e_nobufs:
1da177e4 1910 return -ENOBUFS;
1da177e4 1911e_inval:
96d36220 1912 return -EINVAL;
b5f7e755 1913e_err:
b5f7e755 1914 return err;
1da177e4
LT
1915}
1916
1917
1918static void ip_handle_martian_source(struct net_device *dev,
1919 struct in_device *in_dev,
1920 struct sk_buff *skb,
9e12bb22
AV
1921 __be32 daddr,
1922 __be32 saddr)
1da177e4
LT
1923{
1924 RT_CACHE_STAT_INC(in_martian_src);
1925#ifdef CONFIG_IP_ROUTE_VERBOSE
1926 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1927 /*
1928 * RFC1812 recommendation: if the source is martian,
1929 * the only hint is the MAC header.
1930 */
673d57e7
HH
1931 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1932 &daddr, &saddr, dev->name);
98e399f8 1933 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1da177e4 1934 int i;
98e399f8 1935 const unsigned char *p = skb_mac_header(skb);
1da177e4
LT
1936 printk(KERN_WARNING "ll header: ");
1937 for (i = 0; i < dev->hard_header_len; i++, p++) {
1938 printk("%02x", *p);
1939 if (i < (dev->hard_header_len - 1))
1940 printk(":");
1941 }
1942 printk("\n");
1943 }
1944 }
1945#endif
1946}
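The loop above prints the saved link-layer header as colon-separated hex bytes; a standalone user-space equivalent of the dump format:

	#include <stdio.h>

	static void print_ll_header(const unsigned char *p, int len)
	{
		int i;

		for (i = 0; i < len; i++)
			printf("%02x%s", p[i], i < len - 1 ? ":" : "\n");
	}

	int main(void)
	{
		unsigned char hdr[6] = { 0x00, 0x1a, 0x2b, 0x3c, 0x4d, 0x5e };

		print_ll_header(hdr, 6);	/* 00:1a:2b:3c:4d:5e */
		return 0;
	}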
1947
47360228 1948/* called in rcu_read_lock() section */
5969f71d 1949static int __mkroute_input(struct sk_buff *skb,
982721f3 1950 const struct fib_result *res,
5969f71d
SH
1951 struct in_device *in_dev,
1952 __be32 daddr, __be32 saddr, u32 tos,
1953 struct rtable **result)
1da177e4 1954{
1da177e4
LT
1955 struct rtable *rth;
1956 int err;
1957 struct in_device *out_dev;
47360228 1958 unsigned int flags = 0;
d9c9df8c
AV
1959 __be32 spec_dst;
1960 u32 itag;
1da177e4
LT
1961
1962 /* get a working reference to the output device */
47360228 1963 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1da177e4
LT
1964 if (out_dev == NULL) {
1965 if (net_ratelimit())
1966 printk(KERN_CRIT "Bug in ip_route_input" \
1967 "_slow(). Please, report\n");
1968 return -EINVAL;
1969 }
1970
1971
5c04c819
MS
1972 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1973 in_dev->dev, &spec_dst, &itag);
1da177e4 1974 if (err < 0) {
e905a9ed 1975 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1da177e4 1976 saddr);
e905a9ed 1977
1da177e4
LT
1978 goto cleanup;
1979 }
1980
1981 if (err)
1982 flags |= RTCF_DIRECTSRC;
1983
51b77cae 1984 if (out_dev == in_dev && err &&
1da177e4
LT
1985 (IN_DEV_SHARED_MEDIA(out_dev) ||
1986 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1987 flags |= RTCF_DOREDIRECT;
1988
1989 if (skb->protocol != htons(ETH_P_IP)) {
1990 /* Not IP (i.e. ARP). Do not create a route if it is
1991 * invalid for proxy arp. DNAT routes are always valid.
65324144
JDB
1992 *
1993 * The proxy arp feature has been extended to allow ARP
1994 * replies back on the same interface, to support
1995 * Private VLAN switch technologies. See arp.c.
1da177e4 1996 */
65324144
JDB
1997 if (out_dev == in_dev &&
1998 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1da177e4
LT
1999 err = -EINVAL;
2000 goto cleanup;
2001 }
2002 }
2003
5c1e6aa3
DM
2004 rth = rt_dst_alloc(out_dev->dev,
2005 IN_DEV_CONF_GET(in_dev, NOPOLICY),
0c4dcd58 2006 IN_DEV_CONF_GET(out_dev, NOXFRM));
1da177e4
LT
2007 if (!rth) {
2008 err = -ENOBUFS;
2009 goto cleanup;
2010 }
2011
5e2b61f7 2012 rth->rt_key_dst = daddr;
5e2b61f7 2013 rth->rt_key_src = saddr;
cf911662
DM
2014 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2015 rth->rt_flags = flags;
2016 rth->rt_type = res->type;
475949d8 2017 rth->rt_key_tos = tos;
cf911662 2018 rth->rt_dst = daddr;
1da177e4 2019 rth->rt_src = saddr;
1b86a58f 2020 rth->rt_route_iif = in_dev->dev->ifindex;
5e2b61f7 2021 rth->rt_iif = in_dev->dev->ifindex;
5e2b61f7 2022 rth->rt_oif = 0;
cf911662
DM
2023 rth->rt_mark = skb->mark;
2024 rth->rt_gateway = daddr;
1da177e4 2025 rth->rt_spec_dst= spec_dst;
cf911662
DM
2026 rth->rt_peer_genid = 0;
2027 rth->peer = NULL;
2028 rth->fi = NULL;
1da177e4 2029
d8d1f30b
CG
2030 rth->dst.input = ip_forward;
2031 rth->dst.output = ip_output;
1da177e4 2032
5e2b61f7 2033 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
1da177e4 2034
1da177e4
LT
2035 *result = rth;
2036 err = 0;
2037 cleanup:
1da177e4 2038 return err;
e905a9ed 2039}
1da177e4 2040
5969f71d
SH
2041static int ip_mkroute_input(struct sk_buff *skb,
2042 struct fib_result *res,
68a5e3dd 2043 const struct flowi4 *fl4,
5969f71d
SH
2044 struct in_device *in_dev,
2045 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 2046{
7abaa27c 2047 struct rtable* rth = NULL;
1da177e4
LT
2048 int err;
2049 unsigned hash;
2050
2051#ifdef CONFIG_IP_ROUTE_MULTIPATH
ff3fccb3 2052 if (res->fi && res->fi->fib_nhs > 1)
1b7fe593 2053 fib_select_multipath(res);
1da177e4
LT
2054#endif
2055
2056 /* create a routing cache entry */
2057 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2058 if (err)
2059 return err;
1da177e4
LT
2060
2061 /* put it into the cache */
68a5e3dd 2062 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
d8d1f30b 2063 rt_genid(dev_net(rth->dst.dev)));
68a5e3dd 2064 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
b23dd4fe
DM
2065 if (IS_ERR(rth))
2066 return PTR_ERR(rth);
2067 return 0;
1da177e4
LT
2068}
2069
1da177e4
LT
2070/*
2071 * NOTE. We drop all the packets that have local source
2072 * addresses, because every properly looped-back packet
2073 * must have the correct destination already attached by the output routine.
2074 *
2075 * Such an approach solves two big problems:
2076 * 1. Non-simplex devices are handled properly.
2077 * 2. IP spoofing attempts are filtered with a 100% guarantee.
ebc0ffae 2078 * called with rcu_read_lock()
1da177e4
LT
2079 */
2080
9e12bb22 2081static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
2082 u8 tos, struct net_device *dev)
2083{
2084 struct fib_result res;
96d36220 2085 struct in_device *in_dev = __in_dev_get_rcu(dev);
68a5e3dd 2086 struct flowi4 fl4;
1da177e4
LT
2087 unsigned flags = 0;
2088 u32 itag = 0;
2089 struct rtable * rth;
2090 unsigned hash;
9e12bb22 2091 __be32 spec_dst;
1da177e4 2092 int err = -EINVAL;
c346dca1 2093 struct net * net = dev_net(dev);
1da177e4
LT
2094
2095 /* IP on this device is disabled. */
2096
2097 if (!in_dev)
2098 goto out;
2099
2100 /* Check for the most weird martians, which cannot be detected
2101 by fib_lookup.
2102 */
2103
1e637c74 2104 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
f97c1e0c 2105 ipv4_is_loopback(saddr))
1da177e4
LT
2106 goto martian_source;
2107
27a954bd 2108 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1da177e4
LT
2109 goto brd_input;
2110
2111 /* Accept zero addresses only for limited broadcast;
2112 * I do not even know whether to fix this or not. Waiting for complaints :-)
2113 */
f97c1e0c 2114 if (ipv4_is_zeronet(saddr))
1da177e4
LT
2115 goto martian_source;
2116
27a954bd 2117 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
1da177e4
LT
2118 goto martian_destination;
2119
2120 /*
2121 * Now we are ready to route packet.
2122 */
68a5e3dd
DM
2123 fl4.flowi4_oif = 0;
2124 fl4.flowi4_iif = dev->ifindex;
2125 fl4.flowi4_mark = skb->mark;
2126 fl4.flowi4_tos = tos;
2127 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2128 fl4.daddr = daddr;
2129 fl4.saddr = saddr;
2130 err = fib_lookup(net, &fl4, &res);
ebc0ffae 2131 if (err != 0) {
1da177e4 2132 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 2133 goto e_hostunreach;
1da177e4
LT
2134 goto no_route;
2135 }
1da177e4
LT
2136
2137 RT_CACHE_STAT_INC(in_slow_tot);
2138
2139 if (res.type == RTN_BROADCAST)
2140 goto brd_input;
2141
2142 if (res.type == RTN_LOCAL) {
5c04c819 2143 err = fib_validate_source(skb, saddr, daddr, tos,
ebc0ffae 2144 net->loopback_dev->ifindex,
5c04c819 2145 dev, &spec_dst, &itag);
b5f7e755
ED
2146 if (err < 0)
2147 goto martian_source_keep_err;
2148 if (err)
1da177e4
LT
2149 flags |= RTCF_DIRECTSRC;
2150 spec_dst = daddr;
2151 goto local_input;
2152 }
2153
2154 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 2155 goto e_hostunreach;
1da177e4
LT
2156 if (res.type != RTN_UNICAST)
2157 goto martian_destination;
2158
68a5e3dd 2159 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1da177e4
LT
2160out: return err;
2161
2162brd_input:
2163 if (skb->protocol != htons(ETH_P_IP))
2164 goto e_inval;
2165
f97c1e0c 2166 if (ipv4_is_zeronet(saddr))
1da177e4
LT
2167 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2168 else {
5c04c819
MS
2169 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2170 &itag);
1da177e4 2171 if (err < 0)
b5f7e755 2172 goto martian_source_keep_err;
1da177e4
LT
2173 if (err)
2174 flags |= RTCF_DIRECTSRC;
2175 }
2176 flags |= RTCF_BROADCAST;
2177 res.type = RTN_BROADCAST;
2178 RT_CACHE_STAT_INC(in_brd);
2179
2180local_input:
5c1e6aa3
DM
2181 rth = rt_dst_alloc(net->loopback_dev,
2182 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1da177e4
LT
2183 if (!rth)
2184 goto e_nobufs;
2185
cf911662 2186 rth->dst.input= ip_local_deliver;
d8d1f30b 2187 rth->dst.output= ip_rt_bug;
cf911662
DM
2188#ifdef CONFIG_IP_ROUTE_CLASSID
2189 rth->dst.tclassid = itag;
2190#endif
1da177e4 2191
5e2b61f7 2192 rth->rt_key_dst = daddr;
5e2b61f7 2193 rth->rt_key_src = saddr;
cf911662
DM
2194 rth->rt_genid = rt_genid(net);
2195 rth->rt_flags = flags|RTCF_LOCAL;
2196 rth->rt_type = res.type;
475949d8 2197 rth->rt_key_tos = tos;
cf911662 2198 rth->rt_dst = daddr;
1da177e4 2199 rth->rt_src = saddr;
c7066f70 2200#ifdef CONFIG_IP_ROUTE_CLASSID
d8d1f30b 2201 rth->dst.tclassid = itag;
1da177e4 2202#endif
1b86a58f 2203 rth->rt_route_iif = dev->ifindex;
5e2b61f7 2204 rth->rt_iif = dev->ifindex;
cf911662
DM
2205 rth->rt_oif = 0;
2206 rth->rt_mark = skb->mark;
1da177e4
LT
2207 rth->rt_gateway = daddr;
2208 rth->rt_spec_dst= spec_dst;
cf911662
DM
2209 rth->rt_peer_genid = 0;
2210 rth->peer = NULL;
2211 rth->fi = NULL;
1da177e4 2212 if (res.type == RTN_UNREACHABLE) {
d8d1f30b
CG
2213 rth->dst.input= ip_error;
2214 rth->dst.error= -err;
1da177e4
LT
2215 rth->rt_flags &= ~RTCF_LOCAL;
2216 }
68a5e3dd
DM
2217 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2218 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
b23dd4fe
DM
2219 err = 0;
2220 if (IS_ERR(rth))
2221 err = PTR_ERR(rth);
ebc0ffae 2222 goto out;
1da177e4
LT
2223
2224no_route:
2225 RT_CACHE_STAT_INC(in_no_route);
2226 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2227 res.type = RTN_UNREACHABLE;
7f53878d
MC
2228 if (err == -ESRCH)
2229 err = -ENETUNREACH;
1da177e4
LT
2230 goto local_input;
2231
2232 /*
2233 * Do not cache martian addresses: they should be logged (RFC1812)
2234 */
2235martian_destination:
2236 RT_CACHE_STAT_INC(in_martian_dst);
2237#ifdef CONFIG_IP_ROUTE_VERBOSE
2238 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
673d57e7
HH
2239 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2240 &daddr, &saddr, dev->name);
1da177e4 2241#endif
2c2910a4
DE
2242
2243e_hostunreach:
e905a9ed 2244 err = -EHOSTUNREACH;
ebc0ffae 2245 goto out;
2c2910a4 2246
1da177e4
LT
2247e_inval:
2248 err = -EINVAL;
ebc0ffae 2249 goto out;
1da177e4
LT
2250
2251e_nobufs:
2252 err = -ENOBUFS;
ebc0ffae 2253 goto out;
1da177e4
LT
2254
2255martian_source:
b5f7e755
ED
2256 err = -EINVAL;
2257martian_source_keep_err:
1da177e4 2258 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
ebc0ffae 2259 goto out;
1da177e4
LT
2260}
2261
407eadd9
ED
2262int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2263 u8 tos, struct net_device *dev, bool noref)
1da177e4
LT
2264{
2265 struct rtable * rth;
2266 unsigned hash;
2267 int iif = dev->ifindex;
b5921910 2268 struct net *net;
96d36220 2269 int res;
1da177e4 2270
c346dca1 2271 net = dev_net(dev);
1080d709 2272
96d36220
ED
2273 rcu_read_lock();
2274
1080d709
NH
2275 if (!rt_caching(net))
2276 goto skip_cache;
2277
1da177e4 2278 tos &= IPTOS_RT_MASK;
e84f84f2 2279 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
1da177e4 2280
1da177e4 2281 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
d8d1f30b 2282 rth = rcu_dereference(rth->dst.rt_next)) {
5e2b61f7
DM
2283 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2284 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2285 (rth->rt_iif ^ iif) |
2286 rth->rt_oif |
475949d8 2287 (rth->rt_key_tos ^ tos)) == 0 &&
5e2b61f7 2288 rth->rt_mark == skb->mark &&
d8d1f30b 2289 net_eq(dev_net(rth->dst.dev), net) &&
e84f84f2 2290 !rt_is_expired(rth)) {
407eadd9 2291 if (noref) {
d8d1f30b
CG
2292 dst_use_noref(&rth->dst, jiffies);
2293 skb_dst_set_noref(skb, &rth->dst);
407eadd9 2294 } else {
d8d1f30b
CG
2295 dst_use(&rth->dst, jiffies);
2296 skb_dst_set(skb, &rth->dst);
407eadd9 2297 }
1da177e4
LT
2298 RT_CACHE_STAT_INC(in_hit);
2299 rcu_read_unlock();
1da177e4
LT
2300 return 0;
2301 }
2302 RT_CACHE_STAT_INC(in_hlist_search);
2303 }
1da177e4 2304
1080d709 2305skip_cache:
1da177e4
LT
2306 /* Multicast recognition logic is moved from the route cache to here.
2307 The problem was that too many Ethernet cards have broken/missing
2308 hardware multicast filters :-( As a result, a host on a multicast
2309 network acquires a lot of useless route cache entries, e.g. for
2310 SDR messages from all over the world. Now we try to get rid of them.
2311 Really, provided the software IP multicast filter is organized
2312 reasonably (at least, hashed), it does not result in a slowdown
2313 compared with route cache reject entries.
2314 Note that multicast routers are not affected, because a
2315 route cache entry is created eventually.
2316 */
f97c1e0c 2317 if (ipv4_is_multicast(daddr)) {
96d36220 2318 struct in_device *in_dev = __in_dev_get_rcu(dev);
1da177e4 2319
96d36220 2320 if (in_dev) {
dbdd9a52
DM
2321 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2322 ip_hdr(skb)->protocol);
1da177e4
LT
2323 if (our
2324#ifdef CONFIG_IP_MROUTE
9d4fb27d
JP
2325 ||
2326 (!ipv4_is_local_multicast(daddr) &&
2327 IN_DEV_MFORWARD(in_dev))
1da177e4 2328#endif
9d4fb27d 2329 ) {
96d36220
ED
2330 int res = ip_route_input_mc(skb, daddr, saddr,
2331 tos, dev, our);
1da177e4 2332 rcu_read_unlock();
96d36220 2333 return res;
1da177e4
LT
2334 }
2335 }
2336 rcu_read_unlock();
2337 return -EINVAL;
2338 }
96d36220
ED
2339 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2340 rcu_read_unlock();
2341 return res;
1da177e4 2342}
407eadd9 2343EXPORT_SYMBOL(ip_route_input_common);
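Callers normally reach ip_route_input_common() through thin inline wrappers; the following is reconstructed from memory of include/net/route.h in this era (commit 407eadd9 added the noref variant), so treat it as a sketch rather than verbatim kernel source:

	static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
					 u8 tos, struct net_device *devin)
	{
		return ip_route_input_common(skb, dst, src, tos, devin, false);
	}

	static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst,
					       __be32 src, u8 tos,
					       struct net_device *devin)
	{
		/* noref: the skb borrows the cached dst instead of taking a refcount */
		return ip_route_input_common(skb, dst, src, tos, devin, true);
	}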
1da177e4 2344
ebc0ffae 2345/* called with rcu_read_lock() */
982721f3 2346static struct rtable *__mkroute_output(const struct fib_result *res,
68a5e3dd 2347 const struct flowi4 *fl4,
813b3b5d
DM
2348 __be32 orig_daddr, __be32 orig_saddr,
2349 int orig_oif, struct net_device *dev_out,
5ada5527 2350 unsigned int flags)
1da177e4 2351{
982721f3 2352 struct fib_info *fi = res->fi;
813b3b5d 2353 u32 tos = RT_FL_TOS(fl4);
5ada5527 2354 struct in_device *in_dev;
982721f3 2355 u16 type = res->type;
5ada5527 2356 struct rtable *rth;
1da177e4 2357
68a5e3dd 2358 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
5ada5527 2359 return ERR_PTR(-EINVAL);
1da177e4 2360
68a5e3dd 2361 if (ipv4_is_lbcast(fl4->daddr))
982721f3 2362 type = RTN_BROADCAST;
68a5e3dd 2363 else if (ipv4_is_multicast(fl4->daddr))
982721f3 2364 type = RTN_MULTICAST;
68a5e3dd 2365 else if (ipv4_is_zeronet(fl4->daddr))
5ada5527 2366 return ERR_PTR(-EINVAL);
1da177e4
LT
2367
2368 if (dev_out->flags & IFF_LOOPBACK)
2369 flags |= RTCF_LOCAL;
2370
dd28d1a0 2371 in_dev = __in_dev_get_rcu(dev_out);
ebc0ffae 2372 if (!in_dev)
5ada5527 2373 return ERR_PTR(-EINVAL);
ebc0ffae 2374
982721f3 2375 if (type == RTN_BROADCAST) {
1da177e4 2376 flags |= RTCF_BROADCAST | RTCF_LOCAL;
982721f3
DM
2377 fi = NULL;
2378 } else if (type == RTN_MULTICAST) {
dd28d1a0 2379 flags |= RTCF_MULTICAST | RTCF_LOCAL;
813b3b5d
DM
2380 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2381 fl4->flowi4_proto))
1da177e4
LT
2382 flags &= ~RTCF_LOCAL;
2383 /* If no multicast route exists, use
dd28d1a0
ED
2384 * the default one, but do not gateway in this case.
2385 * Yes, it is a hack.
1da177e4 2386 */
982721f3
DM
2387 if (fi && res->prefixlen < 4)
2388 fi = NULL;
1da177e4
LT
2389 }
2390
5c1e6aa3
DM
2391 rth = rt_dst_alloc(dev_out,
2392 IN_DEV_CONF_GET(in_dev, NOPOLICY),
0c4dcd58 2393 IN_DEV_CONF_GET(in_dev, NOXFRM));
8391d07b 2394 if (!rth)
5ada5527 2395 return ERR_PTR(-ENOBUFS);
8391d07b 2396
cf911662
DM
2397 rth->dst.output = ip_output;
2398
813b3b5d
DM
2399 rth->rt_key_dst = orig_daddr;
2400 rth->rt_key_src = orig_saddr;
cf911662
DM
2401 rth->rt_genid = rt_genid(dev_net(dev_out));
2402 rth->rt_flags = flags;
2403 rth->rt_type = type;
475949d8 2404 rth->rt_key_tos = tos;
68a5e3dd
DM
2405 rth->rt_dst = fl4->daddr;
2406 rth->rt_src = fl4->saddr;
1b86a58f 2407 rth->rt_route_iif = 0;
813b3b5d
DM
2408 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2409 rth->rt_oif = orig_oif;
2410 rth->rt_mark = fl4->flowi4_mark;
68a5e3dd
DM
2411 rth->rt_gateway = fl4->daddr;
2412 rth->rt_spec_dst= fl4->saddr;
cf911662
DM
2413 rth->rt_peer_genid = 0;
2414 rth->peer = NULL;
2415 rth->fi = NULL;
1da177e4
LT
2416
2417 RT_CACHE_STAT_INC(out_slow_tot);
2418
2419 if (flags & RTCF_LOCAL) {
d8d1f30b 2420 rth->dst.input = ip_local_deliver;
68a5e3dd 2421 rth->rt_spec_dst = fl4->daddr;
1da177e4
LT
2422 }
2423 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
68a5e3dd 2424 rth->rt_spec_dst = fl4->saddr;
e905a9ed 2425 if (flags & RTCF_LOCAL &&
1da177e4 2426 !(dev_out->flags & IFF_LOOPBACK)) {
d8d1f30b 2427 rth->dst.output = ip_mc_output;
1da177e4
LT
2428 RT_CACHE_STAT_INC(out_slow_mc);
2429 }
2430#ifdef CONFIG_IP_MROUTE
982721f3 2431 if (type == RTN_MULTICAST) {
1da177e4 2432 if (IN_DEV_MFORWARD(in_dev) &&
813b3b5d 2433 !ipv4_is_local_multicast(fl4->daddr)) {
d8d1f30b
CG
2434 rth->dst.input = ip_mr_input;
2435 rth->dst.output = ip_mc_output;
1da177e4
LT
2436 }
2437 }
2438#endif
2439 }
2440
813b3b5d 2441 rt_set_nexthop(rth, fl4, res, fi, type, 0);
1da177e4 2442
5ada5527 2443 return rth;
1da177e4
LT
2444}
2445
1da177e4
LT
2446/*
2447 * Major route resolver routine.
0197aa38 2448 * called with rcu_read_lock();
1da177e4
LT
2449 */
2450
813b3b5d 2451static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
1da177e4 2452{
1da177e4 2453 struct net_device *dev_out = NULL;
813b3b5d
DM
2454 u32 tos = RT_FL_TOS(fl4);
2455 unsigned int flags = 0;
2456 struct fib_result res;
5ada5527 2457 struct rtable *rth;
813b3b5d
DM
2458 __be32 orig_daddr;
2459 __be32 orig_saddr;
2460 int orig_oif;
1da177e4
LT
2461
2462 res.fi = NULL;
2463#ifdef CONFIG_IP_MULTIPLE_TABLES
2464 res.r = NULL;
2465#endif
2466
813b3b5d
DM
2467 orig_daddr = fl4->daddr;
2468 orig_saddr = fl4->saddr;
2469 orig_oif = fl4->flowi4_oif;
2470
2471 fl4->flowi4_iif = net->loopback_dev->ifindex;
2472 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2473 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2474 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
44713b67 2475
010c2708 2476 rcu_read_lock();
813b3b5d 2477 if (fl4->saddr) {
b23dd4fe 2478 rth = ERR_PTR(-EINVAL);
813b3b5d
DM
2479 if (ipv4_is_multicast(fl4->saddr) ||
2480 ipv4_is_lbcast(fl4->saddr) ||
2481 ipv4_is_zeronet(fl4->saddr))
1da177e4
LT
2482 goto out;
2483
1da177e4
LT
2484 /* I removed the check for oif == dev_out->oif here.
2485 It was wrong for two reasons:
1ab35276
DL
2486 1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2487 is assigned to multiple interfaces.
1da177e4
LT
2488 2. Moreover, we are allowed to send packets with the saddr
2489 of another iface. --ANK
2490 */
2491
813b3b5d
DM
2492 if (fl4->flowi4_oif == 0 &&
2493 (ipv4_is_multicast(fl4->daddr) ||
2494 ipv4_is_lbcast(fl4->daddr))) {
a210d01a 2495 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2496 dev_out = __ip_dev_find(net, fl4->saddr, false);
a210d01a
JA
2497 if (dev_out == NULL)
2498 goto out;
2499
1da177e4
LT
2500 /* Special hack: the user can direct multicasts
2501 and limited broadcasts via the necessary interface
2502 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2503 This hack is not just for fun, it allows
2504 vic, vat and friends to work.
2505 They bind a socket to loopback, set the ttl to zero
2506 and expect that it will work.
2507 From the viewpoint of the routing cache they are broken,
2508 because we are not allowed to build a multicast path
2509 with a loopback source addr (look, the routing cache
2510 cannot know that the ttl is zero, so the packet
2511 will not leave this host and the route is valid).
2512 Luckily, this hack is a good workaround.
2513 */
2514
813b3b5d 2515 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2516 goto make_route;
2517 }
a210d01a 2518
813b3b5d 2519 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
a210d01a 2520 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2521 if (!__ip_dev_find(net, fl4->saddr, false))
a210d01a 2522 goto out;
a210d01a 2523 }
1da177e4
LT
2524 }
2525
2526
813b3b5d
DM
2527 if (fl4->flowi4_oif) {
2528 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
b23dd4fe 2529 rth = ERR_PTR(-ENODEV);
1da177e4
LT
2530 if (dev_out == NULL)
2531 goto out;
e5ed6399
HX
2532
2533 /* RACE: Check return value of inet_select_addr instead. */
fc75fc83 2534 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
b23dd4fe 2535 rth = ERR_PTR(-ENETUNREACH);
fc75fc83
ED
2536 goto out;
2537 }
813b3b5d
DM
2538 if (ipv4_is_local_multicast(fl4->daddr) ||
2539 ipv4_is_lbcast(fl4->daddr)) {
2540 if (!fl4->saddr)
2541 fl4->saddr = inet_select_addr(dev_out, 0,
2542 RT_SCOPE_LINK);
1da177e4
LT
2543 goto make_route;
2544 }
813b3b5d
DM
2545 if (fl4->saddr) {
2546 if (ipv4_is_multicast(fl4->daddr))
2547 fl4->saddr = inet_select_addr(dev_out, 0,
2548 fl4->flowi4_scope);
2549 else if (!fl4->daddr)
2550 fl4->saddr = inet_select_addr(dev_out, 0,
2551 RT_SCOPE_HOST);
1da177e4
LT
2552 }
2553 }
2554
813b3b5d
DM
2555 if (!fl4->daddr) {
2556 fl4->daddr = fl4->saddr;
2557 if (!fl4->daddr)
2558 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
b40afd0e 2559 dev_out = net->loopback_dev;
813b3b5d 2560 fl4->flowi4_oif = net->loopback_dev->ifindex;
1da177e4
LT
2561 res.type = RTN_LOCAL;
2562 flags |= RTCF_LOCAL;
2563 goto make_route;
2564 }
2565
813b3b5d 2566 if (fib_lookup(net, fl4, &res)) {
1da177e4 2567 res.fi = NULL;
813b3b5d 2568 if (fl4->flowi4_oif) {
1da177e4
LT
2569 /* Apparently, the routing tables are wrong. Assume
2570 that the destination is on-link.
2571
2572 WHY? DW.
2573 Because we are allowed to send to an iface
2574 even if it has NO routes and NO assigned
2575 addresses. When an oif is specified, the routing
2576 tables are looked up with only one purpose:
2577 to check whether the destination is gatewayed, rather than
2578 direct. Moreover, if MSG_DONTROUTE is set,
2579 we send the packet, ignoring both the routing tables
2580 and the ifaddr state. --ANK
2581
2582
2583 We could do this even if the oif is unknown
2584 (as IPv6 likely does), but we do not.
2585 */
2586
813b3b5d
DM
2587 if (fl4->saddr == 0)
2588 fl4->saddr = inet_select_addr(dev_out, 0,
2589 RT_SCOPE_LINK);
1da177e4
LT
2590 res.type = RTN_UNICAST;
2591 goto make_route;
2592 }
b23dd4fe 2593 rth = ERR_PTR(-ENETUNREACH);
1da177e4
LT
2594 goto out;
2595 }
1da177e4
LT
2596
2597 if (res.type == RTN_LOCAL) {
813b3b5d 2598 if (!fl4->saddr) {
9fc3bbb4 2599 if (res.fi->fib_prefsrc)
813b3b5d 2600 fl4->saddr = res.fi->fib_prefsrc;
9fc3bbb4 2601 else
813b3b5d 2602 fl4->saddr = fl4->daddr;
9fc3bbb4 2603 }
b40afd0e 2604 dev_out = net->loopback_dev;
813b3b5d 2605 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2606 res.fi = NULL;
2607 flags |= RTCF_LOCAL;
2608 goto make_route;
2609 }
2610
2611#ifdef CONFIG_IP_ROUTE_MULTIPATH
813b3b5d 2612 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
1b7fe593 2613 fib_select_multipath(&res);
1da177e4
LT
2614 else
2615#endif
21d8c49e
DM
2616 if (!res.prefixlen &&
2617 res.table->tb_num_default > 1 &&
813b3b5d 2618 res.type == RTN_UNICAST && !fl4->flowi4_oif)
0c838ff1 2619 fib_select_default(&res);
1da177e4 2620
813b3b5d
DM
2621 if (!fl4->saddr)
2622 fl4->saddr = FIB_RES_PREFSRC(net, res);
1da177e4 2623
1da177e4 2624 dev_out = FIB_RES_DEV(res);
813b3b5d 2625 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2626
2627
2628make_route:
813b3b5d
DM
2629 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2630 dev_out, flags);
b23dd4fe 2631 if (!IS_ERR(rth)) {
5ada5527
DM
2632 unsigned int hash;
2633
813b3b5d 2634 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
5ada5527 2635 rt_genid(dev_net(dev_out)));
813b3b5d 2636 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
5ada5527 2637 }
1da177e4 2638
010c2708
DM
2639out:
2640 rcu_read_unlock();
b23dd4fe 2641 return rth;
1da177e4
LT
2642}
2643
813b3b5d 2644struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
1da177e4 2645{
1da177e4 2646 struct rtable *rth;
010c2708 2647 unsigned int hash;
1da177e4 2648
1080d709
NH
2649 if (!rt_caching(net))
2650 goto slow_output;
2651
9d6ec938 2652 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
1da177e4
LT
2653
2654 rcu_read_lock_bh();
a898def2 2655 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
d8d1f30b 2656 rth = rcu_dereference_bh(rth->dst.rt_next)) {
9d6ec938
DM
2657 if (rth->rt_key_dst == flp4->daddr &&
2658 rth->rt_key_src == flp4->saddr &&
c7537967 2659 rt_is_output_route(rth) &&
9d6ec938
DM
2660 rth->rt_oif == flp4->flowi4_oif &&
2661 rth->rt_mark == flp4->flowi4_mark &&
475949d8 2662 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
b5921910 2663 (IPTOS_RT_MASK | RTO_ONLINK)) &&
d8d1f30b 2664 net_eq(dev_net(rth->dst.dev), net) &&
e84f84f2 2665 !rt_is_expired(rth)) {
d8d1f30b 2666 dst_use(&rth->dst, jiffies);
1da177e4
LT
2667 RT_CACHE_STAT_INC(out_hit);
2668 rcu_read_unlock_bh();
56157872
DM
2669 if (!flp4->saddr)
2670 flp4->saddr = rth->rt_src;
2671 if (!flp4->daddr)
2672 flp4->daddr = rth->rt_dst;
b23dd4fe 2673 return rth;
1da177e4
LT
2674 }
2675 RT_CACHE_STAT_INC(out_hlist_search);
2676 }
2677 rcu_read_unlock_bh();
2678
1080d709 2679slow_output:
9d6ec938 2680 return ip_route_output_slow(net, flp4);
1da177e4 2681}
d8c97a94
ACM
2682EXPORT_SYMBOL_GPL(__ip_route_output_key);
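Both the input and output cache walks above compare keys by OR-ing XOR differences instead of chaining && tests; the combined expression is zero only when every field matches, which keeps the hot loop free of short-circuit branches. A simplified, self-contained illustration of the trick:

	#include <stdint.h>
	#include <stdio.h>

	static int keys_match(uint32_t dst1, uint32_t dst2,
			      uint32_t src1, uint32_t src2, int if1, int if2)
	{
		return ((dst1 ^ dst2) | (src1 ^ src2) |
			(uint32_t)(if1 ^ if2)) == 0;
	}

	int main(void)
	{
		printf("%d\n", keys_match(0x0a000001, 0x0a000001, 1, 1, 2, 2)); /* 1 */
		printf("%d\n", keys_match(0x0a000001, 0x0a000002, 1, 1, 2, 2)); /* 0 */
		return 0;
	}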
2683
ae2688d5
JW
2684static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2685{
2686 return NULL;
2687}
2688
ec831ea7
RD
2689static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2690{
2691 return 0;
2692}
2693
14e50e57
DM
2694static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2695{
2696}
2697
0972ddb2
HB
2698static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2699 unsigned long old)
2700{
2701 return NULL;
2702}
2703
14e50e57
DM
2704static struct dst_ops ipv4_dst_blackhole_ops = {
2705 .family = AF_INET,
09640e63 2706 .protocol = cpu_to_be16(ETH_P_IP),
14e50e57 2707 .destroy = ipv4_dst_destroy,
ae2688d5 2708 .check = ipv4_blackhole_dst_check,
ec831ea7 2709 .default_mtu = ipv4_blackhole_default_mtu,
214f45c9 2710 .default_advmss = ipv4_default_advmss,
14e50e57 2711 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
0972ddb2 2712 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
14e50e57
DM
2713};
2714
2774c131 2715struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
14e50e57 2716{
5c1e6aa3 2717 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2774c131 2718 struct rtable *ort = (struct rtable *) dst_orig;
14e50e57
DM
2719
2720 if (rt) {
d8d1f30b 2721 struct dst_entry *new = &rt->dst;
14e50e57 2722
14e50e57 2723 new->__use = 1;
352e512c
HX
2724 new->input = dst_discard;
2725 new->output = dst_discard;
defb3519 2726 dst_copy_metrics(new, &ort->dst);
14e50e57 2727
d8d1f30b 2728 new->dev = ort->dst.dev;
14e50e57
DM
2729 if (new->dev)
2730 dev_hold(new->dev);
2731
5e2b61f7
DM
2732 rt->rt_key_dst = ort->rt_key_dst;
2733 rt->rt_key_src = ort->rt_key_src;
475949d8 2734 rt->rt_key_tos = ort->rt_key_tos;
1b86a58f 2735 rt->rt_route_iif = ort->rt_route_iif;
5e2b61f7
DM
2736 rt->rt_iif = ort->rt_iif;
2737 rt->rt_oif = ort->rt_oif;
2738 rt->rt_mark = ort->rt_mark;
14e50e57 2739
e84f84f2 2740 rt->rt_genid = rt_genid(net);
14e50e57
DM
2741 rt->rt_flags = ort->rt_flags;
2742 rt->rt_type = ort->rt_type;
2743 rt->rt_dst = ort->rt_dst;
2744 rt->rt_src = ort->rt_src;
14e50e57
DM
2745 rt->rt_gateway = ort->rt_gateway;
2746 rt->rt_spec_dst = ort->rt_spec_dst;
2747 rt->peer = ort->peer;
2748 if (rt->peer)
2749 atomic_inc(&rt->peer->refcnt);
62fa8a84
DM
2750 rt->fi = ort->fi;
2751 if (rt->fi)
2752 atomic_inc(&rt->fi->fib_clntref);
14e50e57
DM
2753
2754 dst_free(new);
2755 }
2756
2774c131
DM
2757 dst_release(dst_orig);
2758
2759 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
14e50e57
DM
2760}
2761
9d6ec938 2762struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
b23dd4fe 2763 struct sock *sk)
1da177e4 2764{
9d6ec938 2765 struct rtable *rt = __ip_route_output_key(net, flp4);
1da177e4 2766
b23dd4fe
DM
2767 if (IS_ERR(rt))
2768 return rt;
1da177e4 2769
56157872 2770 if (flp4->flowi4_proto)
9d6ec938
DM
2771 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2772 flowi4_to_flowi(flp4),
2773 sk, 0);
1da177e4 2774
b23dd4fe 2775 return rt;
1da177e4 2776}
d8c97a94
ACM
2777EXPORT_SYMBOL_GPL(ip_route_output_flow);
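A hedged sketch of a typical in-kernel caller of ip_route_output_flow(); the helper is hypothetical, dst_ip and net come from the caller, and the field names simply follow the flowi4 usage visible elsewhere in this file:

	/* Hypothetical helper: resolve an output route for a UDP flow. */
	static int resolve_route(struct net *net, __be32 dst_ip)
	{
		struct flowi4 fl4 = {
			.daddr = dst_ip,
			.flowi4_proto = IPPROTO_UDP,
		};
		struct rtable *rt = ip_route_output_flow(net, &fl4, NULL);

		if (IS_ERR(rt))
			return PTR_ERR(rt);
		/* ... transmit via rt->dst ... */
		ip_rt_put(rt);
		return 0;
	}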
2778
4feb88e5
BT
2779static int rt_fill_info(struct net *net,
2780 struct sk_buff *skb, u32 pid, u32 seq, int event,
b6544c0b 2781 int nowait, unsigned int flags)
1da177e4 2782{
511c3f92 2783 struct rtable *rt = skb_rtable(skb);
1da177e4 2784 struct rtmsg *r;
be403ea1 2785 struct nlmsghdr *nlh;
fe6fe792
ED
2786 long expires = 0;
2787 const struct inet_peer *peer = rt->peer;
e3703b3d 2788 u32 id = 0, ts = 0, tsage = 0, error;
be403ea1
TG
2789
2790 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2791 if (nlh == NULL)
26932566 2792 return -EMSGSIZE;
be403ea1
TG
2793
2794 r = nlmsg_data(nlh);
1da177e4
LT
2795 r->rtm_family = AF_INET;
2796 r->rtm_dst_len = 32;
2797 r->rtm_src_len = 0;
475949d8 2798 r->rtm_tos = rt->rt_key_tos;
1da177e4 2799 r->rtm_table = RT_TABLE_MAIN;
be403ea1 2800 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
1da177e4
LT
2801 r->rtm_type = rt->rt_type;
2802 r->rtm_scope = RT_SCOPE_UNIVERSE;
2803 r->rtm_protocol = RTPROT_UNSPEC;
2804 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2805 if (rt->rt_flags & RTCF_NOTIFY)
2806 r->rtm_flags |= RTM_F_NOTIFY;
be403ea1 2807
17fb2c64 2808 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
be403ea1 2809
5e2b61f7 2810 if (rt->rt_key_src) {
1da177e4 2811 r->rtm_src_len = 32;
5e2b61f7 2812 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
1da177e4 2813 }
d8d1f30b
CG
2814 if (rt->dst.dev)
2815 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
c7066f70 2816#ifdef CONFIG_IP_ROUTE_CLASSID
d8d1f30b
CG
2817 if (rt->dst.tclassid)
2818 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
1da177e4 2819#endif
c7537967 2820 if (rt_is_input_route(rt))
17fb2c64 2821 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
5e2b61f7 2822 else if (rt->rt_src != rt->rt_key_src)
17fb2c64 2823 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
be403ea1 2824
1da177e4 2825 if (rt->rt_dst != rt->rt_gateway)
17fb2c64 2826 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
be403ea1 2827
defb3519 2828 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
be403ea1
TG
2829 goto nla_put_failure;
2830
5e2b61f7
DM
2831 if (rt->rt_mark)
2832 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
963bfeee 2833
d8d1f30b 2834 error = rt->dst.error;
fe6fe792 2835 if (peer) {
317fe0e6 2836 inet_peer_refcheck(rt->peer);
fe6fe792
ED
2837 id = atomic_read(&peer->ip_id_count) & 0xffff;
2838 if (peer->tcp_ts_stamp) {
2839 ts = peer->tcp_ts;
2840 tsage = get_seconds() - peer->tcp_ts_stamp;
1da177e4 2841 }
fe6fe792
ED
2842 expires = ACCESS_ONCE(peer->pmtu_expires);
2843 if (expires)
2844 expires -= jiffies;
1da177e4 2845 }
be403ea1 2846
c7537967 2847 if (rt_is_input_route(rt)) {
1da177e4 2848#ifdef CONFIG_IP_MROUTE
e448515c 2849 __be32 dst = rt->rt_dst;
1da177e4 2850
f97c1e0c 2851 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
4feb88e5 2852 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
9a1b9496
DM
2853 int err = ipmr_get_route(net, skb,
2854 rt->rt_src, rt->rt_dst,
2855 r, nowait);
1da177e4
LT
2856 if (err <= 0) {
2857 if (!nowait) {
2858 if (err == 0)
2859 return 0;
be403ea1 2860 goto nla_put_failure;
1da177e4
LT
2861 } else {
2862 if (err == -EMSGSIZE)
be403ea1 2863 goto nla_put_failure;
e3703b3d 2864 error = err;
1da177e4
LT
2865 }
2866 }
2867 } else
2868#endif
5e2b61f7 2869 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
1da177e4
LT
2870 }
2871
d8d1f30b 2872 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
e3703b3d
TG
2873 expires, error) < 0)
2874 goto nla_put_failure;
be403ea1
TG
2875
2876 return nlmsg_end(skb, nlh);
1da177e4 2877
be403ea1 2878nla_put_failure:
26932566
PM
2879 nlmsg_cancel(skb, nlh);
2880 return -EMSGSIZE;
1da177e4
LT
2881}
2882
63f3444f 2883static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1da177e4 2884{
3b1e0a65 2885 struct net *net = sock_net(in_skb->sk);
d889ce3b
TG
2886 struct rtmsg *rtm;
2887 struct nlattr *tb[RTA_MAX+1];
1da177e4 2888 struct rtable *rt = NULL;
9e12bb22
AV
2889 __be32 dst = 0;
2890 __be32 src = 0;
2891 u32 iif;
d889ce3b 2892 int err;
963bfeee 2893 int mark;
1da177e4
LT
2894 struct sk_buff *skb;
2895
d889ce3b
TG
2896 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2897 if (err < 0)
2898 goto errout;
2899
2900 rtm = nlmsg_data(nlh);
2901
1da177e4 2902 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
d889ce3b
TG
2903 if (skb == NULL) {
2904 err = -ENOBUFS;
2905 goto errout;
2906 }
1da177e4
LT
2907
2908 /* Reserve room for dummy headers; this skb can pass
2909 through a good chunk of the routing engine.
2910 */
459a98ed 2911 skb_reset_mac_header(skb);
c1d2bbe1 2912 skb_reset_network_header(skb);
d2c962b8
SH
2913
2914 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
eddc9ec5 2915 ip_hdr(skb)->protocol = IPPROTO_ICMP;
1da177e4
LT
2916 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2917
17fb2c64
AV
2918 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2919 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
d889ce3b 2920 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
963bfeee 2921 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
1da177e4
LT
2922
2923 if (iif) {
d889ce3b
TG
2924 struct net_device *dev;
2925
1937504d 2926 dev = __dev_get_by_index(net, iif);
d889ce3b
TG
2927 if (dev == NULL) {
2928 err = -ENODEV;
2929 goto errout_free;
2930 }
2931
1da177e4
LT
2932 skb->protocol = htons(ETH_P_IP);
2933 skb->dev = dev;
963bfeee 2934 skb->mark = mark;
1da177e4
LT
2935 local_bh_disable();
2936 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2937 local_bh_enable();
d889ce3b 2938
511c3f92 2939 rt = skb_rtable(skb);
d8d1f30b
CG
2940 if (err == 0 && rt->dst.error)
2941 err = -rt->dst.error;
1da177e4 2942 } else {
68a5e3dd
DM
2943 struct flowi4 fl4 = {
2944 .daddr = dst,
2945 .saddr = src,
2946 .flowi4_tos = rtm->rtm_tos,
2947 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2948 .flowi4_mark = mark,
d889ce3b 2949 };
9d6ec938 2950 rt = ip_route_output_key(net, &fl4);
b23dd4fe
DM
2951
2952 err = 0;
2953 if (IS_ERR(rt))
2954 err = PTR_ERR(rt);
1da177e4 2955 }
d889ce3b 2956
1da177e4 2957 if (err)
d889ce3b 2958 goto errout_free;
1da177e4 2959
d8d1f30b 2960 skb_dst_set(skb, &rt->dst);
1da177e4
LT
2961 if (rtm->rtm_flags & RTM_F_NOTIFY)
2962 rt->rt_flags |= RTCF_NOTIFY;
2963
4feb88e5 2964 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
1937504d 2965 RTM_NEWROUTE, 0, 0);
d889ce3b
TG
2966 if (err <= 0)
2967 goto errout_free;
1da177e4 2968
1937504d 2969 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
d889ce3b 2970errout:
2942e900 2971 return err;
1da177e4 2972
d889ce3b 2973errout_free:
1da177e4 2974 kfree_skb(skb);
d889ce3b 2975 goto errout;
1da177e4
LT
2976}
2977
2978int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2979{
2980 struct rtable *rt;
2981 int h, s_h;
2982 int idx, s_idx;
1937504d
DL
2983 struct net *net;
2984
3b1e0a65 2985 net = sock_net(skb->sk);
1da177e4
LT
2986
2987 s_h = cb->args[0];
d8c92830
ED
2988 if (s_h < 0)
2989 s_h = 0;
1da177e4 2990 s_idx = idx = cb->args[1];
a6272665
ED
2991 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2992 if (!rt_hash_table[h].chain)
2993 continue;
1da177e4 2994 rcu_read_lock_bh();
a898def2 2995 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
d8d1f30b
CG
2996 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
2997 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
1da177e4 2998 continue;
e84f84f2 2999 if (rt_is_expired(rt))
29e75252 3000 continue;
d8d1f30b 3001 skb_dst_set_noref(skb, &rt->dst);
4feb88e5 3002 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
e905a9ed 3003 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
b6544c0b 3004 1, NLM_F_MULTI) <= 0) {
adf30907 3005 skb_dst_drop(skb);
1da177e4
LT
3006 rcu_read_unlock_bh();
3007 goto done;
3008 }
adf30907 3009 skb_dst_drop(skb);
1da177e4
LT
3010 }
3011 rcu_read_unlock_bh();
3012 }
3013
3014done:
3015 cb->args[0] = h;
3016 cb->args[1] = idx;
3017 return skb->len;
3018}
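ip_rt_dump() is a classic resumable netlink dump: the (h, idx) cursor is saved in cb->args so the walk can stop when the skb fills up and continue from the same spot on the next callback. A user-space sketch of the same cursor pattern (names and sizes invented for the demo):

	#include <stdio.h>

	#define NCHAINS 4
	#define NENTRIES 3

	/* Returns 1 when interrupted (more to dump), 0 when finished. */
	static int dump(int *s_h, int *s_idx, int budget)
	{
		int h, idx;

		for (h = *s_h; h < NCHAINS; h++, *s_idx = 0) {
			for (idx = *s_idx; idx < NENTRIES; idx++) {
				if (budget-- == 0) {	/* "skb full": save cursor */
					*s_h = h;
					*s_idx = idx;
					return 1;
				}
				printf("entry %d.%d\n", h, idx);
			}
		}
		*s_h = h;
		return 0;
	}

	int main(void)
	{
		int h = 0, idx = 0;

		while (dump(&h, &idx, 5))
			printf("-- resume --\n");
		return 0;
	}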
3019
3020void ip_rt_multicast_event(struct in_device *in_dev)
3021{
76e6ebfb 3022 rt_cache_flush(dev_net(in_dev->dev), 0);
1da177e4
LT
3023}
3024
3025#ifdef CONFIG_SYSCTL
81c684d1 3026static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
8d65af78 3027 void __user *buffer,
1da177e4
LT
3028 size_t *lenp, loff_t *ppos)
3029{
3030 if (write) {
639e104f 3031 int flush_delay;
81c684d1 3032 ctl_table ctl;
39a23e75 3033 struct net *net;
639e104f 3034
81c684d1
DL
3035 memcpy(&ctl, __ctl, sizeof(ctl));
3036 ctl.data = &flush_delay;
8d65af78 3037 proc_dointvec(&ctl, write, buffer, lenp, ppos);
639e104f 3038
81c684d1 3039 net = (struct net *)__ctl->extra1;
39a23e75 3040 rt_cache_flush(net, flush_delay);
1da177e4 3041 return 0;
e905a9ed 3042 }
1da177e4
LT
3043
3044 return -EINVAL;
3045}
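Writing to the flush sysctl is how user space triggers ipv4_sysctl_rtcache_flush() above; the value written becomes flush_delay. A small user-space example (requires root, and assumes the standard procfs mount point):

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/net/ipv4/route/flush", "w");

		if (!f) {
			perror("fopen");
			return 1;
		}
		fprintf(f, "0\n");	/* delay 0: flush immediately */
		fclose(f);
		return 0;
	}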
3046
eeb61f71 3047static ctl_table ipv4_route_table[] = {
1da177e4 3048 {
1da177e4
LT
3049 .procname = "gc_thresh",
3050 .data = &ipv4_dst_ops.gc_thresh,
3051 .maxlen = sizeof(int),
3052 .mode = 0644,
6d9f239a 3053 .proc_handler = proc_dointvec,
1da177e4
LT
3054 },
3055 {
1da177e4
LT
3056 .procname = "max_size",
3057 .data = &ip_rt_max_size,
3058 .maxlen = sizeof(int),
3059 .mode = 0644,
6d9f239a 3060 .proc_handler = proc_dointvec,
1da177e4
LT
3061 },
3062 {
3063 /* Deprecated. Use gc_min_interval_ms */
e905a9ed 3064
1da177e4
LT
3065 .procname = "gc_min_interval",
3066 .data = &ip_rt_gc_min_interval,
3067 .maxlen = sizeof(int),
3068 .mode = 0644,
6d9f239a 3069 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
3070 },
3071 {
1da177e4
LT
3072 .procname = "gc_min_interval_ms",
3073 .data = &ip_rt_gc_min_interval,
3074 .maxlen = sizeof(int),
3075 .mode = 0644,
6d9f239a 3076 .proc_handler = proc_dointvec_ms_jiffies,
1da177e4
LT
3077 },
3078 {
1da177e4
LT
3079 .procname = "gc_timeout",
3080 .data = &ip_rt_gc_timeout,
3081 .maxlen = sizeof(int),
3082 .mode = 0644,
6d9f239a 3083 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
3084 },
3085 {
1da177e4
LT
3086 .procname = "gc_interval",
3087 .data = &ip_rt_gc_interval,
3088 .maxlen = sizeof(int),
3089 .mode = 0644,
6d9f239a 3090 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
3091 },
3092 {
1da177e4
LT
3093 .procname = "redirect_load",
3094 .data = &ip_rt_redirect_load,
3095 .maxlen = sizeof(int),
3096 .mode = 0644,
6d9f239a 3097 .proc_handler = proc_dointvec,
1da177e4
LT
3098 },
3099 {
1da177e4
LT
3100 .procname = "redirect_number",
3101 .data = &ip_rt_redirect_number,
3102 .maxlen = sizeof(int),
3103 .mode = 0644,
6d9f239a 3104 .proc_handler = proc_dointvec,
1da177e4
LT
3105 },
3106 {
1da177e4
LT
3107 .procname = "redirect_silence",
3108 .data = &ip_rt_redirect_silence,
3109 .maxlen = sizeof(int),
3110 .mode = 0644,
6d9f239a 3111 .proc_handler = proc_dointvec,
1da177e4
LT
3112 },
3113 {
1da177e4
LT
3114 .procname = "error_cost",
3115 .data = &ip_rt_error_cost,
3116 .maxlen = sizeof(int),
3117 .mode = 0644,
6d9f239a 3118 .proc_handler = proc_dointvec,
1da177e4
LT
3119 },
3120 {
1da177e4
LT
3121 .procname = "error_burst",
3122 .data = &ip_rt_error_burst,
3123 .maxlen = sizeof(int),
3124 .mode = 0644,
6d9f239a 3125 .proc_handler = proc_dointvec,
1da177e4
LT
3126 },
3127 {
1da177e4
LT
3128 .procname = "gc_elasticity",
3129 .data = &ip_rt_gc_elasticity,
3130 .maxlen = sizeof(int),
3131 .mode = 0644,
6d9f239a 3132 .proc_handler = proc_dointvec,
1da177e4
LT
3133 },
3134 {
1da177e4
LT
3135 .procname = "mtu_expires",
3136 .data = &ip_rt_mtu_expires,
3137 .maxlen = sizeof(int),
3138 .mode = 0644,
6d9f239a 3139 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
3140 },
3141 {
1da177e4
LT
3142 .procname = "min_pmtu",
3143 .data = &ip_rt_min_pmtu,
3144 .maxlen = sizeof(int),
3145 .mode = 0644,
6d9f239a 3146 .proc_handler = proc_dointvec,
1da177e4
LT
3147 },
3148 {
1da177e4
LT
3149 .procname = "min_adv_mss",
3150 .data = &ip_rt_min_advmss,
3151 .maxlen = sizeof(int),
3152 .mode = 0644,
6d9f239a 3153 .proc_handler = proc_dointvec,
1da177e4 3154 },
f8572d8f 3155 { }
1da177e4 3156};
39a23e75 3157
2f4520d3
AV
3158static struct ctl_table empty[1];
3159
3160static struct ctl_table ipv4_skeleton[] =
3161{
f8572d8f 3162 { .procname = "route",
d994af0d 3163 .mode = 0555, .child = ipv4_route_table},
f8572d8f 3164 { .procname = "neigh",
d994af0d 3165 .mode = 0555, .child = empty},
2f4520d3
AV
3166 { }
3167};
3168
3169static __net_initdata struct ctl_path ipv4_path[] = {
f8572d8f
EB
3170 { .procname = "net", },
3171 { .procname = "ipv4", },
39a23e75
DL
3172 { },
3173};
3174
39a23e75
DL
3175static struct ctl_table ipv4_route_flush_table[] = {
3176 {
39a23e75
DL
3177 .procname = "flush",
3178 .maxlen = sizeof(int),
3179 .mode = 0200,
6d9f239a 3180 .proc_handler = ipv4_sysctl_rtcache_flush,
39a23e75 3181 },
f8572d8f 3182 { },
39a23e75
DL
3183};
3184
2f4520d3 3185static __net_initdata struct ctl_path ipv4_route_path[] = {
f8572d8f
EB
3186 { .procname = "net", },
3187 { .procname = "ipv4", },
3188 { .procname = "route", },
2f4520d3
AV
3189 { },
3190};
3191
39a23e75
DL
3192static __net_init int sysctl_route_net_init(struct net *net)
3193{
3194 struct ctl_table *tbl;
3195
3196 tbl = ipv4_route_flush_table;
09ad9bc7 3197 if (!net_eq(net, &init_net)) {
39a23e75
DL
3198 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3199 if (tbl == NULL)
3200 goto err_dup;
3201 }
3202 tbl[0].extra1 = net;
3203
3204 net->ipv4.route_hdr =
3205 register_net_sysctl_table(net, ipv4_route_path, tbl);
3206 if (net->ipv4.route_hdr == NULL)
3207 goto err_reg;
3208 return 0;
3209
3210err_reg:
3211 if (tbl != ipv4_route_flush_table)
3212 kfree(tbl);
3213err_dup:
3214 return -ENOMEM;
3215}
3216
3217static __net_exit void sysctl_route_net_exit(struct net *net)
3218{
3219 struct ctl_table *tbl;
3220
3221 tbl = net->ipv4.route_hdr->ctl_table_arg;
3222 unregister_net_sysctl_table(net->ipv4.route_hdr);
3223 BUG_ON(tbl == ipv4_route_flush_table);
3224 kfree(tbl);
3225}
3226
3227static __net_initdata struct pernet_operations sysctl_route_ops = {
3228 .init = sysctl_route_net_init,
3229 .exit = sysctl_route_net_exit,
3230};
1da177e4
LT
3231#endif
3232
3ee94372 3233static __net_init int rt_genid_init(struct net *net)
9f5e97e5 3234{
3ee94372
NH
3235 get_random_bytes(&net->ipv4.rt_genid,
3236 sizeof(net->ipv4.rt_genid));
436c3b66
DM
3237 get_random_bytes(&net->ipv4.dev_addr_genid,
3238 sizeof(net->ipv4.dev_addr_genid));
9f5e97e5
DL
3239 return 0;
3240}
3241
3ee94372
NH
3242static __net_initdata struct pernet_operations rt_genid_ops = {
3243 .init = rt_genid_init,
9f5e97e5
DL
3244};
3245
3246
c7066f70 3247#ifdef CONFIG_IP_ROUTE_CLASSID
7d720c3e 3248struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
c7066f70 3249#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4
LT
3250
3251static __initdata unsigned long rhash_entries;
3252static int __init set_rhash_entries(char *str)
3253{
3254 if (!str)
3255 return 0;
3256 rhash_entries = simple_strtoul(str, &str, 0);
3257 return 1;
3258}
3259__setup("rhash_entries=", set_rhash_entries);
3260
3261int __init ip_rt_init(void)
3262{
424c4b70 3263 int rc = 0;
1da177e4 3264
c7066f70 3265#ifdef CONFIG_IP_ROUTE_CLASSID
0dcec8c2 3266 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
1da177e4
LT
3267 if (!ip_rt_acct)
3268 panic("IP: failed to allocate ip_rt_acct\n");
1da177e4
LT
3269#endif
3270
e5d679f3
AD
3271 ipv4_dst_ops.kmem_cachep =
3272 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
20c2df83 3273 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1da177e4 3274
14e50e57
DM
3275 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3276
fc66f95c
ED
3277 if (dst_entries_init(&ipv4_dst_ops) < 0)
3278 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3279
3280 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3281 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3282
424c4b70
ED
3283 rt_hash_table = (struct rt_hash_bucket *)
3284 alloc_large_system_hash("IP route cache",
3285 sizeof(struct rt_hash_bucket),
3286 rhash_entries,
4481374c 3287 (totalram_pages >= 128 * 1024) ?
18955cfc 3288 15 : 17,
8d1502de 3289 0,
424c4b70
ED
3290 &rt_hash_log,
3291 &rt_hash_mask,
c9503e0f 3292 rhash_entries ? 0 : 512 * 1024);
22c047cc
ED
3293 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3294 rt_hash_lock_init();
1da177e4
LT
3295
3296 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3297 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3298
1da177e4
LT
3299 devinet_init();
3300 ip_fib_init();
3301
73b38711 3302 if (ip_rt_proc_init())
107f1634 3303 printk(KERN_ERR "Unable to create route proc files\n");
1da177e4
LT
3304#ifdef CONFIG_XFRM
3305 xfrm_init();
a33bc5c1 3306 xfrm4_init(ip_rt_max_size);
1da177e4 3307#endif
63f3444f
TG
3308 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3309
39a23e75
DL
3310#ifdef CONFIG_SYSCTL
3311 register_pernet_subsys(&sysctl_route_ops);
3312#endif
3ee94372 3313 register_pernet_subsys(&rt_genid_ops);
1da177e4
LT
3314 return rc;
3315}
3316
a1bc6eb4 3317#ifdef CONFIG_SYSCTL
eeb61f71
AV
3318/*
3319 * We really need to sanitize the damn ipv4 init order, then all
3320 * this nonsense will go away.
3321 */
3322void __init ip_static_sysctl_init(void)
3323{
2f4520d3 3324 register_sysctl_paths(ipv4_path, ipv4_skeleton);
eeb61f71 3325}
a1bc6eb4 3326#endif