/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
	((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_default_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

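/* Copy-on-write handler for dst metrics: bind an inet_peer to the route,
 * copy the current (read-only) metrics into the peer's array on first use,
 * and publish the writable pointer with cmpxchg().  If we win the race,
 * the fib_info reference is dropped since the metrics now live in the
 * peer; if we lose, we return the winner's pointer (or NULL if that
 * pointer is itself read-only).
 */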
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.default_mtu =		ipv4_default_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};

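/* A note on indexing (based on how this kernel generation consumes the
 * table): rt_tos2priority() in include/net/route.h looks priorities up as
 *
 *	ip_tos2prio[IPTOS_TOS(tos) >> 1]
 *
 * i.e. the four TOS bits select one of the sixteen entries, pairing each
 * priority with its ECN_OR_COST() variant in the odd slots.
 */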

/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

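/* For illustration only (a sketch, not code from this file): the read
 * side implied by rules 1) and 3) walks one bucket chain entirely under
 * the RCU BH lock and takes its reference before leaving the critical
 * section:
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
 *		if (keys_match(rth)) {		// hypothetical predicate
 *			dst_use(&rth->dst, jiffies);	// refcount + lastuse
 *			break;
 *		}
 *	}
 *	rcu_read_unlock_bh();
 *
 * The write side (rt_intern_hash(), rt_del(), the flush paths) takes the
 * per-bucket spinlock below before unlinking anything.
 */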
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
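/* Worked example of the lock striping: with RT_HASH_LOCK_SZ == 256,
 * buckets 5, 261, 517, ... all map to rt_hash_locks[5], because
 * rt_hash_lock_addr() masks the slot with (RT_HASH_LOCK_SZ - 1).
 * Writers on different buckets may therefore contend on one spinlock,
 * but lock memory stays bounded instead of one spinlock per bucket.
 */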

static struct rt_hash_bucket	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			   r->dst.dev ? r->dst.dev->name : "*",
			   (__force u32)r->rt_dst,
			   (__force u32)r->rt_gateway,
			   r->rt_flags, atomic_read(&r->dst.__refcnt),
			   r->dst.__use, 0, (__force u32)r->rt_src,
			   dst_metric_advmss(&r->dst) + 40,
			   dst_metric(&r->dst, RTAX_WINDOW),
			   (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
				 dst_metric(&r->dst, RTAX_RTTVAR)),
			   r->fl.fl4_tos,
			   r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
			   r->dst.hh ? (r->dst.hh->hh_output ==
					dev_queue_xmit) : 0,
			   r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->dst.expires &&
	    time_after_eq(jiffies, rth->dst.expires))
		goto out;

	age = jiffies - rth->dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}

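/* Worked example: for an entry last used 100 jiffies ago, ~age masked to
 * the low 30 bits gives 0x3fffff9b, so more recently used entries score
 * higher.  A valuable output route additionally sets bits 31 and 30 and
 * is therefore evicted last when rt_intern_hash() picks the minimum-score
 * candidate on an overlong chain.
 */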
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct flowi *fl1,
					const struct flowi *fl2)
{
	return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
		((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
		(fl1->iif ^ fl2->iif)) == 0);
}

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
		((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

/*
 * Perform a full scan of the hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_dereference_raw(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}

/*
 * While freeing expired entries, we compute the average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to have an estimation of rt_chain_length_max:
 *   rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

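/* Worked example of the fixed-point format: with FRACT_BITS == 3, one
 * chain-length unit is ONE == 8, so an average length of 2.5 is stored
 * as 20.  rt_check_expire() accumulates these scaled samples in sum and
 * sum2, and the final (avg + 4*sd) >> FRACT_BITS converts the estimate
 * back to whole entries per chain.
 */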
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif)
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(&aux->fl, &rth->fl))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}

static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    ip_rt_gc_elasticity,
					    (avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}

/*
 * rt_worker_func() is run in process context.
 * We call rt_check_expire() to scan part of the hash table.
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without repeating a recent rt_genid; adding 1 to the
 * random byte guarantees the genid always changes, even when the byte is 0.
 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}

/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previously invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}

/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expire is large enough to keep enough warm entries,
   and when load increases it reduces to limit the cache size.
 */

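/* Worked example of the goal computation below: with rt_hash_log == 10
 * and the default ip_rt_gc_elasticity of 8, GC only has a positive goal
 * once the cache exceeds 8 << 10 == 8192 entries; with 10000 cached
 * entries the first pass tries to expire roughly 10000 - 8192 = 1808.
 */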
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire was reduced to zero; otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt context.
		   - the jiffies check is just a fallback/debug loop breaker;
		     we will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				dst_entries_get_fast(&ipv4_dst_ops), goal, i);
#endif

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
#endif
out:	return 0;
}

/*
 * Returns the number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt,
			  struct rtable **rp, struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = arp_bind_neighbour(&rt->dst);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return err;
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (rp)
				*rp = rth;
			else
				skb_dst_set(skb, &rth->dst);
			return 0;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length;
		 * when it is exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = arp_bind_neighbour(&rt->dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

#if RT_CACHE_DEBUG >= 2
	if (rt->dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
		       hash, &rt->rt_dst);
		for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
			printk(" . %pI4", &trt->rt_dst);
		printk("\n");
	}
#endif
	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (rp)
		*rp = rt;
	else
		skb_dst_set(skb, &rt->dst);
	return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(rt->rt_dst, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If a peer is attached to the destination, it is never
		   detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct rtable *rth;
	struct rtable __rcu **rthp;
	__be32 skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!rt_caching(net))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rt_is_input_route(rth) ||
				    rt_is_expired(rth) ||
				    !net_eq(dev_net(rth->dst.dev), net)) {
					rthp = &rth->dst.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->dst.dev != dev)
					break;

				dst_hold(&rth->dst);

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				rt->dst.__use		= 1;
				atomic_set(&rt->dst.__refcnt, 1);
				rt->dst.child		= NULL;
				if (rt->dst.dev)
					dev_hold(rt->dst.dev);
				rt->dst.obsolete	= -1;
				rt->dst.lastuse		= jiffies;
				rt->dst.path		= &rt->dst;
				rt->dst.neighbour	= NULL;
				rt->dst.hh		= NULL;
#ifdef CONFIG_XFRM
				rt->dst.xfrm		= NULL;
#endif
				rt->rt_genid		= rt_genid(net);
				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);
				if (rt->fi)
					atomic_inc(&rt->fi->fib_clntref);

				if (arp_bind_neighbour(&rt->dst) ||
				    !(rt->dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->dst.neighbour)
						neigh_event_send(rt->dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->dst;
				netevent.new = &rt->dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
					ip_rt_put(rt);
				goto do_next;
			}
		do_next:
			;
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   (rt->dst.expires &&
			    time_after_eq(jiffies, rt->dst.expires))) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif,
						rt_genid(dev_net(dst->dev)));
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
				&rt->rt_dst, rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

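/* Worked example of the backoff (assuming HZ == 1000, so the default
 * ip_rt_redirect_load is HZ/50 == 20 jiffies): redirect n is only sent
 * after rate_last + (20 << n) jiffies, i.e. 20ms, 40ms, 80ms, ... between
 * successive redirects.  After ip_rt_redirect_number (9) unanswered
 * redirects we stay silent until ip_rt_redirect_silence
 * ((HZ/50) << 10 == 20480 jiffies, about 20 seconds) has elapsed.
 */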
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
		rt->dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything, just
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
		rt->dst.rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (rt->dst.rate_tokens == 0 ||
	    time_after(jiffies,
		       (rt->dst.rate_last +
			(ip_rt_redirect_load << rt->dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->dst.rate_last = jiffies;
		++rt->dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    rt->dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
				&rt->rt_src, rt->rt_iif,
				&rt->rt_dst, &rt->rt_gateway);
#endif
	}
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	unsigned long now;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	now = jiffies;
	rt->dst.rate_tokens += now - rt->dst.rate_last;
	if (rt->dst.rate_tokens > ip_rt_error_burst)
		rt->dst.rate_tokens = ip_rt_error_burst;
	rt->dst.rate_last = now;
	if (rt->dst.rate_tokens >= ip_rt_error_cost) {
		rt->dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}

/*
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}

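/* Worked example: a "fragmentation needed" ICMP that carries no next-hop
 * MTU for a packet of total length 1500 makes guess_mtu(1500) walk the
 * plateau table and return 1492, the largest plateau strictly below the
 * old MTU; anything at or below 128 falls through to the 68-byte minimum.
 */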
unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	int i, k;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	int  ikeys[2] = { dev->ifindex, 0 };
	__be32  skeys[2] = { iph->saddr, 0, };
	__be32  daddr = iph->daddr;
	unsigned short est_mtu = 0;

	for (k = 0; k < 2; k++) {
		for (i = 0; i < 2; i++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rcu_read_lock();
			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
			     rth = rcu_dereference(rth->dst.rt_next)) {
				unsigned short mtu = new_mtu;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->rt_dst != daddr ||
				    rth->rt_src != iph->saddr ||
				    rth->fl.oif != ikeys[k] ||
				    rt_is_input_route(rth) ||
				    dst_metric_locked(&rth->dst, RTAX_MTU) ||
				    !net_eq(dev_net(rth->dst.dev), net) ||
				    rt_is_expired(rth))
					continue;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= dst_mtu(&rth->dst) &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= dst_mtu(&rth->dst)) {
					if (mtu < dst_mtu(&rth->dst)) {
						dst_confirm(&rth->dst);
						if (mtu < ip_rt_min_pmtu) {
							u32 lock = dst_metric(&rth->dst,
									      RTAX_LOCK);
							mtu = ip_rt_min_pmtu;
							lock |= (1 << RTAX_MTU);
							dst_metric_set(&rth->dst, RTAX_LOCK,
								       lock);
						}
						dst_metric_set(&rth->dst, RTAX_MTU, mtu);
						dst_set_expires(&rth->dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
			rcu_read_unlock();
		}
	}
	return est_mtu ? : new_mtu;
}
1733
1734static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1735{
6d273f8d 1736 if (dst_mtu(dst) > mtu && mtu >= 68 &&
1da177e4
LT
1737 !(dst_metric_locked(dst, RTAX_MTU))) {
1738 if (mtu < ip_rt_min_pmtu) {
defb3519 1739 u32 lock = dst_metric(dst, RTAX_LOCK);
1da177e4 1740 mtu = ip_rt_min_pmtu;
defb3519 1741 dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
1da177e4 1742 }
defb3519 1743 dst_metric_set(dst, RTAX_MTU, mtu);
1da177e4 1744 dst_set_expires(dst, ip_rt_mtu_expires);
8d71740c 1745 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1da177e4
LT
1746 }
1747}
1748
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	if (rt_is_expired((struct rtable *)dst))
		return NULL;
	return dst;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}
}


static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	__be32 src;
	struct fib_result res;

	if (rt_is_output_route(rt))
		src = rt->rt_src;
	else {
		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
			src = FIB_RES_PREFSRC(res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}
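
/*
 * Editorial note: the 40 subtracted above is the fixed IPv4 header
 * (20 bytes) plus the TCP header (20 bytes), so a 1500-byte Ethernet
 * MTU yields the familiar default advertised MSS of 1460.  A sketch of
 * the same arithmetic, compiled out:
 */
#if 0
static unsigned int demo_advmss(unsigned int dev_mtu, unsigned int min_advmss)
{
	unsigned int advmss = dev_mtu - 40;	/* strip IP + TCP headers */

	if (advmss < min_advmss)		/* honour the sysctl floor */
		advmss = min_advmss;
	if (advmss > 65535 - 40)		/* cap at max IPv4 payload */
		advmss = 65535 - 40;
	return advmss;
}
#endif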

static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		const struct rtable *rt = (const struct rtable *) dst;

		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}
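
/*
 * Editorial note: the 576 fallback above for gatewayed destinations
 * with a locked MTU matches the classic RFC 1122 rule that every IPv4
 * host must be able to receive a 576-byte datagram.
 */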

static void rt_init_metrics(struct rtable *rt, struct fib_info *fi)
{
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (rt->fl.flags & FLOWI_FLAG_PRECOW_METRICS)
		create = 1;

	rt_bind_peer(rt, create);
	peer = rt->peer;
	if (peer) {
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);
	} else {
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}
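
/*
 * Editorial note (inferred from the code above): rt_bind_peer() is
 * called even when create == 0, so metrics already cached on an
 * existing peer entry (for example by an earlier PMTU update) are
 * still picked up by a new route; only the creation of a brand new
 * peer is made conditional on FLOWI_FLAG_PRECOW_METRICS.
 */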

static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag, 0);
		if (err < 0)
			goto e_err;
	}
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->dst.output = ip_rt_bug;
	rth->dst.obsolete = -1;

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst = daddr;
	rth->fl.fl4_tos = tos;
	rth->fl.mark = skb->mark;
	rth->fl.fl4_src = saddr;
	rth->rt_src = saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_iif =
	rth->fl.iif = dev->ifindex;
	rth->dst.dev = init_net.loopback_dev;
	dev_hold(rth->dst.dev);
	rth->fl.oif = 0;
	rth->rt_gateway = daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_genid = rt_genid(dev_net(dev));
	rth->rt_flags = RTCF_MULTICAST;
	rth->rt_type = RTN_MULTICAST;
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}


	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag, skb->mark);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back on the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst = daddr;
	rth->fl.fl4_tos = tos;
	rth->fl.mark = skb->mark;
	rth->fl.fl4_src = saddr;
	rth->rt_src = saddr;
	rth->rt_gateway = daddr;
	rth->rt_iif =
		rth->fl.iif = in_dev->dev->ifindex;
	rth->dst.dev = (out_dev)->dev;
	dev_hold(rth->dst.dev);
	rth->fl.oif = 0;
	rth->rt_spec_dst= spec_dst;

	rth->dst.obsolete = -1;
	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	return err;
}

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi *fl,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable* rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl->iif,
		       rt_genid(dev_net(rth->dst.dev)));
	return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
}

/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must already have the correct destination attached by the
 *	output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi fl = { .fl4_dst = daddr,
			    .fl4_src = saddr,
			    .fl4_tos = tos,
			    .fl4_scope = RT_SCOPE_UNIVERSE,
			    .mark = skb->mark,
			    .iif = dev->ifindex };
	unsigned flags = 0;
	u32 itag = 0;
	struct rtable * rth;
	unsigned hash;
	__be32 spec_dst;
	int err = -EINVAL;
	struct net * net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the weirdest martians, which cannot be detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only for limited broadcast;
	 * I am not even sure whether to fix this or not. Waiting for
	 * complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route the packet.
	 */
	err = fib_lookup(net, &fl, &res);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag, skb->mark);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag, skb->mark);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->dst.output= ip_rt_bug;
	rth->dst.obsolete = -1;
	rth->rt_genid = rt_genid(net);

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst = daddr;
	rth->fl.fl4_tos = tos;
	rth->fl.mark = skb->mark;
	rth->fl.fl4_src = saddr;
	rth->rt_src = saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_iif =
	rth->fl.iif = dev->ifindex;
	rth->dst.dev = net->loopback_dev;
	dev_hold(rth->dst.dev);
	rth->rt_gateway = daddr;
	rth->rt_spec_dst= spec_dst;
	rth->dst.input= ip_local_deliver;
	rth->rt_flags = flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}
	rth->rt_type = res.type;
	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
	err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}

int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable * rth;
	unsigned hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
		     (rth->fl.iif ^ iif) |
		     rth->fl.oif |
		     (rth->fl.fl4_tos ^ tos)) == 0 &&
		    rth->fl.mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic was moved from the route cache to
	   here. The problem was that too many Ethernet cards have
	   broken/missing hardware multicast filters :-( As a result, a
	   host on a multicast network acquires a lot of useless route
	   cache entries, sort of SDR messages from all over the world.
	   Now we try to get rid of them. Really, provided the software
	   IP multicast filter is organized reasonably (at least,
	   hashed), it does not result in a slowdown compared with route
	   cache reject entries. Note that multicast routers are not
	   affected, because a route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc(in_dev, daddr, saddr,
					      ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
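
/*
 * Editorial sketch (not part of the kernel source, compiled out): the
 * cache lookup above folds several field comparisons into one branch
 * by OR-ing XOR differences -- the result is zero only if every field
 * matches.  A minimal standalone form of the idiom:
 */
#if 0
struct demo_key {
	unsigned int dst, src, iif, tos;
};

static int demo_keys_equal(const struct demo_key *a,
			   const struct demo_key *b)
{
	/* Zero iff all four fields are pairwise equal. */
	return ((a->dst ^ b->dst) |
		(a->src ^ b->src) |
		(a->iif ^ b->iif) |
		(a->tos ^ b->tos)) == 0;
}
#endif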

/* called with rcu_read_lock() */
static int __mkroute_output(struct rtable **result,
			    struct fib_result *res,
			    const struct flowi *fl,
			    const struct flowi *oldflp,
			    struct net_device *dev_out,
			    unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);

	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
		return -EINVAL;

	if (ipv4_is_lbcast(fl->fl4_dst))
		res->type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		res->fi = NULL;
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist, use the
		 * default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (res->fi && res->prefixlen < 4)
			res->fi = NULL;
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		return -ENOBUFS;

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
		rth->dst.flags |= DST_NOXFRM;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;

	rth->fl.fl4_dst = oldflp->fl4_dst;
	rth->fl.fl4_tos = tos;
	rth->fl.fl4_src = oldflp->fl4_src;
	rth->fl.oif = oldflp->oif;
	rth->fl.mark = oldflp->mark;
	rth->rt_dst = fl->fl4_dst;
	rth->rt_src = fl->fl4_src;
	rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be held by the
	   routing cache entry */
	rth->dst.dev = dev_out;
	dev_hold(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst= fl->fl4_src;

	rth->dst.output=ip_output;
	rth->dst.obsolete = -1;
	rth->rt_genid = rt_genid(dev_net(dev_out));

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;
	*result = rth;
	return 0;
}

/* called with rcu_read_lock() */
static int ip_mkroute_output(struct rtable **rp,
			     struct fib_result *res,
			     const struct flowi *fl,
			     const struct flowi *oldflp,
			     struct net_device *dev_out,
			     unsigned flags)
{
	struct rtable *rth = NULL;
	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
	unsigned hash;
	if (err == 0) {
		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
			       rt_genid(dev_net(dev_out)));
		err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
	}

	return err;
}

/*
 * Major route resolver routine.
 * called with rcu_read_lock();
 */

static int ip_route_output_slow(struct net *net, struct rtable **rp,
				const struct flowi *oldflp)
{
	u32 tos = RT_FL_TOS(oldflp);
	struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
			    .fl4_src = oldflp->fl4_src,
			    .fl4_tos = tos & IPTOS_RT_MASK,
			    .fl4_scope = ((tos & RTO_ONLINK) ?
					  RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
			    .mark = oldflp->mark,
			    .iif = net->loopback_dev->ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned int flags = 0;
	struct net_device *dev_out = NULL;
	int err;


	res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (ipv4_is_multicast(oldflp->fl4_src) ||
		    ipv4_is_lbcast(oldflp->fl4_src) ||
		    ipv4_is_zeronet(oldflp->fl4_src))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface,
		      if saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with the
		      saddr of another iface. --ANK
		 */

		if (oldflp->oif == 0 &&
		    (ipv4_is_multicast(oldflp->fl4_dst) ||
		     ipv4_is_lbcast(oldflp->fl4_dst))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: the user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source addr (look,
			   the routing cache cannot know that ttl is zero, so
			   the packet will not leave this host and the route
			   is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, oldflp->fl4_src, false))
				goto out;
		}
	}


	if (oldflp->oif) {
		dev_out = dev_get_by_index_rcu(net, oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			err = -ENETUNREACH;
			goto out;
		}
		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
		    ipv4_is_lbcast(oldflp->fl4_dst)) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (ipv4_is_multicast(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl.oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, &fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, the routing tables are wrong. Assume
			   that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch whether the destination is gatewayed,
			   rather than direct. Moreover, if MSG_DONTROUTE
			   is set, we send the packet, ignoring both the
			   routing tables and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		err = -ENETUNREACH;
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src) {
			if (res.fi->fib_prefsrc)
				fl.fl4_src = res.fi->fib_prefsrc;
			else
				fl.fl4_src = fl.fl4_dst;
		}
		dev_out = net->loopback_dev;
		fl.oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(&res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	dev_out = FIB_RES_DEV(res);
	fl.oif = dev_out->ifindex;


make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);

out:	return err;
}

int __ip_route_output_key(struct net *net, struct rtable **rp,
			  const struct flowi *flp)
{
	unsigned int hash;
	int res;
	struct rtable *rth;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rt_is_output_route(rth) &&
		    rth->fl.oif == flp->oif &&
		    rth->fl.mark == flp->mark &&
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	rcu_read_lock();
	res = ip_route_output_slow(net, rp, flp);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);

static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
{
	return 0;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family = AF_INET,
	.protocol = cpu_to_be16(ETH_P_IP),
	.destroy = ipv4_dst_destroy,
	.check = ipv4_blackhole_dst_check,
	.default_mtu = ipv4_blackhole_default_mtu,
	.update_pmtu = ipv4_rt_blackhole_update_pmtu,
};


static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
{
	struct rtable *ort = *rp;
	struct rtable *rt = (struct rtable *)
		dst_alloc(&ipv4_dst_blackhole_ops);

	if (rt) {
		struct dst_entry *new = &rt->dst;

		atomic_set(&new->__refcnt, 1);
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->fl = ort->fl;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_iif = ort->rt_iif;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(&(*rp)->dst);
	*rp = rt;
	return rt ? 0 : -ENOMEM;
}

int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
			 struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
		return err;

	if (flp->proto) {
		if (!flp->fl4_src)
			flp->fl4_src = (*rp)->rt_src;
		if (!flp->fl4_dst)
			flp->fl4_dst = (*rp)->rt_dst;
		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
				    flags ? XFRM_LOOKUP_WAIT : 0);
		if (err == -EREMOTE)
			err = ipv4_dst_blackhole(net, rp, flp);

		return err;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);

int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(net, rp, flp, NULL, 0);
}
EXPORT_SYMBOL(ip_route_output_key);
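
/*
 * Usage sketch (not part of the original file, compiled out; the error
 * value shown is only an example): a typical in-kernel caller fills a
 * flow key, asks for an output route and drops it when done:
 */
#if 0
static int demo_output_lookup(struct net *net, __be32 daddr)
{
	struct rtable *rt;
	struct flowi fl = { .fl4_dst = daddr };
	int err = ip_route_output_key(net, &rt, &fl);

	if (err)
		return err;		/* e.g. -ENETUNREACH */
	/* ... use rt->dst.dev, rt->rt_gateway ... */
	ip_rt_put(rt);			/* release the cached entry */
	return 0;
}
#endif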

static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family = AF_INET;
	r->rtm_dst_len = 32;
	r->rtm_src_len = 0;
	r->rtm_tos = rt->fl.fl4_tos;
	r->rtm_table = RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type = rt->rt_type;
	r->rtm_scope = RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->fl.mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);

	error = rt->dst.error;
	expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
	if (rt->peer) {
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol = htons(ETH_P_IP);
		skb->dev = dev;
		skb->mark = mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi fl = {
			.fl4_dst = dst,
			.fl4_src = src,
			.fl4_tos = rtm->rtm_tos,
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.mark = mark,
		};
		err = ip_route_output_key(net, &rt, &fl);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
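
/*
 * Usage note (editorial): this handler serves RTM_GETROUTE requests,
 * which is what "ip route get <addr>" sends from userspace; the reply
 * is a single RTM_NEWROUTE message built by rt_fill_info() above.
 */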

int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}

#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
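
/*
 * Usage note (editorial): this handler backs the write-only sysctl
 * registered below as /proc/sys/net/ipv4/route/flush; writing an
 * integer delay flushes the per-namespace route cache, e.g.
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 */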

static ctl_table ipv4_route_table[] = {
	{
		.procname = "gc_thresh",
		.data = &ipv4_dst_ops.gc_thresh,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "max_size",
		.data = &ip_rt_max_size,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname = "gc_min_interval",
		.data = &ip_rt_gc_min_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "gc_min_interval_ms",
		.data = &ip_rt_gc_min_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_ms_jiffies,
	},
	{
		.procname = "gc_timeout",
		.data = &ip_rt_gc_timeout,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "gc_interval",
		.data = &ip_rt_gc_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "redirect_load",
		.data = &ip_rt_redirect_load,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "redirect_number",
		.data = &ip_rt_redirect_number,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "redirect_silence",
		.data = &ip_rt_redirect_silence,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "error_cost",
		.data = &ip_rt_error_cost,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "error_burst",
		.data = &ip_rt_error_burst,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "gc_elasticity",
		.data = &ip_rt_gc_elasticity,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "mtu_expires",
		.data = &ip_rt_mtu_expires,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "min_pmtu",
		.data = &ip_rt_min_pmtu,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "min_adv_mss",
		.data = &ip_rt_min_advmss,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{ }
};

static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};

static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname = "flush",
		.maxlen = sizeof(int),
		.mode = 0200,
		.proc_handler = ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
3276
39a23e75
DL
3277static __net_init int sysctl_route_net_init(struct net *net)
3278{
3279 struct ctl_table *tbl;
3280
3281 tbl = ipv4_route_flush_table;
09ad9bc7 3282 if (!net_eq(net, &init_net)) {
39a23e75
DL
3283 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3284 if (tbl == NULL)
3285 goto err_dup;
3286 }
3287 tbl[0].extra1 = net;
3288
3289 net->ipv4.route_hdr =
3290 register_net_sysctl_table(net, ipv4_route_path, tbl);
3291 if (net->ipv4.route_hdr == NULL)
3292 goto err_reg;
3293 return 0;
3294
3295err_reg:
3296 if (tbl != ipv4_route_flush_table)
3297 kfree(tbl);
3298err_dup:
3299 return -ENOMEM;
3300}
3301
3302static __net_exit void sysctl_route_net_exit(struct net *net)
3303{
3304 struct ctl_table *tbl;
3305
3306 tbl = net->ipv4.route_hdr->ctl_table_arg;
3307 unregister_net_sysctl_table(net->ipv4.route_hdr);
3308 BUG_ON(tbl == ipv4_route_flush_table);
3309 kfree(tbl);
3310}
3311
3312static __net_initdata struct pernet_operations sysctl_route_ops = {
3313 .init = sysctl_route_net_init,
3314 .exit = sysctl_route_net_exit,
3315};
1da177e4
LT
3316#endif
3317
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};


#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);

3343
3344int __init ip_rt_init(void)
3345{
424c4b70 3346 int rc = 0;
1da177e4 3347
c7066f70 3348#ifdef CONFIG_IP_ROUTE_CLASSID
0dcec8c2 3349 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
1da177e4
LT
3350 if (!ip_rt_acct)
3351 panic("IP: failed to allocate ip_rt_acct\n");
1da177e4
LT
3352#endif
3353
e5d679f3
AD
3354 ipv4_dst_ops.kmem_cachep =
3355 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
20c2df83 3356 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1da177e4 3357
14e50e57
DM
3358 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3359
fc66f95c
ED
3360 if (dst_entries_init(&ipv4_dst_ops) < 0)
3361 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3362
3363 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3364 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3365
424c4b70
ED
3366 rt_hash_table = (struct rt_hash_bucket *)
3367 alloc_large_system_hash("IP route cache",
3368 sizeof(struct rt_hash_bucket),
3369 rhash_entries,
4481374c 3370 (totalram_pages >= 128 * 1024) ?
18955cfc 3371 15 : 17,
8d1502de 3372 0,
424c4b70
ED
3373 &rt_hash_log,
3374 &rt_hash_mask,
c9503e0f 3375 rhash_entries ? 0 : 512 * 1024);
22c047cc
ED
3376 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3377 rt_hash_lock_init();
1da177e4
LT
3378
3379 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3380 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3381
1da177e4
LT
3382 devinet_init();
3383 ip_fib_init();
3384
1da177e4
LT
3385 /* All the timers, started at system startup tend
3386 to synchronize. Perturb it a bit.
3387 */
125bb8f5
ED
3388 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3389 expires_ljiffies = jiffies;
39c90ece
ED
3390 schedule_delayed_work(&expires_work,
3391 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
1da177e4 3392
73b38711 3393 if (ip_rt_proc_init())
107f1634 3394 printk(KERN_ERR "Unable to create route proc files\n");
1da177e4
LT
3395#ifdef CONFIG_XFRM
3396 xfrm_init();
a33bc5c1 3397 xfrm4_init(ip_rt_max_size);
1da177e4 3398#endif
63f3444f
TG
3399 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3400
39a23e75
DL
3401#ifdef CONFIG_SYSCTL
3402 register_pernet_subsys(&sysctl_route_ops);
3403#endif
3ee94372 3404 register_pernet_subsys(&rt_genid_ops);
1da177e4
LT
3405 return rc;
3406}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif