]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blame - net/ipv4/route.c
tcp: add LINUX_MIB_TCPRETRANSFAIL counter
[mirror_ubuntu-bionic-kernel.git] / net / ipv4 / route.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
e905a9ed 21 * Alan Cox : Super /proc >4K
1da177e4
LT
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
e905a9ed 39 *
1da177e4
LT
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
1da177e4
LT
65#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
1da177e4 71#include <linux/mm.h>
424c4b70 72#include <linux/bootmem.h>
1da177e4
LT
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
39c90ece 82#include <linux/workqueue.h>
1da177e4 83#include <linux/skbuff.h>
1da177e4
LT
84#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
5a0e3ad6 93#include <linux/slab.h>
b9eda06f 94#include <linux/prefetch.h>
352e512c 95#include <net/dst.h>
457c4cbc 96#include <net/net_namespace.h>
1da177e4
LT
97#include <net/protocol.h>
98#include <net/ip.h>
99#include <net/route.h>
100#include <net/inetpeer.h>
101#include <net/sock.h>
102#include <net/ip_fib.h>
103#include <net/arp.h>
104#include <net/tcp.h>
105#include <net/icmp.h>
106#include <net/xfrm.h>
8d71740c 107#include <net/netevent.h>
63f3444f 108#include <net/rtnetlink.h>
1da177e4
LT
109#ifdef CONFIG_SYSCTL
110#include <linux/sysctl.h>
111#endif
6e5714ea 112#include <net/secure_seq.h>
1da177e4 113
/* Extract the routing-relevant TOS bits (plus the RTO_ONLINK flag bit)
 * from a flowi4 key. */
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* Largest MTU we will ever install in a route (just below 64K). */
#define IP_MAX_MTU	0xFFF0

/* Default garbage-collection timeout for cached routes (5 minutes). */
#define RT_GC_TIMEOUT (300*HZ)
1da177e4 121static int ip_rt_max_size;
817bc4db 122static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
9f28a2fc 123static int ip_rt_gc_interval __read_mostly = 60 * HZ;
817bc4db
SH
124static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
125static int ip_rt_redirect_number __read_mostly = 9;
126static int ip_rt_redirect_load __read_mostly = HZ / 50;
127static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128static int ip_rt_error_cost __read_mostly = HZ;
129static int ip_rt_error_burst __read_mostly = 5 * HZ;
130static int ip_rt_gc_elasticity __read_mostly = 8;
131static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
132static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
133static int ip_rt_min_advmss __read_mostly = 256;
1080d709 134static int rt_chain_length_max __read_mostly = 20;
de68dca1 135static int redirect_genid;
1da177e4 136
9f28a2fc
ED
137static struct delayed_work expires_work;
138static unsigned long expires_ljiffies;
139
1da177e4
LT
140/*
141 * Interface to generic destination cache.
142 */
143
144static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
0dbaee3b 145static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
ebb762f2 146static unsigned int ipv4_mtu(const struct dst_entry *dst);
1da177e4 147static void ipv4_dst_destroy(struct dst_entry *dst);
1da177e4
LT
148static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
149static void ipv4_link_failure(struct sk_buff *skb);
150static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
569d3645 151static int rt_garbage_collect(struct dst_ops *ops);
1da177e4 152
/* Deliberate no-op: IPv4 routes are flushed wholesale on device-down via
 * the cache invalidation path, so per-dst ifdown work is unnecessary. */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
1da177e4 157
62fa8a84
DM
158static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
159{
06582540
DM
160 struct rtable *rt = (struct rtable *) dst;
161 struct inet_peer *peer;
162 u32 *p = NULL;
163
164 if (!rt->peer)
a48eff12 165 rt_bind_peer(rt, rt->rt_dst, 1);
62fa8a84 166
06582540
DM
167 peer = rt->peer;
168 if (peer) {
62fa8a84
DM
169 u32 *old_p = __DST_METRICS_PTR(old);
170 unsigned long prev, new;
171
06582540
DM
172 p = peer->metrics;
173 if (inet_metrics_new(peer))
174 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
62fa8a84
DM
175
176 new = (unsigned long) p;
177 prev = cmpxchg(&dst->_metrics, old, new);
178
179 if (prev != old) {
62fa8a84
DM
180 p = __DST_METRICS_PTR(prev);
181 if (prev & DST_METRICS_READ_ONLY)
182 p = NULL;
183 } else {
62fa8a84
DM
184 if (rt->fi) {
185 fib_info_put(rt->fi);
186 rt->fi = NULL;
187 }
188 }
189 }
190 return p;
191}
192
d3aaeb38
DM
193static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
194
1da177e4
LT
195static struct dst_ops ipv4_dst_ops = {
196 .family = AF_INET,
09640e63 197 .protocol = cpu_to_be16(ETH_P_IP),
1da177e4
LT
198 .gc = rt_garbage_collect,
199 .check = ipv4_dst_check,
0dbaee3b 200 .default_advmss = ipv4_default_advmss,
ebb762f2 201 .mtu = ipv4_mtu,
62fa8a84 202 .cow_metrics = ipv4_cow_metrics,
1da177e4
LT
203 .destroy = ipv4_dst_destroy,
204 .ifdown = ipv4_dst_ifdown,
205 .negative_advice = ipv4_negative_advice,
206 .link_failure = ipv4_link_failure,
207 .update_pmtu = ip_rt_update_pmtu,
1ac06e03 208 .local_out = __ip_local_out,
d3aaeb38 209 .neigh_lookup = ipv4_neigh_lookup,
1da177e4
LT
210};
211
212#define ECN_OR_COST(class) TC_PRIO_##class
213
4839c52b 214const __u8 ip_tos2prio[16] = {
1da177e4 215 TC_PRIO_BESTEFFORT,
4a2b9c37 216 ECN_OR_COST(BESTEFFORT),
1da177e4
LT
217 TC_PRIO_BESTEFFORT,
218 ECN_OR_COST(BESTEFFORT),
219 TC_PRIO_BULK,
220 ECN_OR_COST(BULK),
221 TC_PRIO_BULK,
222 ECN_OR_COST(BULK),
223 TC_PRIO_INTERACTIVE,
224 ECN_OR_COST(INTERACTIVE),
225 TC_PRIO_INTERACTIVE,
226 ECN_OR_COST(INTERACTIVE),
227 TC_PRIO_INTERACTIVE_BULK,
228 ECN_OR_COST(INTERACTIVE_BULK),
229 TC_PRIO_INTERACTIVE_BULK,
230 ECN_OR_COST(INTERACTIVE_BULK)
231};
232
233
234/*
235 * Route cache.
236 */
237
238/* The locking scheme is rather straight forward:
239 *
240 * 1) Read-Copy Update protects the buckets of the central route hash.
241 * 2) Only writers remove entries, and they hold the lock
242 * as they look at rtable reference counts.
243 * 3) Only readers acquire references to rtable entries,
244 * they do so with atomic increments and with the
245 * lock held.
246 */
247
248struct rt_hash_bucket {
1c31720a 249 struct rtable __rcu *chain;
22c047cc 250};
1080d709 251
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
/* Map a hash bucket to its (shared) spinlock. */
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

/* Allocate and initialize the bucket-lock table; boot-time fatal on OOM. */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
/* UP without lock debugging: writers need no bucket locks at all. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
1da177e4 297
817bc4db
SH
298static struct rt_hash_bucket *rt_hash_table __read_mostly;
299static unsigned rt_hash_mask __read_mostly;
300static unsigned int rt_hash_log __read_mostly;
1da177e4 301
2f970d83 302static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
27f39c73 303#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
1da177e4 304
b00180de 305static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
0eae88f3 306 int genid)
1da177e4 307{
0eae88f3 308 return jhash_3words((__force u32)daddr, (__force u32)saddr,
b00180de 309 idx, genid)
29e75252 310 & rt_hash_mask;
1da177e4
LT
311}
312
e84f84f2
DL
313static inline int rt_genid(struct net *net)
314{
315 return atomic_read(&net->ipv4.rt_genid);
316}
317
1da177e4
LT
318#ifdef CONFIG_PROC_FS
319struct rt_cache_iter_state {
a75e936f 320 struct seq_net_private p;
1da177e4 321 int bucket;
29e75252 322 int genid;
1da177e4
LT
323};
324
1218854a 325static struct rtable *rt_cache_get_first(struct seq_file *seq)
1da177e4 326{
1218854a 327 struct rt_cache_iter_state *st = seq->private;
1da177e4 328 struct rtable *r = NULL;
1da177e4
LT
329
330 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
33d480ce 331 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
a6272665 332 continue;
1da177e4 333 rcu_read_lock_bh();
a898def2 334 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
29e75252 335 while (r) {
d8d1f30b 336 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
a75e936f 337 r->rt_genid == st->genid)
29e75252 338 return r;
d8d1f30b 339 r = rcu_dereference_bh(r->dst.rt_next);
29e75252 340 }
1da177e4
LT
341 rcu_read_unlock_bh();
342 }
29e75252 343 return r;
1da177e4
LT
344}
345
1218854a 346static struct rtable *__rt_cache_get_next(struct seq_file *seq,
642d6318 347 struct rtable *r)
1da177e4 348{
1218854a 349 struct rt_cache_iter_state *st = seq->private;
a6272665 350
1c31720a 351 r = rcu_dereference_bh(r->dst.rt_next);
1da177e4
LT
352 while (!r) {
353 rcu_read_unlock_bh();
a6272665
ED
354 do {
355 if (--st->bucket < 0)
356 return NULL;
33d480ce 357 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
1da177e4 358 rcu_read_lock_bh();
1c31720a 359 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
1da177e4 360 }
1c31720a 361 return r;
1da177e4
LT
362}
363
1218854a 364static struct rtable *rt_cache_get_next(struct seq_file *seq,
642d6318
DL
365 struct rtable *r)
366{
1218854a
YH
367 struct rt_cache_iter_state *st = seq->private;
368 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
d8d1f30b 369 if (dev_net(r->dst.dev) != seq_file_net(seq))
a75e936f 370 continue;
642d6318
DL
371 if (r->rt_genid == st->genid)
372 break;
373 }
374 return r;
375}
376
1218854a 377static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
1da177e4 378{
1218854a 379 struct rtable *r = rt_cache_get_first(seq);
1da177e4
LT
380
381 if (r)
1218854a 382 while (pos && (r = rt_cache_get_next(seq, r)))
1da177e4
LT
383 --pos;
384 return pos ? NULL : r;
385}
386
387static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
388{
29e75252 389 struct rt_cache_iter_state *st = seq->private;
29e75252 390 if (*pos)
1218854a 391 return rt_cache_get_idx(seq, *pos - 1);
e84f84f2 392 st->genid = rt_genid(seq_file_net(seq));
29e75252 393 return SEQ_START_TOKEN;
1da177e4
LT
394}
395
396static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
397{
29e75252 398 struct rtable *r;
1da177e4
LT
399
400 if (v == SEQ_START_TOKEN)
1218854a 401 r = rt_cache_get_first(seq);
1da177e4 402 else
1218854a 403 r = rt_cache_get_next(seq, v);
1da177e4
LT
404 ++*pos;
405 return r;
406}
407
408static void rt_cache_seq_stop(struct seq_file *seq, void *v)
409{
410 if (v && v != SEQ_START_TOKEN)
411 rcu_read_unlock_bh();
412}
413
414static int rt_cache_seq_show(struct seq_file *seq, void *v)
415{
416 if (v == SEQ_START_TOKEN)
417 seq_printf(seq, "%-127s\n",
418 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
419 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
420 "HHUptod\tSpecDst");
421 else {
422 struct rtable *r = v;
69cce1d1 423 struct neighbour *n;
218fa90f 424 int len, HHUptod;
1da177e4 425
218fa90f 426 rcu_read_lock();
27217455 427 n = dst_get_neighbour_noref(&r->dst);
218fa90f
ED
428 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
429 rcu_read_unlock();
430
0eae88f3
ED
431 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
432 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
d8d1f30b 433 r->dst.dev ? r->dst.dev->name : "*",
0eae88f3
ED
434 (__force u32)r->rt_dst,
435 (__force u32)r->rt_gateway,
d8d1f30b
CG
436 r->rt_flags, atomic_read(&r->dst.__refcnt),
437 r->dst.__use, 0, (__force u32)r->rt_src,
0dbaee3b 438 dst_metric_advmss(&r->dst) + 40,
d8d1f30b
CG
439 dst_metric(&r->dst, RTAX_WINDOW),
440 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
441 dst_metric(&r->dst, RTAX_RTTVAR)),
475949d8 442 r->rt_key_tos,
f6b72b62 443 -1,
218fa90f 444 HHUptod,
5e659e4c
PE
445 r->rt_spec_dst, &len);
446
447 seq_printf(seq, "%*s\n", 127 - len, "");
e905a9ed
YH
448 }
449 return 0;
1da177e4
LT
450}
451
f690808e 452static const struct seq_operations rt_cache_seq_ops = {
1da177e4
LT
453 .start = rt_cache_seq_start,
454 .next = rt_cache_seq_next,
455 .stop = rt_cache_seq_stop,
456 .show = rt_cache_seq_show,
457};
458
459static int rt_cache_seq_open(struct inode *inode, struct file *file)
460{
a75e936f 461 return seq_open_net(inode, file, &rt_cache_seq_ops,
cf7732e4 462 sizeof(struct rt_cache_iter_state));
1da177e4
LT
463}
464
9a32144e 465static const struct file_operations rt_cache_seq_fops = {
1da177e4
LT
466 .owner = THIS_MODULE,
467 .open = rt_cache_seq_open,
468 .read = seq_read,
469 .llseek = seq_lseek,
a75e936f 470 .release = seq_release_net,
1da177e4
LT
471};
472
473
474static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
475{
476 int cpu;
477
478 if (*pos == 0)
479 return SEQ_START_TOKEN;
480
0f23174a 481 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
482 if (!cpu_possible(cpu))
483 continue;
484 *pos = cpu+1;
2f970d83 485 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
486 }
487 return NULL;
488}
489
490static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
491{
492 int cpu;
493
0f23174a 494 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
495 if (!cpu_possible(cpu))
496 continue;
497 *pos = cpu+1;
2f970d83 498 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
499 }
500 return NULL;
e905a9ed 501
1da177e4
LT
502}
503
/* Nothing to release: the per-cpu stats walk takes no locks. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
508
509static int rt_cpu_seq_show(struct seq_file *seq, void *v)
510{
511 struct rt_cache_stat *st = v;
512
513 if (v == SEQ_START_TOKEN) {
5bec0039 514 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
1da177e4
LT
515 return 0;
516 }
e905a9ed 517
1da177e4
LT
518 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
519 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
fc66f95c 520 dst_entries_get_slow(&ipv4_dst_ops),
1da177e4
LT
521 st->in_hit,
522 st->in_slow_tot,
523 st->in_slow_mc,
524 st->in_no_route,
525 st->in_brd,
526 st->in_martian_dst,
527 st->in_martian_src,
528
529 st->out_hit,
530 st->out_slow_tot,
e905a9ed 531 st->out_slow_mc,
1da177e4
LT
532
533 st->gc_total,
534 st->gc_ignored,
535 st->gc_goal_miss,
536 st->gc_dst_overflow,
537 st->in_hlist_search,
538 st->out_hlist_search
539 );
540 return 0;
541}
542
f690808e 543static const struct seq_operations rt_cpu_seq_ops = {
1da177e4
LT
544 .start = rt_cpu_seq_start,
545 .next = rt_cpu_seq_next,
546 .stop = rt_cpu_seq_stop,
547 .show = rt_cpu_seq_show,
548};
549
550
551static int rt_cpu_seq_open(struct inode *inode, struct file *file)
552{
553 return seq_open(file, &rt_cpu_seq_ops);
554}
555
9a32144e 556static const struct file_operations rt_cpu_seq_fops = {
1da177e4
LT
557 .owner = THIS_MODULE,
558 .open = rt_cpu_seq_open,
559 .read = seq_read,
560 .llseek = seq_lseek,
561 .release = seq_release,
562};
563
c7066f70 564#ifdef CONFIG_IP_ROUTE_CLASSID
a661c419 565static int rt_acct_proc_show(struct seq_file *m, void *v)
78c686e9 566{
a661c419
AD
567 struct ip_rt_acct *dst, *src;
568 unsigned int i, j;
569
570 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
571 if (!dst)
572 return -ENOMEM;
573
574 for_each_possible_cpu(i) {
575 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
576 for (j = 0; j < 256; j++) {
577 dst[j].o_bytes += src[j].o_bytes;
578 dst[j].o_packets += src[j].o_packets;
579 dst[j].i_bytes += src[j].i_bytes;
580 dst[j].i_packets += src[j].i_packets;
581 }
78c686e9
PE
582 }
583
a661c419
AD
584 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
585 kfree(dst);
586 return 0;
587}
78c686e9 588
a661c419
AD
589static int rt_acct_proc_open(struct inode *inode, struct file *file)
590{
591 return single_open(file, rt_acct_proc_show, NULL);
78c686e9 592}
a661c419
AD
593
594static const struct file_operations rt_acct_proc_fops = {
595 .owner = THIS_MODULE,
596 .open = rt_acct_proc_open,
597 .read = seq_read,
598 .llseek = seq_lseek,
599 .release = single_release,
600};
78c686e9 601#endif
107f1634 602
73b38711 603static int __net_init ip_rt_do_proc_init(struct net *net)
107f1634
PE
604{
605 struct proc_dir_entry *pde;
606
607 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
608 &rt_cache_seq_fops);
609 if (!pde)
610 goto err1;
611
77020720
WC
612 pde = proc_create("rt_cache", S_IRUGO,
613 net->proc_net_stat, &rt_cpu_seq_fops);
107f1634
PE
614 if (!pde)
615 goto err2;
616
c7066f70 617#ifdef CONFIG_IP_ROUTE_CLASSID
a661c419 618 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
107f1634
PE
619 if (!pde)
620 goto err3;
621#endif
622 return 0;
623
c7066f70 624#ifdef CONFIG_IP_ROUTE_CLASSID
107f1634
PE
625err3:
626 remove_proc_entry("rt_cache", net->proc_net_stat);
627#endif
628err2:
629 remove_proc_entry("rt_cache", net->proc_net);
630err1:
631 return -ENOMEM;
632}
73b38711
DL
633
634static void __net_exit ip_rt_do_proc_exit(struct net *net)
635{
636 remove_proc_entry("rt_cache", net->proc_net_stat);
637 remove_proc_entry("rt_cache", net->proc_net);
c7066f70 638#ifdef CONFIG_IP_ROUTE_CLASSID
73b38711 639 remove_proc_entry("rt_acct", net->proc_net);
0a931acf 640#endif
73b38711
DL
641}
642
643static struct pernet_operations ip_rt_proc_ops __net_initdata = {
644 .init = ip_rt_do_proc_init,
645 .exit = ip_rt_do_proc_exit,
646};
647
648static int __init ip_rt_proc_init(void)
649{
650 return register_pernet_subsys(&ip_rt_proc_ops);
651}
652
107f1634 653#else
73b38711 654static inline int ip_rt_proc_init(void)
107f1634
PE
655{
656 return 0;
657}
1da177e4 658#endif /* CONFIG_PROC_FS */
e905a9ed 659
5969f71d 660static inline void rt_free(struct rtable *rt)
1da177e4 661{
d8d1f30b 662 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
1da177e4
LT
663}
664
5969f71d 665static inline void rt_drop(struct rtable *rt)
1da177e4 666{
1da177e4 667 ip_rt_put(rt);
d8d1f30b 668 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
1da177e4
LT
669}
670
5969f71d 671static inline int rt_fast_clean(struct rtable *rth)
1da177e4
LT
672{
673 /* Kill broadcast/multicast entries very aggresively, if they
674 collide in hash table with more useful entries */
675 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
c7537967 676 rt_is_input_route(rth) && rth->dst.rt_next;
1da177e4
LT
677}
678
5969f71d 679static inline int rt_valuable(struct rtable *rth)
1da177e4
LT
680{
681 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
2c8cec5c 682 (rth->peer && rth->peer->pmtu_expires);
1da177e4
LT
683}
684
685static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
686{
687 unsigned long age;
688 int ret = 0;
689
d8d1f30b 690 if (atomic_read(&rth->dst.__refcnt))
1da177e4
LT
691 goto out;
692
d8d1f30b 693 age = jiffies - rth->dst.lastuse;
1da177e4
LT
694 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
695 (age <= tmo2 && rt_valuable(rth)))
696 goto out;
697 ret = 1;
698out: return ret;
699}
700
701/* Bits of score are:
702 * 31: very valuable
703 * 30: not quite useless
704 * 29..0: usage counter
705 */
706static inline u32 rt_score(struct rtable *rt)
707{
d8d1f30b 708 u32 score = jiffies - rt->dst.lastuse;
1da177e4
LT
709
710 score = ~score & ~(3<<30);
711
712 if (rt_valuable(rt))
713 score |= (1<<31);
714
c7537967 715 if (rt_is_output_route(rt) ||
1da177e4
LT
716 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
717 score |= (1<<30);
718
719 return score;
720}
721
1080d709
NH
722static inline bool rt_caching(const struct net *net)
723{
724 return net->ipv4.current_rt_cache_rebuild_count <=
725 net->ipv4.sysctl_rt_cache_rebuild_count;
726}
727
5e2b61f7
DM
728static inline bool compare_hash_inputs(const struct rtable *rt1,
729 const struct rtable *rt2)
1080d709 730{
5e2b61f7
DM
731 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
732 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
97a80410 733 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
1080d709
NH
734}
735
5e2b61f7 736static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
1da177e4 737{
5e2b61f7
DM
738 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
739 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
740 (rt1->rt_mark ^ rt2->rt_mark) |
475949d8 741 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
d547f727 742 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
97a80410 743 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
1da177e4
LT
744}
745
b5921910
DL
746static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
747{
d8d1f30b 748 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
b5921910
DL
749}
750
e84f84f2
DL
751static inline int rt_is_expired(struct rtable *rth)
752{
d8d1f30b 753 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
e84f84f2
DL
754}
755
beb659bd
ED
756/*
757 * Perform a full scan of hash table and free all entries.
758 * Can be called by a softirq or a process.
759 * In the later case, we want to be reschedule if necessary
760 */
6561a3b1 761static void rt_do_flush(struct net *net, int process_context)
beb659bd
ED
762{
763 unsigned int i;
764 struct rtable *rth, *next;
765
766 for (i = 0; i <= rt_hash_mask; i++) {
6561a3b1
DM
767 struct rtable __rcu **pprev;
768 struct rtable *list;
769
beb659bd
ED
770 if (process_context && need_resched())
771 cond_resched();
33d480ce 772 rth = rcu_access_pointer(rt_hash_table[i].chain);
beb659bd
ED
773 if (!rth)
774 continue;
775
776 spin_lock_bh(rt_hash_lock_addr(i));
32cb5b4e 777
6561a3b1
DM
778 list = NULL;
779 pprev = &rt_hash_table[i].chain;
780 rth = rcu_dereference_protected(*pprev,
1c31720a 781 lockdep_is_held(rt_hash_lock_addr(i)));
32cb5b4e 782
6561a3b1
DM
783 while (rth) {
784 next = rcu_dereference_protected(rth->dst.rt_next,
1c31720a 785 lockdep_is_held(rt_hash_lock_addr(i)));
6561a3b1
DM
786
787 if (!net ||
788 net_eq(dev_net(rth->dst.dev), net)) {
789 rcu_assign_pointer(*pprev, next);
790 rcu_assign_pointer(rth->dst.rt_next, list);
791 list = rth;
32cb5b4e 792 } else {
6561a3b1 793 pprev = &rth->dst.rt_next;
32cb5b4e 794 }
6561a3b1 795 rth = next;
32cb5b4e 796 }
6561a3b1 797
beb659bd
ED
798 spin_unlock_bh(rt_hash_lock_addr(i));
799
6561a3b1
DM
800 for (; list; list = next) {
801 next = rcu_dereference_protected(list->dst.rt_next, 1);
802 rt_free(list);
beb659bd
ED
803 }
804 }
805}
806
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This to have an estimation of rt_chain_length_max
 *   rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

98376387
ED
818/*
819 * Given a hash chain and an item in this hash chain,
820 * find if a previous entry has the same hash_inputs
821 * (but differs on tos, mark or oif)
822 * Returns 0 if an alias is found.
823 * Returns ONE if rth has no alias before itself.
824 */
825static int has_noalias(const struct rtable *head, const struct rtable *rth)
826{
827 const struct rtable *aux = head;
828
829 while (aux != rth) {
5e2b61f7 830 if (compare_hash_inputs(aux, rth))
98376387 831 return 0;
1c31720a 832 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
98376387
ED
833 }
834 return ONE;
835}
836
9f28a2fc
ED
837static void rt_check_expire(void)
838{
839 static unsigned int rover;
840 unsigned int i = rover, goal;
841 struct rtable *rth;
842 struct rtable __rcu **rthp;
843 unsigned long samples = 0;
844 unsigned long sum = 0, sum2 = 0;
845 unsigned long delta;
846 u64 mult;
847
848 delta = jiffies - expires_ljiffies;
849 expires_ljiffies = jiffies;
850 mult = ((u64)delta) << rt_hash_log;
851 if (ip_rt_gc_timeout > 1)
852 do_div(mult, ip_rt_gc_timeout);
853 goal = (unsigned int)mult;
854 if (goal > rt_hash_mask)
855 goal = rt_hash_mask + 1;
856 for (; goal > 0; goal--) {
857 unsigned long tmo = ip_rt_gc_timeout;
858 unsigned long length;
859
860 i = (i + 1) & rt_hash_mask;
861 rthp = &rt_hash_table[i].chain;
862
863 if (need_resched())
864 cond_resched();
865
866 samples++;
867
868 if (rcu_dereference_raw(*rthp) == NULL)
869 continue;
870 length = 0;
871 spin_lock_bh(rt_hash_lock_addr(i));
872 while ((rth = rcu_dereference_protected(*rthp,
873 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
874 prefetch(rth->dst.rt_next);
875 if (rt_is_expired(rth)) {
876 *rthp = rth->dst.rt_next;
877 rt_free(rth);
878 continue;
879 }
880 if (rth->dst.expires) {
881 /* Entry is expired even if it is in use */
882 if (time_before_eq(jiffies, rth->dst.expires)) {
883nofree:
884 tmo >>= 1;
885 rthp = &rth->dst.rt_next;
886 /*
887 * We only count entries on
888 * a chain with equal hash inputs once
889 * so that entries for different QOS
890 * levels, and other non-hash input
891 * attributes don't unfairly skew
892 * the length computation
893 */
894 length += has_noalias(rt_hash_table[i].chain, rth);
895 continue;
896 }
897 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
898 goto nofree;
899
900 /* Cleanup aged off entries. */
901 *rthp = rth->dst.rt_next;
902 rt_free(rth);
903 }
904 spin_unlock_bh(rt_hash_lock_addr(i));
905 sum += length;
906 sum2 += length*length;
907 }
908 if (samples) {
909 unsigned long avg = sum / samples;
910 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
911 rt_chain_length_max = max_t(unsigned long,
912 ip_rt_gc_elasticity,
913 (avg + 4*sd) >> FRACT_BITS);
914 }
915 rover = i;
916}
917
918/*
919 * rt_worker_func() is run in process context.
920 * we call rt_check_expire() to scan part of the hash table
921 */
922static void rt_worker_func(struct work_struct *work)
923{
924 rt_check_expire();
925 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
926}
927
29e75252 928/*
25985edc 929 * Perturbation of rt_genid by a small quantity [1..256]
29e75252
ED
930 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
931 * many times (2^24) without giving recent rt_genid.
932 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
1da177e4 933 */
86c657f6 934static void rt_cache_invalidate(struct net *net)
1da177e4 935{
29e75252 936 unsigned char shuffle;
1da177e4 937
29e75252 938 get_random_bytes(&shuffle, sizeof(shuffle));
e84f84f2 939 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
de68dca1 940 redirect_genid++;
1da177e4
LT
941}
942
/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}
953
/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}
959
1080d709
NH
960static void rt_emergency_hash_rebuild(struct net *net)
961{
3ee94372 962 if (net_ratelimit())
1080d709 963 printk(KERN_WARNING "Route hash chain too long!\n");
3ee94372 964 rt_cache_invalidate(net);
1080d709
NH
965}
966
1da177e4
LT
967/*
968 Short description of GC goals.
969
970 We want to build algorithm, which will keep routing cache
971 at some equilibrium point, when number of aged off entries
972 is kept approximately equal to newly generated ones.
973
974 Current expiration strength is variable "expire".
975 We try to adjust it dynamically, so that if networking
976 is idle expires is large enough to keep enough of warm entries,
977 and when load increases it reduces to limit cache size.
978 */
979
569d3645 980static int rt_garbage_collect(struct dst_ops *ops)
1da177e4
LT
981{
982 static unsigned long expire = RT_GC_TIMEOUT;
983 static unsigned long last_gc;
984 static int rover;
985 static int equilibrium;
1c31720a
ED
986 struct rtable *rth;
987 struct rtable __rcu **rthp;
1da177e4
LT
988 unsigned long now = jiffies;
989 int goal;
fc66f95c 990 int entries = dst_entries_get_fast(&ipv4_dst_ops);
1da177e4
LT
991
992 /*
993 * Garbage collection is pretty expensive,
994 * do not make it too frequently.
995 */
996
997 RT_CACHE_STAT_INC(gc_total);
998
999 if (now - last_gc < ip_rt_gc_min_interval &&
fc66f95c 1000 entries < ip_rt_max_size) {
1da177e4
LT
1001 RT_CACHE_STAT_INC(gc_ignored);
1002 goto out;
1003 }
1004
fc66f95c 1005 entries = dst_entries_get_slow(&ipv4_dst_ops);
1da177e4 1006 /* Calculate number of entries, which we want to expire now. */
fc66f95c 1007 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1da177e4
LT
1008 if (goal <= 0) {
1009 if (equilibrium < ipv4_dst_ops.gc_thresh)
1010 equilibrium = ipv4_dst_ops.gc_thresh;
fc66f95c 1011 goal = entries - equilibrium;
1da177e4 1012 if (goal > 0) {
b790cedd 1013 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
fc66f95c 1014 goal = entries - equilibrium;
1da177e4
LT
1015 }
1016 } else {
1017 /* We are in dangerous area. Try to reduce cache really
1018 * aggressively.
1019 */
b790cedd 1020 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
fc66f95c 1021 equilibrium = entries - goal;
1da177e4
LT
1022 }
1023
1024 if (now - last_gc >= ip_rt_gc_min_interval)
1025 last_gc = now;
1026
1027 if (goal <= 0) {
1028 equilibrium += goal;
1029 goto work_done;
1030 }
1031
1032 do {
1033 int i, k;
1034
1035 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1036 unsigned long tmo = expire;
1037
1038 k = (k + 1) & rt_hash_mask;
1039 rthp = &rt_hash_table[k].chain;
22c047cc 1040 spin_lock_bh(rt_hash_lock_addr(k));
1c31720a
ED
1041 while ((rth = rcu_dereference_protected(*rthp,
1042 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
e84f84f2 1043 if (!rt_is_expired(rth) &&
29e75252 1044 !rt_may_expire(rth, tmo, expire)) {
1da177e4 1045 tmo >>= 1;
d8d1f30b 1046 rthp = &rth->dst.rt_next;
1da177e4
LT
1047 continue;
1048 }
d8d1f30b 1049 *rthp = rth->dst.rt_next;
1da177e4
LT
1050 rt_free(rth);
1051 goal--;
1da177e4 1052 }
22c047cc 1053 spin_unlock_bh(rt_hash_lock_addr(k));
1da177e4
LT
1054 if (goal <= 0)
1055 break;
1056 }
1057 rover = k;
1058
1059 if (goal <= 0)
1060 goto work_done;
1061
1062 /* Goal is not achieved. We stop process if:
1063
1064 - if expire reduced to zero. Otherwise, expire is halfed.
1065 - if table is not full.
1066 - if we are called from interrupt.
1067 - jiffies check is just fallback/debug loop breaker.
1068 We will not spin here for long time in any case.
1069 */
1070
1071 RT_CACHE_STAT_INC(gc_goal_miss);
1072
1073 if (expire == 0)
1074 break;
1075
1076 expire >>= 1;
1da177e4 1077
fc66f95c 1078 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1da177e4
LT
1079 goto out;
1080 } while (!in_softirq() && time_before_eq(jiffies, now));
1081
fc66f95c
ED
1082 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1083 goto out;
1084 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1da177e4
LT
1085 goto out;
1086 if (net_ratelimit())
1087 printk(KERN_WARNING "dst cache overflow\n");
1088 RT_CACHE_STAT_INC(gc_dst_overflow);
1089 return 1;
1090
1091work_done:
1092 expire += ip_rt_gc_min_interval;
1093 if (expire > ip_rt_gc_timeout ||
fc66f95c
ED
1094 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1095 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1da177e4 1096 expire = ip_rt_gc_timeout;
1da177e4
LT
1097out: return 0;
1098}
1099
98376387
ED
1100/*
1101 * Returns number of entries in a hash chain that have different hash_inputs
1102 */
1103static int slow_chain_length(const struct rtable *head)
1104{
1105 int length = 0;
1106 const struct rtable *rth = head;
1107
1108 while (rth) {
1109 length += has_noalias(head, rth);
1c31720a 1110 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
98376387
ED
1111 }
1112 return length >> FRACT_BITS;
1113}
1114
d3aaeb38 1115static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
3769cffb 1116{
d3aaeb38
DM
1117 static const __be32 inaddr_any = 0;
1118 struct net_device *dev = dst->dev;
1119 const __be32 *pkey = daddr;
3769cffb
DM
1120 struct neighbour *n;
1121
3769cffb 1122 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
d3aaeb38
DM
1123 pkey = &inaddr_any;
1124
32092ecf 1125 n = __ipv4_neigh_lookup(&arp_tbl, dev, *(__force u32 *)pkey);
d3aaeb38
DM
1126 if (n)
1127 return n;
32092ecf 1128 return neigh_create(&arp_tbl, pkey, dev);
d3aaeb38
DM
1129}
1130
1131static int rt_bind_neighbour(struct rtable *rt)
1132{
1133 struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
3769cffb
DM
1134 if (IS_ERR(n))
1135 return PTR_ERR(n);
69cce1d1 1136 dst_set_neighbour(&rt->dst, n);
3769cffb
DM
1137
1138 return 0;
1139}
1140
b23dd4fe
DM
1141static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1142 struct sk_buff *skb, int ifindex)
1da177e4 1143{
1c31720a
ED
1144 struct rtable *rth, *cand;
1145 struct rtable __rcu **rthp, **candp;
1da177e4 1146 unsigned long now;
1da177e4
LT
1147 u32 min_score;
1148 int chain_length;
1149 int attempts = !in_softirq();
1150
1151restart:
1152 chain_length = 0;
1153 min_score = ~(u32)0;
1154 cand = NULL;
1155 candp = NULL;
1156 now = jiffies;
1157
d8d1f30b 1158 if (!rt_caching(dev_net(rt->dst.dev))) {
73e42897
NH
1159 /*
1160 * If we're not caching, just tell the caller we
1161 * were successful and don't touch the route. The
1162 * caller hold the sole reference to the cache entry, and
1163 * it will be released when the caller is done with it.
1164 * If we drop it here, the callers have no way to resolve routes
1165 * when we're not caching. Instead, just point *rp at rt, so
1166 * the caller gets a single use out of the route
b6280b47
NH
1167 * Note that we do rt_free on this new route entry, so that
1168 * once its refcount hits zero, we are still able to reap it
1169 * (Thanks Alexey)
27b75c95
ED
1170 * Note: To avoid expensive rcu stuff for this uncached dst,
1171 * we set DST_NOCACHE so that dst_release() can free dst without
1172 * waiting a grace period.
73e42897 1173 */
b6280b47 1174
c7d4426a 1175 rt->dst.flags |= DST_NOCACHE;
c7537967 1176 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
3769cffb 1177 int err = rt_bind_neighbour(rt);
b6280b47
NH
1178 if (err) {
1179 if (net_ratelimit())
1180 printk(KERN_WARNING
1181 "Neighbour table failure & not caching routes.\n");
27b75c95 1182 ip_rt_put(rt);
b23dd4fe 1183 return ERR_PTR(err);
b6280b47
NH
1184 }
1185 }
1186
b6280b47 1187 goto skip_hashing;
1080d709
NH
1188 }
1189
1da177e4
LT
1190 rthp = &rt_hash_table[hash].chain;
1191
22c047cc 1192 spin_lock_bh(rt_hash_lock_addr(hash));
1c31720a
ED
1193 while ((rth = rcu_dereference_protected(*rthp,
1194 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
e84f84f2 1195 if (rt_is_expired(rth)) {
d8d1f30b 1196 *rthp = rth->dst.rt_next;
29e75252
ED
1197 rt_free(rth);
1198 continue;
1199 }
5e2b61f7 1200 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1da177e4 1201 /* Put it first */
d8d1f30b 1202 *rthp = rth->dst.rt_next;
1da177e4
LT
1203 /*
1204 * Since lookup is lockfree, the deletion
1205 * must be visible to another weakly ordered CPU before
1206 * the insertion at the start of the hash chain.
1207 */
d8d1f30b 1208 rcu_assign_pointer(rth->dst.rt_next,
1da177e4
LT
1209 rt_hash_table[hash].chain);
1210 /*
1211 * Since lookup is lockfree, the update writes
1212 * must be ordered for consistency on SMP.
1213 */
1214 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1215
d8d1f30b 1216 dst_use(&rth->dst, now);
22c047cc 1217 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1218
1219 rt_drop(rt);
b23dd4fe 1220 if (skb)
d8d1f30b 1221 skb_dst_set(skb, &rth->dst);
b23dd4fe 1222 return rth;
1da177e4
LT
1223 }
1224
d8d1f30b 1225 if (!atomic_read(&rth->dst.__refcnt)) {
1da177e4
LT
1226 u32 score = rt_score(rth);
1227
1228 if (score <= min_score) {
1229 cand = rth;
1230 candp = rthp;
1231 min_score = score;
1232 }
1233 }
1234
1235 chain_length++;
1236
d8d1f30b 1237 rthp = &rth->dst.rt_next;
1da177e4
LT
1238 }
1239
1240 if (cand) {
1241 /* ip_rt_gc_elasticity used to be average length of chain
1242 * length, when exceeded gc becomes really aggressive.
1243 *
1244 * The second limit is less certain. At the moment it allows
1245 * only 2 entries per bucket. We will see.
1246 */
1247 if (chain_length > ip_rt_gc_elasticity) {
d8d1f30b 1248 *candp = cand->dst.rt_next;
1da177e4
LT
1249 rt_free(cand);
1250 }
1080d709 1251 } else {
98376387
ED
1252 if (chain_length > rt_chain_length_max &&
1253 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
d8d1f30b 1254 struct net *net = dev_net(rt->dst.dev);
1080d709 1255 int num = ++net->ipv4.current_rt_cache_rebuild_count;
b35ecb5d 1256 if (!rt_caching(net)) {
1080d709 1257 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
d8d1f30b 1258 rt->dst.dev->name, num);
1080d709 1259 }
b35ecb5d 1260 rt_emergency_hash_rebuild(net);
6a2bad70
PE
1261 spin_unlock_bh(rt_hash_lock_addr(hash));
1262
5e2b61f7 1263 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
6a2bad70
PE
1264 ifindex, rt_genid(net));
1265 goto restart;
1080d709 1266 }
1da177e4
LT
1267 }
1268
1269 /* Try to bind route to arp only if it is output
1270 route or unicast forwarding path.
1271 */
c7537967 1272 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
3769cffb 1273 int err = rt_bind_neighbour(rt);
1da177e4 1274 if (err) {
22c047cc 1275 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1276
1277 if (err != -ENOBUFS) {
1278 rt_drop(rt);
b23dd4fe 1279 return ERR_PTR(err);
1da177e4
LT
1280 }
1281
1282 /* Neighbour tables are full and nothing
1283 can be released. Try to shrink route cache,
1284 it is most likely it holds some neighbour records.
1285 */
1286 if (attempts-- > 0) {
1287 int saved_elasticity = ip_rt_gc_elasticity;
1288 int saved_int = ip_rt_gc_min_interval;
1289 ip_rt_gc_elasticity = 1;
1290 ip_rt_gc_min_interval = 0;
569d3645 1291 rt_garbage_collect(&ipv4_dst_ops);
1da177e4
LT
1292 ip_rt_gc_min_interval = saved_int;
1293 ip_rt_gc_elasticity = saved_elasticity;
1294 goto restart;
1295 }
1296
1297 if (net_ratelimit())
7e1b33e5 1298 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1da177e4 1299 rt_drop(rt);
b23dd4fe 1300 return ERR_PTR(-ENOBUFS);
1da177e4
LT
1301 }
1302 }
1303
d8d1f30b 1304 rt->dst.rt_next = rt_hash_table[hash].chain;
1080d709 1305
00269b54
ED
1306 /*
1307 * Since lookup is lockfree, we must make sure
25985edc 1308 * previous writes to rt are committed to memory
00269b54
ED
1309 * before making rt visible to other CPUS.
1310 */
1ddbcb00 1311 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1080d709 1312
22c047cc 1313 spin_unlock_bh(rt_hash_lock_addr(hash));
73e42897 1314
b6280b47 1315skip_hashing:
b23dd4fe 1316 if (skb)
d8d1f30b 1317 skb_dst_set(skb, &rt->dst);
b23dd4fe 1318 return rt;
1da177e4
LT
1319}
1320
6431cbc2
DM
1321static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1322
1323static u32 rt_peer_genid(void)
1324{
1325 return atomic_read(&__rt_peer_genid);
1326}
1327
a48eff12 1328void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1da177e4 1329{
1da177e4
LT
1330 struct inet_peer *peer;
1331
a48eff12 1332 peer = inet_getpeer_v4(daddr, create);
1da177e4 1333
49e8ab03 1334 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1da177e4 1335 inet_putpeer(peer);
6431cbc2
DM
1336 else
1337 rt->rt_peer_genid = rt_peer_genid();
1da177e4
LT
1338}
1339
1340/*
1341 * Peer allocation may fail only in serious out-of-memory conditions. However
1342 * we still can generate some output.
1343 * Random ID selection looks a bit dangerous because we have no chances to
1344 * select ID being unique in a reasonable period of time.
1345 * But broken packet identifier may be better than no packet at all.
1346 */
1347static void ip_select_fb_ident(struct iphdr *iph)
1348{
1349 static DEFINE_SPINLOCK(ip_fb_id_lock);
1350 static u32 ip_fallback_id;
1351 u32 salt;
1352
1353 spin_lock_bh(&ip_fb_id_lock);
e448515c 1354 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1da177e4
LT
1355 iph->id = htons(salt & 0xFFFF);
1356 ip_fallback_id = salt;
1357 spin_unlock_bh(&ip_fb_id_lock);
1358}
1359
1360void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1361{
1362 struct rtable *rt = (struct rtable *) dst;
1363
e688a604 1364 if (rt && !(rt->dst.flags & DST_NOPEER)) {
1da177e4 1365 if (rt->peer == NULL)
a48eff12 1366 rt_bind_peer(rt, rt->rt_dst, 1);
1da177e4
LT
1367
1368 /* If peer is attached to destination, it is never detached,
1369 so that we need not to grab a lock to dereference it.
1370 */
1371 if (rt->peer) {
1372 iph->id = htons(inet_getid(rt->peer, more));
1373 return;
1374 }
e688a604 1375 } else if (!rt)
e905a9ed 1376 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
9c2b3328 1377 __builtin_return_address(0));
1da177e4
LT
1378
1379 ip_select_fb_ident(iph);
1380}
4bc2f18b 1381EXPORT_SYMBOL(__ip_select_ident);
1da177e4
LT
1382
1383static void rt_del(unsigned hash, struct rtable *rt)
1384{
1c31720a
ED
1385 struct rtable __rcu **rthp;
1386 struct rtable *aux;
1da177e4 1387
29e75252 1388 rthp = &rt_hash_table[hash].chain;
22c047cc 1389 spin_lock_bh(rt_hash_lock_addr(hash));
1da177e4 1390 ip_rt_put(rt);
1c31720a
ED
1391 while ((aux = rcu_dereference_protected(*rthp,
1392 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
e84f84f2 1393 if (aux == rt || rt_is_expired(aux)) {
d8d1f30b 1394 *rthp = aux->dst.rt_next;
29e75252
ED
1395 rt_free(aux);
1396 continue;
1da177e4 1397 }
d8d1f30b 1398 rthp = &aux->dst.rt_next;
29e75252 1399 }
22c047cc 1400 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1401}
1402
de398fb8 1403static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
9cc20b26
ED
1404{
1405 struct rtable *rt = (struct rtable *) dst;
1406 __be32 orig_gw = rt->rt_gateway;
1407 struct neighbour *n, *old_n;
1408
1409 dst_confirm(&rt->dst);
1410
1411 rt->rt_gateway = peer->redirect_learned.a4;
1412
1413 n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
de398fb8
DM
1414 if (IS_ERR(n)) {
1415 rt->rt_gateway = orig_gw;
1416 return;
1417 }
9cc20b26
ED
1418 old_n = xchg(&rt->dst._neighbour, n);
1419 if (old_n)
1420 neigh_release(old_n);
de398fb8
DM
1421 if (!(n->nud_state & NUD_VALID)) {
1422 neigh_event_send(n, NULL);
9cc20b26
ED
1423 } else {
1424 rt->rt_flags |= RTCF_REDIRECTED;
1425 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1426 }
9cc20b26
ED
1427}
1428
ed7865a4 1429/* called in rcu_read_lock() section */
f7655229
AV
1430void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1431 __be32 saddr, struct net_device *dev)
1da177e4 1432{
7cc9150e 1433 int s, i;
ed7865a4 1434 struct in_device *in_dev = __in_dev_get_rcu(dev);
7cc9150e
FL
1435 __be32 skeys[2] = { saddr, 0 };
1436 int ikeys[2] = { dev->ifindex, 0 };
f39925db 1437 struct inet_peer *peer;
317805b8 1438 struct net *net;
1da177e4 1439
1da177e4
LT
1440 if (!in_dev)
1441 return;
1442
c346dca1 1443 net = dev_net(dev);
9d4fb27d
JP
1444 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1445 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1446 ipv4_is_zeronet(new_gw))
1da177e4
LT
1447 goto reject_redirect;
1448
1449 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1450 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1451 goto reject_redirect;
1452 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1453 goto reject_redirect;
1454 } else {
317805b8 1455 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1da177e4
LT
1456 goto reject_redirect;
1457 }
1458
7cc9150e
FL
1459 for (s = 0; s < 2; s++) {
1460 for (i = 0; i < 2; i++) {
9cc20b26
ED
1461 unsigned int hash;
1462 struct rtable __rcu **rthp;
1463 struct rtable *rt;
1464
1465 hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1466
1467 rthp = &rt_hash_table[hash].chain;
1468
1469 while ((rt = rcu_dereference(*rthp)) != NULL) {
1470 rthp = &rt->dst.rt_next;
1471
1472 if (rt->rt_key_dst != daddr ||
1473 rt->rt_key_src != skeys[s] ||
1474 rt->rt_oif != ikeys[i] ||
1475 rt_is_input_route(rt) ||
1476 rt_is_expired(rt) ||
1477 !net_eq(dev_net(rt->dst.dev), net) ||
1478 rt->dst.error ||
1479 rt->dst.dev != dev ||
1480 rt->rt_gateway != old_gw)
1481 continue;
e905a9ed 1482
9cc20b26
ED
1483 if (!rt->peer)
1484 rt_bind_peer(rt, rt->rt_dst, 1);
1da177e4 1485
9cc20b26
ED
1486 peer = rt->peer;
1487 if (peer) {
de68dca1
ED
1488 if (peer->redirect_learned.a4 != new_gw ||
1489 peer->redirect_genid != redirect_genid) {
9cc20b26 1490 peer->redirect_learned.a4 = new_gw;
de68dca1 1491 peer->redirect_genid = redirect_genid;
9cc20b26
ED
1492 atomic_inc(&__rt_peer_genid);
1493 }
1494 check_peer_redir(&rt->dst, peer);
1495 }
7cc9150e 1496 }
7cc9150e 1497 }
1da177e4 1498 }
1da177e4
LT
1499 return;
1500
1501reject_redirect:
1502#ifdef CONFIG_IP_ROUTE_VERBOSE
1503 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
673d57e7
HH
1504 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1505 " Advised path = %pI4 -> %pI4\n",
1506 &old_gw, dev->name, &new_gw,
1507 &saddr, &daddr);
1da177e4 1508#endif
ed7865a4 1509 ;
1da177e4
LT
1510}
1511
fe6fe792
ED
1512static bool peer_pmtu_expired(struct inet_peer *peer)
1513{
1514 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1515
1516 return orig &&
1517 time_after_eq(jiffies, orig) &&
1518 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1519}
1520
1521static bool peer_pmtu_cleaned(struct inet_peer *peer)
1522{
1523 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1524
1525 return orig &&
1526 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1527}
1528
1da177e4
LT
1529static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1530{
ee6b9673 1531 struct rtable *rt = (struct rtable *)dst;
1da177e4
LT
1532 struct dst_entry *ret = dst;
1533
1534 if (rt) {
d11a4dc1 1535 if (dst->obsolete > 0) {
1da177e4
LT
1536 ip_rt_put(rt);
1537 ret = NULL;
2c8cec5c 1538 } else if (rt->rt_flags & RTCF_REDIRECTED) {
5e2b61f7
DM
1539 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1540 rt->rt_oif,
e84f84f2 1541 rt_genid(dev_net(dst->dev)));
1da177e4
LT
1542 rt_del(hash, rt);
1543 ret = NULL;
fe6fe792
ED
1544 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1545 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1da177e4
LT
1546 }
1547 }
1548 return ret;
1549}
1550
1551/*
1552 * Algorithm:
1553 * 1. The first ip_rt_redirect_number redirects are sent
1554 * with exponential backoff, then we stop sending them at all,
1555 * assuming that the host ignores our redirects.
1556 * 2. If we did not see packets requiring redirects
1557 * during ip_rt_redirect_silence, we assume that the host
1558 * forgot redirected route and start to send redirects again.
1559 *
1560 * This algorithm is much cheaper and more intelligent than dumb load limiting
1561 * in icmp.c.
1562 *
1563 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1564 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1565 */
1566
1567void ip_rt_send_redirect(struct sk_buff *skb)
1568{
511c3f92 1569 struct rtable *rt = skb_rtable(skb);
30038fc6 1570 struct in_device *in_dev;
92d86829 1571 struct inet_peer *peer;
30038fc6 1572 int log_martians;
1da177e4 1573
30038fc6 1574 rcu_read_lock();
d8d1f30b 1575 in_dev = __in_dev_get_rcu(rt->dst.dev);
30038fc6
ED
1576 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1577 rcu_read_unlock();
1da177e4 1578 return;
30038fc6
ED
1579 }
1580 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1581 rcu_read_unlock();
1da177e4 1582
92d86829 1583 if (!rt->peer)
a48eff12 1584 rt_bind_peer(rt, rt->rt_dst, 1);
92d86829
DM
1585 peer = rt->peer;
1586 if (!peer) {
1587 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1588 return;
1589 }
1590
1da177e4
LT
1591 /* No redirected packets during ip_rt_redirect_silence;
1592 * reset the algorithm.
1593 */
92d86829
DM
1594 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1595 peer->rate_tokens = 0;
1da177e4
LT
1596
1597 /* Too many ignored redirects; do not send anything
d8d1f30b 1598 * set dst.rate_last to the last seen redirected packet.
1da177e4 1599 */
92d86829
DM
1600 if (peer->rate_tokens >= ip_rt_redirect_number) {
1601 peer->rate_last = jiffies;
30038fc6 1602 return;
1da177e4
LT
1603 }
1604
1605 /* Check for load limit; set rate_last to the latest sent
1606 * redirect.
1607 */
92d86829 1608 if (peer->rate_tokens == 0 ||
14fb8a76 1609 time_after(jiffies,
92d86829
DM
1610 (peer->rate_last +
1611 (ip_rt_redirect_load << peer->rate_tokens)))) {
1da177e4 1612 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
92d86829
DM
1613 peer->rate_last = jiffies;
1614 ++peer->rate_tokens;
1da177e4 1615#ifdef CONFIG_IP_ROUTE_VERBOSE
30038fc6 1616 if (log_martians &&
92d86829 1617 peer->rate_tokens == ip_rt_redirect_number &&
1da177e4 1618 net_ratelimit())
673d57e7 1619 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
c5be24ff 1620 &ip_hdr(skb)->saddr, rt->rt_iif,
673d57e7 1621 &rt->rt_dst, &rt->rt_gateway);
1da177e4
LT
1622#endif
1623 }
1da177e4
LT
1624}
1625
1626static int ip_error(struct sk_buff *skb)
1627{
511c3f92 1628 struct rtable *rt = skb_rtable(skb);
92d86829 1629 struct inet_peer *peer;
1da177e4 1630 unsigned long now;
92d86829 1631 bool send;
1da177e4
LT
1632 int code;
1633
d8d1f30b 1634 switch (rt->dst.error) {
4500ebf8
JP
1635 case EINVAL:
1636 default:
1637 goto out;
1638 case EHOSTUNREACH:
1639 code = ICMP_HOST_UNREACH;
1640 break;
1641 case ENETUNREACH:
1642 code = ICMP_NET_UNREACH;
1643 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1644 IPSTATS_MIB_INNOROUTES);
1645 break;
1646 case EACCES:
1647 code = ICMP_PKT_FILTERED;
1648 break;
1da177e4
LT
1649 }
1650
92d86829 1651 if (!rt->peer)
a48eff12 1652 rt_bind_peer(rt, rt->rt_dst, 1);
92d86829
DM
1653 peer = rt->peer;
1654
1655 send = true;
1656 if (peer) {
1657 now = jiffies;
1658 peer->rate_tokens += now - peer->rate_last;
1659 if (peer->rate_tokens > ip_rt_error_burst)
1660 peer->rate_tokens = ip_rt_error_burst;
1661 peer->rate_last = now;
1662 if (peer->rate_tokens >= ip_rt_error_cost)
1663 peer->rate_tokens -= ip_rt_error_cost;
1664 else
1665 send = false;
1da177e4 1666 }
92d86829
DM
1667 if (send)
1668 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1da177e4
LT
1669
1670out: kfree_skb(skb);
1671 return 0;
e905a9ed 1672}
1da177e4
LT
1673
1674/*
1675 * The last two values are not from the RFC but
1676 * are needed for AMPRnet AX.25 paths.
1677 */
1678
9b5b5cff 1679static const unsigned short mtu_plateau[] =
1da177e4
LT
1680{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1681
5969f71d 1682static inline unsigned short guess_mtu(unsigned short old_mtu)
1da177e4
LT
1683{
1684 int i;
e905a9ed 1685
1da177e4
LT
1686 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1687 if (old_mtu > mtu_plateau[i])
1688 return mtu_plateau[i];
1689 return 68;
1690}
1691
b71d1d42 1692unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
0010e465
TT
1693 unsigned short new_mtu,
1694 struct net_device *dev)
1da177e4 1695{
1da177e4 1696 unsigned short old_mtu = ntohs(iph->tot_len);
1da177e4 1697 unsigned short est_mtu = 0;
2c8cec5c 1698 struct inet_peer *peer;
1da177e4 1699
2c8cec5c
DM
1700 peer = inet_getpeer_v4(iph->daddr, 1);
1701 if (peer) {
1702 unsigned short mtu = new_mtu;
1da177e4 1703
2c8cec5c
DM
1704 if (new_mtu < 68 || new_mtu >= old_mtu) {
1705 /* BSD 4.2 derived systems incorrectly adjust
1706 * tot_len by the IP header length, and report
1707 * a zero MTU in the ICMP message.
1708 */
1709 if (mtu == 0 &&
1710 old_mtu >= 68 + (iph->ihl << 2))
1711 old_mtu -= iph->ihl << 2;
1712 mtu = guess_mtu(old_mtu);
1713 }
0010e465 1714
2c8cec5c
DM
1715 if (mtu < ip_rt_min_pmtu)
1716 mtu = ip_rt_min_pmtu;
1717 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
46af3180
HS
1718 unsigned long pmtu_expires;
1719
1720 pmtu_expires = jiffies + ip_rt_mtu_expires;
1721 if (!pmtu_expires)
1722 pmtu_expires = 1UL;
1723
2c8cec5c
DM
1724 est_mtu = mtu;
1725 peer->pmtu_learned = mtu;
46af3180 1726 peer->pmtu_expires = pmtu_expires;
59445b6b 1727 atomic_inc(&__rt_peer_genid);
2c8cec5c 1728 }
1da177e4 1729
2c8cec5c 1730 inet_putpeer(peer);
1da177e4
LT
1731 }
1732 return est_mtu ? : new_mtu;
1733}
1734
2c8cec5c
DM
1735static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1736{
fe6fe792 1737 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
2c8cec5c 1738
fe6fe792
ED
1739 if (!expires)
1740 return;
46af3180 1741 if (time_before(jiffies, expires)) {
2c8cec5c
DM
1742 u32 orig_dst_mtu = dst_mtu(dst);
1743 if (peer->pmtu_learned < orig_dst_mtu) {
1744 if (!peer->pmtu_orig)
1745 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1746 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1747 }
1748 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1749 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1750}
1751
1da177e4
LT
1752static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1753{
2c8cec5c
DM
1754 struct rtable *rt = (struct rtable *) dst;
1755 struct inet_peer *peer;
1756
1757 dst_confirm(dst);
1758
1759 if (!rt->peer)
a48eff12 1760 rt_bind_peer(rt, rt->rt_dst, 1);
2c8cec5c
DM
1761 peer = rt->peer;
1762 if (peer) {
fe6fe792
ED
1763 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1764
2c8cec5c 1765 if (mtu < ip_rt_min_pmtu)
1da177e4 1766 mtu = ip_rt_min_pmtu;
fe6fe792 1767 if (!pmtu_expires || mtu < peer->pmtu_learned) {
46af3180
HS
1768
1769 pmtu_expires = jiffies + ip_rt_mtu_expires;
1770 if (!pmtu_expires)
1771 pmtu_expires = 1UL;
1772
2c8cec5c 1773 peer->pmtu_learned = mtu;
46af3180 1774 peer->pmtu_expires = pmtu_expires;
2c8cec5c
DM
1775
1776 atomic_inc(&__rt_peer_genid);
1777 rt->rt_peer_genid = rt_peer_genid();
1da177e4 1778 }
46af3180 1779 check_peer_pmtu(dst, peer);
1da177e4
LT
1780 }
1781}
1782
f39925db 1783
de398fb8 1784static void ipv4_validate_peer(struct rtable *rt)
1da177e4 1785{
6431cbc2 1786 if (rt->rt_peer_genid != rt_peer_genid()) {
2c8cec5c
DM
1787 struct inet_peer *peer;
1788
6431cbc2 1789 if (!rt->peer)
a48eff12 1790 rt_bind_peer(rt, rt->rt_dst, 0);
6431cbc2 1791
2c8cec5c 1792 peer = rt->peer;
fe6fe792 1793 if (peer) {
efbc368d 1794 check_peer_pmtu(&rt->dst, peer);
2c8cec5c 1795
de68dca1
ED
1796 if (peer->redirect_genid != redirect_genid)
1797 peer->redirect_learned.a4 = 0;
fe6fe792 1798 if (peer->redirect_learned.a4 &&
de398fb8
DM
1799 peer->redirect_learned.a4 != rt->rt_gateway)
1800 check_peer_redir(&rt->dst, peer);
f39925db
DM
1801 }
1802
6431cbc2
DM
1803 rt->rt_peer_genid = rt_peer_genid();
1804 }
efbc368d
DM
1805}
1806
1807static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1808{
1809 struct rtable *rt = (struct rtable *) dst;
1810
1811 if (rt_is_expired(rt))
1812 return NULL;
de398fb8 1813 ipv4_validate_peer(rt);
d11a4dc1 1814 return dst;
1da177e4
LT
1815}
1816
1817static void ipv4_dst_destroy(struct dst_entry *dst)
1818{
1819 struct rtable *rt = (struct rtable *) dst;
1820 struct inet_peer *peer = rt->peer;
1da177e4 1821
62fa8a84
DM
1822 if (rt->fi) {
1823 fib_info_put(rt->fi);
1824 rt->fi = NULL;
1825 }
1da177e4
LT
1826 if (peer) {
1827 rt->peer = NULL;
1828 inet_putpeer(peer);
1829 }
1da177e4
LT
1830}
1831
1da177e4
LT
1832
1833static void ipv4_link_failure(struct sk_buff *skb)
1834{
1835 struct rtable *rt;
1836
1837 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1838
511c3f92 1839 rt = skb_rtable(skb);
fe6fe792
ED
1840 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1841 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1da177e4
LT
1842}
1843
1844static int ip_rt_bug(struct sk_buff *skb)
1845{
673d57e7
HH
1846 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1847 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1da177e4
LT
1848 skb->dev ? skb->dev->name : "?");
1849 kfree_skb(skb);
c378a9c0 1850 WARN_ON(1);
1da177e4
LT
1851 return 0;
1852}
1853
1854/*
1855 We do not cache source address of outgoing interface,
1856 because it is used only by IP RR, TS and SRR options,
1857 so that it out of fast path.
1858
1859 BTW remember: "addr" is allowed to be not aligned
1860 in IP options!
1861 */
1862
8e36360a 1863void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1da177e4 1864{
a61ced5d 1865 __be32 src;
1da177e4 1866
c7537967 1867 if (rt_is_output_route(rt))
c5be24ff 1868 src = ip_hdr(skb)->saddr;
ebc0ffae 1869 else {
8e36360a
DM
1870 struct fib_result res;
1871 struct flowi4 fl4;
1872 struct iphdr *iph;
1873
1874 iph = ip_hdr(skb);
1875
1876 memset(&fl4, 0, sizeof(fl4));
1877 fl4.daddr = iph->daddr;
1878 fl4.saddr = iph->saddr;
b0fe4a31 1879 fl4.flowi4_tos = RT_TOS(iph->tos);
8e36360a
DM
1880 fl4.flowi4_oif = rt->dst.dev->ifindex;
1881 fl4.flowi4_iif = skb->dev->ifindex;
1882 fl4.flowi4_mark = skb->mark;
5e2b61f7 1883
ebc0ffae 1884 rcu_read_lock();
68a5e3dd 1885 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
436c3b66 1886 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
ebc0ffae
ED
1887 else
1888 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1da177e4 1889 RT_SCOPE_UNIVERSE);
ebc0ffae
ED
1890 rcu_read_unlock();
1891 }
1da177e4
LT
1892 memcpy(addr, &src, 4);
1893}
1894
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Fill each 16-bit half of the classid only if not already set. */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1904
0dbaee3b
DM
1905static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1906{
1907 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1908
1909 if (advmss == 0) {
1910 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1911 ip_rt_min_advmss);
1912 if (advmss > 65535 - 40)
1913 advmss = 65535 - 40;
1914 }
1915 return advmss;
1916}
1917
ebb762f2 1918static unsigned int ipv4_mtu(const struct dst_entry *dst)
d33e4553 1919{
261663b0 1920 const struct rtable *rt = (const struct rtable *) dst;
618f9bc7
SK
1921 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1922
261663b0 1923 if (mtu && rt_is_output_route(rt))
618f9bc7
SK
1924 return mtu;
1925
1926 mtu = dst->dev->mtu;
d33e4553
DM
1927
1928 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
d33e4553
DM
1929
1930 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1931 mtu = 576;
1932 }
1933
1934 if (mtu > IP_MAX_MTU)
1935 mtu = IP_MAX_MTU;
1936
1937 return mtu;
1938}
1939
813b3b5d 1940static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
5e2b61f7 1941 struct fib_info *fi)
a4daad6b 1942{
0131ba45
DM
1943 struct inet_peer *peer;
1944 int create = 0;
a4daad6b 1945
0131ba45
DM
1946 /* If a peer entry exists for this destination, we must hook
1947 * it up in order to get at cached metrics.
1948 */
813b3b5d 1949 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
0131ba45
DM
1950 create = 1;
1951
3c0afdca 1952 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
0131ba45 1953 if (peer) {
3c0afdca 1954 rt->rt_peer_genid = rt_peer_genid();
a4daad6b
DM
1955 if (inet_metrics_new(peer))
1956 memcpy(peer->metrics, fi->fib_metrics,
1957 sizeof(u32) * RTAX_MAX);
1958 dst_init_metrics(&rt->dst, peer->metrics, false);
2c8cec5c 1959
fe6fe792 1960 check_peer_pmtu(&rt->dst, peer);
de68dca1
ED
1961 if (peer->redirect_genid != redirect_genid)
1962 peer->redirect_learned.a4 = 0;
f39925db
DM
1963 if (peer->redirect_learned.a4 &&
1964 peer->redirect_learned.a4 != rt->rt_gateway) {
1965 rt->rt_gateway = peer->redirect_learned.a4;
1966 rt->rt_flags |= RTCF_REDIRECTED;
1967 }
0131ba45
DM
1968 } else {
1969 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1970 rt->fi = fi;
1971 atomic_inc(&fi->fib_clntref);
1972 }
1973 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
a4daad6b
DM
1974 }
1975}
1976
/* Fill in nexthop-derived state (gateway, metrics, classid) on a freshly
 * built route cache entry, then clamp MTU/ADVMSS metrics to sane bounds.
 *
 * fi may be NULL (e.g. broadcast routes) in which case only the clamping
 * and class tagging are done.
 */
static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		/* Only use the nexthop gateway when it is directly
		 * reachable (link scope); otherwise keep the default
		 * (the destination itself).
		 */
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	/* Clamp: MTU to the IPv4 maximum, ADVMSS to 64K minus 40 bytes
	 * of IP+TCP header.
	 */
	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}
2005
5c1e6aa3
DM
2006static struct rtable *rt_dst_alloc(struct net_device *dev,
2007 bool nopolicy, bool noxfrm)
0c4dcd58 2008{
5c1e6aa3
DM
2009 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2010 DST_HOST |
2011 (nopolicy ? DST_NOPOLICY : 0) |
2012 (noxfrm ? DST_NOXFRM : 0));
0c4dcd58
DM
2013}
2014
/* Build an input route for a multicast destination.
 * Called in rcu_read_lock() section.
 *
 * Validates the source address, allocates a cache entry anchored on the
 * loopback device, marks it RTCF_MULTICAST (plus RTCF_LOCAL when the host
 * itself is a member, @our != 0), and interns it in the route hash.
 *
 * Returns 0 on success or a negative errno.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	/* Source must be unicast, non-loopback, and the frame must be IP. */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		/* 0.0.0.0 source is acceptable only for link-local groups. */
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(init_net.loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	/* Multicast input routes must never be used for output. */
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (our) {
		/* We are a member of the group: deliver locally too. */
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	/* Non-link-local groups go through the multicast router path. */
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
2094
2095
2096static void ip_handle_martian_source(struct net_device *dev,
2097 struct in_device *in_dev,
2098 struct sk_buff *skb,
9e12bb22
AV
2099 __be32 daddr,
2100 __be32 saddr)
1da177e4
LT
2101{
2102 RT_CACHE_STAT_INC(in_martian_src);
2103#ifdef CONFIG_IP_ROUTE_VERBOSE
2104 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2105 /*
2106 * RFC1812 recommendation, if source is martian,
2107 * the only hint is MAC header.
2108 */
673d57e7
HH
2109 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2110 &daddr, &saddr, dev->name);
98e399f8 2111 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1da177e4 2112 int i;
98e399f8 2113 const unsigned char *p = skb_mac_header(skb);
1da177e4
LT
2114 printk(KERN_WARNING "ll header: ");
2115 for (i = 0; i < dev->hard_header_len; i++, p++) {
2116 printk("%02x", *p);
2117 if (i < (dev->hard_header_len - 1))
2118 printk(":");
2119 }
2120 printk("\n");
2121 }
2122 }
2123#endif
2124}
2125
/* Build a forwarding (input) route cache entry for a unicast destination.
 * Called in rcu_read_lock() section.
 *
 * Validates the source against the fib, decides whether an ICMP redirect
 * should be suggested (same in/out device on shared media), rejects
 * non-IP frames that are invalid for proxy ARP, then allocates and fills
 * the rtable. On success *result holds the new entry and 0 is returned;
 * otherwise a negative errno.
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		/* fib said UNICAST but the nexthop device has no
		 * IPv4 config: internal inconsistency.
		 */
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}


	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	/* err > 0 means the reverse path goes out a different device. */
	if (err)
		flags |= RTCF_DIRECTSRC;

	/* Packet would leave via the interface it arrived on: candidate
	 * for an ICMP redirect on shared media.
	 */
	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif 	= in_dev->dev->ifindex;
	rth->rt_oif 	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	/* Forwarding path: deliver via ip_forward, emit via ip_output. */
	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
1da177e4 2218
/* Create a forwarding route cache entry for @skb and intern it in the
 * route hash.
 *
 * Picks a multipath nexthop first when several are available, then
 * delegates entry construction to __mkroute_input. Returns 0 or a
 * negative errno.
 */
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable* rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	/* Several nexthops: let the multipath algorithm choose one. */
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
		       rt_genid(dev_net(rth->dst.dev)));
	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
	if (IS_ERR(rth))
		return PTR_ERR(rth);
	return 0;
}
2247
/*
 *	NOTE. We drop all the packets that has local source
 *	addresses, because every properly looped back packet
 *	must have correct destination already attached by output routine.
 *
 *	Such approach solves two big problems:
 *	1. Not simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with 100% of guarantee.
 *	called with rcu_read_lock()
 */

/* Slow-path input route resolution: runs on a route cache miss.
 * Screens out martian addresses, performs the fib lookup, and builds
 * either a local/broadcast entry inline (local_input:) or a forwarding
 * entry via ip_mkroute_input(). Returns 0 or a negative errno.
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	struct net    * net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	/* Local delivery (also reached for UNREACHABLE results via
	 * no_route:, where the entry becomes an ip_error trap).
	 */
	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(net);
	rth->rt_flags 	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	/* NOTE(review): duplicate of the tclassid assignment above —
	 * harmless redundancy, kept for byte-identical behavior.
	 */
	rth->dst.tclassid = itag;
#endif
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
2439
/* Input route lookup entry point: try the route cache first, fall back
 * to the slow path on a miss.
 *
 * @noref selects reference-less dst attachment (dst_use_noref /
 * skb_dst_set_noref) for callers that stay inside the RCU section.
 * Returns 0 on success or a negative errno.
 */
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			   u8 tos, struct net_device *dev, bool noref)
{
	struct rtable * rth;
	unsigned	hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	/* Walk the hash chain; the XOR/OR trick compares all four key
	 * fields with a single branch.
	 */
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_route_iif ^ iif) |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As result the host on multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   comparing with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
1da177e4 2522
/* Build an output route cache entry for a resolved fib result.
 * Called with rcu_read_lock().
 *
 * Reclassifies the destination (broadcast/multicast/zeronet), allocates
 * the rtable, fills routing keys from the original flow (orig_*) and the
 * resolved addresses from @fl4, and wires up input/output handlers for
 * local delivery and (if configured) multicast forwarding.
 *
 * Returns the new entry or an ERR_PTR().
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, __u8 orig_rtos,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	/* Loopback source is only valid on the loopback device. */
	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
		return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		/* Not a member of the group: no local delivery. */
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If multicast route do not exist use
		 * default one, but do not gateway in this case.
		 * Yes, it is hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	/* Keys record the caller's original flow, not the resolved one,
	 * so future cache lookups with the same flow hit this entry.
	 */
	rth->rt_key_dst	= orig_daddr;
	rth->rt_key_src	= orig_saddr;
	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_key_tos	= orig_rtos;
	rth->rt_dst	= fl4->daddr;
	rth->rt_src	= fl4->saddr;
	rth->rt_route_iif = 0;
	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
	rth->rt_oif	= orig_oif;
	rth->rt_mark    = fl4->flowi4_mark;
	rth->rt_gateway = fl4->daddr;
	rth->rt_spec_dst= fl4->saddr;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl4->daddr;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl4->saddr;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	return rth;
}
2623
1da177e4
LT
2624/*
2625 * Major route resolver routine.
0197aa38 2626 * called with rcu_read_lock();
1da177e4
LT
2627 */
2628
813b3b5d 2629static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
1da177e4 2630{
1da177e4 2631 struct net_device *dev_out = NULL;
f61759e6 2632 __u8 tos = RT_FL_TOS(fl4);
813b3b5d
DM
2633 unsigned int flags = 0;
2634 struct fib_result res;
5ada5527 2635 struct rtable *rth;
813b3b5d
DM
2636 __be32 orig_daddr;
2637 __be32 orig_saddr;
2638 int orig_oif;
1da177e4
LT
2639
2640 res.fi = NULL;
2641#ifdef CONFIG_IP_MULTIPLE_TABLES
2642 res.r = NULL;
2643#endif
2644
813b3b5d
DM
2645 orig_daddr = fl4->daddr;
2646 orig_saddr = fl4->saddr;
2647 orig_oif = fl4->flowi4_oif;
2648
2649 fl4->flowi4_iif = net->loopback_dev->ifindex;
2650 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2651 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2652 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
44713b67 2653
010c2708 2654 rcu_read_lock();
813b3b5d 2655 if (fl4->saddr) {
b23dd4fe 2656 rth = ERR_PTR(-EINVAL);
813b3b5d
DM
2657 if (ipv4_is_multicast(fl4->saddr) ||
2658 ipv4_is_lbcast(fl4->saddr) ||
2659 ipv4_is_zeronet(fl4->saddr))
1da177e4
LT
2660 goto out;
2661
1da177e4
LT
2662 /* I removed check for oif == dev_out->oif here.
2663 It was wrong for two reasons:
1ab35276
DL
2664 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2665 is assigned to multiple interfaces.
1da177e4
LT
2666 2. Moreover, we are allowed to send packets with saddr
2667 of another iface. --ANK
2668 */
2669
813b3b5d
DM
2670 if (fl4->flowi4_oif == 0 &&
2671 (ipv4_is_multicast(fl4->daddr) ||
2672 ipv4_is_lbcast(fl4->daddr))) {
a210d01a 2673 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2674 dev_out = __ip_dev_find(net, fl4->saddr, false);
a210d01a
JA
2675 if (dev_out == NULL)
2676 goto out;
2677
1da177e4
LT
2678 /* Special hack: user can direct multicasts
2679 and limited broadcast via necessary interface
2680 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2681 This hack is not just for fun, it allows
2682 vic,vat and friends to work.
2683 They bind socket to loopback, set ttl to zero
2684 and expect that it will work.
2685 From the viewpoint of routing cache they are broken,
2686 because we are not allowed to build multicast path
2687 with loopback source addr (look, routing cache
2688 cannot know, that ttl is zero, so that packet
2689 will not leave this host and route is valid).
2690 Luckily, this hack is good workaround.
2691 */
2692
813b3b5d 2693 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2694 goto make_route;
2695 }
a210d01a 2696
813b3b5d 2697 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
a210d01a 2698 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2699 if (!__ip_dev_find(net, fl4->saddr, false))
a210d01a 2700 goto out;
a210d01a 2701 }
1da177e4
LT
2702 }
2703
2704
813b3b5d
DM
2705 if (fl4->flowi4_oif) {
2706 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
b23dd4fe 2707 rth = ERR_PTR(-ENODEV);
1da177e4
LT
2708 if (dev_out == NULL)
2709 goto out;
e5ed6399
HX
2710
2711 /* RACE: Check return value of inet_select_addr instead. */
fc75fc83 2712 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
b23dd4fe 2713 rth = ERR_PTR(-ENETUNREACH);
fc75fc83
ED
2714 goto out;
2715 }
813b3b5d
DM
2716 if (ipv4_is_local_multicast(fl4->daddr) ||
2717 ipv4_is_lbcast(fl4->daddr)) {
2718 if (!fl4->saddr)
2719 fl4->saddr = inet_select_addr(dev_out, 0,
2720 RT_SCOPE_LINK);
1da177e4
LT
2721 goto make_route;
2722 }
813b3b5d
DM
2723 if (fl4->saddr) {
2724 if (ipv4_is_multicast(fl4->daddr))
2725 fl4->saddr = inet_select_addr(dev_out, 0,
2726 fl4->flowi4_scope);
2727 else if (!fl4->daddr)
2728 fl4->saddr = inet_select_addr(dev_out, 0,
2729 RT_SCOPE_HOST);
1da177e4
LT
2730 }
2731 }
2732
813b3b5d
DM
2733 if (!fl4->daddr) {
2734 fl4->daddr = fl4->saddr;
2735 if (!fl4->daddr)
2736 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
b40afd0e 2737 dev_out = net->loopback_dev;
813b3b5d 2738 fl4->flowi4_oif = net->loopback_dev->ifindex;
1da177e4
LT
2739 res.type = RTN_LOCAL;
2740 flags |= RTCF_LOCAL;
2741 goto make_route;
2742 }
2743
813b3b5d 2744 if (fib_lookup(net, fl4, &res)) {
1da177e4 2745 res.fi = NULL;
813b3b5d 2746 if (fl4->flowi4_oif) {
1da177e4
LT
2747 /* Apparently, routing tables are wrong. Assume,
2748 that the destination is on link.
2749
2750 WHY? DW.
2751 Because we are allowed to send to iface
2752 even if it has NO routes and NO assigned
2753 addresses. When oif is specified, routing
2754 tables are looked up with only one purpose:
2755 to catch if destination is gatewayed, rather than
2756 direct. Moreover, if MSG_DONTROUTE is set,
2757 we send packet, ignoring both routing tables
2758 and ifaddr state. --ANK
2759
2760
2761 We could make it even if oif is unknown,
2762 likely IPv6, but we do not.
2763 */
2764
813b3b5d
DM
2765 if (fl4->saddr == 0)
2766 fl4->saddr = inet_select_addr(dev_out, 0,
2767 RT_SCOPE_LINK);
1da177e4
LT
2768 res.type = RTN_UNICAST;
2769 goto make_route;
2770 }
b23dd4fe 2771 rth = ERR_PTR(-ENETUNREACH);
1da177e4
LT
2772 goto out;
2773 }
1da177e4
LT
2774
2775 if (res.type == RTN_LOCAL) {
813b3b5d 2776 if (!fl4->saddr) {
9fc3bbb4 2777 if (res.fi->fib_prefsrc)
813b3b5d 2778 fl4->saddr = res.fi->fib_prefsrc;
9fc3bbb4 2779 else
813b3b5d 2780 fl4->saddr = fl4->daddr;
9fc3bbb4 2781 }
b40afd0e 2782 dev_out = net->loopback_dev;
813b3b5d 2783 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2784 res.fi = NULL;
2785 flags |= RTCF_LOCAL;
2786 goto make_route;
2787 }
2788
2789#ifdef CONFIG_IP_ROUTE_MULTIPATH
813b3b5d 2790 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
1b7fe593 2791 fib_select_multipath(&res);
1da177e4
LT
2792 else
2793#endif
21d8c49e
DM
2794 if (!res.prefixlen &&
2795 res.table->tb_num_default > 1 &&
813b3b5d 2796 res.type == RTN_UNICAST && !fl4->flowi4_oif)
0c838ff1 2797 fib_select_default(&res);
1da177e4 2798
813b3b5d
DM
2799 if (!fl4->saddr)
2800 fl4->saddr = FIB_RES_PREFSRC(net, res);
1da177e4 2801
1da177e4 2802 dev_out = FIB_RES_DEV(res);
813b3b5d 2803 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2804
2805
2806make_route:
813b3b5d 2807 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
f61759e6 2808 tos, dev_out, flags);
b23dd4fe 2809 if (!IS_ERR(rth)) {
5ada5527
DM
2810 unsigned int hash;
2811
813b3b5d 2812 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
5ada5527 2813 rt_genid(dev_net(dev_out)));
813b3b5d 2814 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
5ada5527 2815 }
1da177e4 2816
010c2708
DM
2817out:
2818 rcu_read_unlock();
b23dd4fe 2819 return rth;
1da177e4
LT
2820}
2821
/* Output route lookup: probe the route cache (BH-safe RCU walk) and fall
 * back to ip_route_output_slow() on a miss or when caching is disabled.
 *
 * On a cache hit, fills in any unset saddr/daddr in @flp4 from the
 * cached entry. Returns the rtable or an ERR_PTR() from the slow path.
 */
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference_bh(rth->dst.rt_next)) {
		/* TOS must match in the routing bits, including RTO_ONLINK. */
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			/* Report the resolved addresses back to the caller. */
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
2862
/* dst_ops.check for blackhole routes: always report the dst as invalid
 * (NULL) so cached users re-resolve instead of reusing it.
 */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2867
ebb762f2 2868static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 2869{
618f9bc7
SK
2870 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2871
2872 return mtu ? : dst->dev->mtu;
ec831ea7
RD
2873}
2874
/* dst_ops.update_pmtu for blackhole routes: intentionally a no-op —
 * a blackhole dst carries no usable path state to update.
 */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
2878
/* dst_ops.cow_metrics for blackhole routes: refuse to copy-on-write
 * metrics (returns NULL), keeping the blackhole dst immutable.
 */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2884
/* dst_ops vtable for blackhole routes (see ipv4_blackhole_route):
 * check always fails, PMTU updates are no-ops, and metrics cannot be
 * COWed, so the entry behaves as an inert stand-in for a real route.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
2896
/* Clone @dst_orig into a "blackhole" route that silently discards all
 * traffic (input/output both dst_discard) while preserving the original
 * route's keys, metrics, and device for identification purposes.
 *
 * Consumes a reference on @dst_orig. Returns the new dst or
 * ERR_PTR(-ENOMEM).
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		/* Discard everything in both directions. */
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		/* Copy the full route identity from the original. */
		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		/* Share peer and fib_info, taking our own references. */
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	/* Drop the caller's reference on the original in all cases. */
	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
2943
9d6ec938 2944struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
b23dd4fe 2945 struct sock *sk)
1da177e4 2946{
9d6ec938 2947 struct rtable *rt = __ip_route_output_key(net, flp4);
1da177e4 2948
b23dd4fe
DM
2949 if (IS_ERR(rt))
2950 return rt;
1da177e4 2951
56157872 2952 if (flp4->flowi4_proto)
9d6ec938
DM
2953 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2954 flowi4_to_flowi(flp4),
2955 sk, 0);
1da177e4 2956
b23dd4fe 2957 return rt;
1da177e4 2958}
d8c97a94
ACM
2959EXPORT_SYMBOL_GPL(ip_route_output_flow);
2960
/*
 * rt_fill_info - dump one routing cache entry (the dst attached to @skb)
 * as a netlink message of type @event (RTM_NEWROUTE) into @skb.
 *
 * Returns the value of nlmsg_end() on success, 0 when a multicast route
 * query was answered asynchronously (nowait == 0 path), -EMSGSIZE when
 * the message does not fit, or a negative error propagated via the
 * cacheinfo "error" field.
 *
 * Note: the NLA_PUT* macros jump to the nla_put_failure label when the
 * skb runs out of tailroom.
 */
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	const struct inet_peer *peer = rt->peer;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family = AF_INET;
	r->rtm_dst_len = 32;
	r->rtm_src_len = 0;
	r->rtm_tos = rt->rt_key_tos;
	r->rtm_table = RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type = rt->rt_type;
	r->rtm_scope = RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	/* Upper 16 flag bits pass through; cache entries are "cloned". */
	r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	/* Input routes report rt_spec_dst as the preferred source;
	 * output routes only when it differs from the requested key. */
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->rt_key_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);

	error = rt->dst.error;
	if (peer) {
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&peer->ip_id_count) & 0xffff;
		if (peer->tcp_ts_stamp) {
			ts = peer->tcp_ts;
			tsage = get_seconds() - peer->tcp_ts_stamp;
		}
		/* Convert the absolute PMTU expiry into a remaining delta
		 * in jiffies (0 if already expired). */
		expires = ACCESS_ONCE(peer->pmtu_expires);
		if (expires) {
			if (time_before(jiffies, expires))
				expires -= jiffies;
			else
				expires = 0;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		/* Non-local multicast with forwarding enabled: ask the
		 * multicast routing code to fill in the answer. */
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					/* err == 0: answered (or queued)
					 * elsewhere; nothing more to add. */
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					/* Report the failure through the
					 * cacheinfo error field. */
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
3068
/*
 * inet_rtm_getroute - handle an RTM_GETROUTE netlink request.
 *
 * Resolves the route described by the request attributes: if RTA_IIF is
 * present the lookup simulates packet reception on that device via
 * ip_route_input(), otherwise ip_route_output_key() is used.  The answer
 * is serialized with rt_fill_info() and unicast back to the requester.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	/* All attributes are optional; missing ones default to 0. */
	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol = htons(ETH_P_IP);
		skb->dev = dev;
		skb->mark = mark;
		/* Input-path lookup runs with BHs disabled, as on real rx. */
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		/* A successful lookup may still yield an error route. */
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	/* rtnl_unicast consumes skb regardless of outcome. */
	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
3163
/*
 * ip_rt_dump - netlink dump callback: walk the route cache hash table
 * and emit one RTM_NEWROUTE message per live cache entry belonging to
 * the requesting namespace.  Resume state between dump calls is kept in
 * cb->args[0] (hash bucket) and cb->args[1] (index within the bucket).
 */
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			/* Skip foreign-namespace entries and entries
			 * already dumped in a previous call. */
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			/* Borrow the dst without taking a reference; the
			 * rcu_read_lock_bh section keeps it alive. */
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				/* Message full: drop the borrowed dst and
				 * stop; args[] record the resume point. */
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
3205
3206void ip_rt_multicast_event(struct in_device *in_dev)
3207{
76e6ebfb 3208 rt_cache_flush(dev_net(in_dev->dev), 0);
1da177e4
LT
3209}
3210
3211#ifdef CONFIG_SYSCTL
81c684d1 3212static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
8d65af78 3213 void __user *buffer,
1da177e4
LT
3214 size_t *lenp, loff_t *ppos)
3215{
3216 if (write) {
639e104f 3217 int flush_delay;
81c684d1 3218 ctl_table ctl;
39a23e75 3219 struct net *net;
639e104f 3220
81c684d1
DL
3221 memcpy(&ctl, __ctl, sizeof(ctl));
3222 ctl.data = &flush_delay;
8d65af78 3223 proc_dointvec(&ctl, write, buffer, lenp, ppos);
639e104f 3224
81c684d1 3225 net = (struct net *)__ctl->extra1;
39a23e75 3226 rt_cache_flush(net, flush_delay);
1da177e4 3227 return 0;
e905a9ed 3228 }
1da177e4
LT
3229
3230 return -EINVAL;
3231}
3232
/*
 * Global (not per-namespace) tunables for the IPv4 routing cache,
 * exposed under net.ipv4.route via the ipv4_skeleton table below.
 * Interval/timeout values use the jiffies proc handlers; plain integers
 * use proc_dointvec.
 */
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Same variable as above, exposed in milliseconds. */
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
39a23e75 3343
/* Single zeroed terminator entry used as the child of the "neigh"
 * directory so the directory exists before neighbour sysctls attach. */
static struct ctl_table empty[1];

/* Static skeleton for net.ipv4: the "route" directory (backed by
 * ipv4_route_table above) and an initially empty "neigh" directory. */
static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};
3354
/* sysctl path "net.ipv4" under which ipv4_skeleton is registered
 * (see ip_static_sysctl_init below). */
static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};
3360
/* Per-netns write-only "flush" sysctl; writing triggers a route cache
 * flush via ipv4_sysctl_rtcache_flush().  extra1 is filled with the
 * owning struct net in sysctl_route_net_init(). */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
3370
/* sysctl path "net.ipv4.route" for the per-netns flush table. */
static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
3377
39a23e75
DL
3378static __net_init int sysctl_route_net_init(struct net *net)
3379{
3380 struct ctl_table *tbl;
3381
3382 tbl = ipv4_route_flush_table;
09ad9bc7 3383 if (!net_eq(net, &init_net)) {
39a23e75
DL
3384 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3385 if (tbl == NULL)
3386 goto err_dup;
3387 }
3388 tbl[0].extra1 = net;
3389
3390 net->ipv4.route_hdr =
3391 register_net_sysctl_table(net, ipv4_route_path, tbl);
3392 if (net->ipv4.route_hdr == NULL)
3393 goto err_reg;
3394 return 0;
3395
3396err_reg:
3397 if (tbl != ipv4_route_flush_table)
3398 kfree(tbl);
3399err_dup:
3400 return -ENOMEM;
3401}
3402
3403static __net_exit void sysctl_route_net_exit(struct net *net)
3404{
3405 struct ctl_table *tbl;
3406
3407 tbl = net->ipv4.route_hdr->ctl_table_arg;
3408 unregister_net_sysctl_table(net->ipv4.route_hdr);
3409 BUG_ON(tbl == ipv4_route_flush_table);
3410 kfree(tbl);
3411}
3412
/* Hooks the per-netns flush sysctl into namespace create/destroy. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
1da177e4
LT
3417#endif
3418
/* Per-netns init: seed this namespace's route-cache generation id and
 * device-address generation id with random values. */
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}
3427
/* Runs rt_genid_init() for every new network namespace; no teardown
 * needed as the counters live inside struct net itself. */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
3431
3432
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu accounting area for the routing classifier (256 entries),
 * allocated in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4
LT
3436
/* Optional "rhash_entries=" boot parameter: overrides the route cache
 * hash table size otherwise computed in ip_rt_init(). */
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
3446
/*
 * ip_rt_init - boot-time initialization of the IPv4 routing subsystem:
 * slab cache and dst counters, the route cache hash table, devinet/fib,
 * periodic GC work, proc files, xfrm, the RTM_GETROUTE handler and the
 * per-netns sysctl/genid subsystems.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole dsts are the same size; share the slab cache. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/* Size the route cache hash from available memory, overridable
	 * via the rhash_entries= boot parameter. */
	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	/* GC threshold and max cache size scale with the table size. */
	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* Start the periodic expiry worker at a randomized offset to
	 * avoid synchronized load across systems. */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	/* Non-fatal: routing works without the proc files. */
	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}
3507
#ifdef CONFIG_SYSCTL
/*
 * Register the static net.ipv4.{route,neigh} sysctl skeleton; called
 * early so the directories exist for later registrations.
 *
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif