]> git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/blame - net/ipv4/route.c
net: Convert printks to pr_<level>
[mirror_ubuntu-artful-kernel.git] / net / ipv4 / route.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
e905a9ed 21 * Alan Cox : Super /proc >4K
1da177e4
LT
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
e905a9ed 39 *
1da177e4
LT
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
1da177e4
LT
65#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
1da177e4 71#include <linux/mm.h>
424c4b70 72#include <linux/bootmem.h>
1da177e4
LT
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
39c90ece 82#include <linux/workqueue.h>
1da177e4 83#include <linux/skbuff.h>
1da177e4
LT
84#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
5a0e3ad6 93#include <linux/slab.h>
b9eda06f 94#include <linux/prefetch.h>
352e512c 95#include <net/dst.h>
457c4cbc 96#include <net/net_namespace.h>
1da177e4
LT
97#include <net/protocol.h>
98#include <net/ip.h>
99#include <net/route.h>
100#include <net/inetpeer.h>
101#include <net/sock.h>
102#include <net/ip_fib.h>
103#include <net/arp.h>
104#include <net/tcp.h>
105#include <net/icmp.h>
106#include <net/xfrm.h>
8d71740c 107#include <net/netevent.h>
63f3444f 108#include <net/rtnetlink.h>
1da177e4
LT
109#ifdef CONFIG_SYSCTL
110#include <linux/sysctl.h>
111#endif
6e5714ea 112#include <net/secure_seq.h>
1da177e4 113
68a5e3dd 114#define RT_FL_TOS(oldflp4) \
f61759e6 115 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
1da177e4
LT
116
117#define IP_MAX_MTU 0xFFF0
118
119#define RT_GC_TIMEOUT (300*HZ)
120
1da177e4 121static int ip_rt_max_size;
817bc4db 122static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
9f28a2fc 123static int ip_rt_gc_interval __read_mostly = 60 * HZ;
817bc4db
SH
124static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
125static int ip_rt_redirect_number __read_mostly = 9;
126static int ip_rt_redirect_load __read_mostly = HZ / 50;
127static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128static int ip_rt_error_cost __read_mostly = HZ;
129static int ip_rt_error_burst __read_mostly = 5 * HZ;
130static int ip_rt_gc_elasticity __read_mostly = 8;
131static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
132static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
133static int ip_rt_min_advmss __read_mostly = 256;
1080d709 134static int rt_chain_length_max __read_mostly = 20;
1da177e4 135
9f28a2fc
ED
136static struct delayed_work expires_work;
137static unsigned long expires_ljiffies;
138
1da177e4
LT
139/*
140 * Interface to generic destination cache.
141 */
142
143static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
0dbaee3b 144static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
ebb762f2 145static unsigned int ipv4_mtu(const struct dst_entry *dst);
1da177e4 146static void ipv4_dst_destroy(struct dst_entry *dst);
1da177e4
LT
147static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
148static void ipv4_link_failure(struct sk_buff *skb);
149static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
569d3645 150static int rt_garbage_collect(struct dst_ops *ops);
1da177e4 151
72cdd1d9
ED
/* dst_ops->ifdown hook: nothing to do for IPv4 cached routes when a
 * device goes down; cleanup happens elsewhere via cache invalidation.
 */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
1da177e4 156
62fa8a84
DM
/* dst_ops->cow_metrics hook: give this route a writable metrics array.
 *
 * The writable storage lives in the bound inet_peer.  If the peer's
 * metrics are fresh (inet_metrics_new()), seed them from the old
 * read-only array, then try to install them with cmpxchg() on
 * dst->_metrics.  Losing the race means another CPU installed metrics
 * first; use the winner's pointer unless it is still read-only.
 * Returns a writable u32 array or NULL when no writable copy exists.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			/* Lost the race: adopt the winner's metrics,
			 * but only if they are writable.
			 */
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			/* We won: the peer now owns the metrics, so the
			 * fib_info reference is no longer needed.
			 */
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}
191
d3aaeb38
DM
192static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
193
1da177e4
LT
/* Hook table plugging the IPv4 routing cache into the generic
 * destination cache layer; each callback is defined in this file
 * (except __ip_local_out).
 */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
210
211#define ECN_OR_COST(class) TC_PRIO_##class
212
/* Map the IPv4 TOS field to a packet-scheduler priority band.
 * Odd entries (ECN_OR_COST) cover TOS values whose low bit is set;
 * presumably indexed by (tos & IPTOS_TOS_MASK) >> 1 plus the ECN bit
 * — the lookup helper is not visible in this file section, confirm
 * against rt_tos2priority().
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
231
232
233/*
234 * Route cache.
235 */
236
237/* The locking scheme is rather straight forward:
238 *
239 * 1) Read-Copy Update protects the buckets of the central route hash.
240 * 2) Only writers remove entries, and they hold the lock
241 * as they look at rtable reference counts.
242 * 3) Only readers acquire references to rtable entries,
243 * they do so with atomic increments and with the
244 * lock held.
245 */
246
247struct rt_hash_bucket {
1c31720a 248 struct rtable __rcu *chain;
22c047cc 249};
1080d709 250
8a25d5de
IM
251#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
252 defined(CONFIG_PROVE_LOCKING)
22c047cc
ED
253/*
254 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
255 * The size of this table is a power of two and depends on the number of CPUS.
62051200 256 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
22c047cc 257 */
62051200
IM
258#ifdef CONFIG_LOCKDEP
259# define RT_HASH_LOCK_SZ 256
22c047cc 260#else
62051200
IM
261# if NR_CPUS >= 32
262# define RT_HASH_LOCK_SZ 4096
263# elif NR_CPUS >= 16
264# define RT_HASH_LOCK_SZ 2048
265# elif NR_CPUS >= 8
266# define RT_HASH_LOCK_SZ 1024
267# elif NR_CPUS >= 4
268# define RT_HASH_LOCK_SZ 512
269# else
270# define RT_HASH_LOCK_SZ 256
271# endif
22c047cc
ED
272#endif
273
274static spinlock_t *rt_hash_locks;
275# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
1ff1cc20
PE
276
277static __init void rt_hash_lock_init(void)
278{
279 int i;
280
281 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
282 GFP_KERNEL);
283 if (!rt_hash_locks)
284 panic("IP: failed to allocate rt_hash_locks\n");
285
286 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
287 spin_lock_init(&rt_hash_locks[i]);
288}
22c047cc
ED
289#else
290# define rt_hash_lock_addr(slot) NULL
1ff1cc20
PE
291
292static inline void rt_hash_lock_init(void)
293{
294}
22c047cc 295#endif
1da177e4 296
817bc4db
SH
297static struct rt_hash_bucket *rt_hash_table __read_mostly;
298static unsigned rt_hash_mask __read_mostly;
299static unsigned int rt_hash_log __read_mostly;
1da177e4 300
2f970d83 301static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
27f39c73 302#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
1da177e4 303
/* Hash (daddr, saddr, ifindex, generation) into a route cache bucket
 * index.  Folding the genid into the hash is what makes whole-cache
 * invalidation by genid bump effective.
 */
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}
311
e84f84f2
DL
/* Current route cache generation for this netns; entries whose
 * rt_genid differs are stale (see rt_is_expired()).
 */
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
316
1da177e4
LT
317#ifdef CONFIG_PROC_FS
318struct rt_cache_iter_state {
a75e936f 319 struct seq_net_private p;
1da177e4 320 int bucket;
29e75252 321 int genid;
1da177e4
LT
322};
323
/* Find the first live cache entry for the /proc walk, scanning buckets
 * from the top of the table downward.  On success, returns with
 * rcu_read_lock_bh() HELD (released by rt_cache_seq_stop or when the
 * walk advances past the bucket); returns NULL with the lock dropped
 * when the cache is empty.
 */
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		/* Skip empty buckets without taking the RCU lock. */
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			/* Only entries of this netns and current generation. */
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}
344
/* Advance to the next cache entry after @r, crossing bucket boundaries
 * as needed.  Assumes rcu_read_lock_bh() is held on entry; keeps it
 * held when returning an entry, drops it while hopping between buckets
 * and when returning NULL at the end of the table.
 */
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}
362
/* Like __rt_cache_get_next() but filters out entries belonging to
 * other network namespaces or to a stale cache generation.
 */
static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}
375
1218854a 376static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
1da177e4 377{
1218854a 378 struct rtable *r = rt_cache_get_first(seq);
1da177e4
LT
379
380 if (r)
1218854a 381 while (pos && (r = rt_cache_get_next(seq, r)))
1da177e4
LT
382 --pos;
383 return pos ? NULL : r;
384}
385
/* seq_file ->start: snapshot the cache generation on a fresh walk so
 * entries created mid-walk are skipped; on resume, reposition.
 */
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}
394
/* seq_file ->next: step the iterator; SEQ_START_TOKEN means we just
 * emitted the header line, so start from the first real entry.
 */
static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}
406
/* seq_file ->stop: drop the RCU lock still held by the get_first/
 * get_next helpers, unless the walk never reached a real entry.
 */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
412
/* seq_file ->show for /proc/net/rt_cache: one fixed-width 127-column
 * line per cached route (header line for SEQ_START_TOKEN).  The
 * trailing %n + pad keeps each record exactly 128 bytes, which legacy
 * readers rely on — do not change the format strings.
 */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len, HHUptod;

		/* Sample the neighbour state under RCU only long enough
		 * to learn whether its hw address is up to date.
		 */
		rcu_read_lock();
		n = dst_get_neighbour_noref(&r->dst);
		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
		rcu_read_unlock();

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->rt_key_tos,
			-1,
			HHUptod,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
450
/* Iterator ops for /proc/net/rt_cache. */
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};
457
/* open() for /proc/net/rt_cache: per-netns seq_file with iterator state. */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}
463
/* File ops for /proc/net/rt_cache. */
static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
471
472
/* seq_file ->start for /proc/net/stat/rt_cache: position on the
 * (*pos - 1)'th possible CPU's stats; *pos == 0 yields the header.
 */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		/* Remember where to resume: cpu + 1 maps back to this cpu. */
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
488
/* seq_file ->next: advance to the next possible CPU's stat block,
 * or NULL past the last CPU.
 */
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}
502
/* seq_file ->stop: nothing to release for the per-CPU stats walk. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
507
/* seq_file ->show: one line of route-cache statistics per CPU.  The
 * first column (total entry count) is global, not per-CPU.  Format is
 * ABI for userspace (e.g. lnstat) — keep the strings byte-identical.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
541
/* Iterator ops for /proc/net/stat/rt_cache. */
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};
548
549
/* open() for /proc/net/stat/rt_cache (global, not per-netns). */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}
554
/* File ops for /proc/net/stat/rt_cache. */
static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
562
c7066f70 563#ifdef CONFIG_IP_ROUTE_CLASSID
/* /proc/net/rt_acct: sum the per-CPU ip_rt_acct counters for all 256
 * realms into a temporary buffer and emit it raw (binary, not text).
 */
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
78c686e9 587
a661c419
AD
/* open() for /proc/net/rt_acct. */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}
a661c419
AD
592
/* File ops for /proc/net/rt_acct (CONFIG_IP_ROUTE_CLASSID only). */
static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
78c686e9 600#endif
107f1634 601
/* Per-netns proc setup: /proc/net/rt_cache, /proc/net/stat/rt_cache
 * and (with CONFIG_IP_ROUTE_CLASSID) /proc/net/rt_acct.  On failure,
 * unwinds whatever was already registered via the goto chain and
 * returns -ENOMEM.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
73b38711
DL
632
/* Per-netns proc teardown: mirror of ip_rt_do_proc_init(). */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
641
/* pernet hooks so each namespace gets its own proc entries. */
static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};
646
/* Register the per-netns proc hooks at boot. */
static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
651
107f1634 652#else
/* No-op stub when CONFIG_PROC_FS is disabled. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
1da177e4 657#endif /* CONFIG_PROC_FS */
e905a9ed 658
/* Schedule a route for destruction after an RCU-bh grace period. */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
663
/* Drop the caller's reference and schedule RCU-deferred destruction
 * of the route (same deferral as rt_free()).
 */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	rt_free(rt);
}
669
/* True when the entry is a cheap eviction candidate: broadcast or
 * multicast input routes that collide with other entries in a bucket.
 */
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggresively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}
677
/* True when the entry is worth keeping: redirected/notify routes, or
 * routes carrying learned PMTU state on their inet_peer.
 */
static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}
683
684static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
685{
686 unsigned long age;
687 int ret = 0;
688
d8d1f30b 689 if (atomic_read(&rth->dst.__refcnt))
1da177e4
LT
690 goto out;
691
d8d1f30b 692 age = jiffies - rth->dst.lastuse;
1da177e4
LT
693 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
694 (age <= tmo2 && rt_valuable(rth)))
695 goto out;
696 ret = 1;
697out: return ret;
698}
699
700/* Bits of score are:
701 * 31: very valuable
702 * 30: not quite useless
703 * 29..0: usage counter
704 */
705static inline u32 rt_score(struct rtable *rt)
706{
d8d1f30b 707 u32 score = jiffies - rt->dst.lastuse;
1da177e4
LT
708
709 score = ~score & ~(3<<30);
710
711 if (rt_valuable(rt))
712 score |= (1<<31);
713
c7537967 714 if (rt_is_output_route(rt) ||
1da177e4
LT
715 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
716 score |= (1<<30);
717
718 return score;
719}
720
1080d709
NH
/* True while route caching is enabled for this netns; caching is
 * disabled once emergency rebuilds exceed the sysctl threshold.
 */
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}
726
5e2b61f7
DM
/* True when two entries have identical *hash* inputs (dst, src,
 * input ifindex) — i.e. they land in the same bucket for the same
 * flow, even if tos/mark/oif differ.  Branchless XOR-OR compare.
 */
static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}
734
/* Full lookup-key equality: addresses, mark, tos and both interface
 * indices must all match.  Branchless XOR-OR compare.
 */
static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}
744
b5921910
DL
/* True when both routes belong to the same network namespace. */
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}
749
e84f84f2
DL
/* True when the entry predates the last cache invalidation (its
 * generation id no longer matches the netns generation).
 */
static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
754
beb659bd
ED
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the later case, we want to be reschedule if necessary
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		/* Cheap unlocked peek to skip empty buckets. */
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		/* Unlink matching entries onto a private list under the
		 * bucket lock; entries for other netns (when net != NULL)
		 * stay in place.
		 */
		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		/* Free the unlinked entries outside the lock; readers may
		 * still hold references until the RCU grace period ends.
		 */
		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
805
1080d709
NH
806/*
807 * While freeing expired entries, we compute average chain length
808 * and standard deviation, using fixed-point arithmetic.
809 * This to have an estimation of rt_chain_length_max
810 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
811 * We use 3 bits for frational part, and 29 (or 61) for magnitude.
812 */
813
814#define FRACT_BITS 3
815#define ONE (1UL << FRACT_BITS)
816
98376387
ED
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif)
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}
835
9f28a2fc
ED
/* Periodic cache aging, run from rt_worker_func().  Scans a slice of
 * the hash table sized so the whole table is covered roughly once per
 * ip_rt_gc_timeout, freeing stale-generation and aged-off entries.
 * While scanning it also measures chain lengths (fixed-point, see
 * FRACT_BITS) to update rt_chain_length_max = max(elasticity, avg+4*sd).
 */
static void rt_check_expire(void)
{
	static unsigned int rover;		/* resume point across invocations */
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;	/* sum and sum of squares of lengths */
	unsigned long delta;
	u64 mult;

	/* Number of buckets to visit is proportional to the time elapsed
	 * since the previous run: delta * nr_buckets / gc_timeout.
	 */
	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					/* Halve the grace period as we walk deeper
					 * into the chain, so long chains age faster.
					 */
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					ip_rt_gc_elasticity,
					(avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
916
/*
 * rt_worker_func() is run in process context.
 * we call rt_check_expire() to scan part of the hash table
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	/* Re-arm ourselves: runs every ip_rt_gc_interval jiffies. */
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
926
/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 * many times (2^24) without giving recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	/* Stale cached routes also carry stale peer data; drop it too. */
	inetpeer_invalidate_tree(AF_INET);
}
941
29e75252
ED
/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}
952
/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}
958
1080d709
NH
/* Called when a hash chain exceeds rt_chain_length_max (possible hash
 * attack or severe imbalance): warn (rate limited) and invalidate the
 * whole cache by bumping the generation id.
 */
static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		pr_warn("Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
965
1da177e4
LT
966/*
967 Short description of GC goals.
968
969 We want to build algorithm, which will keep routing cache
970 at some equilibrium point, when number of aged off entries
971 is kept approximately equal to newly generated ones.
972
973 Current expiration strength is variable "expire".
974 We try to adjust it dynamically, so that if networking
975 is idle expires is large enough to keep enough of warm entries,
976 and when load increases it reduces to limit cache size.
977 */
978
569d3645 979static int rt_garbage_collect(struct dst_ops *ops)
1da177e4
LT
980{
981 static unsigned long expire = RT_GC_TIMEOUT;
982 static unsigned long last_gc;
983 static int rover;
984 static int equilibrium;
1c31720a
ED
985 struct rtable *rth;
986 struct rtable __rcu **rthp;
1da177e4
LT
987 unsigned long now = jiffies;
988 int goal;
fc66f95c 989 int entries = dst_entries_get_fast(&ipv4_dst_ops);
1da177e4
LT
990
991 /*
992 * Garbage collection is pretty expensive,
993 * do not make it too frequently.
994 */
995
996 RT_CACHE_STAT_INC(gc_total);
997
998 if (now - last_gc < ip_rt_gc_min_interval &&
fc66f95c 999 entries < ip_rt_max_size) {
1da177e4
LT
1000 RT_CACHE_STAT_INC(gc_ignored);
1001 goto out;
1002 }
1003
fc66f95c 1004 entries = dst_entries_get_slow(&ipv4_dst_ops);
1da177e4 1005 /* Calculate number of entries, which we want to expire now. */
fc66f95c 1006 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1da177e4
LT
1007 if (goal <= 0) {
1008 if (equilibrium < ipv4_dst_ops.gc_thresh)
1009 equilibrium = ipv4_dst_ops.gc_thresh;
fc66f95c 1010 goal = entries - equilibrium;
1da177e4 1011 if (goal > 0) {
b790cedd 1012 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
fc66f95c 1013 goal = entries - equilibrium;
1da177e4
LT
1014 }
1015 } else {
1016 /* We are in dangerous area. Try to reduce cache really
1017 * aggressively.
1018 */
b790cedd 1019 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
fc66f95c 1020 equilibrium = entries - goal;
1da177e4
LT
1021 }
1022
1023 if (now - last_gc >= ip_rt_gc_min_interval)
1024 last_gc = now;
1025
1026 if (goal <= 0) {
1027 equilibrium += goal;
1028 goto work_done;
1029 }
1030
1031 do {
1032 int i, k;
1033
1034 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1035 unsigned long tmo = expire;
1036
1037 k = (k + 1) & rt_hash_mask;
1038 rthp = &rt_hash_table[k].chain;
22c047cc 1039 spin_lock_bh(rt_hash_lock_addr(k));
1c31720a
ED
1040 while ((rth = rcu_dereference_protected(*rthp,
1041 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
e84f84f2 1042 if (!rt_is_expired(rth) &&
29e75252 1043 !rt_may_expire(rth, tmo, expire)) {
1da177e4 1044 tmo >>= 1;
d8d1f30b 1045 rthp = &rth->dst.rt_next;
1da177e4
LT
1046 continue;
1047 }
d8d1f30b 1048 *rthp = rth->dst.rt_next;
1da177e4
LT
1049 rt_free(rth);
1050 goal--;
1da177e4 1051 }
22c047cc 1052 spin_unlock_bh(rt_hash_lock_addr(k));
1da177e4
LT
1053 if (goal <= 0)
1054 break;
1055 }
1056 rover = k;
1057
1058 if (goal <= 0)
1059 goto work_done;
1060
1061 /* Goal is not achieved. We stop process if:
1062
1063 - if expire reduced to zero. Otherwise, expire is halfed.
1064 - if table is not full.
1065 - if we are called from interrupt.
1066 - jiffies check is just fallback/debug loop breaker.
1067 We will not spin here for long time in any case.
1068 */
1069
1070 RT_CACHE_STAT_INC(gc_goal_miss);
1071
1072 if (expire == 0)
1073 break;
1074
1075 expire >>= 1;
1da177e4 1076
fc66f95c 1077 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1da177e4
LT
1078 goto out;
1079 } while (!in_softirq() && time_before_eq(jiffies, now));
1080
fc66f95c
ED
1081 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1082 goto out;
1083 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1da177e4
LT
1084 goto out;
1085 if (net_ratelimit())
058bd4d2 1086 pr_warn("dst cache overflow\n");
1da177e4
LT
1087 RT_CACHE_STAT_INC(gc_dst_overflow);
1088 return 1;
1089
1090work_done:
1091 expire += ip_rt_gc_min_interval;
1092 if (expire > ip_rt_gc_timeout ||
fc66f95c
ED
1093 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1094 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1da177e4 1095 expire = ip_rt_gc_timeout;
1da177e4
LT
1096out: return 0;
1097}
1098
98376387
ED
1099/*
1100 * Returns number of entries in a hash chain that have different hash_inputs
1101 */
1102static int slow_chain_length(const struct rtable *head)
1103{
1104 int length = 0;
1105 const struct rtable *rth = head;
1106
1107 while (rth) {
1108 length += has_noalias(head, rth);
1c31720a 1109 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
98376387
ED
1110 }
1111 return length >> FRACT_BITS;
1112}
1113
d3aaeb38 1114static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
3769cffb 1115{
d3aaeb38
DM
1116 static const __be32 inaddr_any = 0;
1117 struct net_device *dev = dst->dev;
1118 const __be32 *pkey = daddr;
39232973 1119 const struct rtable *rt;
3769cffb
DM
1120 struct neighbour *n;
1121
39232973
DM
1122 rt = (const struct rtable *) dst;
1123
3769cffb 1124 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
d3aaeb38 1125 pkey = &inaddr_any;
39232973
DM
1126 else if (rt->rt_gateway)
1127 pkey = (const __be32 *) &rt->rt_gateway;
d3aaeb38 1128
80703d26 1129 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
d3aaeb38
DM
1130 if (n)
1131 return n;
32092ecf 1132 return neigh_create(&arp_tbl, pkey, dev);
d3aaeb38
DM
1133}
1134
1135static int rt_bind_neighbour(struct rtable *rt)
1136{
1137 struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
3769cffb
DM
1138 if (IS_ERR(n))
1139 return PTR_ERR(n);
69cce1d1 1140 dst_set_neighbour(&rt->dst, n);
3769cffb
DM
1141
1142 return 0;
1143}
1144
b23dd4fe
DM
1145static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1146 struct sk_buff *skb, int ifindex)
1da177e4 1147{
1c31720a
ED
1148 struct rtable *rth, *cand;
1149 struct rtable __rcu **rthp, **candp;
1da177e4 1150 unsigned long now;
1da177e4
LT
1151 u32 min_score;
1152 int chain_length;
1153 int attempts = !in_softirq();
1154
1155restart:
1156 chain_length = 0;
1157 min_score = ~(u32)0;
1158 cand = NULL;
1159 candp = NULL;
1160 now = jiffies;
1161
d8d1f30b 1162 if (!rt_caching(dev_net(rt->dst.dev))) {
73e42897
NH
1163 /*
1164 * If we're not caching, just tell the caller we
1165 * were successful and don't touch the route. The
1166 * caller hold the sole reference to the cache entry, and
1167 * it will be released when the caller is done with it.
1168 * If we drop it here, the callers have no way to resolve routes
1169 * when we're not caching. Instead, just point *rp at rt, so
1170 * the caller gets a single use out of the route
b6280b47
NH
1171 * Note that we do rt_free on this new route entry, so that
1172 * once its refcount hits zero, we are still able to reap it
1173 * (Thanks Alexey)
27b75c95
ED
1174 * Note: To avoid expensive rcu stuff for this uncached dst,
1175 * we set DST_NOCACHE so that dst_release() can free dst without
1176 * waiting a grace period.
73e42897 1177 */
b6280b47 1178
c7d4426a 1179 rt->dst.flags |= DST_NOCACHE;
c7537967 1180 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
3769cffb 1181 int err = rt_bind_neighbour(rt);
b6280b47
NH
1182 if (err) {
1183 if (net_ratelimit())
058bd4d2 1184 pr_warn("Neighbour table failure & not caching routes\n");
27b75c95 1185 ip_rt_put(rt);
b23dd4fe 1186 return ERR_PTR(err);
b6280b47
NH
1187 }
1188 }
1189
b6280b47 1190 goto skip_hashing;
1080d709
NH
1191 }
1192
1da177e4
LT
1193 rthp = &rt_hash_table[hash].chain;
1194
22c047cc 1195 spin_lock_bh(rt_hash_lock_addr(hash));
1c31720a
ED
1196 while ((rth = rcu_dereference_protected(*rthp,
1197 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
e84f84f2 1198 if (rt_is_expired(rth)) {
d8d1f30b 1199 *rthp = rth->dst.rt_next;
29e75252
ED
1200 rt_free(rth);
1201 continue;
1202 }
5e2b61f7 1203 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1da177e4 1204 /* Put it first */
d8d1f30b 1205 *rthp = rth->dst.rt_next;
1da177e4
LT
1206 /*
1207 * Since lookup is lockfree, the deletion
1208 * must be visible to another weakly ordered CPU before
1209 * the insertion at the start of the hash chain.
1210 */
d8d1f30b 1211 rcu_assign_pointer(rth->dst.rt_next,
1da177e4
LT
1212 rt_hash_table[hash].chain);
1213 /*
1214 * Since lookup is lockfree, the update writes
1215 * must be ordered for consistency on SMP.
1216 */
1217 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1218
d8d1f30b 1219 dst_use(&rth->dst, now);
22c047cc 1220 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1221
1222 rt_drop(rt);
b23dd4fe 1223 if (skb)
d8d1f30b 1224 skb_dst_set(skb, &rth->dst);
b23dd4fe 1225 return rth;
1da177e4
LT
1226 }
1227
d8d1f30b 1228 if (!atomic_read(&rth->dst.__refcnt)) {
1da177e4
LT
1229 u32 score = rt_score(rth);
1230
1231 if (score <= min_score) {
1232 cand = rth;
1233 candp = rthp;
1234 min_score = score;
1235 }
1236 }
1237
1238 chain_length++;
1239
d8d1f30b 1240 rthp = &rth->dst.rt_next;
1da177e4
LT
1241 }
1242
1243 if (cand) {
1244 /* ip_rt_gc_elasticity used to be average length of chain
1245 * length, when exceeded gc becomes really aggressive.
1246 *
1247 * The second limit is less certain. At the moment it allows
1248 * only 2 entries per bucket. We will see.
1249 */
1250 if (chain_length > ip_rt_gc_elasticity) {
d8d1f30b 1251 *candp = cand->dst.rt_next;
1da177e4
LT
1252 rt_free(cand);
1253 }
1080d709 1254 } else {
98376387
ED
1255 if (chain_length > rt_chain_length_max &&
1256 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
d8d1f30b 1257 struct net *net = dev_net(rt->dst.dev);
1080d709 1258 int num = ++net->ipv4.current_rt_cache_rebuild_count;
b35ecb5d 1259 if (!rt_caching(net)) {
058bd4d2 1260 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
d8d1f30b 1261 rt->dst.dev->name, num);
1080d709 1262 }
b35ecb5d 1263 rt_emergency_hash_rebuild(net);
6a2bad70
PE
1264 spin_unlock_bh(rt_hash_lock_addr(hash));
1265
5e2b61f7 1266 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
6a2bad70
PE
1267 ifindex, rt_genid(net));
1268 goto restart;
1080d709 1269 }
1da177e4
LT
1270 }
1271
1272 /* Try to bind route to arp only if it is output
1273 route or unicast forwarding path.
1274 */
c7537967 1275 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
3769cffb 1276 int err = rt_bind_neighbour(rt);
1da177e4 1277 if (err) {
22c047cc 1278 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1279
1280 if (err != -ENOBUFS) {
1281 rt_drop(rt);
b23dd4fe 1282 return ERR_PTR(err);
1da177e4
LT
1283 }
1284
1285 /* Neighbour tables are full and nothing
1286 can be released. Try to shrink route cache,
1287 it is most likely it holds some neighbour records.
1288 */
1289 if (attempts-- > 0) {
1290 int saved_elasticity = ip_rt_gc_elasticity;
1291 int saved_int = ip_rt_gc_min_interval;
1292 ip_rt_gc_elasticity = 1;
1293 ip_rt_gc_min_interval = 0;
569d3645 1294 rt_garbage_collect(&ipv4_dst_ops);
1da177e4
LT
1295 ip_rt_gc_min_interval = saved_int;
1296 ip_rt_gc_elasticity = saved_elasticity;
1297 goto restart;
1298 }
1299
1300 if (net_ratelimit())
058bd4d2 1301 pr_warn("ipv4: Neighbour table overflow\n");
1da177e4 1302 rt_drop(rt);
b23dd4fe 1303 return ERR_PTR(-ENOBUFS);
1da177e4
LT
1304 }
1305 }
1306
d8d1f30b 1307 rt->dst.rt_next = rt_hash_table[hash].chain;
1080d709 1308
00269b54
ED
1309 /*
1310 * Since lookup is lockfree, we must make sure
25985edc 1311 * previous writes to rt are committed to memory
00269b54
ED
1312 * before making rt visible to other CPUS.
1313 */
1ddbcb00 1314 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1080d709 1315
22c047cc 1316 spin_unlock_bh(rt_hash_lock_addr(hash));
73e42897 1317
b6280b47 1318skip_hashing:
b23dd4fe 1319 if (skb)
d8d1f30b 1320 skb_dst_set(skb, &rt->dst);
b23dd4fe 1321 return rt;
1da177e4
LT
1322}
1323
6431cbc2
DM
1324static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1325
1326static u32 rt_peer_genid(void)
1327{
1328 return atomic_read(&__rt_peer_genid);
1329}
1330
a48eff12 1331void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1da177e4 1332{
1da177e4
LT
1333 struct inet_peer *peer;
1334
a48eff12 1335 peer = inet_getpeer_v4(daddr, create);
1da177e4 1336
49e8ab03 1337 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1da177e4 1338 inet_putpeer(peer);
6431cbc2
DM
1339 else
1340 rt->rt_peer_genid = rt_peer_genid();
1da177e4
LT
1341}
1342
1343/*
1344 * Peer allocation may fail only in serious out-of-memory conditions. However
1345 * we still can generate some output.
1346 * Random ID selection looks a bit dangerous because we have no chances to
1347 * select ID being unique in a reasonable period of time.
1348 * But broken packet identifier may be better than no packet at all.
1349 */
1350static void ip_select_fb_ident(struct iphdr *iph)
1351{
1352 static DEFINE_SPINLOCK(ip_fb_id_lock);
1353 static u32 ip_fallback_id;
1354 u32 salt;
1355
1356 spin_lock_bh(&ip_fb_id_lock);
e448515c 1357 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1da177e4
LT
1358 iph->id = htons(salt & 0xFFFF);
1359 ip_fallback_id = salt;
1360 spin_unlock_bh(&ip_fb_id_lock);
1361}
1362
1363void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1364{
1365 struct rtable *rt = (struct rtable *) dst;
1366
e688a604 1367 if (rt && !(rt->dst.flags & DST_NOPEER)) {
1da177e4 1368 if (rt->peer == NULL)
a48eff12 1369 rt_bind_peer(rt, rt->rt_dst, 1);
1da177e4
LT
1370
1371 /* If peer is attached to destination, it is never detached,
1372 so that we need not to grab a lock to dereference it.
1373 */
1374 if (rt->peer) {
1375 iph->id = htons(inet_getid(rt->peer, more));
1376 return;
1377 }
e688a604 1378 } else if (!rt)
e905a9ed 1379 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
9c2b3328 1380 __builtin_return_address(0));
1da177e4
LT
1381
1382 ip_select_fb_ident(iph);
1383}
4bc2f18b 1384EXPORT_SYMBOL(__ip_select_ident);
1da177e4
LT
1385
1386static void rt_del(unsigned hash, struct rtable *rt)
1387{
1c31720a
ED
1388 struct rtable __rcu **rthp;
1389 struct rtable *aux;
1da177e4 1390
29e75252 1391 rthp = &rt_hash_table[hash].chain;
22c047cc 1392 spin_lock_bh(rt_hash_lock_addr(hash));
1da177e4 1393 ip_rt_put(rt);
1c31720a
ED
1394 while ((aux = rcu_dereference_protected(*rthp,
1395 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
e84f84f2 1396 if (aux == rt || rt_is_expired(aux)) {
d8d1f30b 1397 *rthp = aux->dst.rt_next;
29e75252
ED
1398 rt_free(aux);
1399 continue;
1da177e4 1400 }
d8d1f30b 1401 rthp = &aux->dst.rt_next;
29e75252 1402 }
22c047cc 1403 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1404}
1405
de398fb8 1406static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
9cc20b26
ED
1407{
1408 struct rtable *rt = (struct rtable *) dst;
1409 __be32 orig_gw = rt->rt_gateway;
1410 struct neighbour *n, *old_n;
1411
1412 dst_confirm(&rt->dst);
1413
1414 rt->rt_gateway = peer->redirect_learned.a4;
1415
1416 n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
de398fb8
DM
1417 if (IS_ERR(n)) {
1418 rt->rt_gateway = orig_gw;
1419 return;
1420 }
9cc20b26
ED
1421 old_n = xchg(&rt->dst._neighbour, n);
1422 if (old_n)
1423 neigh_release(old_n);
de398fb8
DM
1424 if (!(n->nud_state & NUD_VALID)) {
1425 neigh_event_send(n, NULL);
9cc20b26
ED
1426 } else {
1427 rt->rt_flags |= RTCF_REDIRECTED;
1428 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1429 }
9cc20b26
ED
1430}
1431
ed7865a4 1432/* called in rcu_read_lock() section */
f7655229
AV
1433void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1434 __be32 saddr, struct net_device *dev)
1da177e4 1435{
7cc9150e 1436 int s, i;
ed7865a4 1437 struct in_device *in_dev = __in_dev_get_rcu(dev);
7cc9150e
FL
1438 __be32 skeys[2] = { saddr, 0 };
1439 int ikeys[2] = { dev->ifindex, 0 };
f39925db 1440 struct inet_peer *peer;
317805b8 1441 struct net *net;
1da177e4 1442
1da177e4
LT
1443 if (!in_dev)
1444 return;
1445
c346dca1 1446 net = dev_net(dev);
9d4fb27d
JP
1447 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1448 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1449 ipv4_is_zeronet(new_gw))
1da177e4
LT
1450 goto reject_redirect;
1451
1452 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1453 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1454 goto reject_redirect;
1455 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1456 goto reject_redirect;
1457 } else {
317805b8 1458 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1da177e4
LT
1459 goto reject_redirect;
1460 }
1461
7cc9150e
FL
1462 for (s = 0; s < 2; s++) {
1463 for (i = 0; i < 2; i++) {
9cc20b26
ED
1464 unsigned int hash;
1465 struct rtable __rcu **rthp;
1466 struct rtable *rt;
1467
1468 hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1469
1470 rthp = &rt_hash_table[hash].chain;
1471
1472 while ((rt = rcu_dereference(*rthp)) != NULL) {
1473 rthp = &rt->dst.rt_next;
1474
1475 if (rt->rt_key_dst != daddr ||
1476 rt->rt_key_src != skeys[s] ||
1477 rt->rt_oif != ikeys[i] ||
1478 rt_is_input_route(rt) ||
1479 rt_is_expired(rt) ||
1480 !net_eq(dev_net(rt->dst.dev), net) ||
1481 rt->dst.error ||
1482 rt->dst.dev != dev ||
1483 rt->rt_gateway != old_gw)
1484 continue;
e905a9ed 1485
9cc20b26
ED
1486 if (!rt->peer)
1487 rt_bind_peer(rt, rt->rt_dst, 1);
1da177e4 1488
9cc20b26
ED
1489 peer = rt->peer;
1490 if (peer) {
ac3f48de 1491 if (peer->redirect_learned.a4 != new_gw) {
9cc20b26
ED
1492 peer->redirect_learned.a4 = new_gw;
1493 atomic_inc(&__rt_peer_genid);
1494 }
1495 check_peer_redir(&rt->dst, peer);
1496 }
7cc9150e 1497 }
7cc9150e 1498 }
1da177e4 1499 }
1da177e4
LT
1500 return;
1501
1502reject_redirect:
1503#ifdef CONFIG_IP_ROUTE_VERBOSE
1504 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
058bd4d2 1505 pr_info("Redirect from %pI4 on %s about %pI4 ignored\n"
673d57e7 1506 " Advised path = %pI4 -> %pI4\n",
058bd4d2
JP
1507 &old_gw, dev->name, &new_gw,
1508 &saddr, &daddr);
1da177e4 1509#endif
ed7865a4 1510 ;
1da177e4
LT
1511}
1512
fe6fe792
ED
1513static bool peer_pmtu_expired(struct inet_peer *peer)
1514{
1515 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1516
1517 return orig &&
1518 time_after_eq(jiffies, orig) &&
1519 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1520}
1521
1522static bool peer_pmtu_cleaned(struct inet_peer *peer)
1523{
1524 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1525
1526 return orig &&
1527 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1528}
1529
1da177e4
LT
1530static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1531{
ee6b9673 1532 struct rtable *rt = (struct rtable *)dst;
1da177e4
LT
1533 struct dst_entry *ret = dst;
1534
1535 if (rt) {
d11a4dc1 1536 if (dst->obsolete > 0) {
1da177e4
LT
1537 ip_rt_put(rt);
1538 ret = NULL;
2c8cec5c 1539 } else if (rt->rt_flags & RTCF_REDIRECTED) {
5e2b61f7
DM
1540 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1541 rt->rt_oif,
e84f84f2 1542 rt_genid(dev_net(dst->dev)));
1da177e4
LT
1543 rt_del(hash, rt);
1544 ret = NULL;
fe6fe792
ED
1545 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1546 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1da177e4
LT
1547 }
1548 }
1549 return ret;
1550}
1551
/*
 * Algorithm:
 * 1. The first ip_rt_redirect_number redirects are sent with exponential
 *    backoff, then we stop sending them at all, assuming that the host
 *    ignores our redirects.
 * 2. If we did not see packets requiring redirects during
 *    ip_rt_redirect_silence, we assume that the host forgot the
 *    redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load
 * limiting in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
1567
1568void ip_rt_send_redirect(struct sk_buff *skb)
1569{
511c3f92 1570 struct rtable *rt = skb_rtable(skb);
30038fc6 1571 struct in_device *in_dev;
92d86829 1572 struct inet_peer *peer;
30038fc6 1573 int log_martians;
1da177e4 1574
30038fc6 1575 rcu_read_lock();
d8d1f30b 1576 in_dev = __in_dev_get_rcu(rt->dst.dev);
30038fc6
ED
1577 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1578 rcu_read_unlock();
1da177e4 1579 return;
30038fc6
ED
1580 }
1581 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1582 rcu_read_unlock();
1da177e4 1583
92d86829 1584 if (!rt->peer)
a48eff12 1585 rt_bind_peer(rt, rt->rt_dst, 1);
92d86829
DM
1586 peer = rt->peer;
1587 if (!peer) {
1588 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1589 return;
1590 }
1591
1da177e4
LT
1592 /* No redirected packets during ip_rt_redirect_silence;
1593 * reset the algorithm.
1594 */
92d86829
DM
1595 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1596 peer->rate_tokens = 0;
1da177e4
LT
1597
1598 /* Too many ignored redirects; do not send anything
d8d1f30b 1599 * set dst.rate_last to the last seen redirected packet.
1da177e4 1600 */
92d86829
DM
1601 if (peer->rate_tokens >= ip_rt_redirect_number) {
1602 peer->rate_last = jiffies;
30038fc6 1603 return;
1da177e4
LT
1604 }
1605
1606 /* Check for load limit; set rate_last to the latest sent
1607 * redirect.
1608 */
92d86829 1609 if (peer->rate_tokens == 0 ||
14fb8a76 1610 time_after(jiffies,
92d86829
DM
1611 (peer->rate_last +
1612 (ip_rt_redirect_load << peer->rate_tokens)))) {
1da177e4 1613 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
92d86829
DM
1614 peer->rate_last = jiffies;
1615 ++peer->rate_tokens;
1da177e4 1616#ifdef CONFIG_IP_ROUTE_VERBOSE
30038fc6 1617 if (log_martians &&
92d86829 1618 peer->rate_tokens == ip_rt_redirect_number &&
1da177e4 1619 net_ratelimit())
058bd4d2
JP
1620 pr_warn("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1621 &ip_hdr(skb)->saddr, rt->rt_iif,
673d57e7 1622 &rt->rt_dst, &rt->rt_gateway);
1da177e4
LT
1623#endif
1624 }
1da177e4
LT
1625}
1626
1627static int ip_error(struct sk_buff *skb)
1628{
511c3f92 1629 struct rtable *rt = skb_rtable(skb);
92d86829 1630 struct inet_peer *peer;
1da177e4 1631 unsigned long now;
92d86829 1632 bool send;
1da177e4
LT
1633 int code;
1634
d8d1f30b 1635 switch (rt->dst.error) {
4500ebf8
JP
1636 case EINVAL:
1637 default:
1638 goto out;
1639 case EHOSTUNREACH:
1640 code = ICMP_HOST_UNREACH;
1641 break;
1642 case ENETUNREACH:
1643 code = ICMP_NET_UNREACH;
1644 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1645 IPSTATS_MIB_INNOROUTES);
1646 break;
1647 case EACCES:
1648 code = ICMP_PKT_FILTERED;
1649 break;
1da177e4
LT
1650 }
1651
92d86829 1652 if (!rt->peer)
a48eff12 1653 rt_bind_peer(rt, rt->rt_dst, 1);
92d86829
DM
1654 peer = rt->peer;
1655
1656 send = true;
1657 if (peer) {
1658 now = jiffies;
1659 peer->rate_tokens += now - peer->rate_last;
1660 if (peer->rate_tokens > ip_rt_error_burst)
1661 peer->rate_tokens = ip_rt_error_burst;
1662 peer->rate_last = now;
1663 if (peer->rate_tokens >= ip_rt_error_cost)
1664 peer->rate_tokens -= ip_rt_error_cost;
1665 else
1666 send = false;
1da177e4 1667 }
92d86829
DM
1668 if (send)
1669 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1da177e4
LT
1670
1671out: kfree_skb(skb);
1672 return 0;
e905a9ed 1673}
1da177e4
LT
1674
/*
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */
static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};
1682
5969f71d 1683static inline unsigned short guess_mtu(unsigned short old_mtu)
1da177e4
LT
1684{
1685 int i;
e905a9ed 1686
1da177e4
LT
1687 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1688 if (old_mtu > mtu_plateau[i])
1689 return mtu_plateau[i];
1690 return 68;
1691}
1692
b71d1d42 1693unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
0010e465
TT
1694 unsigned short new_mtu,
1695 struct net_device *dev)
1da177e4 1696{
1da177e4 1697 unsigned short old_mtu = ntohs(iph->tot_len);
1da177e4 1698 unsigned short est_mtu = 0;
2c8cec5c 1699 struct inet_peer *peer;
1da177e4 1700
2c8cec5c
DM
1701 peer = inet_getpeer_v4(iph->daddr, 1);
1702 if (peer) {
1703 unsigned short mtu = new_mtu;
1da177e4 1704
2c8cec5c
DM
1705 if (new_mtu < 68 || new_mtu >= old_mtu) {
1706 /* BSD 4.2 derived systems incorrectly adjust
1707 * tot_len by the IP header length, and report
1708 * a zero MTU in the ICMP message.
1709 */
1710 if (mtu == 0 &&
1711 old_mtu >= 68 + (iph->ihl << 2))
1712 old_mtu -= iph->ihl << 2;
1713 mtu = guess_mtu(old_mtu);
1714 }
0010e465 1715
2c8cec5c
DM
1716 if (mtu < ip_rt_min_pmtu)
1717 mtu = ip_rt_min_pmtu;
1718 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
46af3180
HS
1719 unsigned long pmtu_expires;
1720
1721 pmtu_expires = jiffies + ip_rt_mtu_expires;
1722 if (!pmtu_expires)
1723 pmtu_expires = 1UL;
1724
2c8cec5c
DM
1725 est_mtu = mtu;
1726 peer->pmtu_learned = mtu;
46af3180 1727 peer->pmtu_expires = pmtu_expires;
59445b6b 1728 atomic_inc(&__rt_peer_genid);
2c8cec5c 1729 }
1da177e4 1730
2c8cec5c 1731 inet_putpeer(peer);
1da177e4
LT
1732 }
1733 return est_mtu ? : new_mtu;
1734}
1735
2c8cec5c
DM
1736static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1737{
fe6fe792 1738 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
2c8cec5c 1739
fe6fe792
ED
1740 if (!expires)
1741 return;
46af3180 1742 if (time_before(jiffies, expires)) {
2c8cec5c
DM
1743 u32 orig_dst_mtu = dst_mtu(dst);
1744 if (peer->pmtu_learned < orig_dst_mtu) {
1745 if (!peer->pmtu_orig)
1746 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1747 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1748 }
1749 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1750 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1751}
1752
1da177e4
LT
1753static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1754{
2c8cec5c
DM
1755 struct rtable *rt = (struct rtable *) dst;
1756 struct inet_peer *peer;
1757
1758 dst_confirm(dst);
1759
1760 if (!rt->peer)
a48eff12 1761 rt_bind_peer(rt, rt->rt_dst, 1);
2c8cec5c
DM
1762 peer = rt->peer;
1763 if (peer) {
fe6fe792
ED
1764 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1765
2c8cec5c 1766 if (mtu < ip_rt_min_pmtu)
1da177e4 1767 mtu = ip_rt_min_pmtu;
fe6fe792 1768 if (!pmtu_expires || mtu < peer->pmtu_learned) {
46af3180
HS
1769
1770 pmtu_expires = jiffies + ip_rt_mtu_expires;
1771 if (!pmtu_expires)
1772 pmtu_expires = 1UL;
1773
2c8cec5c 1774 peer->pmtu_learned = mtu;
46af3180 1775 peer->pmtu_expires = pmtu_expires;
2c8cec5c
DM
1776
1777 atomic_inc(&__rt_peer_genid);
1778 rt->rt_peer_genid = rt_peer_genid();
1da177e4 1779 }
46af3180 1780 check_peer_pmtu(dst, peer);
1da177e4
LT
1781 }
1782}
1783
f39925db 1784
de398fb8 1785static void ipv4_validate_peer(struct rtable *rt)
1da177e4 1786{
6431cbc2 1787 if (rt->rt_peer_genid != rt_peer_genid()) {
2c8cec5c
DM
1788 struct inet_peer *peer;
1789
6431cbc2 1790 if (!rt->peer)
a48eff12 1791 rt_bind_peer(rt, rt->rt_dst, 0);
6431cbc2 1792
2c8cec5c 1793 peer = rt->peer;
fe6fe792 1794 if (peer) {
efbc368d 1795 check_peer_pmtu(&rt->dst, peer);
2c8cec5c 1796
fe6fe792 1797 if (peer->redirect_learned.a4 &&
de398fb8
DM
1798 peer->redirect_learned.a4 != rt->rt_gateway)
1799 check_peer_redir(&rt->dst, peer);
f39925db
DM
1800 }
1801
6431cbc2
DM
1802 rt->rt_peer_genid = rt_peer_genid();
1803 }
efbc368d
DM
1804}
1805
1806static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1807{
1808 struct rtable *rt = (struct rtable *) dst;
1809
1810 if (rt_is_expired(rt))
1811 return NULL;
de398fb8 1812 ipv4_validate_peer(rt);
d11a4dc1 1813 return dst;
1da177e4
LT
1814}
1815
1816static void ipv4_dst_destroy(struct dst_entry *dst)
1817{
1818 struct rtable *rt = (struct rtable *) dst;
1819 struct inet_peer *peer = rt->peer;
1da177e4 1820
62fa8a84
DM
1821 if (rt->fi) {
1822 fib_info_put(rt->fi);
1823 rt->fi = NULL;
1824 }
1da177e4
LT
1825 if (peer) {
1826 rt->peer = NULL;
1827 inet_putpeer(peer);
1828 }
1da177e4
LT
1829}
1830
1da177e4
LT
1831
1832static void ipv4_link_failure(struct sk_buff *skb)
1833{
1834 struct rtable *rt;
1835
1836 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1837
511c3f92 1838 rt = skb_rtable(skb);
fe6fe792
ED
1839 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1840 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1da177e4
LT
1841}
1842
1843static int ip_rt_bug(struct sk_buff *skb)
1844{
673d57e7
HH
1845 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1846 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1da177e4
LT
1847 skb->dev ? skb->dev->name : "?");
1848 kfree_skb(skb);
c378a9c0 1849 WARN_ON(1);
1da177e4
LT
1850 return 0;
1851}
1852
1853/*
1854 We do not cache source address of outgoing interface,
1855 because it is used only by IP RR, TS and SRR options,
1856 so that it out of fast path.
1857
1858 BTW remember: "addr" is allowed to be not aligned
1859 in IP options!
1860 */
1861
8e36360a 1862void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1da177e4 1863{
a61ced5d 1864 __be32 src;
1da177e4 1865
c7537967 1866 if (rt_is_output_route(rt))
c5be24ff 1867 src = ip_hdr(skb)->saddr;
ebc0ffae 1868 else {
8e36360a
DM
1869 struct fib_result res;
1870 struct flowi4 fl4;
1871 struct iphdr *iph;
1872
1873 iph = ip_hdr(skb);
1874
1875 memset(&fl4, 0, sizeof(fl4));
1876 fl4.daddr = iph->daddr;
1877 fl4.saddr = iph->saddr;
b0fe4a31 1878 fl4.flowi4_tos = RT_TOS(iph->tos);
8e36360a
DM
1879 fl4.flowi4_oif = rt->dst.dev->ifindex;
1880 fl4.flowi4_iif = skb->dev->ifindex;
1881 fl4.flowi4_mark = skb->mark;
5e2b61f7 1882
ebc0ffae 1883 rcu_read_lock();
68a5e3dd 1884 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
436c3b66 1885 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
ebc0ffae
ED
1886 else
1887 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1da177e4 1888 RT_SCOPE_UNIVERSE);
ebc0ffae
ED
1889 rcu_read_unlock();
1890 }
1da177e4
LT
1891 memcpy(addr, &src, 4);
1892}
1893
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Fill only the halves of tclassid that are still zero. */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1903
0dbaee3b
DM
1904static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1905{
1906 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1907
1908 if (advmss == 0) {
1909 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1910 ip_rt_min_advmss);
1911 if (advmss > 65535 - 40)
1912 advmss = 65535 - 40;
1913 }
1914 return advmss;
1915}
1916
ebb762f2 1917static unsigned int ipv4_mtu(const struct dst_entry *dst)
d33e4553 1918{
261663b0 1919 const struct rtable *rt = (const struct rtable *) dst;
618f9bc7
SK
1920 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1921
261663b0 1922 if (mtu && rt_is_output_route(rt))
618f9bc7
SK
1923 return mtu;
1924
1925 mtu = dst->dev->mtu;
d33e4553
DM
1926
1927 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
d33e4553
DM
1928
1929 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1930 mtu = 576;
1931 }
1932
1933 if (mtu > IP_MAX_MTU)
1934 mtu = IP_MAX_MTU;
1935
1936 return mtu;
1937}
1938
813b3b5d 1939static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
5e2b61f7 1940 struct fib_info *fi)
a4daad6b 1941{
0131ba45
DM
1942 struct inet_peer *peer;
1943 int create = 0;
a4daad6b 1944
0131ba45
DM
1945 /* If a peer entry exists for this destination, we must hook
1946 * it up in order to get at cached metrics.
1947 */
813b3b5d 1948 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
0131ba45
DM
1949 create = 1;
1950
3c0afdca 1951 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
0131ba45 1952 if (peer) {
3c0afdca 1953 rt->rt_peer_genid = rt_peer_genid();
a4daad6b
DM
1954 if (inet_metrics_new(peer))
1955 memcpy(peer->metrics, fi->fib_metrics,
1956 sizeof(u32) * RTAX_MAX);
1957 dst_init_metrics(&rt->dst, peer->metrics, false);
2c8cec5c 1958
fe6fe792 1959 check_peer_pmtu(&rt->dst, peer);
ac3f48de 1960
f39925db
DM
1961 if (peer->redirect_learned.a4 &&
1962 peer->redirect_learned.a4 != rt->rt_gateway) {
1963 rt->rt_gateway = peer->redirect_learned.a4;
1964 rt->rt_flags |= RTCF_REDIRECTED;
1965 }
0131ba45
DM
1966 } else {
1967 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1968 rt->fi = fi;
1969 atomic_inc(&fi->fib_clntref);
1970 }
1971 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
a4daad6b
DM
1972 }
1973}
1974
/* Fill in next-hop information on a freshly built route cache entry.
 *
 * @rt:   the rtable being initialized
 * @fl4:  flow key for the lookup (may be NULL on the input path)
 * @res:  FIB lookup result the route was derived from
 * @fi:   fib_info supplying gateway/metrics (NULL for broadcast/multicast)
 * @type: route type (RTN_*)
 * @itag: classid tag from source validation
 */
static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		/* Only use the FIB gateway when the nexthop is directly
		 * reachable (link scope); otherwise keep the default. */
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	/* Clamp MTU and advertised MSS to protocol limits
	 * (65535 - 40 leaves room for IP + TCP headers). */
	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}
2003
5c1e6aa3
DM
2004static struct rtable *rt_dst_alloc(struct net_device *dev,
2005 bool nopolicy, bool noxfrm)
0c4dcd58 2006{
5c1e6aa3
DM
2007 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2008 DST_HOST |
2009 (nopolicy ? DST_NOPOLICY : 0) |
2010 (noxfrm ? DST_NOXFRM : 0));
0c4dcd58
DM
2011}
2012
/* called in rcu_read_lock() section */
/* Build a route cache entry for a received multicast packet.
 *
 * @skb:   the incoming packet
 * @daddr: destination (multicast) address
 * @saddr: source address
 * @tos:   TOS byte from the IP header
 * @dev:   device the packet arrived on
 * @our:   nonzero if this host is a member of the group (local delivery)
 *
 * Returns 0 on success or a negative errno.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	/* A multicast/broadcast/loopback source, or a non-IP frame,
	 * can never legitimately produce a multicast route. */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		/* Zero source is only valid for link-local multicast. */
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto e_err;
	}
	/* NOTE(review): allocates against init_net.loopback_dev rather than
	 * dev_net(dev)->loopback_dev — presumably fine here since the entry
	 * is keyed by the input device; confirm against later upstream fixes.
	 */
	rth = rt_dst_alloc(init_net.loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	/* Multicast input routes must never be used for output. */
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	/* Non-link-local groups may need to be forwarded by mrouted. */
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
2092
2093
/* Account and (optionally) log a packet with a martian source address.
 * With CONFIG_IP_ROUTE_VERBOSE and log_martians enabled, rate-limited
 * warnings include the link-layer header dump, since that is the only
 * hint to the real origin (RFC 1812).
 */
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}
2118
/* called in rcu_read_lock() section */
/* Create a forwarding route cache entry for an input packet.
 *
 * Validates the source against the FIB result, decides redirect and
 * proxy-ARP policy, then allocates and fills the rtable. On success the
 * new entry is stored in *@result and 0 is returned; otherwise a
 * negative errno.
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		/* The FIB handed us a nexthop device without IPv4 state;
		 * that should be impossible for a forwarding result. */
		if (net_ratelimit())
			pr_crit("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}


	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	/* err > 0 means the source is reachable via another interface. */
	if (err)
		flags |= RTCF_DIRECTSRC;

	/* Packet would go back out the same interface: candidate for an
	 * ICMP redirect if the media is shared or the gateway is on-link. */
	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif 	= in_dev->dev->ifindex;
	rth->rt_oif 	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
1da177e4 2210
/* Create and cache a forwarding route for an input packet.
 * Selects a multipath nexthop when configured, builds the entry via
 * __mkroute_input(), then inserts it into the route hash.
 * Returns 0 or a negative errno.
 */
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable* rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
		       rt_genid(dev_net(rth->dst.dev)));
	/* rt_intern_hash() either returns our entry (possibly an existing
	 * equivalent one) or an ERR_PTR; it consumes rth on failure. */
	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
	if (IS_ERR(rth))
		return PTR_ERR(rth);
	return 0;
}
2239
1da177e4
LT
2240/*
2241 * NOTE. We drop all the packets that has local source
2242 * addresses, because every properly looped back packet
2243 * must have correct destination already attached by output routine.
2244 *
2245 * Such approach solves two big problems:
2246 * 1. Not simplex devices are handled properly.
2247 * 2. IP spoofing attempts are filtered with 100% of guarantee.
ebc0ffae 2248 * called with rcu_read_lock()
1da177e4
LT
2249 */
2250
/* Slow-path input route resolution (cache miss).
 *
 * Performs martian filtering, FIB lookup, and builds either a local,
 * broadcast, unreachable, or forwarding route cache entry. Called with
 * rcu_read_lock() held (see caller ip_route_input_common()).
 * Returns 0 on success or a negative errno.
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	struct net    * net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0) {
		/* No route: hosts get -EHOSTUNREACH, routers fall through
		 * to an unreachable cache entry via no_route. */
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	/* Local, broadcast and unreachable destinations all share this
	 * path: deliver (or error) through the loopback device. */
	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(net);
	rth->rt_flags 	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		pr_warn("martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
2431
/* Input routing entry point: look up the route cache first; on a miss,
 * handle multicast inline or fall back to ip_route_input_slow().
 *
 * @noref: when true, attach the dst to the skb without taking a
 *         reference (caller stays inside an RCU section).
 * Returns 0 on success or a negative errno.
 */
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable * rth;
	unsigned	hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		/* Branch-free compare of dst/src/iif/tos: OR the XORs and
		 * test the whole key in one comparison. */
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_route_iif ^ iif) |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As result the host on multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   comparing with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
1da177e4 2514
/* called with rcu_read_lock() */
/* Build an output route cache entry for a resolved flow.
 *
 * @res:       FIB lookup result (fi may be dropped for bcast/mcast)
 * @fl4:       possibly rewritten flow (addresses filled in by the resolver)
 * @orig_*:    the caller's original daddr/saddr/oif/tos, used as the
 *             cache key so future lookups with the same inputs hit
 * @dev_out:   selected output device
 * @flags:     initial RTCF_* flags
 *
 * Returns the new rtable or an ERR_PTR.
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, __u8 orig_rtos,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	/* Loopback source may only leave through a loopback device. */
	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
		return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		/* Not a member of the group: no local delivery. */
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If multicast route do not exist use
		 * default one, but do not gateway in this case.
		 * Yes, it is hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_key_dst	= orig_daddr;
	rth->rt_key_src	= orig_saddr;
	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_key_tos	= orig_rtos;
	rth->rt_dst	= fl4->daddr;
	rth->rt_src	= fl4->saddr;
	rth->rt_route_iif = 0;
	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
	rth->rt_oif	= orig_oif;
	rth->rt_mark    = fl4->flowi4_mark;
	rth->rt_gateway = fl4->daddr;
	rth->rt_spec_dst= fl4->saddr;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl4->daddr;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl4->saddr;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	return rth;
}
2615
1da177e4
LT
2616/*
2617 * Major route resolver routine.
0197aa38 2618 * called with rcu_read_lock();
1da177e4
LT
2619 */
2620
/* Major output route resolver (cache miss path).
 * Called with rcu_read_lock(); may rewrite @fl4 (saddr/daddr/oif/scope)
 * while keeping the caller's original values as the cache key.
 * Returns the route or an ERR_PTR.
 */
static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	__u8 tos = RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	__be32 orig_daddr;
	__be32 orig_saddr;
	int orig_oif;

	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	/* Remember the caller's key before we start rewriting fl4. */
	orig_daddr = fl4->daddr;
	orig_saddr = fl4->saddr;
	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = net->loopback_dev->ifindex;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic,vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		/* No destination at all: route to ourselves via loopback. */
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, routing tables are wrong. Assume,
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;


make_route:
	/* Build the entry and intern it under the ORIGINAL key. */
	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
			       tos, dev_out, flags);
	if (!IS_ERR(rth)) {
		unsigned int hash;

		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
			       rt_genid(dev_net(dev_out)));
		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
	}

out:
	rcu_read_unlock();
	return rth;
}
2813
/* Output route lookup: try the route cache (under rcu_read_lock_bh),
 * fall back to ip_route_output_slow() on a miss or when caching is off.
 * On a hit, fills in any zero saddr/daddr in @flp4 from the cached entry.
 * Returns the route or an ERR_PTR.
 */
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
2854
/* Blackhole dst .check op: always report the entry as invalid so users
 * re-resolve instead of reusing the blackhole. */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2859
ebb762f2 2860static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 2861{
618f9bc7
SK
2862 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2863
2864 return mtu ? : dst->dev->mtu;
ec831ea7
RD
2865}
2866
/* Blackhole dst .update_pmtu op: deliberately a no-op. */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
2870
/* Blackhole dst .cow_metrics op: never copy-on-write metrics. */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2876
/* dst_ops for blackhole routes: entries that accept packets but never
 * forward them (input/output are set to dst_discard by the creator). */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
2888
/* Clone @dst_orig into a blackhole route that silently discards traffic
 * (used e.g. while an xfrm state is being resolved). Copies the route
 * keys, metrics and refcounted peer/fi, then releases @dst_orig.
 * Returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		/* Blackhole: both directions drop the packet. */
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		/* Shared peer/fib_info: take our own references. */
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
2935
9d6ec938 2936struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
b23dd4fe 2937 struct sock *sk)
1da177e4 2938{
9d6ec938 2939 struct rtable *rt = __ip_route_output_key(net, flp4);
1da177e4 2940
b23dd4fe
DM
2941 if (IS_ERR(rt))
2942 return rt;
1da177e4 2943
56157872 2944 if (flp4->flowi4_proto)
9d6ec938
DM
2945 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2946 flowi4_to_flowi(flp4),
2947 sk, 0);
1da177e4 2948
b23dd4fe 2949 return rt;
1da177e4 2950}
d8c97a94
ACM
2951EXPORT_SYMBOL_GPL(ip_route_output_flow);
2952
4feb88e5
BT
/*
 * Fill one RTM_NEWROUTE netlink message describing the route attached
 * to @skb (via skb_rtable()).
 *
 * Returns the value of nlmsg_end() on success, 0 when a multicast
 * resolution was queued instead of answered (!nowait path), or
 * -EMSGSIZE when the message does not fit.
 *
 * Note: the NLA_PUT*() macros jump to nla_put_failure on overflow.
 */
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	const struct inet_peer *peer = rt->peer;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->rt_key_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	/* Cache entries report only the upper flag bits plus CLONED. */
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->rt_key_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);

	error = rt->dst.error;
	if (peer) {
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&peer->ip_id_count) & 0xffff;
		if (peer->tcp_ts_stamp) {
			ts = peer->tcp_ts;
			tsage = get_seconds() - peer->tcp_ts_stamp;
		}
		/* Convert the absolute PMTU expiry into a remaining delta;
		 * an already-passed deadline is reported as 0. */
		expires = ACCESS_ONCE(peer->pmtu_expires);
		if (expires) {
			if (time_before(jiffies, expires))
				expires -= jiffies;
			else
				expires = 0;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		/* Non-local multicast with forwarding enabled: ask the
		 * multicast router code to fill in/resolve the route. */
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;	/* resolution queued */
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
3060
/*
 * RTM_GETROUTE handler: answer a userspace route query.
 *
 * With RTA_IIF set, the query is simulated as if a packet arrived on
 * that device (ip_route_input() on a dummy skb); otherwise an output
 * route lookup is performed.  The result is serialized by
 * rt_fill_info() and unicast back to the requester.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	/* All attributes are optional; absent ones default to 0. */
	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		/* Simulate reception of the queried packet on @dev. */
		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
3155
/*
 * Netlink dump callback: walk the whole route cache hash table and
 * emit one RTM_NEWROUTE message per live entry belonging to the
 * requesting namespace.
 *
 * Resume state lives in cb->args[0] (hash bucket) and cb->args[1]
 * (index within the bucket) so a partially filled skb can be continued
 * on the next call.  Returns skb->len, as dump callbacks must.
 */
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		/* Chains are RCU-protected and modified from BH context. */
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			/* Skip foreign-namespace entries and those already
			 * dumped in a previous pass over this bucket. */
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			/* noref: we only borrow the dst while building the
			 * message, no refcount is taken. */
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				/* skb full: stop here, resume from (h, idx). */
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
3197
/* Flush the routing cache of @in_dev's namespace on a multicast
 * configuration change (delay 0 = immediate flush). */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
3202
3203#ifdef CONFIG_SYSCTL
/*
 * Handler for the write-only /proc/sys/net/ipv4/route/flush sysctl.
 * Writing an integer flushes the route cache with that delay; reading
 * is rejected with -EINVAL.
 */
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		/* Parse into a stack copy so the user's value lands in
		 * flush_delay rather than in the shared table's .data. */
		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;	/* stashed by sysctl_route_net_init() */
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
3224
/*
 * Tunables exposed under /proc/sys/net/ipv4/route/.  All entries are
 * plain ints; the *_jiffies handlers convert user-visible seconds (or
 * milliseconds for gc_min_interval_ms) to jiffies.
 */
static ctl_table ipv4_route_table[] = {
	{
		/* Start GC when the cache grows past this many entries. */
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Same variable as above, exposed in milliseconds. */
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* ICMP-redirect rate limiting parameters. */
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* ICMP error rate limiting (token-bucket cost/burst). */
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
39a23e75 3335
2f4520d3
AV
/* Placeholder directory with no entries (used for "neigh" below). */
static struct ctl_table empty[1];

/* Static skeleton for /proc/sys/net/ipv4/{route,neigh}, registered
 * early from ip_static_sysctl_init(). */
static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route", 
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh", 
	  .mode = 0555, .child = empty},
	{ }
};
3346
/* sysctl path prefix "net/ipv4" for the static skeleton above. */
static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};
3352
39a23e75
DL
/* Per-netns "flush" node; write-only (0200), .extra1 holds the struct
 * net pointer consumed by ipv4_sysctl_rtcache_flush(). */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
3362
/* sysctl path prefix "net/ipv4/route" for the per-netns flush table. */
static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
3369
39a23e75
DL
3370static __net_init int sysctl_route_net_init(struct net *net)
3371{
3372 struct ctl_table *tbl;
3373
3374 tbl = ipv4_route_flush_table;
09ad9bc7 3375 if (!net_eq(net, &init_net)) {
39a23e75
DL
3376 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3377 if (tbl == NULL)
3378 goto err_dup;
3379 }
3380 tbl[0].extra1 = net;
3381
3382 net->ipv4.route_hdr =
3383 register_net_sysctl_table(net, ipv4_route_path, tbl);
3384 if (net->ipv4.route_hdr == NULL)
3385 goto err_reg;
3386 return 0;
3387
3388err_reg:
3389 if (tbl != ipv4_route_flush_table)
3390 kfree(tbl);
3391err_dup:
3392 return -ENOMEM;
3393}
3394
3395static __net_exit void sysctl_route_net_exit(struct net *net)
3396{
3397 struct ctl_table *tbl;
3398
3399 tbl = net->ipv4.route_hdr->ctl_table_arg;
3400 unregister_net_sysctl_table(net->ipv4.route_hdr);
3401 BUG_ON(tbl == ipv4_route_flush_table);
3402 kfree(tbl);
3403}
3404
/* Pernet hooks tying the route sysctls to namespace lifetime. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
1da177e4
LT
3409#endif
3410
/*
 * Per-netns init: seed the route-cache generation id and the
 * device-address generation id with random values so cached entries
 * from before a flush/renumber never match by accident.
 */
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}
3419
3ee94372
NH
/* Pernet hooks: only init is needed, genids need no teardown. */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
3423
3424
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu route-classifier accounting table; allocated in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4
LT
3428
3429static __initdata unsigned long rhash_entries;
3430static int __init set_rhash_entries(char *str)
3431{
3432 if (!str)
3433 return 0;
3434 rhash_entries = simple_strtoul(str, &str, 0);
3435 return 1;
3436}
3437__setup("rhash_entries=", set_rhash_entries);
3438
/*
 * Boot-time initialization of the IPv4 routing subsystem: dst caches,
 * the route cache hash table, /proc and sysctl nodes, the periodic GC
 * worker and the RTM_GETROUTE netlink handler.  Unrecoverable
 * allocation failures panic; always returns 0 otherwise.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole dsts are ordinary rtables from the same slab. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/* Size the cache hash from rhash_entries= or total memory;
	 * rt_hash_mask/rt_hash_log come back from the allocator. */
	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	/* Derive GC threshold and hard cap from the table size. */
	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* Periodic cache expiry worker, first run at a random offset to
	 * avoid synchronized wakeups. */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}
3499
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
/* Register the static net/ipv4 sysctl skeleton early in boot. */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif