]> git.proxmox.com Git - mirror_ubuntu-zesty-kernel.git/blame - net/ipv4/route.c
ipv6: Use icmpv6_notify() to propagate redirect, instead of rt6_redirect().
[mirror_ubuntu-zesty-kernel.git] / net / ipv4 / route.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
e905a9ed 21 * Alan Cox : Super /proc >4K
1da177e4
LT
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
e905a9ed 39 *
1da177e4
LT
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
afd46503
JP
65#define pr_fmt(fmt) "IPv4: " fmt
66
1da177e4
LT
67#include <linux/module.h>
68#include <asm/uaccess.h>
1da177e4
LT
69#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
1da177e4 72#include <linux/mm.h>
424c4b70 73#include <linux/bootmem.h>
1da177e4
LT
74#include <linux/string.h>
75#include <linux/socket.h>
76#include <linux/sockios.h>
77#include <linux/errno.h>
78#include <linux/in.h>
79#include <linux/inet.h>
80#include <linux/netdevice.h>
81#include <linux/proc_fs.h>
82#include <linux/init.h>
39c90ece 83#include <linux/workqueue.h>
1da177e4 84#include <linux/skbuff.h>
1da177e4
LT
85#include <linux/inetdevice.h>
86#include <linux/igmp.h>
87#include <linux/pkt_sched.h>
88#include <linux/mroute.h>
89#include <linux/netfilter_ipv4.h>
90#include <linux/random.h>
91#include <linux/jhash.h>
92#include <linux/rcupdate.h>
93#include <linux/times.h>
5a0e3ad6 94#include <linux/slab.h>
b9eda06f 95#include <linux/prefetch.h>
352e512c 96#include <net/dst.h>
457c4cbc 97#include <net/net_namespace.h>
1da177e4
LT
98#include <net/protocol.h>
99#include <net/ip.h>
100#include <net/route.h>
101#include <net/inetpeer.h>
102#include <net/sock.h>
103#include <net/ip_fib.h>
104#include <net/arp.h>
105#include <net/tcp.h>
106#include <net/icmp.h>
107#include <net/xfrm.h>
8d71740c 108#include <net/netevent.h>
63f3444f 109#include <net/rtnetlink.h>
1da177e4
LT
110#ifdef CONFIG_SYSCTL
111#include <linux/sysctl.h>
7426a564 112#include <linux/kmemleak.h>
1da177e4 113#endif
6e5714ea 114#include <net/secure_seq.h>
1da177e4 115
68a5e3dd 116#define RT_FL_TOS(oldflp4) \
f61759e6 117 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
1da177e4
LT
118
119#define IP_MAX_MTU 0xFFF0
120
121#define RT_GC_TIMEOUT (300*HZ)
122
1da177e4 123static int ip_rt_max_size;
817bc4db 124static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
9f28a2fc 125static int ip_rt_gc_interval __read_mostly = 60 * HZ;
817bc4db
SH
126static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
127static int ip_rt_redirect_number __read_mostly = 9;
128static int ip_rt_redirect_load __read_mostly = HZ / 50;
129static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
130static int ip_rt_error_cost __read_mostly = HZ;
131static int ip_rt_error_burst __read_mostly = 5 * HZ;
132static int ip_rt_gc_elasticity __read_mostly = 8;
133static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
134static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
135static int ip_rt_min_advmss __read_mostly = 256;
1080d709 136static int rt_chain_length_max __read_mostly = 20;
1da177e4 137
9f28a2fc
ED
138static struct delayed_work expires_work;
139static unsigned long expires_ljiffies;
140
1da177e4
LT
141/*
142 * Interface to generic destination cache.
143 */
144
145static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
0dbaee3b 146static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
ebb762f2 147static unsigned int ipv4_mtu(const struct dst_entry *dst);
1da177e4 148static void ipv4_dst_destroy(struct dst_entry *dst);
1da177e4
LT
149static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150static void ipv4_link_failure(struct sk_buff *skb);
151static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
e47a185b 152static void ip_do_redirect(struct dst_entry *dst, struct sk_buff *skb);
569d3645 153static int rt_garbage_collect(struct dst_ops *ops);
1da177e4 154
72cdd1d9
ED
/* Device-down callback for IPv4 dst entries: nothing route-specific to
 * tear down here; generic dst teardown handles the rest.
 */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
1da177e4 159
62fa8a84
DM
160static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
161{
31248731
DM
162 WARN_ON(1);
163 return NULL;
62fa8a84
DM
164}
165
f894cbf8
DM
166static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
167 struct sk_buff *skb,
168 const void *daddr);
d3aaeb38 169
1da177e4
LT
170static struct dst_ops ipv4_dst_ops = {
171 .family = AF_INET,
09640e63 172 .protocol = cpu_to_be16(ETH_P_IP),
1da177e4
LT
173 .gc = rt_garbage_collect,
174 .check = ipv4_dst_check,
0dbaee3b 175 .default_advmss = ipv4_default_advmss,
ebb762f2 176 .mtu = ipv4_mtu,
62fa8a84 177 .cow_metrics = ipv4_cow_metrics,
1da177e4
LT
178 .destroy = ipv4_dst_destroy,
179 .ifdown = ipv4_dst_ifdown,
180 .negative_advice = ipv4_negative_advice,
181 .link_failure = ipv4_link_failure,
182 .update_pmtu = ip_rt_update_pmtu,
e47a185b 183 .redirect = ip_do_redirect,
1ac06e03 184 .local_out = __ip_local_out,
d3aaeb38 185 .neigh_lookup = ipv4_neigh_lookup,
1da177e4
LT
186};
187
188#define ECN_OR_COST(class) TC_PRIO_##class
189
4839c52b 190const __u8 ip_tos2prio[16] = {
1da177e4 191 TC_PRIO_BESTEFFORT,
4a2b9c37 192 ECN_OR_COST(BESTEFFORT),
1da177e4
LT
193 TC_PRIO_BESTEFFORT,
194 ECN_OR_COST(BESTEFFORT),
195 TC_PRIO_BULK,
196 ECN_OR_COST(BULK),
197 TC_PRIO_BULK,
198 ECN_OR_COST(BULK),
199 TC_PRIO_INTERACTIVE,
200 ECN_OR_COST(INTERACTIVE),
201 TC_PRIO_INTERACTIVE,
202 ECN_OR_COST(INTERACTIVE),
203 TC_PRIO_INTERACTIVE_BULK,
204 ECN_OR_COST(INTERACTIVE_BULK),
205 TC_PRIO_INTERACTIVE_BULK,
206 ECN_OR_COST(INTERACTIVE_BULK)
207};
d4a96865 208EXPORT_SYMBOL(ip_tos2prio);
1da177e4
LT
209
210/*
211 * Route cache.
212 */
213
214/* The locking scheme is rather straight forward:
215 *
216 * 1) Read-Copy Update protects the buckets of the central route hash.
217 * 2) Only writers remove entries, and they hold the lock
218 * as they look at rtable reference counts.
219 * 3) Only readers acquire references to rtable entries,
220 * they do so with atomic increments and with the
221 * lock held.
222 */
223
224struct rt_hash_bucket {
1c31720a 225 struct rtable __rcu *chain;
22c047cc 226};
1080d709 227
8a25d5de
IM
228#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
229 defined(CONFIG_PROVE_LOCKING)
22c047cc
ED
230/*
231 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
232 * The size of this table is a power of two and depends on the number of CPUS.
62051200 233 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
22c047cc 234 */
62051200
IM
235#ifdef CONFIG_LOCKDEP
236# define RT_HASH_LOCK_SZ 256
22c047cc 237#else
62051200
IM
238# if NR_CPUS >= 32
239# define RT_HASH_LOCK_SZ 4096
240# elif NR_CPUS >= 16
241# define RT_HASH_LOCK_SZ 2048
242# elif NR_CPUS >= 8
243# define RT_HASH_LOCK_SZ 1024
244# elif NR_CPUS >= 4
245# define RT_HASH_LOCK_SZ 512
246# else
247# define RT_HASH_LOCK_SZ 256
248# endif
22c047cc
ED
249#endif
250
251static spinlock_t *rt_hash_locks;
252# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
1ff1cc20
PE
253
254static __init void rt_hash_lock_init(void)
255{
256 int i;
257
258 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
259 GFP_KERNEL);
260 if (!rt_hash_locks)
261 panic("IP: failed to allocate rt_hash_locks\n");
262
263 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
264 spin_lock_init(&rt_hash_locks[i]);
265}
22c047cc
ED
266#else
267# define rt_hash_lock_addr(slot) NULL
1ff1cc20
PE
268
269static inline void rt_hash_lock_init(void)
270{
271}
22c047cc 272#endif
1da177e4 273
817bc4db 274static struct rt_hash_bucket *rt_hash_table __read_mostly;
95c96174 275static unsigned int rt_hash_mask __read_mostly;
817bc4db 276static unsigned int rt_hash_log __read_mostly;
1da177e4 277
2f970d83 278static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
27f39c73 279#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
1da177e4 280
b00180de 281static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
0eae88f3 282 int genid)
1da177e4 283{
0eae88f3 284 return jhash_3words((__force u32)daddr, (__force u32)saddr,
b00180de 285 idx, genid)
29e75252 286 & rt_hash_mask;
1da177e4
LT
287}
288
e84f84f2
DL
289static inline int rt_genid(struct net *net)
290{
291 return atomic_read(&net->ipv4.rt_genid);
292}
293
1da177e4
LT
294#ifdef CONFIG_PROC_FS
295struct rt_cache_iter_state {
a75e936f 296 struct seq_net_private p;
1da177e4 297 int bucket;
29e75252 298 int genid;
1da177e4
LT
299};
300
1218854a 301static struct rtable *rt_cache_get_first(struct seq_file *seq)
1da177e4 302{
1218854a 303 struct rt_cache_iter_state *st = seq->private;
1da177e4 304 struct rtable *r = NULL;
1da177e4
LT
305
306 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
33d480ce 307 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
a6272665 308 continue;
1da177e4 309 rcu_read_lock_bh();
a898def2 310 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
29e75252 311 while (r) {
d8d1f30b 312 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
a75e936f 313 r->rt_genid == st->genid)
29e75252 314 return r;
d8d1f30b 315 r = rcu_dereference_bh(r->dst.rt_next);
29e75252 316 }
1da177e4
LT
317 rcu_read_unlock_bh();
318 }
29e75252 319 return r;
1da177e4
LT
320}
321
1218854a 322static struct rtable *__rt_cache_get_next(struct seq_file *seq,
642d6318 323 struct rtable *r)
1da177e4 324{
1218854a 325 struct rt_cache_iter_state *st = seq->private;
a6272665 326
1c31720a 327 r = rcu_dereference_bh(r->dst.rt_next);
1da177e4
LT
328 while (!r) {
329 rcu_read_unlock_bh();
a6272665
ED
330 do {
331 if (--st->bucket < 0)
332 return NULL;
33d480ce 333 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
1da177e4 334 rcu_read_lock_bh();
1c31720a 335 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
1da177e4 336 }
1c31720a 337 return r;
1da177e4
LT
338}
339
1218854a 340static struct rtable *rt_cache_get_next(struct seq_file *seq,
642d6318
DL
341 struct rtable *r)
342{
1218854a
YH
343 struct rt_cache_iter_state *st = seq->private;
344 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
d8d1f30b 345 if (dev_net(r->dst.dev) != seq_file_net(seq))
a75e936f 346 continue;
642d6318
DL
347 if (r->rt_genid == st->genid)
348 break;
349 }
350 return r;
351}
352
1218854a 353static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
1da177e4 354{
1218854a 355 struct rtable *r = rt_cache_get_first(seq);
1da177e4
LT
356
357 if (r)
1218854a 358 while (pos && (r = rt_cache_get_next(seq, r)))
1da177e4
LT
359 --pos;
360 return pos ? NULL : r;
361}
362
363static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
364{
29e75252 365 struct rt_cache_iter_state *st = seq->private;
29e75252 366 if (*pos)
1218854a 367 return rt_cache_get_idx(seq, *pos - 1);
e84f84f2 368 st->genid = rt_genid(seq_file_net(seq));
29e75252 369 return SEQ_START_TOKEN;
1da177e4
LT
370}
371
372static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
373{
29e75252 374 struct rtable *r;
1da177e4
LT
375
376 if (v == SEQ_START_TOKEN)
1218854a 377 r = rt_cache_get_first(seq);
1da177e4 378 else
1218854a 379 r = rt_cache_get_next(seq, v);
1da177e4
LT
380 ++*pos;
381 return r;
382}
383
384static void rt_cache_seq_stop(struct seq_file *seq, void *v)
385{
386 if (v && v != SEQ_START_TOKEN)
387 rcu_read_unlock_bh();
388}
389
390static int rt_cache_seq_show(struct seq_file *seq, void *v)
391{
392 if (v == SEQ_START_TOKEN)
393 seq_printf(seq, "%-127s\n",
394 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
395 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
396 "HHUptod\tSpecDst");
397 else {
398 struct rtable *r = v;
3c521f2b 399 int len;
218fa90f 400
0eae88f3 401 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
794785bf
DM
402 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
403 r->dst.dev ? r->dst.dev->name : "*",
404 (__force u32)r->rt_dst,
405 (__force u32)r->rt_gateway,
406 r->rt_flags, atomic_read(&r->dst.__refcnt),
407 r->dst.__use, 0, (__force u32)r->rt_src,
408 dst_metric_advmss(&r->dst) + 40,
409 dst_metric(&r->dst, RTAX_WINDOW), 0,
410 r->rt_key_tos,
411 -1, 0, 0, &len);
5e659e4c
PE
412
413 seq_printf(seq, "%*s\n", 127 - len, "");
e905a9ed
YH
414 }
415 return 0;
1da177e4
LT
416}
417
f690808e 418static const struct seq_operations rt_cache_seq_ops = {
1da177e4
LT
419 .start = rt_cache_seq_start,
420 .next = rt_cache_seq_next,
421 .stop = rt_cache_seq_stop,
422 .show = rt_cache_seq_show,
423};
424
425static int rt_cache_seq_open(struct inode *inode, struct file *file)
426{
a75e936f 427 return seq_open_net(inode, file, &rt_cache_seq_ops,
cf7732e4 428 sizeof(struct rt_cache_iter_state));
1da177e4
LT
429}
430
9a32144e 431static const struct file_operations rt_cache_seq_fops = {
1da177e4
LT
432 .owner = THIS_MODULE,
433 .open = rt_cache_seq_open,
434 .read = seq_read,
435 .llseek = seq_lseek,
a75e936f 436 .release = seq_release_net,
1da177e4
LT
437};
438
439
440static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
441{
442 int cpu;
443
444 if (*pos == 0)
445 return SEQ_START_TOKEN;
446
0f23174a 447 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
448 if (!cpu_possible(cpu))
449 continue;
450 *pos = cpu+1;
2f970d83 451 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
452 }
453 return NULL;
454}
455
456static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
457{
458 int cpu;
459
0f23174a 460 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
461 if (!cpu_possible(cpu))
462 continue;
463 *pos = cpu+1;
2f970d83 464 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
465 }
466 return NULL;
e905a9ed 467
1da177e4
LT
468}
469
/* No per-walk state to release for the per-CPU statistics iterator. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
474
475static int rt_cpu_seq_show(struct seq_file *seq, void *v)
476{
477 struct rt_cache_stat *st = v;
478
479 if (v == SEQ_START_TOKEN) {
5bec0039 480 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
1da177e4
LT
481 return 0;
482 }
e905a9ed 483
1da177e4
LT
484 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
485 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
fc66f95c 486 dst_entries_get_slow(&ipv4_dst_ops),
1da177e4
LT
487 st->in_hit,
488 st->in_slow_tot,
489 st->in_slow_mc,
490 st->in_no_route,
491 st->in_brd,
492 st->in_martian_dst,
493 st->in_martian_src,
494
495 st->out_hit,
496 st->out_slow_tot,
e905a9ed 497 st->out_slow_mc,
1da177e4
LT
498
499 st->gc_total,
500 st->gc_ignored,
501 st->gc_goal_miss,
502 st->gc_dst_overflow,
503 st->in_hlist_search,
504 st->out_hlist_search
505 );
506 return 0;
507}
508
f690808e 509static const struct seq_operations rt_cpu_seq_ops = {
1da177e4
LT
510 .start = rt_cpu_seq_start,
511 .next = rt_cpu_seq_next,
512 .stop = rt_cpu_seq_stop,
513 .show = rt_cpu_seq_show,
514};
515
516
517static int rt_cpu_seq_open(struct inode *inode, struct file *file)
518{
519 return seq_open(file, &rt_cpu_seq_ops);
520}
521
9a32144e 522static const struct file_operations rt_cpu_seq_fops = {
1da177e4
LT
523 .owner = THIS_MODULE,
524 .open = rt_cpu_seq_open,
525 .read = seq_read,
526 .llseek = seq_lseek,
527 .release = seq_release,
528};
529
c7066f70 530#ifdef CONFIG_IP_ROUTE_CLASSID
a661c419 531static int rt_acct_proc_show(struct seq_file *m, void *v)
78c686e9 532{
a661c419
AD
533 struct ip_rt_acct *dst, *src;
534 unsigned int i, j;
535
536 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
537 if (!dst)
538 return -ENOMEM;
539
540 for_each_possible_cpu(i) {
541 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
542 for (j = 0; j < 256; j++) {
543 dst[j].o_bytes += src[j].o_bytes;
544 dst[j].o_packets += src[j].o_packets;
545 dst[j].i_bytes += src[j].i_bytes;
546 dst[j].i_packets += src[j].i_packets;
547 }
78c686e9
PE
548 }
549
a661c419
AD
550 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
551 kfree(dst);
552 return 0;
553}
78c686e9 554
a661c419
AD
555static int rt_acct_proc_open(struct inode *inode, struct file *file)
556{
557 return single_open(file, rt_acct_proc_show, NULL);
78c686e9 558}
a661c419
AD
559
560static const struct file_operations rt_acct_proc_fops = {
561 .owner = THIS_MODULE,
562 .open = rt_acct_proc_open,
563 .read = seq_read,
564 .llseek = seq_lseek,
565 .release = single_release,
566};
78c686e9 567#endif
107f1634 568
73b38711 569static int __net_init ip_rt_do_proc_init(struct net *net)
107f1634
PE
570{
571 struct proc_dir_entry *pde;
572
573 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
574 &rt_cache_seq_fops);
575 if (!pde)
576 goto err1;
577
77020720
WC
578 pde = proc_create("rt_cache", S_IRUGO,
579 net->proc_net_stat, &rt_cpu_seq_fops);
107f1634
PE
580 if (!pde)
581 goto err2;
582
c7066f70 583#ifdef CONFIG_IP_ROUTE_CLASSID
a661c419 584 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
107f1634
PE
585 if (!pde)
586 goto err3;
587#endif
588 return 0;
589
c7066f70 590#ifdef CONFIG_IP_ROUTE_CLASSID
107f1634
PE
591err3:
592 remove_proc_entry("rt_cache", net->proc_net_stat);
593#endif
594err2:
595 remove_proc_entry("rt_cache", net->proc_net);
596err1:
597 return -ENOMEM;
598}
73b38711
DL
599
600static void __net_exit ip_rt_do_proc_exit(struct net *net)
601{
602 remove_proc_entry("rt_cache", net->proc_net_stat);
603 remove_proc_entry("rt_cache", net->proc_net);
c7066f70 604#ifdef CONFIG_IP_ROUTE_CLASSID
73b38711 605 remove_proc_entry("rt_acct", net->proc_net);
0a931acf 606#endif
73b38711
DL
607}
608
609static struct pernet_operations ip_rt_proc_ops __net_initdata = {
610 .init = ip_rt_do_proc_init,
611 .exit = ip_rt_do_proc_exit,
612};
613
614static int __init ip_rt_proc_init(void)
615{
616 return register_pernet_subsys(&ip_rt_proc_ops);
617}
618
107f1634 619#else
73b38711 620static inline int ip_rt_proc_init(void)
107f1634
PE
621{
622 return 0;
623}
1da177e4 624#endif /* CONFIG_PROC_FS */
e905a9ed 625
5969f71d 626static inline void rt_free(struct rtable *rt)
1da177e4 627{
d8d1f30b 628 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
1da177e4
LT
629}
630
5969f71d 631static inline void rt_drop(struct rtable *rt)
1da177e4 632{
1da177e4 633 ip_rt_put(rt);
d8d1f30b 634 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
1da177e4
LT
635}
636
5969f71d 637static inline int rt_fast_clean(struct rtable *rth)
1da177e4
LT
638{
639 /* Kill broadcast/multicast entries very aggresively, if they
640 collide in hash table with more useful entries */
641 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
c7537967 642 rt_is_input_route(rth) && rth->dst.rt_next;
1da177e4
LT
643}
644
5969f71d 645static inline int rt_valuable(struct rtable *rth)
1da177e4
LT
646{
647 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
5943634f 648 rth->dst.expires;
1da177e4
LT
649}
650
651static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
652{
653 unsigned long age;
654 int ret = 0;
655
d8d1f30b 656 if (atomic_read(&rth->dst.__refcnt))
1da177e4
LT
657 goto out;
658
d8d1f30b 659 age = jiffies - rth->dst.lastuse;
1da177e4
LT
660 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
661 (age <= tmo2 && rt_valuable(rth)))
662 goto out;
663 ret = 1;
664out: return ret;
665}
666
667/* Bits of score are:
668 * 31: very valuable
669 * 30: not quite useless
670 * 29..0: usage counter
671 */
672static inline u32 rt_score(struct rtable *rt)
673{
d8d1f30b 674 u32 score = jiffies - rt->dst.lastuse;
1da177e4
LT
675
676 score = ~score & ~(3<<30);
677
678 if (rt_valuable(rt))
679 score |= (1<<31);
680
c7537967 681 if (rt_is_output_route(rt) ||
1da177e4
LT
682 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
683 score |= (1<<30);
684
685 return score;
686}
687
1080d709
NH
688static inline bool rt_caching(const struct net *net)
689{
690 return net->ipv4.current_rt_cache_rebuild_count <=
691 net->ipv4.sysctl_rt_cache_rebuild_count;
692}
693
5e2b61f7
DM
694static inline bool compare_hash_inputs(const struct rtable *rt1,
695 const struct rtable *rt2)
1080d709 696{
5e2b61f7
DM
697 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
698 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
97a80410 699 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
1080d709
NH
700}
701
5e2b61f7 702static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
1da177e4 703{
5e2b61f7
DM
704 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
705 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
706 (rt1->rt_mark ^ rt2->rt_mark) |
475949d8 707 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
d547f727 708 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
97a80410 709 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
1da177e4
LT
710}
711
b5921910
DL
712static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
713{
d8d1f30b 714 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
b5921910
DL
715}
716
e84f84f2
DL
717static inline int rt_is_expired(struct rtable *rth)
718{
d8d1f30b 719 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
e84f84f2
DL
720}
721
beb659bd
ED
722/*
723 * Perform a full scan of hash table and free all entries.
724 * Can be called by a softirq or a process.
725 * In the later case, we want to be reschedule if necessary
726 */
6561a3b1 727static void rt_do_flush(struct net *net, int process_context)
beb659bd
ED
728{
729 unsigned int i;
730 struct rtable *rth, *next;
731
732 for (i = 0; i <= rt_hash_mask; i++) {
6561a3b1
DM
733 struct rtable __rcu **pprev;
734 struct rtable *list;
735
beb659bd
ED
736 if (process_context && need_resched())
737 cond_resched();
33d480ce 738 rth = rcu_access_pointer(rt_hash_table[i].chain);
beb659bd
ED
739 if (!rth)
740 continue;
741
742 spin_lock_bh(rt_hash_lock_addr(i));
32cb5b4e 743
6561a3b1
DM
744 list = NULL;
745 pprev = &rt_hash_table[i].chain;
746 rth = rcu_dereference_protected(*pprev,
1c31720a 747 lockdep_is_held(rt_hash_lock_addr(i)));
32cb5b4e 748
6561a3b1
DM
749 while (rth) {
750 next = rcu_dereference_protected(rth->dst.rt_next,
1c31720a 751 lockdep_is_held(rt_hash_lock_addr(i)));
6561a3b1
DM
752
753 if (!net ||
754 net_eq(dev_net(rth->dst.dev), net)) {
755 rcu_assign_pointer(*pprev, next);
756 rcu_assign_pointer(rth->dst.rt_next, list);
757 list = rth;
32cb5b4e 758 } else {
6561a3b1 759 pprev = &rth->dst.rt_next;
32cb5b4e 760 }
6561a3b1 761 rth = next;
32cb5b4e 762 }
6561a3b1 763
beb659bd
ED
764 spin_unlock_bh(rt_hash_lock_addr(i));
765
6561a3b1
DM
766 for (; list; list = next) {
767 next = rcu_dereference_protected(list->dst.rt_next, 1);
768 rt_free(list);
beb659bd
ED
769 }
770 }
771}
772
1080d709
NH
773/*
774 * While freeing expired entries, we compute average chain length
775 * and standard deviation, using fixed-point arithmetic.
776 * This to have an estimation of rt_chain_length_max
777 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
778 * We use 3 bits for frational part, and 29 (or 61) for magnitude.
779 */
780
781#define FRACT_BITS 3
782#define ONE (1UL << FRACT_BITS)
783
98376387
ED
784/*
785 * Given a hash chain and an item in this hash chain,
786 * find if a previous entry has the same hash_inputs
787 * (but differs on tos, mark or oif)
788 * Returns 0 if an alias is found.
789 * Returns ONE if rth has no alias before itself.
790 */
791static int has_noalias(const struct rtable *head, const struct rtable *rth)
792{
793 const struct rtable *aux = head;
794
795 while (aux != rth) {
5e2b61f7 796 if (compare_hash_inputs(aux, rth))
98376387 797 return 0;
1c31720a 798 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
98376387
ED
799 }
800 return ONE;
801}
802
9f28a2fc
ED
803static void rt_check_expire(void)
804{
805 static unsigned int rover;
806 unsigned int i = rover, goal;
807 struct rtable *rth;
808 struct rtable __rcu **rthp;
809 unsigned long samples = 0;
810 unsigned long sum = 0, sum2 = 0;
811 unsigned long delta;
812 u64 mult;
813
814 delta = jiffies - expires_ljiffies;
815 expires_ljiffies = jiffies;
816 mult = ((u64)delta) << rt_hash_log;
817 if (ip_rt_gc_timeout > 1)
818 do_div(mult, ip_rt_gc_timeout);
819 goal = (unsigned int)mult;
820 if (goal > rt_hash_mask)
821 goal = rt_hash_mask + 1;
822 for (; goal > 0; goal--) {
823 unsigned long tmo = ip_rt_gc_timeout;
824 unsigned long length;
825
826 i = (i + 1) & rt_hash_mask;
827 rthp = &rt_hash_table[i].chain;
828
829 if (need_resched())
830 cond_resched();
831
832 samples++;
833
834 if (rcu_dereference_raw(*rthp) == NULL)
835 continue;
836 length = 0;
837 spin_lock_bh(rt_hash_lock_addr(i));
838 while ((rth = rcu_dereference_protected(*rthp,
839 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
840 prefetch(rth->dst.rt_next);
df67e6c9
DM
841 if (rt_is_expired(rth) ||
842 rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
9f28a2fc
ED
843 *rthp = rth->dst.rt_next;
844 rt_free(rth);
845 continue;
846 }
df67e6c9
DM
847
848 /* We only count entries on a chain with equal
849 * hash inputs once so that entries for
850 * different QOS levels, and other non-hash
851 * input attributes don't unfairly skew the
852 * length computation
853 */
854 tmo >>= 1;
855 rthp = &rth->dst.rt_next;
856 length += has_noalias(rt_hash_table[i].chain, rth);
9f28a2fc
ED
857 }
858 spin_unlock_bh(rt_hash_lock_addr(i));
859 sum += length;
860 sum2 += length*length;
861 }
862 if (samples) {
863 unsigned long avg = sum / samples;
864 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
865 rt_chain_length_max = max_t(unsigned long,
866 ip_rt_gc_elasticity,
867 (avg + 4*sd) >> FRACT_BITS);
868 }
869 rover = i;
870}
871
872/*
873 * rt_worker_func() is run in process context.
874 * we call rt_check_expire() to scan part of the hash table
875 */
876static void rt_worker_func(struct work_struct *work)
877{
878 rt_check_expire();
879 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
880}
881
29e75252 882/*
25985edc 883 * Perturbation of rt_genid by a small quantity [1..256]
29e75252
ED
884 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
885 * many times (2^24) without giving recent rt_genid.
886 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
1da177e4 887 */
86c657f6 888static void rt_cache_invalidate(struct net *net)
1da177e4 889{
29e75252 890 unsigned char shuffle;
1da177e4 891
29e75252 892 get_random_bytes(&shuffle, sizeof(shuffle));
e84f84f2 893 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
1da177e4
LT
894}
895
29e75252
ED
/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

/* Chain-length limit exceeded: warn and invalidate the whole cache. */
static void rt_emergency_hash_rebuild(struct net *net)
{
	net_warn_ratelimited("Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
918
1da177e4
LT
919/*
920 Short description of GC goals.
921
922 We want to build algorithm, which will keep routing cache
923 at some equilibrium point, when number of aged off entries
924 is kept approximately equal to newly generated ones.
925
926 Current expiration strength is variable "expire".
927 We try to adjust it dynamically, so that if networking
928 is idle expires is large enough to keep enough of warm entries,
929 and when load increases it reduces to limit cache size.
930 */
931
/*
 * Shrink the routing cache when dst_ops says we are over budget.
 *
 * Keeps persistent state across invocations in function-static
 * variables: "expire" (current aggressiveness of expiry), "last_gc"
 * (jiffies of last run), "rover" (resume bucket so successive calls
 * scan different parts of the table) and "equilibrium" (target cache
 * size we try to converge to).
 *
 * Returns 0 on success (or when GC was skipped/ignored) and 1 only
 * when the cache could not be brought under ip_rt_max_size
 * ("dst cache overflow").
 */
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	/* Rate-limit: skip entirely if we ran recently and are not
	 * at the hard size limit.
	 */
	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Use the precise (slow) counter from here on. */
	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		/* Nothing to free; fold the (negative) surplus back into
		 * the equilibrium target.
		 */
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		/* Sweep every bucket once, starting after where the
		 * previous GC pass stopped ("rover").
		 */
		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					/* Entry survives; halve the timeout so
					 * entries deeper in the chain are easier
					 * to evict.
					 */
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - if expire reduced to zero. Otherwise, expire is halfed.
		   - if table is not full.
		   - if we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		     We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	/* Check the fast counter first, then confirm with the slow one
	 * before declaring overflow.
	 */
	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	net_warn_ratelimited("dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	/* Relax expiry for the next run; reset it fully when we are
	 * comfortably under the GC threshold.
	 */
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}
1050
98376387
ED
1051/*
1052 * Returns number of entries in a hash chain that have different hash_inputs
1053 */
1054static int slow_chain_length(const struct rtable *head)
1055{
1056 int length = 0;
1057 const struct rtable *rth = head;
1058
1059 while (rth) {
1060 length += has_noalias(head, rth);
1c31720a 1061 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
98376387
ED
1062 }
1063 return length >> FRACT_BITS;
1064}
1065
f894cbf8
DM
1066static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
1067 struct sk_buff *skb,
1068 const void *daddr)
3769cffb 1069{
d3aaeb38
DM
1070 struct net_device *dev = dst->dev;
1071 const __be32 *pkey = daddr;
39232973 1072 const struct rtable *rt;
3769cffb
DM
1073 struct neighbour *n;
1074
39232973 1075 rt = (const struct rtable *) dst;
a263b309 1076 if (rt->rt_gateway)
39232973 1077 pkey = (const __be32 *) &rt->rt_gateway;
f894cbf8
DM
1078 else if (skb)
1079 pkey = &ip_hdr(skb)->daddr;
d3aaeb38 1080
80703d26 1081 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
d3aaeb38
DM
1082 if (n)
1083 return n;
32092ecf 1084 return neigh_create(&arp_tbl, pkey, dev);
d3aaeb38
DM
1085}
1086
/*
 * Insert "rt" into the routing cache bucket "hash", or return an
 * existing equivalent entry if one is already cached.
 *
 * On success the returned rtable (either "rt" itself or the cached
 * duplicate) is also attached to "skb" when skb is non-NULL. The
 * caller's reference to "rt" is consumed either way (rt_drop() on
 * duplicate, insertion otherwise).
 */
static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable *rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long now;
	u32 min_score;
	int chain_length;

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;	/* best eviction candidate (lowest score, refcnt 0) */
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route. The
		 * caller hold the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching. Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			/* Stale generation: unlink and free in passing. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			/* Duplicate found: drop the new entry, hand back
			 * the cached one.
			 */
			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		/* No evictable entry and the chain is suspiciously long:
		 * if it is genuinely long (not just aliases of one flow),
		 * trigger an emergency rehash and retry with the new
		 * generation's hash.
		 */
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
1219
1da177e4
LT
1220/*
1221 * Peer allocation may fail only in serious out-of-memory conditions. However
1222 * we still can generate some output.
1223 * Random ID selection looks a bit dangerous because we have no chances to
1224 * select ID being unique in a reasonable period of time.
1225 * But broken packet identifier may be better than no packet at all.
1226 */
1227static void ip_select_fb_ident(struct iphdr *iph)
1228{
1229 static DEFINE_SPINLOCK(ip_fb_id_lock);
1230 static u32 ip_fallback_id;
1231 u32 salt;
1232
1233 spin_lock_bh(&ip_fb_id_lock);
e448515c 1234 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1da177e4
LT
1235 iph->id = htons(salt & 0xFFFF);
1236 ip_fallback_id = salt;
1237 spin_unlock_bh(&ip_fb_id_lock);
1238}
1239
1240void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1241{
1d861aa4
DM
1242 struct net *net = dev_net(dst->dev);
1243 struct inet_peer *peer;
1da177e4 1244
1d861aa4
DM
1245 peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
1246 if (peer) {
1247 iph->id = htons(inet_getid(peer, more));
1248 inet_putpeer(peer);
1249 return;
1250 }
1da177e4
LT
1251
1252 ip_select_fb_ident(iph);
1253}
4bc2f18b 1254EXPORT_SYMBOL(__ip_select_ident);
1da177e4 1255
/*
 * Remove "rt" from hash bucket "hash", dropping the caller's reference.
 * Expired entries encountered during the walk are reaped in passing.
 */
static void rt_del(unsigned int hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	/* Drop the caller's reference while holding the chain lock. */
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
1275
/*
 * Handle an incoming ICMP redirect for "dst": validate the advertised
 * new gateway and, if acceptable and resolvable, rewrite the route's
 * gateway in place.
 *
 * skb->data points at the embedded (offending) IP header; the outer
 * ip_hdr(skb) is the redirect message itself, so its source is the
 * gateway that sent the redirect.
 */
static void ip_do_redirect(struct dst_entry *dst, struct sk_buff *skb)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	__be32 daddr = iph->daddr;
	__be32 saddr = iph->saddr;
	struct in_device *in_dev;
	struct neighbour *n;
	struct rtable *rt;
	struct net *net;

	/* Only the four standard redirect codes are meaningful. */
	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	/* The redirect must come from our current gateway for this route. */
	rt = (struct rtable *) dst;
	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	/* Reject trivially bogus gateways: unchanged, redirects disabled,
	 * multicast/broadcast/zero addresses.
	 */
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		/* Non-shared media: new gateway must be on-link with the
		 * old one, and (with secure redirects) be a known default
		 * gateway.
		 */
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	/* Only commit the new gateway once its neighbour entry is valid;
	 * otherwise kick off resolution and wait for a later redirect.
	 */
	n = ipv4_neigh_lookup(dst, NULL, &new_gw);
	if (n) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			rt->rt_gateway = new_gw;
			rt->rt_flags |= RTCF_REDIRECTED;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     " Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
#endif
	/* empty statement: a label must precede a statement */
	;
}
1347
1da177e4
LT
1348static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1349{
ee6b9673 1350 struct rtable *rt = (struct rtable *)dst;
1da177e4
LT
1351 struct dst_entry *ret = dst;
1352
1353 if (rt) {
d11a4dc1 1354 if (dst->obsolete > 0) {
1da177e4
LT
1355 ip_rt_put(rt);
1356 ret = NULL;
5943634f
DM
1357 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1358 rt->dst.expires) {
95c96174 1359 unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
5e2b61f7 1360 rt->rt_oif,
e84f84f2 1361 rt_genid(dev_net(dst->dev)));
1da177e4
LT
1362 rt_del(hash, rt);
1363 ret = NULL;
1364 }
1365 }
1366 return ret;
1367}
1368
1369/*
1370 * Algorithm:
1371 * 1. The first ip_rt_redirect_number redirects are sent
1372 * with exponential backoff, then we stop sending them at all,
1373 * assuming that the host ignores our redirects.
1374 * 2. If we did not see packets requiring redirects
1375 * during ip_rt_redirect_silence, we assume that the host
1376 * forgot redirected route and start to send redirects again.
1377 *
1378 * This algorithm is much cheaper and more intelligent than dumb load limiting
1379 * in icmp.c.
1380 *
1381 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1382 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1383 */
1384
/*
 * Decide whether to emit an ICMP redirect for "skb" and send it,
 * applying per-peer exponential backoff (see the algorithm comment
 * above): after ip_rt_redirect_number ignored redirects we go silent,
 * and the counter resets after ip_rt_redirect_silence of quiet.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	/* Sample the sysctl under RCU, then drop the read lock before
	 * the peer lookup.
	 */
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
	if (!peer) {
		/* No peer state available: send unconditionally. */
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, rt->rt_iif,
					     &rt->rt_dst, &rt->rt_gateway);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}
1444
1445static int ip_error(struct sk_buff *skb)
1446{
251da413 1447 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
511c3f92 1448 struct rtable *rt = skb_rtable(skb);
92d86829 1449 struct inet_peer *peer;
1da177e4 1450 unsigned long now;
251da413 1451 struct net *net;
92d86829 1452 bool send;
1da177e4
LT
1453 int code;
1454
251da413
DM
1455 net = dev_net(rt->dst.dev);
1456 if (!IN_DEV_FORWARD(in_dev)) {
1457 switch (rt->dst.error) {
1458 case EHOSTUNREACH:
1459 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
1460 break;
1461
1462 case ENETUNREACH:
1463 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1464 break;
1465 }
1466 goto out;
1467 }
1468
d8d1f30b 1469 switch (rt->dst.error) {
4500ebf8
JP
1470 case EINVAL:
1471 default:
1472 goto out;
1473 case EHOSTUNREACH:
1474 code = ICMP_HOST_UNREACH;
1475 break;
1476 case ENETUNREACH:
1477 code = ICMP_NET_UNREACH;
251da413 1478 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
4500ebf8
JP
1479 break;
1480 case EACCES:
1481 code = ICMP_PKT_FILTERED;
1482 break;
1da177e4
LT
1483 }
1484
1d861aa4 1485 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
92d86829
DM
1486
1487 send = true;
1488 if (peer) {
1489 now = jiffies;
1490 peer->rate_tokens += now - peer->rate_last;
1491 if (peer->rate_tokens > ip_rt_error_burst)
1492 peer->rate_tokens = ip_rt_error_burst;
1493 peer->rate_last = now;
1494 if (peer->rate_tokens >= ip_rt_error_cost)
1495 peer->rate_tokens -= ip_rt_error_cost;
1496 else
1497 send = false;
1d861aa4 1498 inet_putpeer(peer);
1da177e4 1499 }
92d86829
DM
1500 if (send)
1501 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1da177e4
LT
1502
1503out: kfree_skb(skb);
1504 return 0;
e905a9ed 1505}
1da177e4 1506
1da177e4
LT
1507static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1508{
2c8cec5c 1509 struct rtable *rt = (struct rtable *) dst;
2c8cec5c
DM
1510
1511 dst_confirm(dst);
1512
5943634f
DM
1513 if (mtu < ip_rt_min_pmtu)
1514 mtu = ip_rt_min_pmtu;
2c8cec5c 1515
5943634f
DM
1516 rt->rt_pmtu = mtu;
1517 dst_set_expires(&rt->dst, ip_rt_mtu_expires);
1da177e4
LT
1518}
1519
36393395
DM
1520void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1521 int oif, u32 mark, u8 protocol, int flow_flags)
1522{
1523 const struct iphdr *iph = (const struct iphdr *)skb->data;
1524 struct flowi4 fl4;
1525 struct rtable *rt;
1526
1527 flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
3e12939a 1528 protocol, flow_flags,
36393395
DM
1529 iph->daddr, iph->saddr, 0, 0);
1530 rt = __ip_route_output_key(net, &fl4);
1531 if (!IS_ERR(rt)) {
1532 ip_rt_update_pmtu(&rt->dst, mtu);
1533 ip_rt_put(rt);
1534 }
1535}
1536EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1537
1538void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1539{
1540 const struct inet_sock *inet = inet_sk(sk);
1541
1542 return ipv4_update_pmtu(skb, sock_net(sk), mtu,
1543 sk->sk_bound_dev_if, sk->sk_mark,
1544 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1545 inet_sk_flowi_flags(sk));
1546}
1547EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
f39925db 1548
b42597e2
DM
1549void ipv4_redirect(struct sk_buff *skb, struct net *net,
1550 int oif, u32 mark, u8 protocol, int flow_flags)
1551{
1552 const struct iphdr *iph = (const struct iphdr *)skb->data;
1553 struct flowi4 fl4;
1554 struct rtable *rt;
1555
1556 flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
1557 protocol, flow_flags, iph->daddr, iph->saddr, 0, 0);
1558 rt = __ip_route_output_key(net, &fl4);
1559 if (!IS_ERR(rt)) {
1560 ip_do_redirect(&rt->dst, skb);
1561 ip_rt_put(rt);
1562 }
1563}
1564EXPORT_SYMBOL_GPL(ipv4_redirect);
1565
1566void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1567{
1568 const struct inet_sock *inet = inet_sk(sk);
1569
1570 return ipv4_redirect(skb, sock_net(sk), sk->sk_bound_dev_if,
1571 sk->sk_mark,
1572 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1573 inet_sk_flowi_flags(sk));
1574}
1575EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1576
efbc368d
DM
1577static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1578{
1579 struct rtable *rt = (struct rtable *) dst;
1580
1581 if (rt_is_expired(rt))
1582 return NULL;
d11a4dc1 1583 return dst;
1da177e4
LT
1584}
1585
1586static void ipv4_dst_destroy(struct dst_entry *dst)
1587{
1588 struct rtable *rt = (struct rtable *) dst;
1da177e4 1589
62fa8a84
DM
1590 if (rt->fi) {
1591 fib_info_put(rt->fi);
1592 rt->fi = NULL;
1593 }
1da177e4
LT
1594}
1595
1da177e4
LT
1596
/*
 * dst_ops.link_failure: tell the sender the host is unreachable and
 * make the attached route expire immediately.
 */
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		/* expires = 0 here means "expire now". */
		dst_set_expires(&rt->dst, 0);
}
1607
1608static int ip_rt_bug(struct sk_buff *skb)
1609{
91df42be
JP
1610 pr_debug("%s: %pI4 -> %pI4, %s\n",
1611 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1612 skb->dev ? skb->dev->name : "?");
1da177e4 1613 kfree_skb(skb);
c378a9c0 1614 WARN_ON(1);
1da177e4
LT
1615 return 0;
1616}
1617
1618/*
1619 We do not cache source address of outgoing interface,
1620 because it is used only by IP RR, TS and SRR options,
1621 so that it out of fast path.
1622
1623 BTW remember: "addr" is allowed to be not aligned
1624 in IP options!
1625 */
1626
/*
 * Copy the preferred source address for "rt" into the (possibly
 * unaligned) 4-byte buffer "addr" — used for IP RR/TS/SRR options
 * (see the comment above). For input routes this requires a reverse
 * FIB lookup, falling back to an address scoped to the gateway.
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		/* Rebuild the flow key from the received packet. */
		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	/* memcpy: "addr" may be unaligned inside IP options. */
	memcpy(addr, &src, 4);
}
1658
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Merge "tag" into the route's tclassid, filling only the halves
 * (realm fields) that are still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 tclassid = rt->dst.tclassid;

	if (!(tclassid & 0xFFFF))
		tclassid |= tag & 0xFFFF;
	if (!(tclassid & 0xFFFF0000))
		tclassid |= tag & 0xFFFF0000;

	rt->dst.tclassid = tclassid;
}
#endif
1668
0dbaee3b
DM
1669static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1670{
1671 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1672
1673 if (advmss == 0) {
1674 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1675 ip_rt_min_advmss);
1676 if (advmss > 65535 - 40)
1677 advmss = 65535 - 40;
1678 }
1679 return advmss;
1680}
1681
/*
 * dst_ops.mtu: effective MTU for this route, in precedence order:
 * unexpired learned path MTU, then the RTAX_MTU metric (honored as-is
 * only for output routes), then the device MTU with clamps.
 */
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	/* A learned path MTU only counts while unexpired. */
	if (mtu && time_after_eq(jiffies, rt->dst.expires))
		mtu = 0;

	if (!mtu)
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu && rt_is_output_route(rt))
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		/* Locked MTU through a gateway: be conservative and use
		 * the classic 576-byte minimum.
		 */
		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}
1709
813b3b5d 1710static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
5e2b61f7 1711 struct fib_info *fi)
a4daad6b 1712{
f185071d
DM
1713 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1714 rt->fi = fi;
1715 atomic_inc(&fi->fib_clntref);
a4daad6b 1716 }
f185071d 1717 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
a4daad6b
DM
1718}
1719
/*
 * Finalize a freshly built route from the FIB lookup result: pick up
 * the nexthop gateway (for link-scope nexthops), attach metrics, and
 * set routing-classifier tags where configured. "fi" may be NULL for
 * routes with no FIB info.
 */
static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	/* Policy-routing rule realm first, then the validated source tag. */
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}
1741
5c1e6aa3
DM
1742static struct rtable *rt_dst_alloc(struct net_device *dev,
1743 bool nopolicy, bool noxfrm)
0c4dcd58 1744{
5c1e6aa3
DM
1745 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1746 DST_HOST |
1747 (nopolicy ? DST_NOPOLICY : 0) |
1748 (noxfrm ? DST_NOXFRM : 0));
0c4dcd58
DM
1749}
1750
/* called in rcu_read_lock() section */
/*
 * Input-route handling for multicast destinations: sanity-check the
 * source, build a multicast rtable (locally delivered when "our" is
 * set, or handed to ipmr when multicast forwarding is enabled) and
 * insert it into the cache.
 *
 * Returns 0 on success or a negative errno.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	/* Source must be unicast and this must really be IP. */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(saddr))
			goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		/* 0.0.0.0 sources are only valid for link-local groups
		 * (e.g. DHCP/IGMP on 224.0.0.0/24).
		 */
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	/* Multicast routes must never be used for output. */
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= daddr;
	rth->fi = NULL;
	if (our) {
		/* We are a member of this group: deliver locally. */
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
1830
1831
1832static void ip_handle_martian_source(struct net_device *dev,
1833 struct in_device *in_dev,
1834 struct sk_buff *skb,
9e12bb22
AV
1835 __be32 daddr,
1836 __be32 saddr)
1da177e4
LT
1837{
1838 RT_CACHE_STAT_INC(in_martian_src);
1839#ifdef CONFIG_IP_ROUTE_VERBOSE
1840 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1841 /*
1842 * RFC1812 recommendation, if source is martian,
1843 * the only hint is MAC header.
1844 */
058bd4d2 1845 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
673d57e7 1846 &daddr, &saddr, dev->name);
98e399f8 1847 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
058bd4d2
JP
1848 print_hex_dump(KERN_WARNING, "ll header: ",
1849 DUMP_PREFIX_OFFSET, 16, 1,
1850 skb_mac_header(skb),
1851 dev->hard_header_len, true);
1da177e4
LT
1852 }
1853 }
1854#endif
1855}
1856
/* called in rcu_read_lock() section */
/*
 * Build a forwarding rtable for an input packet from the FIB result:
 * validate the source against the reverse path, decide on redirect
 * flags, and allocate/populate the route. On success *result holds
 * the new route and 0 is returned; otherwise a negative errno.
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}


	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	/* err > 0 means the source matched via a non-exact path. */
	if (err)
		flags |= RTCF_DIRECTSRC;

	/* Packet leaves on the interface it arrived on: candidate for
	 * an ICMP redirect back to the sender.
	 */
	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif 	= in_dev->dev->ifindex;
	rth->rt_oif 	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= daddr;
	rth->fi = NULL;

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
1da177e4 1944
5969f71d
SH
1945static int ip_mkroute_input(struct sk_buff *skb,
1946 struct fib_result *res,
68a5e3dd 1947 const struct flowi4 *fl4,
5969f71d
SH
1948 struct in_device *in_dev,
1949 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1950{
5e73ea1a 1951 struct rtable *rth = NULL;
1da177e4 1952 int err;
95c96174 1953 unsigned int hash;
1da177e4
LT
1954
1955#ifdef CONFIG_IP_ROUTE_MULTIPATH
ff3fccb3 1956 if (res->fi && res->fi->fib_nhs > 1)
1b7fe593 1957 fib_select_multipath(res);
1da177e4
LT
1958#endif
1959
1960 /* create a routing cache entry */
1961 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1962 if (err)
1963 return err;
1da177e4
LT
1964
1965 /* put it into the cache */
68a5e3dd 1966 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
d8d1f30b 1967 rt_genid(dev_net(rth->dst.dev)));
68a5e3dd 1968 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
b23dd4fe
DM
1969 if (IS_ERR(rth))
1970 return PTR_ERR(rth);
1971 return 0;
1da177e4
LT
1972}
1973
1da177e4
LT
1974/*
1975 * NOTE. We drop all the packets that has local source
1976 * addresses, because every properly looped back packet
1977 * must have correct destination already attached by output routine.
1978 *
1979 * Such approach solves two big problems:
1980 * 1. Not simplex devices are handled properly.
1981 * 2. IP spoofing attempts are filtered with 100% of guarantee.
ebc0ffae 1982 * called with rcu_read_lock()
1da177e4
LT
1983 */
1984
9e12bb22 1985static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
c10237e0 1986 u8 tos, struct net_device *dev)
1da177e4
LT
1987{
1988 struct fib_result res;
96d36220 1989 struct in_device *in_dev = __in_dev_get_rcu(dev);
68a5e3dd 1990 struct flowi4 fl4;
95c96174 1991 unsigned int flags = 0;
1da177e4 1992 u32 itag = 0;
95c96174
ED
1993 struct rtable *rth;
1994 unsigned int hash;
1da177e4 1995 int err = -EINVAL;
5e73ea1a 1996 struct net *net = dev_net(dev);
1da177e4
LT
1997
1998 /* IP on this device is disabled. */
1999
2000 if (!in_dev)
2001 goto out;
2002
2003 /* Check for the most weird martians, which can be not detected
2004 by fib_lookup.
2005 */
2006
d0daebc3 2007 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1da177e4
LT
2008 goto martian_source;
2009
27a954bd 2010 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1da177e4
LT
2011 goto brd_input;
2012
2013 /* Accept zero addresses only to limited broadcast;
2014 * I even do not know to fix it or not. Waiting for complains :-)
2015 */
f97c1e0c 2016 if (ipv4_is_zeronet(saddr))
1da177e4
LT
2017 goto martian_source;
2018
d0daebc3 2019 if (ipv4_is_zeronet(daddr))
1da177e4
LT
2020 goto martian_destination;
2021
d0daebc3
TG
2022 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
2023 if (ipv4_is_loopback(daddr))
2024 goto martian_destination;
2025
2026 if (ipv4_is_loopback(saddr))
2027 goto martian_source;
2028 }
2029
1da177e4
LT
2030 /*
2031 * Now we are ready to route packet.
2032 */
68a5e3dd
DM
2033 fl4.flowi4_oif = 0;
2034 fl4.flowi4_iif = dev->ifindex;
2035 fl4.flowi4_mark = skb->mark;
2036 fl4.flowi4_tos = tos;
2037 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2038 fl4.daddr = daddr;
2039 fl4.saddr = saddr;
2040 err = fib_lookup(net, &fl4, &res);
251da413 2041 if (err != 0)
1da177e4 2042 goto no_route;
1da177e4
LT
2043
2044 RT_CACHE_STAT_INC(in_slow_tot);
2045
2046 if (res.type == RTN_BROADCAST)
2047 goto brd_input;
2048
2049 if (res.type == RTN_LOCAL) {
5c04c819 2050 err = fib_validate_source(skb, saddr, daddr, tos,
ebc0ffae 2051 net->loopback_dev->ifindex,
9e56e380 2052 dev, in_dev, &itag);
b5f7e755
ED
2053 if (err < 0)
2054 goto martian_source_keep_err;
2055 if (err)
1da177e4 2056 flags |= RTCF_DIRECTSRC;
1da177e4
LT
2057 goto local_input;
2058 }
2059
2060 if (!IN_DEV_FORWARD(in_dev))
251da413 2061 goto no_route;
1da177e4
LT
2062 if (res.type != RTN_UNICAST)
2063 goto martian_destination;
2064
68a5e3dd 2065 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1da177e4
LT
2066out: return err;
2067
2068brd_input:
2069 if (skb->protocol != htons(ETH_P_IP))
2070 goto e_inval;
2071
41347dcd 2072 if (!ipv4_is_zeronet(saddr)) {
9e56e380
DM
2073 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2074 in_dev, &itag);
1da177e4 2075 if (err < 0)
b5f7e755 2076 goto martian_source_keep_err;
1da177e4
LT
2077 if (err)
2078 flags |= RTCF_DIRECTSRC;
2079 }
2080 flags |= RTCF_BROADCAST;
2081 res.type = RTN_BROADCAST;
2082 RT_CACHE_STAT_INC(in_brd);
2083
2084local_input:
5c1e6aa3
DM
2085 rth = rt_dst_alloc(net->loopback_dev,
2086 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1da177e4
LT
2087 if (!rth)
2088 goto e_nobufs;
2089
cf911662 2090 rth->dst.input= ip_local_deliver;
d8d1f30b 2091 rth->dst.output= ip_rt_bug;
cf911662
DM
2092#ifdef CONFIG_IP_ROUTE_CLASSID
2093 rth->dst.tclassid = itag;
2094#endif
1da177e4 2095
5e2b61f7 2096 rth->rt_key_dst = daddr;
5e2b61f7 2097 rth->rt_key_src = saddr;
cf911662
DM
2098 rth->rt_genid = rt_genid(net);
2099 rth->rt_flags = flags|RTCF_LOCAL;
2100 rth->rt_type = res.type;
475949d8 2101 rth->rt_key_tos = tos;
cf911662 2102 rth->rt_dst = daddr;
1da177e4 2103 rth->rt_src = saddr;
1b86a58f 2104 rth->rt_route_iif = dev->ifindex;
5e2b61f7 2105 rth->rt_iif = dev->ifindex;
cf911662
DM
2106 rth->rt_oif = 0;
2107 rth->rt_mark = skb->mark;
5943634f 2108 rth->rt_pmtu = 0;
1da177e4 2109 rth->rt_gateway = daddr;
cf911662 2110 rth->fi = NULL;
1da177e4 2111 if (res.type == RTN_UNREACHABLE) {
d8d1f30b
CG
2112 rth->dst.input= ip_error;
2113 rth->dst.error= -err;
1da177e4
LT
2114 rth->rt_flags &= ~RTCF_LOCAL;
2115 }
68a5e3dd
DM
2116 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2117 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
b23dd4fe
DM
2118 err = 0;
2119 if (IS_ERR(rth))
2120 err = PTR_ERR(rth);
ebc0ffae 2121 goto out;
1da177e4
LT
2122
2123no_route:
2124 RT_CACHE_STAT_INC(in_no_route);
1da177e4 2125 res.type = RTN_UNREACHABLE;
7f53878d
MC
2126 if (err == -ESRCH)
2127 err = -ENETUNREACH;
1da177e4
LT
2128 goto local_input;
2129
2130 /*
2131 * Do not cache martian addresses: they should be logged (RFC1812)
2132 */
2133martian_destination:
2134 RT_CACHE_STAT_INC(in_martian_dst);
2135#ifdef CONFIG_IP_ROUTE_VERBOSE
e87cc472
JP
2136 if (IN_DEV_LOG_MARTIANS(in_dev))
2137 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2138 &daddr, &saddr, dev->name);
1da177e4 2139#endif
2c2910a4 2140
1da177e4
LT
2141e_inval:
2142 err = -EINVAL;
ebc0ffae 2143 goto out;
1da177e4
LT
2144
2145e_nobufs:
2146 err = -ENOBUFS;
ebc0ffae 2147 goto out;
1da177e4
LT
2148
2149martian_source:
b5f7e755
ED
2150 err = -EINVAL;
2151martian_source_keep_err:
1da177e4 2152 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
ebc0ffae 2153 goto out;
1da177e4
LT
2154}
2155
407eadd9 2156int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
c10237e0 2157 u8 tos, struct net_device *dev, bool noref)
1da177e4 2158{
95c96174
ED
2159 struct rtable *rth;
2160 unsigned int hash;
1da177e4 2161 int iif = dev->ifindex;
b5921910 2162 struct net *net;
96d36220 2163 int res;
1da177e4 2164
c346dca1 2165 net = dev_net(dev);
1080d709 2166
96d36220
ED
2167 rcu_read_lock();
2168
1080d709
NH
2169 if (!rt_caching(net))
2170 goto skip_cache;
2171
1da177e4 2172 tos &= IPTOS_RT_MASK;
e84f84f2 2173 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
1da177e4 2174
1da177e4 2175 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
d8d1f30b 2176 rth = rcu_dereference(rth->dst.rt_next)) {
5e2b61f7
DM
2177 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2178 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
97a80410 2179 (rth->rt_route_iif ^ iif) |
475949d8 2180 (rth->rt_key_tos ^ tos)) == 0 &&
5e2b61f7 2181 rth->rt_mark == skb->mark &&
d8d1f30b 2182 net_eq(dev_net(rth->dst.dev), net) &&
e84f84f2 2183 !rt_is_expired(rth)) {
407eadd9 2184 if (noref) {
d8d1f30b
CG
2185 dst_use_noref(&rth->dst, jiffies);
2186 skb_dst_set_noref(skb, &rth->dst);
407eadd9 2187 } else {
d8d1f30b
CG
2188 dst_use(&rth->dst, jiffies);
2189 skb_dst_set(skb, &rth->dst);
407eadd9 2190 }
1da177e4
LT
2191 RT_CACHE_STAT_INC(in_hit);
2192 rcu_read_unlock();
1da177e4
LT
2193 return 0;
2194 }
2195 RT_CACHE_STAT_INC(in_hlist_search);
2196 }
1da177e4 2197
1080d709 2198skip_cache:
1da177e4
LT
2199 /* Multicast recognition logic is moved from route cache to here.
2200 The problem was that too many Ethernet cards have broken/missing
2201 hardware multicast filters :-( As result the host on multicasting
2202 network acquires a lot of useless route cache entries, sort of
2203 SDR messages from all the world. Now we try to get rid of them.
2204 Really, provided software IP multicast filter is organized
2205 reasonably (at least, hashed), it does not result in a slowdown
2206 comparing with route cache reject entries.
2207 Note, that multicast routers are not affected, because
2208 route cache entry is created eventually.
2209 */
f97c1e0c 2210 if (ipv4_is_multicast(daddr)) {
96d36220 2211 struct in_device *in_dev = __in_dev_get_rcu(dev);
1da177e4 2212
96d36220 2213 if (in_dev) {
dbdd9a52
DM
2214 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2215 ip_hdr(skb)->protocol);
1da177e4
LT
2216 if (our
2217#ifdef CONFIG_IP_MROUTE
9d4fb27d
JP
2218 ||
2219 (!ipv4_is_local_multicast(daddr) &&
2220 IN_DEV_MFORWARD(in_dev))
1da177e4 2221#endif
9d4fb27d 2222 ) {
96d36220
ED
2223 int res = ip_route_input_mc(skb, daddr, saddr,
2224 tos, dev, our);
1da177e4 2225 rcu_read_unlock();
96d36220 2226 return res;
1da177e4
LT
2227 }
2228 }
2229 rcu_read_unlock();
2230 return -EINVAL;
2231 }
c10237e0 2232 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
96d36220
ED
2233 rcu_read_unlock();
2234 return res;
1da177e4 2235}
407eadd9 2236EXPORT_SYMBOL(ip_route_input_common);
1da177e4 2237
ebc0ffae 2238/* called with rcu_read_lock() */
982721f3 2239static struct rtable *__mkroute_output(const struct fib_result *res,
68a5e3dd 2240 const struct flowi4 *fl4,
813b3b5d 2241 __be32 orig_daddr, __be32 orig_saddr,
f61759e6
JA
2242 int orig_oif, __u8 orig_rtos,
2243 struct net_device *dev_out,
5ada5527 2244 unsigned int flags)
1da177e4 2245{
982721f3 2246 struct fib_info *fi = res->fi;
5ada5527 2247 struct in_device *in_dev;
982721f3 2248 u16 type = res->type;
5ada5527 2249 struct rtable *rth;
1da177e4 2250
d0daebc3
TG
2251 in_dev = __in_dev_get_rcu(dev_out);
2252 if (!in_dev)
5ada5527 2253 return ERR_PTR(-EINVAL);
1da177e4 2254
d0daebc3
TG
2255 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2256 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2257 return ERR_PTR(-EINVAL);
2258
68a5e3dd 2259 if (ipv4_is_lbcast(fl4->daddr))
982721f3 2260 type = RTN_BROADCAST;
68a5e3dd 2261 else if (ipv4_is_multicast(fl4->daddr))
982721f3 2262 type = RTN_MULTICAST;
68a5e3dd 2263 else if (ipv4_is_zeronet(fl4->daddr))
5ada5527 2264 return ERR_PTR(-EINVAL);
1da177e4
LT
2265
2266 if (dev_out->flags & IFF_LOOPBACK)
2267 flags |= RTCF_LOCAL;
2268
982721f3 2269 if (type == RTN_BROADCAST) {
1da177e4 2270 flags |= RTCF_BROADCAST | RTCF_LOCAL;
982721f3
DM
2271 fi = NULL;
2272 } else if (type == RTN_MULTICAST) {
dd28d1a0 2273 flags |= RTCF_MULTICAST | RTCF_LOCAL;
813b3b5d
DM
2274 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2275 fl4->flowi4_proto))
1da177e4
LT
2276 flags &= ~RTCF_LOCAL;
2277 /* If multicast route do not exist use
dd28d1a0
ED
2278 * default one, but do not gateway in this case.
2279 * Yes, it is hack.
1da177e4 2280 */
982721f3
DM
2281 if (fi && res->prefixlen < 4)
2282 fi = NULL;
1da177e4
LT
2283 }
2284
5c1e6aa3
DM
2285 rth = rt_dst_alloc(dev_out,
2286 IN_DEV_CONF_GET(in_dev, NOPOLICY),
0c4dcd58 2287 IN_DEV_CONF_GET(in_dev, NOXFRM));
8391d07b 2288 if (!rth)
5ada5527 2289 return ERR_PTR(-ENOBUFS);
8391d07b 2290
cf911662
DM
2291 rth->dst.output = ip_output;
2292
813b3b5d
DM
2293 rth->rt_key_dst = orig_daddr;
2294 rth->rt_key_src = orig_saddr;
cf911662
DM
2295 rth->rt_genid = rt_genid(dev_net(dev_out));
2296 rth->rt_flags = flags;
2297 rth->rt_type = type;
f61759e6 2298 rth->rt_key_tos = orig_rtos;
68a5e3dd
DM
2299 rth->rt_dst = fl4->daddr;
2300 rth->rt_src = fl4->saddr;
1b86a58f 2301 rth->rt_route_iif = 0;
813b3b5d
DM
2302 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2303 rth->rt_oif = orig_oif;
2304 rth->rt_mark = fl4->flowi4_mark;
5943634f 2305 rth->rt_pmtu = 0;
68a5e3dd 2306 rth->rt_gateway = fl4->daddr;
cf911662 2307 rth->fi = NULL;
1da177e4
LT
2308
2309 RT_CACHE_STAT_INC(out_slow_tot);
2310
41347dcd 2311 if (flags & RTCF_LOCAL)
d8d1f30b 2312 rth->dst.input = ip_local_deliver;
1da177e4 2313 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
e905a9ed 2314 if (flags & RTCF_LOCAL &&
1da177e4 2315 !(dev_out->flags & IFF_LOOPBACK)) {
d8d1f30b 2316 rth->dst.output = ip_mc_output;
1da177e4
LT
2317 RT_CACHE_STAT_INC(out_slow_mc);
2318 }
2319#ifdef CONFIG_IP_MROUTE
982721f3 2320 if (type == RTN_MULTICAST) {
1da177e4 2321 if (IN_DEV_MFORWARD(in_dev) &&
813b3b5d 2322 !ipv4_is_local_multicast(fl4->daddr)) {
d8d1f30b
CG
2323 rth->dst.input = ip_mr_input;
2324 rth->dst.output = ip_mc_output;
1da177e4
LT
2325 }
2326 }
2327#endif
2328 }
2329
813b3b5d 2330 rt_set_nexthop(rth, fl4, res, fi, type, 0);
1da177e4 2331
7586eceb
ED
2332 if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
2333 rth->dst.flags |= DST_NOCACHE;
2334
5ada5527 2335 return rth;
1da177e4
LT
2336}
2337
1da177e4
LT
2338/*
2339 * Major route resolver routine.
0197aa38 2340 * called with rcu_read_lock();
1da177e4
LT
2341 */
2342
813b3b5d 2343static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
1da177e4 2344{
1da177e4 2345 struct net_device *dev_out = NULL;
f61759e6 2346 __u8 tos = RT_FL_TOS(fl4);
813b3b5d
DM
2347 unsigned int flags = 0;
2348 struct fib_result res;
5ada5527 2349 struct rtable *rth;
813b3b5d
DM
2350 __be32 orig_daddr;
2351 __be32 orig_saddr;
2352 int orig_oif;
1da177e4
LT
2353
2354 res.fi = NULL;
8b96d22d 2355 res.table = NULL;
1da177e4
LT
2356#ifdef CONFIG_IP_MULTIPLE_TABLES
2357 res.r = NULL;
2358#endif
2359
813b3b5d
DM
2360 orig_daddr = fl4->daddr;
2361 orig_saddr = fl4->saddr;
2362 orig_oif = fl4->flowi4_oif;
2363
2364 fl4->flowi4_iif = net->loopback_dev->ifindex;
2365 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2366 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2367 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
44713b67 2368
010c2708 2369 rcu_read_lock();
813b3b5d 2370 if (fl4->saddr) {
b23dd4fe 2371 rth = ERR_PTR(-EINVAL);
813b3b5d
DM
2372 if (ipv4_is_multicast(fl4->saddr) ||
2373 ipv4_is_lbcast(fl4->saddr) ||
2374 ipv4_is_zeronet(fl4->saddr))
1da177e4
LT
2375 goto out;
2376
1da177e4
LT
2377 /* I removed check for oif == dev_out->oif here.
2378 It was wrong for two reasons:
1ab35276
DL
2379 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2380 is assigned to multiple interfaces.
1da177e4
LT
2381 2. Moreover, we are allowed to send packets with saddr
2382 of another iface. --ANK
2383 */
2384
813b3b5d
DM
2385 if (fl4->flowi4_oif == 0 &&
2386 (ipv4_is_multicast(fl4->daddr) ||
2387 ipv4_is_lbcast(fl4->daddr))) {
a210d01a 2388 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2389 dev_out = __ip_dev_find(net, fl4->saddr, false);
a210d01a
JA
2390 if (dev_out == NULL)
2391 goto out;
2392
1da177e4
LT
2393 /* Special hack: user can direct multicasts
2394 and limited broadcast via necessary interface
2395 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2396 This hack is not just for fun, it allows
2397 vic,vat and friends to work.
2398 They bind socket to loopback, set ttl to zero
2399 and expect that it will work.
2400 From the viewpoint of routing cache they are broken,
2401 because we are not allowed to build multicast path
2402 with loopback source addr (look, routing cache
2403 cannot know, that ttl is zero, so that packet
2404 will not leave this host and route is valid).
2405 Luckily, this hack is good workaround.
2406 */
2407
813b3b5d 2408 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2409 goto make_route;
2410 }
a210d01a 2411
813b3b5d 2412 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
a210d01a 2413 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2414 if (!__ip_dev_find(net, fl4->saddr, false))
a210d01a 2415 goto out;
a210d01a 2416 }
1da177e4
LT
2417 }
2418
2419
813b3b5d
DM
2420 if (fl4->flowi4_oif) {
2421 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
b23dd4fe 2422 rth = ERR_PTR(-ENODEV);
1da177e4
LT
2423 if (dev_out == NULL)
2424 goto out;
e5ed6399
HX
2425
2426 /* RACE: Check return value of inet_select_addr instead. */
fc75fc83 2427 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
b23dd4fe 2428 rth = ERR_PTR(-ENETUNREACH);
fc75fc83
ED
2429 goto out;
2430 }
813b3b5d
DM
2431 if (ipv4_is_local_multicast(fl4->daddr) ||
2432 ipv4_is_lbcast(fl4->daddr)) {
2433 if (!fl4->saddr)
2434 fl4->saddr = inet_select_addr(dev_out, 0,
2435 RT_SCOPE_LINK);
1da177e4
LT
2436 goto make_route;
2437 }
813b3b5d
DM
2438 if (fl4->saddr) {
2439 if (ipv4_is_multicast(fl4->daddr))
2440 fl4->saddr = inet_select_addr(dev_out, 0,
2441 fl4->flowi4_scope);
2442 else if (!fl4->daddr)
2443 fl4->saddr = inet_select_addr(dev_out, 0,
2444 RT_SCOPE_HOST);
1da177e4
LT
2445 }
2446 }
2447
813b3b5d
DM
2448 if (!fl4->daddr) {
2449 fl4->daddr = fl4->saddr;
2450 if (!fl4->daddr)
2451 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
b40afd0e 2452 dev_out = net->loopback_dev;
813b3b5d 2453 fl4->flowi4_oif = net->loopback_dev->ifindex;
1da177e4
LT
2454 res.type = RTN_LOCAL;
2455 flags |= RTCF_LOCAL;
2456 goto make_route;
2457 }
2458
813b3b5d 2459 if (fib_lookup(net, fl4, &res)) {
1da177e4 2460 res.fi = NULL;
8b96d22d 2461 res.table = NULL;
813b3b5d 2462 if (fl4->flowi4_oif) {
1da177e4
LT
2463 /* Apparently, routing tables are wrong. Assume,
2464 that the destination is on link.
2465
2466 WHY? DW.
2467 Because we are allowed to send to iface
2468 even if it has NO routes and NO assigned
2469 addresses. When oif is specified, routing
2470 tables are looked up with only one purpose:
2471 to catch if destination is gatewayed, rather than
2472 direct. Moreover, if MSG_DONTROUTE is set,
2473 we send packet, ignoring both routing tables
2474 and ifaddr state. --ANK
2475
2476
2477 We could make it even if oif is unknown,
2478 likely IPv6, but we do not.
2479 */
2480
813b3b5d
DM
2481 if (fl4->saddr == 0)
2482 fl4->saddr = inet_select_addr(dev_out, 0,
2483 RT_SCOPE_LINK);
1da177e4
LT
2484 res.type = RTN_UNICAST;
2485 goto make_route;
2486 }
b23dd4fe 2487 rth = ERR_PTR(-ENETUNREACH);
1da177e4
LT
2488 goto out;
2489 }
1da177e4
LT
2490
2491 if (res.type == RTN_LOCAL) {
813b3b5d 2492 if (!fl4->saddr) {
9fc3bbb4 2493 if (res.fi->fib_prefsrc)
813b3b5d 2494 fl4->saddr = res.fi->fib_prefsrc;
9fc3bbb4 2495 else
813b3b5d 2496 fl4->saddr = fl4->daddr;
9fc3bbb4 2497 }
b40afd0e 2498 dev_out = net->loopback_dev;
813b3b5d 2499 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2500 res.fi = NULL;
2501 flags |= RTCF_LOCAL;
2502 goto make_route;
2503 }
2504
2505#ifdef CONFIG_IP_ROUTE_MULTIPATH
813b3b5d 2506 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
1b7fe593 2507 fib_select_multipath(&res);
1da177e4
LT
2508 else
2509#endif
21d8c49e
DM
2510 if (!res.prefixlen &&
2511 res.table->tb_num_default > 1 &&
813b3b5d 2512 res.type == RTN_UNICAST && !fl4->flowi4_oif)
0c838ff1 2513 fib_select_default(&res);
1da177e4 2514
813b3b5d
DM
2515 if (!fl4->saddr)
2516 fl4->saddr = FIB_RES_PREFSRC(net, res);
1da177e4 2517
1da177e4 2518 dev_out = FIB_RES_DEV(res);
813b3b5d 2519 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2520
2521
2522make_route:
813b3b5d 2523 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
f61759e6 2524 tos, dev_out, flags);
b23dd4fe 2525 if (!IS_ERR(rth)) {
5ada5527
DM
2526 unsigned int hash;
2527
813b3b5d 2528 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
5ada5527 2529 rt_genid(dev_net(dev_out)));
813b3b5d 2530 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
5ada5527 2531 }
1da177e4 2532
010c2708
DM
2533out:
2534 rcu_read_unlock();
b23dd4fe 2535 return rth;
1da177e4
LT
2536}
2537
813b3b5d 2538struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
1da177e4 2539{
1da177e4 2540 struct rtable *rth;
010c2708 2541 unsigned int hash;
1da177e4 2542
1080d709
NH
2543 if (!rt_caching(net))
2544 goto slow_output;
2545
9d6ec938 2546 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
1da177e4
LT
2547
2548 rcu_read_lock_bh();
a898def2 2549 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
d8d1f30b 2550 rth = rcu_dereference_bh(rth->dst.rt_next)) {
9d6ec938
DM
2551 if (rth->rt_key_dst == flp4->daddr &&
2552 rth->rt_key_src == flp4->saddr &&
c7537967 2553 rt_is_output_route(rth) &&
9d6ec938
DM
2554 rth->rt_oif == flp4->flowi4_oif &&
2555 rth->rt_mark == flp4->flowi4_mark &&
475949d8 2556 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
b5921910 2557 (IPTOS_RT_MASK | RTO_ONLINK)) &&
d8d1f30b 2558 net_eq(dev_net(rth->dst.dev), net) &&
e84f84f2 2559 !rt_is_expired(rth)) {
d8d1f30b 2560 dst_use(&rth->dst, jiffies);
1da177e4
LT
2561 RT_CACHE_STAT_INC(out_hit);
2562 rcu_read_unlock_bh();
56157872
DM
2563 if (!flp4->saddr)
2564 flp4->saddr = rth->rt_src;
2565 if (!flp4->daddr)
2566 flp4->daddr = rth->rt_dst;
b23dd4fe 2567 return rth;
1da177e4
LT
2568 }
2569 RT_CACHE_STAT_INC(out_hlist_search);
2570 }
2571 rcu_read_unlock_bh();
2572
1080d709 2573slow_output:
9d6ec938 2574 return ip_route_output_slow(net, flp4);
1da177e4 2575}
d8c97a94
ACM
2576EXPORT_SYMBOL_GPL(__ip_route_output_key);
2577
ae2688d5
JW
2578static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2579{
2580 return NULL;
2581}
2582
ebb762f2 2583static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 2584{
618f9bc7
SK
2585 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2586
2587 return mtu ? : dst->dev->mtu;
ec831ea7
RD
2588}
2589
14e50e57
DM
2590static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2591{
2592}
2593
0972ddb2
HB
2594static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2595 unsigned long old)
2596{
2597 return NULL;
2598}
2599
14e50e57
DM
2600static struct dst_ops ipv4_dst_blackhole_ops = {
2601 .family = AF_INET,
09640e63 2602 .protocol = cpu_to_be16(ETH_P_IP),
14e50e57 2603 .destroy = ipv4_dst_destroy,
ae2688d5 2604 .check = ipv4_blackhole_dst_check,
ebb762f2 2605 .mtu = ipv4_blackhole_mtu,
214f45c9 2606 .default_advmss = ipv4_default_advmss,
14e50e57 2607 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
0972ddb2 2608 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
d3aaeb38 2609 .neigh_lookup = ipv4_neigh_lookup,
14e50e57
DM
2610};
2611
2774c131 2612struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
14e50e57 2613{
5c1e6aa3 2614 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2774c131 2615 struct rtable *ort = (struct rtable *) dst_orig;
14e50e57
DM
2616
2617 if (rt) {
d8d1f30b 2618 struct dst_entry *new = &rt->dst;
14e50e57 2619
14e50e57 2620 new->__use = 1;
352e512c
HX
2621 new->input = dst_discard;
2622 new->output = dst_discard;
14e50e57 2623
d8d1f30b 2624 new->dev = ort->dst.dev;
14e50e57
DM
2625 if (new->dev)
2626 dev_hold(new->dev);
2627
5e2b61f7
DM
2628 rt->rt_key_dst = ort->rt_key_dst;
2629 rt->rt_key_src = ort->rt_key_src;
475949d8 2630 rt->rt_key_tos = ort->rt_key_tos;
1b86a58f 2631 rt->rt_route_iif = ort->rt_route_iif;
5e2b61f7
DM
2632 rt->rt_iif = ort->rt_iif;
2633 rt->rt_oif = ort->rt_oif;
2634 rt->rt_mark = ort->rt_mark;
5943634f 2635 rt->rt_pmtu = ort->rt_pmtu;
14e50e57 2636
e84f84f2 2637 rt->rt_genid = rt_genid(net);
14e50e57
DM
2638 rt->rt_flags = ort->rt_flags;
2639 rt->rt_type = ort->rt_type;
2640 rt->rt_dst = ort->rt_dst;
2641 rt->rt_src = ort->rt_src;
14e50e57 2642 rt->rt_gateway = ort->rt_gateway;
62fa8a84
DM
2643 rt->fi = ort->fi;
2644 if (rt->fi)
2645 atomic_inc(&rt->fi->fib_clntref);
14e50e57
DM
2646
2647 dst_free(new);
2648 }
2649
2774c131
DM
2650 dst_release(dst_orig);
2651
2652 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
14e50e57
DM
2653}
2654
9d6ec938 2655struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
b23dd4fe 2656 struct sock *sk)
1da177e4 2657{
9d6ec938 2658 struct rtable *rt = __ip_route_output_key(net, flp4);
1da177e4 2659
b23dd4fe
DM
2660 if (IS_ERR(rt))
2661 return rt;
1da177e4 2662
56157872 2663 if (flp4->flowi4_proto)
9d6ec938
DM
2664 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2665 flowi4_to_flowi(flp4),
2666 sk, 0);
1da177e4 2667
b23dd4fe 2668 return rt;
1da177e4 2669}
d8c97a94
ACM
2670EXPORT_SYMBOL_GPL(ip_route_output_flow);
2671
4feb88e5
BT
2672static int rt_fill_info(struct net *net,
2673 struct sk_buff *skb, u32 pid, u32 seq, int event,
b6544c0b 2674 int nowait, unsigned int flags)
1da177e4 2675{
511c3f92 2676 struct rtable *rt = skb_rtable(skb);
1da177e4 2677 struct rtmsg *r;
be403ea1 2678 struct nlmsghdr *nlh;
2bc8ca40 2679 unsigned long expires = 0;
f185071d 2680 u32 error;
be403ea1
TG
2681
2682 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2683 if (nlh == NULL)
26932566 2684 return -EMSGSIZE;
be403ea1
TG
2685
2686 r = nlmsg_data(nlh);
1da177e4
LT
2687 r->rtm_family = AF_INET;
2688 r->rtm_dst_len = 32;
2689 r->rtm_src_len = 0;
475949d8 2690 r->rtm_tos = rt->rt_key_tos;
1da177e4 2691 r->rtm_table = RT_TABLE_MAIN;
f3756b79
DM
2692 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2693 goto nla_put_failure;
1da177e4
LT
2694 r->rtm_type = rt->rt_type;
2695 r->rtm_scope = RT_SCOPE_UNIVERSE;
2696 r->rtm_protocol = RTPROT_UNSPEC;
2697 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2698 if (rt->rt_flags & RTCF_NOTIFY)
2699 r->rtm_flags |= RTM_F_NOTIFY;
be403ea1 2700
f3756b79
DM
2701 if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2702 goto nla_put_failure;
5e2b61f7 2703 if (rt->rt_key_src) {
1da177e4 2704 r->rtm_src_len = 32;
f3756b79
DM
2705 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2706 goto nla_put_failure;
1da177e4 2707 }
f3756b79
DM
2708 if (rt->dst.dev &&
2709 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2710 goto nla_put_failure;
c7066f70 2711#ifdef CONFIG_IP_ROUTE_CLASSID
f3756b79
DM
2712 if (rt->dst.tclassid &&
2713 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2714 goto nla_put_failure;
1da177e4 2715#endif
41347dcd
DM
2716 if (!rt_is_input_route(rt) &&
2717 rt->rt_src != rt->rt_key_src) {
f3756b79
DM
2718 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2719 goto nla_put_failure;
2720 }
2721 if (rt->rt_dst != rt->rt_gateway &&
2722 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2723 goto nla_put_failure;
be403ea1 2724
defb3519 2725 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
be403ea1
TG
2726 goto nla_put_failure;
2727
f3756b79
DM
2728 if (rt->rt_mark &&
2729 nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2730 goto nla_put_failure;
963bfeee 2731
d8d1f30b 2732 error = rt->dst.error;
5943634f
DM
2733 expires = rt->dst.expires;
2734 if (expires) {
2735 if (time_before(jiffies, expires))
2736 expires -= jiffies;
2737 else
2738 expires = 0;
1da177e4 2739 }
be403ea1 2740
c7537967 2741 if (rt_is_input_route(rt)) {
1da177e4 2742#ifdef CONFIG_IP_MROUTE
e448515c 2743 __be32 dst = rt->rt_dst;
1da177e4 2744
f97c1e0c 2745 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
4feb88e5 2746 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
9a1b9496
DM
2747 int err = ipmr_get_route(net, skb,
2748 rt->rt_src, rt->rt_dst,
2749 r, nowait);
1da177e4
LT
2750 if (err <= 0) {
2751 if (!nowait) {
2752 if (err == 0)
2753 return 0;
be403ea1 2754 goto nla_put_failure;
1da177e4
LT
2755 } else {
2756 if (err == -EMSGSIZE)
be403ea1 2757 goto nla_put_failure;
e3703b3d 2758 error = err;
1da177e4
LT
2759 }
2760 }
2761 } else
2762#endif
f3756b79
DM
2763 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2764 goto nla_put_failure;
1da177e4
LT
2765 }
2766
f185071d 2767 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
e3703b3d 2768 goto nla_put_failure;
be403ea1
TG
2769
2770 return nlmsg_end(skb, nlh);
1da177e4 2771
be403ea1 2772nla_put_failure:
26932566
PM
2773 nlmsg_cancel(skb, nlh);
2774 return -EMSGSIZE;
1da177e4
LT
2775}
2776
5e73ea1a 2777static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
1da177e4 2778{
3b1e0a65 2779 struct net *net = sock_net(in_skb->sk);
d889ce3b
TG
2780 struct rtmsg *rtm;
2781 struct nlattr *tb[RTA_MAX+1];
1da177e4 2782 struct rtable *rt = NULL;
9e12bb22
AV
2783 __be32 dst = 0;
2784 __be32 src = 0;
2785 u32 iif;
d889ce3b 2786 int err;
963bfeee 2787 int mark;
1da177e4
LT
2788 struct sk_buff *skb;
2789
d889ce3b
TG
2790 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2791 if (err < 0)
2792 goto errout;
2793
2794 rtm = nlmsg_data(nlh);
2795
1da177e4 2796 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
d889ce3b
TG
2797 if (skb == NULL) {
2798 err = -ENOBUFS;
2799 goto errout;
2800 }
1da177e4
LT
2801
2802 /* Reserve room for dummy headers, this skb can pass
2803 through good chunk of routing engine.
2804 */
459a98ed 2805 skb_reset_mac_header(skb);
c1d2bbe1 2806 skb_reset_network_header(skb);
d2c962b8
SH
2807
2808 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
eddc9ec5 2809 ip_hdr(skb)->protocol = IPPROTO_ICMP;
1da177e4
LT
2810 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2811
17fb2c64
AV
2812 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2813 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
d889ce3b 2814 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
963bfeee 2815 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
1da177e4
LT
2816
2817 if (iif) {
d889ce3b
TG
2818 struct net_device *dev;
2819
1937504d 2820 dev = __dev_get_by_index(net, iif);
d889ce3b
TG
2821 if (dev == NULL) {
2822 err = -ENODEV;
2823 goto errout_free;
2824 }
2825
1da177e4
LT
2826 skb->protocol = htons(ETH_P_IP);
2827 skb->dev = dev;
963bfeee 2828 skb->mark = mark;
1da177e4
LT
2829 local_bh_disable();
2830 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2831 local_bh_enable();
d889ce3b 2832
511c3f92 2833 rt = skb_rtable(skb);
d8d1f30b
CG
2834 if (err == 0 && rt->dst.error)
2835 err = -rt->dst.error;
1da177e4 2836 } else {
68a5e3dd
DM
2837 struct flowi4 fl4 = {
2838 .daddr = dst,
2839 .saddr = src,
2840 .flowi4_tos = rtm->rtm_tos,
2841 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2842 .flowi4_mark = mark,
d889ce3b 2843 };
9d6ec938 2844 rt = ip_route_output_key(net, &fl4);
b23dd4fe
DM
2845
2846 err = 0;
2847 if (IS_ERR(rt))
2848 err = PTR_ERR(rt);
1da177e4 2849 }
d889ce3b 2850
1da177e4 2851 if (err)
d889ce3b 2852 goto errout_free;
1da177e4 2853
d8d1f30b 2854 skb_dst_set(skb, &rt->dst);
1da177e4
LT
2855 if (rtm->rtm_flags & RTM_F_NOTIFY)
2856 rt->rt_flags |= RTCF_NOTIFY;
2857
4feb88e5 2858 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
1937504d 2859 RTM_NEWROUTE, 0, 0);
d889ce3b
TG
2860 if (err <= 0)
2861 goto errout_free;
1da177e4 2862
1937504d 2863 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
d889ce3b 2864errout:
2942e900 2865 return err;
1da177e4 2866
d889ce3b 2867errout_free:
1da177e4 2868 kfree_skb(skb);
d889ce3b 2869 goto errout;
1da177e4
LT
2870}
2871
2872int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2873{
2874 struct rtable *rt;
2875 int h, s_h;
2876 int idx, s_idx;
1937504d
DL
2877 struct net *net;
2878
3b1e0a65 2879 net = sock_net(skb->sk);
1da177e4
LT
2880
2881 s_h = cb->args[0];
d8c92830
ED
2882 if (s_h < 0)
2883 s_h = 0;
1da177e4 2884 s_idx = idx = cb->args[1];
a6272665
ED
2885 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2886 if (!rt_hash_table[h].chain)
2887 continue;
1da177e4 2888 rcu_read_lock_bh();
a898def2 2889 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
d8d1f30b
CG
2890 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
2891 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
1da177e4 2892 continue;
e84f84f2 2893 if (rt_is_expired(rt))
29e75252 2894 continue;
d8d1f30b 2895 skb_dst_set_noref(skb, &rt->dst);
4feb88e5 2896 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
e905a9ed 2897 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
b6544c0b 2898 1, NLM_F_MULTI) <= 0) {
adf30907 2899 skb_dst_drop(skb);
1da177e4
LT
2900 rcu_read_unlock_bh();
2901 goto done;
2902 }
adf30907 2903 skb_dst_drop(skb);
1da177e4
LT
2904 }
2905 rcu_read_unlock_bh();
2906 }
2907
2908done:
2909 cb->args[0] = h;
2910 cb->args[1] = idx;
2911 return skb->len;
2912}
2913
2914void ip_rt_multicast_event(struct in_device *in_dev)
2915{
76e6ebfb 2916 rt_cache_flush(dev_net(in_dev->dev), 0);
1da177e4
LT
2917}
2918
2919#ifdef CONFIG_SYSCTL
81c684d1 2920static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
8d65af78 2921 void __user *buffer,
1da177e4
LT
2922 size_t *lenp, loff_t *ppos)
2923{
2924 if (write) {
639e104f 2925 int flush_delay;
81c684d1 2926 ctl_table ctl;
39a23e75 2927 struct net *net;
639e104f 2928
81c684d1
DL
2929 memcpy(&ctl, __ctl, sizeof(ctl));
2930 ctl.data = &flush_delay;
8d65af78 2931 proc_dointvec(&ctl, write, buffer, lenp, ppos);
639e104f 2932
81c684d1 2933 net = (struct net *)__ctl->extra1;
39a23e75 2934 rt_cache_flush(net, flush_delay);
1da177e4 2935 return 0;
e905a9ed 2936 }
1da177e4
LT
2937
2938 return -EINVAL;
2939}
2940
/*
 * Global (not per-namespace) tunables for the IPv4 routing cache and
 * for ICMP redirect/error rate limiting, exported as net.ipv4.route.*.
 * Integer values; *_jiffies / *_ms_jiffies handlers convert units.
 */
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Same backing variable as gc_min_interval, in milliseconds. */
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }	/* sentinel */
};
39a23e75 3051
/*
 * Per-namespace "flush" control file (net.ipv4.route.flush).  extra1 is
 * filled in with the owning netns by sysctl_route_net_init().
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,	/* write-only */
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },	/* sentinel */
};
3061
3062static __net_init int sysctl_route_net_init(struct net *net)
3063{
3064 struct ctl_table *tbl;
3065
3066 tbl = ipv4_route_flush_table;
09ad9bc7 3067 if (!net_eq(net, &init_net)) {
39a23e75
DL
3068 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3069 if (tbl == NULL)
3070 goto err_dup;
3071 }
3072 tbl[0].extra1 = net;
3073
ec8f23ce 3074 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
39a23e75
DL
3075 if (net->ipv4.route_hdr == NULL)
3076 goto err_reg;
3077 return 0;
3078
3079err_reg:
3080 if (tbl != ipv4_route_flush_table)
3081 kfree(tbl);
3082err_dup:
3083 return -ENOMEM;
3084}
3085
3086static __net_exit void sysctl_route_net_exit(struct net *net)
3087{
3088 struct ctl_table *tbl;
3089
3090 tbl = net->ipv4.route_hdr->ctl_table_arg;
3091 unregister_net_sysctl_table(net->ipv4.route_hdr);
3092 BUG_ON(tbl == ipv4_route_flush_table);
3093 kfree(tbl);
3094}
3095
/* Hook the route sysctls into netns creation/destruction. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
1da177e4
LT
3100#endif
3101
/*
 * Seed the per-namespace routing-cache generation id and the device
 * address generation id with random values at netns creation.
 * Always returns 0.
 */
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}
3110
/* Generation ids are plain integers inside struct net — no .exit needed. */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
3114
c3426b47
DM
3115static int __net_init ipv4_inetpeer_init(struct net *net)
3116{
3117 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3118
3119 if (!bp)
3120 return -ENOMEM;
3121 inet_peer_base_init(bp);
3122 net->ipv4.peers = bp;
3123 return 0;
3124}
3125
3126static void __net_exit ipv4_inetpeer_exit(struct net *net)
3127{
3128 struct inet_peer_base *bp = net->ipv4.peers;
3129
3130 net->ipv4.peers = NULL;
56a6b248 3131 inetpeer_invalidate_tree(bp);
c3426b47
DM
3132 kfree(bp);
3133}
3134
/* Per-netns lifetime of the inetpeer storage. */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init = ipv4_inetpeer_init,
	.exit = ipv4_inetpeer_exit,
};
9f5e97e5 3139
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU route classid accounting; allocated in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4
LT
3143
3144static __initdata unsigned long rhash_entries;
3145static int __init set_rhash_entries(char *str)
3146{
413c27d8
EZ
3147 ssize_t ret;
3148
1da177e4
LT
3149 if (!str)
3150 return 0;
413c27d8
EZ
3151
3152 ret = kstrtoul(str, 0, &rhash_entries);
3153 if (ret)
3154 return 0;
3155
1da177e4
LT
3156 return 1;
3157}
3158__setup("rhash_entries=", set_rhash_entries);
3159
/*
 * Boot-time initialization of the IPv4 routing layer: slab caches,
 * the route-cache hash table, /proc entries, the periodic GC worker,
 * and the pernet subsystems.  The call order below is significant —
 * later steps depend on earlier ones (e.g. ip_rt_max_size is derived
 * from the hash mask before xfrm4_init() consumes it).
 * Returns 0; unrecoverable allocation failures panic instead.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	/* 256 classid slots per CPU for routing-based accounting. */
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole routes share the same slab as regular ones. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/* Size the route-cache hash by available RAM unless overridden
	 * by the rhash_entries= boot parameter; fills rt_hash_log and
	 * rt_hash_mask as side effects. */
	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					0,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	/* GC threshold and cache hard limit scale with the table size. */
	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* Kick off the periodic expiry worker with a randomized phase
	 * so all machines don't run GC in lock-step. */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	/* Non-fatal: routing still works without the proc files. */
	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}
3222
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
/* Register the static (init_net-only) route sysctls early in boot. */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif