net/ipv4/route.c
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
39 *
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
65#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
71#include <linux/mm.h>
72#include <linux/bootmem.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/workqueue.h>
83#include <linux/skbuff.h>
84#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
93#include <linux/slab.h>
94#include <net/dst.h>
95#include <net/net_namespace.h>
96#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
106#include <net/netevent.h>
107#include <net/rtnetlink.h>
108#ifdef CONFIG_SYSCTL
109#include <linux/sysctl.h>
110#endif
111
112#define RT_FL_TOS(oldflp) \
113 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114
115#define IP_MAX_MTU 0xFFF0
116
117#define RT_GC_TIMEOUT (300*HZ)
118
119static int ip_rt_max_size;
120static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
121static int ip_rt_gc_interval __read_mostly = 60 * HZ;
122static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
123static int ip_rt_redirect_number __read_mostly = 9;
124static int ip_rt_redirect_load __read_mostly = HZ / 50;
125static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126static int ip_rt_error_cost __read_mostly = HZ;
127static int ip_rt_error_burst __read_mostly = 5 * HZ;
128static int ip_rt_gc_elasticity __read_mostly = 8;
129static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
130static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131static int ip_rt_min_advmss __read_mostly = 256;
132static int rt_chain_length_max __read_mostly = 20;
133
134static struct delayed_work expires_work;
135static unsigned long expires_ljiffies;
136
137/*
138 * Interface to generic destination cache.
139 */
140
141static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
143static unsigned int ipv4_default_mtu(const struct dst_entry *dst);
144static void ipv4_dst_destroy(struct dst_entry *dst);
145static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146static void ipv4_link_failure(struct sk_buff *skb);
147static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148static int rt_garbage_collect(struct dst_ops *ops);
149
150static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
151 int how)
152{
153}
154
155static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
156{
157 struct rtable *rt = (struct rtable *) dst;
158 struct inet_peer *peer;
159 u32 *p = NULL;
160
161 if (!rt->peer)
162 rt_bind_peer(rt, 1);
163
164 peer = rt->peer;
165 if (peer) {
166 u32 *old_p = __DST_METRICS_PTR(old);
167 unsigned long prev, new;
168
169 p = peer->metrics;
170 if (inet_metrics_new(peer))
171 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
172
173 new = (unsigned long) p;
174 prev = cmpxchg(&dst->_metrics, old, new);
175
176 if (prev != old) {
177 p = __DST_METRICS_PTR(prev);
178 if (prev & DST_METRICS_READ_ONLY)
179 p = NULL;
180 } else {
181 if (rt->fi) {
182 fib_info_put(rt->fi);
183 rt->fi = NULL;
184 }
185 }
186 }
187 return p;
188}
189
190static struct dst_ops ipv4_dst_ops = {
191 .family = AF_INET,
192 .protocol = cpu_to_be16(ETH_P_IP),
193 .gc = rt_garbage_collect,
194 .check = ipv4_dst_check,
195 .default_advmss = ipv4_default_advmss,
196 .default_mtu = ipv4_default_mtu,
197 .cow_metrics = ipv4_cow_metrics,
198 .destroy = ipv4_dst_destroy,
199 .ifdown = ipv4_dst_ifdown,
200 .negative_advice = ipv4_negative_advice,
201 .link_failure = ipv4_link_failure,
202 .update_pmtu = ip_rt_update_pmtu,
203 .local_out = __ip_local_out,
204};
205
206#define ECN_OR_COST(class) TC_PRIO_##class
207
208const __u8 ip_tos2prio[16] = {
209 TC_PRIO_BESTEFFORT,
210 ECN_OR_COST(FILLER),
211 TC_PRIO_BESTEFFORT,
212 ECN_OR_COST(BESTEFFORT),
213 TC_PRIO_BULK,
214 ECN_OR_COST(BULK),
215 TC_PRIO_BULK,
216 ECN_OR_COST(BULK),
217 TC_PRIO_INTERACTIVE,
218 ECN_OR_COST(INTERACTIVE),
219 TC_PRIO_INTERACTIVE,
220 ECN_OR_COST(INTERACTIVE),
221 TC_PRIO_INTERACTIVE_BULK,
222 ECN_OR_COST(INTERACTIVE_BULK),
223 TC_PRIO_INTERACTIVE_BULK,
224 ECN_OR_COST(INTERACTIVE_BULK)
225};
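/*
 * Illustrative sketch (simplified, not verbatim from this file): the table
 * above is indexed by the four RFC 1349 TOS bits, roughly as the
 * rt_tos2priority() helper in include/net/route.h does:
 *
 *	static inline char rt_tos2priority(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *	}
 *
 * e.g. IPTOS_LOWDELAY (0x10) selects slot 8, i.e. TC_PRIO_INTERACTIVE.
 */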
226
227
228/*
229 * Route cache.
230 */
231
232/* The locking scheme is rather straightforward:
233 *
234 * 1) Read-Copy Update protects the buckets of the central route hash.
235 * 2) Only writers remove entries, and they hold the lock
236 * as they look at rtable reference counts.
237 * 3) Only readers acquire references to rtable entries,
238 * they do so with atomic increments and with the
239 * lock held.
240 */
241
242struct rt_hash_bucket {
243 struct rtable __rcu *chain;
244};
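/*
 * Illustrative reader-side sketch (simplified from the lookup paths later
 * in route.c): a lockless lookup walks one bucket entirely under
 * rcu_read_lock_bh(), e.g.
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
 *		if (compare_keys(&rth->fl, flp) && !rt_is_expired(rth)) {
 *			dst_use(&rth->dst, jiffies);
 *			break;
 *		}
 *	}
 *	rcu_read_unlock_bh();
 *
 * while writers take the per-bucket spinlock (rt_hash_lock_addr(hash))
 * before unlinking entries, as rt_intern_hash() and rt_del() do below.
 */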
245
246#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
247 defined(CONFIG_PROVE_LOCKING)
248/*
249 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
250 * The size of this table is a power of two and depends on the number of CPUs.
251 * (On lockdep we have a quite big spinlock_t, so keep the size down there.)
252 */
253#ifdef CONFIG_LOCKDEP
254# define RT_HASH_LOCK_SZ 256
255#else
256# if NR_CPUS >= 32
257# define RT_HASH_LOCK_SZ 4096
258# elif NR_CPUS >= 16
259# define RT_HASH_LOCK_SZ 2048
260# elif NR_CPUS >= 8
261# define RT_HASH_LOCK_SZ 1024
262# elif NR_CPUS >= 4
263# define RT_HASH_LOCK_SZ 512
264# else
265# define RT_HASH_LOCK_SZ 256
266# endif
267#endif
268
269static spinlock_t *rt_hash_locks;
270# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
271
272static __init void rt_hash_lock_init(void)
273{
274 int i;
275
276 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
277 GFP_KERNEL);
278 if (!rt_hash_locks)
279 panic("IP: failed to allocate rt_hash_locks\n");
280
281 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
282 spin_lock_init(&rt_hash_locks[i]);
283}
284#else
285# define rt_hash_lock_addr(slot) NULL
286
287static inline void rt_hash_lock_init(void)
288{
289}
290#endif
291
292static struct rt_hash_bucket *rt_hash_table __read_mostly;
293static unsigned rt_hash_mask __read_mostly;
294static unsigned int rt_hash_log __read_mostly;
295
296static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
297#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
298
299static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
300 int genid)
301{
302 return jhash_3words((__force u32)daddr, (__force u32)saddr,
303 idx, genid)
304 & rt_hash_mask;
305}
306
307static inline int rt_genid(struct net *net)
308{
309 return atomic_read(&net->ipv4.rt_genid);
310}
311
312#ifdef CONFIG_PROC_FS
313struct rt_cache_iter_state {
314 struct seq_net_private p;
315 int bucket;
316 int genid;
317};
318
319static struct rtable *rt_cache_get_first(struct seq_file *seq)
320{
321 struct rt_cache_iter_state *st = seq->private;
322 struct rtable *r = NULL;
323
324 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
325 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
326 continue;
327 rcu_read_lock_bh();
328 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
329 while (r) {
330 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
331 r->rt_genid == st->genid)
332 return r;
333 r = rcu_dereference_bh(r->dst.rt_next);
334 }
335 rcu_read_unlock_bh();
336 }
337 return r;
338}
339
340static struct rtable *__rt_cache_get_next(struct seq_file *seq,
341 struct rtable *r)
342{
343 struct rt_cache_iter_state *st = seq->private;
344
345 r = rcu_dereference_bh(r->dst.rt_next);
346 while (!r) {
347 rcu_read_unlock_bh();
348 do {
349 if (--st->bucket < 0)
350 return NULL;
351 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
352 rcu_read_lock_bh();
353 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
354 }
355 return r;
356}
357
358static struct rtable *rt_cache_get_next(struct seq_file *seq,
359 struct rtable *r)
360{
361 struct rt_cache_iter_state *st = seq->private;
362 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
363 if (dev_net(r->dst.dev) != seq_file_net(seq))
364 continue;
365 if (r->rt_genid == st->genid)
366 break;
367 }
368 return r;
369}
370
371static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
372{
373 struct rtable *r = rt_cache_get_first(seq);
374
375 if (r)
376 while (pos && (r = rt_cache_get_next(seq, r)))
377 --pos;
378 return pos ? NULL : r;
379}
380
381static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
382{
383 struct rt_cache_iter_state *st = seq->private;
384 if (*pos)
385 return rt_cache_get_idx(seq, *pos - 1);
386 st->genid = rt_genid(seq_file_net(seq));
387 return SEQ_START_TOKEN;
388}
389
390static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
391{
392 struct rtable *r;
393
394 if (v == SEQ_START_TOKEN)
395 r = rt_cache_get_first(seq);
396 else
397 r = rt_cache_get_next(seq, v);
398 ++*pos;
399 return r;
400}
401
402static void rt_cache_seq_stop(struct seq_file *seq, void *v)
403{
404 if (v && v != SEQ_START_TOKEN)
405 rcu_read_unlock_bh();
406}
407
408static int rt_cache_seq_show(struct seq_file *seq, void *v)
409{
410 if (v == SEQ_START_TOKEN)
411 seq_printf(seq, "%-127s\n",
412 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
413 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
414 "HHUptod\tSpecDst");
415 else {
416 struct rtable *r = v;
417 int len;
418
419 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
420 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
421 r->dst.dev ? r->dst.dev->name : "*",
422 (__force u32)r->rt_dst,
423 (__force u32)r->rt_gateway,
424 r->rt_flags, atomic_read(&r->dst.__refcnt),
425 r->dst.__use, 0, (__force u32)r->rt_src,
426 dst_metric_advmss(&r->dst) + 40,
427 dst_metric(&r->dst, RTAX_WINDOW),
428 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
429 dst_metric(&r->dst, RTAX_RTTVAR)),
430 r->fl.fl4_tos,
431 r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
432 r->dst.hh ? (r->dst.hh->hh_output ==
433 dev_queue_xmit) : 0,
434 r->rt_spec_dst, &len);
435
436 seq_printf(seq, "%*s\n", 127 - len, "");
437 }
438 return 0;
439}
440
441static const struct seq_operations rt_cache_seq_ops = {
442 .start = rt_cache_seq_start,
443 .next = rt_cache_seq_next,
444 .stop = rt_cache_seq_stop,
445 .show = rt_cache_seq_show,
446};
447
448static int rt_cache_seq_open(struct inode *inode, struct file *file)
449{
450 return seq_open_net(inode, file, &rt_cache_seq_ops,
451 sizeof(struct rt_cache_iter_state));
452}
453
454static const struct file_operations rt_cache_seq_fops = {
455 .owner = THIS_MODULE,
456 .open = rt_cache_seq_open,
457 .read = seq_read,
458 .llseek = seq_lseek,
459 .release = seq_release_net,
460};
461
462
463static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
464{
465 int cpu;
466
467 if (*pos == 0)
468 return SEQ_START_TOKEN;
469
470 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
471 if (!cpu_possible(cpu))
472 continue;
473 *pos = cpu+1;
474 return &per_cpu(rt_cache_stat, cpu);
475 }
476 return NULL;
477}
478
479static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
480{
481 int cpu;
482
483 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
484 if (!cpu_possible(cpu))
485 continue;
486 *pos = cpu+1;
487 return &per_cpu(rt_cache_stat, cpu);
488 }
489 return NULL;
490
491}
492
493static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
494{
495
496}
497
498static int rt_cpu_seq_show(struct seq_file *seq, void *v)
499{
500 struct rt_cache_stat *st = v;
501
502 if (v == SEQ_START_TOKEN) {
503 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
504 return 0;
505 }
506
507 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
508 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
509 dst_entries_get_slow(&ipv4_dst_ops),
510 st->in_hit,
511 st->in_slow_tot,
512 st->in_slow_mc,
513 st->in_no_route,
514 st->in_brd,
515 st->in_martian_dst,
516 st->in_martian_src,
517
518 st->out_hit,
519 st->out_slow_tot,
520 st->out_slow_mc,
521
522 st->gc_total,
523 st->gc_ignored,
524 st->gc_goal_miss,
525 st->gc_dst_overflow,
526 st->in_hlist_search,
527 st->out_hlist_search
528 );
529 return 0;
530}
531
532static const struct seq_operations rt_cpu_seq_ops = {
533 .start = rt_cpu_seq_start,
534 .next = rt_cpu_seq_next,
535 .stop = rt_cpu_seq_stop,
536 .show = rt_cpu_seq_show,
537};
538
539
540static int rt_cpu_seq_open(struct inode *inode, struct file *file)
541{
542 return seq_open(file, &rt_cpu_seq_ops);
543}
544
545static const struct file_operations rt_cpu_seq_fops = {
546 .owner = THIS_MODULE,
547 .open = rt_cpu_seq_open,
548 .read = seq_read,
549 .llseek = seq_lseek,
550 .release = seq_release,
551};
552
553#ifdef CONFIG_IP_ROUTE_CLASSID
554static int rt_acct_proc_show(struct seq_file *m, void *v)
555{
556 struct ip_rt_acct *dst, *src;
557 unsigned int i, j;
558
559 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
560 if (!dst)
561 return -ENOMEM;
562
563 for_each_possible_cpu(i) {
564 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
565 for (j = 0; j < 256; j++) {
566 dst[j].o_bytes += src[j].o_bytes;
567 dst[j].o_packets += src[j].o_packets;
568 dst[j].i_bytes += src[j].i_bytes;
569 dst[j].i_packets += src[j].i_packets;
570 }
571 }
572
573 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
574 kfree(dst);
575 return 0;
576}
577
578static int rt_acct_proc_open(struct inode *inode, struct file *file)
579{
580 return single_open(file, rt_acct_proc_show, NULL);
581}
582
583static const struct file_operations rt_acct_proc_fops = {
584 .owner = THIS_MODULE,
585 .open = rt_acct_proc_open,
586 .read = seq_read,
587 .llseek = seq_lseek,
588 .release = single_release,
589};
590#endif
591
592static int __net_init ip_rt_do_proc_init(struct net *net)
593{
594 struct proc_dir_entry *pde;
595
596 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
597 &rt_cache_seq_fops);
598 if (!pde)
599 goto err1;
600
601 pde = proc_create("rt_cache", S_IRUGO,
602 net->proc_net_stat, &rt_cpu_seq_fops);
603 if (!pde)
604 goto err2;
605
606#ifdef CONFIG_IP_ROUTE_CLASSID
607 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
608 if (!pde)
609 goto err3;
610#endif
611 return 0;
612
613#ifdef CONFIG_IP_ROUTE_CLASSID
614err3:
615 remove_proc_entry("rt_cache", net->proc_net_stat);
616#endif
617err2:
618 remove_proc_entry("rt_cache", net->proc_net);
619err1:
620 return -ENOMEM;
621}
622
623static void __net_exit ip_rt_do_proc_exit(struct net *net)
624{
625 remove_proc_entry("rt_cache", net->proc_net_stat);
626 remove_proc_entry("rt_cache", net->proc_net);
627#ifdef CONFIG_IP_ROUTE_CLASSID
628 remove_proc_entry("rt_acct", net->proc_net);
629#endif
630}
631
632static struct pernet_operations ip_rt_proc_ops __net_initdata = {
633 .init = ip_rt_do_proc_init,
634 .exit = ip_rt_do_proc_exit,
635};
636
637static int __init ip_rt_proc_init(void)
638{
639 return register_pernet_subsys(&ip_rt_proc_ops);
640}
641
642#else
643static inline int ip_rt_proc_init(void)
644{
645 return 0;
646}
647#endif /* CONFIG_PROC_FS */
648
649static inline void rt_free(struct rtable *rt)
650{
651 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
652}
653
654static inline void rt_drop(struct rtable *rt)
655{
656 ip_rt_put(rt);
657 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
658}
659
660static inline int rt_fast_clean(struct rtable *rth)
661{
662 /* Kill broadcast/multicast entries very aggressively, if they
663 collide in the hash table with more useful entries */
664 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
665 rt_is_input_route(rth) && rth->dst.rt_next;
666}
667
668static inline int rt_valuable(struct rtable *rth)
669{
670 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
671 rth->dst.expires;
672}
673
674static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
675{
676 unsigned long age;
677 int ret = 0;
678
679 if (atomic_read(&rth->dst.__refcnt))
680 goto out;
681
682 ret = 1;
683 if (rth->dst.expires &&
684 time_after_eq(jiffies, rth->dst.expires))
685 goto out;
686
687 age = jiffies - rth->dst.lastuse;
688 ret = 0;
689 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
690 (age <= tmo2 && rt_valuable(rth)))
691 goto out;
692 ret = 1;
693out: return ret;
694}
695
696/* Bits of score are:
697 * 31: very valuable
698 * 30: not quite useless
699 * 29..0: usage counter
700 */
701static inline u32 rt_score(struct rtable *rt)
702{
703 u32 score = jiffies - rt->dst.lastuse;
704
705 score = ~score & ~(3<<30);
706
707 if (rt_valuable(rt))
708 score |= (1<<31);
709
710 if (rt_is_output_route(rt) ||
711 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
712 score |= (1<<30);
713
714 return score;
715}
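/*
 * Example: an idle input broadcast entry scores only its inverted age in
 * bits 29..0, so the longer it sits unused the lower it scores; a unicast
 * or output route additionally sets bit 30, and a redirected/notified or
 * expiring entry sets bit 31.  rt_intern_hash() below evicts the
 * unreferenced entry with the lowest score once a chain exceeds
 * ip_rt_gc_elasticity.
 */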
716
717static inline bool rt_caching(const struct net *net)
718{
719 return net->ipv4.current_rt_cache_rebuild_count <=
720 net->ipv4.sysctl_rt_cache_rebuild_count;
721}
722
723static inline bool compare_hash_inputs(const struct flowi *fl1,
724 const struct flowi *fl2)
725{
726 return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
727 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
728 (fl1->iif ^ fl2->iif)) == 0);
729}
730
731static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
732{
733 return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
734 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
735 (fl1->mark ^ fl2->mark) |
736 (*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
737 (fl1->oif ^ fl2->oif) |
738 (fl1->iif ^ fl2->iif)) == 0;
739}
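/*
 * Both helpers above use the same branch-free idiom: XOR each pair of
 * fields and OR the results, so the whole expression is zero iff every
 * field matches, i.e.
 *
 *	((a1 ^ a2) | (b1 ^ b2)) == 0  <=>  a1 == a2 && b1 == b2
 */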
740
741static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
742{
743 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
744}
745
746static inline int rt_is_expired(struct rtable *rth)
747{
748 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
749}
750
751/*
752 * Perform a full scan of the hash table and free all entries.
753 * Can be called by a softirq or a process.
754 * In the latter case, we want to reschedule if necessary.
755 */
756static void rt_do_flush(struct net *net, int process_context)
757{
758 unsigned int i;
759 struct rtable *rth, *next;
760
761 for (i = 0; i <= rt_hash_mask; i++) {
762 struct rtable __rcu **pprev;
763 struct rtable *list;
764
765 if (process_context && need_resched())
766 cond_resched();
767 rth = rcu_dereference_raw(rt_hash_table[i].chain);
768 if (!rth)
769 continue;
770
771 spin_lock_bh(rt_hash_lock_addr(i));
772
773 list = NULL;
774 pprev = &rt_hash_table[i].chain;
775 rth = rcu_dereference_protected(*pprev,
776 lockdep_is_held(rt_hash_lock_addr(i)));
777
778 while (rth) {
779 next = rcu_dereference_protected(rth->dst.rt_next,
780 lockdep_is_held(rt_hash_lock_addr(i)));
781
782 if (!net ||
783 net_eq(dev_net(rth->dst.dev), net)) {
784 rcu_assign_pointer(*pprev, next);
785 rcu_assign_pointer(rth->dst.rt_next, list);
786 list = rth;
787 } else {
788 pprev = &rth->dst.rt_next;
789 }
790 rth = next;
791 }
792
793 spin_unlock_bh(rt_hash_lock_addr(i));
794
795 for (; list; list = next) {
796 next = rcu_dereference_protected(list->dst.rt_next, 1);
797 rt_free(list);
798 }
799 }
800}
801
802/*
803 * While freeing expired entries, we compute average chain length
804 * and standard deviation, using fixed-point arithmetic.
805 * This gives an estimate of rt_chain_length_max:
806 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
807 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
808 */
809
810#define FRACT_BITS 3
811#define ONE (1UL << FRACT_BITS)
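/*
 * Worked example: with FRACT_BITS = 3 an average chain length of 2.5 is
 * carried as avg = 20 and a standard deviation of 0.5 as sd = 4, so
 * rt_check_expire() ends up with
 *
 *	max_t(unsigned long, ip_rt_gc_elasticity, (20 + 4*4) >> FRACT_BITS)
 *	  = max(8, 4) = 8
 *
 * given the default elasticity of 8.
 */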
812
813/*
814 * Given a hash chain and an item in this hash chain,
815 * find if a previous entry has the same hash_inputs
816 * (but differs on tos, mark or oif)
817 * Returns 0 if an alias is found.
818 * Returns ONE if rth has no alias before itself.
819 */
820static int has_noalias(const struct rtable *head, const struct rtable *rth)
821{
822 const struct rtable *aux = head;
823
824 while (aux != rth) {
825 if (compare_hash_inputs(&aux->fl, &rth->fl))
826 return 0;
827 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
828 }
829 return ONE;
830}
831
832static void rt_check_expire(void)
833{
834 static unsigned int rover;
835 unsigned int i = rover, goal;
836 struct rtable *rth;
837 struct rtable __rcu **rthp;
838 unsigned long samples = 0;
839 unsigned long sum = 0, sum2 = 0;
840 unsigned long delta;
841 u64 mult;
842
843 delta = jiffies - expires_ljiffies;
844 expires_ljiffies = jiffies;
845 mult = ((u64)delta) << rt_hash_log;
846 if (ip_rt_gc_timeout > 1)
847 do_div(mult, ip_rt_gc_timeout);
848 goal = (unsigned int)mult;
849 if (goal > rt_hash_mask)
850 goal = rt_hash_mask + 1;
851 for (; goal > 0; goal--) {
852 unsigned long tmo = ip_rt_gc_timeout;
853 unsigned long length;
854
855 i = (i + 1) & rt_hash_mask;
856 rthp = &rt_hash_table[i].chain;
857
858 if (need_resched())
859 cond_resched();
860
861 samples++;
862
863 if (rcu_dereference_raw(*rthp) == NULL)
864 continue;
865 length = 0;
866 spin_lock_bh(rt_hash_lock_addr(i));
867 while ((rth = rcu_dereference_protected(*rthp,
868 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
869 prefetch(rth->dst.rt_next);
870 if (rt_is_expired(rth)) {
871 *rthp = rth->dst.rt_next;
872 rt_free(rth);
873 continue;
874 }
875 if (rth->dst.expires) {
876 /* Entry is expired even if it is in use */
877 if (time_before_eq(jiffies, rth->dst.expires)) {
878nofree:
879 tmo >>= 1;
880 rthp = &rth->dst.rt_next;
881 /*
882 * We only count entries on
883 * a chain with equal hash inputs once
884 * so that entries for different QOS
885 * levels, and other non-hash input
886 * attributes don't unfairly skew
887 * the length computation
888 */
889 length += has_noalias(rt_hash_table[i].chain, rth);
890 continue;
891 }
892 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
893 goto nofree;
894
895 /* Cleanup aged off entries. */
896 *rthp = rth->dst.rt_next;
897 rt_free(rth);
898 }
899 spin_unlock_bh(rt_hash_lock_addr(i));
900 sum += length;
901 sum2 += length*length;
902 }
903 if (samples) {
904 unsigned long avg = sum / samples;
905 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
906 rt_chain_length_max = max_t(unsigned long,
907 ip_rt_gc_elasticity,
908 (avg + 4*sd) >> FRACT_BITS);
909 }
910 rover = i;
911}
912
913/*
914 * rt_worker_func() is run in process context.
915 * we call rt_check_expire() to scan part of the hash table
916 */
917static void rt_worker_func(struct work_struct *work)
918{
919 rt_check_expire();
920 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
921}
922
923/*
924 * Perturbation of rt_genid by a small quantity [1..256].
925 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
926 * many times (2^24) without reusing a recent rt_genid.
927 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
928 */
929static void rt_cache_invalidate(struct net *net)
930{
931 unsigned char shuffle;
932
933 get_random_bytes(&shuffle, sizeof(shuffle));
934 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
935}
936
937/*
938 * delay < 0 : invalidate cache (fast : entries will be deleted later)
939 * delay >= 0 : invalidate & flush cache (can be long)
940 */
941void rt_cache_flush(struct net *net, int delay)
942{
943 rt_cache_invalidate(net);
944 if (delay >= 0)
945 rt_do_flush(net, !in_softirq());
946}
947
948/* Flush previous cache invalidated entries from the cache */
949void rt_cache_flush_batch(struct net *net)
950{
951 rt_do_flush(net, !in_softirq());
952}
953
954static void rt_emergency_hash_rebuild(struct net *net)
955{
956 if (net_ratelimit())
957 printk(KERN_WARNING "Route hash chain too long!\n");
958 rt_cache_invalidate(net);
959}
960
961/*
962 Short description of GC goals.
963
964 We want to build an algorithm which keeps the routing cache
965 at some equilibrium point, where the number of aged-off entries
966 stays approximately equal to the number of newly generated ones.
967
968 The current expiration strength is the variable "expire".
969 We try to adjust it dynamically, so that when the network is
970 idle "expire" is large enough to keep plenty of warm entries,
971 and when load increases it shrinks to limit the cache size.
972 */
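/*
 * Illustrative numbers: with the default ip_rt_gc_elasticity of 8 and a
 * hash of 2^rt_hash_log buckets, the aggressive "dangerous area" branch
 * below is taken only once the cache holds more than
 * 8 << rt_hash_log entries, i.e. an average chain length above 8.
 */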
973
974static int rt_garbage_collect(struct dst_ops *ops)
975{
976 static unsigned long expire = RT_GC_TIMEOUT;
977 static unsigned long last_gc;
978 static int rover;
979 static int equilibrium;
980 struct rtable *rth;
981 struct rtable __rcu **rthp;
982 unsigned long now = jiffies;
983 int goal;
984 int entries = dst_entries_get_fast(&ipv4_dst_ops);
985
986 /*
987 * Garbage collection is pretty expensive,
988 * do not make it too frequently.
989 */
990
991 RT_CACHE_STAT_INC(gc_total);
992
993 if (now - last_gc < ip_rt_gc_min_interval &&
994 entries < ip_rt_max_size) {
995 RT_CACHE_STAT_INC(gc_ignored);
996 goto out;
997 }
998
999 entries = dst_entries_get_slow(&ipv4_dst_ops);
1000 /* Calculate number of entries, which we want to expire now. */
1001 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1002 if (goal <= 0) {
1003 if (equilibrium < ipv4_dst_ops.gc_thresh)
1004 equilibrium = ipv4_dst_ops.gc_thresh;
1005 goal = entries - equilibrium;
1006 if (goal > 0) {
1007 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1008 goal = entries - equilibrium;
1009 }
1010 } else {
1011 /* We are in a dangerous area. Try to reduce the cache really
1012 * aggressively.
1013 */
1014 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1015 equilibrium = entries - goal;
1016 }
1017
1018 if (now - last_gc >= ip_rt_gc_min_interval)
1019 last_gc = now;
1020
1021 if (goal <= 0) {
1022 equilibrium += goal;
1023 goto work_done;
1024 }
1025
1026 do {
1027 int i, k;
1028
1029 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1030 unsigned long tmo = expire;
1031
1032 k = (k + 1) & rt_hash_mask;
1033 rthp = &rt_hash_table[k].chain;
1034 spin_lock_bh(rt_hash_lock_addr(k));
1035 while ((rth = rcu_dereference_protected(*rthp,
1036 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1037 if (!rt_is_expired(rth) &&
1038 !rt_may_expire(rth, tmo, expire)) {
1039 tmo >>= 1;
1040 rthp = &rth->dst.rt_next;
1041 continue;
1042 }
1043 *rthp = rth->dst.rt_next;
1044 rt_free(rth);
1045 goal--;
1046 }
1047 spin_unlock_bh(rt_hash_lock_addr(k));
1048 if (goal <= 0)
1049 break;
1050 }
1051 rover = k;
1052
1053 if (goal <= 0)
1054 goto work_done;
1055
1056 /* Goal is not achieved. We stop the process if:
1057
1058 - expire has been reduced to zero (otherwise, expire is halved),
1059 - the table is not full,
1060 - we are called from interrupt context,
1061 - the jiffies check is just a fallback/debug loop breaker.
1062 We will not spin here for a long time in any case.
1063 */
1064
1065 RT_CACHE_STAT_INC(gc_goal_miss);
1066
1067 if (expire == 0)
1068 break;
1069
1070 expire >>= 1;
1071#if RT_CACHE_DEBUG >= 2
1072 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1073 dst_entries_get_fast(&ipv4_dst_ops), goal, i);
1074#endif
1075
1076 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1077 goto out;
1078 } while (!in_softirq() && time_before_eq(jiffies, now));
1079
1080 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1081 goto out;
1082 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1083 goto out;
1084 if (net_ratelimit())
1085 printk(KERN_WARNING "dst cache overflow\n");
1086 RT_CACHE_STAT_INC(gc_dst_overflow);
1087 return 1;
1088
1089work_done:
1090 expire += ip_rt_gc_min_interval;
1091 if (expire > ip_rt_gc_timeout ||
1092 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1093 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1094 expire = ip_rt_gc_timeout;
1095#if RT_CACHE_DEBUG >= 2
1096 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1097 dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
1098#endif
1099out: return 0;
1100}
1101
1102/*
1103 * Returns the number of entries in a hash chain that have different hash_inputs.
1104 */
1105static int slow_chain_length(const struct rtable *head)
1106{
1107 int length = 0;
1108 const struct rtable *rth = head;
1109
1110 while (rth) {
1111 length += has_noalias(head, rth);
1112 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1113 }
1114 return length >> FRACT_BITS;
1115}
1116
1117static int rt_intern_hash(unsigned hash, struct rtable *rt,
1118 struct rtable **rp, struct sk_buff *skb, int ifindex)
1119{
1120 struct rtable *rth, *cand;
1121 struct rtable __rcu **rthp, **candp;
1122 unsigned long now;
1123 u32 min_score;
1124 int chain_length;
1125 int attempts = !in_softirq();
1126
1127restart:
1128 chain_length = 0;
1129 min_score = ~(u32)0;
1130 cand = NULL;
1131 candp = NULL;
1132 now = jiffies;
1133
1134 if (!rt_caching(dev_net(rt->dst.dev))) {
1135 /*
1136 * If we're not caching, just tell the caller we
1137 * were successful and don't touch the route. The
1138 * caller holds the sole reference to the cache entry, and
1139 * it will be released when the caller is done with it.
1140 * If we drop it here, the callers have no way to resolve routes
1141 * when we're not caching. Instead, just point *rp at rt, so
1142 * the caller gets a single use out of the route.
1143 * Note that we do rt_free on this new route entry, so that
1144 * once its refcount hits zero, we are still able to reap it
1145 * (thanks, Alexey).
1146 * Note: to avoid expensive RCU machinery for this uncached dst,
1147 * we set DST_NOCACHE so that dst_release() can free the dst without
1148 * waiting for a grace period.
1149 */
1150
1151 rt->dst.flags |= DST_NOCACHE;
1152 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1153 int err = arp_bind_neighbour(&rt->dst);
1154 if (err) {
1155 if (net_ratelimit())
1156 printk(KERN_WARNING
1157 "Neighbour table failure & not caching routes.\n");
1158 ip_rt_put(rt);
1159 return err;
1160 }
1161 }
1162
1163 goto skip_hashing;
1164 }
1165
1166 rthp = &rt_hash_table[hash].chain;
1167
1168 spin_lock_bh(rt_hash_lock_addr(hash));
1169 while ((rth = rcu_dereference_protected(*rthp,
1170 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1171 if (rt_is_expired(rth)) {
1172 *rthp = rth->dst.rt_next;
1173 rt_free(rth);
1174 continue;
1175 }
1176 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1177 /* Put it first */
1178 *rthp = rth->dst.rt_next;
1179 /*
1180 * Since lookup is lockfree, the deletion
1181 * must be visible to another weakly ordered CPU before
1182 * the insertion at the start of the hash chain.
1183 */
1184 rcu_assign_pointer(rth->dst.rt_next,
1185 rt_hash_table[hash].chain);
1186 /*
1187 * Since lookup is lockfree, the update writes
1188 * must be ordered for consistency on SMP.
1189 */
1190 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1191
1192 dst_use(&rth->dst, now);
1193 spin_unlock_bh(rt_hash_lock_addr(hash));
1194
1195 rt_drop(rt);
1196 if (rp)
1197 *rp = rth;
1198 else
1199 skb_dst_set(skb, &rth->dst);
1200 return 0;
1201 }
1202
1203 if (!atomic_read(&rth->dst.__refcnt)) {
1204 u32 score = rt_score(rth);
1205
1206 if (score <= min_score) {
1207 cand = rth;
1208 candp = rthp;
1209 min_score = score;
1210 }
1211 }
1212
1213 chain_length++;
1214
1215 rthp = &rth->dst.rt_next;
1216 }
1217
1218 if (cand) {
1219 /* ip_rt_gc_elasticity used to be the average chain
1220 * length; when exceeded, gc becomes really aggressive.
1221 *
1222 * The second limit is less certain. At the moment it allows
1223 * only 2 entries per bucket. We will see.
1224 */
1225 if (chain_length > ip_rt_gc_elasticity) {
1226 *candp = cand->dst.rt_next;
1227 rt_free(cand);
1228 }
1229 } else {
1230 if (chain_length > rt_chain_length_max &&
1231 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1232 struct net *net = dev_net(rt->dst.dev);
1233 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1234 if (!rt_caching(net)) {
1235 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1236 rt->dst.dev->name, num);
1237 }
1238 rt_emergency_hash_rebuild(net);
1239 spin_unlock_bh(rt_hash_lock_addr(hash));
1240
1241 hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1242 ifindex, rt_genid(net));
1243 goto restart;
1244 }
1245 }
1246
1247 /* Try to bind the route to an ARP neighbour only if it is an
1248 output route or a unicast forwarding path.
1249 */
1250 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1251 int err = arp_bind_neighbour(&rt->dst);
1252 if (err) {
1253 spin_unlock_bh(rt_hash_lock_addr(hash));
1254
1255 if (err != -ENOBUFS) {
1256 rt_drop(rt);
1257 return err;
1258 }
1259
1260 /* Neighbour tables are full and nothing
1261 can be released. Try to shrink the route cache;
1262 it most likely holds some neighbour records.
1263 */
1264 if (attempts-- > 0) {
1265 int saved_elasticity = ip_rt_gc_elasticity;
1266 int saved_int = ip_rt_gc_min_interval;
1267 ip_rt_gc_elasticity = 1;
1268 ip_rt_gc_min_interval = 0;
1269 rt_garbage_collect(&ipv4_dst_ops);
1270 ip_rt_gc_min_interval = saved_int;
1271 ip_rt_gc_elasticity = saved_elasticity;
1272 goto restart;
1273 }
1274
1275 if (net_ratelimit())
1276 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1277 rt_drop(rt);
1278 return -ENOBUFS;
1279 }
1280 }
1281
1282 rt->dst.rt_next = rt_hash_table[hash].chain;
1283
1284#if RT_CACHE_DEBUG >= 2
1285 if (rt->dst.rt_next) {
1286 struct rtable *trt;
1287 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1288 hash, &rt->rt_dst);
1289 for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
1290 printk(" . %pI4", &trt->rt_dst);
1291 printk("\n");
1292 }
1293#endif
1294 /*
1295 * Since lookup is lockfree, we must make sure
1296 * previous writes to rt are committed to memory
1297 * before making rt visible to other CPUs.
1298 */
1299 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1300
1301 spin_unlock_bh(rt_hash_lock_addr(hash));
1302
1303skip_hashing:
1304 if (rp)
1305 *rp = rt;
1306 else
1307 skb_dst_set(skb, &rt->dst);
1308 return 0;
1309}
1310
1311static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1312
1313static u32 rt_peer_genid(void)
1314{
1315 return atomic_read(&__rt_peer_genid);
1316}
1317
1318void rt_bind_peer(struct rtable *rt, int create)
1319{
1320 struct inet_peer *peer;
1321
1322 peer = inet_getpeer_v4(rt->rt_dst, create);
1323
1324 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1325 inet_putpeer(peer);
1326 else
1327 rt->rt_peer_genid = rt_peer_genid();
1328}
1329
1330/*
1331 * Peer allocation may fail only in serious out-of-memory conditions. However,
1332 * we can still generate some output.
1333 * Random ID selection looks a bit dangerous because we have no chance of
1334 * selecting an ID that is unique within a reasonable period of time.
1335 * But a broken packet identifier may be better than no packet at all.
1336 */
1337static void ip_select_fb_ident(struct iphdr *iph)
1338{
1339 static DEFINE_SPINLOCK(ip_fb_id_lock);
1340 static u32 ip_fallback_id;
1341 u32 salt;
1342
1343 spin_lock_bh(&ip_fb_id_lock);
1344 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1345 iph->id = htons(salt & 0xFFFF);
1346 ip_fallback_id = salt;
1347 spin_unlock_bh(&ip_fb_id_lock);
1348}
1349
1350void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1351{
1352 struct rtable *rt = (struct rtable *) dst;
1353
1354 if (rt) {
1355 if (rt->peer == NULL)
1356 rt_bind_peer(rt, 1);
1357
1358 /* If a peer is attached to the destination, it is never detached,
1359 so we need not grab a lock to dereference it.
1360 */
1361 if (rt->peer) {
1362 iph->id = htons(inet_getid(rt->peer, more));
1363 return;
1364 }
1365 } else
1366 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1367 __builtin_return_address(0));
1368
1369 ip_select_fb_ident(iph);
1370}
1371EXPORT_SYMBOL(__ip_select_ident);
1372
1373static void rt_del(unsigned hash, struct rtable *rt)
1374{
1375 struct rtable __rcu **rthp;
1376 struct rtable *aux;
1377
1378 rthp = &rt_hash_table[hash].chain;
1379 spin_lock_bh(rt_hash_lock_addr(hash));
1380 ip_rt_put(rt);
1381 while ((aux = rcu_dereference_protected(*rthp,
1382 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1383 if (aux == rt || rt_is_expired(aux)) {
1384 *rthp = aux->dst.rt_next;
1385 rt_free(aux);
1386 continue;
1387 }
1388 rthp = &aux->dst.rt_next;
1389 }
1390 spin_unlock_bh(rt_hash_lock_addr(hash));
1391}
1392
1393/* called in rcu_read_lock() section */
1394void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1395 __be32 saddr, struct net_device *dev)
1396{
1397 int i, k;
1398 struct in_device *in_dev = __in_dev_get_rcu(dev);
1399 struct rtable *rth;
1400 struct rtable __rcu **rthp;
1401 __be32 skeys[2] = { saddr, 0 };
1402 int ikeys[2] = { dev->ifindex, 0 };
1403 struct netevent_redirect netevent;
1404 struct net *net;
1405
1406 if (!in_dev)
1407 return;
1408
1409 net = dev_net(dev);
1410 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1411 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1412 ipv4_is_zeronet(new_gw))
1413 goto reject_redirect;
1414
1415 if (!rt_caching(net))
1416 goto reject_redirect;
1417
1418 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1419 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1420 goto reject_redirect;
1421 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1422 goto reject_redirect;
1423 } else {
1424 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1425 goto reject_redirect;
1426 }
1427
1428 for (i = 0; i < 2; i++) {
1429 for (k = 0; k < 2; k++) {
1430 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1431 rt_genid(net));
1432
1433 rthp = &rt_hash_table[hash].chain;
1434
1435 while ((rth = rcu_dereference(*rthp)) != NULL) {
1436 struct rtable *rt;
1437
1438 if (rth->fl.fl4_dst != daddr ||
1439 rth->fl.fl4_src != skeys[i] ||
1440 rth->fl.oif != ikeys[k] ||
1441 rt_is_input_route(rth) ||
1442 rt_is_expired(rth) ||
1443 !net_eq(dev_net(rth->dst.dev), net)) {
1444 rthp = &rth->dst.rt_next;
1445 continue;
1446 }
1447
1448 if (rth->rt_dst != daddr ||
1449 rth->rt_src != saddr ||
1450 rth->dst.error ||
1451 rth->rt_gateway != old_gw ||
1452 rth->dst.dev != dev)
1453 break;
1454
1455 dst_hold(&rth->dst);
1456
1457 rt = dst_alloc(&ipv4_dst_ops);
1458 if (rt == NULL) {
1459 ip_rt_put(rth);
1460 return;
1461 }
1462
1463 /* Copy all the information. */
1464 *rt = *rth;
1465 rt->dst.__use = 1;
1466 atomic_set(&rt->dst.__refcnt, 1);
1467 rt->dst.child = NULL;
1468 if (rt->dst.dev)
1469 dev_hold(rt->dst.dev);
1470 rt->dst.obsolete = -1;
1471 rt->dst.lastuse = jiffies;
1472 rt->dst.path = &rt->dst;
1473 rt->dst.neighbour = NULL;
1474 rt->dst.hh = NULL;
1475#ifdef CONFIG_XFRM
1476 rt->dst.xfrm = NULL;
1477#endif
1478 rt->rt_genid = rt_genid(net);
1479 rt->rt_flags |= RTCF_REDIRECTED;
1480
1481 /* Gateway is different ... */
1482 rt->rt_gateway = new_gw;
1483
1484 /* Redirect received -> path was valid */
1485 dst_confirm(&rth->dst);
1486
1487 if (rt->peer)
1488 atomic_inc(&rt->peer->refcnt);
1489 if (rt->fi)
1490 atomic_inc(&rt->fi->fib_clntref);
1491
1492 if (arp_bind_neighbour(&rt->dst) ||
1493 !(rt->dst.neighbour->nud_state &
1494 NUD_VALID)) {
1495 if (rt->dst.neighbour)
1496 neigh_event_send(rt->dst.neighbour, NULL);
1497 ip_rt_put(rth);
1498 rt_drop(rt);
1499 goto do_next;
1500 }
1501
1502 netevent.old = &rth->dst;
1503 netevent.new = &rt->dst;
1504 call_netevent_notifiers(NETEVENT_REDIRECT,
1505 &netevent);
1506
1507 rt_del(hash, rth);
1508 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1509 ip_rt_put(rt);
1510 goto do_next;
1511 }
1512 do_next:
1513 ;
1514 }
1515 }
1516 return;
1517
1518reject_redirect:
1519#ifdef CONFIG_IP_ROUTE_VERBOSE
1520 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1521 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1522 " Advised path = %pI4 -> %pI4\n",
1523 &old_gw, dev->name, &new_gw,
1524 &saddr, &daddr);
1525#endif
1526 ;
1527}
1528
1529static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1530{
1531 struct rtable *rt = (struct rtable *)dst;
1532 struct dst_entry *ret = dst;
1533
1534 if (rt) {
1535 if (dst->obsolete > 0) {
1536 ip_rt_put(rt);
1537 ret = NULL;
1538 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1539 (rt->dst.expires &&
1540 time_after_eq(jiffies, rt->dst.expires))) {
1541 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1542 rt->fl.oif,
1543 rt_genid(dev_net(dst->dev)));
1544#if RT_CACHE_DEBUG >= 1
1545 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1546 &rt->rt_dst, rt->fl.fl4_tos);
1547#endif
1548 rt_del(hash, rt);
1549 ret = NULL;
1550 }
1551 }
1552 return ret;
1553}
1554
1555/*
1556 * Algorithm:
1557 * 1. The first ip_rt_redirect_number redirects are sent
1558 * with exponential backoff, then we stop sending them at all,
1559 * assuming that the host ignores our redirects.
1560 * 2. If we did not see packets requiring redirects
1561 * during ip_rt_redirect_silence, we assume that the host
1562 * forgot redirected route and start to send redirects again.
1563 *
1564 * This algorithm is much cheaper and more intelligent than dumb load limiting
1565 * in icmp.c.
1566 *
1567 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1568 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1569 */
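/*
 * Worked example with the defaults above (ip_rt_redirect_number = 9,
 * ip_rt_redirect_load = HZ/50, ip_rt_redirect_silence = (HZ/50) << 10):
 * after the k-th redirect the next one is sent only once
 * jiffies > rate_last + (ip_rt_redirect_load << k), i.e. roughly 40ms,
 * 80ms, ... up to ~5.1s later; once 9 redirects have been sent and
 * ignored, nothing more is sent until about 20 seconds of silence
 * resets rate_tokens.
 */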
1570
1571void ip_rt_send_redirect(struct sk_buff *skb)
1572{
1573 struct rtable *rt = skb_rtable(skb);
1574 struct in_device *in_dev;
1575 struct inet_peer *peer;
1576 int log_martians;
1577
1578 rcu_read_lock();
1579 in_dev = __in_dev_get_rcu(rt->dst.dev);
1580 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1581 rcu_read_unlock();
1582 return;
1583 }
1584 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1585 rcu_read_unlock();
1586
1587 if (!rt->peer)
1588 rt_bind_peer(rt, 1);
1589 peer = rt->peer;
1590 if (!peer) {
1591 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1592 return;
1593 }
1594
1595 /* No redirected packets during ip_rt_redirect_silence;
1596 * reset the algorithm.
1597 */
1598 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1599 peer->rate_tokens = 0;
1600
1601 /* Too many ignored redirects; do not send anything.
1602 * Set dst.rate_last to the last seen redirected packet.
1603 */
1604 if (peer->rate_tokens >= ip_rt_redirect_number) {
1605 peer->rate_last = jiffies;
1606 return;
1607 }
1608
1609 /* Check for load limit; set rate_last to the latest sent
1610 * redirect.
1611 */
1612 if (peer->rate_tokens == 0 ||
1613 time_after(jiffies,
1614 (peer->rate_last +
1615 (ip_rt_redirect_load << peer->rate_tokens)))) {
1616 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1617 peer->rate_last = jiffies;
1618 ++peer->rate_tokens;
1619#ifdef CONFIG_IP_ROUTE_VERBOSE
1620 if (log_martians &&
1621 peer->rate_tokens == ip_rt_redirect_number &&
1622 net_ratelimit())
1623 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1624 &rt->rt_src, rt->rt_iif,
1625 &rt->rt_dst, &rt->rt_gateway);
1626#endif
1627 }
1628}
1629
1630static int ip_error(struct sk_buff *skb)
1631{
1632 struct rtable *rt = skb_rtable(skb);
1633 struct inet_peer *peer;
1634 unsigned long now;
1635 bool send;
1636 int code;
1637
1638 switch (rt->dst.error) {
1639 case EINVAL:
1640 default:
1641 goto out;
1642 case EHOSTUNREACH:
1643 code = ICMP_HOST_UNREACH;
1644 break;
1645 case ENETUNREACH:
1646 code = ICMP_NET_UNREACH;
1647 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1648 IPSTATS_MIB_INNOROUTES);
1649 break;
1650 case EACCES:
1651 code = ICMP_PKT_FILTERED;
1652 break;
1653 }
1654
1655 if (!rt->peer)
1656 rt_bind_peer(rt, 1);
1657 peer = rt->peer;
1658
1659 send = true;
1660 if (peer) {
1661 now = jiffies;
1662 peer->rate_tokens += now - peer->rate_last;
1663 if (peer->rate_tokens > ip_rt_error_burst)
1664 peer->rate_tokens = ip_rt_error_burst;
1665 peer->rate_last = now;
1666 if (peer->rate_tokens >= ip_rt_error_cost)
1667 peer->rate_tokens -= ip_rt_error_cost;
1668 else
1669 send = false;
1670 }
1671 if (send)
1672 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1673
1674out: kfree_skb(skb);
1675 return 0;
1676}
1677
1678/*
1679 * The last two values are not from the RFC but
1680 * are needed for AMPRnet AX.25 paths.
1681 */
1682
1683static const unsigned short mtu_plateau[] =
1684{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1685
1686static inline unsigned short guess_mtu(unsigned short old_mtu)
1687{
1688 int i;
1689
1690 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1691 if (old_mtu > mtu_plateau[i])
1692 return mtu_plateau[i];
1693 return 68;
1694}
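/*
 * Example: guess_mtu(1500) returns the next lower plateau, 1492, and
 * guess_mtu(576) returns 296; anything at or below 128 falls through to
 * the 68-byte IPv4 minimum.
 */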
1695
1696unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1697 unsigned short new_mtu,
1698 struct net_device *dev)
1699{
1700 int i, k;
1701 unsigned short old_mtu = ntohs(iph->tot_len);
1702 struct rtable *rth;
1703 int ikeys[2] = { dev->ifindex, 0 };
1704 __be32 skeys[2] = { iph->saddr, 0, };
1705 __be32 daddr = iph->daddr;
1706 unsigned short est_mtu = 0;
1707
1708 for (k = 0; k < 2; k++) {
1709 for (i = 0; i < 2; i++) {
1710 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1711 rt_genid(net));
1712
1713 rcu_read_lock();
1714 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1715 rth = rcu_dereference(rth->dst.rt_next)) {
1716 unsigned short mtu = new_mtu;
1717
1718 if (rth->fl.fl4_dst != daddr ||
1719 rth->fl.fl4_src != skeys[i] ||
1720 rth->rt_dst != daddr ||
1721 rth->rt_src != iph->saddr ||
1722 rth->fl.oif != ikeys[k] ||
1723 rt_is_input_route(rth) ||
1724 dst_metric_locked(&rth->dst, RTAX_MTU) ||
1725 !net_eq(dev_net(rth->dst.dev), net) ||
1726 rt_is_expired(rth))
1727 continue;
1728
1729 if (new_mtu < 68 || new_mtu >= old_mtu) {
1730
1731 /* BSD 4.2 compatibility hack :-( */
1732 if (mtu == 0 &&
1733 old_mtu >= dst_mtu(&rth->dst) &&
1734 old_mtu >= 68 + (iph->ihl << 2))
1735 old_mtu -= iph->ihl << 2;
1736
1737 mtu = guess_mtu(old_mtu);
1738 }
1739 if (mtu <= dst_mtu(&rth->dst)) {
1740 if (mtu < dst_mtu(&rth->dst)) {
1741 dst_confirm(&rth->dst);
1742 if (mtu < ip_rt_min_pmtu) {
1743 u32 lock = dst_metric(&rth->dst,
1744 RTAX_LOCK);
1745 mtu = ip_rt_min_pmtu;
1746 lock |= (1 << RTAX_MTU);
1747 dst_metric_set(&rth->dst, RTAX_LOCK,
1748 lock);
1749 }
1750 dst_metric_set(&rth->dst, RTAX_MTU, mtu);
1751 dst_set_expires(&rth->dst,
1752 ip_rt_mtu_expires);
1753 }
1754 est_mtu = mtu;
1755 }
1756 }
1757 rcu_read_unlock();
1758 }
1759 }
1760 return est_mtu ? : new_mtu;
1761}
1762
1763static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1764{
1765 if (dst_mtu(dst) > mtu && mtu >= 68 &&
1766 !(dst_metric_locked(dst, RTAX_MTU))) {
1767 if (mtu < ip_rt_min_pmtu) {
1768 u32 lock = dst_metric(dst, RTAX_LOCK);
1769 mtu = ip_rt_min_pmtu;
1770 dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
1771 }
1772 dst_metric_set(dst, RTAX_MTU, mtu);
1773 dst_set_expires(dst, ip_rt_mtu_expires);
1774 }
1775}
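/*
 * Example: a PMTU update to 1400 on a dst whose current MTU is 1500
 * stores 1400 and arms the ip_rt_mtu_expires (~10 minutes by default)
 * timer; a value below ip_rt_min_pmtu (552 by default) is clamped to
 * 552 and the MTU metric is locked.
 */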
1776
1777static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1778{
1779 struct rtable *rt = (struct rtable *) dst;
1780
1781 if (rt_is_expired(rt))
1782 return NULL;
1783 if (rt->rt_peer_genid != rt_peer_genid()) {
1784 if (!rt->peer)
1785 rt_bind_peer(rt, 0);
1786
1787 rt->rt_peer_genid = rt_peer_genid();
1788 }
1789 return dst;
1790}
1791
1792static void ipv4_dst_destroy(struct dst_entry *dst)
1793{
1794 struct rtable *rt = (struct rtable *) dst;
1795 struct inet_peer *peer = rt->peer;
1796
1797 if (rt->fi) {
1798 fib_info_put(rt->fi);
1799 rt->fi = NULL;
1800 }
1801 if (peer) {
1802 rt->peer = NULL;
1803 inet_putpeer(peer);
1804 }
1805}
1806
1807
1808static void ipv4_link_failure(struct sk_buff *skb)
1809{
1810 struct rtable *rt;
1811
1812 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1813
1814 rt = skb_rtable(skb);
1815 if (rt)
1816 dst_set_expires(&rt->dst, 0);
1817}
1818
1819static int ip_rt_bug(struct sk_buff *skb)
1820{
1821 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1822 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1823 skb->dev ? skb->dev->name : "?");
1824 kfree_skb(skb);
1825 return 0;
1826}
1827
1828/*
1829 We do not cache the source address of the outgoing interface,
1830 because it is used only by the IP RR, TS and SRR options,
1831 so it is out of the fast path.
1832
1833 BTW remember: "addr" is allowed to be unaligned
1834 in IP options!
1835 */
1836
1837void ip_rt_get_source(u8 *addr, struct rtable *rt)
1838{
1839 __be32 src;
1840 struct fib_result res;
1841
1842 if (rt_is_output_route(rt))
1843 src = rt->rt_src;
1844 else {
1845 rcu_read_lock();
1846 if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
1847 src = FIB_RES_PREFSRC(res);
1848 else
1849 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1850 RT_SCOPE_UNIVERSE);
1851 rcu_read_unlock();
1852 }
1853 memcpy(addr, &src, 4);
1854}
1855
1856#ifdef CONFIG_IP_ROUTE_CLASSID
1857static void set_class_tag(struct rtable *rt, u32 tag)
1858{
1859 if (!(rt->dst.tclassid & 0xFFFF))
1860 rt->dst.tclassid |= tag & 0xFFFF;
1861 if (!(rt->dst.tclassid & 0xFFFF0000))
1862 rt->dst.tclassid |= tag & 0xFFFF0000;
1863}
1864#endif
1865
1866static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1867{
1868 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1869
1870 if (advmss == 0) {
1871 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1872 ip_rt_min_advmss);
1873 if (advmss > 65535 - 40)
1874 advmss = 65535 - 40;
1875 }
1876 return advmss;
1877}
1878
1879static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1880{
1881 unsigned int mtu = dst->dev->mtu;
1882
1883 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1884 const struct rtable *rt = (const struct rtable *) dst;
1885
1886 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1887 mtu = 576;
1888 }
1889
1890 if (mtu > IP_MAX_MTU)
1891 mtu = IP_MAX_MTU;
1892
1893 return mtu;
1894}
1895
1896static void rt_init_metrics(struct rtable *rt, struct fib_info *fi)
1897{
1898 struct inet_peer *peer;
1899 int create = 0;
1900
1901 /* If a peer entry exists for this destination, we must hook
1902 * it up in order to get at cached metrics.
1903 */
1904 if (rt->fl.flags & FLOWI_FLAG_PRECOW_METRICS)
1905 create = 1;
1906
1907 rt_bind_peer(rt, create);
1908 peer = rt->peer;
1909 if (peer) {
1910 if (inet_metrics_new(peer))
1911 memcpy(peer->metrics, fi->fib_metrics,
1912 sizeof(u32) * RTAX_MAX);
1913 dst_init_metrics(&rt->dst, peer->metrics, false);
1914 } else {
1915 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1916 rt->fi = fi;
1917 atomic_inc(&fi->fib_clntref);
1918 }
1919 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1920 }
1921}
1922
1923static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1924{
1925 struct dst_entry *dst = &rt->dst;
1926 struct fib_info *fi = res->fi;
1927
1928 if (fi) {
1929 if (FIB_RES_GW(*res) &&
1930 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1931 rt->rt_gateway = FIB_RES_GW(*res);
1932 rt_init_metrics(rt, fi);
1933#ifdef CONFIG_IP_ROUTE_CLASSID
1934 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1935#endif
1936 }
1937
1938 if (dst_mtu(dst) > IP_MAX_MTU)
1939 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1940 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1941 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1942
1943#ifdef CONFIG_IP_ROUTE_CLASSID
1944#ifdef CONFIG_IP_MULTIPLE_TABLES
1945 set_class_tag(rt, fib_rules_tclass(res));
1946#endif
1947 set_class_tag(rt, itag);
1948#endif
1949 rt->rt_type = res->type;
1950}
1951
1952/* called in rcu_read_lock() section */
1953static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1954 u8 tos, struct net_device *dev, int our)
1955{
1956 unsigned int hash;
1957 struct rtable *rth;
1958 __be32 spec_dst;
1959 struct in_device *in_dev = __in_dev_get_rcu(dev);
1960 u32 itag = 0;
1961 int err;
1962
1963 /* Primary sanity checks. */
1964
1965 if (in_dev == NULL)
1966 return -EINVAL;
1967
1968 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1969 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1970 goto e_inval;
1971
1972 if (ipv4_is_zeronet(saddr)) {
1973 if (!ipv4_is_local_multicast(daddr))
1974 goto e_inval;
1975 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1976 } else {
1977 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1978 &itag, 0);
1979 if (err < 0)
1980 goto e_err;
1981 }
1982 rth = dst_alloc(&ipv4_dst_ops);
1983 if (!rth)
1984 goto e_nobufs;
1985
1986 rth->dst.output = ip_rt_bug;
1987 rth->dst.obsolete = -1;
1988
1989 atomic_set(&rth->dst.__refcnt, 1);
1990 rth->dst.flags= DST_HOST;
1991 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1992 rth->dst.flags |= DST_NOPOLICY;
1993 rth->fl.fl4_dst = daddr;
1994 rth->rt_dst = daddr;
1995 rth->fl.fl4_tos = tos;
1996 rth->fl.mark = skb->mark;
1997 rth->fl.fl4_src = saddr;
1998 rth->rt_src = saddr;
1999#ifdef CONFIG_IP_ROUTE_CLASSID
2000 rth->dst.tclassid = itag;
2001#endif
2002 rth->rt_iif =
2003 rth->fl.iif = dev->ifindex;
2004 rth->dst.dev = init_net.loopback_dev;
2005 dev_hold(rth->dst.dev);
2006 rth->fl.oif = 0;
2007 rth->rt_gateway = daddr;
2008 rth->rt_spec_dst= spec_dst;
2009 rth->rt_genid = rt_genid(dev_net(dev));
2010 rth->rt_flags = RTCF_MULTICAST;
2011 rth->rt_type = RTN_MULTICAST;
2012 if (our) {
2013 rth->dst.input= ip_local_deliver;
2014 rth->rt_flags |= RTCF_LOCAL;
2015 }
2016
2017#ifdef CONFIG_IP_MROUTE
2018 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2019 rth->dst.input = ip_mr_input;
2020#endif
2021 RT_CACHE_STAT_INC(in_slow_mc);
2022
2023 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2024 return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
2025
2026e_nobufs:
2027 return -ENOBUFS;
2028e_inval:
2029 return -EINVAL;
2030e_err:
2031 return err;
2032}
2033
2034
2035static void ip_handle_martian_source(struct net_device *dev,
2036 struct in_device *in_dev,
2037 struct sk_buff *skb,
2038 __be32 daddr,
2039 __be32 saddr)
2040{
2041 RT_CACHE_STAT_INC(in_martian_src);
2042#ifdef CONFIG_IP_ROUTE_VERBOSE
2043 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2044 /*
2045 * RFC1812 recommendation: if the source is martian,
2046 * the only hint is the MAC header.
2047 */
2048 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2049 &daddr, &saddr, dev->name);
2050 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2051 int i;
2052 const unsigned char *p = skb_mac_header(skb);
2053 printk(KERN_WARNING "ll header: ");
2054 for (i = 0; i < dev->hard_header_len; i++, p++) {
2055 printk("%02x", *p);
2056 if (i < (dev->hard_header_len - 1))
2057 printk(":");
2058 }
2059 printk("\n");
2060 }
2061 }
2062#endif
2063}
2064
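/*
 * Build a forwarding cache entry for the input path: validate the
 * source against the FIB, decide whether an ICMP redirect should be
 * sent (RTCF_DOREDIRECT), reject proxy-arp'd non-IP traffic where
 * appropriate, and wire the entry to ip_forward()/ip_output().
 */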
2065/* called in rcu_read_lock() section */
2066static int __mkroute_input(struct sk_buff *skb,
2067 struct fib_result *res,
2068 struct in_device *in_dev,
2069 __be32 daddr, __be32 saddr, u32 tos,
2070 struct rtable **result)
2071{
2072 struct rtable *rth;
2073 int err;
2074 struct in_device *out_dev;
2075 unsigned int flags = 0;
2076 __be32 spec_dst;
2077 u32 itag;
2078
2079 /* get a working reference to the output device */
2080 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2081 if (out_dev == NULL) {
2082 if (net_ratelimit())
2083 printk(KERN_CRIT "Bug in ip_route_input_slow(). Please report.\n");
2085 return -EINVAL;
2086 }
2087
2088
2089 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
2090 in_dev->dev, &spec_dst, &itag, skb->mark);
2091 if (err < 0) {
2092 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2093 saddr);
2094
2095 goto cleanup;
2096 }
2097
2098 if (err)
2099 flags |= RTCF_DIRECTSRC;
2100
2101 if (out_dev == in_dev && err &&
2102 (IN_DEV_SHARED_MEDIA(out_dev) ||
2103 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2104 flags |= RTCF_DOREDIRECT;
2105
2106 if (skb->protocol != htons(ETH_P_IP)) {
2107 /* Not IP (i.e. ARP). Do not create a route if it is
2108 * invalid for proxy arp. DNAT routes are always valid.
2109 *
2110 * The proxy arp feature has been extended to allow ARP
2111 * replies back on the same interface, to support
2112 * Private VLAN switch technologies. See arp.c.
2113 */
2114 if (out_dev == in_dev &&
2115 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2116 err = -EINVAL;
2117 goto cleanup;
2118 }
2119 }
2120
2121
2122 rth = dst_alloc(&ipv4_dst_ops);
2123 if (!rth) {
2124 err = -ENOBUFS;
2125 goto cleanup;
2126 }
2127
2128 atomic_set(&rth->dst.__refcnt, 1);
2129 rth->dst.flags= DST_HOST;
2130 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2131 rth->dst.flags |= DST_NOPOLICY;
2132 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2133 rth->dst.flags |= DST_NOXFRM;
2134 rth->fl.fl4_dst = daddr;
2135 rth->rt_dst = daddr;
2136 rth->fl.fl4_tos = tos;
2137 rth->fl.mark = skb->mark;
2138 rth->fl.fl4_src = saddr;
2139 rth->rt_src = saddr;
2140 rth->rt_gateway = daddr;
2141 rth->rt_iif =
2142 rth->fl.iif = in_dev->dev->ifindex;
2143 rth->dst.dev = (out_dev)->dev;
2144 dev_hold(rth->dst.dev);
2145 rth->fl.oif = 0;
2146 rth->rt_spec_dst= spec_dst;
2147
2148 rth->dst.obsolete = -1;
2149 rth->dst.input = ip_forward;
2150 rth->dst.output = ip_output;
2151 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2152
2153 rt_set_nexthop(rth, res, itag);
2154
2155 rth->rt_flags = flags;
2156
2157 *result = rth;
2158 err = 0;
2159 cleanup:
2160 return err;
2161}
2162
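/*
 * Wrapper around __mkroute_input(): pick a nexthop first when the FIB
 * result is multipath (CONFIG_IP_ROUTE_MULTIPATH), then create the
 * cache entry and insert it into the hash via rt_intern_hash().
 */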
2163static int ip_mkroute_input(struct sk_buff *skb,
2164 struct fib_result *res,
2165 const struct flowi *fl,
2166 struct in_device *in_dev,
2167 __be32 daddr, __be32 saddr, u32 tos)
2168{
2169 struct rtable* rth = NULL;
2170 int err;
2171 unsigned hash;
2172
2173#ifdef CONFIG_IP_ROUTE_MULTIPATH
2174 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2175 fib_select_multipath(fl, res);
2176#endif
2177
2178 /* create a routing cache entry */
2179 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2180 if (err)
2181 return err;
2182
2183 /* put it into the cache */
2184 hash = rt_hash(daddr, saddr, fl->iif,
2185 rt_genid(dev_net(rth->dst.dev)));
2186 return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2187}
2188
2189/*
2190 * NOTE. We drop all packets that have local source
2191 * addresses, because every properly looped-back packet
2192 * must already have the correct destination attached by the output routine.
2193 *
2194 * This approach solves two big problems:
2195 * 1. Non-simplex devices are handled properly.
2196 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2197 * Called with rcu_read_lock().
2198 */
2199
2200static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2201 u8 tos, struct net_device *dev)
2202{
2203 struct fib_result res;
2204 struct in_device *in_dev = __in_dev_get_rcu(dev);
2205 struct flowi fl = { .fl4_dst = daddr,
2206 .fl4_src = saddr,
2207 .fl4_tos = tos,
2208 .fl4_scope = RT_SCOPE_UNIVERSE,
2209 .mark = skb->mark,
2210 .iif = dev->ifindex };
2211 unsigned flags = 0;
2212 u32 itag = 0;
2213 struct rtable * rth;
2214 unsigned hash;
2215 __be32 spec_dst;
2216 int err = -EINVAL;
2217 struct net * net = dev_net(dev);
2218
2219 /* IP on this device is disabled. */
2220
2221 if (!in_dev)
2222 goto out;
2223
2224 /* Check for the weirdest martians, which cannot be detected
2225 by fib_lookup.
2226 */
2227
2228 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2229 ipv4_is_loopback(saddr))
2230 goto martian_source;
2231
2232 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2233 goto brd_input;
2234
2235 /* Accept zero addresses only for limited broadcast;
2236 * it is not clear whether this should be fixed. Waiting for complaints :-)
2237 */
2238 if (ipv4_is_zeronet(saddr))
2239 goto martian_source;
2240
2241 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2242 goto martian_destination;
2243
2244 /*
2245 * Now we are ready to route the packet.
2246 */
2247 err = fib_lookup(net, &fl, &res);
2248 if (err != 0) {
2249 if (!IN_DEV_FORWARD(in_dev))
2250 goto e_hostunreach;
2251 goto no_route;
2252 }
2253
2254 RT_CACHE_STAT_INC(in_slow_tot);
2255
2256 if (res.type == RTN_BROADCAST)
2257 goto brd_input;
2258
2259 if (res.type == RTN_LOCAL) {
2260 err = fib_validate_source(saddr, daddr, tos,
2261 net->loopback_dev->ifindex,
2262 dev, &spec_dst, &itag, skb->mark);
2263 if (err < 0)
2264 goto martian_source_keep_err;
2265 if (err)
2266 flags |= RTCF_DIRECTSRC;
2267 spec_dst = daddr;
2268 goto local_input;
2269 }
2270
2271 if (!IN_DEV_FORWARD(in_dev))
2272 goto e_hostunreach;
2273 if (res.type != RTN_UNICAST)
2274 goto martian_destination;
2275
2276 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2277out: return err;
2278
2279brd_input:
2280 if (skb->protocol != htons(ETH_P_IP))
2281 goto e_inval;
2282
2283 if (ipv4_is_zeronet(saddr))
2284 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2285 else {
2286 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2287 &itag, skb->mark);
2288 if (err < 0)
2289 goto martian_source_keep_err;
2290 if (err)
2291 flags |= RTCF_DIRECTSRC;
2292 }
2293 flags |= RTCF_BROADCAST;
2294 res.type = RTN_BROADCAST;
2295 RT_CACHE_STAT_INC(in_brd);
2296
2297local_input:
2298 rth = dst_alloc(&ipv4_dst_ops);
2299 if (!rth)
2300 goto e_nobufs;
2301
2302 rth->dst.output= ip_rt_bug;
2303 rth->dst.obsolete = -1;
2304 rth->rt_genid = rt_genid(net);
2305
2306 atomic_set(&rth->dst.__refcnt, 1);
2307 rth->dst.flags= DST_HOST;
2308 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2309 rth->dst.flags |= DST_NOPOLICY;
2310 rth->fl.fl4_dst = daddr;
2311 rth->rt_dst = daddr;
2312 rth->fl.fl4_tos = tos;
2313 rth->fl.mark = skb->mark;
2314 rth->fl.fl4_src = saddr;
2315 rth->rt_src = saddr;
2316#ifdef CONFIG_IP_ROUTE_CLASSID
2317 rth->dst.tclassid = itag;
2318#endif
2319 rth->rt_iif =
2320 rth->fl.iif = dev->ifindex;
2321 rth->dst.dev = net->loopback_dev;
2322 dev_hold(rth->dst.dev);
2323 rth->rt_gateway = daddr;
2324 rth->rt_spec_dst= spec_dst;
2325 rth->dst.input= ip_local_deliver;
2326 rth->rt_flags = flags|RTCF_LOCAL;
2327 if (res.type == RTN_UNREACHABLE) {
2328 rth->dst.input= ip_error;
2329 rth->dst.error= -err;
2330 rth->rt_flags &= ~RTCF_LOCAL;
2331 }
2332 rth->rt_type = res.type;
2333 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2334 err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2335 goto out;
2336
2337no_route:
2338 RT_CACHE_STAT_INC(in_no_route);
2339 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2340 res.type = RTN_UNREACHABLE;
2341 if (err == -ESRCH)
2342 err = -ENETUNREACH;
2343 goto local_input;
2344
2345 /*
2346 * Do not cache martian addresses: they should be logged (RFC1812)
2347 */
2348martian_destination:
2349 RT_CACHE_STAT_INC(in_martian_dst);
2350#ifdef CONFIG_IP_ROUTE_VERBOSE
2351 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2352 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2353 &daddr, &saddr, dev->name);
2354#endif
2355
2356e_hostunreach:
2357 err = -EHOSTUNREACH;
2358 goto out;
2359
2360e_inval:
2361 err = -EINVAL;
2362 goto out;
2363
2364e_nobufs:
2365 err = -ENOBUFS;
2366 goto out;
2367
2368martian_source:
2369 err = -EINVAL;
2370martian_source_keep_err:
2371 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2372 goto out;
2373}
2374
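/*
 * Common entry point for input route resolution.  First try the route
 * cache, keyed on (daddr, saddr, iif, tos) and matched on mark and
 * namespace; on a hit the cached dst is attached to the skb, with or
 * without taking a reference depending on "noref".  Multicast
 * destinations bypass the cache and go through ip_route_input_mc();
 * everything else falls back to ip_route_input_slow().
 */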
2375int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2376 u8 tos, struct net_device *dev, bool noref)
2377{
2378 struct rtable * rth;
2379 unsigned hash;
2380 int iif = dev->ifindex;
2381 struct net *net;
2382 int res;
2383
2384 net = dev_net(dev);
2385
2386 rcu_read_lock();
2387
2388 if (!rt_caching(net))
2389 goto skip_cache;
2390
2391 tos &= IPTOS_RT_MASK;
2392 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2393
2394 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2395 rth = rcu_dereference(rth->dst.rt_next)) {
2396 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2397 ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2398 (rth->fl.iif ^ iif) |
2399 rth->fl.oif |
2400 (rth->fl.fl4_tos ^ tos)) == 0 &&
2401 rth->fl.mark == skb->mark &&
2402 net_eq(dev_net(rth->dst.dev), net) &&
2403 !rt_is_expired(rth)) {
2404 if (noref) {
2405 dst_use_noref(&rth->dst, jiffies);
2406 skb_dst_set_noref(skb, &rth->dst);
2407 } else {
2408 dst_use(&rth->dst, jiffies);
2409 skb_dst_set(skb, &rth->dst);
2410 }
2411 RT_CACHE_STAT_INC(in_hit);
2412 rcu_read_unlock();
2413 return 0;
2414 }
2415 RT_CACHE_STAT_INC(in_hlist_search);
2416 }
2417
2418skip_cache:
2419 /* Multicast recognition logic is moved from the route cache to here.
2420 The problem was that too many Ethernet cards have broken/missing
2421 hardware multicast filters :-( As a result, a host on a multicast
2422 network acquires a lot of useless route cache entries, e.g. from
2423 SDR messages from all over the world. Now we try to get rid of them.
2424 Really, provided the software IP multicast filter is organized
2425 reasonably (at least, hashed), it does not result in a slowdown
2426 compared with route cache reject entries.
2427 Note that multicast routers are not affected, because a
2428 route cache entry is created eventually.
2429 */
2430 if (ipv4_is_multicast(daddr)) {
2431 struct in_device *in_dev = __in_dev_get_rcu(dev);
2432
2433 if (in_dev) {
2434 int our = ip_check_mc(in_dev, daddr, saddr,
2435 ip_hdr(skb)->protocol);
2436 if (our
2437#ifdef CONFIG_IP_MROUTE
2438 ||
2439 (!ipv4_is_local_multicast(daddr) &&
2440 IN_DEV_MFORWARD(in_dev))
2441#endif
2442 ) {
2443 int res = ip_route_input_mc(skb, daddr, saddr,
2444 tos, dev, our);
2445 rcu_read_unlock();
2446 return res;
2447 }
2448 }
2449 rcu_read_unlock();
2450 return -EINVAL;
2451 }
2452 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2453 rcu_read_unlock();
2454 return res;
2455}
2456EXPORT_SYMBOL(ip_route_input_common);
2457
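/*
 * Output-path counterpart of __mkroute_input(): classify the
 * destination (broadcast/multicast/unicast), allocate the cache entry,
 * copy the flow keys, and pick ip_output(), ip_mc_output() or
 * ip_local_deliver() as the output/input handlers before filling in
 * the nexthop via rt_set_nexthop().
 */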
2458/* called with rcu_read_lock() */
2459static int __mkroute_output(struct rtable **result,
2460 struct fib_result *res,
2461 const struct flowi *fl,
2462 const struct flowi *oldflp,
2463 struct net_device *dev_out,
2464 unsigned flags)
2465{
2466 struct rtable *rth;
2467 struct in_device *in_dev;
2468 u32 tos = RT_FL_TOS(oldflp);
2469
2470 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
2471 return -EINVAL;
2472
2473 if (ipv4_is_lbcast(fl->fl4_dst))
2474 res->type = RTN_BROADCAST;
2475 else if (ipv4_is_multicast(fl->fl4_dst))
2476 res->type = RTN_MULTICAST;
2477 else if (ipv4_is_zeronet(fl->fl4_dst))
2478 return -EINVAL;
2479
2480 if (dev_out->flags & IFF_LOOPBACK)
2481 flags |= RTCF_LOCAL;
2482
2483 in_dev = __in_dev_get_rcu(dev_out);
2484 if (!in_dev)
2485 return -EINVAL;
2486
2487 if (res->type == RTN_BROADCAST) {
2488 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2489 res->fi = NULL;
2490 } else if (res->type == RTN_MULTICAST) {
2491 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2492 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2493 oldflp->proto))
2494 flags &= ~RTCF_LOCAL;
2495 /* If a multicast route does not exist, use the
2496 * default one, but do not gateway in this case.
2497 * Yes, it is a hack.
2498 */
2499 if (res->fi && res->prefixlen < 4)
2500 res->fi = NULL;
2501 }
2502
2503
2504 rth = dst_alloc(&ipv4_dst_ops);
2505 if (!rth)
2506 return -ENOBUFS;
2507
2508 atomic_set(&rth->dst.__refcnt, 1);
2509 rth->dst.flags= DST_HOST;
2510 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2511 rth->dst.flags |= DST_NOXFRM;
2512 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2513 rth->dst.flags |= DST_NOPOLICY;
2514
2515 rth->fl.fl4_dst = oldflp->fl4_dst;
2516 rth->fl.fl4_tos = tos;
2517 rth->fl.fl4_src = oldflp->fl4_src;
2518 rth->fl.oif = oldflp->oif;
2519 rth->fl.mark = oldflp->mark;
2520 rth->rt_dst = fl->fl4_dst;
2521 rth->rt_src = fl->fl4_src;
2522 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2523 /* get references to the devices that are to be held by the routing
2524 cache entry */
2525 rth->dst.dev = dev_out;
2526 dev_hold(dev_out);
2527 rth->rt_gateway = fl->fl4_dst;
2528 rth->rt_spec_dst= fl->fl4_src;
2529
2530 rth->dst.output=ip_output;
2531 rth->dst.obsolete = -1;
2532 rth->rt_genid = rt_genid(dev_net(dev_out));
2533
2534 RT_CACHE_STAT_INC(out_slow_tot);
2535
2536 if (flags & RTCF_LOCAL) {
2537 rth->dst.input = ip_local_deliver;
2538 rth->rt_spec_dst = fl->fl4_dst;
2539 }
2540 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2541 rth->rt_spec_dst = fl->fl4_src;
2542 if (flags & RTCF_LOCAL &&
2543 !(dev_out->flags & IFF_LOOPBACK)) {
2544 rth->dst.output = ip_mc_output;
2545 RT_CACHE_STAT_INC(out_slow_mc);
2546 }
2547#ifdef CONFIG_IP_MROUTE
2548 if (res->type == RTN_MULTICAST) {
2549 if (IN_DEV_MFORWARD(in_dev) &&
2550 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2551 rth->dst.input = ip_mr_input;
2552 rth->dst.output = ip_mc_output;
2553 }
2554 }
2555#endif
2556 }
2557
2558 rt_set_nexthop(rth, res, 0);
2559
2560 rth->rt_flags = flags;
2561 *result = rth;
2562 return 0;
2563}
2564
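/*
 * Create an output cache entry with __mkroute_output() and, on
 * success, insert it into the hash bucket derived from the original
 * flow key and the current routing generation.
 */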
2565/* called with rcu_read_lock() */
2566static int ip_mkroute_output(struct rtable **rp,
2567 struct fib_result *res,
2568 const struct flowi *fl,
2569 const struct flowi *oldflp,
2570 struct net_device *dev_out,
2571 unsigned flags)
2572{
2573 struct rtable *rth = NULL;
2574 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2575 unsigned hash;
2576 if (err == 0) {
2577 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2578 rt_genid(dev_net(dev_out)));
2579 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2580 }
2581
2582 return err;
2583}
2584
2585/*
2586 * Major route resolver routine.
2587 * called with rcu_read_lock();
2588 */
2589
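/*
 * Resolution proceeds roughly as follows: sanity-check any requested
 * source address, honour a bound output interface (oif), fall back to
 * loopback when no destination is given, consult the FIB, handle local
 * and multipath/default-route cases, and finally build the cache entry
 * in ip_mkroute_output().
 */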
2590static int ip_route_output_slow(struct net *net, struct rtable **rp,
2591 const struct flowi *oldflp)
2592{
2593 u32 tos = RT_FL_TOS(oldflp);
2594 struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
2595 .fl4_src = oldflp->fl4_src,
2596 .fl4_tos = tos & IPTOS_RT_MASK,
2597 .fl4_scope = ((tos & RTO_ONLINK) ?
2598 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
2599 .mark = oldflp->mark,
2600 .iif = net->loopback_dev->ifindex,
2601 .oif = oldflp->oif };
2602 struct fib_result res;
2603 unsigned int flags = 0;
2604 struct net_device *dev_out = NULL;
2605 int err;
2606
2607
2608 res.fi = NULL;
2609#ifdef CONFIG_IP_MULTIPLE_TABLES
2610 res.r = NULL;
2611#endif
2612
2613 if (oldflp->fl4_src) {
2614 err = -EINVAL;
2615 if (ipv4_is_multicast(oldflp->fl4_src) ||
2616 ipv4_is_lbcast(oldflp->fl4_src) ||
2617 ipv4_is_zeronet(oldflp->fl4_src))
2618 goto out;
2619
2620 /* I removed the check for oif == dev_out->oif here.
2621 It was wrong for two reasons:
2622 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2623 is assigned to multiple interfaces.
2624 2. Moreover, we are allowed to send packets with the saddr
2625 of another iface. --ANK
2626 */
2627
2628 if (oldflp->oif == 0 &&
2629 (ipv4_is_multicast(oldflp->fl4_dst) ||
2630 ipv4_is_lbcast(oldflp->fl4_dst))) {
2631 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2632 dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
2633 if (dev_out == NULL)
2634 goto out;
2635
2636 /* Special hack: the user can direct multicasts
2637 and limited broadcast via the necessary interface
2638 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2639 This hack is not just for fun, it allows
2640 vic, vat and friends to work.
2641 They bind a socket to loopback, set the ttl to zero
2642 and expect that it will work.
2643 From the viewpoint of the routing cache they are broken,
2644 because we are not allowed to build a multicast path
2645 with a loopback source addr (the routing cache
2646 cannot know that the ttl is zero, so the packet
2647 will not leave this host and the route is valid).
2648 Luckily, this hack is a good workaround.
2649 */
2650
2651 fl.oif = dev_out->ifindex;
2652 goto make_route;
2653 }
2654
2655 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2656 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2657 if (!__ip_dev_find(net, oldflp->fl4_src, false))
2658 goto out;
2659 }
2660 }
2661
2662
2663 if (oldflp->oif) {
2664 dev_out = dev_get_by_index_rcu(net, oldflp->oif);
2665 err = -ENODEV;
2666 if (dev_out == NULL)
2667 goto out;
2668
2669 /* RACE: Check return value of inet_select_addr instead. */
2670 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2671 err = -ENETUNREACH;
2672 goto out;
2673 }
2674 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2675 ipv4_is_lbcast(oldflp->fl4_dst)) {
2676 if (!fl.fl4_src)
2677 fl.fl4_src = inet_select_addr(dev_out, 0,
2678 RT_SCOPE_LINK);
2679 goto make_route;
2680 }
2681 if (!fl.fl4_src) {
2682 if (ipv4_is_multicast(oldflp->fl4_dst))
2683 fl.fl4_src = inet_select_addr(dev_out, 0,
2684 fl.fl4_scope);
2685 else if (!oldflp->fl4_dst)
2686 fl.fl4_src = inet_select_addr(dev_out, 0,
2687 RT_SCOPE_HOST);
2688 }
2689 }
2690
2691 if (!fl.fl4_dst) {
2692 fl.fl4_dst = fl.fl4_src;
2693 if (!fl.fl4_dst)
2694 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2695 dev_out = net->loopback_dev;
2696 fl.oif = net->loopback_dev->ifindex;
2697 res.type = RTN_LOCAL;
2698 flags |= RTCF_LOCAL;
2699 goto make_route;
2700 }
2701
2702 if (fib_lookup(net, &fl, &res)) {
2703 res.fi = NULL;
2704 if (oldflp->oif) {
2705 /* Apparently, the routing tables are wrong. Assume
2706 that the destination is on-link.
2707
2708 WHY? DW.
2709 Because we are allowed to send to an iface
2710 even if it has NO routes and NO assigned
2711 addresses. When oif is specified, the routing
2712 tables are looked up with only one purpose:
2713 to catch whether the destination is gatewayed rather than
2714 direct. Moreover, if MSG_DONTROUTE is set,
2715 we send the packet, ignoring both the routing tables
2716 and the ifaddr state. --ANK
2717
2718 We could do this even if oif is unknown
2719 (as IPv6 likely does), but we do not.
2721 */
2722
2723 if (fl.fl4_src == 0)
2724 fl.fl4_src = inet_select_addr(dev_out, 0,
2725 RT_SCOPE_LINK);
2726 res.type = RTN_UNICAST;
2727 goto make_route;
2728 }
2729 err = -ENETUNREACH;
2730 goto out;
2731 }
2732
2733 if (res.type == RTN_LOCAL) {
2734 if (!fl.fl4_src) {
2735 if (res.fi->fib_prefsrc)
2736 fl.fl4_src = res.fi->fib_prefsrc;
2737 else
2738 fl.fl4_src = fl.fl4_dst;
2739 }
2740 dev_out = net->loopback_dev;
2741 fl.oif = dev_out->ifindex;
2742 res.fi = NULL;
2743 flags |= RTCF_LOCAL;
2744 goto make_route;
2745 }
2746
2747#ifdef CONFIG_IP_ROUTE_MULTIPATH
2748 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2749 fib_select_multipath(&fl, &res);
2750 else
2751#endif
2752 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2753 fib_select_default(&res);
2754
2755 if (!fl.fl4_src)
2756 fl.fl4_src = FIB_RES_PREFSRC(res);
2757
2758 dev_out = FIB_RES_DEV(res);
2759 fl.oif = dev_out->ifindex;
2760
2761
2762make_route:
2763 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2764
2765out: return err;
2766}
2767
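/*
 * Look up an output route: scan the cache bucket for an entry whose
 * destination, source, oif, mark and TOS (including RTO_ONLINK) match
 * the flow, taking a reference on a hit; otherwise resolve via
 * ip_route_output_slow().  Unlike ip_route_output_flow(), no xfrm
 * policy lookup is performed here.
 */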
2768int __ip_route_output_key(struct net *net, struct rtable **rp,
2769 const struct flowi *flp)
2770{
2771 unsigned int hash;
2772 int res;
2773 struct rtable *rth;
2774
2775 if (!rt_caching(net))
2776 goto slow_output;
2777
2778 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2779
2780 rcu_read_lock_bh();
2781 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2782 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2783 if (rth->fl.fl4_dst == flp->fl4_dst &&
2784 rth->fl.fl4_src == flp->fl4_src &&
2785 rt_is_output_route(rth) &&
2786 rth->fl.oif == flp->oif &&
2787 rth->fl.mark == flp->mark &&
2788 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2789 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2790 net_eq(dev_net(rth->dst.dev), net) &&
2791 !rt_is_expired(rth)) {
2792 dst_use(&rth->dst, jiffies);
2793 RT_CACHE_STAT_INC(out_hit);
2794 rcu_read_unlock_bh();
2795 *rp = rth;
2796 return 0;
2797 }
2798 RT_CACHE_STAT_INC(out_hlist_search);
2799 }
2800 rcu_read_unlock_bh();
2801
2802slow_output:
2803 rcu_read_lock();
2804 res = ip_route_output_slow(net, rp, flp);
2805 rcu_read_unlock();
2806 return res;
2807}
2808EXPORT_SYMBOL_GPL(__ip_route_output_key);
2809
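/*
 * Blackhole dst_ops: used by ipv4_dst_blackhole() below to replace a
 * route with a stub that discards everything (dst_discard) while
 * copying the metrics and identity of the original entry.  Reached
 * from ip_route_output_flow() when __xfrm_lookup() returns -EREMOTE.
 */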
2810static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2811{
2812 return NULL;
2813}
2814
2815static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2816{
2817 return 0;
2818}
2819
2820static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2821{
2822}
2823
2824static struct dst_ops ipv4_dst_blackhole_ops = {
2825 .family = AF_INET,
2826 .protocol = cpu_to_be16(ETH_P_IP),
2827 .destroy = ipv4_dst_destroy,
2828 .check = ipv4_blackhole_dst_check,
2829 .default_mtu = ipv4_blackhole_default_mtu,
2830 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2831};
2832
2833
2834static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2835{
2836 struct rtable *ort = *rp;
2837 struct rtable *rt = (struct rtable *)
2838 dst_alloc(&ipv4_dst_blackhole_ops);
2839
2840 if (rt) {
2841 struct dst_entry *new = &rt->dst;
2842
2843 atomic_set(&new->__refcnt, 1);
2844 new->__use = 1;
2845 new->input = dst_discard;
2846 new->output = dst_discard;
2847 dst_copy_metrics(new, &ort->dst);
2848
2849 new->dev = ort->dst.dev;
2850 if (new->dev)
2851 dev_hold(new->dev);
2852
2853 rt->fl = ort->fl;
2854
2855 rt->rt_genid = rt_genid(net);
2856 rt->rt_flags = ort->rt_flags;
2857 rt->rt_type = ort->rt_type;
2858 rt->rt_dst = ort->rt_dst;
2859 rt->rt_src = ort->rt_src;
2860 rt->rt_iif = ort->rt_iif;
2861 rt->rt_gateway = ort->rt_gateway;
2862 rt->rt_spec_dst = ort->rt_spec_dst;
2863 rt->peer = ort->peer;
2864 if (rt->peer)
2865 atomic_inc(&rt->peer->refcnt);
2866 rt->fi = ort->fi;
2867 if (rt->fi)
2868 atomic_inc(&rt->fi->fib_clntref);
2869
2870 dst_free(new);
2871 }
2872
2873 dst_release(&(*rp)->dst);
2874 *rp = rt;
2875 return rt ? 0 : -ENOMEM;
2876}
2877
2878int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2879 struct sock *sk, int flags)
2880{
2881 int err;
2882
2883 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2884 return err;
2885
2886 if (flp->proto) {
2887 if (!flp->fl4_src)
2888 flp->fl4_src = (*rp)->rt_src;
2889 if (!flp->fl4_dst)
2890 flp->fl4_dst = (*rp)->rt_dst;
2891 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2892 flags ? XFRM_LOOKUP_WAIT : 0);
2893 if (err == -EREMOTE)
2894 err = ipv4_dst_blackhole(net, rp, flp);
2895
2896 return err;
2897 }
2898
2899 return 0;
2900}
2901EXPORT_SYMBOL_GPL(ip_route_output_flow);
2902
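/*
 * Convenience wrapper around ip_route_output_flow() for callers that
 * do not need xfrm resolution against a socket.  A minimal sketch of a
 * caller, modelled on inet_rtm_getroute() below (the address is a
 * placeholder, not a value used anywhere in this file):
 *
 *	struct rtable *rt;
 *	struct flowi fl = {
 *		.fl4_dst = htonl(0x0a000001),	// 10.0.0.1, example only
 *		.fl4_tos = 0,
 *		.oif	 = 0,
 *		.mark	 = 0,
 *	};
 *	int err = ip_route_output_key(net, &rt, &fl);
 *	if (!err)
 *		ip_rt_put(rt);			// drop the reference when done
 *
 * ip_rt_put() is assumed to be the usual helper for releasing rtable
 * references; the sketch is illustrative only.
 */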
2903int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2904{
2905 return ip_route_output_flow(net, rp, flp, NULL, 0);
2906}
2907EXPORT_SYMBOL(ip_route_output_key);
2908
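/*
 * Serialize one cached route into an RTM_NEWROUTE netlink message:
 * rtmsg header, destination/source/oif/classid attributes, metrics,
 * peer-derived id/timestamp info and, for input routes, the incoming
 * interface or (for forwarded multicast) the ipmr-provided data.
 */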
2909static int rt_fill_info(struct net *net,
2910 struct sk_buff *skb, u32 pid, u32 seq, int event,
2911 int nowait, unsigned int flags)
2912{
2913 struct rtable *rt = skb_rtable(skb);
2914 struct rtmsg *r;
2915 struct nlmsghdr *nlh;
2916 long expires;
2917 u32 id = 0, ts = 0, tsage = 0, error;
2918
2919 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2920 if (nlh == NULL)
2921 return -EMSGSIZE;
2922
2923 r = nlmsg_data(nlh);
2924 r->rtm_family = AF_INET;
2925 r->rtm_dst_len = 32;
2926 r->rtm_src_len = 0;
2927 r->rtm_tos = rt->fl.fl4_tos;
2928 r->rtm_table = RT_TABLE_MAIN;
2929 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2930 r->rtm_type = rt->rt_type;
2931 r->rtm_scope = RT_SCOPE_UNIVERSE;
2932 r->rtm_protocol = RTPROT_UNSPEC;
2933 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2934 if (rt->rt_flags & RTCF_NOTIFY)
2935 r->rtm_flags |= RTM_F_NOTIFY;
2936
2937 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2938
2939 if (rt->fl.fl4_src) {
2940 r->rtm_src_len = 32;
2941 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2942 }
2943 if (rt->dst.dev)
2944 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2945#ifdef CONFIG_IP_ROUTE_CLASSID
2946 if (rt->dst.tclassid)
2947 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2948#endif
2949 if (rt_is_input_route(rt))
2950 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2951 else if (rt->rt_src != rt->fl.fl4_src)
2952 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2953
2954 if (rt->rt_dst != rt->rt_gateway)
2955 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2956
2957 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2958 goto nla_put_failure;
2959
2960 if (rt->fl.mark)
2961 NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
2962
2963 error = rt->dst.error;
2964 expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
2965 if (rt->peer) {
2966 inet_peer_refcheck(rt->peer);
2967 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2968 if (rt->peer->tcp_ts_stamp) {
2969 ts = rt->peer->tcp_ts;
2970 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2971 }
2972 }
2973
2974 if (rt_is_input_route(rt)) {
2975#ifdef CONFIG_IP_MROUTE
2976 __be32 dst = rt->rt_dst;
2977
2978 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2979 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2980 int err = ipmr_get_route(net, skb, r, nowait);
2981 if (err <= 0) {
2982 if (!nowait) {
2983 if (err == 0)
2984 return 0;
2985 goto nla_put_failure;
2986 } else {
2987 if (err == -EMSGSIZE)
2988 goto nla_put_failure;
2989 error = err;
2990 }
2991 }
2992 } else
2993#endif
2994 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2995 }
2996
2997 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2998 expires, error) < 0)
2999 goto nla_put_failure;
3000
3001 return nlmsg_end(skb, nlh);
3002
3003nla_put_failure:
3004 nlmsg_cancel(skb, nlh);
3005 return -EMSGSIZE;
3006}
3007
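/*
 * RTM_GETROUTE handler (what "ip route get" ends up calling): build a
 * dummy skb, resolve the route either through the input path (when an
 * incoming interface is given) or through ip_route_output_key(), and
 * answer with a unicast RTM_NEWROUTE message built by rt_fill_info().
 */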
3008static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
3009{
3010 struct net *net = sock_net(in_skb->sk);
3011 struct rtmsg *rtm;
3012 struct nlattr *tb[RTA_MAX+1];
3013 struct rtable *rt = NULL;
3014 __be32 dst = 0;
3015 __be32 src = 0;
3016 u32 iif;
3017 int err;
3018 int mark;
3019 struct sk_buff *skb;
3020
3021 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3022 if (err < 0)
3023 goto errout;
3024
3025 rtm = nlmsg_data(nlh);
3026
3027 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3028 if (skb == NULL) {
3029 err = -ENOBUFS;
3030 goto errout;
3031 }
3032
3033 /* Reserve room for dummy headers; this skb can pass
3034 through a good chunk of the routing engine.
3035 */
3036 skb_reset_mac_header(skb);
3037 skb_reset_network_header(skb);
3038
3039 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3040 ip_hdr(skb)->protocol = IPPROTO_ICMP;
3041 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3042
3043 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3044 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3045 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3046 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3047
3048 if (iif) {
3049 struct net_device *dev;
3050
3051 dev = __dev_get_by_index(net, iif);
3052 if (dev == NULL) {
3053 err = -ENODEV;
3054 goto errout_free;
3055 }
3056
3057 skb->protocol = htons(ETH_P_IP);
3058 skb->dev = dev;
3059 skb->mark = mark;
3060 local_bh_disable();
3061 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3062 local_bh_enable();
3063
3064 rt = skb_rtable(skb);
3065 if (err == 0 && rt->dst.error)
3066 err = -rt->dst.error;
3067 } else {
3068 struct flowi fl = {
3069 .fl4_dst = dst,
3070 .fl4_src = src,
3071 .fl4_tos = rtm->rtm_tos,
3072 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3073 .mark = mark,
3074 };
3075 err = ip_route_output_key(net, &rt, &fl);
3076 }
3077
3078 if (err)
3079 goto errout_free;
3080
3081 skb_dst_set(skb, &rt->dst);
3082 if (rtm->rtm_flags & RTM_F_NOTIFY)
3083 rt->rt_flags |= RTCF_NOTIFY;
3084
3085 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3086 RTM_NEWROUTE, 0, 0);
3087 if (err <= 0)
3088 goto errout_free;
3089
3090 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3091errout:
3092 return err;
3093
3094errout_free:
3095 kfree_skb(skb);
3096 goto errout;
3097}
3098
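/*
 * Netlink dump of the route cache: walk every hash bucket, skip
 * expired entries and entries from other namespaces, and emit one
 * RTM_NEWROUTE record per entry, using cb->args[] to resume across
 * multiple skbs.
 */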
3099int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3100{
3101 struct rtable *rt;
3102 int h, s_h;
3103 int idx, s_idx;
3104 struct net *net;
3105
3106 net = sock_net(skb->sk);
3107
3108 s_h = cb->args[0];
3109 if (s_h < 0)
3110 s_h = 0;
3111 s_idx = idx = cb->args[1];
3112 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3113 if (!rt_hash_table[h].chain)
3114 continue;
3115 rcu_read_lock_bh();
3116 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3117 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3118 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3119 continue;
3120 if (rt_is_expired(rt))
3121 continue;
3122 skb_dst_set_noref(skb, &rt->dst);
3123 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3124 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3125 1, NLM_F_MULTI) <= 0) {
3126 skb_dst_drop(skb);
3127 rcu_read_unlock_bh();
3128 goto done;
3129 }
3130 skb_dst_drop(skb);
3131 }
3132 rcu_read_unlock_bh();
3133 }
3134
3135done:
3136 cb->args[0] = h;
3137 cb->args[1] = idx;
3138 return skb->len;
3139}
3140
3141void ip_rt_multicast_event(struct in_device *in_dev)
3142{
3143 rt_cache_flush(dev_net(in_dev->dev), 0);
3144}
3145
3146#ifdef CONFIG_SYSCTL
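/*
 * Handler for the write-only "flush" sysctl registered below under
 * net.ipv4.route: the written value is parsed as an integer delay and
 * passed to rt_cache_flush() for this namespace.  From userspace this
 * is typically reached via /proc/sys/net/ipv4/route/flush (the path
 * follows from ipv4_route_path plus the table below), e.g.
 * "echo 0 > /proc/sys/net/ipv4/route/flush".
 */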
3147static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3148 void __user *buffer,
3149 size_t *lenp, loff_t *ppos)
3150{
3151 if (write) {
3152 int flush_delay;
3153 ctl_table ctl;
3154 struct net *net;
3155
3156 memcpy(&ctl, __ctl, sizeof(ctl));
3157 ctl.data = &flush_delay;
3158 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3159
3160 net = (struct net *)__ctl->extra1;
3161 rt_cache_flush(net, flush_delay);
3162 return 0;
3163 }
3164
3165 return -EINVAL;
3166}
3167
3168static ctl_table ipv4_route_table[] = {
3169 {
3170 .procname = "gc_thresh",
3171 .data = &ipv4_dst_ops.gc_thresh,
3172 .maxlen = sizeof(int),
3173 .mode = 0644,
3174 .proc_handler = proc_dointvec,
3175 },
3176 {
3177 .procname = "max_size",
3178 .data = &ip_rt_max_size,
3179 .maxlen = sizeof(int),
3180 .mode = 0644,
3181 .proc_handler = proc_dointvec,
3182 },
3183 {
3184 /* Deprecated. Use gc_min_interval_ms */
3185
3186 .procname = "gc_min_interval",
3187 .data = &ip_rt_gc_min_interval,
3188 .maxlen = sizeof(int),
3189 .mode = 0644,
3190 .proc_handler = proc_dointvec_jiffies,
3191 },
3192 {
3193 .procname = "gc_min_interval_ms",
3194 .data = &ip_rt_gc_min_interval,
3195 .maxlen = sizeof(int),
3196 .mode = 0644,
3197 .proc_handler = proc_dointvec_ms_jiffies,
3198 },
3199 {
3200 .procname = "gc_timeout",
3201 .data = &ip_rt_gc_timeout,
3202 .maxlen = sizeof(int),
3203 .mode = 0644,
3204 .proc_handler = proc_dointvec_jiffies,
3205 },
3206 {
3207 .procname = "gc_interval",
3208 .data = &ip_rt_gc_interval,
3209 .maxlen = sizeof(int),
3210 .mode = 0644,
3211 .proc_handler = proc_dointvec_jiffies,
3212 },
3213 {
3214 .procname = "redirect_load",
3215 .data = &ip_rt_redirect_load,
3216 .maxlen = sizeof(int),
3217 .mode = 0644,
3218 .proc_handler = proc_dointvec,
3219 },
3220 {
3221 .procname = "redirect_number",
3222 .data = &ip_rt_redirect_number,
3223 .maxlen = sizeof(int),
3224 .mode = 0644,
3225 .proc_handler = proc_dointvec,
3226 },
3227 {
3228 .procname = "redirect_silence",
3229 .data = &ip_rt_redirect_silence,
3230 .maxlen = sizeof(int),
3231 .mode = 0644,
3232 .proc_handler = proc_dointvec,
3233 },
3234 {
3235 .procname = "error_cost",
3236 .data = &ip_rt_error_cost,
3237 .maxlen = sizeof(int),
3238 .mode = 0644,
3239 .proc_handler = proc_dointvec,
3240 },
3241 {
3242 .procname = "error_burst",
3243 .data = &ip_rt_error_burst,
3244 .maxlen = sizeof(int),
3245 .mode = 0644,
3246 .proc_handler = proc_dointvec,
3247 },
3248 {
3249 .procname = "gc_elasticity",
3250 .data = &ip_rt_gc_elasticity,
3251 .maxlen = sizeof(int),
3252 .mode = 0644,
3253 .proc_handler = proc_dointvec,
3254 },
3255 {
3256 .procname = "mtu_expires",
3257 .data = &ip_rt_mtu_expires,
3258 .maxlen = sizeof(int),
3259 .mode = 0644,
3260 .proc_handler = proc_dointvec_jiffies,
3261 },
3262 {
3263 .procname = "min_pmtu",
3264 .data = &ip_rt_min_pmtu,
3265 .maxlen = sizeof(int),
3266 .mode = 0644,
3267 .proc_handler = proc_dointvec,
3268 },
3269 {
3270 .procname = "min_adv_mss",
3271 .data = &ip_rt_min_advmss,
3272 .maxlen = sizeof(int),
3273 .mode = 0644,
3274 .proc_handler = proc_dointvec,
3275 },
3276 { }
3277};
3278
3279static struct ctl_table empty[1];
3280
3281static struct ctl_table ipv4_skeleton[] =
3282{
3283 { .procname = "route",
3284 .mode = 0555, .child = ipv4_route_table},
3285 { .procname = "neigh",
3286 .mode = 0555, .child = empty},
3287 { }
3288};
3289
3290static __net_initdata struct ctl_path ipv4_path[] = {
3291 { .procname = "net", },
3292 { .procname = "ipv4", },
3293 { },
3294};
3295
3296static struct ctl_table ipv4_route_flush_table[] = {
3297 {
3298 .procname = "flush",
3299 .maxlen = sizeof(int),
3300 .mode = 0200,
3301 .proc_handler = ipv4_sysctl_rtcache_flush,
3302 },
3303 { },
3304};
3305
3306static __net_initdata struct ctl_path ipv4_route_path[] = {
3307 { .procname = "net", },
3308 { .procname = "ipv4", },
3309 { .procname = "route", },
3310 { },
3311};
3312
3313static __net_init int sysctl_route_net_init(struct net *net)
3314{
3315 struct ctl_table *tbl;
3316
3317 tbl = ipv4_route_flush_table;
3318 if (!net_eq(net, &init_net)) {
3319 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3320 if (tbl == NULL)
3321 goto err_dup;
3322 }
3323 tbl[0].extra1 = net;
3324
3325 net->ipv4.route_hdr =
3326 register_net_sysctl_table(net, ipv4_route_path, tbl);
3327 if (net->ipv4.route_hdr == NULL)
3328 goto err_reg;
3329 return 0;
3330
3331err_reg:
3332 if (tbl != ipv4_route_flush_table)
3333 kfree(tbl);
3334err_dup:
3335 return -ENOMEM;
3336}
3337
3338static __net_exit void sysctl_route_net_exit(struct net *net)
3339{
3340 struct ctl_table *tbl;
3341
3342 tbl = net->ipv4.route_hdr->ctl_table_arg;
3343 unregister_net_sysctl_table(net->ipv4.route_hdr);
3344 BUG_ON(tbl == ipv4_route_flush_table);
3345 kfree(tbl);
3346}
3347
3348static __net_initdata struct pernet_operations sysctl_route_ops = {
3349 .init = sysctl_route_net_init,
3350 .exit = sysctl_route_net_exit,
3351};
3352#endif
3353
3354static __net_init int rt_genid_init(struct net *net)
3355{
3356 get_random_bytes(&net->ipv4.rt_genid,
3357 sizeof(net->ipv4.rt_genid));
3358 return 0;
3359}
3360
3361static __net_initdata struct pernet_operations rt_genid_ops = {
3362 .init = rt_genid_init,
3363};
3364
3365
3366#ifdef CONFIG_IP_ROUTE_CLASSID
3367struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3368#endif /* CONFIG_IP_ROUTE_CLASSID */
3369
3370static __initdata unsigned long rhash_entries;
3371static int __init set_rhash_entries(char *str)
3372{
3373 if (!str)
3374 return 0;
3375 rhash_entries = simple_strtoul(str, &str, 0);
3376 return 1;
3377}
3378__setup("rhash_entries=", set_rhash_entries);
3379
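/*
 * One-time initialization of the routing subsystem: allocate the
 * per-cpu accounting area (CONFIG_IP_ROUTE_CLASSID), the dst slab
 * cache and the route cache hash table (sized by "rhash_entries=" when
 * given on the kernel command line), start the periodic expiry worker,
 * create the /proc files and register the RTM_GETROUTE handler and the
 * per-namespace sysctl/genid operations.
 */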
3380int __init ip_rt_init(void)
3381{
3382 int rc = 0;
3383
3384#ifdef CONFIG_IP_ROUTE_CLASSID
3385 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3386 if (!ip_rt_acct)
3387 panic("IP: failed to allocate ip_rt_acct\n");
3388#endif
3389
3390 ipv4_dst_ops.kmem_cachep =
3391 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3392 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3393
3394 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3395
3396 if (dst_entries_init(&ipv4_dst_ops) < 0)
3397 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3398
3399 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3400 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3401
3402 rt_hash_table = (struct rt_hash_bucket *)
3403 alloc_large_system_hash("IP route cache",
3404 sizeof(struct rt_hash_bucket),
3405 rhash_entries,
3406 (totalram_pages >= 128 * 1024) ?
3407 15 : 17,
3408 0,
3409 &rt_hash_log,
3410 &rt_hash_mask,
3411 rhash_entries ? 0 : 512 * 1024);
3412 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3413 rt_hash_lock_init();
3414
3415 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3416 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3417
3418 devinet_init();
3419 ip_fib_init();
3420
3421 /* All the timers started at system startup tend
3422 to synchronize. Perturb them a bit.
3423 */
3424 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3425 expires_ljiffies = jiffies;
3426 schedule_delayed_work(&expires_work,
3427 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3428
3429 if (ip_rt_proc_init())
3430 printk(KERN_ERR "Unable to create route proc files\n");
3431#ifdef CONFIG_XFRM
3432 xfrm_init();
3433 xfrm4_init(ip_rt_max_size);
3434#endif
3435 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3436
3437#ifdef CONFIG_SYSCTL
3438 register_pernet_subsys(&sysctl_route_ops);
3439#endif
3440 register_pernet_subsys(&rt_genid_ops);
3441 return rc;
3442}
3443
3444#ifdef CONFIG_SYSCTL
3445/*
3446 * We really need to sanitize the damn ipv4 init order, then all
3447 * this nonsense will go away.
3448 */
3449void __init ip_static_sysctl_init(void)
3450{
3451 register_sysctl_paths(ipv4_path, ipv4_skeleton);
3452}
3453#endif