1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
39 *
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/bootmem.h>
74 #include <linux/string.h>
75 #include <linux/socket.h>
76 #include <linux/sockios.h>
77 #include <linux/errno.h>
78 #include <linux/in.h>
79 #include <linux/inet.h>
80 #include <linux/netdevice.h>
81 #include <linux/proc_fs.h>
82 #include <linux/init.h>
83 #include <linux/workqueue.h>
84 #include <linux/skbuff.h>
85 #include <linux/inetdevice.h>
86 #include <linux/igmp.h>
87 #include <linux/pkt_sched.h>
88 #include <linux/mroute.h>
89 #include <linux/netfilter_ipv4.h>
90 #include <linux/random.h>
91 #include <linux/jhash.h>
92 #include <linux/rcupdate.h>
93 #include <linux/times.h>
94 #include <linux/slab.h>
95 #include <linux/prefetch.h>
96 #include <net/dst.h>
97 #include <net/net_namespace.h>
98 #include <net/protocol.h>
99 #include <net/ip.h>
100 #include <net/route.h>
101 #include <net/inetpeer.h>
102 #include <net/sock.h>
103 #include <net/ip_fib.h>
104 #include <net/arp.h>
105 #include <net/tcp.h>
106 #include <net/icmp.h>
107 #include <net/xfrm.h>
108 #include <net/netevent.h>
109 #include <net/rtnetlink.h>
110 #ifdef CONFIG_SYSCTL
111 #include <linux/sysctl.h>
112 #include <linux/kmemleak.h>
113 #endif
114 #include <net/secure_seq.h>
115
116 #define RT_FL_TOS(oldflp4) \
117 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118
119 #define IP_MAX_MTU 0xFFF0
120
121 #define RT_GC_TIMEOUT (300*HZ)
122
123 static int ip_rt_max_size;
124 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
125 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
126 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
127 static int ip_rt_redirect_number __read_mostly = 9;
128 static int ip_rt_redirect_load __read_mostly = HZ / 50;
129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
130 static int ip_rt_error_cost __read_mostly = HZ;
131 static int ip_rt_error_burst __read_mostly = 5 * HZ;
132 static int ip_rt_gc_elasticity __read_mostly = 8;
133 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
134 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
135 static int ip_rt_min_advmss __read_mostly = 256;
136 static int rt_chain_length_max __read_mostly = 20;
137
138 static struct delayed_work expires_work;
139 static unsigned long expires_ljiffies;
140
141 /*
142 * Interface to generic destination cache.
143 */
144
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
147 static unsigned int ipv4_mtu(const struct dst_entry *dst);
148 static void ipv4_dst_destroy(struct dst_entry *dst);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void ipv4_link_failure(struct sk_buff *skb);
151 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
152 struct sk_buff *skb, u32 mtu);
153 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
154 struct sk_buff *skb);
155 static int rt_garbage_collect(struct dst_ops *ops);
156
157 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
158 int how)
159 {
160 }
161
162 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
163 {
164 WARN_ON(1);
165 return NULL;
166 }
167
168 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
169 struct sk_buff *skb,
170 const void *daddr);
171
172 static struct dst_ops ipv4_dst_ops = {
173 .family = AF_INET,
174 .protocol = cpu_to_be16(ETH_P_IP),
175 .gc = rt_garbage_collect,
176 .check = ipv4_dst_check,
177 .default_advmss = ipv4_default_advmss,
178 .mtu = ipv4_mtu,
179 .cow_metrics = ipv4_cow_metrics,
180 .destroy = ipv4_dst_destroy,
181 .ifdown = ipv4_dst_ifdown,
182 .negative_advice = ipv4_negative_advice,
183 .link_failure = ipv4_link_failure,
184 .update_pmtu = ip_rt_update_pmtu,
185 .redirect = ip_do_redirect,
186 .local_out = __ip_local_out,
187 .neigh_lookup = ipv4_neigh_lookup,
188 };
189
190 #define ECN_OR_COST(class) TC_PRIO_##class
191
192 const __u8 ip_tos2prio[16] = {
193 TC_PRIO_BESTEFFORT,
194 ECN_OR_COST(BESTEFFORT),
195 TC_PRIO_BESTEFFORT,
196 ECN_OR_COST(BESTEFFORT),
197 TC_PRIO_BULK,
198 ECN_OR_COST(BULK),
199 TC_PRIO_BULK,
200 ECN_OR_COST(BULK),
201 TC_PRIO_INTERACTIVE,
202 ECN_OR_COST(INTERACTIVE),
203 TC_PRIO_INTERACTIVE,
204 ECN_OR_COST(INTERACTIVE),
205 TC_PRIO_INTERACTIVE_BULK,
206 ECN_OR_COST(INTERACTIVE_BULK),
207 TC_PRIO_INTERACTIVE_BULK,
208 ECN_OR_COST(INTERACTIVE_BULK)
209 };
210 EXPORT_SYMBOL(ip_tos2prio);
211
212 /*
213 * Route cache.
214 */
215
216 /* The locking scheme is rather straightforward:
217 *
218 * 1) Read-Copy Update protects the buckets of the central route hash.
219 * 2) Only writers remove entries, and they hold the lock
220 * as they look at rtable reference counts.
221 * 3) Only readers acquire references to rtable entries,
222 * they do so with atomic increments and with the
223 * lock held.
224 */
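/*
 * Minimal reader sketch (illustrative only, with a hypothetical match()
 * helper) of the lockless lookup pattern used by the iterators and lookup
 * paths below; the reference is taken with dst_use() while still inside the
 * RCU read-side section:
 *
 *	rcu_read_lock_bh();
 *	for (r = rcu_dereference_bh(rt_hash_table[h].chain); r;
 *	     r = rcu_dereference_bh(r->dst.rt_next)) {
 *		if (match(r)) {
 *			dst_use(&r->dst, jiffies);
 *			break;
 *		}
 *	}
 *	rcu_read_unlock_bh();
 *
 * Writers take rt_hash_lock_addr(h) instead (see rt_do_flush() and
 * rt_intern_hash()) and unlink entries with rcu_assign_pointer().
 */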
225
226 struct rt_hash_bucket {
227 struct rtable __rcu *chain;
228 };
229
230 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
231 defined(CONFIG_PROVE_LOCKING)
232 /*
233 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
234 * The size of this table is a power of two and depends on the number of CPUS.
235 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
236 */
237 #ifdef CONFIG_LOCKDEP
238 # define RT_HASH_LOCK_SZ 256
239 #else
240 # if NR_CPUS >= 32
241 # define RT_HASH_LOCK_SZ 4096
242 # elif NR_CPUS >= 16
243 # define RT_HASH_LOCK_SZ 2048
244 # elif NR_CPUS >= 8
245 # define RT_HASH_LOCK_SZ 1024
246 # elif NR_CPUS >= 4
247 # define RT_HASH_LOCK_SZ 512
248 # else
249 # define RT_HASH_LOCK_SZ 256
250 # endif
251 #endif
252
253 static spinlock_t *rt_hash_locks;
254 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
255
256 static __init void rt_hash_lock_init(void)
257 {
258 int i;
259
260 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
261 GFP_KERNEL);
262 if (!rt_hash_locks)
263 panic("IP: failed to allocate rt_hash_locks\n");
264
265 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
266 spin_lock_init(&rt_hash_locks[i]);
267 }
268 #else
269 # define rt_hash_lock_addr(slot) NULL
270
271 static inline void rt_hash_lock_init(void)
272 {
273 }
274 #endif
275
276 static struct rt_hash_bucket *rt_hash_table __read_mostly;
277 static unsigned int rt_hash_mask __read_mostly;
278 static unsigned int rt_hash_log __read_mostly;
279
280 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
281 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
282
283 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
284 int genid)
285 {
286 return jhash_3words((__force u32)daddr, (__force u32)saddr,
287 idx, genid)
288 & rt_hash_mask;
289 }
290
291 static inline int rt_genid(struct net *net)
292 {
293 return atomic_read(&net->ipv4.rt_genid);
294 }
295
296 #ifdef CONFIG_PROC_FS
297 struct rt_cache_iter_state {
298 struct seq_net_private p;
299 int bucket;
300 int genid;
301 };
302
303 static struct rtable *rt_cache_get_first(struct seq_file *seq)
304 {
305 struct rt_cache_iter_state *st = seq->private;
306 struct rtable *r = NULL;
307
308 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
309 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
310 continue;
311 rcu_read_lock_bh();
312 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
313 while (r) {
314 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
315 r->rt_genid == st->genid)
316 return r;
317 r = rcu_dereference_bh(r->dst.rt_next);
318 }
319 rcu_read_unlock_bh();
320 }
321 return r;
322 }
323
324 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
325 struct rtable *r)
326 {
327 struct rt_cache_iter_state *st = seq->private;
328
329 r = rcu_dereference_bh(r->dst.rt_next);
330 while (!r) {
331 rcu_read_unlock_bh();
332 do {
333 if (--st->bucket < 0)
334 return NULL;
335 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
336 rcu_read_lock_bh();
337 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
338 }
339 return r;
340 }
341
342 static struct rtable *rt_cache_get_next(struct seq_file *seq,
343 struct rtable *r)
344 {
345 struct rt_cache_iter_state *st = seq->private;
346 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
347 if (dev_net(r->dst.dev) != seq_file_net(seq))
348 continue;
349 if (r->rt_genid == st->genid)
350 break;
351 }
352 return r;
353 }
354
355 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
356 {
357 struct rtable *r = rt_cache_get_first(seq);
358
359 if (r)
360 while (pos && (r = rt_cache_get_next(seq, r)))
361 --pos;
362 return pos ? NULL : r;
363 }
364
365 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
366 {
367 struct rt_cache_iter_state *st = seq->private;
368 if (*pos)
369 return rt_cache_get_idx(seq, *pos - 1);
370 st->genid = rt_genid(seq_file_net(seq));
371 return SEQ_START_TOKEN;
372 }
373
374 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
375 {
376 struct rtable *r;
377
378 if (v == SEQ_START_TOKEN)
379 r = rt_cache_get_first(seq);
380 else
381 r = rt_cache_get_next(seq, v);
382 ++*pos;
383 return r;
384 }
385
386 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
387 {
388 if (v && v != SEQ_START_TOKEN)
389 rcu_read_unlock_bh();
390 }
391
392 static int rt_cache_seq_show(struct seq_file *seq, void *v)
393 {
394 if (v == SEQ_START_TOKEN)
395 seq_printf(seq, "%-127s\n",
396 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
397 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
398 "HHUptod\tSpecDst");
399 else {
400 struct rtable *r = v;
401 int len;
402
403 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
404 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
405 r->dst.dev ? r->dst.dev->name : "*",
406 (__force u32)r->rt_dst,
407 (__force u32)r->rt_gateway,
408 r->rt_flags, atomic_read(&r->dst.__refcnt),
409 r->dst.__use, 0, (__force u32)r->rt_src,
410 dst_metric_advmss(&r->dst) + 40,
411 dst_metric(&r->dst, RTAX_WINDOW), 0,
412 r->rt_key_tos,
413 -1, 0, 0, &len);
414
415 seq_printf(seq, "%*s\n", 127 - len, "");
416 }
417 return 0;
418 }
419
420 static const struct seq_operations rt_cache_seq_ops = {
421 .start = rt_cache_seq_start,
422 .next = rt_cache_seq_next,
423 .stop = rt_cache_seq_stop,
424 .show = rt_cache_seq_show,
425 };
426
427 static int rt_cache_seq_open(struct inode *inode, struct file *file)
428 {
429 return seq_open_net(inode, file, &rt_cache_seq_ops,
430 sizeof(struct rt_cache_iter_state));
431 }
432
433 static const struct file_operations rt_cache_seq_fops = {
434 .owner = THIS_MODULE,
435 .open = rt_cache_seq_open,
436 .read = seq_read,
437 .llseek = seq_lseek,
438 .release = seq_release_net,
439 };
440
441
442 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
443 {
444 int cpu;
445
446 if (*pos == 0)
447 return SEQ_START_TOKEN;
448
449 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
450 if (!cpu_possible(cpu))
451 continue;
452 *pos = cpu+1;
453 return &per_cpu(rt_cache_stat, cpu);
454 }
455 return NULL;
456 }
457
458 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
459 {
460 int cpu;
461
462 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
463 if (!cpu_possible(cpu))
464 continue;
465 *pos = cpu+1;
466 return &per_cpu(rt_cache_stat, cpu);
467 }
468 return NULL;
469
470 }
471
472 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
473 {
474
475 }
476
477 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
478 {
479 struct rt_cache_stat *st = v;
480
481 if (v == SEQ_START_TOKEN) {
482 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
483 return 0;
484 }
485
486 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
487 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
488 dst_entries_get_slow(&ipv4_dst_ops),
489 st->in_hit,
490 st->in_slow_tot,
491 st->in_slow_mc,
492 st->in_no_route,
493 st->in_brd,
494 st->in_martian_dst,
495 st->in_martian_src,
496
497 st->out_hit,
498 st->out_slow_tot,
499 st->out_slow_mc,
500
501 st->gc_total,
502 st->gc_ignored,
503 st->gc_goal_miss,
504 st->gc_dst_overflow,
505 st->in_hlist_search,
506 st->out_hlist_search
507 );
508 return 0;
509 }
510
511 static const struct seq_operations rt_cpu_seq_ops = {
512 .start = rt_cpu_seq_start,
513 .next = rt_cpu_seq_next,
514 .stop = rt_cpu_seq_stop,
515 .show = rt_cpu_seq_show,
516 };
517
518
519 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
520 {
521 return seq_open(file, &rt_cpu_seq_ops);
522 }
523
524 static const struct file_operations rt_cpu_seq_fops = {
525 .owner = THIS_MODULE,
526 .open = rt_cpu_seq_open,
527 .read = seq_read,
528 .llseek = seq_lseek,
529 .release = seq_release,
530 };
531
532 #ifdef CONFIG_IP_ROUTE_CLASSID
533 static int rt_acct_proc_show(struct seq_file *m, void *v)
534 {
535 struct ip_rt_acct *dst, *src;
536 unsigned int i, j;
537
538 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
539 if (!dst)
540 return -ENOMEM;
541
542 for_each_possible_cpu(i) {
543 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
544 for (j = 0; j < 256; j++) {
545 dst[j].o_bytes += src[j].o_bytes;
546 dst[j].o_packets += src[j].o_packets;
547 dst[j].i_bytes += src[j].i_bytes;
548 dst[j].i_packets += src[j].i_packets;
549 }
550 }
551
552 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
553 kfree(dst);
554 return 0;
555 }
556
557 static int rt_acct_proc_open(struct inode *inode, struct file *file)
558 {
559 return single_open(file, rt_acct_proc_show, NULL);
560 }
561
562 static const struct file_operations rt_acct_proc_fops = {
563 .owner = THIS_MODULE,
564 .open = rt_acct_proc_open,
565 .read = seq_read,
566 .llseek = seq_lseek,
567 .release = single_release,
568 };
569 #endif
570
571 static int __net_init ip_rt_do_proc_init(struct net *net)
572 {
573 struct proc_dir_entry *pde;
574
575 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
576 &rt_cache_seq_fops);
577 if (!pde)
578 goto err1;
579
580 pde = proc_create("rt_cache", S_IRUGO,
581 net->proc_net_stat, &rt_cpu_seq_fops);
582 if (!pde)
583 goto err2;
584
585 #ifdef CONFIG_IP_ROUTE_CLASSID
586 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
587 if (!pde)
588 goto err3;
589 #endif
590 return 0;
591
592 #ifdef CONFIG_IP_ROUTE_CLASSID
593 err3:
594 remove_proc_entry("rt_cache", net->proc_net_stat);
595 #endif
596 err2:
597 remove_proc_entry("rt_cache", net->proc_net);
598 err1:
599 return -ENOMEM;
600 }
601
602 static void __net_exit ip_rt_do_proc_exit(struct net *net)
603 {
604 remove_proc_entry("rt_cache", net->proc_net_stat);
605 remove_proc_entry("rt_cache", net->proc_net);
606 #ifdef CONFIG_IP_ROUTE_CLASSID
607 remove_proc_entry("rt_acct", net->proc_net);
608 #endif
609 }
610
611 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
612 .init = ip_rt_do_proc_init,
613 .exit = ip_rt_do_proc_exit,
614 };
615
616 static int __init ip_rt_proc_init(void)
617 {
618 return register_pernet_subsys(&ip_rt_proc_ops);
619 }
620
621 #else
622 static inline int ip_rt_proc_init(void)
623 {
624 return 0;
625 }
626 #endif /* CONFIG_PROC_FS */
627
628 static inline void rt_free(struct rtable *rt)
629 {
630 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
631 }
632
633 static inline void rt_drop(struct rtable *rt)
634 {
635 ip_rt_put(rt);
636 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
637 }
638
639 static inline int rt_fast_clean(struct rtable *rth)
640 {
641 /* Kill broadcast/multicast entries very aggressively, if they
642 collide in the hash table with more useful entries */
643 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
644 rt_is_input_route(rth) && rth->dst.rt_next;
645 }
646
647 static inline int rt_valuable(struct rtable *rth)
648 {
649 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
650 rth->dst.expires;
651 }
652
653 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
654 {
655 unsigned long age;
656 int ret = 0;
657
658 if (atomic_read(&rth->dst.__refcnt))
659 goto out;
660
661 age = jiffies - rth->dst.lastuse;
662 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
663 (age <= tmo2 && rt_valuable(rth)))
664 goto out;
665 ret = 1;
666 out: return ret;
667 }
668
669 /* Bits of score are:
670 * 31: very valuable
671 * 30: not quite useless
672 * 29..0: usage counter
673 */
674 static inline u32 rt_score(struct rtable *rt)
675 {
676 u32 score = jiffies - rt->dst.lastuse;
677
678 score = ~score & ~(3<<30);
679
680 if (rt_valuable(rt))
681 score |= (1<<31);
682
683 if (rt_is_output_route(rt) ||
684 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
685 score |= (1<<30);
686
687 return score;
688 }
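/*
 * Worked example (illustrative): an unreferenced, non-valuable output route
 * last used 100 jiffies ago scores (~100UL & ~(3 << 30)) | (1 << 30); a more
 * recently used entry has a larger ~age and therefore a higher score, so
 * rt_intern_hash() below evicts the lowest-scoring (least useful) candidate.
 */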
689
690 static inline bool rt_caching(const struct net *net)
691 {
692 return net->ipv4.current_rt_cache_rebuild_count <=
693 net->ipv4.sysctl_rt_cache_rebuild_count;
694 }
695
696 static inline bool compare_hash_inputs(const struct rtable *rt1,
697 const struct rtable *rt2)
698 {
699 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
700 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
701 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
702 }
703
704 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
705 {
706 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
707 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
708 (rt1->rt_mark ^ rt2->rt_mark) |
709 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
710 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
711 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
712 }
713
714 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
715 {
716 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
717 }
718
719 static inline int rt_is_expired(struct rtable *rth)
720 {
721 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
722 }
723
724 /*
725 * Perform a full scan of the hash table and free all entries.
726 * Can be called by a softirq or a process.
727 * In the latter case, we want to reschedule if necessary.
728 */
729 static void rt_do_flush(struct net *net, int process_context)
730 {
731 unsigned int i;
732 struct rtable *rth, *next;
733
734 for (i = 0; i <= rt_hash_mask; i++) {
735 struct rtable __rcu **pprev;
736 struct rtable *list;
737
738 if (process_context && need_resched())
739 cond_resched();
740 rth = rcu_access_pointer(rt_hash_table[i].chain);
741 if (!rth)
742 continue;
743
744 spin_lock_bh(rt_hash_lock_addr(i));
745
746 list = NULL;
747 pprev = &rt_hash_table[i].chain;
748 rth = rcu_dereference_protected(*pprev,
749 lockdep_is_held(rt_hash_lock_addr(i)));
750
751 while (rth) {
752 next = rcu_dereference_protected(rth->dst.rt_next,
753 lockdep_is_held(rt_hash_lock_addr(i)));
754
755 if (!net ||
756 net_eq(dev_net(rth->dst.dev), net)) {
757 rcu_assign_pointer(*pprev, next);
758 rcu_assign_pointer(rth->dst.rt_next, list);
759 list = rth;
760 } else {
761 pprev = &rth->dst.rt_next;
762 }
763 rth = next;
764 }
765
766 spin_unlock_bh(rt_hash_lock_addr(i));
767
768 for (; list; list = next) {
769 next = rcu_dereference_protected(list->dst.rt_next, 1);
770 rt_free(list);
771 }
772 }
773 }
774
775 /*
776 * While freeing expired entries, we compute average chain length
777 * and standard deviation, using fixed-point arithmetic.
778 * This is to have an estimation of rt_chain_length_max:
779 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
780 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
781 */
782
783 #define FRACT_BITS 3
784 #define ONE (1UL << FRACT_BITS)
785
786 /*
787 * Given a hash chain and an item in this hash chain,
788 * find if a previous entry has the same hash_inputs
789 * (but differs on tos, mark or oif)
790 * Returns 0 if an alias is found.
791 * Returns ONE if rth has no alias before itself.
792 */
793 static int has_noalias(const struct rtable *head, const struct rtable *rth)
794 {
795 const struct rtable *aux = head;
796
797 while (aux != rth) {
798 if (compare_hash_inputs(aux, rth))
799 return 0;
800 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
801 }
802 return ONE;
803 }
804
805 static void rt_check_expire(void)
806 {
807 static unsigned int rover;
808 unsigned int i = rover, goal;
809 struct rtable *rth;
810 struct rtable __rcu **rthp;
811 unsigned long samples = 0;
812 unsigned long sum = 0, sum2 = 0;
813 unsigned long delta;
814 u64 mult;
815
816 delta = jiffies - expires_ljiffies;
817 expires_ljiffies = jiffies;
818 mult = ((u64)delta) << rt_hash_log;
819 if (ip_rt_gc_timeout > 1)
820 do_div(mult, ip_rt_gc_timeout);
821 goal = (unsigned int)mult;
822 if (goal > rt_hash_mask)
823 goal = rt_hash_mask + 1;
824 for (; goal > 0; goal--) {
825 unsigned long tmo = ip_rt_gc_timeout;
826 unsigned long length;
827
828 i = (i + 1) & rt_hash_mask;
829 rthp = &rt_hash_table[i].chain;
830
831 if (need_resched())
832 cond_resched();
833
834 samples++;
835
836 if (rcu_dereference_raw(*rthp) == NULL)
837 continue;
838 length = 0;
839 spin_lock_bh(rt_hash_lock_addr(i));
840 while ((rth = rcu_dereference_protected(*rthp,
841 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
842 prefetch(rth->dst.rt_next);
843 if (rt_is_expired(rth) ||
844 rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
845 *rthp = rth->dst.rt_next;
846 rt_free(rth);
847 continue;
848 }
849
850 /* We only count entries on a chain with equal
851 * hash inputs once so that entries for
852 * different QoS levels and other non-hash
853 * input attributes don't unfairly skew the
854 * length computation.
855 */
856 tmo >>= 1;
857 rthp = &rth->dst.rt_next;
858 length += has_noalias(rt_hash_table[i].chain, rth);
859 }
860 spin_unlock_bh(rt_hash_lock_addr(i));
861 sum += length;
862 sum2 += length*length;
863 }
864 if (samples) {
865 unsigned long avg = sum / samples;
866 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
867 rt_chain_length_max = max_t(unsigned long,
868 ip_rt_gc_elasticity,
869 (avg + 4*sd) >> FRACT_BITS);
870 }
871 rover = i;
872 }
873
874 /*
875 * rt_worker_func() is run in process context.
876 * We call rt_check_expire() to scan part of the hash table.
877 */
878 static void rt_worker_func(struct work_struct *work)
879 {
880 rt_check_expire();
881 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
882 }
883
884 /*
885 * Perturbation of rt_genid by a small quantity [1..256].
886 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
887 * many times (2^24) without reusing a recent rt_genid.
888 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
889 */
890 static void rt_cache_invalidate(struct net *net)
891 {
892 unsigned char shuffle;
893
894 get_random_bytes(&shuffle, sizeof(shuffle));
895 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
896 }
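/*
 * Example (illustrative): every invalidation bumps rt_genid by a random
 * value in [1, 256], so entries whose cached rt_genid no longer matches are
 * considered stale by rt_is_expired() and reaped lazily on later scans.
 */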
897
898 /*
899 * delay < 0 : invalidate cache (fast : entries will be deleted later)
900 * delay >= 0 : invalidate & flush cache (can be long)
901 */
902 void rt_cache_flush(struct net *net, int delay)
903 {
904 rt_cache_invalidate(net);
905 if (delay >= 0)
906 rt_do_flush(net, !in_softirq());
907 }
908
909 /* Flush previously invalidated entries from the cache */
910 void rt_cache_flush_batch(struct net *net)
911 {
912 rt_do_flush(net, !in_softirq());
913 }
914
915 static void rt_emergency_hash_rebuild(struct net *net)
916 {
917 net_warn_ratelimited("Route hash chain too long!\n");
918 rt_cache_invalidate(net);
919 }
920
921 /*
922 Short description of GC goals.
923
924 We want to build an algorithm which keeps the routing cache
925 at some equilibrium point, where the number of aged-off entries
926 is kept approximately equal to the number of newly generated ones.
927
928 The current expiration strength is the variable "expire".
929 We try to adjust it dynamically, so that when networking
930 is idle, expire is large enough to keep enough warm entries,
931 and when load increases it shrinks to limit the cache size.
932 */
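/*
 * Worked example (illustrative, assuming a 2^17 bucket table, i.e.
 * rt_hash_log = 17): the initial goal entries - (ip_rt_gc_elasticity << 17)
 * only becomes positive once the cache exceeds 8 << 17 = 1048576 entries;
 * below that the goal falls back to entries - equilibrium, with equilibrium
 * clamped to at least ipv4_dst_ops.gc_thresh.
 */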
933
934 static int rt_garbage_collect(struct dst_ops *ops)
935 {
936 static unsigned long expire = RT_GC_TIMEOUT;
937 static unsigned long last_gc;
938 static int rover;
939 static int equilibrium;
940 struct rtable *rth;
941 struct rtable __rcu **rthp;
942 unsigned long now = jiffies;
943 int goal;
944 int entries = dst_entries_get_fast(&ipv4_dst_ops);
945
946 /*
947 * Garbage collection is pretty expensive,
948 * so do not run it too frequently.
949 */
950
951 RT_CACHE_STAT_INC(gc_total);
952
953 if (now - last_gc < ip_rt_gc_min_interval &&
954 entries < ip_rt_max_size) {
955 RT_CACHE_STAT_INC(gc_ignored);
956 goto out;
957 }
958
959 entries = dst_entries_get_slow(&ipv4_dst_ops);
960 /* Calculate the number of entries which we want to expire now. */
961 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
962 if (goal <= 0) {
963 if (equilibrium < ipv4_dst_ops.gc_thresh)
964 equilibrium = ipv4_dst_ops.gc_thresh;
965 goal = entries - equilibrium;
966 if (goal > 0) {
967 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
968 goal = entries - equilibrium;
969 }
970 } else {
971 /* We are in a dangerous area. Try to reduce the cache really
972 * aggressively.
973 */
974 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
975 equilibrium = entries - goal;
976 }
977
978 if (now - last_gc >= ip_rt_gc_min_interval)
979 last_gc = now;
980
981 if (goal <= 0) {
982 equilibrium += goal;
983 goto work_done;
984 }
985
986 do {
987 int i, k;
988
989 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
990 unsigned long tmo = expire;
991
992 k = (k + 1) & rt_hash_mask;
993 rthp = &rt_hash_table[k].chain;
994 spin_lock_bh(rt_hash_lock_addr(k));
995 while ((rth = rcu_dereference_protected(*rthp,
996 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
997 if (!rt_is_expired(rth) &&
998 !rt_may_expire(rth, tmo, expire)) {
999 tmo >>= 1;
1000 rthp = &rth->dst.rt_next;
1001 continue;
1002 }
1003 *rthp = rth->dst.rt_next;
1004 rt_free(rth);
1005 goal--;
1006 }
1007 spin_unlock_bh(rt_hash_lock_addr(k));
1008 if (goal <= 0)
1009 break;
1010 }
1011 rover = k;
1012
1013 if (goal <= 0)
1014 goto work_done;
1015
1016 /* The goal is not achieved. We stop the process if:
1017
1018 - expire is reduced to zero (otherwise expire is halved),
1019 - the table is not full,
1020 - we are called from interrupt context.
1021 The jiffies check is just a fallback/debug loop breaker;
1022 we will not spin here for a long time in any case.
1023 */
1024
1025 RT_CACHE_STAT_INC(gc_goal_miss);
1026
1027 if (expire == 0)
1028 break;
1029
1030 expire >>= 1;
1031
1032 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1033 goto out;
1034 } while (!in_softirq() && time_before_eq(jiffies, now));
1035
1036 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1037 goto out;
1038 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1039 goto out;
1040 net_warn_ratelimited("dst cache overflow\n");
1041 RT_CACHE_STAT_INC(gc_dst_overflow);
1042 return 1;
1043
1044 work_done:
1045 expire += ip_rt_gc_min_interval;
1046 if (expire > ip_rt_gc_timeout ||
1047 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1048 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1049 expire = ip_rt_gc_timeout;
1050 out: return 0;
1051 }
1052
1053 /*
1054 * Returns number of entries in a hash chain that have different hash_inputs
1055 */
1056 static int slow_chain_length(const struct rtable *head)
1057 {
1058 int length = 0;
1059 const struct rtable *rth = head;
1060
1061 while (rth) {
1062 length += has_noalias(head, rth);
1063 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1064 }
1065 return length >> FRACT_BITS;
1066 }
1067
1068 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
1069 struct sk_buff *skb,
1070 const void *daddr)
1071 {
1072 struct net_device *dev = dst->dev;
1073 const __be32 *pkey = daddr;
1074 const struct rtable *rt;
1075 struct neighbour *n;
1076
1077 rt = (const struct rtable *) dst;
1078 if (rt->rt_gateway)
1079 pkey = (const __be32 *) &rt->rt_gateway;
1080 else if (skb)
1081 pkey = &ip_hdr(skb)->daddr;
1082
1083 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1084 if (n)
1085 return n;
1086 return neigh_create(&arp_tbl, pkey, dev);
1087 }
1088
1089 static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1090 struct sk_buff *skb, int ifindex)
1091 {
1092 struct rtable *rth, *cand;
1093 struct rtable __rcu **rthp, **candp;
1094 unsigned long now;
1095 u32 min_score;
1096 int chain_length;
1097
1098 restart:
1099 chain_length = 0;
1100 min_score = ~(u32)0;
1101 cand = NULL;
1102 candp = NULL;
1103 now = jiffies;
1104
1105 if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
1106 /*
1107 * If we're not caching, just tell the caller we
1108 * were successful and don't touch the route. The
1109 * caller holds the sole reference to the cache entry, and
1110 * it will be released when the caller is done with it.
1111 * If we drop it here, the callers have no way to resolve routes
1112 * when we're not caching. Instead, just point *rp at rt, so
1113 * the caller gets a single use out of the route.
1114 * Note that we do rt_free on this new route entry, so that
1115 * once its refcount hits zero, we are still able to reap it
1116 * (Thanks Alexey).
1117 * Note: To avoid expensive RCU machinery for this uncached dst,
1118 * we set DST_NOCACHE so that dst_release() can free the dst without
1119 * waiting for a grace period.
1120 */
1121
1122 rt->dst.flags |= DST_NOCACHE;
1123 goto skip_hashing;
1124 }
1125
1126 rthp = &rt_hash_table[hash].chain;
1127
1128 spin_lock_bh(rt_hash_lock_addr(hash));
1129 while ((rth = rcu_dereference_protected(*rthp,
1130 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1131 if (rt_is_expired(rth)) {
1132 *rthp = rth->dst.rt_next;
1133 rt_free(rth);
1134 continue;
1135 }
1136 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1137 /* Put it first */
1138 *rthp = rth->dst.rt_next;
1139 /*
1140 * Since lookup is lockfree, the deletion
1141 * must be visible to another weakly ordered CPU before
1142 * the insertion at the start of the hash chain.
1143 */
1144 rcu_assign_pointer(rth->dst.rt_next,
1145 rt_hash_table[hash].chain);
1146 /*
1147 * Since lookup is lockfree, the update writes
1148 * must be ordered for consistency on SMP.
1149 */
1150 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1151
1152 dst_use(&rth->dst, now);
1153 spin_unlock_bh(rt_hash_lock_addr(hash));
1154
1155 rt_drop(rt);
1156 if (skb)
1157 skb_dst_set(skb, &rth->dst);
1158 return rth;
1159 }
1160
1161 if (!atomic_read(&rth->dst.__refcnt)) {
1162 u32 score = rt_score(rth);
1163
1164 if (score <= min_score) {
1165 cand = rth;
1166 candp = rthp;
1167 min_score = score;
1168 }
1169 }
1170
1171 chain_length++;
1172
1173 rthp = &rth->dst.rt_next;
1174 }
1175
1176 if (cand) {
1177 /* ip_rt_gc_elasticity used to be the average chain length;
1178 * when it is exceeded, gc becomes really aggressive.
1179 *
1180 * The second limit is less certain. At the moment it allows
1181 * only 2 entries per bucket. We will see.
1182 */
1183 if (chain_length > ip_rt_gc_elasticity) {
1184 *candp = cand->dst.rt_next;
1185 rt_free(cand);
1186 }
1187 } else {
1188 if (chain_length > rt_chain_length_max &&
1189 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1190 struct net *net = dev_net(rt->dst.dev);
1191 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1192 if (!rt_caching(net)) {
1193 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1194 rt->dst.dev->name, num);
1195 }
1196 rt_emergency_hash_rebuild(net);
1197 spin_unlock_bh(rt_hash_lock_addr(hash));
1198
1199 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1200 ifindex, rt_genid(net));
1201 goto restart;
1202 }
1203 }
1204
1205 rt->dst.rt_next = rt_hash_table[hash].chain;
1206
1207 /*
1208 * Since lookup is lockfree, we must make sure
1209 * previous writes to rt are committed to memory
1210 * before making rt visible to other CPUs.
1211 */
1212 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1213
1214 spin_unlock_bh(rt_hash_lock_addr(hash));
1215
1216 skip_hashing:
1217 if (skb)
1218 skb_dst_set(skb, &rt->dst);
1219 return rt;
1220 }
1221
1222 /*
1223 * Peer allocation may fail only in serious out-of-memory conditions. However
1224 * we can still generate some output.
1225 * Random ID selection looks a bit dangerous because we have no chance of
1226 * selecting an ID that is unique within a reasonable period of time.
1227 * But a broken packet identifier may be better than no packet at all.
1228 */
1229 static void ip_select_fb_ident(struct iphdr *iph)
1230 {
1231 static DEFINE_SPINLOCK(ip_fb_id_lock);
1232 static u32 ip_fallback_id;
1233 u32 salt;
1234
1235 spin_lock_bh(&ip_fb_id_lock);
1236 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1237 iph->id = htons(salt & 0xFFFF);
1238 ip_fallback_id = salt;
1239 spin_unlock_bh(&ip_fb_id_lock);
1240 }
1241
1242 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1243 {
1244 struct net *net = dev_net(dst->dev);
1245 struct inet_peer *peer;
1246
1247 peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
1248 if (peer) {
1249 iph->id = htons(inet_getid(peer, more));
1250 inet_putpeer(peer);
1251 return;
1252 }
1253
1254 ip_select_fb_ident(iph);
1255 }
1256 EXPORT_SYMBOL(__ip_select_ident);
1257
1258 static void rt_del(unsigned int hash, struct rtable *rt)
1259 {
1260 struct rtable __rcu **rthp;
1261 struct rtable *aux;
1262
1263 rthp = &rt_hash_table[hash].chain;
1264 spin_lock_bh(rt_hash_lock_addr(hash));
1265 ip_rt_put(rt);
1266 while ((aux = rcu_dereference_protected(*rthp,
1267 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1268 if (aux == rt || rt_is_expired(aux)) {
1269 *rthp = aux->dst.rt_next;
1270 rt_free(aux);
1271 continue;
1272 }
1273 rthp = &aux->dst.rt_next;
1274 }
1275 spin_unlock_bh(rt_hash_lock_addr(hash));
1276 }
1277
1278 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
1279 const struct iphdr *iph,
1280 int oif, u8 tos,
1281 u8 prot, u32 mark, int flow_flags)
1282 {
1283 if (sk) {
1284 const struct inet_sock *inet = inet_sk(sk);
1285
1286 oif = sk->sk_bound_dev_if;
1287 mark = sk->sk_mark;
1288 tos = RT_CONN_FLAGS(sk);
1289 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
1290 }
1291 flowi4_init_output(fl4, oif, mark, tos,
1292 RT_SCOPE_UNIVERSE, prot,
1293 flow_flags,
1294 iph->daddr, iph->saddr, 0, 0);
1295 }
1296
1297 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
1298 const struct sock *sk)
1299 {
1300 const struct iphdr *iph = ip_hdr(skb);
1301 int oif = skb->dev->ifindex;
1302 u8 tos = RT_TOS(iph->tos);
1303 u8 prot = iph->protocol;
1304 u32 mark = skb->mark;
1305
1306 __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
1307 }
1308
1309 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
1310 {
1311 const struct inet_sock *inet = inet_sk(sk);
1312 const struct ip_options_rcu *inet_opt;
1313 __be32 daddr = inet->inet_daddr;
1314
1315 rcu_read_lock();
1316 inet_opt = rcu_dereference(inet->inet_opt);
1317 if (inet_opt && inet_opt->opt.srr)
1318 daddr = inet_opt->opt.faddr;
1319 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
1320 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
1321 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1322 inet_sk_flowi_flags(sk),
1323 daddr, inet->inet_saddr, 0, 0);
1324 rcu_read_unlock();
1325 }
1326
1327 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
1328 const struct sk_buff *skb)
1329 {
1330 if (skb)
1331 build_skb_flow_key(fl4, skb, sk);
1332 else
1333 build_sk_flow_key(fl4, sk);
1334 }
1335
1336 static DEFINE_SPINLOCK(fnhe_lock);
1337
1338 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash, __be32 daddr)
1339 {
1340 struct fib_nh_exception *fnhe, *oldest;
1341
1342 oldest = rcu_dereference(hash->chain);
1343 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
1344 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1345 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
1346 oldest = fnhe;
1347 }
1348 return oldest;
1349 }
1350
1351 static inline u32 fnhe_hashfun(__be32 daddr)
1352 {
1353 u32 hval;
1354
1355 hval = (__force u32) daddr;
1356 hval ^= (hval >> 11) ^ (hval >> 22);
1357
1358 return hval & (FNHE_HASH_SIZE - 1);
1359 }
1360
1361 static struct fib_nh_exception *find_or_create_fnhe(struct fib_nh *nh, __be32 daddr)
1362 {
1363 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1364 struct fib_nh_exception *fnhe;
1365 int depth;
1366 u32 hval;
1367
1368 if (!hash) {
1369 hash = nh->nh_exceptions = kzalloc(FNHE_HASH_SIZE * sizeof(*hash),
1370 GFP_ATOMIC);
1371 if (!hash)
1372 return NULL;
1373 }
1374
1375 hval = fnhe_hashfun(daddr);
1376 hash += hval;
1377
1378 depth = 0;
1379 for (fnhe = rcu_dereference(hash->chain); fnhe;
1380 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1381 if (fnhe->fnhe_daddr == daddr)
1382 goto out;
1383 depth++;
1384 }
1385
1386 if (depth > FNHE_RECLAIM_DEPTH) {
1387 fnhe = fnhe_oldest(hash + hval, daddr);
1388 goto out_daddr;
1389 }
1390 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
1391 if (!fnhe)
1392 return NULL;
1393
1394 fnhe->fnhe_next = hash->chain;
1395 rcu_assign_pointer(hash->chain, fnhe);
1396
1397 out_daddr:
1398 fnhe->fnhe_daddr = daddr;
1399 out:
1400 fnhe->fnhe_stamp = jiffies;
1401 return fnhe;
1402 }
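/*
 * Note (illustrative): once a bucket chain grows past FNHE_RECLAIM_DEPTH,
 * the exception with the oldest fnhe_stamp is recycled for the new
 * destination instead of allocating another node, so a flood of distinct
 * destinations cannot grow the exception table without bound.
 */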
1403
1404 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
1405 {
1406 __be32 new_gw = icmp_hdr(skb)->un.gateway;
1407 __be32 old_gw = ip_hdr(skb)->saddr;
1408 struct net_device *dev = skb->dev;
1409 struct in_device *in_dev;
1410 struct fib_result res;
1411 struct neighbour *n;
1412 struct net *net;
1413
1414 switch (icmp_hdr(skb)->code & 7) {
1415 case ICMP_REDIR_NET:
1416 case ICMP_REDIR_NETTOS:
1417 case ICMP_REDIR_HOST:
1418 case ICMP_REDIR_HOSTTOS:
1419 break;
1420
1421 default:
1422 return;
1423 }
1424
1425 if (rt->rt_gateway != old_gw)
1426 return;
1427
1428 in_dev = __in_dev_get_rcu(dev);
1429 if (!in_dev)
1430 return;
1431
1432 net = dev_net(dev);
1433 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1434 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1435 ipv4_is_zeronet(new_gw))
1436 goto reject_redirect;
1437
1438 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1439 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1440 goto reject_redirect;
1441 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1442 goto reject_redirect;
1443 } else {
1444 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1445 goto reject_redirect;
1446 }
1447
1448 n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
1449 if (n) {
1450 if (!(n->nud_state & NUD_VALID)) {
1451 neigh_event_send(n, NULL);
1452 } else {
1453 if (fib_lookup(net, fl4, &res) == 0) {
1454 struct fib_nh *nh = &FIB_RES_NH(res);
1455 struct fib_nh_exception *fnhe;
1456
1457 spin_lock_bh(&fnhe_lock);
1458 fnhe = find_or_create_fnhe(nh, fl4->daddr);
1459 if (fnhe)
1460 fnhe->fnhe_gw = new_gw;
1461 spin_unlock_bh(&fnhe_lock);
1462 }
1463 rt->rt_gateway = new_gw;
1464 rt->rt_flags |= RTCF_REDIRECTED;
1465 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1466 }
1467 neigh_release(n);
1468 }
1469 return;
1470
1471 reject_redirect:
1472 #ifdef CONFIG_IP_ROUTE_VERBOSE
1473 if (IN_DEV_LOG_MARTIANS(in_dev)) {
1474 const struct iphdr *iph = (const struct iphdr *) skb->data;
1475 __be32 daddr = iph->daddr;
1476 __be32 saddr = iph->saddr;
1477
1478 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1479 " Advised path = %pI4 -> %pI4\n",
1480 &old_gw, dev->name, &new_gw,
1481 &saddr, &daddr);
1482 }
1483 #endif
1484 ;
1485 }
1486
1487 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
1488 {
1489 struct rtable *rt;
1490 struct flowi4 fl4;
1491
1492 rt = (struct rtable *) dst;
1493
1494 ip_rt_build_flow_key(&fl4, sk, skb);
1495 __ip_do_redirect(rt, skb, &fl4);
1496 }
1497
1498 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1499 {
1500 struct rtable *rt = (struct rtable *)dst;
1501 struct dst_entry *ret = dst;
1502
1503 if (rt) {
1504 if (dst->obsolete > 0) {
1505 ip_rt_put(rt);
1506 ret = NULL;
1507 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1508 rt->dst.expires) {
1509 unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1510 rt->rt_oif,
1511 rt_genid(dev_net(dst->dev)));
1512 rt_del(hash, rt);
1513 ret = NULL;
1514 }
1515 }
1516 return ret;
1517 }
1518
1519 /*
1520 * Algorithm:
1521 * 1. The first ip_rt_redirect_number redirects are sent
1522 * with exponential backoff, then we stop sending them at all,
1523 * assuming that the host ignores our redirects.
1524 * 2. If we did not see packets requiring redirects
1525 * during ip_rt_redirect_silence, we assume that the host
1526 * forgot the redirected route and start sending redirects again.
1527 *
1528 * This algorithm is much cheaper and more intelligent than dumb load limiting
1529 * in icmp.c.
1530 *
1531 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1532 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1533 */
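/*
 * Worked example (illustrative, assuming HZ = 1000): ip_rt_redirect_load is
 * HZ / 50 = 20 jiffies, so after k redirects the next one is sent only once
 * 20 << k jiffies have passed since rate_last (40 ms, 80 ms, ... about 5.1 s
 * before the 9th).  After ip_rt_redirect_number (9) redirects we stop
 * entirely until ip_rt_redirect_silence = 20 << 10 jiffies (about 20.5 s)
 * without redirect-worthy packets resets the token count.
 */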
1534
1535 void ip_rt_send_redirect(struct sk_buff *skb)
1536 {
1537 struct rtable *rt = skb_rtable(skb);
1538 struct in_device *in_dev;
1539 struct inet_peer *peer;
1540 struct net *net;
1541 int log_martians;
1542
1543 rcu_read_lock();
1544 in_dev = __in_dev_get_rcu(rt->dst.dev);
1545 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1546 rcu_read_unlock();
1547 return;
1548 }
1549 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1550 rcu_read_unlock();
1551
1552 net = dev_net(rt->dst.dev);
1553 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1554 if (!peer) {
1555 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1556 return;
1557 }
1558
1559 /* No redirected packets during ip_rt_redirect_silence;
1560 * reset the algorithm.
1561 */
1562 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1563 peer->rate_tokens = 0;
1564
1565 /* Too many ignored redirects; do not send anything.
1566 * Set peer->rate_last to the last seen redirected packet.
1567 */
1568 if (peer->rate_tokens >= ip_rt_redirect_number) {
1569 peer->rate_last = jiffies;
1570 goto out_put_peer;
1571 }
1572
1573 /* Check for load limit; set rate_last to the latest sent
1574 * redirect.
1575 */
1576 if (peer->rate_tokens == 0 ||
1577 time_after(jiffies,
1578 (peer->rate_last +
1579 (ip_rt_redirect_load << peer->rate_tokens)))) {
1580 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1581 peer->rate_last = jiffies;
1582 ++peer->rate_tokens;
1583 #ifdef CONFIG_IP_ROUTE_VERBOSE
1584 if (log_martians &&
1585 peer->rate_tokens == ip_rt_redirect_number)
1586 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1587 &ip_hdr(skb)->saddr, rt->rt_iif,
1588 &rt->rt_dst, &rt->rt_gateway);
1589 #endif
1590 }
1591 out_put_peer:
1592 inet_putpeer(peer);
1593 }
1594
1595 static int ip_error(struct sk_buff *skb)
1596 {
1597 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
1598 struct rtable *rt = skb_rtable(skb);
1599 struct inet_peer *peer;
1600 unsigned long now;
1601 struct net *net;
1602 bool send;
1603 int code;
1604
1605 net = dev_net(rt->dst.dev);
1606 if (!IN_DEV_FORWARD(in_dev)) {
1607 switch (rt->dst.error) {
1608 case EHOSTUNREACH:
1609 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
1610 break;
1611
1612 case ENETUNREACH:
1613 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1614 break;
1615 }
1616 goto out;
1617 }
1618
1619 switch (rt->dst.error) {
1620 case EINVAL:
1621 default:
1622 goto out;
1623 case EHOSTUNREACH:
1624 code = ICMP_HOST_UNREACH;
1625 break;
1626 case ENETUNREACH:
1627 code = ICMP_NET_UNREACH;
1628 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1629 break;
1630 case EACCES:
1631 code = ICMP_PKT_FILTERED;
1632 break;
1633 }
1634
1635 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1636
1637 send = true;
1638 if (peer) {
1639 now = jiffies;
1640 peer->rate_tokens += now - peer->rate_last;
1641 if (peer->rate_tokens > ip_rt_error_burst)
1642 peer->rate_tokens = ip_rt_error_burst;
1643 peer->rate_last = now;
1644 if (peer->rate_tokens >= ip_rt_error_cost)
1645 peer->rate_tokens -= ip_rt_error_cost;
1646 else
1647 send = false;
1648 inet_putpeer(peer);
1649 }
1650 if (send)
1651 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1652
1653 out: kfree_skb(skb);
1654 return 0;
1655 }
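/*
 * Example (illustrative): the peer's bucket gains one token per jiffy,
 * capped at ip_rt_error_burst = 5 * HZ, and each ICMP error sent costs
 * ip_rt_error_cost = HZ tokens, so a misbehaving source gets at most a
 * burst of five errors and then roughly one per second thereafter.
 */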
1656
1657 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1658 {
1659 struct fib_result res;
1660
1661 if (mtu < ip_rt_min_pmtu)
1662 mtu = ip_rt_min_pmtu;
1663
1664 if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
1665 struct fib_nh *nh = &FIB_RES_NH(res);
1666 struct fib_nh_exception *fnhe;
1667
1668 spin_lock_bh(&fnhe_lock);
1669 fnhe = find_or_create_fnhe(nh, fl4->daddr);
1670 if (fnhe) {
1671 fnhe->fnhe_pmtu = mtu;
1672 fnhe->fnhe_expires = jiffies + ip_rt_mtu_expires;
1673 }
1674 spin_unlock_bh(&fnhe_lock);
1675 }
1676 rt->rt_pmtu = mtu;
1677 dst_set_expires(&rt->dst, ip_rt_mtu_expires);
1678 }
1679
1680 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1681 struct sk_buff *skb, u32 mtu)
1682 {
1683 struct rtable *rt = (struct rtable *) dst;
1684 struct flowi4 fl4;
1685
1686 ip_rt_build_flow_key(&fl4, sk, skb);
1687 __ip_rt_update_pmtu(rt, &fl4, mtu);
1688 }
1689
1690 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1691 int oif, u32 mark, u8 protocol, int flow_flags)
1692 {
1693 const struct iphdr *iph = (const struct iphdr *) skb->data;
1694 struct flowi4 fl4;
1695 struct rtable *rt;
1696
1697 __build_flow_key(&fl4, NULL, iph, oif,
1698 RT_TOS(iph->tos), protocol, mark, flow_flags);
1699 rt = __ip_route_output_key(net, &fl4);
1700 if (!IS_ERR(rt)) {
1701 __ip_rt_update_pmtu(rt, &fl4, mtu);
1702 ip_rt_put(rt);
1703 }
1704 }
1705 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1706
1707 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1708 {
1709 const struct iphdr *iph = (const struct iphdr *) skb->data;
1710 struct flowi4 fl4;
1711 struct rtable *rt;
1712
1713 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1714 rt = __ip_route_output_key(sock_net(sk), &fl4);
1715 if (!IS_ERR(rt)) {
1716 __ip_rt_update_pmtu(rt, &fl4, mtu);
1717 ip_rt_put(rt);
1718 }
1719 }
1720 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1721
1722 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1723 int oif, u32 mark, u8 protocol, int flow_flags)
1724 {
1725 const struct iphdr *iph = (const struct iphdr *) skb->data;
1726 struct flowi4 fl4;
1727 struct rtable *rt;
1728
1729 __build_flow_key(&fl4, NULL, iph, oif,
1730 RT_TOS(iph->tos), protocol, mark, flow_flags);
1731 rt = __ip_route_output_key(net, &fl4);
1732 if (!IS_ERR(rt)) {
1733 __ip_do_redirect(rt, skb, &fl4);
1734 ip_rt_put(rt);
1735 }
1736 }
1737 EXPORT_SYMBOL_GPL(ipv4_redirect);
1738
1739 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1740 {
1741 const struct iphdr *iph = (const struct iphdr *) skb->data;
1742 struct flowi4 fl4;
1743 struct rtable *rt;
1744
1745 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1746 rt = __ip_route_output_key(sock_net(sk), &fl4);
1747 if (!IS_ERR(rt)) {
1748 __ip_do_redirect(rt, skb, &fl4);
1749 ip_rt_put(rt);
1750 }
1751 }
1752 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1753
1754 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1755 {
1756 struct rtable *rt = (struct rtable *) dst;
1757
1758 if (rt_is_expired(rt))
1759 return NULL;
1760 return dst;
1761 }
1762
1763 static void ipv4_dst_destroy(struct dst_entry *dst)
1764 {
1765 struct rtable *rt = (struct rtable *) dst;
1766
1767 if (rt->fi) {
1768 fib_info_put(rt->fi);
1769 rt->fi = NULL;
1770 }
1771 }
1772
1773
1774 static void ipv4_link_failure(struct sk_buff *skb)
1775 {
1776 struct rtable *rt;
1777
1778 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1779
1780 rt = skb_rtable(skb);
1781 if (rt)
1782 dst_set_expires(&rt->dst, 0);
1783 }
1784
1785 static int ip_rt_bug(struct sk_buff *skb)
1786 {
1787 pr_debug("%s: %pI4 -> %pI4, %s\n",
1788 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1789 skb->dev ? skb->dev->name : "?");
1790 kfree_skb(skb);
1791 WARN_ON(1);
1792 return 0;
1793 }
1794
1795 /*
1796 We do not cache the source address of the outgoing interface,
1797 because it is used only by the IP RR, TS and SRR options,
1798 so it is out of the fast path.
1799
1800 BTW remember: "addr" is allowed to be unaligned
1801 in IP options!
1802 */
1803
1804 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1805 {
1806 __be32 src;
1807
1808 if (rt_is_output_route(rt))
1809 src = ip_hdr(skb)->saddr;
1810 else {
1811 struct fib_result res;
1812 struct flowi4 fl4;
1813 struct iphdr *iph;
1814
1815 iph = ip_hdr(skb);
1816
1817 memset(&fl4, 0, sizeof(fl4));
1818 fl4.daddr = iph->daddr;
1819 fl4.saddr = iph->saddr;
1820 fl4.flowi4_tos = RT_TOS(iph->tos);
1821 fl4.flowi4_oif = rt->dst.dev->ifindex;
1822 fl4.flowi4_iif = skb->dev->ifindex;
1823 fl4.flowi4_mark = skb->mark;
1824
1825 rcu_read_lock();
1826 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1827 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1828 else
1829 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1830 RT_SCOPE_UNIVERSE);
1831 rcu_read_unlock();
1832 }
1833 memcpy(addr, &src, 4);
1834 }
1835
1836 #ifdef CONFIG_IP_ROUTE_CLASSID
1837 static void set_class_tag(struct rtable *rt, u32 tag)
1838 {
1839 if (!(rt->dst.tclassid & 0xFFFF))
1840 rt->dst.tclassid |= tag & 0xFFFF;
1841 if (!(rt->dst.tclassid & 0xFFFF0000))
1842 rt->dst.tclassid |= tag & 0xFFFF0000;
1843 }
1844 #endif
1845
1846 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1847 {
1848 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1849
1850 if (advmss == 0) {
1851 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1852 ip_rt_min_advmss);
1853 if (advmss > 65535 - 40)
1854 advmss = 65535 - 40;
1855 }
1856 return advmss;
1857 }
1858
1859 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1860 {
1861 const struct rtable *rt = (const struct rtable *) dst;
1862 unsigned int mtu = rt->rt_pmtu;
1863
1864 if (mtu && time_after_eq(jiffies, rt->dst.expires))
1865 mtu = 0;
1866
1867 if (!mtu)
1868 mtu = dst_metric_raw(dst, RTAX_MTU);
1869
1870 if (mtu && rt_is_output_route(rt))
1871 return mtu;
1872
1873 mtu = dst->dev->mtu;
1874
1875 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1876
1877 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1878 mtu = 576;
1879 }
1880
1881 if (mtu > IP_MAX_MTU)
1882 mtu = IP_MAX_MTU;
1883
1884 return mtu;
1885 }
1886
1887 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1888 struct fib_info *fi)
1889 {
1890 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1891 rt->fi = fi;
1892 atomic_inc(&fi->fib_clntref);
1893 }
1894 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1895 }
1896
1897 static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr)
1898 {
1899 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1900 struct fib_nh_exception *fnhe;
1901 u32 hval;
1902
1903 hval = fnhe_hashfun(daddr);
1904
1905 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1906 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1907 if (fnhe->fnhe_daddr == daddr) {
1908 if (fnhe->fnhe_pmtu) {
1909 unsigned long expires = fnhe->fnhe_expires;
1910 unsigned long diff = expires - jiffies;
1911
1912 if (time_before(jiffies, expires)) {
1913 rt->rt_pmtu = fnhe->fnhe_pmtu;
1914 dst_set_expires(&rt->dst, diff);
1915 }
1916 }
1917 if (fnhe->fnhe_gw)
1918 rt->rt_gateway = fnhe->fnhe_gw;
1919 fnhe->fnhe_stamp = jiffies;
1920 break;
1921 }
1922 }
1923 }
1924
1925 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1926 const struct fib_result *res,
1927 struct fib_info *fi, u16 type, u32 itag)
1928 {
1929 if (fi) {
1930 struct fib_nh *nh = &FIB_RES_NH(*res);
1931
1932 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1933 rt->rt_gateway = nh->nh_gw;
1934 if (unlikely(nh->nh_exceptions))
1935 rt_bind_exception(rt, nh, fl4->daddr);
1936 rt_init_metrics(rt, fl4, fi);
1937 #ifdef CONFIG_IP_ROUTE_CLASSID
1938 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1939 #endif
1940 }
1941
1942 #ifdef CONFIG_IP_ROUTE_CLASSID
1943 #ifdef CONFIG_IP_MULTIPLE_TABLES
1944 set_class_tag(rt, res->tclassid);
1945 #endif
1946 set_class_tag(rt, itag);
1947 #endif
1948 }
1949
1950 static struct rtable *rt_dst_alloc(struct net_device *dev,
1951 bool nopolicy, bool noxfrm)
1952 {
1953 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1954 DST_HOST |
1955 (nopolicy ? DST_NOPOLICY : 0) |
1956 (noxfrm ? DST_NOXFRM : 0));
1957 }
1958
1959 /* called in rcu_read_lock() section */
1960 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1961 u8 tos, struct net_device *dev, int our)
1962 {
1963 unsigned int hash;
1964 struct rtable *rth;
1965 struct in_device *in_dev = __in_dev_get_rcu(dev);
1966 u32 itag = 0;
1967 int err;
1968
1969 /* Primary sanity checks. */
1970
1971 if (in_dev == NULL)
1972 return -EINVAL;
1973
1974 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1975 skb->protocol != htons(ETH_P_IP))
1976 goto e_inval;
1977
1978 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1979 if (ipv4_is_loopback(saddr))
1980 goto e_inval;
1981
1982 if (ipv4_is_zeronet(saddr)) {
1983 if (!ipv4_is_local_multicast(daddr))
1984 goto e_inval;
1985 } else {
1986 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1987 in_dev, &itag);
1988 if (err < 0)
1989 goto e_err;
1990 }
1991 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1992 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1993 if (!rth)
1994 goto e_nobufs;
1995
1996 #ifdef CONFIG_IP_ROUTE_CLASSID
1997 rth->dst.tclassid = itag;
1998 #endif
1999 rth->dst.output = ip_rt_bug;
2000
2001 rth->rt_key_dst = daddr;
2002 rth->rt_key_src = saddr;
2003 rth->rt_genid = rt_genid(dev_net(dev));
2004 rth->rt_flags = RTCF_MULTICAST;
2005 rth->rt_type = RTN_MULTICAST;
2006 rth->rt_key_tos = tos;
2007 rth->rt_dst = daddr;
2008 rth->rt_src = saddr;
2009 rth->rt_route_iif = dev->ifindex;
2010 rth->rt_iif = dev->ifindex;
2011 rth->rt_oif = 0;
2012 rth->rt_mark = skb->mark;
2013 rth->rt_pmtu = 0;
2014 rth->rt_gateway = daddr;
2015 rth->fi = NULL;
2016 if (our) {
2017 rth->dst.input= ip_local_deliver;
2018 rth->rt_flags |= RTCF_LOCAL;
2019 }
2020
2021 #ifdef CONFIG_IP_MROUTE
2022 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2023 rth->dst.input = ip_mr_input;
2024 #endif
2025 RT_CACHE_STAT_INC(in_slow_mc);
2026
2027 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2028 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2029 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2030
2031 e_nobufs:
2032 return -ENOBUFS;
2033 e_inval:
2034 return -EINVAL;
2035 e_err:
2036 return err;
2037 }
2038
2039
2040 static void ip_handle_martian_source(struct net_device *dev,
2041 struct in_device *in_dev,
2042 struct sk_buff *skb,
2043 __be32 daddr,
2044 __be32 saddr)
2045 {
2046 RT_CACHE_STAT_INC(in_martian_src);
2047 #ifdef CONFIG_IP_ROUTE_VERBOSE
2048 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2049 /*
2050 * RFC 1812 recommendation: if the source is martian,
2051 * the only hint is the MAC header.
2052 */
2053 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2054 &daddr, &saddr, dev->name);
2055 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2056 print_hex_dump(KERN_WARNING, "ll header: ",
2057 DUMP_PREFIX_OFFSET, 16, 1,
2058 skb_mac_header(skb),
2059 dev->hard_header_len, true);
2060 }
2061 }
2062 #endif
2063 }
2064
2065 /* called in rcu_read_lock() section */
2066 static int __mkroute_input(struct sk_buff *skb,
2067 const struct fib_result *res,
2068 struct in_device *in_dev,
2069 __be32 daddr, __be32 saddr, u32 tos,
2070 struct rtable **result)
2071 {
2072 struct rtable *rth;
2073 int err;
2074 struct in_device *out_dev;
2075 unsigned int flags = 0;
2076 u32 itag;
2077
2078 /* get a working reference to the output device */
2079 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2080 if (out_dev == NULL) {
2081 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
2082 return -EINVAL;
2083 }
2084
2085
2086 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2087 in_dev->dev, in_dev, &itag);
2088 if (err < 0) {
2089 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2090 saddr);
2091
2092 goto cleanup;
2093 }
2094
2095 if (err)
2096 flags |= RTCF_DIRECTSRC;
2097
2098 if (out_dev == in_dev && err &&
2099 (IN_DEV_SHARED_MEDIA(out_dev) ||
2100 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2101 flags |= RTCF_DOREDIRECT;
2102
2103 if (skb->protocol != htons(ETH_P_IP)) {
2104 /* Not IP (i.e. ARP). Do not create a route if it is
2105 * invalid for proxy ARP. DNAT routes are always valid.
2106 *
2107 * The proxy ARP feature has been extended to allow ARP
2108 * replies back on the same interface, to support
2109 * Private VLAN switch technologies. See arp.c.
2110 */
2111 if (out_dev == in_dev &&
2112 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2113 err = -EINVAL;
2114 goto cleanup;
2115 }
2116 }
2117
2118 rth = rt_dst_alloc(out_dev->dev,
2119 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2120 IN_DEV_CONF_GET(out_dev, NOXFRM));
2121 if (!rth) {
2122 err = -ENOBUFS;
2123 goto cleanup;
2124 }
2125
2126 rth->rt_key_dst = daddr;
2127 rth->rt_key_src = saddr;
2128 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2129 rth->rt_flags = flags;
2130 rth->rt_type = res->type;
2131 rth->rt_key_tos = tos;
2132 rth->rt_dst = daddr;
2133 rth->rt_src = saddr;
2134 rth->rt_route_iif = in_dev->dev->ifindex;
2135 rth->rt_iif = in_dev->dev->ifindex;
2136 rth->rt_oif = 0;
2137 rth->rt_mark = skb->mark;
2138 rth->rt_pmtu = 0;
2139 rth->rt_gateway = daddr;
2140 rth->fi = NULL;
2141
2142 rth->dst.input = ip_forward;
2143 rth->dst.output = ip_output;
2144
2145 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2146
2147 *result = rth;
2148 err = 0;
2149 cleanup:
2150 return err;
2151 }
2152
2153 static int ip_mkroute_input(struct sk_buff *skb,
2154 struct fib_result *res,
2155 const struct flowi4 *fl4,
2156 struct in_device *in_dev,
2157 __be32 daddr, __be32 saddr, u32 tos)
2158 {
2159 struct rtable *rth = NULL;
2160 int err;
2161 unsigned int hash;
2162
2163 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2164 if (res->fi && res->fi->fib_nhs > 1)
2165 fib_select_multipath(res);
2166 #endif
2167
2168 /* create a routing cache entry */
2169 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2170 if (err)
2171 return err;
2172
2173 /* put it into the cache */
2174 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2175 rt_genid(dev_net(rth->dst.dev)));
2176 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2177 if (IS_ERR(rth))
2178 return PTR_ERR(rth);
2179 return 0;
2180 }
2181
2182 /*
2183 * NOTE. We drop all packets that have a local source
2184 * address, because every properly looped-back packet
2185 * must already have the correct destination attached by the output routine.
2186 *
2187 * This approach solves two big problems:
2188 * 1. Non-simplex devices are handled properly.
2189 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2190 * called with rcu_read_lock()
2191 */
2192
2193 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2194 u8 tos, struct net_device *dev)
2195 {
2196 struct fib_result res;
2197 struct in_device *in_dev = __in_dev_get_rcu(dev);
2198 struct flowi4 fl4;
2199 unsigned int flags = 0;
2200 u32 itag = 0;
2201 struct rtable *rth;
2202 unsigned int hash;
2203 int err = -EINVAL;
2204 struct net *net = dev_net(dev);
2205
2206 /* IP on this device is disabled. */
2207
2208 if (!in_dev)
2209 goto out;
2210
2211 /* Check for the weirdest martians, which cannot be detected
2212 by fib_lookup.
2213 */
2214
2215 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2216 goto martian_source;
2217
2218 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2219 goto brd_input;
2220
2221 /* Accept zero addresses only for limited broadcast;
2222 * I do not even know whether to fix it or not. Waiting for complaints :-)
2223 */
2224 if (ipv4_is_zeronet(saddr))
2225 goto martian_source;
2226
2227 if (ipv4_is_zeronet(daddr))
2228 goto martian_destination;
2229
2230 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
2231 if (ipv4_is_loopback(daddr))
2232 goto martian_destination;
2233
2234 if (ipv4_is_loopback(saddr))
2235 goto martian_source;
2236 }
2237
2238 /*
2239 * Now we are ready to route packet.
2240 */
2241 fl4.flowi4_oif = 0;
2242 fl4.flowi4_iif = dev->ifindex;
2243 fl4.flowi4_mark = skb->mark;
2244 fl4.flowi4_tos = tos;
2245 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2246 fl4.daddr = daddr;
2247 fl4.saddr = saddr;
2248 err = fib_lookup(net, &fl4, &res);
2249 if (err != 0)
2250 goto no_route;
2251
2252 RT_CACHE_STAT_INC(in_slow_tot);
2253
2254 if (res.type == RTN_BROADCAST)
2255 goto brd_input;
2256
2257 if (res.type == RTN_LOCAL) {
2258 err = fib_validate_source(skb, saddr, daddr, tos,
2259 net->loopback_dev->ifindex,
2260 dev, in_dev, &itag);
2261 if (err < 0)
2262 goto martian_source_keep_err;
2263 if (err)
2264 flags |= RTCF_DIRECTSRC;
2265 goto local_input;
2266 }
2267
2268 if (!IN_DEV_FORWARD(in_dev))
2269 goto no_route;
2270 if (res.type != RTN_UNICAST)
2271 goto martian_destination;
2272
2273 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2274 out: return err;
2275
2276 brd_input:
2277 if (skb->protocol != htons(ETH_P_IP))
2278 goto e_inval;
2279
2280 if (!ipv4_is_zeronet(saddr)) {
2281 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2282 in_dev, &itag);
2283 if (err < 0)
2284 goto martian_source_keep_err;
2285 if (err)
2286 flags |= RTCF_DIRECTSRC;
2287 }
2288 flags |= RTCF_BROADCAST;
2289 res.type = RTN_BROADCAST;
2290 RT_CACHE_STAT_INC(in_brd);
2291
2292 local_input:
2293 rth = rt_dst_alloc(net->loopback_dev,
2294 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2295 if (!rth)
2296 goto e_nobufs;
2297
2298 rth->dst.input= ip_local_deliver;
2299 rth->dst.output= ip_rt_bug;
2300 #ifdef CONFIG_IP_ROUTE_CLASSID
2301 rth->dst.tclassid = itag;
2302 #endif
2303
2304 rth->rt_key_dst = daddr;
2305 rth->rt_key_src = saddr;
2306 rth->rt_genid = rt_genid(net);
2307 rth->rt_flags = flags|RTCF_LOCAL;
2308 rth->rt_type = res.type;
2309 rth->rt_key_tos = tos;
2310 rth->rt_dst = daddr;
2311 rth->rt_src = saddr;
2312 rth->rt_route_iif = dev->ifindex;
2313 rth->rt_iif = dev->ifindex;
2314 rth->rt_oif = 0;
2315 rth->rt_mark = skb->mark;
2316 rth->rt_pmtu = 0;
2317 rth->rt_gateway = daddr;
2318 rth->fi = NULL;
2319 if (res.type == RTN_UNREACHABLE) {
2320 rth->dst.input= ip_error;
2321 rth->dst.error= -err;
2322 rth->rt_flags &= ~RTCF_LOCAL;
2323 }
2324 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2325 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2326 err = 0;
2327 if (IS_ERR(rth))
2328 err = PTR_ERR(rth);
2329 goto out;
2330
2331 no_route:
2332 RT_CACHE_STAT_INC(in_no_route);
2333 res.type = RTN_UNREACHABLE;
2334 if (err == -ESRCH)
2335 err = -ENETUNREACH;
2336 goto local_input;
2337
2338 /*
2339 * Do not cache martian addresses: they should be logged (RFC1812)
2340 */
2341 martian_destination:
2342 RT_CACHE_STAT_INC(in_martian_dst);
2343 #ifdef CONFIG_IP_ROUTE_VERBOSE
2344 if (IN_DEV_LOG_MARTIANS(in_dev))
2345 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2346 &daddr, &saddr, dev->name);
2347 #endif
2348
2349 e_inval:
2350 err = -EINVAL;
2351 goto out;
2352
2353 e_nobufs:
2354 err = -ENOBUFS;
2355 goto out;
2356
2357 martian_source:
2358 err = -EINVAL;
2359 martian_source_keep_err:
2360 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2361 goto out;
2362 }
2363
2364 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2365 u8 tos, struct net_device *dev, bool noref)
2366 {
2367 struct rtable *rth;
2368 unsigned int hash;
2369 int iif = dev->ifindex;
2370 struct net *net;
2371 int res;
2372
2373 net = dev_net(dev);
2374
2375 rcu_read_lock();
2376
2377 if (!rt_caching(net))
2378 goto skip_cache;
2379
2380 tos &= IPTOS_RT_MASK;
2381 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2382
2383 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2384 rth = rcu_dereference(rth->dst.rt_next)) {
2385 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2386 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2387 (rth->rt_route_iif ^ iif) |
2388 (rth->rt_key_tos ^ tos)) == 0 &&
2389 rth->rt_mark == skb->mark &&
2390 net_eq(dev_net(rth->dst.dev), net) &&
2391 !rt_is_expired(rth)) {
2392 if (noref) {
2393 dst_use_noref(&rth->dst, jiffies);
2394 skb_dst_set_noref(skb, &rth->dst);
2395 } else {
2396 dst_use(&rth->dst, jiffies);
2397 skb_dst_set(skb, &rth->dst);
2398 }
2399 RT_CACHE_STAT_INC(in_hit);
2400 rcu_read_unlock();
2401 return 0;
2402 }
2403 RT_CACHE_STAT_INC(in_hlist_search);
2404 }
2405
2406 skip_cache:
2407 /* Multicast recognition logic has been moved from the route cache to here.
2408 The problem was that too many Ethernet cards have broken/missing
2409 hardware multicast filters :-( As a result, a host on a multicast
2410 network acquires a lot of useless route cache entries, e.g. for
2411 SDR messages from all over the world. Now we try to get rid of them.
2412 Really, provided the software IP multicast filter is organized
2413 reasonably (at least, hashed), this does not result in a slowdown
2414 compared with route cache reject entries.
2415 Note that multicast routers are not affected, because a
2416 route cache entry is created for them eventually.
2417 */
2418 if (ipv4_is_multicast(daddr)) {
2419 struct in_device *in_dev = __in_dev_get_rcu(dev);
2420
2421 if (in_dev) {
2422 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2423 ip_hdr(skb)->protocol);
2424 if (our
2425 #ifdef CONFIG_IP_MROUTE
2426 ||
2427 (!ipv4_is_local_multicast(daddr) &&
2428 IN_DEV_MFORWARD(in_dev))
2429 #endif
2430 ) {
2431 int res = ip_route_input_mc(skb, daddr, saddr,
2432 tos, dev, our);
2433 rcu_read_unlock();
2434 return res;
2435 }
2436 }
2437 rcu_read_unlock();
2438 return -EINVAL;
2439 }
2440 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2441 rcu_read_unlock();
2442 return res;
2443 }
2444 EXPORT_SYMBOL(ip_route_input_common);
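/*
 * Example: a minimal sketch of how a receive path typically resolves the
 * route for an incoming packet, assuming the ip_route_input()/
 * ip_route_input_noref() wrappers that forward to ip_route_input_common()
 * with noref set to false/true respectively (example_rcv_finish, iph and
 * err below are illustrative names only):
 *
 *	static int example_rcv_finish(struct sk_buff *skb)
 *	{
 *		const struct iphdr *iph = ip_hdr(skb);
 *		int err;
 *
 *		if (!skb_dst(skb)) {
 *			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *						   iph->tos, skb->dev);
 *			if (unlikely(err))
 *				return err;
 *		}
 *		return dst_input(skb);
 *	}
 *
 * On success the skb carries a dst whose ->input hook (ip_local_deliver,
 * ip_forward or ip_mr_input, as set up above) is invoked by dst_input().
 */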
2445
2446 /* called with rcu_read_lock() */
2447 static struct rtable *__mkroute_output(const struct fib_result *res,
2448 const struct flowi4 *fl4,
2449 __be32 orig_daddr, __be32 orig_saddr,
2450 int orig_oif, __u8 orig_rtos,
2451 struct net_device *dev_out,
2452 unsigned int flags)
2453 {
2454 struct fib_info *fi = res->fi;
2455 struct in_device *in_dev;
2456 u16 type = res->type;
2457 struct rtable *rth;
2458
2459 in_dev = __in_dev_get_rcu(dev_out);
2460 if (!in_dev)
2461 return ERR_PTR(-EINVAL);
2462
2463 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2464 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2465 return ERR_PTR(-EINVAL);
2466
2467 if (ipv4_is_lbcast(fl4->daddr))
2468 type = RTN_BROADCAST;
2469 else if (ipv4_is_multicast(fl4->daddr))
2470 type = RTN_MULTICAST;
2471 else if (ipv4_is_zeronet(fl4->daddr))
2472 return ERR_PTR(-EINVAL);
2473
2474 if (dev_out->flags & IFF_LOOPBACK)
2475 flags |= RTCF_LOCAL;
2476
2477 if (type == RTN_BROADCAST) {
2478 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2479 fi = NULL;
2480 } else if (type == RTN_MULTICAST) {
2481 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2482 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2483 fl4->flowi4_proto))
2484 flags &= ~RTCF_LOCAL;
2485 /* If a multicast route does not exist, use the
2486 * default one, but do not use a gateway in this case.
2487 * Yes, it is a hack.
2488 */
2489 if (fi && res->prefixlen < 4)
2490 fi = NULL;
2491 }
2492
2493 rth = rt_dst_alloc(dev_out,
2494 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2495 IN_DEV_CONF_GET(in_dev, NOXFRM));
2496 if (!rth)
2497 return ERR_PTR(-ENOBUFS);
2498
2499 rth->dst.output = ip_output;
2500
2501 rth->rt_key_dst = orig_daddr;
2502 rth->rt_key_src = orig_saddr;
2503 rth->rt_genid = rt_genid(dev_net(dev_out));
2504 rth->rt_flags = flags;
2505 rth->rt_type = type;
2506 rth->rt_key_tos = orig_rtos;
2507 rth->rt_dst = fl4->daddr;
2508 rth->rt_src = fl4->saddr;
2509 rth->rt_route_iif = 0;
2510 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2511 rth->rt_oif = orig_oif;
2512 rth->rt_mark = fl4->flowi4_mark;
2513 rth->rt_pmtu = 0;
2514 rth->rt_gateway = fl4->daddr;
2515 rth->fi = NULL;
2516
2517 RT_CACHE_STAT_INC(out_slow_tot);
2518
2519 if (flags & RTCF_LOCAL)
2520 rth->dst.input = ip_local_deliver;
2521 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2522 if (flags & RTCF_LOCAL &&
2523 !(dev_out->flags & IFF_LOOPBACK)) {
2524 rth->dst.output = ip_mc_output;
2525 RT_CACHE_STAT_INC(out_slow_mc);
2526 }
2527 #ifdef CONFIG_IP_MROUTE
2528 if (type == RTN_MULTICAST) {
2529 if (IN_DEV_MFORWARD(in_dev) &&
2530 !ipv4_is_local_multicast(fl4->daddr)) {
2531 rth->dst.input = ip_mr_input;
2532 rth->dst.output = ip_mc_output;
2533 }
2534 }
2535 #endif
2536 }
2537
2538 rt_set_nexthop(rth, fl4, res, fi, type, 0);
2539
2540 if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
2541 rth->dst.flags |= DST_NOCACHE;
2542
2543 return rth;
2544 }
2545
2546 /*
2547 * Major route resolver routine.
2548 * called with rcu_read_lock();
2549 */
2550
2551 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2552 {
2553 struct net_device *dev_out = NULL;
2554 __u8 tos = RT_FL_TOS(fl4);
2555 unsigned int flags = 0;
2556 struct fib_result res;
2557 struct rtable *rth;
2558 __be32 orig_daddr;
2559 __be32 orig_saddr;
2560 int orig_oif;
2561
2562 res.tclassid = 0;
2563 res.fi = NULL;
2564 res.table = NULL;
2565
2566 orig_daddr = fl4->daddr;
2567 orig_saddr = fl4->saddr;
2568 orig_oif = fl4->flowi4_oif;
2569
2570 fl4->flowi4_iif = net->loopback_dev->ifindex;
2571 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2572 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2573 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2574
2575 rcu_read_lock();
2576 if (fl4->saddr) {
2577 rth = ERR_PTR(-EINVAL);
2578 if (ipv4_is_multicast(fl4->saddr) ||
2579 ipv4_is_lbcast(fl4->saddr) ||
2580 ipv4_is_zeronet(fl4->saddr))
2581 goto out;
2582
2583 /* I removed the check for oif == dev_out->oif here.
2584 It was wrong for two reasons:
2585 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2586 is assigned to multiple interfaces.
2587 2. Moreover, we are allowed to send packets with the saddr
2588 of another iface. --ANK
2589 */
2590
2591 if (fl4->flowi4_oif == 0 &&
2592 (ipv4_is_multicast(fl4->daddr) ||
2593 ipv4_is_lbcast(fl4->daddr))) {
2594 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2595 dev_out = __ip_dev_find(net, fl4->saddr, false);
2596 if (dev_out == NULL)
2597 goto out;
2598
2599 /* Special hack: the user can direct multicasts
2600 and limited broadcast via the necessary interface
2601 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2602 This hack is not just for fun, it allows
2603 vic, vat and friends to work.
2604 They bind the socket to loopback, set the ttl to zero
2605 and expect that it will work.
2606 From the viewpoint of the routing cache they are broken,
2607 because we are not allowed to build a multicast path
2608 with a loopback source addr (look, the routing cache
2609 cannot know that the ttl is zero, so that the packet
2610 will not leave this host and the route is valid).
2611 Luckily, this hack is a good workaround.
2612 */
2613
2614 fl4->flowi4_oif = dev_out->ifindex;
2615 goto make_route;
2616 }
2617
2618 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2619 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2620 if (!__ip_dev_find(net, fl4->saddr, false))
2621 goto out;
2622 }
2623 }
2624
2625
2626 if (fl4->flowi4_oif) {
2627 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2628 rth = ERR_PTR(-ENODEV);
2629 if (dev_out == NULL)
2630 goto out;
2631
2632 /* RACE: Check return value of inet_select_addr instead. */
2633 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2634 rth = ERR_PTR(-ENETUNREACH);
2635 goto out;
2636 }
2637 if (ipv4_is_local_multicast(fl4->daddr) ||
2638 ipv4_is_lbcast(fl4->daddr)) {
2639 if (!fl4->saddr)
2640 fl4->saddr = inet_select_addr(dev_out, 0,
2641 RT_SCOPE_LINK);
2642 goto make_route;
2643 }
2644 if (fl4->saddr) {
2645 if (ipv4_is_multicast(fl4->daddr))
2646 fl4->saddr = inet_select_addr(dev_out, 0,
2647 fl4->flowi4_scope);
2648 else if (!fl4->daddr)
2649 fl4->saddr = inet_select_addr(dev_out, 0,
2650 RT_SCOPE_HOST);
2651 }
2652 }
2653
2654 if (!fl4->daddr) {
2655 fl4->daddr = fl4->saddr;
2656 if (!fl4->daddr)
2657 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2658 dev_out = net->loopback_dev;
2659 fl4->flowi4_oif = net->loopback_dev->ifindex;
2660 res.type = RTN_LOCAL;
2661 flags |= RTCF_LOCAL;
2662 goto make_route;
2663 }
2664
2665 if (fib_lookup(net, fl4, &res)) {
2666 res.fi = NULL;
2667 res.table = NULL;
2668 if (fl4->flowi4_oif) {
2669 /* Apparently, the routing tables are wrong. Assume
2670 that the destination is on-link.
2671
2672 WHY? DW.
2673 Because we are allowed to send to an iface
2674 even if it has NO routes and NO assigned
2675 addresses. When oif is specified, the routing
2676 tables are looked up with only one purpose:
2677 to check whether the destination is gatewayed, rather than
2678 direct. Moreover, if MSG_DONTROUTE is set,
2679 we send the packet, ignoring both routing tables
2680 and ifaddr state. --ANK
2681
2682
2683 We could do this even if oif is unknown
2684 (IPv6 likely does), but we do not.
2685 */
2686
2687 if (fl4->saddr == 0)
2688 fl4->saddr = inet_select_addr(dev_out, 0,
2689 RT_SCOPE_LINK);
2690 res.type = RTN_UNICAST;
2691 goto make_route;
2692 }
2693 rth = ERR_PTR(-ENETUNREACH);
2694 goto out;
2695 }
2696
2697 if (res.type == RTN_LOCAL) {
2698 if (!fl4->saddr) {
2699 if (res.fi->fib_prefsrc)
2700 fl4->saddr = res.fi->fib_prefsrc;
2701 else
2702 fl4->saddr = fl4->daddr;
2703 }
2704 dev_out = net->loopback_dev;
2705 fl4->flowi4_oif = dev_out->ifindex;
2706 res.fi = NULL;
2707 flags |= RTCF_LOCAL;
2708 goto make_route;
2709 }
2710
2711 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2712 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2713 fib_select_multipath(&res);
2714 else
2715 #endif
2716 if (!res.prefixlen &&
2717 res.table->tb_num_default > 1 &&
2718 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2719 fib_select_default(&res);
2720
2721 if (!fl4->saddr)
2722 fl4->saddr = FIB_RES_PREFSRC(net, res);
2723
2724 dev_out = FIB_RES_DEV(res);
2725 fl4->flowi4_oif = dev_out->ifindex;
2726
2727
2728 make_route:
2729 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2730 tos, dev_out, flags);
2731 if (!IS_ERR(rth)) {
2732 unsigned int hash;
2733
2734 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2735 rt_genid(dev_net(dev_out)));
2736 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2737 }
2738
2739 out:
2740 rcu_read_unlock();
2741 return rth;
2742 }
2743
2744 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2745 {
2746 struct rtable *rth;
2747 unsigned int hash;
2748
2749 if (!rt_caching(net))
2750 goto slow_output;
2751
2752 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2753
2754 rcu_read_lock_bh();
2755 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2756 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2757 if (rth->rt_key_dst == flp4->daddr &&
2758 rth->rt_key_src == flp4->saddr &&
2759 rt_is_output_route(rth) &&
2760 rth->rt_oif == flp4->flowi4_oif &&
2761 rth->rt_mark == flp4->flowi4_mark &&
2762 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2763 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2764 net_eq(dev_net(rth->dst.dev), net) &&
2765 !rt_is_expired(rth)) {
2766 dst_use(&rth->dst, jiffies);
2767 RT_CACHE_STAT_INC(out_hit);
2768 rcu_read_unlock_bh();
2769 if (!flp4->saddr)
2770 flp4->saddr = rth->rt_src;
2771 if (!flp4->daddr)
2772 flp4->daddr = rth->rt_dst;
2773 return rth;
2774 }
2775 RT_CACHE_STAT_INC(out_hlist_search);
2776 }
2777 rcu_read_unlock_bh();
2778
2779 slow_output:
2780 return ip_route_output_slow(net, flp4);
2781 }
2782 EXPORT_SYMBOL_GPL(__ip_route_output_key);
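/*
 * Example: a minimal sketch of an output route lookup through this entry
 * point, mirroring the flowi4 built in inet_rtm_getroute() below; net,
 * daddr and tos stand for caller-supplied values and are illustrative only:
 *
 *	struct flowi4 fl4 = {
 *		.daddr       = daddr,
 *		.flowi4_tos  = RT_TOS(tos),
 *		.flowi4_oif  = 0,
 *		.flowi4_mark = 0,
 *	};
 *	struct rtable *rt = __ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 *
 * A cache hit returns a referenced rtable and back-fills fl4->saddr/daddr;
 * otherwise ip_route_output_slow() builds and interns a new entry.
 */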
2783
2784 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2785 {
2786 return NULL;
2787 }
2788
2789 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2790 {
2791 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2792
2793 return mtu ? : dst->dev->mtu;
2794 }
2795
2796 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2797 struct sk_buff *skb, u32 mtu)
2798 {
2799 }
2800
2801 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2802 struct sk_buff *skb)
2803 {
2804 }
2805
2806 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2807 unsigned long old)
2808 {
2809 return NULL;
2810 }
2811
2812 static struct dst_ops ipv4_dst_blackhole_ops = {
2813 .family = AF_INET,
2814 .protocol = cpu_to_be16(ETH_P_IP),
2815 .destroy = ipv4_dst_destroy,
2816 .check = ipv4_blackhole_dst_check,
2817 .mtu = ipv4_blackhole_mtu,
2818 .default_advmss = ipv4_default_advmss,
2819 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2820 .redirect = ipv4_rt_blackhole_redirect,
2821 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2822 .neigh_lookup = ipv4_neigh_lookup,
2823 };
2824
2825 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2826 {
2827 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2828 struct rtable *ort = (struct rtable *) dst_orig;
2829
2830 if (rt) {
2831 struct dst_entry *new = &rt->dst;
2832
2833 new->__use = 1;
2834 new->input = dst_discard;
2835 new->output = dst_discard;
2836
2837 new->dev = ort->dst.dev;
2838 if (new->dev)
2839 dev_hold(new->dev);
2840
2841 rt->rt_key_dst = ort->rt_key_dst;
2842 rt->rt_key_src = ort->rt_key_src;
2843 rt->rt_key_tos = ort->rt_key_tos;
2844 rt->rt_route_iif = ort->rt_route_iif;
2845 rt->rt_iif = ort->rt_iif;
2846 rt->rt_oif = ort->rt_oif;
2847 rt->rt_mark = ort->rt_mark;
2848 rt->rt_pmtu = ort->rt_pmtu;
2849
2850 rt->rt_genid = rt_genid(net);
2851 rt->rt_flags = ort->rt_flags;
2852 rt->rt_type = ort->rt_type;
2853 rt->rt_dst = ort->rt_dst;
2854 rt->rt_src = ort->rt_src;
2855 rt->rt_gateway = ort->rt_gateway;
2856 rt->fi = ort->fi;
2857 if (rt->fi)
2858 atomic_inc(&rt->fi->fib_clntref);
2859
2860 dst_free(new);
2861 }
2862
2863 dst_release(dst_orig);
2864
2865 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2866 }
2867
2868 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2869 struct sock *sk)
2870 {
2871 struct rtable *rt = __ip_route_output_key(net, flp4);
2872
2873 if (IS_ERR(rt))
2874 return rt;
2875
2876 if (flp4->flowi4_proto)
2877 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2878 flowi4_to_flowi(flp4),
2879 sk, 0);
2880
2881 return rt;
2882 }
2883 EXPORT_SYMBOL_GPL(ip_route_output_flow);
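/*
 * Example: the only difference from __ip_route_output_key() is the extra
 * xfrm_lookup() pass when a protocol is set, so IPsec policy may replace
 * the dst. A minimal sketch, assuming fl4, sk and net are set up as in the
 * previous example and sport/dport are caller-supplied ports (fl4_sport/
 * fl4_dport are the flowi4 port fields):
 *
 *	fl4.flowi4_proto = IPPROTO_UDP;
 *	fl4.fl4_sport    = sport;
 *	fl4.fl4_dport    = dport;
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */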
2884
2885 static int rt_fill_info(struct net *net,
2886 struct sk_buff *skb, u32 pid, u32 seq, int event,
2887 int nowait, unsigned int flags)
2888 {
2889 struct rtable *rt = skb_rtable(skb);
2890 struct rtmsg *r;
2891 struct nlmsghdr *nlh;
2892 unsigned long expires = 0;
2893 u32 error;
2894
2895 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2896 if (nlh == NULL)
2897 return -EMSGSIZE;
2898
2899 r = nlmsg_data(nlh);
2900 r->rtm_family = AF_INET;
2901 r->rtm_dst_len = 32;
2902 r->rtm_src_len = 0;
2903 r->rtm_tos = rt->rt_key_tos;
2904 r->rtm_table = RT_TABLE_MAIN;
2905 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2906 goto nla_put_failure;
2907 r->rtm_type = rt->rt_type;
2908 r->rtm_scope = RT_SCOPE_UNIVERSE;
2909 r->rtm_protocol = RTPROT_UNSPEC;
2910 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2911 if (rt->rt_flags & RTCF_NOTIFY)
2912 r->rtm_flags |= RTM_F_NOTIFY;
2913
2914 if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2915 goto nla_put_failure;
2916 if (rt->rt_key_src) {
2917 r->rtm_src_len = 32;
2918 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2919 goto nla_put_failure;
2920 }
2921 if (rt->dst.dev &&
2922 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2923 goto nla_put_failure;
2924 #ifdef CONFIG_IP_ROUTE_CLASSID
2925 if (rt->dst.tclassid &&
2926 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2927 goto nla_put_failure;
2928 #endif
2929 if (!rt_is_input_route(rt) &&
2930 rt->rt_src != rt->rt_key_src) {
2931 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2932 goto nla_put_failure;
2933 }
2934 if (rt->rt_dst != rt->rt_gateway &&
2935 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2936 goto nla_put_failure;
2937
2938 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2939 goto nla_put_failure;
2940
2941 if (rt->rt_mark &&
2942 nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2943 goto nla_put_failure;
2944
2945 error = rt->dst.error;
2946 expires = rt->dst.expires;
2947 if (expires) {
2948 if (time_before(jiffies, expires))
2949 expires -= jiffies;
2950 else
2951 expires = 0;
2952 }
2953
2954 if (rt_is_input_route(rt)) {
2955 #ifdef CONFIG_IP_MROUTE
2956 __be32 dst = rt->rt_dst;
2957
2958 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2959 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2960 int err = ipmr_get_route(net, skb,
2961 rt->rt_src, rt->rt_dst,
2962 r, nowait);
2963 if (err <= 0) {
2964 if (!nowait) {
2965 if (err == 0)
2966 return 0;
2967 goto nla_put_failure;
2968 } else {
2969 if (err == -EMSGSIZE)
2970 goto nla_put_failure;
2971 error = err;
2972 }
2973 }
2974 } else
2975 #endif
2976 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2977 goto nla_put_failure;
2978 }
2979
2980 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2981 goto nla_put_failure;
2982
2983 return nlmsg_end(skb, nlh);
2984
2985 nla_put_failure:
2986 nlmsg_cancel(skb, nlh);
2987 return -EMSGSIZE;
2988 }
2989
2990 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2991 {
2992 struct net *net = sock_net(in_skb->sk);
2993 struct rtmsg *rtm;
2994 struct nlattr *tb[RTA_MAX+1];
2995 struct rtable *rt = NULL;
2996 __be32 dst = 0;
2997 __be32 src = 0;
2998 u32 iif;
2999 int err;
3000 int mark;
3001 struct sk_buff *skb;
3002
3003 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3004 if (err < 0)
3005 goto errout;
3006
3007 rtm = nlmsg_data(nlh);
3008
3009 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3010 if (skb == NULL) {
3011 err = -ENOBUFS;
3012 goto errout;
3013 }
3014
3015 /* Reserve room for dummy headers; this skb can pass
3016 through a good chunk of the routing engine.
3017 */
3018 skb_reset_mac_header(skb);
3019 skb_reset_network_header(skb);
3020
3021 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3022 ip_hdr(skb)->protocol = IPPROTO_ICMP;
3023 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3024
3025 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3026 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3027 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3028 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3029
3030 if (iif) {
3031 struct net_device *dev;
3032
3033 dev = __dev_get_by_index(net, iif);
3034 if (dev == NULL) {
3035 err = -ENODEV;
3036 goto errout_free;
3037 }
3038
3039 skb->protocol = htons(ETH_P_IP);
3040 skb->dev = dev;
3041 skb->mark = mark;
3042 local_bh_disable();
3043 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3044 local_bh_enable();
3045
3046 rt = skb_rtable(skb);
3047 if (err == 0 && rt->dst.error)
3048 err = -rt->dst.error;
3049 } else {
3050 struct flowi4 fl4 = {
3051 .daddr = dst,
3052 .saddr = src,
3053 .flowi4_tos = rtm->rtm_tos,
3054 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3055 .flowi4_mark = mark,
3056 };
3057 rt = ip_route_output_key(net, &fl4);
3058
3059 err = 0;
3060 if (IS_ERR(rt))
3061 err = PTR_ERR(rt);
3062 }
3063
3064 if (err)
3065 goto errout_free;
3066
3067 skb_dst_set(skb, &rt->dst);
3068 if (rtm->rtm_flags & RTM_F_NOTIFY)
3069 rt->rt_flags |= RTCF_NOTIFY;
3070
3071 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3072 RTM_NEWROUTE, 0, 0);
3073 if (err <= 0)
3074 goto errout_free;
3075
3076 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3077 errout:
3078 return err;
3079
3080 errout_free:
3081 kfree_skb(skb);
3082 goto errout;
3083 }
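/*
 * Example: a minimal user-space sketch of the RTM_GETROUTE request that
 * inet_rtm_getroute() answers with an RTM_NEWROUTE message built by
 * rt_fill_info(). Error handling and reading the reply are omitted, and
 * the destination address is illustrative only:
 *
 *	#include <arpa/inet.h>
 *	#include <linux/rtnetlink.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		struct {
 *			struct nlmsghdr nlh;
 *			struct rtmsg    rtm;
 *			char            buf[64];
 *		} req;
 *		struct rtattr *rta;
 *		struct in_addr dst;
 *		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
 *
 *		inet_pton(AF_INET, "192.0.2.1", &dst);
 *
 *		memset(&req, 0, sizeof(req));
 *		req.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(struct rtmsg));
 *		req.nlh.nlmsg_type  = RTM_GETROUTE;
 *		req.nlh.nlmsg_flags = NLM_F_REQUEST;
 *		req.rtm.rtm_family  = AF_INET;
 *
 *		rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
 *		rta->rta_type = RTA_DST;
 *		rta->rta_len  = RTA_LENGTH(sizeof(dst));
 *		memcpy(RTA_DATA(rta), &dst, sizeof(dst));
 *		req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + rta->rta_len;
 *
 *		send(fd, &req, req.nlh.nlmsg_len, 0);
 *		close(fd);
 *		return 0;
 *	}
 *
 * "ip route get 192.0.2.1" issues essentially the same request.
 */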
3084
3085 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3086 {
3087 struct rtable *rt;
3088 int h, s_h;
3089 int idx, s_idx;
3090 struct net *net;
3091
3092 net = sock_net(skb->sk);
3093
3094 s_h = cb->args[0];
3095 if (s_h < 0)
3096 s_h = 0;
3097 s_idx = idx = cb->args[1];
3098 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3099 if (!rt_hash_table[h].chain)
3100 continue;
3101 rcu_read_lock_bh();
3102 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3103 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3104 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3105 continue;
3106 if (rt_is_expired(rt))
3107 continue;
3108 skb_dst_set_noref(skb, &rt->dst);
3109 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3110 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3111 1, NLM_F_MULTI) <= 0) {
3112 skb_dst_drop(skb);
3113 rcu_read_unlock_bh();
3114 goto done;
3115 }
3116 skb_dst_drop(skb);
3117 }
3118 rcu_read_unlock_bh();
3119 }
3120
3121 done:
3122 cb->args[0] = h;
3123 cb->args[1] = idx;
3124 return skb->len;
3125 }
3126
3127 void ip_rt_multicast_event(struct in_device *in_dev)
3128 {
3129 rt_cache_flush(dev_net(in_dev->dev), 0);
3130 }
3131
3132 #ifdef CONFIG_SYSCTL
3133 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3134 void __user *buffer,
3135 size_t *lenp, loff_t *ppos)
3136 {
3137 if (write) {
3138 int flush_delay;
3139 ctl_table ctl;
3140 struct net *net;
3141
3142 memcpy(&ctl, __ctl, sizeof(ctl));
3143 ctl.data = &flush_delay;
3144 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3145
3146 net = (struct net *)__ctl->extra1;
3147 rt_cache_flush(net, flush_delay);
3148 return 0;
3149 }
3150
3151 return -EINVAL;
3152 }
3153
3154 static ctl_table ipv4_route_table[] = {
3155 {
3156 .procname = "gc_thresh",
3157 .data = &ipv4_dst_ops.gc_thresh,
3158 .maxlen = sizeof(int),
3159 .mode = 0644,
3160 .proc_handler = proc_dointvec,
3161 },
3162 {
3163 .procname = "max_size",
3164 .data = &ip_rt_max_size,
3165 .maxlen = sizeof(int),
3166 .mode = 0644,
3167 .proc_handler = proc_dointvec,
3168 },
3169 {
3170 /* Deprecated. Use gc_min_interval_ms */
3171
3172 .procname = "gc_min_interval",
3173 .data = &ip_rt_gc_min_interval,
3174 .maxlen = sizeof(int),
3175 .mode = 0644,
3176 .proc_handler = proc_dointvec_jiffies,
3177 },
3178 {
3179 .procname = "gc_min_interval_ms",
3180 .data = &ip_rt_gc_min_interval,
3181 .maxlen = sizeof(int),
3182 .mode = 0644,
3183 .proc_handler = proc_dointvec_ms_jiffies,
3184 },
3185 {
3186 .procname = "gc_timeout",
3187 .data = &ip_rt_gc_timeout,
3188 .maxlen = sizeof(int),
3189 .mode = 0644,
3190 .proc_handler = proc_dointvec_jiffies,
3191 },
3192 {
3193 .procname = "gc_interval",
3194 .data = &ip_rt_gc_interval,
3195 .maxlen = sizeof(int),
3196 .mode = 0644,
3197 .proc_handler = proc_dointvec_jiffies,
3198 },
3199 {
3200 .procname = "redirect_load",
3201 .data = &ip_rt_redirect_load,
3202 .maxlen = sizeof(int),
3203 .mode = 0644,
3204 .proc_handler = proc_dointvec,
3205 },
3206 {
3207 .procname = "redirect_number",
3208 .data = &ip_rt_redirect_number,
3209 .maxlen = sizeof(int),
3210 .mode = 0644,
3211 .proc_handler = proc_dointvec,
3212 },
3213 {
3214 .procname = "redirect_silence",
3215 .data = &ip_rt_redirect_silence,
3216 .maxlen = sizeof(int),
3217 .mode = 0644,
3218 .proc_handler = proc_dointvec,
3219 },
3220 {
3221 .procname = "error_cost",
3222 .data = &ip_rt_error_cost,
3223 .maxlen = sizeof(int),
3224 .mode = 0644,
3225 .proc_handler = proc_dointvec,
3226 },
3227 {
3228 .procname = "error_burst",
3229 .data = &ip_rt_error_burst,
3230 .maxlen = sizeof(int),
3231 .mode = 0644,
3232 .proc_handler = proc_dointvec,
3233 },
3234 {
3235 .procname = "gc_elasticity",
3236 .data = &ip_rt_gc_elasticity,
3237 .maxlen = sizeof(int),
3238 .mode = 0644,
3239 .proc_handler = proc_dointvec,
3240 },
3241 {
3242 .procname = "mtu_expires",
3243 .data = &ip_rt_mtu_expires,
3244 .maxlen = sizeof(int),
3245 .mode = 0644,
3246 .proc_handler = proc_dointvec_jiffies,
3247 },
3248 {
3249 .procname = "min_pmtu",
3250 .data = &ip_rt_min_pmtu,
3251 .maxlen = sizeof(int),
3252 .mode = 0644,
3253 .proc_handler = proc_dointvec,
3254 },
3255 {
3256 .procname = "min_adv_mss",
3257 .data = &ip_rt_min_advmss,
3258 .maxlen = sizeof(int),
3259 .mode = 0644,
3260 .proc_handler = proc_dointvec,
3261 },
3262 { }
3263 };
3264
3265 static struct ctl_table ipv4_route_flush_table[] = {
3266 {
3267 .procname = "flush",
3268 .maxlen = sizeof(int),
3269 .mode = 0200,
3270 .proc_handler = ipv4_sysctl_rtcache_flush,
3271 },
3272 { },
3273 };
3274
3275 static __net_init int sysctl_route_net_init(struct net *net)
3276 {
3277 struct ctl_table *tbl;
3278
3279 tbl = ipv4_route_flush_table;
3280 if (!net_eq(net, &init_net)) {
3281 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3282 if (tbl == NULL)
3283 goto err_dup;
3284 }
3285 tbl[0].extra1 = net;
3286
3287 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3288 if (net->ipv4.route_hdr == NULL)
3289 goto err_reg;
3290 return 0;
3291
3292 err_reg:
3293 if (tbl != ipv4_route_flush_table)
3294 kfree(tbl);
3295 err_dup:
3296 return -ENOMEM;
3297 }
3298
3299 static __net_exit void sysctl_route_net_exit(struct net *net)
3300 {
3301 struct ctl_table *tbl;
3302
3303 tbl = net->ipv4.route_hdr->ctl_table_arg;
3304 unregister_net_sysctl_table(net->ipv4.route_hdr);
3305 BUG_ON(tbl == ipv4_route_flush_table);
3306 kfree(tbl);
3307 }
3308
3309 static __net_initdata struct pernet_operations sysctl_route_ops = {
3310 .init = sysctl_route_net_init,
3311 .exit = sysctl_route_net_exit,
3312 };
3313 #endif
3314
3315 static __net_init int rt_genid_init(struct net *net)
3316 {
3317 get_random_bytes(&net->ipv4.rt_genid,
3318 sizeof(net->ipv4.rt_genid));
3319 get_random_bytes(&net->ipv4.dev_addr_genid,
3320 sizeof(net->ipv4.dev_addr_genid));
3321 return 0;
3322 }
3323
3324 static __net_initdata struct pernet_operations rt_genid_ops = {
3325 .init = rt_genid_init,
3326 };
3327
3328 static int __net_init ipv4_inetpeer_init(struct net *net)
3329 {
3330 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3331
3332 if (!bp)
3333 return -ENOMEM;
3334 inet_peer_base_init(bp);
3335 net->ipv4.peers = bp;
3336 return 0;
3337 }
3338
3339 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3340 {
3341 struct inet_peer_base *bp = net->ipv4.peers;
3342
3343 net->ipv4.peers = NULL;
3344 inetpeer_invalidate_tree(bp);
3345 kfree(bp);
3346 }
3347
3348 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3349 .init = ipv4_inetpeer_init,
3350 .exit = ipv4_inetpeer_exit,
3351 };
3352
3353 #ifdef CONFIG_IP_ROUTE_CLASSID
3354 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3355 #endif /* CONFIG_IP_ROUTE_CLASSID */
3356
3357 static __initdata unsigned long rhash_entries;
3358 static int __init set_rhash_entries(char *str)
3359 {
3360 ssize_t ret;
3361
3362 if (!str)
3363 return 0;
3364
3365 ret = kstrtoul(str, 0, &rhash_entries);
3366 if (ret)
3367 return 0;
3368
3369 return 1;
3370 }
3371 __setup("rhash_entries=", set_rhash_entries);
3372
3373 int __init ip_rt_init(void)
3374 {
3375 int rc = 0;
3376
3377 #ifdef CONFIG_IP_ROUTE_CLASSID
3378 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3379 if (!ip_rt_acct)
3380 panic("IP: failed to allocate ip_rt_acct\n");
3381 #endif
3382
3383 ipv4_dst_ops.kmem_cachep =
3384 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3385 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3386
3387 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3388
3389 if (dst_entries_init(&ipv4_dst_ops) < 0)
3390 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3391
3392 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3393 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3394
3395 rt_hash_table = (struct rt_hash_bucket *)
3396 alloc_large_system_hash("IP route cache",
3397 sizeof(struct rt_hash_bucket),
3398 rhash_entries,
3399 (totalram_pages >= 128 * 1024) ?
3400 15 : 17,
3401 0,
3402 &rt_hash_log,
3403 &rt_hash_mask,
3404 0,
3405 rhash_entries ? 0 : 512 * 1024);
3406 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3407 rt_hash_lock_init();
3408
3409 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3410 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3411
3412 devinet_init();
3413 ip_fib_init();
3414
3415 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3416 expires_ljiffies = jiffies;
3417 schedule_delayed_work(&expires_work,
3418 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3419
3420 if (ip_rt_proc_init())
3421 pr_err("Unable to create route proc files\n");
3422 #ifdef CONFIG_XFRM
3423 xfrm_init();
3424 xfrm4_init(ip_rt_max_size);
3425 #endif
3426 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3427
3428 #ifdef CONFIG_SYSCTL
3429 register_pernet_subsys(&sysctl_route_ops);
3430 #endif
3431 register_pernet_subsys(&rt_genid_ops);
3432 register_pernet_subsys(&ipv4_inetpeer_ops);
3433 return rc;
3434 }
3435
3436 #ifdef CONFIG_SYSCTL
3437 /*
3438 * We really need to sanitize the damn ipv4 init order, then all
3439 * this nonsense will go away.
3440 */
3441 void __init ip_static_sysctl_init(void)
3442 {
3443 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3444 }
3445 #endif