git.proxmox.com Git - mirror_ubuntu-zesty-kernel.git / blob / net/ipv4/route.c
(at "ipv4: Don't report neigh uptodate state in rtcache procfs.")
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
39 *
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/bootmem.h>
74 #include <linux/string.h>
75 #include <linux/socket.h>
76 #include <linux/sockios.h>
77 #include <linux/errno.h>
78 #include <linux/in.h>
79 #include <linux/inet.h>
80 #include <linux/netdevice.h>
81 #include <linux/proc_fs.h>
82 #include <linux/init.h>
83 #include <linux/workqueue.h>
84 #include <linux/skbuff.h>
85 #include <linux/inetdevice.h>
86 #include <linux/igmp.h>
87 #include <linux/pkt_sched.h>
88 #include <linux/mroute.h>
89 #include <linux/netfilter_ipv4.h>
90 #include <linux/random.h>
91 #include <linux/jhash.h>
92 #include <linux/rcupdate.h>
93 #include <linux/times.h>
94 #include <linux/slab.h>
95 #include <linux/prefetch.h>
96 #include <net/dst.h>
97 #include <net/net_namespace.h>
98 #include <net/protocol.h>
99 #include <net/ip.h>
100 #include <net/route.h>
101 #include <net/inetpeer.h>
102 #include <net/sock.h>
103 #include <net/ip_fib.h>
104 #include <net/arp.h>
105 #include <net/tcp.h>
106 #include <net/icmp.h>
107 #include <net/xfrm.h>
108 #include <net/netevent.h>
109 #include <net/rtnetlink.h>
110 #ifdef CONFIG_SYSCTL
111 #include <linux/sysctl.h>
112 #include <linux/kmemleak.h>
113 #endif
114 #include <net/secure_seq.h>
115
116 #define RT_FL_TOS(oldflp4) \
117 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118
119 #define IP_MAX_MTU 0xFFF0
120
121 #define RT_GC_TIMEOUT (300*HZ)
122
123 static int ip_rt_max_size;
124 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
125 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
126 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
127 static int ip_rt_redirect_number __read_mostly = 9;
128 static int ip_rt_redirect_load __read_mostly = HZ / 50;
129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
130 static int ip_rt_error_cost __read_mostly = HZ;
131 static int ip_rt_error_burst __read_mostly = 5 * HZ;
132 static int ip_rt_gc_elasticity __read_mostly = 8;
133 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
134 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
135 static int ip_rt_min_advmss __read_mostly = 256;
136 static int rt_chain_length_max __read_mostly = 20;
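/* These defaults drive the behaviour used throughout this file: for
 * example, unreferenced cache entries start aging out after
 * RT_GC_TIMEOUT = 300 seconds, and learned PMTU values are floored at
 * 512 + 20 + 20 = 552 bytes.  They are normally tunable at run time as
 * sysctls (typically under /proc/sys/net/ipv4/route/), registered
 * elsewhere in this file.
 */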
137
138 static struct delayed_work expires_work;
139 static unsigned long expires_ljiffies;
140
141 /*
142 * Interface to generic destination cache.
143 */
144
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
147 static unsigned int ipv4_mtu(const struct dst_entry *dst);
148 static void ipv4_dst_destroy(struct dst_entry *dst);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void ipv4_link_failure(struct sk_buff *skb);
151 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(struct dst_ops *ops);
153
154 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
155 int how)
156 {
157 }
158
159 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
160 {
161 struct rtable *rt = (struct rtable *) dst;
162 struct inet_peer *peer;
163 u32 *p = NULL;
164
165 peer = rt_get_peer_create(rt, rt->rt_dst);
166 if (peer) {
167 u32 *old_p = __DST_METRICS_PTR(old);
168 unsigned long prev, new;
169
170 p = peer->metrics;
171 if (inet_metrics_new(peer))
172 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
173
174 new = (unsigned long) p;
175 prev = cmpxchg(&dst->_metrics, old, new);
176
177 if (prev != old) {
178 p = __DST_METRICS_PTR(prev);
179 if (prev & DST_METRICS_READ_ONLY)
180 p = NULL;
181 } else {
182 if (rt->fi) {
183 fib_info_put(rt->fi);
184 rt->fi = NULL;
185 }
186 }
187 }
188 return p;
189 }
190
191 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
192
193 static struct dst_ops ipv4_dst_ops = {
194 .family = AF_INET,
195 .protocol = cpu_to_be16(ETH_P_IP),
196 .gc = rt_garbage_collect,
197 .check = ipv4_dst_check,
198 .default_advmss = ipv4_default_advmss,
199 .mtu = ipv4_mtu,
200 .cow_metrics = ipv4_cow_metrics,
201 .destroy = ipv4_dst_destroy,
202 .ifdown = ipv4_dst_ifdown,
203 .negative_advice = ipv4_negative_advice,
204 .link_failure = ipv4_link_failure,
205 .update_pmtu = ip_rt_update_pmtu,
206 .local_out = __ip_local_out,
207 .neigh_lookup = ipv4_neigh_lookup,
208 };
209
210 #define ECN_OR_COST(class) TC_PRIO_##class
211
212 const __u8 ip_tos2prio[16] = {
213 TC_PRIO_BESTEFFORT,
214 ECN_OR_COST(BESTEFFORT),
215 TC_PRIO_BESTEFFORT,
216 ECN_OR_COST(BESTEFFORT),
217 TC_PRIO_BULK,
218 ECN_OR_COST(BULK),
219 TC_PRIO_BULK,
220 ECN_OR_COST(BULK),
221 TC_PRIO_INTERACTIVE,
222 ECN_OR_COST(INTERACTIVE),
223 TC_PRIO_INTERACTIVE,
224 ECN_OR_COST(INTERACTIVE),
225 TC_PRIO_INTERACTIVE_BULK,
226 ECN_OR_COST(INTERACTIVE_BULK),
227 TC_PRIO_INTERACTIVE_BULK,
228 ECN_OR_COST(INTERACTIVE_BULK)
229 };
230 EXPORT_SYMBOL(ip_tos2prio);
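/* The table is indexed by the legacy TOS nibble shifted right by one
 * (see rt_tos2priority() in <net/route.h>), so for example
 * IPTOS_LOWDELAY (0x10) selects index 8, i.e. TC_PRIO_INTERACTIVE,
 * while the odd slots cover the low-cost/ECN bit of the same class
 * (hence the ECN_OR_COST() name).
 */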
231
232 /*
233 * Route cache.
234 */
235
236 /* The locking scheme is rather straightforward:
237 *
238 * 1) Read-Copy Update protects the buckets of the central route hash.
239 * 2) Only writers remove entries, and they hold the lock
240 * as they look at rtable reference counts.
241  *	3) Only readers acquire references to rtable entries;
242  *	   they do so with atomic increments and with
243  *	   BH disabled.
244 */
245
246 struct rt_hash_bucket {
247 struct rtable __rcu *chain;
248 };
249
250 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
251 defined(CONFIG_PROVE_LOCKING)
252 /*
253  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
254  * The size of this table is a power of two and depends on the number of CPUs.
255  * (with lockdep a spinlock_t is quite big, so keep the size down there)
256 */
257 #ifdef CONFIG_LOCKDEP
258 # define RT_HASH_LOCK_SZ 256
259 #else
260 # if NR_CPUS >= 32
261 # define RT_HASH_LOCK_SZ 4096
262 # elif NR_CPUS >= 16
263 # define RT_HASH_LOCK_SZ 2048
264 # elif NR_CPUS >= 8
265 # define RT_HASH_LOCK_SZ 1024
266 # elif NR_CPUS >= 4
267 # define RT_HASH_LOCK_SZ 512
268 # else
269 # define RT_HASH_LOCK_SZ 256
270 # endif
271 #endif
272
273 static spinlock_t *rt_hash_locks;
274 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
275
276 static __init void rt_hash_lock_init(void)
277 {
278 int i;
279
280 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
281 GFP_KERNEL);
282 if (!rt_hash_locks)
283 panic("IP: failed to allocate rt_hash_locks\n");
284
285 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
286 spin_lock_init(&rt_hash_locks[i]);
287 }
288 #else
289 # define rt_hash_lock_addr(slot) NULL
290
291 static inline void rt_hash_lock_init(void)
292 {
293 }
294 #endif
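/* With NR_CPUS >= 32 (and lockdep off) this amounts to 4096 spinlocks
 * shared by all hash buckets; a writer hashes the bucket index into the
 * lock table, e.g.
 *
 *	spin_lock_bh(rt_hash_lock_addr(hash));
 *	... unlink/insert rtable entries in rt_hash_table[hash].chain ...
 *	spin_unlock_bh(rt_hash_lock_addr(hash));
 *
 * exactly as done in rt_intern_hash() and rt_do_flush() below, while
 * readers only walk the chains under rcu_read_lock_bh().
 */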
295
296 static struct rt_hash_bucket *rt_hash_table __read_mostly;
297 static unsigned int rt_hash_mask __read_mostly;
298 static unsigned int rt_hash_log __read_mostly;
299
300 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
301 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
302
303 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
304 int genid)
305 {
306 return jhash_3words((__force u32)daddr, (__force u32)saddr,
307 idx, genid)
308 & rt_hash_mask;
309 }
310
311 static inline int rt_genid(struct net *net)
312 {
313 return atomic_read(&net->ipv4.rt_genid);
314 }
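/* The per-namespace generation id is folded into the hash above and
 * stored in every cached rtable, so bumping it (rt_cache_invalidate()
 * below) both scatters new insertions to different buckets and lets
 * rt_is_expired() lazily discard entries from the old generation.
 */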
315
316 #ifdef CONFIG_PROC_FS
317 struct rt_cache_iter_state {
318 struct seq_net_private p;
319 int bucket;
320 int genid;
321 };
322
323 static struct rtable *rt_cache_get_first(struct seq_file *seq)
324 {
325 struct rt_cache_iter_state *st = seq->private;
326 struct rtable *r = NULL;
327
328 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
329 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
330 continue;
331 rcu_read_lock_bh();
332 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
333 while (r) {
334 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
335 r->rt_genid == st->genid)
336 return r;
337 r = rcu_dereference_bh(r->dst.rt_next);
338 }
339 rcu_read_unlock_bh();
340 }
341 return r;
342 }
343
344 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
345 struct rtable *r)
346 {
347 struct rt_cache_iter_state *st = seq->private;
348
349 r = rcu_dereference_bh(r->dst.rt_next);
350 while (!r) {
351 rcu_read_unlock_bh();
352 do {
353 if (--st->bucket < 0)
354 return NULL;
355 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
356 rcu_read_lock_bh();
357 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
358 }
359 return r;
360 }
361
362 static struct rtable *rt_cache_get_next(struct seq_file *seq,
363 struct rtable *r)
364 {
365 struct rt_cache_iter_state *st = seq->private;
366 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
367 if (dev_net(r->dst.dev) != seq_file_net(seq))
368 continue;
369 if (r->rt_genid == st->genid)
370 break;
371 }
372 return r;
373 }
374
375 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
376 {
377 struct rtable *r = rt_cache_get_first(seq);
378
379 if (r)
380 while (pos && (r = rt_cache_get_next(seq, r)))
381 --pos;
382 return pos ? NULL : r;
383 }
384
385 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
386 {
387 struct rt_cache_iter_state *st = seq->private;
388 if (*pos)
389 return rt_cache_get_idx(seq, *pos - 1);
390 st->genid = rt_genid(seq_file_net(seq));
391 return SEQ_START_TOKEN;
392 }
393
394 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
395 {
396 struct rtable *r;
397
398 if (v == SEQ_START_TOKEN)
399 r = rt_cache_get_first(seq);
400 else
401 r = rt_cache_get_next(seq, v);
402 ++*pos;
403 return r;
404 }
405
406 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
407 {
408 if (v && v != SEQ_START_TOKEN)
409 rcu_read_unlock_bh();
410 }
411
412 static int rt_cache_seq_show(struct seq_file *seq, void *v)
413 {
414 if (v == SEQ_START_TOKEN)
415 seq_printf(seq, "%-127s\n",
416 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
417 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
418 "HHUptod\tSpecDst");
419 else {
420 struct rtable *r = v;
421 int len;
422
423 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
424 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
425 r->dst.dev ? r->dst.dev->name : "*",
426 (__force u32)r->rt_dst,
427 (__force u32)r->rt_gateway,
428 r->rt_flags, atomic_read(&r->dst.__refcnt),
429 r->dst.__use, 0, (__force u32)r->rt_src,
430 dst_metric_advmss(&r->dst) + 40,
431 dst_metric(&r->dst, RTAX_WINDOW),
432 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
433 dst_metric(&r->dst, RTAX_RTTVAR)),
434 r->rt_key_tos,
435 -1, 0, 0, &len);
436
437 seq_printf(seq, "%*s\n", 127 - len, "");
438 }
439 return 0;
440 }
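/* The HHRef, HHUptod and SpecDst columns are emitted as the constants
 * -1, 0 and 0: the neighbour's hardware-header refcount and up-to-date
 * state are not reported here any more, and the placeholders appear to
 * be kept only so the /proc/net/rt_cache column layout stays unchanged.
 */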
441
442 static const struct seq_operations rt_cache_seq_ops = {
443 .start = rt_cache_seq_start,
444 .next = rt_cache_seq_next,
445 .stop = rt_cache_seq_stop,
446 .show = rt_cache_seq_show,
447 };
448
449 static int rt_cache_seq_open(struct inode *inode, struct file *file)
450 {
451 return seq_open_net(inode, file, &rt_cache_seq_ops,
452 sizeof(struct rt_cache_iter_state));
453 }
454
455 static const struct file_operations rt_cache_seq_fops = {
456 .owner = THIS_MODULE,
457 .open = rt_cache_seq_open,
458 .read = seq_read,
459 .llseek = seq_lseek,
460 .release = seq_release_net,
461 };
462
463
464 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
465 {
466 int cpu;
467
468 if (*pos == 0)
469 return SEQ_START_TOKEN;
470
471 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
472 if (!cpu_possible(cpu))
473 continue;
474 *pos = cpu+1;
475 return &per_cpu(rt_cache_stat, cpu);
476 }
477 return NULL;
478 }
479
480 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
481 {
482 int cpu;
483
484 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
485 if (!cpu_possible(cpu))
486 continue;
487 *pos = cpu+1;
488 return &per_cpu(rt_cache_stat, cpu);
489 }
490 return NULL;
491
492 }
493
494 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
495 {
496
497 }
498
499 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
500 {
501 struct rt_cache_stat *st = v;
502
503 if (v == SEQ_START_TOKEN) {
504 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
505 return 0;
506 }
507
508 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
509 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
510 dst_entries_get_slow(&ipv4_dst_ops),
511 st->in_hit,
512 st->in_slow_tot,
513 st->in_slow_mc,
514 st->in_no_route,
515 st->in_brd,
516 st->in_martian_dst,
517 st->in_martian_src,
518
519 st->out_hit,
520 st->out_slow_tot,
521 st->out_slow_mc,
522
523 st->gc_total,
524 st->gc_ignored,
525 st->gc_goal_miss,
526 st->gc_dst_overflow,
527 st->in_hlist_search,
528 st->out_hlist_search
529 );
530 return 0;
531 }
532
533 static const struct seq_operations rt_cpu_seq_ops = {
534 .start = rt_cpu_seq_start,
535 .next = rt_cpu_seq_next,
536 .stop = rt_cpu_seq_stop,
537 .show = rt_cpu_seq_show,
538 };
539
540
541 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
542 {
543 return seq_open(file, &rt_cpu_seq_ops);
544 }
545
546 static const struct file_operations rt_cpu_seq_fops = {
547 .owner = THIS_MODULE,
548 .open = rt_cpu_seq_open,
549 .read = seq_read,
550 .llseek = seq_lseek,
551 .release = seq_release,
552 };
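/* These per-cpu counters are dumped one line per possible CPU through
 * the "rt_cache" entry registered under net->proc_net_stat in
 * ip_rt_do_proc_init() below (i.e. typically /proc/net/stat/rt_cache).
 */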
553
554 #ifdef CONFIG_IP_ROUTE_CLASSID
555 static int rt_acct_proc_show(struct seq_file *m, void *v)
556 {
557 struct ip_rt_acct *dst, *src;
558 unsigned int i, j;
559
560 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
561 if (!dst)
562 return -ENOMEM;
563
564 for_each_possible_cpu(i) {
565 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
566 for (j = 0; j < 256; j++) {
567 dst[j].o_bytes += src[j].o_bytes;
568 dst[j].o_packets += src[j].o_packets;
569 dst[j].i_bytes += src[j].i_bytes;
570 dst[j].i_packets += src[j].i_packets;
571 }
572 }
573
574 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
575 kfree(dst);
576 return 0;
577 }
578
579 static int rt_acct_proc_open(struct inode *inode, struct file *file)
580 {
581 return single_open(file, rt_acct_proc_show, NULL);
582 }
583
584 static const struct file_operations rt_acct_proc_fops = {
585 .owner = THIS_MODULE,
586 .open = rt_acct_proc_open,
587 .read = seq_read,
588 .llseek = seq_lseek,
589 .release = single_release,
590 };
591 #endif
592
593 static int __net_init ip_rt_do_proc_init(struct net *net)
594 {
595 struct proc_dir_entry *pde;
596
597 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
598 &rt_cache_seq_fops);
599 if (!pde)
600 goto err1;
601
602 pde = proc_create("rt_cache", S_IRUGO,
603 net->proc_net_stat, &rt_cpu_seq_fops);
604 if (!pde)
605 goto err2;
606
607 #ifdef CONFIG_IP_ROUTE_CLASSID
608 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
609 if (!pde)
610 goto err3;
611 #endif
612 return 0;
613
614 #ifdef CONFIG_IP_ROUTE_CLASSID
615 err3:
616 remove_proc_entry("rt_cache", net->proc_net_stat);
617 #endif
618 err2:
619 remove_proc_entry("rt_cache", net->proc_net);
620 err1:
621 return -ENOMEM;
622 }
623
624 static void __net_exit ip_rt_do_proc_exit(struct net *net)
625 {
626 remove_proc_entry("rt_cache", net->proc_net_stat);
627 remove_proc_entry("rt_cache", net->proc_net);
628 #ifdef CONFIG_IP_ROUTE_CLASSID
629 remove_proc_entry("rt_acct", net->proc_net);
630 #endif
631 }
632
633 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
634 .init = ip_rt_do_proc_init,
635 .exit = ip_rt_do_proc_exit,
636 };
637
638 static int __init ip_rt_proc_init(void)
639 {
640 return register_pernet_subsys(&ip_rt_proc_ops);
641 }
642
643 #else
644 static inline int ip_rt_proc_init(void)
645 {
646 return 0;
647 }
648 #endif /* CONFIG_PROC_FS */
649
650 static inline void rt_free(struct rtable *rt)
651 {
652 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
653 }
654
655 static inline void rt_drop(struct rtable *rt)
656 {
657 ip_rt_put(rt);
658 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
659 }
660
661 static inline int rt_fast_clean(struct rtable *rth)
662 {
663 	/* Kill broadcast/multicast entries very aggressively if they
664 	   collide in the hash table with more useful entries */
665 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
666 rt_is_input_route(rth) && rth->dst.rt_next;
667 }
668
669 static inline int rt_valuable(struct rtable *rth)
670 {
671 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
672 (rt_has_peer(rth) && rt_peer_ptr(rth)->pmtu_expires);
673 }
674
675 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
676 {
677 unsigned long age;
678 int ret = 0;
679
680 if (atomic_read(&rth->dst.__refcnt))
681 goto out;
682
683 age = jiffies - rth->dst.lastuse;
684 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
685 (age <= tmo2 && rt_valuable(rth)))
686 goto out;
687 ret = 1;
688 out: return ret;
689 }
690
691 /* Bits of score are:
692 * 31: very valuable
693 * 30: not quite useless
694 * 29..0: usage counter
695 */
696 static inline u32 rt_score(struct rtable *rt)
697 {
698 u32 score = jiffies - rt->dst.lastuse;
699
700 score = ~score & ~(3<<30);
701
702 if (rt_valuable(rt))
703 score |= (1<<31);
704
705 if (rt_is_output_route(rt) ||
706 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
707 score |= (1<<30);
708
709 return score;
710 }
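/* Lower scores mark better eviction candidates: rt_intern_hash() below
 * picks the chain entry with the minimum score once a bucket grows past
 * ip_rt_gc_elasticity.  An old, unreferenced input broadcast/multicast
 * entry therefore goes first, while "valuable" entries and
 * output/unicast routes keep bit 31 or 30 set and are displaced last.
 */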
711
712 static inline bool rt_caching(const struct net *net)
713 {
714 return net->ipv4.current_rt_cache_rebuild_count <=
715 net->ipv4.sysctl_rt_cache_rebuild_count;
716 }
717
718 static inline bool compare_hash_inputs(const struct rtable *rt1,
719 const struct rtable *rt2)
720 {
721 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
722 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
723 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
724 }
725
726 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
727 {
728 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
729 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
730 (rt1->rt_mark ^ rt2->rt_mark) |
731 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
732 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
733 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
734 }
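/* Both comparisons use the branch-free XOR/OR idiom: the OR of the
 * per-field XORs is zero if and only if every field matches.  E.g. two
 * keys differing only in rt_mark produce a non-zero (mark1 ^ mark2)
 * term and the whole expression compares unequal.
 */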
735
736 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
737 {
738 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
739 }
740
741 static inline int rt_is_expired(struct rtable *rth)
742 {
743 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
744 }
745
746 /*
747  * Perform a full scan of the hash table and free all entries.
748  * Can be called by a softirq or a process.
749  * In the latter case, we want to reschedule if necessary.
750 */
751 static void rt_do_flush(struct net *net, int process_context)
752 {
753 unsigned int i;
754 struct rtable *rth, *next;
755
756 for (i = 0; i <= rt_hash_mask; i++) {
757 struct rtable __rcu **pprev;
758 struct rtable *list;
759
760 if (process_context && need_resched())
761 cond_resched();
762 rth = rcu_access_pointer(rt_hash_table[i].chain);
763 if (!rth)
764 continue;
765
766 spin_lock_bh(rt_hash_lock_addr(i));
767
768 list = NULL;
769 pprev = &rt_hash_table[i].chain;
770 rth = rcu_dereference_protected(*pprev,
771 lockdep_is_held(rt_hash_lock_addr(i)));
772
773 while (rth) {
774 next = rcu_dereference_protected(rth->dst.rt_next,
775 lockdep_is_held(rt_hash_lock_addr(i)));
776
777 if (!net ||
778 net_eq(dev_net(rth->dst.dev), net)) {
779 rcu_assign_pointer(*pprev, next);
780 rcu_assign_pointer(rth->dst.rt_next, list);
781 list = rth;
782 } else {
783 pprev = &rth->dst.rt_next;
784 }
785 rth = next;
786 }
787
788 spin_unlock_bh(rt_hash_lock_addr(i));
789
790 for (; list; list = next) {
791 next = rcu_dereference_protected(list->dst.rt_next, 1);
792 rt_free(list);
793 }
794 }
795 }
796
797 /*
798 * While freeing expired entries, we compute average chain length
799 * and standard deviation, using fixed-point arithmetic.
800  * This gives an estimate for rt_chain_length_max:
801  *	rt_chain_length_max = max(elasticity, AVG + 4*SD)
802  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
803 */
804
805 #define FRACT_BITS 3
806 #define ONE (1UL << FRACT_BITS)
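/* Worked example of the fixed-point accounting: has_noalias() below
 * contributes ONE (8) per distinct entry, so a bucket averaging 4
 * distinct entries accumulates avg = 32 and, say, sd = 8 (one entry of
 * spread), giving (32 + 4*8) >> FRACT_BITS = 8 as the candidate
 * rt_chain_length_max, which then competes with ip_rt_gc_elasticity.
 */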
807
808 /*
809 * Given a hash chain and an item in this hash chain,
810  * determine whether a previous entry has the same hash_inputs
811  * (but differs on tos, mark or oif).
812 * Returns 0 if an alias is found.
813 * Returns ONE if rth has no alias before itself.
814 */
815 static int has_noalias(const struct rtable *head, const struct rtable *rth)
816 {
817 const struct rtable *aux = head;
818
819 while (aux != rth) {
820 if (compare_hash_inputs(aux, rth))
821 return 0;
822 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
823 }
824 return ONE;
825 }
826
827 static void rt_check_expire(void)
828 {
829 static unsigned int rover;
830 unsigned int i = rover, goal;
831 struct rtable *rth;
832 struct rtable __rcu **rthp;
833 unsigned long samples = 0;
834 unsigned long sum = 0, sum2 = 0;
835 unsigned long delta;
836 u64 mult;
837
838 delta = jiffies - expires_ljiffies;
839 expires_ljiffies = jiffies;
840 mult = ((u64)delta) << rt_hash_log;
841 if (ip_rt_gc_timeout > 1)
842 do_div(mult, ip_rt_gc_timeout);
843 goal = (unsigned int)mult;
844 if (goal > rt_hash_mask)
845 goal = rt_hash_mask + 1;
846 for (; goal > 0; goal--) {
847 unsigned long tmo = ip_rt_gc_timeout;
848 unsigned long length;
849
850 i = (i + 1) & rt_hash_mask;
851 rthp = &rt_hash_table[i].chain;
852
853 if (need_resched())
854 cond_resched();
855
856 samples++;
857
858 if (rcu_dereference_raw(*rthp) == NULL)
859 continue;
860 length = 0;
861 spin_lock_bh(rt_hash_lock_addr(i));
862 while ((rth = rcu_dereference_protected(*rthp,
863 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
864 prefetch(rth->dst.rt_next);
865 if (rt_is_expired(rth) ||
866 rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
867 *rthp = rth->dst.rt_next;
868 rt_free(rth);
869 continue;
870 }
871
872 			/* We only count entries on a chain with equal
873 			 * hash inputs once, so that entries for
874 			 * different QoS levels and other non-hash
875 			 * input attributes don't unfairly skew the
876 			 * length computation.
877 */
878 tmo >>= 1;
879 rthp = &rth->dst.rt_next;
880 length += has_noalias(rt_hash_table[i].chain, rth);
881 }
882 spin_unlock_bh(rt_hash_lock_addr(i));
883 sum += length;
884 sum2 += length*length;
885 }
886 if (samples) {
887 unsigned long avg = sum / samples;
888 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
889 rt_chain_length_max = max_t(unsigned long,
890 ip_rt_gc_elasticity,
891 (avg + 4*sd) >> FRACT_BITS);
892 }
893 rover = i;
894 }
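/* Rough scan budget: goal = (delta << rt_hash_log) / ip_rt_gc_timeout
 * buckets per run, so with the default 60 second work interval and the
 * 300 second gc timeout each pass covers about one fifth of the hash
 * table, and the whole table is revisited roughly once per timeout
 * period.
 */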
895
896 /*
897 * rt_worker_func() is run in process context.
898  * We call rt_check_expire() to scan part of the hash table.
899 */
900 static void rt_worker_func(struct work_struct *work)
901 {
902 rt_check_expire();
903 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
904 }
905
906 /*
907  * Perturbation of rt_genid by a small quantity [1..256].
908  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
909  * many times (2^24) without repeating a recent rt_genid.
910  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
911 */
912 static void rt_cache_invalidate(struct net *net)
913 {
914 unsigned char shuffle;
915
916 get_random_bytes(&shuffle, sizeof(shuffle));
917 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
918 inetpeer_invalidate_family(AF_INET);
919 }
920
921 /*
922 * delay < 0 : invalidate cache (fast : entries will be deleted later)
923 * delay >= 0 : invalidate & flush cache (can be long)
924 */
925 void rt_cache_flush(struct net *net, int delay)
926 {
927 rt_cache_invalidate(net);
928 if (delay >= 0)
929 rt_do_flush(net, !in_softirq());
930 }
931
932 /* Flush previously invalidated entries from the cache */
933 void rt_cache_flush_batch(struct net *net)
934 {
935 rt_do_flush(net, !in_softirq());
936 }
937
938 static void rt_emergency_hash_rebuild(struct net *net)
939 {
940 net_warn_ratelimited("Route hash chain too long!\n");
941 rt_cache_invalidate(net);
942 }
943
944 /*
945 Short description of GC goals.
946
947    We want to build an algorithm which keeps the routing cache
948    at some equilibrium point, where the number of aged-off entries
949    stays approximately equal to the number of newly generated ones.
950
951    The current expiration strength is the variable "expire".
952    We try to adjust it dynamically, so that when the network
953    is idle "expire" is large enough to keep plenty of warm entries,
954    and when load increases it shrinks to limit the cache size.
955 */
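/* Concretely, the first goal computed below is
 *	entries - (ip_rt_gc_elasticity << rt_hash_log)
 * i.e. GC only works towards a surplus once the cache holds more than
 * roughly ip_rt_gc_elasticity (default 8) entries per hash bucket;
 * below that it merely nudges "equilibrium" and the expire horizon.
 */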
956
957 static int rt_garbage_collect(struct dst_ops *ops)
958 {
959 static unsigned long expire = RT_GC_TIMEOUT;
960 static unsigned long last_gc;
961 static int rover;
962 static int equilibrium;
963 struct rtable *rth;
964 struct rtable __rcu **rthp;
965 unsigned long now = jiffies;
966 int goal;
967 int entries = dst_entries_get_fast(&ipv4_dst_ops);
968
969 /*
970 	 * Garbage collection is pretty expensive,
971 	 * so do not run it too frequently.
972 */
973
974 RT_CACHE_STAT_INC(gc_total);
975
976 if (now - last_gc < ip_rt_gc_min_interval &&
977 entries < ip_rt_max_size) {
978 RT_CACHE_STAT_INC(gc_ignored);
979 goto out;
980 }
981
982 entries = dst_entries_get_slow(&ipv4_dst_ops);
983 /* Calculate number of entries, which we want to expire now. */
984 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
985 if (goal <= 0) {
986 if (equilibrium < ipv4_dst_ops.gc_thresh)
987 equilibrium = ipv4_dst_ops.gc_thresh;
988 goal = entries - equilibrium;
989 if (goal > 0) {
990 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
991 goal = entries - equilibrium;
992 }
993 } else {
994 		/* We are in a dangerous area. Try to reduce the cache really
995 * aggressively.
996 */
997 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
998 equilibrium = entries - goal;
999 }
1000
1001 if (now - last_gc >= ip_rt_gc_min_interval)
1002 last_gc = now;
1003
1004 if (goal <= 0) {
1005 equilibrium += goal;
1006 goto work_done;
1007 }
1008
1009 do {
1010 int i, k;
1011
1012 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1013 unsigned long tmo = expire;
1014
1015 k = (k + 1) & rt_hash_mask;
1016 rthp = &rt_hash_table[k].chain;
1017 spin_lock_bh(rt_hash_lock_addr(k));
1018 while ((rth = rcu_dereference_protected(*rthp,
1019 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1020 if (!rt_is_expired(rth) &&
1021 !rt_may_expire(rth, tmo, expire)) {
1022 tmo >>= 1;
1023 rthp = &rth->dst.rt_next;
1024 continue;
1025 }
1026 *rthp = rth->dst.rt_next;
1027 rt_free(rth);
1028 goal--;
1029 }
1030 spin_unlock_bh(rt_hash_lock_addr(k));
1031 if (goal <= 0)
1032 break;
1033 }
1034 rover = k;
1035
1036 if (goal <= 0)
1037 goto work_done;
1038
1039 		/* The goal was not achieved. We stop the process if:
1040
1041 		   - expire was reduced to zero; otherwise, expire is halved.
1042 		   - the table is not full.
1043 		   - we are called from interrupt context.
1044 		   - the jiffies check is just a fallback/debug loop breaker.
1045 		     We will not spin here for a long time in any case.
1046 */
1047
1048 RT_CACHE_STAT_INC(gc_goal_miss);
1049
1050 if (expire == 0)
1051 break;
1052
1053 expire >>= 1;
1054
1055 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1056 goto out;
1057 } while (!in_softirq() && time_before_eq(jiffies, now));
1058
1059 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1060 goto out;
1061 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1062 goto out;
1063 net_warn_ratelimited("dst cache overflow\n");
1064 RT_CACHE_STAT_INC(gc_dst_overflow);
1065 return 1;
1066
1067 work_done:
1068 expire += ip_rt_gc_min_interval;
1069 if (expire > ip_rt_gc_timeout ||
1070 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1071 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1072 expire = ip_rt_gc_timeout;
1073 out: return 0;
1074 }
1075
1076 /*
1077  * Returns the number of entries in a hash chain that have different hash_inputs.
1078 */
1079 static int slow_chain_length(const struct rtable *head)
1080 {
1081 int length = 0;
1082 const struct rtable *rth = head;
1083
1084 while (rth) {
1085 length += has_noalias(head, rth);
1086 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1087 }
1088 return length >> FRACT_BITS;
1089 }
1090
1091 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1092 {
1093 struct net_device *dev = dst->dev;
1094 const __be32 *pkey = daddr;
1095 const struct rtable *rt;
1096 struct neighbour *n;
1097
1098 rt = (const struct rtable *) dst;
1099 if (rt->rt_gateway)
1100 pkey = (const __be32 *) &rt->rt_gateway;
1101
1102 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1103 if (n)
1104 return n;
1105 return neigh_create(&arp_tbl, pkey, dev);
1106 }
1107
1108 static int rt_bind_neighbour(struct rtable *rt)
1109 {
1110 struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1111 if (IS_ERR(n))
1112 return PTR_ERR(n);
1113 dst_set_neighbour(&rt->dst, n);
1114
1115 return 0;
1116 }
1117
1118 static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1119 struct sk_buff *skb, int ifindex)
1120 {
1121 struct rtable *rth, *cand;
1122 struct rtable __rcu **rthp, **candp;
1123 unsigned long now;
1124 u32 min_score;
1125 int chain_length;
1126 int attempts = !in_softirq();
1127
1128 restart:
1129 chain_length = 0;
1130 min_score = ~(u32)0;
1131 cand = NULL;
1132 candp = NULL;
1133 now = jiffies;
1134
1135 if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
1136 /*
1137 * If we're not caching, just tell the caller we
1138 * were successful and don't touch the route. The
1139 		 * caller holds the sole reference to the cache entry, and
1140 		 * it will be released when the caller is done with it.
1141 		 * If we drop it here, the callers have no way to resolve routes
1142 		 * when we're not caching. Instead, just point *rp at rt, so
1143 		 * the caller gets a single use out of the route.
1144 		 * Note that we do rt_free on this new route entry, so that
1145 		 * once its refcount hits zero, we are still able to reap it
1146 		 * (Thanks Alexey).
1147 		 * Note: To avoid expensive rcu stuff for this uncached dst,
1148 		 * we set DST_NOCACHE so that dst_release() can free the dst without
1149 		 * waiting for a grace period.
1150 */
1151
1152 rt->dst.flags |= DST_NOCACHE;
1153 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1154 int err = rt_bind_neighbour(rt);
1155 if (err) {
1156 net_warn_ratelimited("Neighbour table failure & not caching routes\n");
1157 ip_rt_put(rt);
1158 return ERR_PTR(err);
1159 }
1160 }
1161
1162 goto skip_hashing;
1163 }
1164
1165 rthp = &rt_hash_table[hash].chain;
1166
1167 spin_lock_bh(rt_hash_lock_addr(hash));
1168 while ((rth = rcu_dereference_protected(*rthp,
1169 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1170 if (rt_is_expired(rth)) {
1171 *rthp = rth->dst.rt_next;
1172 rt_free(rth);
1173 continue;
1174 }
1175 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1176 /* Put it first */
1177 *rthp = rth->dst.rt_next;
1178 /*
1179 * Since lookup is lockfree, the deletion
1180 * must be visible to another weakly ordered CPU before
1181 * the insertion at the start of the hash chain.
1182 */
1183 rcu_assign_pointer(rth->dst.rt_next,
1184 rt_hash_table[hash].chain);
1185 /*
1186 * Since lookup is lockfree, the update writes
1187 * must be ordered for consistency on SMP.
1188 */
1189 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1190
1191 dst_use(&rth->dst, now);
1192 spin_unlock_bh(rt_hash_lock_addr(hash));
1193
1194 rt_drop(rt);
1195 if (skb)
1196 skb_dst_set(skb, &rth->dst);
1197 return rth;
1198 }
1199
1200 if (!atomic_read(&rth->dst.__refcnt)) {
1201 u32 score = rt_score(rth);
1202
1203 if (score <= min_score) {
1204 cand = rth;
1205 candp = rthp;
1206 min_score = score;
1207 }
1208 }
1209
1210 chain_length++;
1211
1212 rthp = &rth->dst.rt_next;
1213 }
1214
1215 if (cand) {
1216 		/* ip_rt_gc_elasticity used to be the average chain
1217 		 * length; when it is exceeded, gc becomes really aggressive.
1218 *
1219 * The second limit is less certain. At the moment it allows
1220 * only 2 entries per bucket. We will see.
1221 */
1222 if (chain_length > ip_rt_gc_elasticity) {
1223 *candp = cand->dst.rt_next;
1224 rt_free(cand);
1225 }
1226 } else {
1227 if (chain_length > rt_chain_length_max &&
1228 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1229 struct net *net = dev_net(rt->dst.dev);
1230 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1231 if (!rt_caching(net)) {
1232 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1233 rt->dst.dev->name, num);
1234 }
1235 rt_emergency_hash_rebuild(net);
1236 spin_unlock_bh(rt_hash_lock_addr(hash));
1237
1238 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1239 ifindex, rt_genid(net));
1240 goto restart;
1241 }
1242 }
1243
1244 	/* Try to bind the route to arp only if it is an output
1245 	   route or on the unicast forwarding path.
1246 */
1247 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1248 int err = rt_bind_neighbour(rt);
1249 if (err) {
1250 spin_unlock_bh(rt_hash_lock_addr(hash));
1251
1252 if (err != -ENOBUFS) {
1253 rt_drop(rt);
1254 return ERR_PTR(err);
1255 }
1256
1257 /* Neighbour tables are full and nothing
1258 			   can be released. Try to shrink the route cache;
1259 			   it most likely holds some neighbour records.
1260 */
1261 if (attempts-- > 0) {
1262 int saved_elasticity = ip_rt_gc_elasticity;
1263 int saved_int = ip_rt_gc_min_interval;
1264 ip_rt_gc_elasticity = 1;
1265 ip_rt_gc_min_interval = 0;
1266 rt_garbage_collect(&ipv4_dst_ops);
1267 ip_rt_gc_min_interval = saved_int;
1268 ip_rt_gc_elasticity = saved_elasticity;
1269 goto restart;
1270 }
1271
1272 net_warn_ratelimited("Neighbour table overflow\n");
1273 rt_drop(rt);
1274 return ERR_PTR(-ENOBUFS);
1275 }
1276 }
1277
1278 rt->dst.rt_next = rt_hash_table[hash].chain;
1279
1280 /*
1281 * Since lookup is lockfree, we must make sure
1282 * previous writes to rt are committed to memory
1283 	 * before making rt visible to other CPUs.
1284 */
1285 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1286
1287 spin_unlock_bh(rt_hash_lock_addr(hash));
1288
1289 skip_hashing:
1290 if (skb)
1291 skb_dst_set(skb, &rt->dst);
1292 return rt;
1293 }
1294
1295 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1296
1297 static u32 rt_peer_genid(void)
1298 {
1299 return atomic_read(&__rt_peer_genid);
1300 }
1301
1302 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1303 {
1304 struct inet_peer_base *base;
1305 struct inet_peer *peer;
1306
1307 base = inetpeer_base_ptr(rt->_peer);
1308 if (!base)
1309 return;
1310
1311 peer = inet_getpeer_v4(base, daddr, create);
1312 if (peer) {
1313 if (!rt_set_peer(rt, peer))
1314 inet_putpeer(peer);
1315 else
1316 rt->rt_peer_genid = rt_peer_genid();
1317 }
1318 }
1319
1320 /*
1321  * Peer allocation may fail only in serious out-of-memory conditions. However,
1322  * we can still generate some output.
1323  * Random ID selection looks a bit dangerous because we have no chance of
1324  * selecting an ID that is unique within a reasonable period of time.
1325  * But a broken packet identifier may be better than no packet at all.
1326 */
1327 static void ip_select_fb_ident(struct iphdr *iph)
1328 {
1329 static DEFINE_SPINLOCK(ip_fb_id_lock);
1330 static u32 ip_fallback_id;
1331 u32 salt;
1332
1333 spin_lock_bh(&ip_fb_id_lock);
1334 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1335 iph->id = htons(salt & 0xFFFF);
1336 ip_fallback_id = salt;
1337 spin_unlock_bh(&ip_fb_id_lock);
1338 }
1339
1340 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1341 {
1342 struct rtable *rt = (struct rtable *) dst;
1343
1344 if (rt && !(rt->dst.flags & DST_NOPEER)) {
1345 struct inet_peer *peer = rt_get_peer_create(rt, rt->rt_dst);
1346
1347 		/* If a peer is attached to the destination, it is never detached,
1348 		   so we need not grab a lock to dereference it.
1349 */
1350 if (peer) {
1351 iph->id = htons(inet_getid(peer, more));
1352 return;
1353 }
1354 } else if (!rt)
1355 pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));
1356
1357 ip_select_fb_ident(iph);
1358 }
1359 EXPORT_SYMBOL(__ip_select_ident);
1360
1361 static void rt_del(unsigned int hash, struct rtable *rt)
1362 {
1363 struct rtable __rcu **rthp;
1364 struct rtable *aux;
1365
1366 rthp = &rt_hash_table[hash].chain;
1367 spin_lock_bh(rt_hash_lock_addr(hash));
1368 ip_rt_put(rt);
1369 while ((aux = rcu_dereference_protected(*rthp,
1370 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1371 if (aux == rt || rt_is_expired(aux)) {
1372 *rthp = aux->dst.rt_next;
1373 rt_free(aux);
1374 continue;
1375 }
1376 rthp = &aux->dst.rt_next;
1377 }
1378 spin_unlock_bh(rt_hash_lock_addr(hash));
1379 }
1380
1381 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1382 {
1383 struct rtable *rt = (struct rtable *) dst;
1384 __be32 orig_gw = rt->rt_gateway;
1385 struct neighbour *n, *old_n;
1386
1387 dst_confirm(&rt->dst);
1388
1389 rt->rt_gateway = peer->redirect_learned.a4;
1390
1391 n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1392 if (IS_ERR(n)) {
1393 rt->rt_gateway = orig_gw;
1394 return;
1395 }
1396 old_n = xchg(&rt->dst._neighbour, n);
1397 if (old_n)
1398 neigh_release(old_n);
1399 if (!(n->nud_state & NUD_VALID)) {
1400 neigh_event_send(n, NULL);
1401 } else {
1402 rt->rt_flags |= RTCF_REDIRECTED;
1403 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1404 }
1405 }
1406
1407 /* called in rcu_read_lock() section */
1408 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1409 __be32 saddr, struct net_device *dev)
1410 {
1411 int s, i;
1412 struct in_device *in_dev = __in_dev_get_rcu(dev);
1413 __be32 skeys[2] = { saddr, 0 };
1414 int ikeys[2] = { dev->ifindex, 0 };
1415 struct inet_peer *peer;
1416 struct net *net;
1417
1418 if (!in_dev)
1419 return;
1420
1421 net = dev_net(dev);
1422 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1423 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1424 ipv4_is_zeronet(new_gw))
1425 goto reject_redirect;
1426
1427 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1428 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1429 goto reject_redirect;
1430 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1431 goto reject_redirect;
1432 } else {
1433 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1434 goto reject_redirect;
1435 }
1436
1437 for (s = 0; s < 2; s++) {
1438 for (i = 0; i < 2; i++) {
1439 unsigned int hash;
1440 struct rtable __rcu **rthp;
1441 struct rtable *rt;
1442
1443 hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1444
1445 rthp = &rt_hash_table[hash].chain;
1446
1447 while ((rt = rcu_dereference(*rthp)) != NULL) {
1448 rthp = &rt->dst.rt_next;
1449
1450 if (rt->rt_key_dst != daddr ||
1451 rt->rt_key_src != skeys[s] ||
1452 rt->rt_oif != ikeys[i] ||
1453 rt_is_input_route(rt) ||
1454 rt_is_expired(rt) ||
1455 !net_eq(dev_net(rt->dst.dev), net) ||
1456 rt->dst.error ||
1457 rt->dst.dev != dev ||
1458 rt->rt_gateway != old_gw)
1459 continue;
1460
1461 peer = rt_get_peer_create(rt, rt->rt_dst);
1462 if (peer) {
1463 if (peer->redirect_learned.a4 != new_gw) {
1464 peer->redirect_learned.a4 = new_gw;
1465 atomic_inc(&__rt_peer_genid);
1466 }
1467 check_peer_redir(&rt->dst, peer);
1468 }
1469 }
1470 }
1471 }
1472 return;
1473
1474 reject_redirect:
1475 #ifdef CONFIG_IP_ROUTE_VERBOSE
1476 if (IN_DEV_LOG_MARTIANS(in_dev))
1477 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1478 " Advised path = %pI4 -> %pI4\n",
1479 &old_gw, dev->name, &new_gw,
1480 &saddr, &daddr);
1481 #endif
1482 ;
1483 }
1484
1485 static bool peer_pmtu_expired(struct inet_peer *peer)
1486 {
1487 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1488
1489 return orig &&
1490 time_after_eq(jiffies, orig) &&
1491 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1492 }
1493
1494 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1495 {
1496 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1497
1498 return orig &&
1499 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1500 }
1501
1502 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1503 {
1504 struct rtable *rt = (struct rtable *)dst;
1505 struct dst_entry *ret = dst;
1506
1507 if (rt) {
1508 if (dst->obsolete > 0) {
1509 ip_rt_put(rt);
1510 ret = NULL;
1511 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1512 unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1513 rt->rt_oif,
1514 rt_genid(dev_net(dst->dev)));
1515 rt_del(hash, rt);
1516 ret = NULL;
1517 } else if (rt_has_peer(rt)) {
1518 struct inet_peer *peer = rt_peer_ptr(rt);
1519 if (peer_pmtu_expired(peer))
1520 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1521 }
1522 }
1523 return ret;
1524 }
1525
1526 /*
1527 * Algorithm:
1528 * 1. The first ip_rt_redirect_number redirects are sent
1529 * with exponential backoff, then we stop sending them at all,
1530 * assuming that the host ignores our redirects.
1531 * 2. If we did not see packets requiring redirects
1532 * during ip_rt_redirect_silence, we assume that the host
1533  *	forgot the redirected route and start sending redirects again.
1534 *
1535 * This algorithm is much cheaper and more intelligent than dumb load limiting
1536 * in icmp.c.
1537 *
1538 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1539 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1540 */
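/* With the defaults above: the first redirect for a peer goes out
 * immediately; each later one waits (ip_rt_redirect_load << rate_tokens)
 * -- gaps that roughly double from ~40ms upwards -- until
 * ip_rt_redirect_number (9) have been sent, after which nothing is sent
 * until ip_rt_redirect_silence ((HZ/50) << 10, about 20 seconds) of
 * quiet resets the counter in ip_rt_send_redirect() below.
 */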
1541
1542 void ip_rt_send_redirect(struct sk_buff *skb)
1543 {
1544 struct rtable *rt = skb_rtable(skb);
1545 struct in_device *in_dev;
1546 struct inet_peer *peer;
1547 int log_martians;
1548
1549 rcu_read_lock();
1550 in_dev = __in_dev_get_rcu(rt->dst.dev);
1551 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1552 rcu_read_unlock();
1553 return;
1554 }
1555 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1556 rcu_read_unlock();
1557
1558 peer = rt_get_peer_create(rt, rt->rt_dst);
1559 if (!peer) {
1560 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1561 return;
1562 }
1563
1564 /* No redirected packets during ip_rt_redirect_silence;
1565 * reset the algorithm.
1566 */
1567 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1568 peer->rate_tokens = 0;
1569
1570 	/* Too many ignored redirects; do not send anything,
1571 	 * just set dst.rate_last to the last seen redirected packet.
1572 */
1573 if (peer->rate_tokens >= ip_rt_redirect_number) {
1574 peer->rate_last = jiffies;
1575 return;
1576 }
1577
1578 /* Check for load limit; set rate_last to the latest sent
1579 * redirect.
1580 */
1581 if (peer->rate_tokens == 0 ||
1582 time_after(jiffies,
1583 (peer->rate_last +
1584 (ip_rt_redirect_load << peer->rate_tokens)))) {
1585 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1586 peer->rate_last = jiffies;
1587 ++peer->rate_tokens;
1588 #ifdef CONFIG_IP_ROUTE_VERBOSE
1589 if (log_martians &&
1590 peer->rate_tokens == ip_rt_redirect_number)
1591 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1592 &ip_hdr(skb)->saddr, rt->rt_iif,
1593 &rt->rt_dst, &rt->rt_gateway);
1594 #endif
1595 }
1596 }
1597
1598 static int ip_error(struct sk_buff *skb)
1599 {
1600 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
1601 struct rtable *rt = skb_rtable(skb);
1602 struct inet_peer *peer;
1603 unsigned long now;
1604 struct net *net;
1605 bool send;
1606 int code;
1607
1608 net = dev_net(rt->dst.dev);
1609 if (!IN_DEV_FORWARD(in_dev)) {
1610 switch (rt->dst.error) {
1611 case EHOSTUNREACH:
1612 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
1613 break;
1614
1615 case ENETUNREACH:
1616 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1617 break;
1618 }
1619 goto out;
1620 }
1621
1622 switch (rt->dst.error) {
1623 case EINVAL:
1624 default:
1625 goto out;
1626 case EHOSTUNREACH:
1627 code = ICMP_HOST_UNREACH;
1628 break;
1629 case ENETUNREACH:
1630 code = ICMP_NET_UNREACH;
1631 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1632 break;
1633 case EACCES:
1634 code = ICMP_PKT_FILTERED;
1635 break;
1636 }
1637
1638 peer = rt_get_peer_create(rt, rt->rt_dst);
1639
1640 send = true;
1641 if (peer) {
1642 now = jiffies;
1643 peer->rate_tokens += now - peer->rate_last;
1644 if (peer->rate_tokens > ip_rt_error_burst)
1645 peer->rate_tokens = ip_rt_error_burst;
1646 peer->rate_last = now;
1647 if (peer->rate_tokens >= ip_rt_error_cost)
1648 peer->rate_tokens -= ip_rt_error_cost;
1649 else
1650 send = false;
1651 }
1652 if (send)
1653 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1654
1655 out: kfree_skb(skb);
1656 return 0;
1657 }
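/* The peer's rate_tokens above form a simple token bucket: one token
 * accrues per jiffy, capped at ip_rt_error_burst (5 * HZ), and each
 * ICMP_DEST_UNREACH costs ip_rt_error_cost (HZ) -- so a given peer gets
 * at most about one error per second sustained, with a burst of five.
 */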
1658
1659 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1660 {
1661 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1662
1663 if (!expires)
1664 return;
1665 if (time_before(jiffies, expires)) {
1666 u32 orig_dst_mtu = dst_mtu(dst);
1667 if (peer->pmtu_learned < orig_dst_mtu) {
1668 if (!peer->pmtu_orig)
1669 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1670 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1671 }
1672 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1673 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1674 }
1675
1676 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1677 {
1678 struct rtable *rt = (struct rtable *) dst;
1679 struct inet_peer *peer;
1680
1681 dst_confirm(dst);
1682
1683 peer = rt_get_peer_create(rt, rt->rt_dst);
1684 if (peer) {
1685 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1686
1687 if (mtu < ip_rt_min_pmtu)
1688 mtu = ip_rt_min_pmtu;
1689 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1690
1691 pmtu_expires = jiffies + ip_rt_mtu_expires;
1692 if (!pmtu_expires)
1693 pmtu_expires = 1UL;
1694
1695 peer->pmtu_learned = mtu;
1696 peer->pmtu_expires = pmtu_expires;
1697
1698 atomic_inc(&__rt_peer_genid);
1699 rt->rt_peer_genid = rt_peer_genid();
1700 }
1701 check_peer_pmtu(dst, peer);
1702 }
1703 }
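/* Example: a learned PMTU is clamped to at least ip_rt_min_pmtu
 * (512 + 20 + 20 = 552 bytes) and remembered for ip_rt_mtu_expires
 * (10 minutes); once that expires, check_peer_pmtu() above restores the
 * original metric from peer->pmtu_orig.
 */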
1704
1705 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1706 int oif, u32 mark, u8 protocol, int flow_flags)
1707 {
1708 const struct iphdr *iph = (const struct iphdr *)skb->data;
1709 struct flowi4 fl4;
1710 struct rtable *rt;
1711
1712 flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
1713 protocol, flow_flags | FLOWI_FLAG_PRECOW_METRICS,
1714 iph->daddr, iph->saddr, 0, 0);
1715 rt = __ip_route_output_key(net, &fl4);
1716 if (!IS_ERR(rt)) {
1717 ip_rt_update_pmtu(&rt->dst, mtu);
1718 ip_rt_put(rt);
1719 }
1720 }
1721 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1722
1723 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1724 {
1725 const struct inet_sock *inet = inet_sk(sk);
1726
1727 return ipv4_update_pmtu(skb, sock_net(sk), mtu,
1728 sk->sk_bound_dev_if, sk->sk_mark,
1729 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1730 inet_sk_flowi_flags(sk));
1731 }
1732 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1733
1734 static void ipv4_validate_peer(struct rtable *rt)
1735 {
1736 if (rt->rt_peer_genid != rt_peer_genid()) {
1737 struct inet_peer *peer = rt_get_peer(rt, rt->rt_dst);
1738
1739 if (peer) {
1740 check_peer_pmtu(&rt->dst, peer);
1741
1742 if (peer->redirect_learned.a4 &&
1743 peer->redirect_learned.a4 != rt->rt_gateway)
1744 check_peer_redir(&rt->dst, peer);
1745 }
1746
1747 rt->rt_peer_genid = rt_peer_genid();
1748 }
1749 }
1750
1751 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1752 {
1753 struct rtable *rt = (struct rtable *) dst;
1754
1755 if (rt_is_expired(rt))
1756 return NULL;
1757 ipv4_validate_peer(rt);
1758 return dst;
1759 }
1760
1761 static void ipv4_dst_destroy(struct dst_entry *dst)
1762 {
1763 struct rtable *rt = (struct rtable *) dst;
1764
1765 if (rt->fi) {
1766 fib_info_put(rt->fi);
1767 rt->fi = NULL;
1768 }
1769 if (rt_has_peer(rt)) {
1770 struct inet_peer *peer = rt_peer_ptr(rt);
1771 inet_putpeer(peer);
1772 }
1773 }
1774
1775
1776 static void ipv4_link_failure(struct sk_buff *skb)
1777 {
1778 struct rtable *rt;
1779
1780 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1781
1782 rt = skb_rtable(skb);
1783 if (rt && rt_has_peer(rt)) {
1784 struct inet_peer *peer = rt_peer_ptr(rt);
1785 if (peer_pmtu_cleaned(peer))
1786 dst_metric_set(&rt->dst, RTAX_MTU, peer->pmtu_orig);
1787 }
1788 }
1789
1790 static int ip_rt_bug(struct sk_buff *skb)
1791 {
1792 pr_debug("%s: %pI4 -> %pI4, %s\n",
1793 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1794 skb->dev ? skb->dev->name : "?");
1795 kfree_skb(skb);
1796 WARN_ON(1);
1797 return 0;
1798 }
1799
1800 /*
1801    We do not cache the source address of the outgoing interface,
1802    because it is used only by the IP RR, TS and SRR options,
1803    so it is out of the fast path.
1804
1805    BTW remember: "addr" is allowed to be unaligned
1806    in IP options!
1807 */
1808
1809 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1810 {
1811 __be32 src;
1812
1813 if (rt_is_output_route(rt))
1814 src = ip_hdr(skb)->saddr;
1815 else {
1816 struct fib_result res;
1817 struct flowi4 fl4;
1818 struct iphdr *iph;
1819
1820 iph = ip_hdr(skb);
1821
1822 memset(&fl4, 0, sizeof(fl4));
1823 fl4.daddr = iph->daddr;
1824 fl4.saddr = iph->saddr;
1825 fl4.flowi4_tos = RT_TOS(iph->tos);
1826 fl4.flowi4_oif = rt->dst.dev->ifindex;
1827 fl4.flowi4_iif = skb->dev->ifindex;
1828 fl4.flowi4_mark = skb->mark;
1829
1830 rcu_read_lock();
1831 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1832 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1833 else
1834 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1835 RT_SCOPE_UNIVERSE);
1836 rcu_read_unlock();
1837 }
1838 memcpy(addr, &src, 4);
1839 }
1840
1841 #ifdef CONFIG_IP_ROUTE_CLASSID
1842 static void set_class_tag(struct rtable *rt, u32 tag)
1843 {
1844 if (!(rt->dst.tclassid & 0xFFFF))
1845 rt->dst.tclassid |= tag & 0xFFFF;
1846 if (!(rt->dst.tclassid & 0xFFFF0000))
1847 rt->dst.tclassid |= tag & 0xFFFF0000;
1848 }
1849 #endif
1850
1851 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1852 {
1853 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1854
1855 if (advmss == 0) {
1856 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1857 ip_rt_min_advmss);
1858 if (advmss > 65535 - 40)
1859 advmss = 65535 - 40;
1860 }
1861 return advmss;
1862 }
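/* E.g. on a 1500 byte MTU device with no explicit RTAX_ADVMSS metric
 * this yields 1460 (MTU minus 40 bytes of IPv4 + TCP headers), never
 * less than ip_rt_min_advmss (256) nor more than 65535 - 40.
 */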
1863
1864 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1865 {
1866 const struct rtable *rt = (const struct rtable *) dst;
1867 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1868
1869 if (mtu && rt_is_output_route(rt))
1870 return mtu;
1871
1872 mtu = dst->dev->mtu;
1873
1874 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1875
1876 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1877 mtu = 576;
1878 }
1879
1880 if (mtu > IP_MAX_MTU)
1881 mtu = IP_MAX_MTU;
1882
1883 return mtu;
1884 }
1885
1886 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1887 struct fib_info *fi)
1888 {
1889 struct inet_peer_base *base;
1890 struct inet_peer *peer;
1891 int create = 0;
1892
1893 /* If a peer entry exists for this destination, we must hook
1894 * it up in order to get at cached metrics.
1895 */
1896 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1897 create = 1;
1898
1899 base = inetpeer_base_ptr(rt->_peer);
1900 BUG_ON(!base);
1901
1902 peer = inet_getpeer_v4(base, rt->rt_dst, create);
1903 if (peer) {
1904 __rt_set_peer(rt, peer);
1905 rt->rt_peer_genid = rt_peer_genid();
1906 if (inet_metrics_new(peer))
1907 memcpy(peer->metrics, fi->fib_metrics,
1908 sizeof(u32) * RTAX_MAX);
1909 dst_init_metrics(&rt->dst, peer->metrics, false);
1910
1911 check_peer_pmtu(&rt->dst, peer);
1912
1913 if (peer->redirect_learned.a4 &&
1914 peer->redirect_learned.a4 != rt->rt_gateway) {
1915 rt->rt_gateway = peer->redirect_learned.a4;
1916 rt->rt_flags |= RTCF_REDIRECTED;
1917 }
1918 } else {
1919 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1920 rt->fi = fi;
1921 atomic_inc(&fi->fib_clntref);
1922 }
1923 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1924 }
1925 }
1926
1927 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1928 const struct fib_result *res,
1929 struct fib_info *fi, u16 type, u32 itag)
1930 {
1931 struct dst_entry *dst = &rt->dst;
1932
1933 if (fi) {
1934 if (FIB_RES_GW(*res) &&
1935 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1936 rt->rt_gateway = FIB_RES_GW(*res);
1937 rt_init_metrics(rt, fl4, fi);
1938 #ifdef CONFIG_IP_ROUTE_CLASSID
1939 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1940 #endif
1941 }
1942
1943 if (dst_mtu(dst) > IP_MAX_MTU)
1944 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1945
1946 #ifdef CONFIG_IP_ROUTE_CLASSID
1947 #ifdef CONFIG_IP_MULTIPLE_TABLES
1948 set_class_tag(rt, fib_rules_tclass(res));
1949 #endif
1950 set_class_tag(rt, itag);
1951 #endif
1952 }
1953
1954 static struct rtable *rt_dst_alloc(struct net_device *dev,
1955 bool nopolicy, bool noxfrm)
1956 {
1957 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1958 DST_HOST |
1959 (nopolicy ? DST_NOPOLICY : 0) |
1960 (noxfrm ? DST_NOXFRM : 0));
1961 }
1962
1963 /* called in rcu_read_lock() section */
1964 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1965 u8 tos, struct net_device *dev, int our)
1966 {
1967 unsigned int hash;
1968 struct rtable *rth;
1969 struct in_device *in_dev = __in_dev_get_rcu(dev);
1970 u32 itag = 0;
1971 int err;
1972
1973 /* Primary sanity checks. */
1974
1975 if (in_dev == NULL)
1976 return -EINVAL;
1977
1978 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1979 skb->protocol != htons(ETH_P_IP))
1980 goto e_inval;
1981
1982 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1983 if (ipv4_is_loopback(saddr))
1984 goto e_inval;
1985
1986 if (ipv4_is_zeronet(saddr)) {
1987 if (!ipv4_is_local_multicast(daddr))
1988 goto e_inval;
1989 } else {
1990 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1991 in_dev, &itag);
1992 if (err < 0)
1993 goto e_err;
1994 }
1995 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1996 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1997 if (!rth)
1998 goto e_nobufs;
1999
2000 #ifdef CONFIG_IP_ROUTE_CLASSID
2001 rth->dst.tclassid = itag;
2002 #endif
2003 rth->dst.output = ip_rt_bug;
2004
2005 rth->rt_key_dst = daddr;
2006 rth->rt_key_src = saddr;
2007 rth->rt_genid = rt_genid(dev_net(dev));
2008 rth->rt_flags = RTCF_MULTICAST;
2009 rth->rt_type = RTN_MULTICAST;
2010 rth->rt_key_tos = tos;
2011 rth->rt_dst = daddr;
2012 rth->rt_src = saddr;
2013 rth->rt_route_iif = dev->ifindex;
2014 rth->rt_iif = dev->ifindex;
2015 rth->rt_oif = 0;
2016 rth->rt_mark = skb->mark;
2017 rth->rt_gateway = daddr;
2018 rth->rt_peer_genid = 0;
2019 rt_init_peer(rth, dev_net(dev)->ipv4.peers);
2020 rth->fi = NULL;
2021 if (our) {
2022 rth->dst.input = ip_local_deliver;
2023 rth->rt_flags |= RTCF_LOCAL;
2024 }
2025
2026 #ifdef CONFIG_IP_MROUTE
2027 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2028 rth->dst.input = ip_mr_input;
2029 #endif
2030 RT_CACHE_STAT_INC(in_slow_mc);
2031
2032 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2033 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2034 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2035
2036 e_nobufs:
2037 return -ENOBUFS;
2038 e_inval:
2039 return -EINVAL;
2040 e_err:
2041 return err;
2042 }
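/*
 * Editor's note (illustrative sketch, not part of route.c): the sanity
 * checks at the top of ip_route_input_mc() rely on the address-class
 * helpers from <linux/in.h>.  A condensed, hypothetical restatement of
 * that source filtering:
 */
static bool __maybe_unused example_mc_source_ok(__be32 saddr, __be32 daddr)
{
	/* the source must not itself be multicast or limited broadcast */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		return false;
	/* a zeronet source is tolerated only for link-local multicast (224.0.0.0/24) */
	if (ipv4_is_zeronet(saddr) && !ipv4_is_local_multicast(daddr))
		return false;
	return true;
}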
2043
2044
2045 static void ip_handle_martian_source(struct net_device *dev,
2046 struct in_device *in_dev,
2047 struct sk_buff *skb,
2048 __be32 daddr,
2049 __be32 saddr)
2050 {
2051 RT_CACHE_STAT_INC(in_martian_src);
2052 #ifdef CONFIG_IP_ROUTE_VERBOSE
2053 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2054 /*
2055 * Per RFC 1812, when the source address is martian,
2056 * the only useful hint we can log is the MAC header.
2057 */
2058 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2059 &daddr, &saddr, dev->name);
2060 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2061 print_hex_dump(KERN_WARNING, "ll header: ",
2062 DUMP_PREFIX_OFFSET, 16, 1,
2063 skb_mac_header(skb),
2064 dev->hard_header_len, true);
2065 }
2066 }
2067 #endif
2068 }
2069
2070 /* called in rcu_read_lock() section */
2071 static int __mkroute_input(struct sk_buff *skb,
2072 const struct fib_result *res,
2073 struct in_device *in_dev,
2074 __be32 daddr, __be32 saddr, u32 tos,
2075 struct rtable **result)
2076 {
2077 struct rtable *rth;
2078 int err;
2079 struct in_device *out_dev;
2080 unsigned int flags = 0;
2081 u32 itag;
2082
2083 /* get a working reference to the output device */
2084 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2085 if (out_dev == NULL) {
2086 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
2087 return -EINVAL;
2088 }
2089
2090
2091 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2092 in_dev->dev, in_dev, &itag);
2093 if (err < 0) {
2094 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2095 saddr);
2096
2097 goto cleanup;
2098 }
2099
2100 if (err)
2101 flags |= RTCF_DIRECTSRC;
2102
2103 if (out_dev == in_dev && err &&
2104 (IN_DEV_SHARED_MEDIA(out_dev) ||
2105 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2106 flags |= RTCF_DOREDIRECT;
2107
2108 if (skb->protocol != htons(ETH_P_IP)) {
2109 /* Not IP (i.e. ARP). Do not create a route if it is
2110 * invalid for proxy ARP. DNAT routes are always valid.
2111 *
2112 * The proxy ARP feature has been extended to allow ARP
2113 * replies back out the same interface, to support
2114 * Private VLAN switch technologies. See arp.c.
2115 */
2116 if (out_dev == in_dev &&
2117 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2118 err = -EINVAL;
2119 goto cleanup;
2120 }
2121 }
2122
2123 rth = rt_dst_alloc(out_dev->dev,
2124 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2125 IN_DEV_CONF_GET(out_dev, NOXFRM));
2126 if (!rth) {
2127 err = -ENOBUFS;
2128 goto cleanup;
2129 }
2130
2131 rth->rt_key_dst = daddr;
2132 rth->rt_key_src = saddr;
2133 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2134 rth->rt_flags = flags;
2135 rth->rt_type = res->type;
2136 rth->rt_key_tos = tos;
2137 rth->rt_dst = daddr;
2138 rth->rt_src = saddr;
2139 rth->rt_route_iif = in_dev->dev->ifindex;
2140 rth->rt_iif = in_dev->dev->ifindex;
2141 rth->rt_oif = 0;
2142 rth->rt_mark = skb->mark;
2143 rth->rt_gateway = daddr;
2144 rth->rt_peer_genid = 0;
2145 rt_init_peer(rth, &res->table->tb_peers);
2146 rth->fi = NULL;
2147
2148 rth->dst.input = ip_forward;
2149 rth->dst.output = ip_output;
2150
2151 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2152
2153 *result = rth;
2154 err = 0;
2155 cleanup:
2156 return err;
2157 }
2158
2159 static int ip_mkroute_input(struct sk_buff *skb,
2160 struct fib_result *res,
2161 const struct flowi4 *fl4,
2162 struct in_device *in_dev,
2163 __be32 daddr, __be32 saddr, u32 tos)
2164 {
2165 struct rtable *rth = NULL;
2166 int err;
2167 unsigned int hash;
2168
2169 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2170 if (res->fi && res->fi->fib_nhs > 1)
2171 fib_select_multipath(res);
2172 #endif
2173
2174 /* create a routing cache entry */
2175 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2176 if (err)
2177 return err;
2178
2179 /* put it into the cache */
2180 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2181 rt_genid(dev_net(rth->dst.dev)));
2182 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2183 if (IS_ERR(rth))
2184 return PTR_ERR(rth);
2185 return 0;
2186 }
2187
2188 /*
2189 * NOTE. We drop all packets that have a local source
2190 * address, because every properly looped-back packet
2191 * must already have the correct destination attached by the output routine.
2192 *
2193 * This approach solves two big problems:
2194 * 1. Non-simplex devices are handled properly.
2195 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2196 * called with rcu_read_lock()
2197 */
2198
2199 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2200 u8 tos, struct net_device *dev)
2201 {
2202 struct fib_result res;
2203 struct in_device *in_dev = __in_dev_get_rcu(dev);
2204 struct flowi4 fl4;
2205 unsigned int flags = 0;
2206 u32 itag = 0;
2207 struct rtable *rth;
2208 unsigned int hash;
2209 int err = -EINVAL;
2210 struct net *net = dev_net(dev);
2211
2212 /* IP on this device is disabled. */
2213
2214 if (!in_dev)
2215 goto out;
2216
2217 /* Check for the most obviously bogus (martian) addresses, which
2218 fib_lookup cannot detect.
2219 */
2220
2221 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2222 goto martian_source;
2223
2224 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2225 goto brd_input;
2226
2227 /* Accept zero addresses only for limited broadcast;
2228 * it is unclear whether this should be fixed. Waiting for complaints :-)
2229 */
2230 if (ipv4_is_zeronet(saddr))
2231 goto martian_source;
2232
2233 if (ipv4_is_zeronet(daddr))
2234 goto martian_destination;
2235
2236 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
2237 if (ipv4_is_loopback(daddr))
2238 goto martian_destination;
2239
2240 if (ipv4_is_loopback(saddr))
2241 goto martian_source;
2242 }
2243
2244 /*
2245 * Now we are ready to route packet.
2246 */
2247 fl4.flowi4_oif = 0;
2248 fl4.flowi4_iif = dev->ifindex;
2249 fl4.flowi4_mark = skb->mark;
2250 fl4.flowi4_tos = tos;
2251 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2252 fl4.daddr = daddr;
2253 fl4.saddr = saddr;
2254 err = fib_lookup(net, &fl4, &res);
2255 if (err != 0)
2256 goto no_route;
2257
2258 RT_CACHE_STAT_INC(in_slow_tot);
2259
2260 if (res.type == RTN_BROADCAST)
2261 goto brd_input;
2262
2263 if (res.type == RTN_LOCAL) {
2264 err = fib_validate_source(skb, saddr, daddr, tos,
2265 net->loopback_dev->ifindex,
2266 dev, in_dev, &itag);
2267 if (err < 0)
2268 goto martian_source_keep_err;
2269 if (err)
2270 flags |= RTCF_DIRECTSRC;
2271 goto local_input;
2272 }
2273
2274 if (!IN_DEV_FORWARD(in_dev))
2275 goto no_route;
2276 if (res.type != RTN_UNICAST)
2277 goto martian_destination;
2278
2279 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2280 out: return err;
2281
2282 brd_input:
2283 if (skb->protocol != htons(ETH_P_IP))
2284 goto e_inval;
2285
2286 if (!ipv4_is_zeronet(saddr)) {
2287 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2288 in_dev, &itag);
2289 if (err < 0)
2290 goto martian_source_keep_err;
2291 if (err)
2292 flags |= RTCF_DIRECTSRC;
2293 }
2294 flags |= RTCF_BROADCAST;
2295 res.type = RTN_BROADCAST;
2296 RT_CACHE_STAT_INC(in_brd);
2297
2298 local_input:
2299 rth = rt_dst_alloc(net->loopback_dev,
2300 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2301 if (!rth)
2302 goto e_nobufs;
2303
2304 rth->dst.input = ip_local_deliver;
2305 rth->dst.output = ip_rt_bug;
2306 #ifdef CONFIG_IP_ROUTE_CLASSID
2307 rth->dst.tclassid = itag;
2308 #endif
2309
2310 rth->rt_key_dst = daddr;
2311 rth->rt_key_src = saddr;
2312 rth->rt_genid = rt_genid(net);
2313 rth->rt_flags = flags|RTCF_LOCAL;
2314 rth->rt_type = res.type;
2315 rth->rt_key_tos = tos;
2316 rth->rt_dst = daddr;
2317 rth->rt_src = saddr;
2318 rth->rt_route_iif = dev->ifindex;
2319 rth->rt_iif = dev->ifindex;
2320 rth->rt_oif = 0;
2321 rth->rt_mark = skb->mark;
2322 rth->rt_gateway = daddr;
2323 rth->rt_peer_genid = 0;
2324 rt_init_peer(rth, net->ipv4.peers);
2325 rth->fi = NULL;
2326 if (res.type == RTN_UNREACHABLE) {
2327 rth->dst.input = ip_error;
2328 rth->dst.error = -err;
2329 rth->rt_flags &= ~RTCF_LOCAL;
2330 }
2331 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2332 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2333 err = 0;
2334 if (IS_ERR(rth))
2335 err = PTR_ERR(rth);
2336 goto out;
2337
2338 no_route:
2339 RT_CACHE_STAT_INC(in_no_route);
2340 res.type = RTN_UNREACHABLE;
2341 if (err == -ESRCH)
2342 err = -ENETUNREACH;
2343 goto local_input;
2344
2345 /*
2346 * Do not cache martian addresses: they should be logged (RFC1812)
2347 */
2348 martian_destination:
2349 RT_CACHE_STAT_INC(in_martian_dst);
2350 #ifdef CONFIG_IP_ROUTE_VERBOSE
2351 if (IN_DEV_LOG_MARTIANS(in_dev))
2352 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2353 &daddr, &saddr, dev->name);
2354 #endif
2355
2356 e_inval:
2357 err = -EINVAL;
2358 goto out;
2359
2360 e_nobufs:
2361 err = -ENOBUFS;
2362 goto out;
2363
2364 martian_source:
2365 err = -EINVAL;
2366 martian_source_keep_err:
2367 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2368 goto out;
2369 }
2370
2371 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2372 u8 tos, struct net_device *dev, bool noref)
2373 {
2374 struct rtable *rth;
2375 unsigned int hash;
2376 int iif = dev->ifindex;
2377 struct net *net;
2378 int res;
2379
2380 net = dev_net(dev);
2381
2382 rcu_read_lock();
2383
2384 if (!rt_caching(net))
2385 goto skip_cache;
2386
2387 tos &= IPTOS_RT_MASK;
2388 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2389
2390 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2391 rth = rcu_dereference(rth->dst.rt_next)) {
2392 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2393 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2394 (rth->rt_route_iif ^ iif) |
2395 (rth->rt_key_tos ^ tos)) == 0 &&
2396 rth->rt_mark == skb->mark &&
2397 net_eq(dev_net(rth->dst.dev), net) &&
2398 !rt_is_expired(rth)) {
2399 ipv4_validate_peer(rth);
2400 if (noref) {
2401 dst_use_noref(&rth->dst, jiffies);
2402 skb_dst_set_noref(skb, &rth->dst);
2403 } else {
2404 dst_use(&rth->dst, jiffies);
2405 skb_dst_set(skb, &rth->dst);
2406 }
2407 RT_CACHE_STAT_INC(in_hit);
2408 rcu_read_unlock();
2409 return 0;
2410 }
2411 RT_CACHE_STAT_INC(in_hlist_search);
2412 }
2413
2414 skip_cache:
2415 /* Multicast recognition logic has been moved from the route cache to here.
2416 The problem was that too many Ethernet cards have broken/missing
2417 hardware multicast filters :-( As a result, a host on a multicast
2418 network acquires a lot of useless route cache entries, e.g. from
2419 SDR messages from all over the world. Now we try to get rid of them.
2420 Really, provided the software IP multicast filter is organized
2421 reasonably (at least, hashed), it does not result in a slowdown
2422 compared with route cache reject entries.
2423 Note that multicast routers are not affected, because
2424 a route cache entry is created eventually.
2425 */
2426 if (ipv4_is_multicast(daddr)) {
2427 struct in_device *in_dev = __in_dev_get_rcu(dev);
2428
2429 if (in_dev) {
2430 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2431 ip_hdr(skb)->protocol);
2432 if (our
2433 #ifdef CONFIG_IP_MROUTE
2434 ||
2435 (!ipv4_is_local_multicast(daddr) &&
2436 IN_DEV_MFORWARD(in_dev))
2437 #endif
2438 ) {
2439 int res = ip_route_input_mc(skb, daddr, saddr,
2440 tos, dev, our);
2441 rcu_read_unlock();
2442 return res;
2443 }
2444 }
2445 rcu_read_unlock();
2446 return -EINVAL;
2447 }
2448 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2449 rcu_read_unlock();
2450 return res;
2451 }
2452 EXPORT_SYMBOL(ip_route_input_common);
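/*
 * Editor's note (illustrative sketch, not part of route.c): receive-path
 * callers reach ip_route_input_common() via the ip_route_input() /
 * ip_route_input_noref() wrappers in <net/route.h>.  On success the route is
 * attached to the skb and can be read back with skb_rtable().  The helper
 * name is hypothetical.
 */
static int __maybe_unused example_input_lookup(struct sk_buff *skb,
					       struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err;

	err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev);
	if (err)
		return err;			/* e.g. -EINVAL for martians */

	return skb_rtable(skb)->rt_type;	/* RTN_LOCAL, RTN_UNICAST, ... */
}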
2453
2454 /* called with rcu_read_lock() */
2455 static struct rtable *__mkroute_output(const struct fib_result *res,
2456 const struct flowi4 *fl4,
2457 __be32 orig_daddr, __be32 orig_saddr,
2458 int orig_oif, __u8 orig_rtos,
2459 struct net_device *dev_out,
2460 unsigned int flags)
2461 {
2462 struct fib_info *fi = res->fi;
2463 struct in_device *in_dev;
2464 u16 type = res->type;
2465 struct rtable *rth;
2466
2467 in_dev = __in_dev_get_rcu(dev_out);
2468 if (!in_dev)
2469 return ERR_PTR(-EINVAL);
2470
2471 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2472 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2473 return ERR_PTR(-EINVAL);
2474
2475 if (ipv4_is_lbcast(fl4->daddr))
2476 type = RTN_BROADCAST;
2477 else if (ipv4_is_multicast(fl4->daddr))
2478 type = RTN_MULTICAST;
2479 else if (ipv4_is_zeronet(fl4->daddr))
2480 return ERR_PTR(-EINVAL);
2481
2482 if (dev_out->flags & IFF_LOOPBACK)
2483 flags |= RTCF_LOCAL;
2484
2485 if (type == RTN_BROADCAST) {
2486 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2487 fi = NULL;
2488 } else if (type == RTN_MULTICAST) {
2489 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2490 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2491 fl4->flowi4_proto))
2492 flags &= ~RTCF_LOCAL;
2493 /* If a multicast route does not exist, use the
2494 * default one, but do not use a gateway in this case.
2495 * Yes, it is a hack.
2496 */
2497 if (fi && res->prefixlen < 4)
2498 fi = NULL;
2499 }
2500
2501 rth = rt_dst_alloc(dev_out,
2502 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2503 IN_DEV_CONF_GET(in_dev, NOXFRM));
2504 if (!rth)
2505 return ERR_PTR(-ENOBUFS);
2506
2507 rth->dst.output = ip_output;
2508
2509 rth->rt_key_dst = orig_daddr;
2510 rth->rt_key_src = orig_saddr;
2511 rth->rt_genid = rt_genid(dev_net(dev_out));
2512 rth->rt_flags = flags;
2513 rth->rt_type = type;
2514 rth->rt_key_tos = orig_rtos;
2515 rth->rt_dst = fl4->daddr;
2516 rth->rt_src = fl4->saddr;
2517 rth->rt_route_iif = 0;
2518 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2519 rth->rt_oif = orig_oif;
2520 rth->rt_mark = fl4->flowi4_mark;
2521 rth->rt_gateway = fl4->daddr;
2522 rth->rt_peer_genid = 0;
2523 rt_init_peer(rth, (res->table ?
2524 &res->table->tb_peers :
2525 dev_net(dev_out)->ipv4.peers));
2526 rth->fi = NULL;
2527
2528 RT_CACHE_STAT_INC(out_slow_tot);
2529
2530 if (flags & RTCF_LOCAL)
2531 rth->dst.input = ip_local_deliver;
2532 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2533 if (flags & RTCF_LOCAL &&
2534 !(dev_out->flags & IFF_LOOPBACK)) {
2535 rth->dst.output = ip_mc_output;
2536 RT_CACHE_STAT_INC(out_slow_mc);
2537 }
2538 #ifdef CONFIG_IP_MROUTE
2539 if (type == RTN_MULTICAST) {
2540 if (IN_DEV_MFORWARD(in_dev) &&
2541 !ipv4_is_local_multicast(fl4->daddr)) {
2542 rth->dst.input = ip_mr_input;
2543 rth->dst.output = ip_mc_output;
2544 }
2545 }
2546 #endif
2547 }
2548
2549 rt_set_nexthop(rth, fl4, res, fi, type, 0);
2550
2551 if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
2552 rth->dst.flags |= DST_NOCACHE;
2553
2554 return rth;
2555 }
2556
2557 /*
2558 * Major route resolver routine.
2559 * called with rcu_read_lock();
2560 */
2561
2562 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2563 {
2564 struct net_device *dev_out = NULL;
2565 __u8 tos = RT_FL_TOS(fl4);
2566 unsigned int flags = 0;
2567 struct fib_result res;
2568 struct rtable *rth;
2569 __be32 orig_daddr;
2570 __be32 orig_saddr;
2571 int orig_oif;
2572
2573 res.fi = NULL;
2574 res.table = NULL;
2575 #ifdef CONFIG_IP_MULTIPLE_TABLES
2576 res.r = NULL;
2577 #endif
2578
2579 orig_daddr = fl4->daddr;
2580 orig_saddr = fl4->saddr;
2581 orig_oif = fl4->flowi4_oif;
2582
2583 fl4->flowi4_iif = net->loopback_dev->ifindex;
2584 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2585 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2586 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2587
2588 rcu_read_lock();
2589 if (fl4->saddr) {
2590 rth = ERR_PTR(-EINVAL);
2591 if (ipv4_is_multicast(fl4->saddr) ||
2592 ipv4_is_lbcast(fl4->saddr) ||
2593 ipv4_is_zeronet(fl4->saddr))
2594 goto out;
2595
2596 /* I removed the check for oif == dev_out->oif here.
2597 It was wrong for two reasons:
2598 1. ip_dev_find(net, saddr) can return the wrong interface if saddr
2599 is assigned to multiple interfaces.
2600 2. Moreover, we are allowed to send packets with the saddr
2601 of another interface. --ANK
2602 */
2603
2604 if (fl4->flowi4_oif == 0 &&
2605 (ipv4_is_multicast(fl4->daddr) ||
2606 ipv4_is_lbcast(fl4->daddr))) {
2607 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2608 dev_out = __ip_dev_find(net, fl4->saddr, false);
2609 if (dev_out == NULL)
2610 goto out;
2611
2612 /* Special hack: the user can direct multicasts
2613 and limited broadcasts via the desired interface
2614 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2615 This hack is not just for fun; it allows
2616 vic, vat and friends to work.
2617 They bind a socket to loopback, set the TTL to zero
2618 and expect that it will work.
2619 From the viewpoint of the routing cache they are broken,
2620 because we are not allowed to build a multicast path
2621 with a loopback source address (the routing cache
2622 cannot know that the TTL is zero, so that the packet
2623 will never leave this host and the route is valid).
2624 Luckily, this hack is a good workaround.
2625 */
2626
2627 fl4->flowi4_oif = dev_out->ifindex;
2628 goto make_route;
2629 }
2630
2631 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2632 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2633 if (!__ip_dev_find(net, fl4->saddr, false))
2634 goto out;
2635 }
2636 }
2637
2638
2639 if (fl4->flowi4_oif) {
2640 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2641 rth = ERR_PTR(-ENODEV);
2642 if (dev_out == NULL)
2643 goto out;
2644
2645 /* RACE: Check return value of inet_select_addr instead. */
2646 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2647 rth = ERR_PTR(-ENETUNREACH);
2648 goto out;
2649 }
2650 if (ipv4_is_local_multicast(fl4->daddr) ||
2651 ipv4_is_lbcast(fl4->daddr)) {
2652 if (!fl4->saddr)
2653 fl4->saddr = inet_select_addr(dev_out, 0,
2654 RT_SCOPE_LINK);
2655 goto make_route;
2656 }
2657 if (fl4->saddr) {
2658 if (ipv4_is_multicast(fl4->daddr))
2659 fl4->saddr = inet_select_addr(dev_out, 0,
2660 fl4->flowi4_scope);
2661 else if (!fl4->daddr)
2662 fl4->saddr = inet_select_addr(dev_out, 0,
2663 RT_SCOPE_HOST);
2664 }
2665 }
2666
2667 if (!fl4->daddr) {
2668 fl4->daddr = fl4->saddr;
2669 if (!fl4->daddr)
2670 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2671 dev_out = net->loopback_dev;
2672 fl4->flowi4_oif = net->loopback_dev->ifindex;
2673 res.type = RTN_LOCAL;
2674 flags |= RTCF_LOCAL;
2675 goto make_route;
2676 }
2677
2678 if (fib_lookup(net, fl4, &res)) {
2679 res.fi = NULL;
2680 res.table = NULL;
2681 if (fl4->flowi4_oif) {
2682 /* Apparently, the routing tables are wrong. Assume
2683 that the destination is on-link.
2684
2685 WHY? DW.
2686 Because we are allowed to send to an interface
2687 even if it has NO routes and NO assigned
2688 addresses. When oif is specified, the routing
2689 tables are looked up with only one purpose:
2690 to determine whether the destination is gatewayed, rather than
2691 direct. Moreover, if MSG_DONTROUTE is set,
2692 we send the packet, ignoring both routing tables
2693 and ifaddr state. --ANK
2694
2695
2696 We could do this even if oif is unknown,
2697 as IPv6 likely does, but we do not.
2698 */
2699
2700 if (fl4->saddr == 0)
2701 fl4->saddr = inet_select_addr(dev_out, 0,
2702 RT_SCOPE_LINK);
2703 res.type = RTN_UNICAST;
2704 goto make_route;
2705 }
2706 rth = ERR_PTR(-ENETUNREACH);
2707 goto out;
2708 }
2709
2710 if (res.type == RTN_LOCAL) {
2711 if (!fl4->saddr) {
2712 if (res.fi->fib_prefsrc)
2713 fl4->saddr = res.fi->fib_prefsrc;
2714 else
2715 fl4->saddr = fl4->daddr;
2716 }
2717 dev_out = net->loopback_dev;
2718 fl4->flowi4_oif = dev_out->ifindex;
2719 res.fi = NULL;
2720 flags |= RTCF_LOCAL;
2721 goto make_route;
2722 }
2723
2724 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2725 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2726 fib_select_multipath(&res);
2727 else
2728 #endif
2729 if (!res.prefixlen &&
2730 res.table->tb_num_default > 1 &&
2731 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2732 fib_select_default(&res);
2733
2734 if (!fl4->saddr)
2735 fl4->saddr = FIB_RES_PREFSRC(net, res);
2736
2737 dev_out = FIB_RES_DEV(res);
2738 fl4->flowi4_oif = dev_out->ifindex;
2739
2740
2741 make_route:
2742 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2743 tos, dev_out, flags);
2744 if (!IS_ERR(rth)) {
2745 unsigned int hash;
2746
2747 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2748 rt_genid(dev_net(dev_out)));
2749 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2750 }
2751
2752 out:
2753 rcu_read_unlock();
2754 return rth;
2755 }
2756
2757 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2758 {
2759 struct rtable *rth;
2760 unsigned int hash;
2761
2762 if (!rt_caching(net))
2763 goto slow_output;
2764
2765 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2766
2767 rcu_read_lock_bh();
2768 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2769 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2770 if (rth->rt_key_dst == flp4->daddr &&
2771 rth->rt_key_src == flp4->saddr &&
2772 rt_is_output_route(rth) &&
2773 rth->rt_oif == flp4->flowi4_oif &&
2774 rth->rt_mark == flp4->flowi4_mark &&
2775 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2776 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2777 net_eq(dev_net(rth->dst.dev), net) &&
2778 !rt_is_expired(rth)) {
2779 ipv4_validate_peer(rth);
2780 dst_use(&rth->dst, jiffies);
2781 RT_CACHE_STAT_INC(out_hit);
2782 rcu_read_unlock_bh();
2783 if (!flp4->saddr)
2784 flp4->saddr = rth->rt_src;
2785 if (!flp4->daddr)
2786 flp4->daddr = rth->rt_dst;
2787 return rth;
2788 }
2789 RT_CACHE_STAT_INC(out_hlist_search);
2790 }
2791 rcu_read_unlock_bh();
2792
2793 slow_output:
2794 return ip_route_output_slow(net, flp4);
2795 }
2796 EXPORT_SYMBOL_GPL(__ip_route_output_key);
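/*
 * Editor's note (illustrative sketch, not part of route.c): the usual entry
 * point for the code above is ip_route_output_key() in <net/route.h>, which
 * wraps __ip_route_output_key().  The caller fills a struct flowi4 with the
 * lookup key and releases the route with ip_rt_put().  The destination
 * address below is a hypothetical example.
 */
static int __maybe_unused example_output_lookup(struct net *net)
{
	struct flowi4 fl4 = {
		.daddr = htonl(0xc0a80001),	/* hypothetical 192.168.0.1 */
	};
	struct rtable *rt;

	rt = ip_route_output_key(net, &fl4);
	if (IS_ERR(rt))
		return PTR_ERR(rt);		/* e.g. -ENETUNREACH */

	/* fl4.saddr now holds the source address the resolver selected */
	ip_rt_put(rt);
	return 0;
}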
2797
2798 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2799 {
2800 return NULL;
2801 }
2802
2803 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2804 {
2805 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2806
2807 return mtu ? : dst->dev->mtu;
2808 }
2809
2810 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2811 {
2812 }
2813
2814 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2815 unsigned long old)
2816 {
2817 return NULL;
2818 }
2819
2820 static struct dst_ops ipv4_dst_blackhole_ops = {
2821 .family = AF_INET,
2822 .protocol = cpu_to_be16(ETH_P_IP),
2823 .destroy = ipv4_dst_destroy,
2824 .check = ipv4_blackhole_dst_check,
2825 .mtu = ipv4_blackhole_mtu,
2826 .default_advmss = ipv4_default_advmss,
2827 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2828 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2829 .neigh_lookup = ipv4_neigh_lookup,
2830 };
2831
2832 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2833 {
2834 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2835 struct rtable *ort = (struct rtable *) dst_orig;
2836
2837 if (rt) {
2838 struct dst_entry *new = &rt->dst;
2839
2840 new->__use = 1;
2841 new->input = dst_discard;
2842 new->output = dst_discard;
2843 dst_copy_metrics(new, &ort->dst);
2844
2845 new->dev = ort->dst.dev;
2846 if (new->dev)
2847 dev_hold(new->dev);
2848
2849 rt->rt_key_dst = ort->rt_key_dst;
2850 rt->rt_key_src = ort->rt_key_src;
2851 rt->rt_key_tos = ort->rt_key_tos;
2852 rt->rt_route_iif = ort->rt_route_iif;
2853 rt->rt_iif = ort->rt_iif;
2854 rt->rt_oif = ort->rt_oif;
2855 rt->rt_mark = ort->rt_mark;
2856
2857 rt->rt_genid = rt_genid(net);
2858 rt->rt_flags = ort->rt_flags;
2859 rt->rt_type = ort->rt_type;
2860 rt->rt_dst = ort->rt_dst;
2861 rt->rt_src = ort->rt_src;
2862 rt->rt_gateway = ort->rt_gateway;
2863 rt_transfer_peer(rt, ort);
2864 rt->fi = ort->fi;
2865 if (rt->fi)
2866 atomic_inc(&rt->fi->fib_clntref);
2867
2868 dst_free(new);
2869 }
2870
2871 dst_release(dst_orig);
2872
2873 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2874 }
2875
2876 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2877 struct sock *sk)
2878 {
2879 struct rtable *rt = __ip_route_output_key(net, flp4);
2880
2881 if (IS_ERR(rt))
2882 return rt;
2883
2884 if (flp4->flowi4_proto)
2885 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2886 flowi4_to_flowi(flp4),
2887 sk, 0);
2888
2889 return rt;
2890 }
2891 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2892
2893 static int rt_fill_info(struct net *net,
2894 struct sk_buff *skb, u32 pid, u32 seq, int event,
2895 int nowait, unsigned int flags)
2896 {
2897 struct rtable *rt = skb_rtable(skb);
2898 struct rtmsg *r;
2899 struct nlmsghdr *nlh;
2900 unsigned long expires = 0;
2901 u32 id = 0, ts = 0, tsage = 0, error;
2902
2903 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2904 if (nlh == NULL)
2905 return -EMSGSIZE;
2906
2907 r = nlmsg_data(nlh);
2908 r->rtm_family = AF_INET;
2909 r->rtm_dst_len = 32;
2910 r->rtm_src_len = 0;
2911 r->rtm_tos = rt->rt_key_tos;
2912 r->rtm_table = RT_TABLE_MAIN;
2913 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2914 goto nla_put_failure;
2915 r->rtm_type = rt->rt_type;
2916 r->rtm_scope = RT_SCOPE_UNIVERSE;
2917 r->rtm_protocol = RTPROT_UNSPEC;
2918 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2919 if (rt->rt_flags & RTCF_NOTIFY)
2920 r->rtm_flags |= RTM_F_NOTIFY;
2921
2922 if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2923 goto nla_put_failure;
2924 if (rt->rt_key_src) {
2925 r->rtm_src_len = 32;
2926 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2927 goto nla_put_failure;
2928 }
2929 if (rt->dst.dev &&
2930 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2931 goto nla_put_failure;
2932 #ifdef CONFIG_IP_ROUTE_CLASSID
2933 if (rt->dst.tclassid &&
2934 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2935 goto nla_put_failure;
2936 #endif
2937 if (!rt_is_input_route(rt) &&
2938 rt->rt_src != rt->rt_key_src) {
2939 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2940 goto nla_put_failure;
2941 }
2942 if (rt->rt_dst != rt->rt_gateway &&
2943 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2944 goto nla_put_failure;
2945
2946 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2947 goto nla_put_failure;
2948
2949 if (rt->rt_mark &&
2950 nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2951 goto nla_put_failure;
2952
2953 error = rt->dst.error;
2954 if (rt_has_peer(rt)) {
2955 const struct inet_peer *peer = rt_peer_ptr(rt);
2956 inet_peer_refcheck(peer);
2957 id = atomic_read(&peer->ip_id_count) & 0xffff;
2958 if (peer->tcp_ts_stamp) {
2959 ts = peer->tcp_ts;
2960 tsage = get_seconds() - peer->tcp_ts_stamp;
2961 }
2962 expires = ACCESS_ONCE(peer->pmtu_expires);
2963 if (expires) {
2964 if (time_before(jiffies, expires))
2965 expires -= jiffies;
2966 else
2967 expires = 0;
2968 }
2969 }
2970
2971 if (rt_is_input_route(rt)) {
2972 #ifdef CONFIG_IP_MROUTE
2973 __be32 dst = rt->rt_dst;
2974
2975 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2976 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2977 int err = ipmr_get_route(net, skb,
2978 rt->rt_src, rt->rt_dst,
2979 r, nowait);
2980 if (err <= 0) {
2981 if (!nowait) {
2982 if (err == 0)
2983 return 0;
2984 goto nla_put_failure;
2985 } else {
2986 if (err == -EMSGSIZE)
2987 goto nla_put_failure;
2988 error = err;
2989 }
2990 }
2991 } else
2992 #endif
2993 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2994 goto nla_put_failure;
2995 }
2996
2997 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2998 expires, error) < 0)
2999 goto nla_put_failure;
3000
3001 return nlmsg_end(skb, nlh);
3002
3003 nla_put_failure:
3004 nlmsg_cancel(skb, nlh);
3005 return -EMSGSIZE;
3006 }
3007
3008 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3009 {
3010 struct net *net = sock_net(in_skb->sk);
3011 struct rtmsg *rtm;
3012 struct nlattr *tb[RTA_MAX+1];
3013 struct rtable *rt = NULL;
3014 __be32 dst = 0;
3015 __be32 src = 0;
3016 u32 iif;
3017 int err;
3018 int mark;
3019 struct sk_buff *skb;
3020
3021 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3022 if (err < 0)
3023 goto errout;
3024
3025 rtm = nlmsg_data(nlh);
3026
3027 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3028 if (skb == NULL) {
3029 err = -ENOBUFS;
3030 goto errout;
3031 }
3032
3033 /* Reserve room for dummy headers; this skb can pass
3034 through a good chunk of the routing engine.
3035 */
3036 skb_reset_mac_header(skb);
3037 skb_reset_network_header(skb);
3038
3039 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3040 ip_hdr(skb)->protocol = IPPROTO_ICMP;
3041 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3042
3043 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3044 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3045 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3046 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3047
3048 if (iif) {
3049 struct net_device *dev;
3050
3051 dev = __dev_get_by_index(net, iif);
3052 if (dev == NULL) {
3053 err = -ENODEV;
3054 goto errout_free;
3055 }
3056
3057 skb->protocol = htons(ETH_P_IP);
3058 skb->dev = dev;
3059 skb->mark = mark;
3060 local_bh_disable();
3061 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3062 local_bh_enable();
3063
3064 rt = skb_rtable(skb);
3065 if (err == 0 && rt->dst.error)
3066 err = -rt->dst.error;
3067 } else {
3068 struct flowi4 fl4 = {
3069 .daddr = dst,
3070 .saddr = src,
3071 .flowi4_tos = rtm->rtm_tos,
3072 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3073 .flowi4_mark = mark,
3074 };
3075 rt = ip_route_output_key(net, &fl4);
3076
3077 err = 0;
3078 if (IS_ERR(rt))
3079 err = PTR_ERR(rt);
3080 }
3081
3082 if (err)
3083 goto errout_free;
3084
3085 skb_dst_set(skb, &rt->dst);
3086 if (rtm->rtm_flags & RTM_F_NOTIFY)
3087 rt->rt_flags |= RTCF_NOTIFY;
3088
3089 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3090 RTM_NEWROUTE, 0, 0);
3091 if (err <= 0)
3092 goto errout_free;
3093
3094 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3095 errout:
3096 return err;
3097
3098 errout_free:
3099 kfree_skb(skb);
3100 goto errout;
3101 }
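/*
 * Editor's note (illustrative userspace sketch, not part of route.c): the
 * request handled by inet_rtm_getroute() above is what "ip route get" sends.
 * A minimal, hypothetical NETLINK_ROUTE query; error handling is trimmed and
 * the reply (an RTM_NEWROUTE message built by rt_fill_info()) would be read
 * back with recv().  Guarded out because it is userspace code.
 */
#if 0
#include <arpa/inet.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int example_route_get(const char *dst)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg	rtm;
		char		buf[64];
	} req;
	struct rtattr *rta;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type  = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family  = AF_INET;
	req.rtm.rtm_dst_len = 32;

	/* append an RTA_DST attribute carrying the destination address */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len  = RTA_LENGTH(4);
	inet_pton(AF_INET, dst, RTA_DATA(rta));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_LENGTH(4);

	if (send(fd, &req, req.nlh.nlmsg_len, 0) < 0) {
		close(fd);
		return -1;
	}
	/* ... recv() and parse the RTM_NEWROUTE reply here ... */
	close(fd);
	return 0;
}
#endif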
3102
3103 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3104 {
3105 struct rtable *rt;
3106 int h, s_h;
3107 int idx, s_idx;
3108 struct net *net;
3109
3110 net = sock_net(skb->sk);
3111
3112 s_h = cb->args[0];
3113 if (s_h < 0)
3114 s_h = 0;
3115 s_idx = idx = cb->args[1];
3116 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3117 if (!rt_hash_table[h].chain)
3118 continue;
3119 rcu_read_lock_bh();
3120 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3121 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3122 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3123 continue;
3124 if (rt_is_expired(rt))
3125 continue;
3126 skb_dst_set_noref(skb, &rt->dst);
3127 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3128 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3129 1, NLM_F_MULTI) <= 0) {
3130 skb_dst_drop(skb);
3131 rcu_read_unlock_bh();
3132 goto done;
3133 }
3134 skb_dst_drop(skb);
3135 }
3136 rcu_read_unlock_bh();
3137 }
3138
3139 done:
3140 cb->args[0] = h;
3141 cb->args[1] = idx;
3142 return skb->len;
3143 }
3144
3145 void ip_rt_multicast_event(struct in_device *in_dev)
3146 {
3147 rt_cache_flush(dev_net(in_dev->dev), 0);
3148 }
3149
3150 #ifdef CONFIG_SYSCTL
3151 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3152 void __user *buffer,
3153 size_t *lenp, loff_t *ppos)
3154 {
3155 if (write) {
3156 int flush_delay;
3157 ctl_table ctl;
3158 struct net *net;
3159
3160 memcpy(&ctl, __ctl, sizeof(ctl));
3161 ctl.data = &flush_delay;
3162 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3163
3164 net = (struct net *)__ctl->extra1;
3165 rt_cache_flush(net, flush_delay);
3166 return 0;
3167 }
3168
3169 return -EINVAL;
3170 }
3171
3172 static ctl_table ipv4_route_table[] = {
3173 {
3174 .procname = "gc_thresh",
3175 .data = &ipv4_dst_ops.gc_thresh,
3176 .maxlen = sizeof(int),
3177 .mode = 0644,
3178 .proc_handler = proc_dointvec,
3179 },
3180 {
3181 .procname = "max_size",
3182 .data = &ip_rt_max_size,
3183 .maxlen = sizeof(int),
3184 .mode = 0644,
3185 .proc_handler = proc_dointvec,
3186 },
3187 {
3188 /* Deprecated. Use gc_min_interval_ms */
3189
3190 .procname = "gc_min_interval",
3191 .data = &ip_rt_gc_min_interval,
3192 .maxlen = sizeof(int),
3193 .mode = 0644,
3194 .proc_handler = proc_dointvec_jiffies,
3195 },
3196 {
3197 .procname = "gc_min_interval_ms",
3198 .data = &ip_rt_gc_min_interval,
3199 .maxlen = sizeof(int),
3200 .mode = 0644,
3201 .proc_handler = proc_dointvec_ms_jiffies,
3202 },
3203 {
3204 .procname = "gc_timeout",
3205 .data = &ip_rt_gc_timeout,
3206 .maxlen = sizeof(int),
3207 .mode = 0644,
3208 .proc_handler = proc_dointvec_jiffies,
3209 },
3210 {
3211 .procname = "gc_interval",
3212 .data = &ip_rt_gc_interval,
3213 .maxlen = sizeof(int),
3214 .mode = 0644,
3215 .proc_handler = proc_dointvec_jiffies,
3216 },
3217 {
3218 .procname = "redirect_load",
3219 .data = &ip_rt_redirect_load,
3220 .maxlen = sizeof(int),
3221 .mode = 0644,
3222 .proc_handler = proc_dointvec,
3223 },
3224 {
3225 .procname = "redirect_number",
3226 .data = &ip_rt_redirect_number,
3227 .maxlen = sizeof(int),
3228 .mode = 0644,
3229 .proc_handler = proc_dointvec,
3230 },
3231 {
3232 .procname = "redirect_silence",
3233 .data = &ip_rt_redirect_silence,
3234 .maxlen = sizeof(int),
3235 .mode = 0644,
3236 .proc_handler = proc_dointvec,
3237 },
3238 {
3239 .procname = "error_cost",
3240 .data = &ip_rt_error_cost,
3241 .maxlen = sizeof(int),
3242 .mode = 0644,
3243 .proc_handler = proc_dointvec,
3244 },
3245 {
3246 .procname = "error_burst",
3247 .data = &ip_rt_error_burst,
3248 .maxlen = sizeof(int),
3249 .mode = 0644,
3250 .proc_handler = proc_dointvec,
3251 },
3252 {
3253 .procname = "gc_elasticity",
3254 .data = &ip_rt_gc_elasticity,
3255 .maxlen = sizeof(int),
3256 .mode = 0644,
3257 .proc_handler = proc_dointvec,
3258 },
3259 {
3260 .procname = "mtu_expires",
3261 .data = &ip_rt_mtu_expires,
3262 .maxlen = sizeof(int),
3263 .mode = 0644,
3264 .proc_handler = proc_dointvec_jiffies,
3265 },
3266 {
3267 .procname = "min_pmtu",
3268 .data = &ip_rt_min_pmtu,
3269 .maxlen = sizeof(int),
3270 .mode = 0644,
3271 .proc_handler = proc_dointvec,
3272 },
3273 {
3274 .procname = "min_adv_mss",
3275 .data = &ip_rt_min_advmss,
3276 .maxlen = sizeof(int),
3277 .mode = 0644,
3278 .proc_handler = proc_dointvec,
3279 },
3280 { }
3281 };
3282
3283 static struct ctl_table ipv4_route_flush_table[] = {
3284 {
3285 .procname = "flush",
3286 .maxlen = sizeof(int),
3287 .mode = 0200,
3288 .proc_handler = ipv4_sysctl_rtcache_flush,
3289 },
3290 { },
3291 };
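/*
 * Editor's note (illustrative sketch, not part of route.c): the tables above
 * are registered below under "net/ipv4/route", so the knobs appear as
 * /proc/sys/net/ipv4/route/<name>.  A hypothetical userspace snippet that
 * triggers the write-only "flush" entry; the written integer is the flush
 * delay handed to rt_cache_flush() via ipv4_sysctl_rtcache_flush().
 * Guarded out because it is userspace code.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

static int example_flush_route_cache(void)
{
	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, "0\n", 2) < 0) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}
#endif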
3292
3293 static __net_init int sysctl_route_net_init(struct net *net)
3294 {
3295 struct ctl_table *tbl;
3296
3297 tbl = ipv4_route_flush_table;
3298 if (!net_eq(net, &init_net)) {
3299 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3300 if (tbl == NULL)
3301 goto err_dup;
3302 }
3303 tbl[0].extra1 = net;
3304
3305 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3306 if (net->ipv4.route_hdr == NULL)
3307 goto err_reg;
3308 return 0;
3309
3310 err_reg:
3311 if (tbl != ipv4_route_flush_table)
3312 kfree(tbl);
3313 err_dup:
3314 return -ENOMEM;
3315 }
3316
3317 static __net_exit void sysctl_route_net_exit(struct net *net)
3318 {
3319 struct ctl_table *tbl;
3320
3321 tbl = net->ipv4.route_hdr->ctl_table_arg;
3322 unregister_net_sysctl_table(net->ipv4.route_hdr);
3323 BUG_ON(tbl == ipv4_route_flush_table);
3324 kfree(tbl);
3325 }
3326
3327 static __net_initdata struct pernet_operations sysctl_route_ops = {
3328 .init = sysctl_route_net_init,
3329 .exit = sysctl_route_net_exit,
3330 };
3331 #endif
3332
3333 static __net_init int rt_genid_init(struct net *net)
3334 {
3335 get_random_bytes(&net->ipv4.rt_genid,
3336 sizeof(net->ipv4.rt_genid));
3337 get_random_bytes(&net->ipv4.dev_addr_genid,
3338 sizeof(net->ipv4.dev_addr_genid));
3339 return 0;
3340 }
3341
3342 static __net_initdata struct pernet_operations rt_genid_ops = {
3343 .init = rt_genid_init,
3344 };
3345
3346 static int __net_init ipv4_inetpeer_init(struct net *net)
3347 {
3348 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3349
3350 if (!bp)
3351 return -ENOMEM;
3352 inet_peer_base_init(bp);
3353 net->ipv4.peers = bp;
3354 return 0;
3355 }
3356
3357 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3358 {
3359 struct inet_peer_base *bp = net->ipv4.peers;
3360
3361 net->ipv4.peers = NULL;
3362 inetpeer_invalidate_tree(bp);
3363 kfree(bp);
3364 }
3365
3366 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3367 .init = ipv4_inetpeer_init,
3368 .exit = ipv4_inetpeer_exit,
3369 };
3370
3371 #ifdef CONFIG_IP_ROUTE_CLASSID
3372 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3373 #endif /* CONFIG_IP_ROUTE_CLASSID */
3374
3375 static __initdata unsigned long rhash_entries;
3376 static int __init set_rhash_entries(char *str)
3377 {
3378 ssize_t ret;
3379
3380 if (!str)
3381 return 0;
3382
3383 ret = kstrtoul(str, 0, &rhash_entries);
3384 if (ret)
3385 return 0;
3386
3387 return 1;
3388 }
3389 __setup("rhash_entries=", set_rhash_entries);
3390
3391 int __init ip_rt_init(void)
3392 {
3393 int rc = 0;
3394
3395 #ifdef CONFIG_IP_ROUTE_CLASSID
3396 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3397 if (!ip_rt_acct)
3398 panic("IP: failed to allocate ip_rt_acct\n");
3399 #endif
3400
3401 ipv4_dst_ops.kmem_cachep =
3402 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3403 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3404
3405 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3406
3407 if (dst_entries_init(&ipv4_dst_ops) < 0)
3408 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3409
3410 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3411 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3412
3413 rt_hash_table = (struct rt_hash_bucket *)
3414 alloc_large_system_hash("IP route cache",
3415 sizeof(struct rt_hash_bucket),
3416 rhash_entries,
3417 (totalram_pages >= 128 * 1024) ?
3418 15 : 17,
3419 0,
3420 &rt_hash_log,
3421 &rt_hash_mask,
3422 0,
3423 rhash_entries ? 0 : 512 * 1024);
3424 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3425 rt_hash_lock_init();
3426
3427 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3428 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3429
3430 devinet_init();
3431 ip_fib_init();
3432
3433 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3434 expires_ljiffies = jiffies;
3435 schedule_delayed_work(&expires_work,
3436 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3437
3438 if (ip_rt_proc_init())
3439 pr_err("Unable to create route proc files\n");
3440 #ifdef CONFIG_XFRM
3441 xfrm_init();
3442 xfrm4_init(ip_rt_max_size);
3443 #endif
3444 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3445
3446 #ifdef CONFIG_SYSCTL
3447 register_pernet_subsys(&sysctl_route_ops);
3448 #endif
3449 register_pernet_subsys(&rt_genid_ops);
3450 register_pernet_subsys(&ipv4_inetpeer_ops);
3451 return rc;
3452 }
3453
3454 #ifdef CONFIG_SYSCTL
3455 /*
3456 * We really need to sanitize the damn ipv4 init order, then all
3457 * this nonsense will go away.
3458 */
3459 void __init ip_static_sysctl_init(void)
3460 {
3461 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3462 }
3463 #endif