/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		 ipv4_dst_destroy(struct dst_entry *dst);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family = AF_INET,
	.protocol = cpu_to_be16(ETH_P_IP),
	.check = ipv4_dst_check,
	.default_advmss = ipv4_default_advmss,
	.mtu = ipv4_mtu,
	.cow_metrics = ipv4_cow_metrics,
	.destroy = ipv4_dst_destroy,
	.ifdown = ipv4_dst_ifdown,
	.negative_advice = ipv4_negative_advice,
	.link_failure = ipv4_link_failure,
	.update_pmtu = ip_rt_update_pmtu,
	.redirect = ip_do_redirect,
	.local_out = __ip_local_out,
	.neigh_lookup = ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

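/* A note for readers (not from the original source): this table is indexed
 * by the four RFC 1349 TOS bits, i.e. (IPTOS_TOS(tos) >> 1), as done by
 * rt_tos2priority() in <net/route.h>. Every odd entry corresponds to the
 * 0x02 bit of the TOS byte, which is either the "minimize cost" bit or,
 * under RFC 3168, part of the ECN field -- hence the ECN_OR_COST() name.
 */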
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start = rt_cache_seq_start,
	.next  = rt_cache_seq_next,
	.stop  = rt_cache_seq_stop,
	.show  = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
	);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start = rt_cpu_seq_start,
	.next  = rt_cpu_seq_next,
	.stop  = rt_cpu_seq_stop,
	.show  = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
			  &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump(net);
}
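/* Editorial note on the two helpers above: they implement lazy, O(1) cache
 * invalidation. Instead of walking and freeing cached routes, rt_cache_flush()
 * merely bumps the per-namespace generation counter; every cached rtable
 * carries the genid it was created under, so rt_is_expired() flags stale
 * entries on their next use and they are dropped lazily.
 */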

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions. However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance to
 * select an ID that stays unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct net *net = dev_net(dst->dev);
	struct inet_peer *peer;

	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
	if (peer) {
		iph->id = htons(inet_getid(peer, more));
		inet_putpeer(peer);
		return;
	}

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0);
	rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;
	struct rtable *orig;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	orig = rcu_dereference(oldest->fnhe_rth);
	if (orig) {
		RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
		rt_free(orig);
	}
	return oldest;
}

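/* Editorial sketch of the hash below, with an illustrative address that is
 * not from the source: the destination's high bits are folded into its low
 * bits before masking. For daddr = 10.0.0.1 = 0x0a000001:
 *	0x0a000001 ^ (0x0a000001 >> 11) ^ (0x0a000001 >> 22)
 *	  = 0x0a000001 ^ 0x00014000 ^ 0x00000028 = 0x0a014029,
 * and the final "& (FNHE_HASH_SIZE - 1)" keeps only the low-order bits as
 * the bucket index (FNHE_HASH_SIZE is a power of two).
 */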
static inline u32 fnhe_hashfun(__be32 daddr)
{
	u32 hval;

	hval = (__force u32) daddr;
	hval ^= (hval >> 11) ^ (hval >> 22);

	return hval & (FNHE_HASH_SIZE - 1);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_gateway = fnhe->fnhe_gw;
		rt->rt_uses_gateway = 1;
	}
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	unsigned int i;
	int depth;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = nh->nh_exceptions;
	if (!hash) {
		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		nh->nh_exceptions = hash;
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_expires = max(1UL, expires);
		}
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_expires = expires;

		/* Exception created; mark the nexthop's cached routes
		 * stale, so that anyone caching them rechecks whether
		 * this exception applies.
		 */
		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;
			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
	return;
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
	if (n) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res) == 0) {
				struct fib_nh *nh = &FIB_RES_NH(res);

				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						      0, 0);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;

	rt = (struct rtable *) dst;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
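
/* A worked schedule (editorial, assuming HZ=1000 and the defaults above):
 * the first redirect goes out immediately; the gap required before the next
 * one is ip_rt_redirect_load << rate_tokens, i.e. 20ms, 40ms, 80ms, ...,
 * up to ~5.1s before the 9th. After ip_rt_redirect_number (9) unanswered
 * redirects we go silent until ip_rt_redirect_silence ((HZ/50) << 10
 * jiffies, ~20.5s) passes without seeing redirected packets.
 */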

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything.
	 * Set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

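	/* The block below is a classic token bucket: tokens accrue one per
	 * jiffy since rate_last, capped at ip_rt_error_burst (5*HZ), and
	 * each ICMP error costs ip_rt_error_cost (HZ) tokens -- i.e. a
	 * sustained rate of about one ICMP per second with bursts of up to
	 * five, independent of the HZ value.
	 */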
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

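/* Editorial note on the PMTU path below: a learned path MTU is recorded as a
 * nexthop exception, clamped below by ip_rt_min_pmtu (552 by default, a
 * guard against absurdly small, possibly forged, "fragmentation needed"
 * ICMPs) and expiring after ip_rt_mtu_expires (10 minutes by default), after
 * which ipv4_mtu() falls back to the route metric.
 */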
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_result res;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (dst->dev->mtu < mtu)
		return;

	if (mtu < ip_rt_min_pmtu)
		mtu = ip_rt_min_pmtu;

	if (rt->rt_pmtu == mtu &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *dst;
	bool new = false;

	bh_lock_sock(sk);
	rt = (struct rtable *) __sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !rt) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!__sk_dst_check(sk, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

	dst = dst_check(&rt->dst, 0);
	if (!dst) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		__sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD by dst_free().
	 */
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
		return NULL;
	return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by the IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			return fnhe;
	}
	return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		int genid = fnhe_genid(dev_net(rt->dst.dev));
		struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);

		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		rcu_assign_pointer(fnhe->fnhe_rth, rt);
		if (orig)
			rt_free(orig);

		fnhe->fnhe_stamp = jiffies;
		ret = true;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}

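/* The helper below installs rt as the cached route for the nexthop with a
 * lock-free cmpxchg: if another CPU raced us and *p no longer holds the
 * route we loaded, the store is abandoned and false is returned, which makes
 * the caller (rt_set_nexthop) fall back to the uncached DST_NOCACHE path.
 */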
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig)
			rt_free(orig);
	} else
		ret = false;

	return ret;
}

static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

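/* Uncached (DST_NOCACHE) routes are invisible to the genid invalidation
 * scheme, so they are tracked on the list above instead; rt_flush_dev()
 * walks it when a device goes away and reparents such routes to the
 * loopback device so that outstanding dst references do not pin the
 * real device.
 */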
static void rt_add_uncached_list(struct rtable *rt)
{
	spin_lock_bh(&rt_uncached_lock);
	list_add_tail(&rt->rt_uncached, &rt_uncached_list);
	spin_unlock_bh(&rt_uncached_lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;

	if (!list_empty(&rt->rt_uncached)) {
		spin_lock_bh(&rt_uncached_lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&rt_uncached_lock);
	}
}

void rt_flush_dev(struct net_device *dev)
{
	if (!list_empty(&rt_uncached_list)) {
		struct net *net = dev_net(dev);
		struct rtable *rt;

		spin_lock_bh(&rt_uncached_lock);
		list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&rt_uncached_lock);
	}
}

static bool rt_cache_valid(const struct rtable *rt)
{
	return rt &&
	       rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	       !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr);
		else if (!(rt->dst.flags & DST_NOCACHE))
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in the nexthop exception
			 * or FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			rt->dst.flags |= DST_NOCACHE;
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm, bool will_cache)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
			 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct rtable *rth;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(saddr))
			goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_genid = rt_genid(dev_net(dev));
	rth->rt_flags = RTCF_MULTICAST;
	rth->rt_type = RTN_MULTICAST;
	rth->rt_is_input = 1;
	rth->rt_iif = 0;
	rth->rt_pmtu = 0;
	rth->rt_gateway = 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	if (our) {
		rth->dst.input = ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * Per the RFC 1812 recommendation: if the source is martian,
		 * the only hint we have is the MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	bool do_cache;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
		flags |= RTCF_DOREDIRECT;
		do_cache = false;
	}

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back out the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	if (do_cache) {
		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_is_input = 1;
	rth->rt_iif = 0;
	rth->rt_pmtu = 0;
	rth->rt_gateway = 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
cleanup:
	return err;
}

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}

/*
 * NOTE. We drop all packets that have a local source
 * address, because every properly looped-back packet
 * must already have the correct destination attached by the output routine.
 *
 * This approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 * called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4 fl4;
	unsigned int flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	int err = -EINVAL;
	struct net *net = dev_net(dev);
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	 * by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res.fi = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I do not even know whether to fix this or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
	 * calling it at most once when daddr and/or saddr is a loopback address.
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0)
		goto no_route;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  LOOPBACK_IFINDEX,
					  dev, in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto no_route;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	do_cache = false;
	if (res.fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.input = ip_local_deliver;
	rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_genid = rt_genid(net);
	rth->rt_flags = flags|RTCF_LOCAL;
	rth->rt_type = res.type;
	rth->rt_is_input = 1;
	rth->rt_iif = 0;
	rth->rt_pmtu = 0;
	rth->rt_gateway = 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}
	if (do_cache)
		rt_cache_route(&FIB_RES_NH(res), rth);
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC 1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}

int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			 u8 tos, struct net_device *dev)
{
	int res;

	rcu_read_lock();

	/* Multicast recognition logic was moved from the route cache to here.
	 * The problem was that too many Ethernet cards have broken or missing
	 * hardware multicast filters :-( As a result, a host on a multicast
	 * network acquires a lot of useless route cache entries for, say,
	 * SDR messages from all over the world. Now we try to get rid of them.
	 * Really, provided the software IP multicast filter is organized
	 * reasonably (at least, hashed), it does not result in a slowdown
	 * compared with route cache reject entries.
	 * Note that multicast routers are not affected, because a
	 * route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_noref);

/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If a multicast route does not exist, use the
		 * default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	fnhe = NULL;
	do_cache &= fi != NULL;
	if (do_cache) {
		struct rtable __rcu **prth;
		struct fib_nh *nh = &FIB_RES_NH(*res);

		fnhe = find_exception(nh, fl4->daddr);
		if (fnhe)
			prth = &fnhe->fnhe_rth;
		else {
			if (unlikely(fl4->flowi4_flags &
				     FLOWI_FLAG_KNOWN_NH &&
				     !(nh->nh_gw &&
				       nh->nh_scope == RT_SCOPE_LINK))) {
				do_cache = false;
				goto add;
			}
			prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
		}
		rth = rcu_dereference(*prth);
		if (rt_cache_valid(rth)) {
			dst_hold(&rth->dst);
			return rth;
		}
	}

add:
	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   do_cache);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags = flags;
	rth->rt_type = type;
	rth->rt_is_input = 0;
	rth->rt_iif = orig_oif ? : 0;
	rth->rt_pmtu = 0;
	rth->rt_gateway = 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL)
		rth->dst.input = ip_local_deliver;
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);

	return rth;
}

/*
 * Major route resolver routine.
 */

struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	__u8 tos = RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	int orig_oif;

	res.tclassid = 0;
	res.fi = NULL;
	res.table = NULL;

	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = LOOPBACK_IFINDEX;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			     RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		 * It was wrong for two reasons:
		 * 1. ip_dev_find(net, saddr) can return the wrong iface, if
		 *    saddr is assigned to multiple interfaces.
		 * 2. Moreover, we are allowed to send packets with the saddr
		 *    of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			 * and limited broadcast via the necessary interface
			 * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			 * This hack is not just for fun; it allows
			 * vic, vat and friends to work.
			 * They bind a socket to loopback, set ttl to zero
			 * and expect that it will work.
			 * From the viewpoint of the routing cache they are
			 * broken, because we are not allowed to build a
			 * multicast path with a loopback source addr (look:
			 * the routing cache cannot know that ttl is zero, so
			 * that the packet will not leave this host and the
			 * route is valid). Luckily, this hack is a good
			 * workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = LOOPBACK_IFINDEX;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		res.table = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, routing tables are wrong. Assume
			 * that the destination is on-link.
			 *
			 * WHY? DW.
			 * Because we are allowed to send to an iface
			 * even if it has NO routes and NO assigned
			 * addresses. When oif is specified, routing
			 * tables are looked up with only one purpose:
			 * to catch whether the destination is gatewayed,
			 * rather than direct. Moreover, if MSG_DONTROUTE is
			 * set, we send the packet, ignoring both routing
			 * tables and ifaddr state. --ANK
			 *
			 * We could make it even if oif is unknown,
			 * likely IPv6, but we do not.
			 */
2079
2080 if (!fl4->saddr)
2081 fl4->saddr = inet_select_addr(dev_out, 0,
2082 RT_SCOPE_LINK);
2083 res.type = RTN_UNICAST;
2084 goto make_route;
2085 }
2086 rth = ERR_PTR(-ENETUNREACH);
2087 goto out;
2088 }
2089
2090 if (res.type == RTN_LOCAL) {
2091 if (!fl4->saddr) {
2092 if (res.fi->fib_prefsrc)
2093 fl4->saddr = res.fi->fib_prefsrc;
2094 else
2095 fl4->saddr = fl4->daddr;
2096 }
2097 dev_out = net->loopback_dev;
2098 fl4->flowi4_oif = dev_out->ifindex;
2099 flags |= RTCF_LOCAL;
2100 goto make_route;
2101 }
2102
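	/*
	 * With multiple nexthops and no caller-supplied oif, pick one
	 * nexthop via fib_select_multipath().  Otherwise, when the lookup
	 * matched only a zero-length prefix and the table holds more than
	 * one default route, let fib_select_default() choose among them.
	 */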
2103#ifdef CONFIG_IP_ROUTE_MULTIPATH
2104 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2105 fib_select_multipath(&res);
2106 else
2107#endif
2108 if (!res.prefixlen &&
2109 res.table->tb_num_default > 1 &&
2110 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2111 fib_select_default(&res);
2112
2113 if (!fl4->saddr)
2114 fl4->saddr = FIB_RES_PREFSRC(net, res);
2115
2116 dev_out = FIB_RES_DEV(res);
2117 fl4->flowi4_oif = dev_out->ifindex;
2118
2119
2120make_route:
2121 rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2122
2123out:
2124 rcu_read_unlock();
2125 return rth;
2126}
2127EXPORT_SYMBOL_GPL(__ip_route_output_key);
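
/*
 * A minimal caller sketch (illustrative only; "daddr" is a hypothetical
 * __be32 local).  Most code reaches this function through the
 * ip_route_output_key() wrapper and drops its reference with
 * ip_rt_put() when done:
 *
 *	struct flowi4 fl4 = { .daddr = daddr };
 *	struct rtable *rt = ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	... transmit via rt->dst ...
 *	ip_rt_put(rt);
 */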
2128
2129static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2130{
2131 return NULL;
2132}
2133
2134static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2135{
2136 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2137
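	/* GNU ?: shorthand: use the raw RTAX_MTU metric when set,
	 * otherwise fall back to the device MTU.
	 */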
2138 return mtu ? : dst->dev->mtu;
2139}
2140
2141static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2142 struct sk_buff *skb, u32 mtu)
2143{
2144}
2145
2146static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2147 struct sk_buff *skb)
2148{
2149}
2150
2151static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2152 unsigned long old)
2153{
2154 return NULL;
2155}
2156
2157static struct dst_ops ipv4_dst_blackhole_ops = {
2158 .family = AF_INET,
2159 .protocol = cpu_to_be16(ETH_P_IP),
2160 .check = ipv4_blackhole_dst_check,
2161 .mtu = ipv4_blackhole_mtu,
2162 .default_advmss = ipv4_default_advmss,
2163 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2164 .redirect = ipv4_rt_blackhole_redirect,
2165 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2166 .neigh_lookup = ipv4_neigh_lookup,
2167};
2168
2169struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2170{
2171 struct rtable *ort = (struct rtable *) dst_orig;
2172 struct rtable *rt;
2173
2174 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2175 if (rt) {
2176 struct dst_entry *new = &rt->dst;
2177
2178 new->__use = 1;
2179 new->input = dst_discard;
2180 new->output = dst_discard;
2181
2182 new->dev = ort->dst.dev;
2183 if (new->dev)
2184 dev_hold(new->dev);
2185
2186 rt->rt_is_input = ort->rt_is_input;
2187 rt->rt_iif = ort->rt_iif;
2188 rt->rt_pmtu = ort->rt_pmtu;
2189
2190 rt->rt_genid = rt_genid(net);
2191 rt->rt_flags = ort->rt_flags;
2192 rt->rt_type = ort->rt_type;
2193 rt->rt_gateway = ort->rt_gateway;
2194 rt->rt_uses_gateway = ort->rt_uses_gateway;
2195
2196 INIT_LIST_HEAD(&rt->rt_uncached);
2197
2198 dst_free(new);
2199 }
2200
2201 dst_release(dst_orig);
2202
2203 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2204}
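
/*
 * A blackhole route lets a caller keep holding a dst while guaranteeing
 * that nothing is ever transmitted: input and output are dst_discard,
 * ->check never revalidates the dst, and PMTU/redirect updates are
 * swallowed by the no-op methods above.  The xfrm code, for one, hands
 * such a route to non-blocking sockets while IPsec states are still
 * being resolved.
 */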
2205
2206struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2207 struct sock *sk)
2208{
2209 struct rtable *rt = __ip_route_output_key(net, flp4);
2210
2211 if (IS_ERR(rt))
2212 return rt;
2213
2214 if (flp4->flowi4_proto)
2215 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2216 flowi4_to_flowi(flp4),
2217 sk, 0);
2218
2219 return rt;
2220}
2221EXPORT_SYMBOL_GPL(ip_route_output_flow);
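
/*
 * Sketch of a typical transport-layer caller (illustrative; the
 * addresses and ports are hypothetical locals):
 *
 *	struct flowi4 fl4;
 *	struct rtable *rt;
 *
 *	flowi4_init_output(&fl4, oif, sk->sk_mark, RT_CONN_FLAGS(sk),
 *			   RT_SCOPE_UNIVERSE, IPPROTO_UDP, 0,
 *			   daddr, saddr, dport, sport);
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *
 * Setting flowi4_proto is what opts the result into the xfrm_lookup()
 * pass above.
 */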
2222
2223static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2224 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2225 u32 seq, int event, int nowait, unsigned int flags)
2226{
2227 struct rtable *rt = skb_rtable(skb);
2228 struct rtmsg *r;
2229 struct nlmsghdr *nlh;
2230 unsigned long expires = 0;
2231 u32 error;
2232 u32 metrics[RTAX_MAX];
2233
2234 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2235 if (nlh == NULL)
2236 return -EMSGSIZE;
2237
2238 r = nlmsg_data(nlh);
2239 r->rtm_family = AF_INET;
2240 r->rtm_dst_len = 32;
2241 r->rtm_src_len = 0;
2242 r->rtm_tos = fl4->flowi4_tos;
2243 r->rtm_table = RT_TABLE_MAIN;
2244 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2245 goto nla_put_failure;
2246 r->rtm_type = rt->rt_type;
2247 r->rtm_scope = RT_SCOPE_UNIVERSE;
2248 r->rtm_protocol = RTPROT_UNSPEC;
2249 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2250 if (rt->rt_flags & RTCF_NOTIFY)
2251 r->rtm_flags |= RTM_F_NOTIFY;
2252
2253 if (nla_put_be32(skb, RTA_DST, dst))
2254 goto nla_put_failure;
2255 if (src) {
2256 r->rtm_src_len = 32;
2257 if (nla_put_be32(skb, RTA_SRC, src))
2258 goto nla_put_failure;
2259 }
2260 if (rt->dst.dev &&
2261 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2262 goto nla_put_failure;
2263#ifdef CONFIG_IP_ROUTE_CLASSID
2264 if (rt->dst.tclassid &&
2265 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2266 goto nla_put_failure;
2267#endif
2268 if (!rt_is_input_route(rt) &&
2269 fl4->saddr != src) {
2270 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2271 goto nla_put_failure;
2272 }
2273 if (rt->rt_uses_gateway &&
2274 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2275 goto nla_put_failure;
2276
2277 expires = rt->dst.expires;
2278 if (expires) {
2279 unsigned long now = jiffies;
2280
2281 if (time_before(now, expires))
2282 expires -= now;
2283 else
2284 expires = 0;
2285 }
2286
2287 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2288 if (rt->rt_pmtu && expires)
2289 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2290 if (rtnetlink_put_metrics(skb, metrics) < 0)
2291 goto nla_put_failure;
2292
2293 if (fl4->flowi4_mark &&
2294 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2295 goto nla_put_failure;
2296
2297 error = rt->dst.error;
2298
2299 if (rt_is_input_route(rt)) {
2300#ifdef CONFIG_IP_MROUTE
2301 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2302 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2303 int err = ipmr_get_route(net, skb,
2304 fl4->saddr, fl4->daddr,
2305 r, nowait);
2306 if (err <= 0) {
2307 if (!nowait) {
2308 if (err == 0)
2309 return 0;
2310 goto nla_put_failure;
2311 } else {
2312 if (err == -EMSGSIZE)
2313 goto nla_put_failure;
2314 error = err;
2315 }
2316 }
2317 } else
2318#endif
2319 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2320 goto nla_put_failure;
2321 }
2322
2323 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2324 goto nla_put_failure;
2325
2326 return nlmsg_end(skb, nlh);
2327
2328nla_put_failure:
2329 nlmsg_cancel(skb, nlh);
2330 return -EMSGSIZE;
2331}
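
/*
 * The reply built above is laid out as:
 *
 *	struct nlmsghdr			(RTM_NEWROUTE)
 *	struct rtmsg
 *	RTA_TABLE, RTA_DST		(always)
 *	RTA_SRC, RTA_OIF, RTA_FLOW, RTA_PREFSRC, RTA_GATEWAY,
 *	RTA_METRICS, RTA_MARK, RTA_IIF	(as applicable)
 *	RTA_CACHEINFO			(expiry and error)
 */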
2332
2333static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2334{
2335 struct net *net = sock_net(in_skb->sk);
2336 struct rtmsg *rtm;
2337 struct nlattr *tb[RTA_MAX+1];
2338 struct rtable *rt = NULL;
2339 struct flowi4 fl4;
2340 __be32 dst = 0;
2341 __be32 src = 0;
2342 u32 iif;
2343 int err;
2344 int mark;
2345 struct sk_buff *skb;
2346
2347 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2348 if (err < 0)
2349 goto errout;
2350
2351 rtm = nlmsg_data(nlh);
2352
2353 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2354 if (skb == NULL) {
2355 err = -ENOBUFS;
2356 goto errout;
2357 }
2358
2359 /* Reserve room for dummy headers; this skb can pass
2360 through a good chunk of the routing engine.
2361 */
2362 skb_reset_mac_header(skb);
2363 skb_reset_network_header(skb);
2364
2365 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2366 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2367 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2368
2369 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2370 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2371 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2372 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2373
2374 memset(&fl4, 0, sizeof(fl4));
2375 fl4.daddr = dst;
2376 fl4.saddr = src;
2377 fl4.flowi4_tos = rtm->rtm_tos;
2378 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2379 fl4.flowi4_mark = mark;
2380
2381 if (iif) {
2382 struct net_device *dev;
2383
2384 dev = __dev_get_by_index(net, iif);
2385 if (dev == NULL) {
2386 err = -ENODEV;
2387 goto errout_free;
2388 }
2389
2390 skb->protocol = htons(ETH_P_IP);
2391 skb->dev = dev;
2392 skb->mark = mark;
2393 local_bh_disable();
2394 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2395 local_bh_enable();
2396
2397 rt = skb_rtable(skb);
2398 if (err == 0 && rt->dst.error)
2399 err = -rt->dst.error;
2400 } else {
2401 rt = ip_route_output_key(net, &fl4);
2402
2403 err = 0;
2404 if (IS_ERR(rt))
2405 err = PTR_ERR(rt);
2406 }
2407
2408 if (err)
2409 goto errout_free;
2410
2411 skb_dst_set(skb, &rt->dst);
2412 if (rtm->rtm_flags & RTM_F_NOTIFY)
2413 rt->rt_flags |= RTCF_NOTIFY;
2414
2415 err = rt_fill_info(net, dst, src, &fl4, skb,
2416 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2417 RTM_NEWROUTE, 0, 0);
2418 if (err <= 0)
2419 goto errout_free;
2420
2421 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2422errout:
2423 return err;
2424
2425errout_free:
2426 kfree_skb(skb);
2427 goto errout;
2428}
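
/*
 * inet_rtm_getroute() answers the query behind "ip route get".  A
 * minimal userspace sketch of the same request (assumptions: IPv4 only,
 * error handling elided, destination hard-coded):
 *
 *	#include <unistd.h>
 *	#include <arpa/inet.h>
 *	#include <sys/socket.h>
 *	#include <linux/rtnetlink.h>
 *
 *	int main(void)
 *	{
 *		struct {
 *			struct nlmsghdr	nlh;
 *			struct rtmsg	rtm;
 *			struct rtattr	rta;
 *			struct in_addr	dst;
 *		} req = {
 *			.nlh.nlmsg_len	 = sizeof(req),
 *			.nlh.nlmsg_type	 = RTM_GETROUTE,
 *			.nlh.nlmsg_flags = NLM_F_REQUEST,
 *			.rtm.rtm_family	 = AF_INET,
 *			.rta.rta_type	 = RTA_DST,
 *			.rta.rta_len	 = RTA_LENGTH(sizeof(struct in_addr)),
 *		};
 *		char buf[4096];
 *		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
 *
 *		inet_pton(AF_INET, "192.0.2.1", &req.dst);
 *		send(fd, &req, sizeof(req), 0);
 *		recv(fd, buf, sizeof(buf), 0);
 *		close(fd);
 *		return 0;
 *	}
 *
 * The reply is the RTM_NEWROUTE message that rt_fill_info() constructs.
 */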
2429
2430int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2431{
2432 return skb->len;
2433}
2434
2435void ip_rt_multicast_event(struct in_device *in_dev)
2436{
2437 rt_cache_flush(dev_net(in_dev->dev));
2438}
2439
2440#ifdef CONFIG_SYSCTL
2441static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
2442static int ip_rt_gc_interval __read_mostly = 60 * HZ;
2443static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
2444static int ip_rt_gc_elasticity __read_mostly = 8;
2445
2446static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2447 void __user *buffer,
2448 size_t *lenp, loff_t *ppos)
2449{
2450 struct net *net = (struct net *)__ctl->extra1;
2451
2452 if (write) {
2453 rt_cache_flush(net);
2454 fnhe_genid_bump(net);
2455 return 0;
2456 }
2457
2458 return -EINVAL;
2459}
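
/*
 * This handler backs the write-only /proc/sys/net/ipv4/route/flush
 * file registered below; e.g.
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * invalidates every cached route and every next hop exception in one
 * shot.
 */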
2460
2461static ctl_table ipv4_route_table[] = {
2462 {
2463 .procname = "gc_thresh",
2464 .data = &ipv4_dst_ops.gc_thresh,
2465 .maxlen = sizeof(int),
2466 .mode = 0644,
2467 .proc_handler = proc_dointvec,
2468 },
2469 {
2470 .procname = "max_size",
2471 .data = &ip_rt_max_size,
2472 .maxlen = sizeof(int),
2473 .mode = 0644,
2474 .proc_handler = proc_dointvec,
2475 },
2476 {
2477 /* Deprecated; use gc_min_interval_ms. */
2478
2479 .procname = "gc_min_interval",
2480 .data = &ip_rt_gc_min_interval,
2481 .maxlen = sizeof(int),
2482 .mode = 0644,
2483 .proc_handler = proc_dointvec_jiffies,
2484 },
2485 {
2486 .procname = "gc_min_interval_ms",
2487 .data = &ip_rt_gc_min_interval,
2488 .maxlen = sizeof(int),
2489 .mode = 0644,
2490 .proc_handler = proc_dointvec_ms_jiffies,
2491 },
2492 {
2493 .procname = "gc_timeout",
2494 .data = &ip_rt_gc_timeout,
2495 .maxlen = sizeof(int),
2496 .mode = 0644,
2497 .proc_handler = proc_dointvec_jiffies,
2498 },
2499 {
2500 .procname = "gc_interval",
2501 .data = &ip_rt_gc_interval,
2502 .maxlen = sizeof(int),
2503 .mode = 0644,
2504 .proc_handler = proc_dointvec_jiffies,
2505 },
2506 {
2507 .procname = "redirect_load",
2508 .data = &ip_rt_redirect_load,
2509 .maxlen = sizeof(int),
2510 .mode = 0644,
2511 .proc_handler = proc_dointvec,
2512 },
2513 {
2514 .procname = "redirect_number",
2515 .data = &ip_rt_redirect_number,
2516 .maxlen = sizeof(int),
2517 .mode = 0644,
2518 .proc_handler = proc_dointvec,
2519 },
2520 {
2521 .procname = "redirect_silence",
2522 .data = &ip_rt_redirect_silence,
2523 .maxlen = sizeof(int),
2524 .mode = 0644,
2525 .proc_handler = proc_dointvec,
2526 },
2527 {
2528 .procname = "error_cost",
2529 .data = &ip_rt_error_cost,
2530 .maxlen = sizeof(int),
2531 .mode = 0644,
2532 .proc_handler = proc_dointvec,
2533 },
2534 {
2535 .procname = "error_burst",
2536 .data = &ip_rt_error_burst,
2537 .maxlen = sizeof(int),
2538 .mode = 0644,
2539 .proc_handler = proc_dointvec,
2540 },
2541 {
2542 .procname = "gc_elasticity",
2543 .data = &ip_rt_gc_elasticity,
2544 .maxlen = sizeof(int),
2545 .mode = 0644,
2546 .proc_handler = proc_dointvec,
2547 },
2548 {
2549 .procname = "mtu_expires",
2550 .data = &ip_rt_mtu_expires,
2551 .maxlen = sizeof(int),
2552 .mode = 0644,
2553 .proc_handler = proc_dointvec_jiffies,
2554 },
2555 {
2556 .procname = "min_pmtu",
2557 .data = &ip_rt_min_pmtu,
2558 .maxlen = sizeof(int),
2559 .mode = 0644,
2560 .proc_handler = proc_dointvec,
2561 },
2562 {
2563 .procname = "min_adv_mss",
2564 .data = &ip_rt_min_advmss,
2565 .maxlen = sizeof(int),
2566 .mode = 0644,
2567 .proc_handler = proc_dointvec,
2568 },
2569 { }
2570};
2571
2572static struct ctl_table ipv4_route_flush_table[] = {
2573 {
2574 .procname = "flush",
2575 .maxlen = sizeof(int),
2576 .mode = 0200,
2577 .proc_handler = ipv4_sysctl_rtcache_flush,
2578 },
2579 { },
2580};
2581
2582static __net_init int sysctl_route_net_init(struct net *net)
2583{
2584 struct ctl_table *tbl;
2585
2586 tbl = ipv4_route_flush_table;
2587 if (!net_eq(net, &init_net)) {
2588 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2589 if (tbl == NULL)
2590 goto err_dup;
2591
2592 /* Don't export sysctls to unprivileged users */
2593 if (net->user_ns != &init_user_ns)
2594 tbl[0].procname = NULL;
2595 }
2596 tbl[0].extra1 = net;
2597
2598 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2599 if (net->ipv4.route_hdr == NULL)
2600 goto err_reg;
2601 return 0;
2602
2603err_reg:
2604 if (tbl != ipv4_route_flush_table)
2605 kfree(tbl);
2606err_dup:
2607 return -ENOMEM;
2608}
2609
2610static __net_exit void sysctl_route_net_exit(struct net *net)
2611{
2612 struct ctl_table *tbl;
2613
2614 tbl = net->ipv4.route_hdr->ctl_table_arg;
2615 unregister_net_sysctl_table(net->ipv4.route_hdr);
2616 BUG_ON(tbl == ipv4_route_flush_table);
2617 kfree(tbl);
2618}
2619
2620static __net_initdata struct pernet_operations sysctl_route_ops = {
2621 .init = sysctl_route_net_init,
2622 .exit = sysctl_route_net_exit,
2623};
2624#endif
2625
2626static __net_init int rt_genid_init(struct net *net)
2627{
2628 atomic_set(&net->rt_genid, 0);
2629 atomic_set(&net->fnhe_genid, 0);
2630 get_random_bytes(&net->ipv4.dev_addr_genid,
2631 sizeof(net->ipv4.dev_addr_genid));
2632 return 0;
2633}
2634
2635static __net_initdata struct pernet_operations rt_genid_ops = {
2636 .init = rt_genid_init,
2637};
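
/*
 * Each namespace starts both generation counters at zero.  Bumping
 * rt_genid invalidates cached dsts wholesale, while next hop exceptions
 * (learned PMTU/redirect data) are guarded by the separate fnhe_genid,
 * so routine route flushes need not discard them; the sysctl flush
 * handler above deliberately bumps both.
 */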
2638
2639static int __net_init ipv4_inetpeer_init(struct net *net)
2640{
2641 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2642
2643 if (!bp)
2644 return -ENOMEM;
2645 inet_peer_base_init(bp);
2646 net->ipv4.peers = bp;
2647 return 0;
2648}
2649
2650static void __net_exit ipv4_inetpeer_exit(struct net *net)
2651{
2652 struct inet_peer_base *bp = net->ipv4.peers;
2653
2654 net->ipv4.peers = NULL;
2655 inetpeer_invalidate_tree(bp);
2656 kfree(bp);
2657}
2658
2659static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2660 .init = ipv4_inetpeer_init,
2661 .exit = ipv4_inetpeer_exit,
2662};
2663
2664#ifdef CONFIG_IP_ROUTE_CLASSID
2665struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2666#endif /* CONFIG_IP_ROUTE_CLASSID */
2667
2668int __init ip_rt_init(void)
2669{
2670 int rc = 0;
2671
2672#ifdef CONFIG_IP_ROUTE_CLASSID
2673 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2674 if (!ip_rt_acct)
2675 panic("IP: failed to allocate ip_rt_acct\n");
2676#endif
2677
2678 ipv4_dst_ops.kmem_cachep =
2679 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2680 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2681
2682 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2683
2684 if (dst_entries_init(&ipv4_dst_ops) < 0)
2685 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2686
2687 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2688 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2689
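	/* The old routing cache is gone; these settings keep dst
	 * accounting from ever forcing garbage collection on size.
	 */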
2690 ipv4_dst_ops.gc_thresh = ~0;
2691 ip_rt_max_size = INT_MAX;
2692
2693 devinet_init();
2694 ip_fib_init();
2695
2696 if (ip_rt_proc_init())
2697 pr_err("Unable to create route proc files\n");
2698#ifdef CONFIG_XFRM
2699 xfrm_init();
2700 xfrm4_init();
2701#endif
2702 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2703
2704#ifdef CONFIG_SYSCTL
2705 register_pernet_subsys(&sysctl_route_ops);
2706#endif
2707 register_pernet_subsys(&rt_genid_ops);
2708 register_pernet_subsys(&ipv4_inetpeer_ops);
2709 return rc;
2710}
2711
2712#ifdef CONFIG_SYSCTL
2713/*
2714 * We really need to sanitize the damn ipv4 init order; once we do,
2715 * all this nonsense will go away.
2716 */
2717void __init ip_static_sysctl_init(void)
2718{
2719 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2720}
2721#endif