net/ipv4/route.c (mirror_ubuntu-bionic-kernel.git, at "net: ipv4: don't allow setting net.ipv4.route.min_pmtu below 68")
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
39 *
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <linux/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #include <linux/kmemleak.h>
112 #endif
113 #include <net/secure_seq.h>
114 #include <net/ip_tunnels.h>
115 #include <net/l3mdev.h>
116
117 #include "fib_lookup.h"
118
119 #define RT_FL_TOS(oldflp4) \
120 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
121
122 #define RT_GC_TIMEOUT (300*HZ)
123
124 static int ip_rt_max_size;
125 static int ip_rt_redirect_number __read_mostly = 9;
126 static int ip_rt_redirect_load __read_mostly = HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly = HZ;
129 static int ip_rt_error_burst __read_mostly = 5 * HZ;
130 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
131 static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly = 256;
133
134 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
135
136 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
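/* A quick sanity check on the defaults above: ip_rt_min_pmtu is
 * 512 + 20 (IPv4 header) + 20 (TCP header) = 552 bytes, and
 * ip_min_valid_pmtu is IPV4_MIN_MTU, the 68-byte minimum every IPv4
 * link must support (RFC 791). Learned PMTU values below
 * ip_rt_min_pmtu are clamped up to it in __ip_rt_update_pmtu() below.
 */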
137
138 /*
139 * Interface to generic destination cache.
140 */
141
142 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
143 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
144 static unsigned int ipv4_mtu(const struct dst_entry *dst);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void ipv4_link_failure(struct sk_buff *skb);
147 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
148 struct sk_buff *skb, u32 mtu);
149 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
150 struct sk_buff *skb);
151 static void ipv4_dst_destroy(struct dst_entry *dst);
152
153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154 {
155 WARN_ON(1);
156 return NULL;
157 }
158
159 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
160 struct sk_buff *skb,
161 const void *daddr);
162 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
163
164 static struct dst_ops ipv4_dst_ops = {
165 .family = AF_INET,
166 .check = ipv4_dst_check,
167 .default_advmss = ipv4_default_advmss,
168 .mtu = ipv4_mtu,
169 .cow_metrics = ipv4_cow_metrics,
170 .destroy = ipv4_dst_destroy,
171 .negative_advice = ipv4_negative_advice,
172 .link_failure = ipv4_link_failure,
173 .update_pmtu = ip_rt_update_pmtu,
174 .redirect = ip_do_redirect,
175 .local_out = __ip_local_out,
176 .neigh_lookup = ipv4_neigh_lookup,
177 .confirm_neigh = ipv4_confirm_neigh,
178 };
179
180 #define ECN_OR_COST(class) TC_PRIO_##class
181
182 const __u8 ip_tos2prio[16] = {
183 TC_PRIO_BESTEFFORT,
184 ECN_OR_COST(BESTEFFORT),
185 TC_PRIO_BESTEFFORT,
186 ECN_OR_COST(BESTEFFORT),
187 TC_PRIO_BULK,
188 ECN_OR_COST(BULK),
189 TC_PRIO_BULK,
190 ECN_OR_COST(BULK),
191 TC_PRIO_INTERACTIVE,
192 ECN_OR_COST(INTERACTIVE),
193 TC_PRIO_INTERACTIVE,
194 ECN_OR_COST(INTERACTIVE),
195 TC_PRIO_INTERACTIVE_BULK,
196 ECN_OR_COST(INTERACTIVE_BULK),
197 TC_PRIO_INTERACTIVE_BULK,
198 ECN_OR_COST(INTERACTIVE_BULK)
199 };
200 EXPORT_SYMBOL(ip_tos2prio);
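/* For orientation, the table above is indexed by the four TOS bits
 * shifted right by one; rt_tos2priority() in <net/route.h> does, in
 * effect:
 *
 *	skb->priority = ip_tos2prio[IPTOS_TOS(iph->tos) >> 1];
 *
 * so, as a rough sketch of the mapping, IPTOS_LOWDELAY (0x10) lands in
 * TC_PRIO_INTERACTIVE and IPTOS_THROUGHPUT (0x08) in TC_PRIO_BULK.
 */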
201
202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
203 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
204
205 #ifdef CONFIG_PROC_FS
206 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
207 {
208 if (*pos)
209 return NULL;
210 return SEQ_START_TOKEN;
211 }
212
213 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
214 {
215 ++*pos;
216 return NULL;
217 }
218
219 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
220 {
221 }
222
223 static int rt_cache_seq_show(struct seq_file *seq, void *v)
224 {
225 if (v == SEQ_START_TOKEN)
226 seq_printf(seq, "%-127s\n",
227 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
228 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
229 "HHUptod\tSpecDst");
230 return 0;
231 }
232
233 static const struct seq_operations rt_cache_seq_ops = {
234 .start = rt_cache_seq_start,
235 .next = rt_cache_seq_next,
236 .stop = rt_cache_seq_stop,
237 .show = rt_cache_seq_show,
238 };
239
240 static int rt_cache_seq_open(struct inode *inode, struct file *file)
241 {
242 return seq_open(file, &rt_cache_seq_ops);
243 }
244
245 static const struct file_operations rt_cache_seq_fops = {
246 .owner = THIS_MODULE,
247 .open = rt_cache_seq_open,
248 .read = seq_read,
249 .llseek = seq_lseek,
250 .release = seq_release,
251 };
252
253
254 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
255 {
256 int cpu;
257
258 if (*pos == 0)
259 return SEQ_START_TOKEN;
260
261 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
262 if (!cpu_possible(cpu))
263 continue;
264 *pos = cpu+1;
265 return &per_cpu(rt_cache_stat, cpu);
266 }
267 return NULL;
268 }
269
270 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
271 {
272 int cpu;
273
274 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
275 if (!cpu_possible(cpu))
276 continue;
277 *pos = cpu+1;
278 return &per_cpu(rt_cache_stat, cpu);
279 }
280 return NULL;
281
282 }
283
284 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
285 {
286
287 }
288
289 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
290 {
291 struct rt_cache_stat *st = v;
292
293 if (v == SEQ_START_TOKEN) {
294 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
295 return 0;
296 }
297
298 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
299 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
300 dst_entries_get_slow(&ipv4_dst_ops),
301 0, /* st->in_hit */
302 st->in_slow_tot,
303 st->in_slow_mc,
304 st->in_no_route,
305 st->in_brd,
306 st->in_martian_dst,
307 st->in_martian_src,
308
309 0, /* st->out_hit */
310 st->out_slow_tot,
311 st->out_slow_mc,
312
313 0, /* st->gc_total */
314 0, /* st->gc_ignored */
315 0, /* st->gc_goal_miss */
316 0, /* st->gc_dst_overflow */
317 0, /* st->in_hlist_search */
318 0 /* st->out_hlist_search */
319 );
320 return 0;
321 }
322
323 static const struct seq_operations rt_cpu_seq_ops = {
324 .start = rt_cpu_seq_start,
325 .next = rt_cpu_seq_next,
326 .stop = rt_cpu_seq_stop,
327 .show = rt_cpu_seq_show,
328 };
329
330
331 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
332 {
333 return seq_open(file, &rt_cpu_seq_ops);
334 }
335
336 static const struct file_operations rt_cpu_seq_fops = {
337 .owner = THIS_MODULE,
338 .open = rt_cpu_seq_open,
339 .read = seq_read,
340 .llseek = seq_lseek,
341 .release = seq_release,
342 };
343
344 #ifdef CONFIG_IP_ROUTE_CLASSID
345 static int rt_acct_proc_show(struct seq_file *m, void *v)
346 {
347 struct ip_rt_acct *dst, *src;
348 unsigned int i, j;
349
350 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
351 if (!dst)
352 return -ENOMEM;
353
354 for_each_possible_cpu(i) {
355 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
356 for (j = 0; j < 256; j++) {
357 dst[j].o_bytes += src[j].o_bytes;
358 dst[j].o_packets += src[j].o_packets;
359 dst[j].i_bytes += src[j].i_bytes;
360 dst[j].i_packets += src[j].i_packets;
361 }
362 }
363
364 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
365 kfree(dst);
366 return 0;
367 }
368
369 static int rt_acct_proc_open(struct inode *inode, struct file *file)
370 {
371 return single_open(file, rt_acct_proc_show, NULL);
372 }
373
374 static const struct file_operations rt_acct_proc_fops = {
375 .owner = THIS_MODULE,
376 .open = rt_acct_proc_open,
377 .read = seq_read,
378 .llseek = seq_lseek,
379 .release = single_release,
380 };
381 #endif
382
383 static int __net_init ip_rt_do_proc_init(struct net *net)
384 {
385 struct proc_dir_entry *pde;
386
387 pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
388 &rt_cache_seq_fops);
389 if (!pde)
390 goto err1;
391
392 pde = proc_create("rt_cache", S_IRUGO,
393 net->proc_net_stat, &rt_cpu_seq_fops);
394 if (!pde)
395 goto err2;
396
397 #ifdef CONFIG_IP_ROUTE_CLASSID
398 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
399 if (!pde)
400 goto err3;
401 #endif
402 return 0;
403
404 #ifdef CONFIG_IP_ROUTE_CLASSID
405 err3:
406 remove_proc_entry("rt_cache", net->proc_net_stat);
407 #endif
408 err2:
409 remove_proc_entry("rt_cache", net->proc_net);
410 err1:
411 return -ENOMEM;
412 }
413
414 static void __net_exit ip_rt_do_proc_exit(struct net *net)
415 {
416 remove_proc_entry("rt_cache", net->proc_net_stat);
417 remove_proc_entry("rt_cache", net->proc_net);
418 #ifdef CONFIG_IP_ROUTE_CLASSID
419 remove_proc_entry("rt_acct", net->proc_net);
420 #endif
421 }
422
423 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
424 .init = ip_rt_do_proc_init,
425 .exit = ip_rt_do_proc_exit,
426 };
427
428 static int __init ip_rt_proc_init(void)
429 {
430 return register_pernet_subsys(&ip_rt_proc_ops);
431 }
432
433 #else
434 static inline int ip_rt_proc_init(void)
435 {
436 return 0;
437 }
438 #endif /* CONFIG_PROC_FS */
439
440 static inline bool rt_is_expired(const struct rtable *rth)
441 {
442 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
443 }
444
445 void rt_cache_flush(struct net *net)
446 {
447 rt_genid_bump_ipv4(net);
448 }
449
450 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
451 struct sk_buff *skb,
452 const void *daddr)
453 {
454 struct net_device *dev = dst->dev;
455 const __be32 *pkey = daddr;
456 const struct rtable *rt;
457 struct neighbour *n;
458
459 rt = (const struct rtable *) dst;
460 if (rt->rt_gateway)
461 pkey = (const __be32 *) &rt->rt_gateway;
462 else if (skb)
463 pkey = &ip_hdr(skb)->daddr;
464
465 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
466 if (n)
467 return n;
468 return neigh_create(&arp_tbl, pkey, dev);
469 }
470
471 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
472 {
473 struct net_device *dev = dst->dev;
474 const __be32 *pkey = daddr;
475 const struct rtable *rt;
476
477 rt = (const struct rtable *)dst;
478 if (rt->rt_gateway)
479 pkey = (const __be32 *)&rt->rt_gateway;
480 else if (!daddr ||
481 (rt->rt_flags &
482 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
483 return;
484
485 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
486 }
487
488 #define IP_IDENTS_SZ 2048u
489
490 static atomic_t *ip_idents __read_mostly;
491 static u32 *ip_tstamps __read_mostly;
492
493 /* In order to protect privacy, we add a perturbation to identifiers
494 * if one generator is seldom used. This makes it hard for an attacker
495 * to infer how many packets were sent between two points in time.
496 */
497 u32 ip_idents_reserve(u32 hash, int segs)
498 {
499 u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
500 atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
501 u32 old = READ_ONCE(*p_tstamp);
502 u32 now = (u32)jiffies;
503 u32 new, delta = 0;
504
505 if (old != now && cmpxchg(p_tstamp, old, now) == old)
506 delta = prandom_u32_max(now - old);
507
508 /* Do not use atomic_add_return() as it makes UBSAN unhappy */
509 do {
510 old = (u32)atomic_read(p_id);
511 new = old + delta + segs;
512 } while (atomic_cmpxchg(p_id, old, new) != old);
513
514 return new - segs;
515 }
516 EXPORT_SYMBOL(ip_idents_reserve);
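/* As a rough illustration of the scheme above: if a given bucket was
 * last used 100 jiffies ago, the next reserved ID jumps ahead by a
 * random offset in [0, 100) in addition to the segment count, so an
 * observer cannot count the packets sent in between simply by diffing
 * consecutive IP IDs.
 */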
517
518 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
519 {
520 static u32 ip_idents_hashrnd __read_mostly;
521 u32 hash, id;
522
523 net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
524
525 hash = jhash_3words((__force u32)iph->daddr,
526 (__force u32)iph->saddr,
527 iph->protocol ^ net_hash_mix(net),
528 ip_idents_hashrnd);
529 id = ip_idents_reserve(hash, segs);
530 iph->id = htons(id);
531 }
532 EXPORT_SYMBOL(__ip_select_ident);
533
534 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
535 const struct sock *sk,
536 const struct iphdr *iph,
537 int oif, u8 tos,
538 u8 prot, u32 mark, int flow_flags)
539 {
540 if (sk) {
541 const struct inet_sock *inet = inet_sk(sk);
542
543 oif = sk->sk_bound_dev_if;
544 mark = sk->sk_mark;
545 tos = RT_CONN_FLAGS(sk);
546 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
547 }
548 flowi4_init_output(fl4, oif, mark, tos,
549 RT_SCOPE_UNIVERSE, prot,
550 flow_flags,
551 iph->daddr, iph->saddr, 0, 0,
552 sock_net_uid(net, sk));
553 }
554
555 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
556 const struct sock *sk)
557 {
558 const struct net *net = dev_net(skb->dev);
559 const struct iphdr *iph = ip_hdr(skb);
560 int oif = skb->dev->ifindex;
561 u8 tos = RT_TOS(iph->tos);
562 u8 prot = iph->protocol;
563 u32 mark = skb->mark;
564
565 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
566 }
567
568 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
569 {
570 const struct inet_sock *inet = inet_sk(sk);
571 const struct ip_options_rcu *inet_opt;
572 __be32 daddr = inet->inet_daddr;
573
574 rcu_read_lock();
575 inet_opt = rcu_dereference(inet->inet_opt);
576 if (inet_opt && inet_opt->opt.srr)
577 daddr = inet_opt->opt.faddr;
578 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
579 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
580 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
581 inet_sk_flowi_flags(sk),
582 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
583 rcu_read_unlock();
584 }
585
586 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
587 const struct sk_buff *skb)
588 {
589 if (skb)
590 build_skb_flow_key(fl4, skb, sk);
591 else
592 build_sk_flow_key(fl4, sk);
593 }
594
595 static DEFINE_SPINLOCK(fnhe_lock);
596
597 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
598 {
599 struct rtable *rt;
600
601 rt = rcu_dereference(fnhe->fnhe_rth_input);
602 if (rt) {
603 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
604 dst_dev_put(&rt->dst);
605 dst_release(&rt->dst);
606 }
607 rt = rcu_dereference(fnhe->fnhe_rth_output);
608 if (rt) {
609 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
610 dst_dev_put(&rt->dst);
611 dst_release(&rt->dst);
612 }
613 }
614
615 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
616 {
617 struct fib_nh_exception *fnhe, *oldest;
618
619 oldest = rcu_dereference(hash->chain);
620 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
621 fnhe = rcu_dereference(fnhe->fnhe_next)) {
622 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
623 oldest = fnhe;
624 }
625 fnhe_flush_routes(oldest);
626 return oldest;
627 }
628
629 static inline u32 fnhe_hashfun(__be32 daddr)
630 {
631 static u32 fnhe_hashrnd __read_mostly;
632 u32 hval;
633
634 net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
635 hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
636 return hash_32(hval, FNHE_HASH_SHIFT);
637 }
638
639 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
640 {
641 rt->rt_pmtu = fnhe->fnhe_pmtu;
642 rt->dst.expires = fnhe->fnhe_expires;
643
644 if (fnhe->fnhe_gw) {
645 rt->rt_flags |= RTCF_REDIRECTED;
646 rt->rt_gateway = fnhe->fnhe_gw;
647 rt->rt_uses_gateway = 1;
648 }
649 }
650
651 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
652 u32 pmtu, unsigned long expires)
653 {
654 struct fnhe_hash_bucket *hash;
655 struct fib_nh_exception *fnhe;
656 struct rtable *rt;
657 u32 genid, hval;
658 unsigned int i;
659 int depth;
660
661 genid = fnhe_genid(dev_net(nh->nh_dev));
662 hval = fnhe_hashfun(daddr);
663
664 spin_lock_bh(&fnhe_lock);
665
666 hash = rcu_dereference(nh->nh_exceptions);
667 if (!hash) {
668 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
669 if (!hash)
670 goto out_unlock;
671 rcu_assign_pointer(nh->nh_exceptions, hash);
672 }
673
674 hash += hval;
675
676 depth = 0;
677 for (fnhe = rcu_dereference(hash->chain); fnhe;
678 fnhe = rcu_dereference(fnhe->fnhe_next)) {
679 if (fnhe->fnhe_daddr == daddr)
680 break;
681 depth++;
682 }
683
684 if (fnhe) {
685 if (fnhe->fnhe_genid != genid)
686 fnhe->fnhe_genid = genid;
687 if (gw)
688 fnhe->fnhe_gw = gw;
689 if (pmtu)
690 fnhe->fnhe_pmtu = pmtu;
691 fnhe->fnhe_expires = max(1UL, expires);
692 /* Update all cached dsts too */
693 rt = rcu_dereference(fnhe->fnhe_rth_input);
694 if (rt)
695 fill_route_from_fnhe(rt, fnhe);
696 rt = rcu_dereference(fnhe->fnhe_rth_output);
697 if (rt)
698 fill_route_from_fnhe(rt, fnhe);
699 } else {
700 if (depth > FNHE_RECLAIM_DEPTH)
701 fnhe = fnhe_oldest(hash);
702 else {
703 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
704 if (!fnhe)
705 goto out_unlock;
706
707 fnhe->fnhe_next = hash->chain;
708 rcu_assign_pointer(hash->chain, fnhe);
709 }
710 fnhe->fnhe_genid = genid;
711 fnhe->fnhe_daddr = daddr;
712 fnhe->fnhe_gw = gw;
713 fnhe->fnhe_pmtu = pmtu;
714 fnhe->fnhe_expires = expires;
715
716 /* Exception created; mark the cached routes for the nexthop
717 * stale, so anyone caching it rechecks if this exception
718 * applies to them.
719 */
720 rt = rcu_dereference(nh->nh_rth_input);
721 if (rt)
722 rt->dst.obsolete = DST_OBSOLETE_KILL;
723
724 for_each_possible_cpu(i) {
725 struct rtable __rcu **prt;
726 prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
727 rt = rcu_dereference(*prt);
728 if (rt)
729 rt->dst.obsolete = DST_OBSOLETE_KILL;
730 }
731 }
732
733 fnhe->fnhe_stamp = jiffies;
734
735 out_unlock:
736 spin_unlock_bh(&fnhe_lock);
737 }
738
739 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
740 bool kill_route)
741 {
742 __be32 new_gw = icmp_hdr(skb)->un.gateway;
743 __be32 old_gw = ip_hdr(skb)->saddr;
744 struct net_device *dev = skb->dev;
745 struct in_device *in_dev;
746 struct fib_result res;
747 struct neighbour *n;
748 struct net *net;
749
750 switch (icmp_hdr(skb)->code & 7) {
751 case ICMP_REDIR_NET:
752 case ICMP_REDIR_NETTOS:
753 case ICMP_REDIR_HOST:
754 case ICMP_REDIR_HOSTTOS:
755 break;
756
757 default:
758 return;
759 }
760
761 if (rt->rt_gateway != old_gw)
762 return;
763
764 in_dev = __in_dev_get_rcu(dev);
765 if (!in_dev)
766 return;
767
768 net = dev_net(dev);
769 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
770 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
771 ipv4_is_zeronet(new_gw))
772 goto reject_redirect;
773
774 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
775 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
776 goto reject_redirect;
777 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
778 goto reject_redirect;
779 } else {
780 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
781 goto reject_redirect;
782 }
783
784 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
785 if (!n)
786 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
787 if (!IS_ERR(n)) {
788 if (!(n->nud_state & NUD_VALID)) {
789 neigh_event_send(n, NULL);
790 } else {
791 if (fib_lookup(net, fl4, &res, 0) == 0) {
792 struct fib_nh *nh = &FIB_RES_NH(res);
793
794 update_or_create_fnhe(nh, fl4->daddr, new_gw,
795 0, jiffies + ip_rt_gc_timeout);
796 }
797 if (kill_route)
798 rt->dst.obsolete = DST_OBSOLETE_KILL;
799 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
800 }
801 neigh_release(n);
802 }
803 return;
804
805 reject_redirect:
806 #ifdef CONFIG_IP_ROUTE_VERBOSE
807 if (IN_DEV_LOG_MARTIANS(in_dev)) {
808 const struct iphdr *iph = (const struct iphdr *) skb->data;
809 __be32 daddr = iph->daddr;
810 __be32 saddr = iph->saddr;
811
812 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
813 " Advised path = %pI4 -> %pI4\n",
814 &old_gw, dev->name, &new_gw,
815 &saddr, &daddr);
816 }
817 #endif
818 ;
819 }
820
821 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
822 {
823 struct rtable *rt;
824 struct flowi4 fl4;
825 const struct iphdr *iph = (const struct iphdr *) skb->data;
826 struct net *net = dev_net(skb->dev);
827 int oif = skb->dev->ifindex;
828 u8 tos = RT_TOS(iph->tos);
829 u8 prot = iph->protocol;
830 u32 mark = skb->mark;
831
832 rt = (struct rtable *) dst;
833
834 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
835 __ip_do_redirect(rt, skb, &fl4, true);
836 }
837
838 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
839 {
840 struct rtable *rt = (struct rtable *)dst;
841 struct dst_entry *ret = dst;
842
843 if (rt) {
844 if (dst->obsolete > 0) {
845 ip_rt_put(rt);
846 ret = NULL;
847 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
848 rt->dst.expires) {
849 ip_rt_put(rt);
850 ret = NULL;
851 }
852 }
853 return ret;
854 }
855
856 /*
857 * Algorithm:
858 * 1. The first ip_rt_redirect_number redirects are sent
859 * with exponential backoff, then we stop sending them at all,
860 * assuming that the host ignores our redirects.
861 * 2. If we did not see packets requiring redirects
862 * during ip_rt_redirect_silence, we assume that the host
863 * forgot the redirected route and we start sending redirects again.
864 *
865 * This algorithm is much cheaper and more intelligent than dumb load limiting
866 * in icmp.c.
867 *
868 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
869 * and "frag. need" (breaks PMTU discovery) in icmp.c.
870 */
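/* A rough worked example with the defaults above (HZ-dependent):
 * ip_rt_redirect_load is HZ/50 (20 ms), so the first redirect goes out
 * immediately, and redirect n (2 <= n <= 9) is sent only once
 * 20 ms << (n - 1) has elapsed since the previous one: 40 ms, 80 ms,
 * ... about 5.1 s before the 9th. After ip_rt_redirect_number (9)
 * redirects we go silent, and only start over once
 * ip_rt_redirect_silence = 20 ms << 10, roughly 20.5 s, has passed
 * without a redirect-triggering packet.
 */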
871
872 void ip_rt_send_redirect(struct sk_buff *skb)
873 {
874 struct rtable *rt = skb_rtable(skb);
875 struct in_device *in_dev;
876 struct inet_peer *peer;
877 struct net *net;
878 int log_martians;
879 int vif;
880
881 rcu_read_lock();
882 in_dev = __in_dev_get_rcu(rt->dst.dev);
883 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
884 rcu_read_unlock();
885 return;
886 }
887 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
888 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
889 rcu_read_unlock();
890
891 net = dev_net(rt->dst.dev);
892 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
893 if (!peer) {
894 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
895 rt_nexthop(rt, ip_hdr(skb)->daddr));
896 return;
897 }
898
899 /* No redirected packets during ip_rt_redirect_silence;
900 * reset the algorithm.
901 */
902 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
903 peer->rate_tokens = 0;
904
905 /* Too many ignored redirects; do not send anything.
906 * Just update peer->rate_last to record the last redirect-worthy packet seen.
907 */
908 if (peer->rate_tokens >= ip_rt_redirect_number) {
909 peer->rate_last = jiffies;
910 goto out_put_peer;
911 }
912
913 /* Check for load limit; set rate_last to the latest sent
914 * redirect.
915 */
916 if (peer->rate_tokens == 0 ||
917 time_after(jiffies,
918 (peer->rate_last +
919 (ip_rt_redirect_load << peer->rate_tokens)))) {
920 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
921
922 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
923 peer->rate_last = jiffies;
924 ++peer->rate_tokens;
925 #ifdef CONFIG_IP_ROUTE_VERBOSE
926 if (log_martians &&
927 peer->rate_tokens == ip_rt_redirect_number)
928 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
929 &ip_hdr(skb)->saddr, inet_iif(skb),
930 &ip_hdr(skb)->daddr, &gw);
931 #endif
932 }
933 out_put_peer:
934 inet_putpeer(peer);
935 }
936
937 static int ip_error(struct sk_buff *skb)
938 {
939 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
940 struct rtable *rt = skb_rtable(skb);
941 struct inet_peer *peer;
942 unsigned long now;
943 struct net *net;
944 bool send;
945 int code;
946
947 /* IP on this device is disabled. */
948 if (!in_dev)
949 goto out;
950
951 net = dev_net(rt->dst.dev);
952 if (!IN_DEV_FORWARD(in_dev)) {
953 switch (rt->dst.error) {
954 case EHOSTUNREACH:
955 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
956 break;
957
958 case ENETUNREACH:
959 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
960 break;
961 }
962 goto out;
963 }
964
965 switch (rt->dst.error) {
966 case EINVAL:
967 default:
968 goto out;
969 case EHOSTUNREACH:
970 code = ICMP_HOST_UNREACH;
971 break;
972 case ENETUNREACH:
973 code = ICMP_NET_UNREACH;
974 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
975 break;
976 case EACCES:
977 code = ICMP_PKT_FILTERED;
978 break;
979 }
980
981 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
982 l3mdev_master_ifindex(skb->dev), 1);
983
984 send = true;
985 if (peer) {
986 now = jiffies;
987 peer->rate_tokens += now - peer->rate_last;
988 if (peer->rate_tokens > ip_rt_error_burst)
989 peer->rate_tokens = ip_rt_error_burst;
990 peer->rate_last = now;
991 if (peer->rate_tokens >= ip_rt_error_cost)
992 peer->rate_tokens -= ip_rt_error_cost;
993 else
994 send = false;
995 inet_putpeer(peer);
996 }
997 if (send)
998 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
999
1000 out: kfree_skb(skb);
1001 return 0;
1002 }
1003
1004 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1005 {
1006 struct dst_entry *dst = &rt->dst;
1007 struct fib_result res;
1008
1009 if (dst_metric_locked(dst, RTAX_MTU))
1010 return;
1011
1012 if (ipv4_mtu(dst) < mtu)
1013 return;
1014
1015 if (mtu < ip_rt_min_pmtu)
1016 mtu = ip_rt_min_pmtu;
1017
1018 if (rt->rt_pmtu == mtu &&
1019 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1020 return;
1021
1022 rcu_read_lock();
1023 if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1024 struct fib_nh *nh = &FIB_RES_NH(res);
1025
1026 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
1027 jiffies + ip_rt_mtu_expires);
1028 }
1029 rcu_read_unlock();
1030 }
1031
1032 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1033 struct sk_buff *skb, u32 mtu)
1034 {
1035 struct rtable *rt = (struct rtable *) dst;
1036 struct flowi4 fl4;
1037
1038 ip_rt_build_flow_key(&fl4, sk, skb);
1039 __ip_rt_update_pmtu(rt, &fl4, mtu);
1040 }
1041
1042 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1043 int oif, u32 mark, u8 protocol, int flow_flags)
1044 {
1045 const struct iphdr *iph = (const struct iphdr *) skb->data;
1046 struct flowi4 fl4;
1047 struct rtable *rt;
1048
1049 if (!mark)
1050 mark = IP4_REPLY_MARK(net, skb->mark);
1051
1052 __build_flow_key(net, &fl4, NULL, iph, oif,
1053 RT_TOS(iph->tos), protocol, mark, flow_flags);
1054 rt = __ip_route_output_key(net, &fl4);
1055 if (!IS_ERR(rt)) {
1056 __ip_rt_update_pmtu(rt, &fl4, mtu);
1057 ip_rt_put(rt);
1058 }
1059 }
1060 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1061
1062 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1063 {
1064 const struct iphdr *iph = (const struct iphdr *) skb->data;
1065 struct flowi4 fl4;
1066 struct rtable *rt;
1067
1068 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1069
1070 if (!fl4.flowi4_mark)
1071 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1072
1073 rt = __ip_route_output_key(sock_net(sk), &fl4);
1074 if (!IS_ERR(rt)) {
1075 __ip_rt_update_pmtu(rt, &fl4, mtu);
1076 ip_rt_put(rt);
1077 }
1078 }
1079
1080 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1081 {
1082 const struct iphdr *iph = (const struct iphdr *) skb->data;
1083 struct flowi4 fl4;
1084 struct rtable *rt;
1085 struct dst_entry *odst = NULL;
1086 bool new = false;
1087 struct net *net = sock_net(sk);
1088
1089 bh_lock_sock(sk);
1090
1091 if (!ip_sk_accept_pmtu(sk))
1092 goto out;
1093
1094 odst = sk_dst_get(sk);
1095
1096 if (sock_owned_by_user(sk) || !odst) {
1097 __ipv4_sk_update_pmtu(skb, sk, mtu);
1098 goto out;
1099 }
1100
1101 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1102
1103 rt = (struct rtable *)odst;
1104 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1105 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1106 if (IS_ERR(rt))
1107 goto out;
1108
1109 new = true;
1110 }
1111
1112 __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1113
1114 if (!dst_check(&rt->dst, 0)) {
1115 if (new)
1116 dst_release(&rt->dst);
1117
1118 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1119 if (IS_ERR(rt))
1120 goto out;
1121
1122 new = true;
1123 }
1124
1125 if (new)
1126 sk_dst_set(sk, &rt->dst);
1127
1128 out:
1129 bh_unlock_sock(sk);
1130 dst_release(odst);
1131 }
1132 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1133
1134 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1135 int oif, u32 mark, u8 protocol, int flow_flags)
1136 {
1137 const struct iphdr *iph = (const struct iphdr *) skb->data;
1138 struct flowi4 fl4;
1139 struct rtable *rt;
1140
1141 __build_flow_key(net, &fl4, NULL, iph, oif,
1142 RT_TOS(iph->tos), protocol, mark, flow_flags);
1143 rt = __ip_route_output_key(net, &fl4);
1144 if (!IS_ERR(rt)) {
1145 __ip_do_redirect(rt, skb, &fl4, false);
1146 ip_rt_put(rt);
1147 }
1148 }
1149 EXPORT_SYMBOL_GPL(ipv4_redirect);
1150
1151 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1152 {
1153 const struct iphdr *iph = (const struct iphdr *) skb->data;
1154 struct flowi4 fl4;
1155 struct rtable *rt;
1156 struct net *net = sock_net(sk);
1157
1158 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1159 rt = __ip_route_output_key(net, &fl4);
1160 if (!IS_ERR(rt)) {
1161 __ip_do_redirect(rt, skb, &fl4, false);
1162 ip_rt_put(rt);
1163 }
1164 }
1165 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1166
1167 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1168 {
1169 struct rtable *rt = (struct rtable *) dst;
1170
1171 /* All IPV4 dsts are created with ->obsolete set to the value
1172 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1173 * into this function always.
1174 *
1175 * When a PMTU/redirect information update invalidates a route,
1176 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1177 * DST_OBSOLETE_DEAD by dst_free().
1178 */
1179 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1180 return NULL;
1181 return dst;
1182 }
1183
1184 static void ipv4_link_failure(struct sk_buff *skb)
1185 {
1186 struct rtable *rt;
1187
1188 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1189
1190 rt = skb_rtable(skb);
1191 if (rt)
1192 dst_set_expires(&rt->dst, 0);
1193 }
1194
1195 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1196 {
1197 pr_debug("%s: %pI4 -> %pI4, %s\n",
1198 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1199 skb->dev ? skb->dev->name : "?");
1200 kfree_skb(skb);
1201 WARN_ON(1);
1202 return 0;
1203 }
1204
1205 /*
1206 We do not cache the source address of the outgoing interface,
1207 because it is used only by the IP RR, TS and SRR options,
1208 so it is out of the fast path.
1209
1210 BTW remember: "addr" is allowed to be unaligned
1211 in IP options!
1212 */
1213
1214 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1215 {
1216 __be32 src;
1217
1218 if (rt_is_output_route(rt))
1219 src = ip_hdr(skb)->saddr;
1220 else {
1221 struct fib_result res;
1222 struct flowi4 fl4;
1223 struct iphdr *iph;
1224
1225 iph = ip_hdr(skb);
1226
1227 memset(&fl4, 0, sizeof(fl4));
1228 fl4.daddr = iph->daddr;
1229 fl4.saddr = iph->saddr;
1230 fl4.flowi4_tos = RT_TOS(iph->tos);
1231 fl4.flowi4_oif = rt->dst.dev->ifindex;
1232 fl4.flowi4_iif = skb->dev->ifindex;
1233 fl4.flowi4_mark = skb->mark;
1234
1235 rcu_read_lock();
1236 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1237 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1238 else
1239 src = inet_select_addr(rt->dst.dev,
1240 rt_nexthop(rt, iph->daddr),
1241 RT_SCOPE_UNIVERSE);
1242 rcu_read_unlock();
1243 }
1244 memcpy(addr, &src, 4);
1245 }
1246
1247 #ifdef CONFIG_IP_ROUTE_CLASSID
1248 static void set_class_tag(struct rtable *rt, u32 tag)
1249 {
1250 if (!(rt->dst.tclassid & 0xFFFF))
1251 rt->dst.tclassid |= tag & 0xFFFF;
1252 if (!(rt->dst.tclassid & 0xFFFF0000))
1253 rt->dst.tclassid |= tag & 0xFFFF0000;
1254 }
1255 #endif
1256
1257 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1258 {
1259 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1260 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1261 ip_rt_min_advmss);
1262
1263 return min(advmss, IPV4_MAX_PMTU - header_size);
1264 }
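/* A quick worked example of the above: for a 1500-byte path MTU the
 * advertised MSS is max(1500 - 40, 256) = 1460 bytes; the result is
 * never allowed to exceed IPV4_MAX_PMTU - 40 = 65495 bytes nor to drop
 * below ip_rt_min_advmss (256 by default).
 */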
1265
1266 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1267 {
1268 const struct rtable *rt = (const struct rtable *) dst;
1269 unsigned int mtu = rt->rt_pmtu;
1270
1271 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1272 mtu = dst_metric_raw(dst, RTAX_MTU);
1273
1274 if (mtu)
1275 return mtu;
1276
1277 mtu = READ_ONCE(dst->dev->mtu);
1278
1279 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1280 if (rt->rt_uses_gateway && mtu > 576)
1281 mtu = 576;
1282 }
1283
1284 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1285
1286 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1287 }
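/* Summary of the branches above: a still-valid learned PMTU is
 * returned as-is; failing that, an explicit RTAX_MTU metric; only when
 * neither exists do we fall back to the device MTU, capped at 576 for
 * locked-metric routes via a gateway, clamped to IP_MAX_MTU, and
 * reduced by any lwtunnel encapsulation headroom.
 */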
1288
1289 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1290 {
1291 struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1292 struct fib_nh_exception *fnhe;
1293 u32 hval;
1294
1295 if (!hash)
1296 return NULL;
1297
1298 hval = fnhe_hashfun(daddr);
1299
1300 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1301 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1302 if (fnhe->fnhe_daddr == daddr)
1303 return fnhe;
1304 }
1305 return NULL;
1306 }
1307
1308 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1309 __be32 daddr, const bool do_cache)
1310 {
1311 bool ret = false;
1312
1313 spin_lock_bh(&fnhe_lock);
1314
1315 if (daddr == fnhe->fnhe_daddr) {
1316 struct rtable __rcu **porig;
1317 struct rtable *orig;
1318 int genid = fnhe_genid(dev_net(rt->dst.dev));
1319
1320 if (rt_is_input_route(rt))
1321 porig = &fnhe->fnhe_rth_input;
1322 else
1323 porig = &fnhe->fnhe_rth_output;
1324 orig = rcu_dereference(*porig);
1325
1326 if (fnhe->fnhe_genid != genid) {
1327 fnhe->fnhe_genid = genid;
1328 fnhe->fnhe_gw = 0;
1329 fnhe->fnhe_pmtu = 0;
1330 fnhe->fnhe_expires = 0;
1331 fnhe_flush_routes(fnhe);
1332 orig = NULL;
1333 }
1334 fill_route_from_fnhe(rt, fnhe);
1335 if (!rt->rt_gateway)
1336 rt->rt_gateway = daddr;
1337
1338 if (do_cache) {
1339 dst_hold(&rt->dst);
1340 rcu_assign_pointer(*porig, rt);
1341 if (orig) {
1342 dst_dev_put(&orig->dst);
1343 dst_release(&orig->dst);
1344 }
1345 ret = true;
1346 }
1347
1348 fnhe->fnhe_stamp = jiffies;
1349 }
1350 spin_unlock_bh(&fnhe_lock);
1351
1352 return ret;
1353 }
1354
1355 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1356 {
1357 struct rtable *orig, *prev, **p;
1358 bool ret = true;
1359
1360 if (rt_is_input_route(rt)) {
1361 p = (struct rtable **)&nh->nh_rth_input;
1362 } else {
1363 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1364 }
1365 orig = *p;
1366
1367 /* hold dst before doing cmpxchg() to avoid race condition
1368 * on this dst
1369 */
1370 dst_hold(&rt->dst);
1371 prev = cmpxchg(p, orig, rt);
1372 if (prev == orig) {
1373 if (orig) {
1374 dst_dev_put(&orig->dst);
1375 dst_release(&orig->dst);
1376 }
1377 } else {
1378 dst_release(&rt->dst);
1379 ret = false;
1380 }
1381
1382 return ret;
1383 }
1384
1385 struct uncached_list {
1386 spinlock_t lock;
1387 struct list_head head;
1388 };
1389
1390 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1391
1392 static void rt_add_uncached_list(struct rtable *rt)
1393 {
1394 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1395
1396 rt->rt_uncached_list = ul;
1397
1398 spin_lock_bh(&ul->lock);
1399 list_add_tail(&rt->rt_uncached, &ul->head);
1400 spin_unlock_bh(&ul->lock);
1401 }
1402
1403 static void ipv4_dst_destroy(struct dst_entry *dst)
1404 {
1405 struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1406 struct rtable *rt = (struct rtable *) dst;
1407
1408 if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1409 kfree(p);
1410
1411 if (!list_empty(&rt->rt_uncached)) {
1412 struct uncached_list *ul = rt->rt_uncached_list;
1413
1414 spin_lock_bh(&ul->lock);
1415 list_del(&rt->rt_uncached);
1416 spin_unlock_bh(&ul->lock);
1417 }
1418 }
1419
1420 void rt_flush_dev(struct net_device *dev)
1421 {
1422 struct net *net = dev_net(dev);
1423 struct rtable *rt;
1424 int cpu;
1425
1426 for_each_possible_cpu(cpu) {
1427 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1428
1429 spin_lock_bh(&ul->lock);
1430 list_for_each_entry(rt, &ul->head, rt_uncached) {
1431 if (rt->dst.dev != dev)
1432 continue;
1433 rt->dst.dev = net->loopback_dev;
1434 dev_hold(rt->dst.dev);
1435 dev_put(dev);
1436 }
1437 spin_unlock_bh(&ul->lock);
1438 }
1439 }
1440
1441 static bool rt_cache_valid(const struct rtable *rt)
1442 {
1443 return rt &&
1444 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1445 !rt_is_expired(rt);
1446 }
1447
1448 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1449 const struct fib_result *res,
1450 struct fib_nh_exception *fnhe,
1451 struct fib_info *fi, u16 type, u32 itag,
1452 const bool do_cache)
1453 {
1454 bool cached = false;
1455
1456 if (fi) {
1457 struct fib_nh *nh = &FIB_RES_NH(*res);
1458
1459 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1460 rt->rt_gateway = nh->nh_gw;
1461 rt->rt_uses_gateway = 1;
1462 }
1463 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1464 if (fi->fib_metrics != &dst_default_metrics) {
1465 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1466 refcount_inc(&fi->fib_metrics->refcnt);
1467 }
1468 #ifdef CONFIG_IP_ROUTE_CLASSID
1469 rt->dst.tclassid = nh->nh_tclassid;
1470 #endif
1471 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1472 if (unlikely(fnhe))
1473 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1474 else if (do_cache)
1475 cached = rt_cache_route(nh, rt);
1476 if (unlikely(!cached)) {
1477 /* Routes we intend to cache in nexthop exception or
1478 * FIB nexthop have the DST_NOCACHE bit clear.
1479 * However, if we are unsuccessful at storing this
1480 * route into the cache we really need to set it.
1481 */
1482 if (!rt->rt_gateway)
1483 rt->rt_gateway = daddr;
1484 rt_add_uncached_list(rt);
1485 }
1486 } else
1487 rt_add_uncached_list(rt);
1488
1489 #ifdef CONFIG_IP_ROUTE_CLASSID
1490 #ifdef CONFIG_IP_MULTIPLE_TABLES
1491 set_class_tag(rt, res->tclassid);
1492 #endif
1493 set_class_tag(rt, itag);
1494 #endif
1495 }
1496
1497 struct rtable *rt_dst_alloc(struct net_device *dev,
1498 unsigned int flags, u16 type,
1499 bool nopolicy, bool noxfrm, bool will_cache)
1500 {
1501 struct rtable *rt;
1502
1503 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1504 (will_cache ? 0 : DST_HOST) |
1505 (nopolicy ? DST_NOPOLICY : 0) |
1506 (noxfrm ? DST_NOXFRM : 0));
1507
1508 if (rt) {
1509 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1510 rt->rt_flags = flags;
1511 rt->rt_type = type;
1512 rt->rt_is_input = 0;
1513 rt->rt_iif = 0;
1514 rt->rt_pmtu = 0;
1515 rt->rt_gateway = 0;
1516 rt->rt_uses_gateway = 0;
1517 rt->rt_table_id = 0;
1518 INIT_LIST_HEAD(&rt->rt_uncached);
1519
1520 rt->dst.output = ip_output;
1521 if (flags & RTCF_LOCAL)
1522 rt->dst.input = ip_local_deliver;
1523 }
1524
1525 return rt;
1526 }
1527 EXPORT_SYMBOL(rt_dst_alloc);
1528
1529 /* called in rcu_read_lock() section */
1530 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1531 u8 tos, struct net_device *dev,
1532 struct in_device *in_dev, u32 *itag)
1533 {
1534 int err;
1535
1536 /* Primary sanity checks. */
1537 if (!in_dev)
1538 return -EINVAL;
1539
1540 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1541 skb->protocol != htons(ETH_P_IP))
1542 return -EINVAL;
1543
1544 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1545 return -EINVAL;
1546
1547 if (ipv4_is_zeronet(saddr)) {
1548 if (!ipv4_is_local_multicast(daddr))
1549 return -EINVAL;
1550 } else {
1551 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1552 in_dev, itag);
1553 if (err < 0)
1554 return err;
1555 }
1556 return 0;
1557 }
1558
1559 /* called in rcu_read_lock() section */
1560 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1561 u8 tos, struct net_device *dev, int our)
1562 {
1563 struct in_device *in_dev = __in_dev_get_rcu(dev);
1564 unsigned int flags = RTCF_MULTICAST;
1565 struct rtable *rth;
1566 u32 itag = 0;
1567 int err;
1568
1569 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1570 if (err)
1571 return err;
1572
1573 if (our)
1574 flags |= RTCF_LOCAL;
1575
1576 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1577 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1578 if (!rth)
1579 return -ENOBUFS;
1580
1581 #ifdef CONFIG_IP_ROUTE_CLASSID
1582 rth->dst.tclassid = itag;
1583 #endif
1584 rth->dst.output = ip_rt_bug;
1585 rth->rt_is_input= 1;
1586
1587 #ifdef CONFIG_IP_MROUTE
1588 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1589 rth->dst.input = ip_mr_input;
1590 #endif
1591 RT_CACHE_STAT_INC(in_slow_mc);
1592
1593 skb_dst_set(skb, &rth->dst);
1594 return 0;
1595 }
1596
1597
1598 static void ip_handle_martian_source(struct net_device *dev,
1599 struct in_device *in_dev,
1600 struct sk_buff *skb,
1601 __be32 daddr,
1602 __be32 saddr)
1603 {
1604 RT_CACHE_STAT_INC(in_martian_src);
1605 #ifdef CONFIG_IP_ROUTE_VERBOSE
1606 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1607 /*
1608 * RFC1812 recommendation, if source is martian,
1609 * the only hint is MAC header.
1610 */
1611 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1612 &daddr, &saddr, dev->name);
1613 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1614 print_hex_dump(KERN_WARNING, "ll header: ",
1615 DUMP_PREFIX_OFFSET, 16, 1,
1616 skb_mac_header(skb),
1617 dev->hard_header_len, true);
1618 }
1619 }
1620 #endif
1621 }
1622
1623 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1624 {
1625 struct fnhe_hash_bucket *hash;
1626 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1627 u32 hval = fnhe_hashfun(daddr);
1628
1629 spin_lock_bh(&fnhe_lock);
1630
1631 hash = rcu_dereference_protected(nh->nh_exceptions,
1632 lockdep_is_held(&fnhe_lock));
1633 hash += hval;
1634
1635 fnhe_p = &hash->chain;
1636 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1637 while (fnhe) {
1638 if (fnhe->fnhe_daddr == daddr) {
1639 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1640 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1641 fnhe_flush_routes(fnhe);
1642 kfree_rcu(fnhe, rcu);
1643 break;
1644 }
1645 fnhe_p = &fnhe->fnhe_next;
1646 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1647 lockdep_is_held(&fnhe_lock));
1648 }
1649
1650 spin_unlock_bh(&fnhe_lock);
1651 }
1652
1653 static void set_lwt_redirect(struct rtable *rth)
1654 {
1655 if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1656 rth->dst.lwtstate->orig_output = rth->dst.output;
1657 rth->dst.output = lwtunnel_output;
1658 }
1659
1660 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1661 rth->dst.lwtstate->orig_input = rth->dst.input;
1662 rth->dst.input = lwtunnel_input;
1663 }
1664 }
1665
1666 /* called in rcu_read_lock() section */
1667 static int __mkroute_input(struct sk_buff *skb,
1668 const struct fib_result *res,
1669 struct in_device *in_dev,
1670 __be32 daddr, __be32 saddr, u32 tos)
1671 {
1672 struct fib_nh_exception *fnhe;
1673 struct rtable *rth;
1674 int err;
1675 struct in_device *out_dev;
1676 bool do_cache;
1677 u32 itag = 0;
1678
1679 /* get a working reference to the output device */
1680 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1681 if (!out_dev) {
1682 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1683 return -EINVAL;
1684 }
1685
1686 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1687 in_dev->dev, in_dev, &itag);
1688 if (err < 0) {
1689 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1690 saddr);
1691
1692 goto cleanup;
1693 }
1694
1695 do_cache = res->fi && !itag;
1696 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1697 skb->protocol == htons(ETH_P_IP) &&
1698 (IN_DEV_SHARED_MEDIA(out_dev) ||
1699 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1700 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1701
1702 if (skb->protocol != htons(ETH_P_IP)) {
1703 /* Not IP (i.e. ARP). Do not create a route if it is
1704 * invalid for proxy arp. DNAT routes are always valid.
1705 *
1706 * The proxy arp feature has been extended to allow ARP
1707 * replies back out the same interface, to support
1708 * Private VLAN switch technologies. See arp.c.
1709 */
1710 if (out_dev == in_dev &&
1711 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1712 err = -EINVAL;
1713 goto cleanup;
1714 }
1715 }
1716
1717 fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1718 if (do_cache) {
1719 if (fnhe) {
1720 rth = rcu_dereference(fnhe->fnhe_rth_input);
1721 if (rth && rth->dst.expires &&
1722 time_after(jiffies, rth->dst.expires)) {
1723 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1724 fnhe = NULL;
1725 } else {
1726 goto rt_cache;
1727 }
1728 }
1729
1730 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1731
1732 rt_cache:
1733 if (rt_cache_valid(rth)) {
1734 skb_dst_set_noref(skb, &rth->dst);
1735 goto out;
1736 }
1737 }
1738
1739 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1740 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1741 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1742 if (!rth) {
1743 err = -ENOBUFS;
1744 goto cleanup;
1745 }
1746
1747 rth->rt_is_input = 1;
1748 if (res->table)
1749 rth->rt_table_id = res->table->tb_id;
1750 RT_CACHE_STAT_INC(in_slow_tot);
1751
1752 rth->dst.input = ip_forward;
1753
1754 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1755 do_cache);
1756 set_lwt_redirect(rth);
1757 skb_dst_set(skb, &rth->dst);
1758 out:
1759 err = 0;
1760 cleanup:
1761 return err;
1762 }
1763
1764 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1765 /* To make ICMP packets follow the right flow, the multipath hash is
1766 * calculated from the inner IP addresses.
1767 */
1768 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1769 struct flow_keys *hash_keys)
1770 {
1771 const struct iphdr *outer_iph = ip_hdr(skb);
1772 const struct iphdr *inner_iph;
1773 const struct icmphdr *icmph;
1774 struct iphdr _inner_iph;
1775 struct icmphdr _icmph;
1776
1777 hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1778 hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1779 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1780 return;
1781
1782 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1783 return;
1784
1785 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1786 &_icmph);
1787 if (!icmph)
1788 return;
1789
1790 if (icmph->type != ICMP_DEST_UNREACH &&
1791 icmph->type != ICMP_REDIRECT &&
1792 icmph->type != ICMP_TIME_EXCEEDED &&
1793 icmph->type != ICMP_PARAMETERPROB)
1794 return;
1795
1796 inner_iph = skb_header_pointer(skb,
1797 outer_iph->ihl * 4 + sizeof(_icmph),
1798 sizeof(_inner_iph), &_inner_iph);
1799 if (!inner_iph)
1800 return;
1801 hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1802 hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1803 }
1804
1805 /* if skb is set it will be used and fl4 can be NULL */
1806 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1807 const struct sk_buff *skb)
1808 {
1809 struct net *net = fi->fib_net;
1810 struct flow_keys hash_keys;
1811 u32 mhash;
1812
1813 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1814 case 0:
1815 memset(&hash_keys, 0, sizeof(hash_keys));
1816 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1817 if (skb) {
1818 ip_multipath_l3_keys(skb, &hash_keys);
1819 } else {
1820 hash_keys.addrs.v4addrs.src = fl4->saddr;
1821 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1822 }
1823 break;
1824 case 1:
1825 /* skb is currently provided only when forwarding */
1826 if (skb) {
1827 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1828 struct flow_keys keys;
1829
1830 /* short-circuit if we already have L4 hash present */
1831 if (skb->l4_hash)
1832 return skb_get_hash_raw(skb) >> 1;
1833 memset(&hash_keys, 0, sizeof(hash_keys));
1834 skb_flow_dissect_flow_keys(skb, &keys, flag);
1835 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1836 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1837 hash_keys.ports.src = keys.ports.src;
1838 hash_keys.ports.dst = keys.ports.dst;
1839 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1840 } else {
1841 memset(&hash_keys, 0, sizeof(hash_keys));
1842 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1843 hash_keys.addrs.v4addrs.src = fl4->saddr;
1844 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1845 hash_keys.ports.src = fl4->fl4_sport;
1846 hash_keys.ports.dst = fl4->fl4_dport;
1847 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1848 }
1849 break;
1850 }
1851 mhash = flow_hash_from_keys(&hash_keys);
1852
1853 return mhash >> 1;
1854 }
1855 EXPORT_SYMBOL_GPL(fib_multipath_hash);
1856 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
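/* The final >> 1 above keeps the hash within the 31-bit range that
 * fib_select_multipath() compares against each nexthop's upper bound
 * when picking a path (see the caller in ip_mkroute_input() below);
 * this is a sketch of how the value is consumed, not a complete
 * description of the output path.
 */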
1857
1858 static int ip_mkroute_input(struct sk_buff *skb,
1859 struct fib_result *res,
1860 struct in_device *in_dev,
1861 __be32 daddr, __be32 saddr, u32 tos)
1862 {
1863 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1864 if (res->fi && res->fi->fib_nhs > 1) {
1865 int h = fib_multipath_hash(res->fi, NULL, skb);
1866
1867 fib_select_multipath(res, h);
1868 }
1869 #endif
1870
1871 /* create a routing cache entry */
1872 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1873 }
1874
1875 /*
1876 * NOTE. We drop all packets that have a local source
1877 * address, because every properly looped-back packet
1878 * must already have the correct destination attached by the output routine.
1879 *
1880 * This approach solves two big problems:
1881 * 1. Non-simplex devices are handled properly.
1882 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1883 * called with rcu_read_lock()
1884 */
1885
1886 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1887 u8 tos, struct net_device *dev,
1888 struct fib_result *res)
1889 {
1890 struct in_device *in_dev = __in_dev_get_rcu(dev);
1891 struct ip_tunnel_info *tun_info;
1892 struct flowi4 fl4;
1893 unsigned int flags = 0;
1894 u32 itag = 0;
1895 struct rtable *rth;
1896 int err = -EINVAL;
1897 struct net *net = dev_net(dev);
1898 bool do_cache;
1899
1900 /* IP on this device is disabled. */
1901
1902 if (!in_dev)
1903 goto out;
1904
1905 /* Check for the most weird martians, which may not be detected
1906 by fib_lookup.
1907 */
1908
1909 tun_info = skb_tunnel_info(skb);
1910 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1911 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1912 else
1913 fl4.flowi4_tun_key.tun_id = 0;
1914 skb_dst_drop(skb);
1915
1916 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1917 goto martian_source;
1918
1919 res->fi = NULL;
1920 res->table = NULL;
1921 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1922 goto brd_input;
1923
1924 /* Accept zero addresses only to limited broadcast;
1925 * I am not even sure whether to fix this or not. Waiting for complaints :-)
1926 */
1927 if (ipv4_is_zeronet(saddr))
1928 goto martian_source;
1929
1930 if (ipv4_is_zeronet(daddr))
1931 goto martian_destination;
1932
1933 /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1934 * and calls it at most once when daddr and/or saddr is a loopback address.
1935 */
1936 if (ipv4_is_loopback(daddr)) {
1937 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1938 goto martian_destination;
1939 } else if (ipv4_is_loopback(saddr)) {
1940 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1941 goto martian_source;
1942 }
1943
1944 /*
1945 * Now we are ready to route packet.
1946 */
1947 fl4.flowi4_oif = 0;
1948 fl4.flowi4_iif = dev->ifindex;
1949 fl4.flowi4_mark = skb->mark;
1950 fl4.flowi4_tos = tos;
1951 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1952 fl4.flowi4_flags = 0;
1953 fl4.daddr = daddr;
1954 fl4.saddr = saddr;
1955 fl4.flowi4_uid = sock_net_uid(net, NULL);
1956 err = fib_lookup(net, &fl4, res, 0);
1957 if (err != 0) {
1958 if (!IN_DEV_FORWARD(in_dev))
1959 err = -EHOSTUNREACH;
1960 goto no_route;
1961 }
1962
1963 if (res->type == RTN_BROADCAST)
1964 goto brd_input;
1965
1966 if (res->type == RTN_LOCAL) {
1967 err = fib_validate_source(skb, saddr, daddr, tos,
1968 0, dev, in_dev, &itag);
1969 if (err < 0)
1970 goto martian_source;
1971 goto local_input;
1972 }
1973
1974 if (!IN_DEV_FORWARD(in_dev)) {
1975 err = -EHOSTUNREACH;
1976 goto no_route;
1977 }
1978 if (res->type != RTN_UNICAST)
1979 goto martian_destination;
1980
1981 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1982 out: return err;
1983
1984 brd_input:
1985 if (skb->protocol != htons(ETH_P_IP))
1986 goto e_inval;
1987
1988 if (!ipv4_is_zeronet(saddr)) {
1989 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1990 in_dev, &itag);
1991 if (err < 0)
1992 goto martian_source;
1993 }
1994 flags |= RTCF_BROADCAST;
1995 res->type = RTN_BROADCAST;
1996 RT_CACHE_STAT_INC(in_brd);
1997
1998 local_input:
1999 do_cache = false;
2000 if (res->fi) {
2001 if (!itag) {
2002 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2003 if (rt_cache_valid(rth)) {
2004 skb_dst_set_noref(skb, &rth->dst);
2005 err = 0;
2006 goto out;
2007 }
2008 do_cache = true;
2009 }
2010 }
2011
2012 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2013 flags | RTCF_LOCAL, res->type,
2014 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2015 if (!rth)
2016 goto e_nobufs;
2017
2018 rth->dst.output = ip_rt_bug;
2019 #ifdef CONFIG_IP_ROUTE_CLASSID
2020 rth->dst.tclassid = itag;
2021 #endif
2022 rth->rt_is_input = 1;
2023 if (res->table)
2024 rth->rt_table_id = res->table->tb_id;
2025
2026 RT_CACHE_STAT_INC(in_slow_tot);
2027 if (res->type == RTN_UNREACHABLE) {
2028 rth->dst.input = ip_error;
2029 rth->dst.error = -err;
2030 rth->rt_flags &= ~RTCF_LOCAL;
2031 }
2032
2033 if (do_cache) {
2034 struct fib_nh *nh = &FIB_RES_NH(*res);
2035
2036 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2037 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2038 WARN_ON(rth->dst.input == lwtunnel_input);
2039 rth->dst.lwtstate->orig_input = rth->dst.input;
2040 rth->dst.input = lwtunnel_input;
2041 }
2042
2043 if (unlikely(!rt_cache_route(nh, rth)))
2044 rt_add_uncached_list(rth);
2045 }
2046 skb_dst_set(skb, &rth->dst);
2047 err = 0;
2048 goto out;
2049
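/* no_route: no FIB entry matched; synthesise an RTN_UNREACHABLE local route
 * so the error is reported through ip_error() on the input path.
 */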
2050 no_route:
2051 RT_CACHE_STAT_INC(in_no_route);
2052 res->type = RTN_UNREACHABLE;
2053 res->fi = NULL;
2054 res->table = NULL;
2055 goto local_input;
2056
2057 /*
2058 * Do not cache martian addresses: they should be logged (RFC1812)
2059 */
2060 martian_destination:
2061 RT_CACHE_STAT_INC(in_martian_dst);
2062 #ifdef CONFIG_IP_ROUTE_VERBOSE
2063 if (IN_DEV_LOG_MARTIANS(in_dev))
2064 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2065 &daddr, &saddr, dev->name);
2066 #endif
2067
2068 e_inval:
2069 err = -EINVAL;
2070 goto out;
2071
2072 e_nobufs:
2073 err = -ENOBUFS;
2074 goto out;
2075
2076 martian_source:
2077 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2078 goto out;
2079 }
2080
2081 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2082 u8 tos, struct net_device *dev)
2083 {
2084 struct fib_result res;
2085 int err;
2086
2087 tos &= IPTOS_RT_MASK;
2088 rcu_read_lock();
2089 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2090 rcu_read_unlock();
2091
2092 return err;
2093 }
2094 EXPORT_SYMBOL(ip_route_input_noref);
2095
2096 /* called with rcu_read_lock held */
2097 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2098 u8 tos, struct net_device *dev, struct fib_result *res)
2099 {
2100 /* Multicast recognition logic was moved from the route cache to here.
2101 The problem was that too many Ethernet cards have broken/missing
2102 hardware multicast filters :-( As a result, a host on a multicast
2103 network acquires a lot of useless route cache entries, e.g. from
2104 SDR messages from all over the world. Now we try to get rid of them.
2105 Really, provided the software IP multicast filter is organized
2106 reasonably (at least, hashed), it does not result in a slowdown
2107 compared with route cache reject entries.
2108 Note that multicast routers are not affected, because a
2109 route cache entry is created eventually.
2110 */
2111 if (ipv4_is_multicast(daddr)) {
2112 struct in_device *in_dev = __in_dev_get_rcu(dev);
2113 int our = 0;
2114 int err = -EINVAL;
2115
2116 if (in_dev)
2117 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2118 ip_hdr(skb)->protocol);
2119
2120 /* check l3 master if no match yet */
2121 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2122 struct in_device *l3_in_dev;
2123
2124 l3_in_dev = __in_dev_get_rcu(skb->dev);
2125 if (l3_in_dev)
2126 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2127 ip_hdr(skb)->protocol);
2128 }
2129
2130 if (our
2131 #ifdef CONFIG_IP_MROUTE
2132 ||
2133 (!ipv4_is_local_multicast(daddr) &&
2134 IN_DEV_MFORWARD(in_dev))
2135 #endif
2136 ) {
2137 err = ip_route_input_mc(skb, daddr, saddr,
2138 tos, dev, our);
2139 }
2140 return err;
2141 }
2142
2143 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2144 }
2145
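/* Build the dst for an output route once the FIB lookup has resolved (or
 * skipped) the nexthop: broadcast/multicast/local cases are flagged here and
 * the per-nexthop and fnhe caches are consulted before a new rtable is
 * allocated.
 */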
2146 /* called with rcu_read_lock() */
2147 static struct rtable *__mkroute_output(const struct fib_result *res,
2148 const struct flowi4 *fl4, int orig_oif,
2149 struct net_device *dev_out,
2150 unsigned int flags)
2151 {
2152 struct fib_info *fi = res->fi;
2153 struct fib_nh_exception *fnhe;
2154 struct in_device *in_dev;
2155 u16 type = res->type;
2156 struct rtable *rth;
2157 bool do_cache;
2158
2159 in_dev = __in_dev_get_rcu(dev_out);
2160 if (!in_dev)
2161 return ERR_PTR(-EINVAL);
2162
2163 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2164 if (ipv4_is_loopback(fl4->saddr) &&
2165 !(dev_out->flags & IFF_LOOPBACK) &&
2166 !netif_is_l3_master(dev_out))
2167 return ERR_PTR(-EINVAL);
2168
2169 if (ipv4_is_lbcast(fl4->daddr))
2170 type = RTN_BROADCAST;
2171 else if (ipv4_is_multicast(fl4->daddr))
2172 type = RTN_MULTICAST;
2173 else if (ipv4_is_zeronet(fl4->daddr))
2174 return ERR_PTR(-EINVAL);
2175
2176 if (dev_out->flags & IFF_LOOPBACK)
2177 flags |= RTCF_LOCAL;
2178
2179 do_cache = true;
2180 if (type == RTN_BROADCAST) {
2181 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2182 fi = NULL;
2183 } else if (type == RTN_MULTICAST) {
2184 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2185 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2186 fl4->flowi4_proto))
2187 flags &= ~RTCF_LOCAL;
2188 else
2189 do_cache = false;
2190 /* If a multicast route does not exist, use the
2191 * default one, but do not route via a gateway in this case.
2192 * Yes, it is a hack.
2193 */
2194 if (fi && res->prefixlen < 4)
2195 fi = NULL;
2196 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2197 (orig_oif != dev_out->ifindex)) {
2198 /* For local routes that require a particular output interface
2199 * we do not want to cache the result. Caching the result
2200 * causes incorrect behaviour when there are multiple source
2201 * addresses on the interface, the end result being that if the
2202 * intended recipient is waiting on that interface for the
2203 * packet, it won't receive it because it will be delivered on
2204 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2205 * be set to the loopback interface as well.
2206 */
2207 fi = NULL;
2208 }
2209
2210 fnhe = NULL;
2211 do_cache &= fi != NULL;
2212 if (do_cache) {
2213 struct rtable __rcu **prth;
2214 struct fib_nh *nh = &FIB_RES_NH(*res);
2215
2216 fnhe = find_exception(nh, fl4->daddr);
2217 if (fnhe) {
2218 prth = &fnhe->fnhe_rth_output;
2219 rth = rcu_dereference(*prth);
2220 if (rth && rth->dst.expires &&
2221 time_after(jiffies, rth->dst.expires)) {
2222 ip_del_fnhe(nh, fl4->daddr);
2223 fnhe = NULL;
2224 } else {
2225 goto rt_cache;
2226 }
2227 }
2228
2229 if (unlikely(fl4->flowi4_flags &
2230 FLOWI_FLAG_KNOWN_NH &&
2231 !(nh->nh_gw &&
2232 nh->nh_scope == RT_SCOPE_LINK))) {
2233 do_cache = false;
2234 goto add;
2235 }
2236 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2237 rth = rcu_dereference(*prth);
2238
2239 rt_cache:
2240 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2241 return rth;
2242 }
2243
2244 add:
2245 rth = rt_dst_alloc(dev_out, flags, type,
2246 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2247 IN_DEV_CONF_GET(in_dev, NOXFRM),
2248 do_cache);
2249 if (!rth)
2250 return ERR_PTR(-ENOBUFS);
2251
2252 rth->rt_iif = orig_oif;
2253 if (res->table)
2254 rth->rt_table_id = res->table->tb_id;
2255
2256 RT_CACHE_STAT_INC(out_slow_tot);
2257
2258 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2259 if (flags & RTCF_LOCAL &&
2260 !(dev_out->flags & IFF_LOOPBACK)) {
2261 rth->dst.output = ip_mc_output;
2262 RT_CACHE_STAT_INC(out_slow_mc);
2263 }
2264 #ifdef CONFIG_IP_MROUTE
2265 if (type == RTN_MULTICAST) {
2266 if (IN_DEV_MFORWARD(in_dev) &&
2267 !ipv4_is_local_multicast(fl4->daddr)) {
2268 rth->dst.input = ip_mr_input;
2269 rth->dst.output = ip_mc_output;
2270 }
2271 }
2272 #endif
2273 }
2274
2275 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2276 set_lwt_redirect(rth);
2277
2278 return rth;
2279 }
2280
2281 /*
2282 * Major route resolver routine.
2283 */
2284
2285 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2286 const struct sk_buff *skb)
2287 {
2288 __u8 tos = RT_FL_TOS(fl4);
2289 struct fib_result res;
2290 struct rtable *rth;
2291
2292 res.tclassid = 0;
2293 res.fi = NULL;
2294 res.table = NULL;
2295
2296 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2297 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2298 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2299 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2300
2301 rcu_read_lock();
2302 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2303 rcu_read_unlock();
2304
2305 return rth;
2306 }
2307 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2308
2309 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2310 struct fib_result *res,
2311 const struct sk_buff *skb)
2312 {
2313 struct net_device *dev_out = NULL;
2314 int orig_oif = fl4->flowi4_oif;
2315 unsigned int flags = 0;
2316 struct rtable *rth;
2317 int err = -ENETUNREACH;
2318
2319 if (fl4->saddr) {
2320 rth = ERR_PTR(-EINVAL);
2321 if (ipv4_is_multicast(fl4->saddr) ||
2322 ipv4_is_lbcast(fl4->saddr) ||
2323 ipv4_is_zeronet(fl4->saddr))
2324 goto out;
2325
2326 /* I removed the check for oif == dev_out->oif here.
2327 It was wrong for two reasons:
2328 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2329 is assigned to multiple interfaces.
2330 2. Moreover, we are allowed to send packets with the saddr
2331 of another iface. --ANK
2332 */
2333
2334 if (fl4->flowi4_oif == 0 &&
2335 (ipv4_is_multicast(fl4->daddr) ||
2336 ipv4_is_lbcast(fl4->daddr))) {
2337 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2338 dev_out = __ip_dev_find(net, fl4->saddr, false);
2339 if (!dev_out)
2340 goto out;
2341
2342 /* Special hack: the user can direct multicasts
2343 and limited broadcast via the necessary interface
2344 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2345 This hack is not just for fun; it allows
2346 vic, vat and friends to work.
2347 They bind the socket to loopback, set the ttl to zero
2348 and expect that it will work.
2349 From the viewpoint of the routing cache they are broken,
2350 because we are not allowed to build a multicast path
2351 with a loopback source addr (look, the routing cache
2352 cannot know that the ttl is zero, so the packet
2353 will not leave this host and the route is valid).
2354 Luckily, this hack is a good workaround.
2355 */
2356
2357 fl4->flowi4_oif = dev_out->ifindex;
2358 goto make_route;
2359 }
2360
2361 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2362 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2363 if (!__ip_dev_find(net, fl4->saddr, false))
2364 goto out;
2365 }
2366 }
2367
2368
2369 if (fl4->flowi4_oif) {
2370 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2371 rth = ERR_PTR(-ENODEV);
2372 if (!dev_out)
2373 goto out;
2374
2375 /* RACE: Check return value of inet_select_addr instead. */
2376 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2377 rth = ERR_PTR(-ENETUNREACH);
2378 goto out;
2379 }
2380 if (ipv4_is_local_multicast(fl4->daddr) ||
2381 ipv4_is_lbcast(fl4->daddr) ||
2382 fl4->flowi4_proto == IPPROTO_IGMP) {
2383 if (!fl4->saddr)
2384 fl4->saddr = inet_select_addr(dev_out, 0,
2385 RT_SCOPE_LINK);
2386 goto make_route;
2387 }
2388 if (!fl4->saddr) {
2389 if (ipv4_is_multicast(fl4->daddr))
2390 fl4->saddr = inet_select_addr(dev_out, 0,
2391 fl4->flowi4_scope);
2392 else if (!fl4->daddr)
2393 fl4->saddr = inet_select_addr(dev_out, 0,
2394 RT_SCOPE_HOST);
2395 }
2396 }
2397
2398 if (!fl4->daddr) {
2399 fl4->daddr = fl4->saddr;
2400 if (!fl4->daddr)
2401 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2402 dev_out = net->loopback_dev;
2403 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2404 res->type = RTN_LOCAL;
2405 flags |= RTCF_LOCAL;
2406 goto make_route;
2407 }
2408
2409 err = fib_lookup(net, fl4, res, 0);
2410 if (err) {
2411 res->fi = NULL;
2412 res->table = NULL;
2413 if (fl4->flowi4_oif &&
2414 (ipv4_is_multicast(fl4->daddr) ||
2415 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2416 /* Apparently, the routing tables are wrong. Assume
2417 that the destination is on link.
2418 
2419 WHY? DW.
2420 Because we are allowed to send to an iface
2421 even if it has NO routes and NO assigned
2422 addresses. When oif is specified, the routing
2423 tables are looked up with only one purpose:
2424 to catch whether the destination is gatewayed rather than
2425 direct. Moreover, if MSG_DONTROUTE is set,
2426 we send the packet, ignoring both routing tables
2427 and ifaddr state. --ANK
2428 
2429 
2430 We could do this even when oif is unknown,
2431 as IPv6 likely does, but we do not.
2432 */
2433
2434 if (fl4->saddr == 0)
2435 fl4->saddr = inet_select_addr(dev_out, 0,
2436 RT_SCOPE_LINK);
2437 res->type = RTN_UNICAST;
2438 goto make_route;
2439 }
2440 rth = ERR_PTR(err);
2441 goto out;
2442 }
2443
2444 if (res->type == RTN_LOCAL) {
2445 if (!fl4->saddr) {
2446 if (res->fi->fib_prefsrc)
2447 fl4->saddr = res->fi->fib_prefsrc;
2448 else
2449 fl4->saddr = fl4->daddr;
2450 }
2451
2452 /* L3 master device is the loopback for that domain */
2453 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2454 net->loopback_dev;
2455
2456 /* make sure orig_oif points to fib result device even
2457 * though packet rx/tx happens over loopback or l3mdev
2458 */
2459 orig_oif = FIB_RES_OIF(*res);
2460
2461 fl4->flowi4_oif = dev_out->ifindex;
2462 flags |= RTCF_LOCAL;
2463 goto make_route;
2464 }
2465
2466 fib_select_path(net, res, fl4, skb);
2467
2468 dev_out = FIB_RES_DEV(*res);
2469 fl4->flowi4_oif = dev_out->ifindex;
2470
2471
2472 make_route:
2473 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2474
2475 out:
2476 return rth;
2477 }
2478
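/* Blackhole dst_ops: every metric/update operation is a no-op and, via
 * ipv4_blackhole_route() below, both dst->input and dst->output are set to
 * discard, so anything routed through such an entry is silently dropped.
 */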
2479 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2480 {
2481 return NULL;
2482 }
2483
2484 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2485 {
2486 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2487
2488 return mtu ? : dst->dev->mtu;
2489 }
2490
2491 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2492 struct sk_buff *skb, u32 mtu)
2493 {
2494 }
2495
2496 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2497 struct sk_buff *skb)
2498 {
2499 }
2500
2501 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2502 unsigned long old)
2503 {
2504 return NULL;
2505 }
2506
2507 static struct dst_ops ipv4_dst_blackhole_ops = {
2508 .family = AF_INET,
2509 .check = ipv4_blackhole_dst_check,
2510 .mtu = ipv4_blackhole_mtu,
2511 .default_advmss = ipv4_default_advmss,
2512 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2513 .redirect = ipv4_rt_blackhole_redirect,
2514 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2515 .neigh_lookup = ipv4_neigh_lookup,
2516 };
2517
2518 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2519 {
2520 struct rtable *ort = (struct rtable *) dst_orig;
2521 struct rtable *rt;
2522
2523 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2524 if (rt) {
2525 struct dst_entry *new = &rt->dst;
2526
2527 new->__use = 1;
2528 new->input = dst_discard;
2529 new->output = dst_discard_out;
2530
2531 new->dev = net->loopback_dev;
2532 if (new->dev)
2533 dev_hold(new->dev);
2534
2535 rt->rt_is_input = ort->rt_is_input;
2536 rt->rt_iif = ort->rt_iif;
2537 rt->rt_pmtu = ort->rt_pmtu;
2538
2539 rt->rt_genid = rt_genid_ipv4(net);
2540 rt->rt_flags = ort->rt_flags;
2541 rt->rt_type = ort->rt_type;
2542 rt->rt_gateway = ort->rt_gateway;
2543 rt->rt_uses_gateway = ort->rt_uses_gateway;
2544
2545 INIT_LIST_HEAD(&rt->rt_uncached);
2546 }
2547
2548 dst_release(dst_orig);
2549
2550 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2551 }
2552
2553 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2554 const struct sock *sk)
2555 {
2556 struct rtable *rt = __ip_route_output_key(net, flp4);
2557
2558 if (IS_ERR(rt))
2559 return rt;
2560
2561 if (flp4->flowi4_proto)
2562 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2563 flowi4_to_flowi(flp4),
2564 sk, 0);
2565
2566 return rt;
2567 }
2568 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2569
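/* Encode the route attached to @skb as an RTM_NEWROUTE netlink message. */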
2570 /* called with rcu_read_lock held */
2571 static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
2572 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2573 u32 seq)
2574 {
2575 struct rtable *rt = skb_rtable(skb);
2576 struct rtmsg *r;
2577 struct nlmsghdr *nlh;
2578 unsigned long expires = 0;
2579 u32 error;
2580 u32 metrics[RTAX_MAX];
2581
2582 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2583 if (!nlh)
2584 return -EMSGSIZE;
2585
2586 r = nlmsg_data(nlh);
2587 r->rtm_family = AF_INET;
2588 r->rtm_dst_len = 32;
2589 r->rtm_src_len = 0;
2590 r->rtm_tos = fl4->flowi4_tos;
2591 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2592 if (nla_put_u32(skb, RTA_TABLE, table_id))
2593 goto nla_put_failure;
2594 r->rtm_type = rt->rt_type;
2595 r->rtm_scope = RT_SCOPE_UNIVERSE;
2596 r->rtm_protocol = RTPROT_UNSPEC;
2597 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2598 if (rt->rt_flags & RTCF_NOTIFY)
2599 r->rtm_flags |= RTM_F_NOTIFY;
2600 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2601 r->rtm_flags |= RTCF_DOREDIRECT;
2602
2603 if (nla_put_in_addr(skb, RTA_DST, dst))
2604 goto nla_put_failure;
2605 if (src) {
2606 r->rtm_src_len = 32;
2607 if (nla_put_in_addr(skb, RTA_SRC, src))
2608 goto nla_put_failure;
2609 }
2610 if (rt->dst.dev &&
2611 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2612 goto nla_put_failure;
2613 #ifdef CONFIG_IP_ROUTE_CLASSID
2614 if (rt->dst.tclassid &&
2615 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2616 goto nla_put_failure;
2617 #endif
2618 if (!rt_is_input_route(rt) &&
2619 fl4->saddr != src) {
2620 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2621 goto nla_put_failure;
2622 }
2623 if (rt->rt_uses_gateway &&
2624 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2625 goto nla_put_failure;
2626
2627 expires = rt->dst.expires;
2628 if (expires) {
2629 unsigned long now = jiffies;
2630
2631 if (time_before(now, expires))
2632 expires -= now;
2633 else
2634 expires = 0;
2635 }
2636
2637 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2638 if (rt->rt_pmtu && expires)
2639 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2640 if (rtnetlink_put_metrics(skb, metrics) < 0)
2641 goto nla_put_failure;
2642
2643 if (fl4->flowi4_mark &&
2644 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2645 goto nla_put_failure;
2646
2647 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2648 nla_put_u32(skb, RTA_UID,
2649 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2650 goto nla_put_failure;
2651
2652 error = rt->dst.error;
2653
2654 if (rt_is_input_route(rt)) {
2655 #ifdef CONFIG_IP_MROUTE
2656 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2657 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2658 int err = ipmr_get_route(net, skb,
2659 fl4->saddr, fl4->daddr,
2660 r, portid);
2661
2662 if (err <= 0) {
2663 if (err == 0)
2664 return 0;
2665 goto nla_put_failure;
2666 }
2667 } else
2668 #endif
2669 if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2670 goto nla_put_failure;
2671 }
2672
2673 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2674 goto nla_put_failure;
2675
2676 nlmsg_end(skb, nlh);
2677 return 0;
2678
2679 nla_put_failure:
2680 nlmsg_cancel(skb, nlh);
2681 return -EMSGSIZE;
2682 }
2683
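/* RTM_GETROUTE handler: resolve a single route for a userspace query (e.g.
 * "ip route get 8.8.8.8") and reply with either the matching FIB entry
 * (RTM_F_FIB_MATCH) or the resulting dst (rt_fill_info).
 */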
2684 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2685 struct netlink_ext_ack *extack)
2686 {
2687 struct net *net = sock_net(in_skb->sk);
2688 struct rtmsg *rtm;
2689 struct nlattr *tb[RTA_MAX+1];
2690 struct fib_result res = {};
2691 struct rtable *rt = NULL;
2692 struct flowi4 fl4;
2693 __be32 dst = 0;
2694 __be32 src = 0;
2695 u32 iif;
2696 int err;
2697 int mark;
2698 struct sk_buff *skb;
2699 u32 table_id = RT_TABLE_MAIN;
2700 kuid_t uid;
2701
2702 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2703 extack);
2704 if (err < 0)
2705 goto errout;
2706
2707 rtm = nlmsg_data(nlh);
2708
2709 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2710 if (!skb) {
2711 err = -ENOBUFS;
2712 goto errout;
2713 }
2714
2715 /* Reserve room for dummy headers; this skb can pass
2716 through a good chunk of the routing engine.
2717 */
2718 skb_reset_mac_header(skb);
2719 skb_reset_network_header(skb);
2720
2721 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2722 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2723 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2724 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2725 if (tb[RTA_UID])
2726 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2727 else
2728 uid = (iif ? INVALID_UID : current_uid());
2729
2730 /* Bugfix: need to give ip_route_input enough of an IP header to
2731 * not gag.
2732 */
2733 ip_hdr(skb)->protocol = IPPROTO_UDP;
2734 ip_hdr(skb)->saddr = src;
2735 ip_hdr(skb)->daddr = dst;
2736
2737 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2738
2739 memset(&fl4, 0, sizeof(fl4));
2740 fl4.daddr = dst;
2741 fl4.saddr = src;
2742 fl4.flowi4_tos = rtm->rtm_tos;
2743 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2744 fl4.flowi4_mark = mark;
2745 fl4.flowi4_uid = uid;
2746
2747 rcu_read_lock();
2748
2749 if (iif) {
2750 struct net_device *dev;
2751
2752 dev = dev_get_by_index_rcu(net, iif);
2753 if (!dev) {
2754 err = -ENODEV;
2755 goto errout_free;
2756 }
2757
2758 skb->protocol = htons(ETH_P_IP);
2759 skb->dev = dev;
2760 skb->mark = mark;
2761 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2762 dev, &res);
2763
2764 rt = skb_rtable(skb);
2765 if (err == 0 && rt->dst.error)
2766 err = -rt->dst.error;
2767 } else {
2768 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2769 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2770 err = 0;
2771 if (IS_ERR(rt))
2772 err = PTR_ERR(rt);
2773 else
2774 skb_dst_set(skb, &rt->dst);
2775 }
2776
2777 if (err)
2778 goto errout_free;
2779
2780 if (rtm->rtm_flags & RTM_F_NOTIFY)
2781 rt->rt_flags |= RTCF_NOTIFY;
2782
2783 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2784 table_id = rt->rt_table_id;
2785
2786 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2787 if (!res.fi) {
2788 err = fib_props[res.type].error;
2789 if (!err)
2790 err = -EHOSTUNREACH;
2791 goto errout_free;
2792 }
2793 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2794 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2795 rt->rt_type, res.prefix, res.prefixlen,
2796 fl4.flowi4_tos, res.fi, 0);
2797 } else {
2798 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2799 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2800 }
2801 if (err < 0)
2802 goto errout_free;
2803
2804 rcu_read_unlock();
2805
2806 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2807 errout:
2808 return err;
2809
2810 errout_free:
2811 rcu_read_unlock();
2812 kfree_skb(skb);
2813 goto errout;
2814 }
2815
2816 void ip_rt_multicast_event(struct in_device *in_dev)
2817 {
2818 rt_cache_flush(dev_net(in_dev->dev));
2819 }
2820
2821 #ifdef CONFIG_SYSCTL
2822 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
2823 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
2824 static int ip_rt_gc_elasticity __read_mostly = 8;
2825
2826 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2827 void __user *buffer,
2828 size_t *lenp, loff_t *ppos)
2829 {
2830 struct net *net = (struct net *)__ctl->extra1;
2831
2832 if (write) {
2833 rt_cache_flush(net);
2834 fnhe_genid_bump(net);
2835 return 0;
2836 }
2837
2838 return -EINVAL;
2839 }
2840
2841 static struct ctl_table ipv4_route_table[] = {
2842 {
2843 .procname = "gc_thresh",
2844 .data = &ipv4_dst_ops.gc_thresh,
2845 .maxlen = sizeof(int),
2846 .mode = 0644,
2847 .proc_handler = proc_dointvec,
2848 },
2849 {
2850 .procname = "max_size",
2851 .data = &ip_rt_max_size,
2852 .maxlen = sizeof(int),
2853 .mode = 0644,
2854 .proc_handler = proc_dointvec,
2855 },
2856 {
2857 /* Deprecated. Use gc_min_interval_ms */
2858
2859 .procname = "gc_min_interval",
2860 .data = &ip_rt_gc_min_interval,
2861 .maxlen = sizeof(int),
2862 .mode = 0644,
2863 .proc_handler = proc_dointvec_jiffies,
2864 },
2865 {
2866 .procname = "gc_min_interval_ms",
2867 .data = &ip_rt_gc_min_interval,
2868 .maxlen = sizeof(int),
2869 .mode = 0644,
2870 .proc_handler = proc_dointvec_ms_jiffies,
2871 },
2872 {
2873 .procname = "gc_timeout",
2874 .data = &ip_rt_gc_timeout,
2875 .maxlen = sizeof(int),
2876 .mode = 0644,
2877 .proc_handler = proc_dointvec_jiffies,
2878 },
2879 {
2880 .procname = "gc_interval",
2881 .data = &ip_rt_gc_interval,
2882 .maxlen = sizeof(int),
2883 .mode = 0644,
2884 .proc_handler = proc_dointvec_jiffies,
2885 },
2886 {
2887 .procname = "redirect_load",
2888 .data = &ip_rt_redirect_load,
2889 .maxlen = sizeof(int),
2890 .mode = 0644,
2891 .proc_handler = proc_dointvec,
2892 },
2893 {
2894 .procname = "redirect_number",
2895 .data = &ip_rt_redirect_number,
2896 .maxlen = sizeof(int),
2897 .mode = 0644,
2898 .proc_handler = proc_dointvec,
2899 },
2900 {
2901 .procname = "redirect_silence",
2902 .data = &ip_rt_redirect_silence,
2903 .maxlen = sizeof(int),
2904 .mode = 0644,
2905 .proc_handler = proc_dointvec,
2906 },
2907 {
2908 .procname = "error_cost",
2909 .data = &ip_rt_error_cost,
2910 .maxlen = sizeof(int),
2911 .mode = 0644,
2912 .proc_handler = proc_dointvec,
2913 },
2914 {
2915 .procname = "error_burst",
2916 .data = &ip_rt_error_burst,
2917 .maxlen = sizeof(int),
2918 .mode = 0644,
2919 .proc_handler = proc_dointvec,
2920 },
2921 {
2922 .procname = "gc_elasticity",
2923 .data = &ip_rt_gc_elasticity,
2924 .maxlen = sizeof(int),
2925 .mode = 0644,
2926 .proc_handler = proc_dointvec,
2927 },
2928 {
2929 .procname = "mtu_expires",
2930 .data = &ip_rt_mtu_expires,
2931 .maxlen = sizeof(int),
2932 .mode = 0644,
2933 .proc_handler = proc_dointvec_jiffies,
2934 },
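/* proc_dointvec_minmax with extra1 = ip_min_valid_pmtu rejects writes below
 * the 68-byte minimum IPv4 MTU, e.g.
 *   # sysctl -w net.ipv4.route.min_pmtu=60   -> "Invalid argument"
 */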
2935 {
2936 .procname = "min_pmtu",
2937 .data = &ip_rt_min_pmtu,
2938 .maxlen = sizeof(int),
2939 .mode = 0644,
2940 .proc_handler = proc_dointvec_minmax,
2941 .extra1 = &ip_min_valid_pmtu,
2942 },
2943 {
2944 .procname = "min_adv_mss",
2945 .data = &ip_rt_min_advmss,
2946 .maxlen = sizeof(int),
2947 .mode = 0644,
2948 .proc_handler = proc_dointvec,
2949 },
2950 { }
2951 };
2952
2953 static struct ctl_table ipv4_route_flush_table[] = {
2954 {
2955 .procname = "flush",
2956 .maxlen = sizeof(int),
2957 .mode = 0200,
2958 .proc_handler = ipv4_sysctl_rtcache_flush,
2959 },
2960 { },
2961 };
2962
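/* Register the per-netns "flush" sysctl. Every netns other than init_net gets
 * its own copy of the table so that tbl[0].extra1 can point back at that netns.
 */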
2963 static __net_init int sysctl_route_net_init(struct net *net)
2964 {
2965 struct ctl_table *tbl;
2966
2967 tbl = ipv4_route_flush_table;
2968 if (!net_eq(net, &init_net)) {
2969 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2970 if (!tbl)
2971 goto err_dup;
2972
2973 /* Don't export sysctls to unprivileged users */
2974 if (net->user_ns != &init_user_ns)
2975 tbl[0].procname = NULL;
2976 }
2977 tbl[0].extra1 = net;
2978
2979 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2980 if (!net->ipv4.route_hdr)
2981 goto err_reg;
2982 return 0;
2983
2984 err_reg:
2985 if (tbl != ipv4_route_flush_table)
2986 kfree(tbl);
2987 err_dup:
2988 return -ENOMEM;
2989 }
2990
2991 static __net_exit void sysctl_route_net_exit(struct net *net)
2992 {
2993 struct ctl_table *tbl;
2994
2995 tbl = net->ipv4.route_hdr->ctl_table_arg;
2996 unregister_net_sysctl_table(net->ipv4.route_hdr);
2997 BUG_ON(tbl == ipv4_route_flush_table);
2998 kfree(tbl);
2999 }
3000
3001 static __net_initdata struct pernet_operations sysctl_route_ops = {
3002 .init = sysctl_route_net_init,
3003 .exit = sysctl_route_net_exit,
3004 };
3005 #endif
3006
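/* Initialise the per-netns generation counters; bumping rt_genid or fnhe_genid
 * later invalidates that netns' cached routes and nexthop exceptions.
 */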
3007 static __net_init int rt_genid_init(struct net *net)
3008 {
3009 atomic_set(&net->ipv4.rt_genid, 0);
3010 atomic_set(&net->fnhe_genid, 0);
3011 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3012 return 0;
3013 }
3014
3015 static __net_initdata struct pernet_operations rt_genid_ops = {
3016 .init = rt_genid_init,
3017 };
3018
3019 static int __net_init ipv4_inetpeer_init(struct net *net)
3020 {
3021 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3022
3023 if (!bp)
3024 return -ENOMEM;
3025 inet_peer_base_init(bp);
3026 net->ipv4.peers = bp;
3027 return 0;
3028 }
3029
3030 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3031 {
3032 struct inet_peer_base *bp = net->ipv4.peers;
3033
3034 net->ipv4.peers = NULL;
3035 inetpeer_invalidate_tree(bp);
3036 kfree(bp);
3037 }
3038
3039 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3040 .init = ipv4_inetpeer_init,
3041 .exit = ipv4_inetpeer_exit,
3042 };
3043
3044 #ifdef CONFIG_IP_ROUTE_CLASSID
3045 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3046 #endif /* CONFIG_IP_ROUTE_CLASSID */
3047
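/* Boot-time setup: allocate the IP-ID arrays and per-cpu uncached-route lists,
 * create the dst slab caches, then register proc, netlink (RTM_GETROUTE) and
 * per-netns hooks.
 */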
3048 int __init ip_rt_init(void)
3049 {
3050 int cpu;
3051
3052 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3053 if (!ip_idents)
3054 panic("IP: failed to allocate ip_idents\n");
3055
3056 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3057
3058 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3059 if (!ip_tstamps)
3060 panic("IP: failed to allocate ip_tstamps\n");
3061
3062 for_each_possible_cpu(cpu) {
3063 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3064
3065 INIT_LIST_HEAD(&ul->head);
3066 spin_lock_init(&ul->lock);
3067 }
3068 #ifdef CONFIG_IP_ROUTE_CLASSID
3069 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3070 if (!ip_rt_acct)
3071 panic("IP: failed to allocate ip_rt_acct\n");
3072 #endif
3073
3074 ipv4_dst_ops.kmem_cachep =
3075 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3076 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3077
3078 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3079
3080 if (dst_entries_init(&ipv4_dst_ops) < 0)
3081 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3082
3083 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3084 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3085
3086 ipv4_dst_ops.gc_thresh = ~0;
3087 ip_rt_max_size = INT_MAX;
3088
3089 devinet_init();
3090 ip_fib_init();
3091
3092 if (ip_rt_proc_init())
3093 pr_err("Unable to create route proc files\n");
3094 #ifdef CONFIG_XFRM
3095 xfrm_init();
3096 xfrm4_init();
3097 #endif
3098 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3099 RTNL_FLAG_DOIT_UNLOCKED);
3100
3101 #ifdef CONFIG_SYSCTL
3102 register_pernet_subsys(&sysctl_route_ops);
3103 #endif
3104 register_pernet_subsys(&rt_genid_ops);
3105 register_pernet_subsys(&ipv4_inetpeer_ops);
3106 return 0;
3107 }
3108
3109 #ifdef CONFIG_SYSCTL
3110 /*
3111 * We really need to sanitize the damn ipv4 init order, then all
3112 * this nonsense will go away.
3113 */
3114 void __init ip_static_sysctl_init(void)
3115 {
3116 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3117 }
3118 #endif