]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blob - net/ipv4/route.c
net: ipv4: don't let PMTU updates increase route MTU
[mirror_ubuntu-bionic-kernel.git] / net / ipv4 / route.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
39 *
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <linux/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #include <linux/kmemleak.h>
112 #endif
113 #include <net/secure_seq.h>
114 #include <net/ip_tunnels.h>
115 #include <net/l3mdev.h>
116
117 #include "fib_lookup.h"
118
119 #define RT_FL_TOS(oldflp4) \
120 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
121
122 #define RT_GC_TIMEOUT (300*HZ)
123
124 static int ip_rt_max_size;
125 static int ip_rt_redirect_number __read_mostly = 9;
126 static int ip_rt_redirect_load __read_mostly = HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly = HZ;
129 static int ip_rt_error_burst __read_mostly = 5 * HZ;
130 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
131 static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly = 256;
133
134 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
135
136 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
137
138 /*
139 * Interface to generic destination cache.
140 */
141
142 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
143 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
144 static unsigned int ipv4_mtu(const struct dst_entry *dst);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void ipv4_link_failure(struct sk_buff *skb);
147 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
148 struct sk_buff *skb, u32 mtu);
149 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
150 struct sk_buff *skb);
151 static void ipv4_dst_destroy(struct dst_entry *dst);
152
153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154 {
155 WARN_ON(1);
156 return NULL;
157 }
158
/* Neighbour hooks of ipv4_dst_ops; both are defined further down. */
static struct neighbour *
ipv4_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb,
		  const void *daddr);
static void
ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
163
164 static struct dst_ops ipv4_dst_ops = {
165 .family = AF_INET,
166 .check = ipv4_dst_check,
167 .default_advmss = ipv4_default_advmss,
168 .mtu = ipv4_mtu,
169 .cow_metrics = ipv4_cow_metrics,
170 .destroy = ipv4_dst_destroy,
171 .negative_advice = ipv4_negative_advice,
172 .link_failure = ipv4_link_failure,
173 .update_pmtu = ip_rt_update_pmtu,
174 .redirect = ip_do_redirect,
175 .local_out = __ip_local_out,
176 .neigh_lookup = ipv4_neigh_lookup,
177 .confirm_neigh = ipv4_confirm_neigh,
178 };
179
180 #define ECN_OR_COST(class) TC_PRIO_##class
181
182 const __u8 ip_tos2prio[16] = {
183 TC_PRIO_BESTEFFORT,
184 ECN_OR_COST(BESTEFFORT),
185 TC_PRIO_BESTEFFORT,
186 ECN_OR_COST(BESTEFFORT),
187 TC_PRIO_BULK,
188 ECN_OR_COST(BULK),
189 TC_PRIO_BULK,
190 ECN_OR_COST(BULK),
191 TC_PRIO_INTERACTIVE,
192 ECN_OR_COST(INTERACTIVE),
193 TC_PRIO_INTERACTIVE,
194 ECN_OR_COST(INTERACTIVE),
195 TC_PRIO_INTERACTIVE_BULK,
196 ECN_OR_COST(INTERACTIVE_BULK),
197 TC_PRIO_INTERACTIVE_BULK,
198 ECN_OR_COST(INTERACTIVE_BULK)
199 };
200 EXPORT_SYMBOL(ip_tos2prio);
201
202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
203 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
204
205 #ifdef CONFIG_PROC_FS
206 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
207 {
208 if (*pos)
209 return NULL;
210 return SEQ_START_TOKEN;
211 }
212
213 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
214 {
215 ++*pos;
216 return NULL;
217 }
218
/* seq_file stop hook for "rt_cache": start/next take nothing to undo. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	/* intentionally empty */
}
222
223 static int rt_cache_seq_show(struct seq_file *seq, void *v)
224 {
225 if (v == SEQ_START_TOKEN)
226 seq_printf(seq, "%-127s\n",
227 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
228 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
229 "HHUptod\tSpecDst");
230 return 0;
231 }
232
233 static const struct seq_operations rt_cache_seq_ops = {
234 .start = rt_cache_seq_start,
235 .next = rt_cache_seq_next,
236 .stop = rt_cache_seq_stop,
237 .show = rt_cache_seq_show,
238 };
239
240 static int rt_cache_seq_open(struct inode *inode, struct file *file)
241 {
242 return seq_open(file, &rt_cache_seq_ops);
243 }
244
245 static const struct file_operations rt_cache_seq_fops = {
246 .owner = THIS_MODULE,
247 .open = rt_cache_seq_open,
248 .read = seq_read,
249 .llseek = seq_lseek,
250 .release = seq_release,
251 };
252
253
254 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
255 {
256 int cpu;
257
258 if (*pos == 0)
259 return SEQ_START_TOKEN;
260
261 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
262 if (!cpu_possible(cpu))
263 continue;
264 *pos = cpu+1;
265 return &per_cpu(rt_cache_stat, cpu);
266 }
267 return NULL;
268 }
269
270 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
271 {
272 int cpu;
273
274 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
275 if (!cpu_possible(cpu))
276 continue;
277 *pos = cpu+1;
278 return &per_cpu(rt_cache_stat, cpu);
279 }
280 return NULL;
281
282 }
283
/* seq_file stop hook for the per-CPU statistics file: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
	/* intentionally empty */
}
288
289 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
290 {
291 struct rt_cache_stat *st = v;
292
293 if (v == SEQ_START_TOKEN) {
294 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
295 return 0;
296 }
297
298 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
299 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
300 dst_entries_get_slow(&ipv4_dst_ops),
301 0, /* st->in_hit */
302 st->in_slow_tot,
303 st->in_slow_mc,
304 st->in_no_route,
305 st->in_brd,
306 st->in_martian_dst,
307 st->in_martian_src,
308
309 0, /* st->out_hit */
310 st->out_slow_tot,
311 st->out_slow_mc,
312
313 0, /* st->gc_total */
314 0, /* st->gc_ignored */
315 0, /* st->gc_goal_miss */
316 0, /* st->gc_dst_overflow */
317 0, /* st->in_hlist_search */
318 0 /* st->out_hlist_search */
319 );
320 return 0;
321 }
322
323 static const struct seq_operations rt_cpu_seq_ops = {
324 .start = rt_cpu_seq_start,
325 .next = rt_cpu_seq_next,
326 .stop = rt_cpu_seq_stop,
327 .show = rt_cpu_seq_show,
328 };
329
330
331 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
332 {
333 return seq_open(file, &rt_cpu_seq_ops);
334 }
335
336 static const struct file_operations rt_cpu_seq_fops = {
337 .owner = THIS_MODULE,
338 .open = rt_cpu_seq_open,
339 .read = seq_read,
340 .llseek = seq_lseek,
341 .release = seq_release,
342 };
343
344 #ifdef CONFIG_IP_ROUTE_CLASSID
345 static int rt_acct_proc_show(struct seq_file *m, void *v)
346 {
347 struct ip_rt_acct *dst, *src;
348 unsigned int i, j;
349
350 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
351 if (!dst)
352 return -ENOMEM;
353
354 for_each_possible_cpu(i) {
355 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
356 for (j = 0; j < 256; j++) {
357 dst[j].o_bytes += src[j].o_bytes;
358 dst[j].o_packets += src[j].o_packets;
359 dst[j].i_bytes += src[j].i_bytes;
360 dst[j].i_packets += src[j].i_packets;
361 }
362 }
363
364 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
365 kfree(dst);
366 return 0;
367 }
368
369 static int rt_acct_proc_open(struct inode *inode, struct file *file)
370 {
371 return single_open(file, rt_acct_proc_show, NULL);
372 }
373
374 static const struct file_operations rt_acct_proc_fops = {
375 .owner = THIS_MODULE,
376 .open = rt_acct_proc_open,
377 .read = seq_read,
378 .llseek = seq_lseek,
379 .release = single_release,
380 };
381 #endif
382
383 static int __net_init ip_rt_do_proc_init(struct net *net)
384 {
385 struct proc_dir_entry *pde;
386
387 pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
388 &rt_cache_seq_fops);
389 if (!pde)
390 goto err1;
391
392 pde = proc_create("rt_cache", S_IRUGO,
393 net->proc_net_stat, &rt_cpu_seq_fops);
394 if (!pde)
395 goto err2;
396
397 #ifdef CONFIG_IP_ROUTE_CLASSID
398 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
399 if (!pde)
400 goto err3;
401 #endif
402 return 0;
403
404 #ifdef CONFIG_IP_ROUTE_CLASSID
405 err3:
406 remove_proc_entry("rt_cache", net->proc_net_stat);
407 #endif
408 err2:
409 remove_proc_entry("rt_cache", net->proc_net);
410 err1:
411 return -ENOMEM;
412 }
413
414 static void __net_exit ip_rt_do_proc_exit(struct net *net)
415 {
416 remove_proc_entry("rt_cache", net->proc_net_stat);
417 remove_proc_entry("rt_cache", net->proc_net);
418 #ifdef CONFIG_IP_ROUTE_CLASSID
419 remove_proc_entry("rt_acct", net->proc_net);
420 #endif
421 }
422
423 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
424 .init = ip_rt_do_proc_init,
425 .exit = ip_rt_do_proc_exit,
426 };
427
428 static int __init ip_rt_proc_init(void)
429 {
430 return register_pernet_subsys(&ip_rt_proc_ops);
431 }
432
433 #else
/* Variant used without CONFIG_PROC_FS: nothing to register, report success. */
static inline int ip_rt_proc_init(void)
{
	const int ok = 0;

	return ok;
}
438 #endif /* CONFIG_PROC_FS */
439
440 static inline bool rt_is_expired(const struct rtable *rth)
441 {
442 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
443 }
444
/*
 * Bump the IPv4 generation id of @net. Routes carrying an older id are
 * then reported as expired by rt_is_expired().
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
449
450 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
451 struct sk_buff *skb,
452 const void *daddr)
453 {
454 struct net_device *dev = dst->dev;
455 const __be32 *pkey = daddr;
456 const struct rtable *rt;
457 struct neighbour *n;
458
459 rt = (const struct rtable *) dst;
460 if (rt->rt_gateway)
461 pkey = (const __be32 *) &rt->rt_gateway;
462 else if (skb)
463 pkey = &ip_hdr(skb)->daddr;
464
465 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
466 if (n)
467 return n;
468 return neigh_create(&arp_tbl, pkey, dev);
469 }
470
471 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
472 {
473 struct net_device *dev = dst->dev;
474 const __be32 *pkey = daddr;
475 const struct rtable *rt;
476
477 rt = (const struct rtable *)dst;
478 if (rt->rt_gateway)
479 pkey = (const __be32 *)&rt->rt_gateway;
480 else if (!daddr ||
481 (rt->rt_flags &
482 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
483 return;
484
485 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
486 }
487
488 #define IP_IDENTS_SZ 2048u
489
490 static atomic_t *ip_idents __read_mostly;
491 static u32 *ip_tstamps __read_mostly;
492
493 /* In order to protect privacy, we add a perturbation to identifiers
494 * if one generator is seldom used. This makes hard for an attacker
495 * to infer how many packets were sent between two points in time.
496 */
497 u32 ip_idents_reserve(u32 hash, int segs)
498 {
499 u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
500 atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
501 u32 old = READ_ONCE(*p_tstamp);
502 u32 now = (u32)jiffies;
503 u32 new, delta = 0;
504
505 if (old != now && cmpxchg(p_tstamp, old, now) == old)
506 delta = prandom_u32_max(now - old);
507
508 /* Do not use atomic_add_return() as it makes UBSAN unhappy */
509 do {
510 old = (u32)atomic_read(p_id);
511 new = old + delta + segs;
512 } while (atomic_cmpxchg(p_id, old, new) != old);
513
514 return new - segs;
515 }
516 EXPORT_SYMBOL(ip_idents_reserve);
517
518 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
519 {
520 static u32 ip_idents_hashrnd __read_mostly;
521 u32 hash, id;
522
523 net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
524
525 hash = jhash_3words((__force u32)iph->daddr,
526 (__force u32)iph->saddr,
527 iph->protocol ^ net_hash_mix(net),
528 ip_idents_hashrnd);
529 id = ip_idents_reserve(hash, segs);
530 iph->id = htons(id);
531 }
532 EXPORT_SYMBOL(__ip_select_ident);
533
534 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
535 const struct sock *sk,
536 const struct iphdr *iph,
537 int oif, u8 tos,
538 u8 prot, u32 mark, int flow_flags)
539 {
540 if (sk) {
541 const struct inet_sock *inet = inet_sk(sk);
542
543 oif = sk->sk_bound_dev_if;
544 mark = sk->sk_mark;
545 tos = RT_CONN_FLAGS(sk);
546 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
547 }
548 flowi4_init_output(fl4, oif, mark, tos,
549 RT_SCOPE_UNIVERSE, prot,
550 flow_flags,
551 iph->daddr, iph->saddr, 0, 0,
552 sock_net_uid(net, sk));
553 }
554
555 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
556 const struct sock *sk)
557 {
558 const struct net *net = dev_net(skb->dev);
559 const struct iphdr *iph = ip_hdr(skb);
560 int oif = skb->dev->ifindex;
561 u8 tos = RT_TOS(iph->tos);
562 u8 prot = iph->protocol;
563 u32 mark = skb->mark;
564
565 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
566 }
567
568 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
569 {
570 const struct inet_sock *inet = inet_sk(sk);
571 const struct ip_options_rcu *inet_opt;
572 __be32 daddr = inet->inet_daddr;
573
574 rcu_read_lock();
575 inet_opt = rcu_dereference(inet->inet_opt);
576 if (inet_opt && inet_opt->opt.srr)
577 daddr = inet_opt->opt.faddr;
578 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
579 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
580 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
581 inet_sk_flowi_flags(sk),
582 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
583 rcu_read_unlock();
584 }
585
/* Build @fl4 from @skb when one is given, otherwise from @sk alone. */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}

	build_skb_flow_key(fl4, skb, sk);
}
594
595 static DEFINE_SPINLOCK(fnhe_lock);
596
597 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
598 {
599 struct rtable *rt;
600
601 rt = rcu_dereference(fnhe->fnhe_rth_input);
602 if (rt) {
603 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
604 dst_dev_put(&rt->dst);
605 dst_release(&rt->dst);
606 }
607 rt = rcu_dereference(fnhe->fnhe_rth_output);
608 if (rt) {
609 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
610 dst_dev_put(&rt->dst);
611 dst_release(&rt->dst);
612 }
613 }
614
615 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
616 {
617 struct fib_nh_exception *fnhe, *oldest;
618
619 oldest = rcu_dereference(hash->chain);
620 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
621 fnhe = rcu_dereference(fnhe->fnhe_next)) {
622 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
623 oldest = fnhe;
624 }
625 fnhe_flush_routes(oldest);
626 return oldest;
627 }
628
629 static inline u32 fnhe_hashfun(__be32 daddr)
630 {
631 static u32 fnhe_hashrnd __read_mostly;
632 u32 hval;
633
634 net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
635 hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
636 return hash_32(hval, FNHE_HASH_SHIFT);
637 }
638
639 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
640 {
641 rt->rt_pmtu = fnhe->fnhe_pmtu;
642 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
643 rt->dst.expires = fnhe->fnhe_expires;
644
645 if (fnhe->fnhe_gw) {
646 rt->rt_flags |= RTCF_REDIRECTED;
647 rt->rt_gateway = fnhe->fnhe_gw;
648 rt->rt_uses_gateway = 1;
649 }
650 }
651
652 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
653 u32 pmtu, bool lock, unsigned long expires)
654 {
655 struct fnhe_hash_bucket *hash;
656 struct fib_nh_exception *fnhe;
657 struct rtable *rt;
658 u32 genid, hval;
659 unsigned int i;
660 int depth;
661
662 genid = fnhe_genid(dev_net(nh->nh_dev));
663 hval = fnhe_hashfun(daddr);
664
665 spin_lock_bh(&fnhe_lock);
666
667 hash = rcu_dereference(nh->nh_exceptions);
668 if (!hash) {
669 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
670 if (!hash)
671 goto out_unlock;
672 rcu_assign_pointer(nh->nh_exceptions, hash);
673 }
674
675 hash += hval;
676
677 depth = 0;
678 for (fnhe = rcu_dereference(hash->chain); fnhe;
679 fnhe = rcu_dereference(fnhe->fnhe_next)) {
680 if (fnhe->fnhe_daddr == daddr)
681 break;
682 depth++;
683 }
684
685 if (fnhe) {
686 if (fnhe->fnhe_genid != genid)
687 fnhe->fnhe_genid = genid;
688 if (gw)
689 fnhe->fnhe_gw = gw;
690 if (pmtu) {
691 fnhe->fnhe_pmtu = pmtu;
692 fnhe->fnhe_mtu_locked = lock;
693 }
694 fnhe->fnhe_expires = max(1UL, expires);
695 /* Update all cached dsts too */
696 rt = rcu_dereference(fnhe->fnhe_rth_input);
697 if (rt)
698 fill_route_from_fnhe(rt, fnhe);
699 rt = rcu_dereference(fnhe->fnhe_rth_output);
700 if (rt)
701 fill_route_from_fnhe(rt, fnhe);
702 } else {
703 if (depth > FNHE_RECLAIM_DEPTH)
704 fnhe = fnhe_oldest(hash);
705 else {
706 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
707 if (!fnhe)
708 goto out_unlock;
709
710 fnhe->fnhe_next = hash->chain;
711 rcu_assign_pointer(hash->chain, fnhe);
712 }
713 fnhe->fnhe_genid = genid;
714 fnhe->fnhe_daddr = daddr;
715 fnhe->fnhe_gw = gw;
716 fnhe->fnhe_pmtu = pmtu;
717 fnhe->fnhe_mtu_locked = lock;
718 fnhe->fnhe_expires = max(1UL, expires);
719
720 /* Exception created; mark the cached routes for the nexthop
721 * stale, so anyone caching it rechecks if this exception
722 * applies to them.
723 */
724 rt = rcu_dereference(nh->nh_rth_input);
725 if (rt)
726 rt->dst.obsolete = DST_OBSOLETE_KILL;
727
728 for_each_possible_cpu(i) {
729 struct rtable __rcu **prt;
730 prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
731 rt = rcu_dereference(*prt);
732 if (rt)
733 rt->dst.obsolete = DST_OBSOLETE_KILL;
734 }
735 }
736
737 fnhe->fnhe_stamp = jiffies;
738
739 out_unlock:
740 spin_unlock_bh(&fnhe_lock);
741 }
742
743 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
744 bool kill_route)
745 {
746 __be32 new_gw = icmp_hdr(skb)->un.gateway;
747 __be32 old_gw = ip_hdr(skb)->saddr;
748 struct net_device *dev = skb->dev;
749 struct in_device *in_dev;
750 struct fib_result res;
751 struct neighbour *n;
752 struct net *net;
753
754 switch (icmp_hdr(skb)->code & 7) {
755 case ICMP_REDIR_NET:
756 case ICMP_REDIR_NETTOS:
757 case ICMP_REDIR_HOST:
758 case ICMP_REDIR_HOSTTOS:
759 break;
760
761 default:
762 return;
763 }
764
765 if (rt->rt_gateway != old_gw)
766 return;
767
768 in_dev = __in_dev_get_rcu(dev);
769 if (!in_dev)
770 return;
771
772 net = dev_net(dev);
773 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
774 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
775 ipv4_is_zeronet(new_gw))
776 goto reject_redirect;
777
778 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
779 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
780 goto reject_redirect;
781 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
782 goto reject_redirect;
783 } else {
784 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
785 goto reject_redirect;
786 }
787
788 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
789 if (!n)
790 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
791 if (!IS_ERR(n)) {
792 if (!(n->nud_state & NUD_VALID)) {
793 neigh_event_send(n, NULL);
794 } else {
795 if (fib_lookup(net, fl4, &res, 0) == 0) {
796 struct fib_nh *nh = &FIB_RES_NH(res);
797
798 update_or_create_fnhe(nh, fl4->daddr, new_gw,
799 0, false,
800 jiffies + ip_rt_gc_timeout);
801 }
802 if (kill_route)
803 rt->dst.obsolete = DST_OBSOLETE_KILL;
804 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
805 }
806 neigh_release(n);
807 }
808 return;
809
810 reject_redirect:
811 #ifdef CONFIG_IP_ROUTE_VERBOSE
812 if (IN_DEV_LOG_MARTIANS(in_dev)) {
813 const struct iphdr *iph = (const struct iphdr *) skb->data;
814 __be32 daddr = iph->daddr;
815 __be32 saddr = iph->saddr;
816
817 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
818 " Advised path = %pI4 -> %pI4\n",
819 &old_gw, dev->name, &new_gw,
820 &saddr, &daddr);
821 }
822 #endif
823 ;
824 }
825
826 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
827 {
828 struct rtable *rt;
829 struct flowi4 fl4;
830 const struct iphdr *iph = (const struct iphdr *) skb->data;
831 struct net *net = dev_net(skb->dev);
832 int oif = skb->dev->ifindex;
833 u8 tos = RT_TOS(iph->tos);
834 u8 prot = iph->protocol;
835 u32 mark = skb->mark;
836
837 rt = (struct rtable *) dst;
838
839 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
840 __ip_do_redirect(rt, skb, &fl4, true);
841 }
842
843 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
844 {
845 struct rtable *rt = (struct rtable *)dst;
846 struct dst_entry *ret = dst;
847
848 if (rt) {
849 if (dst->obsolete > 0) {
850 ip_rt_put(rt);
851 ret = NULL;
852 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
853 rt->dst.expires) {
854 ip_rt_put(rt);
855 ret = NULL;
856 }
857 }
858 return ret;
859 }
860
861 /*
862 * Algorithm:
863 * 1. The first ip_rt_redirect_number redirects are sent
864 * with exponential backoff, then we stop sending them at all,
865 * assuming that the host ignores our redirects.
866 * 2. If we did not see packets requiring redirects
867 * during ip_rt_redirect_silence, we assume that the host
868 * forgot redirected route and start to send redirects again.
869 *
870 * This algorithm is much cheaper and more intelligent than dumb load limiting
871 * in icmp.c.
872 *
873 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
874 * and "frag. need" (breaks PMTU discovery) in icmp.c.
875 */
876
877 void ip_rt_send_redirect(struct sk_buff *skb)
878 {
879 struct rtable *rt = skb_rtable(skb);
880 struct in_device *in_dev;
881 struct inet_peer *peer;
882 struct net *net;
883 int log_martians;
884 int vif;
885
886 rcu_read_lock();
887 in_dev = __in_dev_get_rcu(rt->dst.dev);
888 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
889 rcu_read_unlock();
890 return;
891 }
892 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
893 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
894 rcu_read_unlock();
895
896 net = dev_net(rt->dst.dev);
897 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
898 if (!peer) {
899 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
900 rt_nexthop(rt, ip_hdr(skb)->daddr));
901 return;
902 }
903
904 /* No redirected packets during ip_rt_redirect_silence;
905 * reset the algorithm.
906 */
907 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
908 peer->rate_tokens = 0;
909
910 /* Too many ignored redirects; do not send anything
911 * set dst.rate_last to the last seen redirected packet.
912 */
913 if (peer->rate_tokens >= ip_rt_redirect_number) {
914 peer->rate_last = jiffies;
915 goto out_put_peer;
916 }
917
918 /* Check for load limit; set rate_last to the latest sent
919 * redirect.
920 */
921 if (peer->rate_tokens == 0 ||
922 time_after(jiffies,
923 (peer->rate_last +
924 (ip_rt_redirect_load << peer->rate_tokens)))) {
925 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
926
927 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
928 peer->rate_last = jiffies;
929 ++peer->rate_tokens;
930 #ifdef CONFIG_IP_ROUTE_VERBOSE
931 if (log_martians &&
932 peer->rate_tokens == ip_rt_redirect_number)
933 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
934 &ip_hdr(skb)->saddr, inet_iif(skb),
935 &ip_hdr(skb)->daddr, &gw);
936 #endif
937 }
938 out_put_peer:
939 inet_putpeer(peer);
940 }
941
942 static int ip_error(struct sk_buff *skb)
943 {
944 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
945 struct rtable *rt = skb_rtable(skb);
946 struct inet_peer *peer;
947 unsigned long now;
948 struct net *net;
949 bool send;
950 int code;
951
952 /* IP on this device is disabled. */
953 if (!in_dev)
954 goto out;
955
956 net = dev_net(rt->dst.dev);
957 if (!IN_DEV_FORWARD(in_dev)) {
958 switch (rt->dst.error) {
959 case EHOSTUNREACH:
960 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
961 break;
962
963 case ENETUNREACH:
964 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
965 break;
966 }
967 goto out;
968 }
969
970 switch (rt->dst.error) {
971 case EINVAL:
972 default:
973 goto out;
974 case EHOSTUNREACH:
975 code = ICMP_HOST_UNREACH;
976 break;
977 case ENETUNREACH:
978 code = ICMP_NET_UNREACH;
979 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
980 break;
981 case EACCES:
982 code = ICMP_PKT_FILTERED;
983 break;
984 }
985
986 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
987 l3mdev_master_ifindex(skb->dev), 1);
988
989 send = true;
990 if (peer) {
991 now = jiffies;
992 peer->rate_tokens += now - peer->rate_last;
993 if (peer->rate_tokens > ip_rt_error_burst)
994 peer->rate_tokens = ip_rt_error_burst;
995 peer->rate_last = now;
996 if (peer->rate_tokens >= ip_rt_error_cost)
997 peer->rate_tokens -= ip_rt_error_cost;
998 else
999 send = false;
1000 inet_putpeer(peer);
1001 }
1002 if (send)
1003 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1004
1005 out: kfree_skb(skb);
1006 return 0;
1007 }
1008
1009 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1010 {
1011 struct dst_entry *dst = &rt->dst;
1012 u32 old_mtu = ipv4_mtu(dst);
1013 struct fib_result res;
1014 bool lock = false;
1015
1016 if (ip_mtu_locked(dst))
1017 return;
1018
1019 if (old_mtu < mtu)
1020 return;
1021
1022 if (mtu < ip_rt_min_pmtu) {
1023 lock = true;
1024 mtu = min(old_mtu, ip_rt_min_pmtu);
1025 }
1026
1027 if (rt->rt_pmtu == mtu && !lock &&
1028 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1029 return;
1030
1031 rcu_read_lock();
1032 if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1033 struct fib_nh *nh = &FIB_RES_NH(res);
1034
1035 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1036 jiffies + ip_rt_mtu_expires);
1037 }
1038 rcu_read_unlock();
1039 }
1040
1041 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1042 struct sk_buff *skb, u32 mtu)
1043 {
1044 struct rtable *rt = (struct rtable *) dst;
1045 struct flowi4 fl4;
1046
1047 ip_rt_build_flow_key(&fl4, sk, skb);
1048 __ip_rt_update_pmtu(rt, &fl4, mtu);
1049 }
1050
1051 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1052 int oif, u32 mark, u8 protocol, int flow_flags)
1053 {
1054 const struct iphdr *iph = (const struct iphdr *) skb->data;
1055 struct flowi4 fl4;
1056 struct rtable *rt;
1057
1058 if (!mark)
1059 mark = IP4_REPLY_MARK(net, skb->mark);
1060
1061 __build_flow_key(net, &fl4, NULL, iph, oif,
1062 RT_TOS(iph->tos), protocol, mark, flow_flags);
1063 rt = __ip_route_output_key(net, &fl4);
1064 if (!IS_ERR(rt)) {
1065 __ip_rt_update_pmtu(rt, &fl4, mtu);
1066 ip_rt_put(rt);
1067 }
1068 }
1069 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1070
1071 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1072 {
1073 const struct iphdr *iph = (const struct iphdr *) skb->data;
1074 struct flowi4 fl4;
1075 struct rtable *rt;
1076
1077 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1078
1079 if (!fl4.flowi4_mark)
1080 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1081
1082 rt = __ip_route_output_key(sock_net(sk), &fl4);
1083 if (!IS_ERR(rt)) {
1084 __ip_rt_update_pmtu(rt, &fl4, mtu);
1085 ip_rt_put(rt);
1086 }
1087 }
1088
1089 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1090 {
1091 const struct iphdr *iph = (const struct iphdr *) skb->data;
1092 struct flowi4 fl4;
1093 struct rtable *rt;
1094 struct dst_entry *odst = NULL;
1095 bool new = false;
1096 struct net *net = sock_net(sk);
1097
1098 bh_lock_sock(sk);
1099
1100 if (!ip_sk_accept_pmtu(sk))
1101 goto out;
1102
1103 odst = sk_dst_get(sk);
1104
1105 if (sock_owned_by_user(sk) || !odst) {
1106 __ipv4_sk_update_pmtu(skb, sk, mtu);
1107 goto out;
1108 }
1109
1110 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1111
1112 rt = (struct rtable *)odst;
1113 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1114 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1115 if (IS_ERR(rt))
1116 goto out;
1117
1118 new = true;
1119 }
1120
1121 __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1122
1123 if (!dst_check(&rt->dst, 0)) {
1124 if (new)
1125 dst_release(&rt->dst);
1126
1127 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1128 if (IS_ERR(rt))
1129 goto out;
1130
1131 new = true;
1132 }
1133
1134 if (new)
1135 sk_dst_set(sk, &rt->dst);
1136
1137 out:
1138 bh_unlock_sock(sk);
1139 dst_release(odst);
1140 }
1141 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1142
1143 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1144 int oif, u32 mark, u8 protocol, int flow_flags)
1145 {
1146 const struct iphdr *iph = (const struct iphdr *) skb->data;
1147 struct flowi4 fl4;
1148 struct rtable *rt;
1149
1150 __build_flow_key(net, &fl4, NULL, iph, oif,
1151 RT_TOS(iph->tos), protocol, mark, flow_flags);
1152 rt = __ip_route_output_key(net, &fl4);
1153 if (!IS_ERR(rt)) {
1154 __ip_do_redirect(rt, skb, &fl4, false);
1155 ip_rt_put(rt);
1156 }
1157 }
1158 EXPORT_SYMBOL_GPL(ipv4_redirect);
1159
1160 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1161 {
1162 const struct iphdr *iph = (const struct iphdr *) skb->data;
1163 struct flowi4 fl4;
1164 struct rtable *rt;
1165 struct net *net = sock_net(sk);
1166
1167 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1168 rt = __ip_route_output_key(net, &fl4);
1169 if (!IS_ERR(rt)) {
1170 __ip_do_redirect(rt, skb, &fl4, false);
1171 ip_rt_put(rt);
1172 }
1173 }
1174 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1175
1176 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1177 {
1178 struct rtable *rt = (struct rtable *) dst;
1179
1180 /* All IPV4 dsts are created with ->obsolete set to the value
1181 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1182 * into this function always.
1183 *
1184 * When a PMTU/redirect information update invalidates a route,
1185 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1186 * DST_OBSOLETE_DEAD by dst_free().
1187 */
1188 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1189 return NULL;
1190 return dst;
1191 }
1192
1193 static void ipv4_link_failure(struct sk_buff *skb)
1194 {
1195 struct rtable *rt;
1196
1197 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1198
1199 rt = skb_rtable(skb);
1200 if (rt)
1201 dst_set_expires(&rt->dst, 0);
1202 }
1203
1204 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1205 {
1206 pr_debug("%s: %pI4 -> %pI4, %s\n",
1207 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1208 skb->dev ? skb->dev->name : "?");
1209 kfree_skb(skb);
1210 WARN_ON(1);
1211 return 0;
1212 }
1213
1214 /*
1215 We do not cache source address of outgoing interface,
1216 because it is used only by IP RR, TS and SRR options,
1217 so that it out of fast path.
1218
1219 BTW remember: "addr" is allowed to be not aligned
1220 in IP options!
1221 */
1222
1223 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1224 {
1225 __be32 src;
1226
1227 if (rt_is_output_route(rt))
1228 src = ip_hdr(skb)->saddr;
1229 else {
1230 struct fib_result res;
1231 struct flowi4 fl4;
1232 struct iphdr *iph;
1233
1234 iph = ip_hdr(skb);
1235
1236 memset(&fl4, 0, sizeof(fl4));
1237 fl4.daddr = iph->daddr;
1238 fl4.saddr = iph->saddr;
1239 fl4.flowi4_tos = RT_TOS(iph->tos);
1240 fl4.flowi4_oif = rt->dst.dev->ifindex;
1241 fl4.flowi4_iif = skb->dev->ifindex;
1242 fl4.flowi4_mark = skb->mark;
1243
1244 rcu_read_lock();
1245 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1246 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1247 else
1248 src = inet_select_addr(rt->dst.dev,
1249 rt_nexthop(rt, iph->daddr),
1250 RT_SCOPE_UNIVERSE);
1251 rcu_read_unlock();
1252 }
1253 memcpy(addr, &src, 4);
1254 }
1255
1256 #ifdef CONFIG_IP_ROUTE_CLASSID
1257 static void set_class_tag(struct rtable *rt, u32 tag)
1258 {
1259 if (!(rt->dst.tclassid & 0xFFFF))
1260 rt->dst.tclassid |= tag & 0xFFFF;
1261 if (!(rt->dst.tclassid & 0xFFFF0000))
1262 rt->dst.tclassid |= tag & 0xFFFF0000;
1263 }
1264 #endif
1265
1266 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1267 {
1268 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1269 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1270 ip_rt_min_advmss);
1271
1272 return min(advmss, IPV4_MAX_PMTU - header_size);
1273 }
1274
1275 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1276 {
1277 const struct rtable *rt = (const struct rtable *) dst;
1278 unsigned int mtu = rt->rt_pmtu;
1279
1280 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1281 mtu = dst_metric_raw(dst, RTAX_MTU);
1282
1283 if (mtu)
1284 return mtu;
1285
1286 mtu = READ_ONCE(dst->dev->mtu);
1287
1288 if (unlikely(ip_mtu_locked(dst))) {
1289 if (rt->rt_uses_gateway && mtu > 576)
1290 mtu = 576;
1291 }
1292
1293 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1294
1295 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1296 }
1297
1298 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1299 {
1300 struct fnhe_hash_bucket *hash;
1301 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1302 u32 hval = fnhe_hashfun(daddr);
1303
1304 spin_lock_bh(&fnhe_lock);
1305
1306 hash = rcu_dereference_protected(nh->nh_exceptions,
1307 lockdep_is_held(&fnhe_lock));
1308 hash += hval;
1309
1310 fnhe_p = &hash->chain;
1311 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1312 while (fnhe) {
1313 if (fnhe->fnhe_daddr == daddr) {
1314 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1315 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1316 fnhe_flush_routes(fnhe);
1317 kfree_rcu(fnhe, rcu);
1318 break;
1319 }
1320 fnhe_p = &fnhe->fnhe_next;
1321 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1322 lockdep_is_held(&fnhe_lock));
1323 }
1324
1325 spin_unlock_bh(&fnhe_lock);
1326 }
1327
1328 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1329 {
1330 struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1331 struct fib_nh_exception *fnhe;
1332 u32 hval;
1333
1334 if (!hash)
1335 return NULL;
1336
1337 hval = fnhe_hashfun(daddr);
1338
1339 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1340 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1341 if (fnhe->fnhe_daddr == daddr) {
1342 if (fnhe->fnhe_expires &&
1343 time_after(jiffies, fnhe->fnhe_expires)) {
1344 ip_del_fnhe(nh, daddr);
1345 break;
1346 }
1347 return fnhe;
1348 }
1349 }
1350 return NULL;
1351 }
1352
1353 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1354 __be32 daddr, const bool do_cache)
1355 {
1356 bool ret = false;
1357
1358 spin_lock_bh(&fnhe_lock);
1359
1360 if (daddr == fnhe->fnhe_daddr) {
1361 struct rtable __rcu **porig;
1362 struct rtable *orig;
1363 int genid = fnhe_genid(dev_net(rt->dst.dev));
1364
1365 if (rt_is_input_route(rt))
1366 porig = &fnhe->fnhe_rth_input;
1367 else
1368 porig = &fnhe->fnhe_rth_output;
1369 orig = rcu_dereference(*porig);
1370
1371 if (fnhe->fnhe_genid != genid) {
1372 fnhe->fnhe_genid = genid;
1373 fnhe->fnhe_gw = 0;
1374 fnhe->fnhe_pmtu = 0;
1375 fnhe->fnhe_expires = 0;
1376 fnhe_flush_routes(fnhe);
1377 orig = NULL;
1378 }
1379 fill_route_from_fnhe(rt, fnhe);
1380 if (!rt->rt_gateway)
1381 rt->rt_gateway = daddr;
1382
1383 if (do_cache) {
1384 dst_hold(&rt->dst);
1385 rcu_assign_pointer(*porig, rt);
1386 if (orig) {
1387 dst_dev_put(&orig->dst);
1388 dst_release(&orig->dst);
1389 }
1390 ret = true;
1391 }
1392
1393 fnhe->fnhe_stamp = jiffies;
1394 }
1395 spin_unlock_bh(&fnhe_lock);
1396
1397 return ret;
1398 }
1399
1400 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1401 {
1402 struct rtable *orig, *prev, **p;
1403 bool ret = true;
1404
1405 if (rt_is_input_route(rt)) {
1406 p = (struct rtable **)&nh->nh_rth_input;
1407 } else {
1408 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1409 }
1410 orig = *p;
1411
1412 /* hold dst before doing cmpxchg() to avoid race condition
1413 * on this dst
1414 */
1415 dst_hold(&rt->dst);
1416 prev = cmpxchg(p, orig, rt);
1417 if (prev == orig) {
1418 if (orig) {
1419 dst_dev_put(&orig->dst);
1420 dst_release(&orig->dst);
1421 }
1422 } else {
1423 dst_release(&rt->dst);
1424 ret = false;
1425 }
1426
1427 return ret;
1428 }
1429
1430 struct uncached_list {
1431 spinlock_t lock;
1432 struct list_head head;
1433 };
1434
1435 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1436
1437 void rt_add_uncached_list(struct rtable *rt)
1438 {
1439 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1440
1441 rt->rt_uncached_list = ul;
1442
1443 spin_lock_bh(&ul->lock);
1444 list_add_tail(&rt->rt_uncached, &ul->head);
1445 spin_unlock_bh(&ul->lock);
1446 }
1447
1448 void rt_del_uncached_list(struct rtable *rt)
1449 {
1450 if (!list_empty(&rt->rt_uncached)) {
1451 struct uncached_list *ul = rt->rt_uncached_list;
1452
1453 spin_lock_bh(&ul->lock);
1454 list_del(&rt->rt_uncached);
1455 spin_unlock_bh(&ul->lock);
1456 }
1457 }
1458
1459 static void ipv4_dst_destroy(struct dst_entry *dst)
1460 {
1461 struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1462 struct rtable *rt = (struct rtable *)dst;
1463
1464 if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1465 kfree(p);
1466
1467 rt_del_uncached_list(rt);
1468 }
1469
1470 void rt_flush_dev(struct net_device *dev)
1471 {
1472 struct net *net = dev_net(dev);
1473 struct rtable *rt;
1474 int cpu;
1475
1476 for_each_possible_cpu(cpu) {
1477 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1478
1479 spin_lock_bh(&ul->lock);
1480 list_for_each_entry(rt, &ul->head, rt_uncached) {
1481 if (rt->dst.dev != dev)
1482 continue;
1483 rt->dst.dev = net->loopback_dev;
1484 dev_hold(rt->dst.dev);
1485 dev_put(dev);
1486 }
1487 spin_unlock_bh(&ul->lock);
1488 }
1489 }
1490
1491 static bool rt_cache_valid(const struct rtable *rt)
1492 {
1493 return rt &&
1494 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1495 !rt_is_expired(rt);
1496 }
1497
1498 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1499 const struct fib_result *res,
1500 struct fib_nh_exception *fnhe,
1501 struct fib_info *fi, u16 type, u32 itag,
1502 const bool do_cache)
1503 {
1504 bool cached = false;
1505
1506 if (fi) {
1507 struct fib_nh *nh = &FIB_RES_NH(*res);
1508
1509 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1510 rt->rt_gateway = nh->nh_gw;
1511 rt->rt_uses_gateway = 1;
1512 }
1513 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1514 if (fi->fib_metrics != &dst_default_metrics) {
1515 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1516 refcount_inc(&fi->fib_metrics->refcnt);
1517 }
1518 #ifdef CONFIG_IP_ROUTE_CLASSID
1519 rt->dst.tclassid = nh->nh_tclassid;
1520 #endif
1521 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1522 if (unlikely(fnhe))
1523 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1524 else if (do_cache)
1525 cached = rt_cache_route(nh, rt);
1526 if (unlikely(!cached)) {
1527 /* Routes we intend to cache in nexthop exception or
1528 * FIB nexthop have the DST_NOCACHE bit clear.
1529 * However, if we are unsuccessful at storing this
1530 * route into the cache we really need to set it.
1531 */
1532 if (!rt->rt_gateway)
1533 rt->rt_gateway = daddr;
1534 rt_add_uncached_list(rt);
1535 }
1536 } else
1537 rt_add_uncached_list(rt);
1538
1539 #ifdef CONFIG_IP_ROUTE_CLASSID
1540 #ifdef CONFIG_IP_MULTIPLE_TABLES
1541 set_class_tag(rt, res->tclassid);
1542 #endif
1543 set_class_tag(rt, itag);
1544 #endif
1545 }
1546
1547 struct rtable *rt_dst_alloc(struct net_device *dev,
1548 unsigned int flags, u16 type,
1549 bool nopolicy, bool noxfrm, bool will_cache)
1550 {
1551 struct rtable *rt;
1552
1553 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1554 (will_cache ? 0 : DST_HOST) |
1555 (nopolicy ? DST_NOPOLICY : 0) |
1556 (noxfrm ? DST_NOXFRM : 0));
1557
1558 if (rt) {
1559 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1560 rt->rt_flags = flags;
1561 rt->rt_type = type;
1562 rt->rt_is_input = 0;
1563 rt->rt_iif = 0;
1564 rt->rt_pmtu = 0;
1565 rt->rt_mtu_locked = 0;
1566 rt->rt_gateway = 0;
1567 rt->rt_uses_gateway = 0;
1568 rt->rt_table_id = 0;
1569 INIT_LIST_HEAD(&rt->rt_uncached);
1570
1571 rt->dst.output = ip_output;
1572 if (flags & RTCF_LOCAL)
1573 rt->dst.input = ip_local_deliver;
1574 }
1575
1576 return rt;
1577 }
1578 EXPORT_SYMBOL(rt_dst_alloc);
1579
1580 /* called in rcu_read_lock() section */
1581 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1582 u8 tos, struct net_device *dev,
1583 struct in_device *in_dev, u32 *itag)
1584 {
1585 int err;
1586
1587 /* Primary sanity checks. */
1588 if (!in_dev)
1589 return -EINVAL;
1590
1591 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1592 skb->protocol != htons(ETH_P_IP))
1593 return -EINVAL;
1594
1595 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1596 return -EINVAL;
1597
1598 if (ipv4_is_zeronet(saddr)) {
1599 if (!ipv4_is_local_multicast(daddr))
1600 return -EINVAL;
1601 } else {
1602 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1603 in_dev, itag);
1604 if (err < 0)
1605 return err;
1606 }
1607 return 0;
1608 }
1609
1610 /* called in rcu_read_lock() section */
1611 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1612 u8 tos, struct net_device *dev, int our)
1613 {
1614 struct in_device *in_dev = __in_dev_get_rcu(dev);
1615 unsigned int flags = RTCF_MULTICAST;
1616 struct rtable *rth;
1617 u32 itag = 0;
1618 int err;
1619
1620 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1621 if (err)
1622 return err;
1623
1624 if (our)
1625 flags |= RTCF_LOCAL;
1626
1627 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1628 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1629 if (!rth)
1630 return -ENOBUFS;
1631
1632 #ifdef CONFIG_IP_ROUTE_CLASSID
1633 rth->dst.tclassid = itag;
1634 #endif
1635 rth->dst.output = ip_rt_bug;
1636 rth->rt_is_input= 1;
1637
1638 #ifdef CONFIG_IP_MROUTE
1639 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1640 rth->dst.input = ip_mr_input;
1641 #endif
1642 RT_CACHE_STAT_INC(in_slow_mc);
1643
1644 skb_dst_set(skb, &rth->dst);
1645 return 0;
1646 }
1647
1648
1649 static void ip_handle_martian_source(struct net_device *dev,
1650 struct in_device *in_dev,
1651 struct sk_buff *skb,
1652 __be32 daddr,
1653 __be32 saddr)
1654 {
1655 RT_CACHE_STAT_INC(in_martian_src);
1656 #ifdef CONFIG_IP_ROUTE_VERBOSE
1657 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1658 /*
1659 * RFC1812 recommendation, if source is martian,
1660 * the only hint is MAC header.
1661 */
1662 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1663 &daddr, &saddr, dev->name);
1664 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1665 print_hex_dump(KERN_WARNING, "ll header: ",
1666 DUMP_PREFIX_OFFSET, 16, 1,
1667 skb_mac_header(skb),
1668 dev->hard_header_len, true);
1669 }
1670 }
1671 #endif
1672 }
1673
1674 static void set_lwt_redirect(struct rtable *rth)
1675 {
1676 if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1677 rth->dst.lwtstate->orig_output = rth->dst.output;
1678 rth->dst.output = lwtunnel_output;
1679 }
1680
1681 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1682 rth->dst.lwtstate->orig_input = rth->dst.input;
1683 rth->dst.input = lwtunnel_input;
1684 }
1685 }
1686
1687 /* called in rcu_read_lock() section */
1688 static int __mkroute_input(struct sk_buff *skb,
1689 const struct fib_result *res,
1690 struct in_device *in_dev,
1691 __be32 daddr, __be32 saddr, u32 tos)
1692 {
1693 struct fib_nh_exception *fnhe;
1694 struct rtable *rth;
1695 int err;
1696 struct in_device *out_dev;
1697 bool do_cache;
1698 u32 itag = 0;
1699
1700 /* get a working reference to the output device */
1701 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1702 if (!out_dev) {
1703 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1704 return -EINVAL;
1705 }
1706
1707 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1708 in_dev->dev, in_dev, &itag);
1709 if (err < 0) {
1710 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1711 saddr);
1712
1713 goto cleanup;
1714 }
1715
1716 do_cache = res->fi && !itag;
1717 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1718 skb->protocol == htons(ETH_P_IP) &&
1719 (IN_DEV_SHARED_MEDIA(out_dev) ||
1720 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1721 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1722
1723 if (skb->protocol != htons(ETH_P_IP)) {
1724 /* Not IP (i.e. ARP). Do not create route, if it is
1725 * invalid for proxy arp. DNAT routes are always valid.
1726 *
1727 * Proxy arp feature have been extended to allow, ARP
1728 * replies back to the same interface, to support
1729 * Private VLAN switch technologies. See arp.c.
1730 */
1731 if (out_dev == in_dev &&
1732 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1733 err = -EINVAL;
1734 goto cleanup;
1735 }
1736 }
1737
1738 fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1739 if (do_cache) {
1740 if (fnhe)
1741 rth = rcu_dereference(fnhe->fnhe_rth_input);
1742 else
1743 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1744 if (rt_cache_valid(rth)) {
1745 skb_dst_set_noref(skb, &rth->dst);
1746 goto out;
1747 }
1748 }
1749
1750 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1751 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1752 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1753 if (!rth) {
1754 err = -ENOBUFS;
1755 goto cleanup;
1756 }
1757
1758 rth->rt_is_input = 1;
1759 if (res->table)
1760 rth->rt_table_id = res->table->tb_id;
1761 RT_CACHE_STAT_INC(in_slow_tot);
1762
1763 rth->dst.input = ip_forward;
1764
1765 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1766 do_cache);
1767 set_lwt_redirect(rth);
1768 skb_dst_set(skb, &rth->dst);
1769 out:
1770 err = 0;
1771 cleanup:
1772 return err;
1773 }
1774
1775 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1776 /* To make ICMP packets follow the right flow, the multipath hash is
1777 * calculated from the inner IP addresses.
1778 */
1779 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1780 struct flow_keys *hash_keys)
1781 {
1782 const struct iphdr *outer_iph = ip_hdr(skb);
1783 const struct iphdr *inner_iph;
1784 const struct icmphdr *icmph;
1785 struct iphdr _inner_iph;
1786 struct icmphdr _icmph;
1787
1788 hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1789 hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1790 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1791 return;
1792
1793 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1794 return;
1795
1796 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1797 &_icmph);
1798 if (!icmph)
1799 return;
1800
1801 if (icmph->type != ICMP_DEST_UNREACH &&
1802 icmph->type != ICMP_REDIRECT &&
1803 icmph->type != ICMP_TIME_EXCEEDED &&
1804 icmph->type != ICMP_PARAMETERPROB)
1805 return;
1806
1807 inner_iph = skb_header_pointer(skb,
1808 outer_iph->ihl * 4 + sizeof(_icmph),
1809 sizeof(_inner_iph), &_inner_iph);
1810 if (!inner_iph)
1811 return;
1812 hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1813 hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1814 }
1815
1816 /* if skb is set it will be used and fl4 can be NULL */
1817 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1818 const struct sk_buff *skb)
1819 {
1820 struct net *net = fi->fib_net;
1821 struct flow_keys hash_keys;
1822 u32 mhash;
1823
1824 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1825 case 0:
1826 memset(&hash_keys, 0, sizeof(hash_keys));
1827 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1828 if (skb) {
1829 ip_multipath_l3_keys(skb, &hash_keys);
1830 } else {
1831 hash_keys.addrs.v4addrs.src = fl4->saddr;
1832 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1833 }
1834 break;
1835 case 1:
1836 /* skb is currently provided only when forwarding */
1837 if (skb) {
1838 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1839 struct flow_keys keys;
1840
1841 /* short-circuit if we already have L4 hash present */
1842 if (skb->l4_hash)
1843 return skb_get_hash_raw(skb) >> 1;
1844 memset(&hash_keys, 0, sizeof(hash_keys));
1845 skb_flow_dissect_flow_keys(skb, &keys, flag);
1846
1847 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1848 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1849 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1850 hash_keys.ports.src = keys.ports.src;
1851 hash_keys.ports.dst = keys.ports.dst;
1852 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1853 } else {
1854 memset(&hash_keys, 0, sizeof(hash_keys));
1855 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1856 hash_keys.addrs.v4addrs.src = fl4->saddr;
1857 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1858 hash_keys.ports.src = fl4->fl4_sport;
1859 hash_keys.ports.dst = fl4->fl4_dport;
1860 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1861 }
1862 break;
1863 }
1864 mhash = flow_hash_from_keys(&hash_keys);
1865
1866 return mhash >> 1;
1867 }
1868 EXPORT_SYMBOL_GPL(fib_multipath_hash);
1869 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1870
1871 static int ip_mkroute_input(struct sk_buff *skb,
1872 struct fib_result *res,
1873 struct in_device *in_dev,
1874 __be32 daddr, __be32 saddr, u32 tos)
1875 {
1876 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1877 if (res->fi && res->fi->fib_nhs > 1) {
1878 int h = fib_multipath_hash(res->fi, NULL, skb);
1879
1880 fib_select_multipath(res, h);
1881 }
1882 #endif
1883
1884 /* create a routing cache entry */
1885 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1886 }
1887
1888 /*
1889 * NOTE. We drop all the packets that has local source
1890 * addresses, because every properly looped back packet
1891 * must have correct destination already attached by output routine.
1892 *
1893 * Such approach solves two big problems:
1894 * 1. Not simplex devices are handled properly.
1895 * 2. IP spoofing attempts are filtered with 100% of guarantee.
1896 * called with rcu_read_lock()
1897 */
1898
1899 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1900 u8 tos, struct net_device *dev,
1901 struct fib_result *res)
1902 {
1903 struct in_device *in_dev = __in_dev_get_rcu(dev);
1904 struct ip_tunnel_info *tun_info;
1905 struct flowi4 fl4;
1906 unsigned int flags = 0;
1907 u32 itag = 0;
1908 struct rtable *rth;
1909 int err = -EINVAL;
1910 struct net *net = dev_net(dev);
1911 bool do_cache;
1912
1913 /* IP on this device is disabled. */
1914
1915 if (!in_dev)
1916 goto out;
1917
1918 /* Check for the most weird martians, which can be not detected
1919 by fib_lookup.
1920 */
1921
1922 tun_info = skb_tunnel_info(skb);
1923 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1924 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1925 else
1926 fl4.flowi4_tun_key.tun_id = 0;
1927 skb_dst_drop(skb);
1928
1929 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1930 goto martian_source;
1931
1932 res->fi = NULL;
1933 res->table = NULL;
1934 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1935 goto brd_input;
1936
1937 /* Accept zero addresses only to limited broadcast;
1938 * I even do not know to fix it or not. Waiting for complains :-)
1939 */
1940 if (ipv4_is_zeronet(saddr))
1941 goto martian_source;
1942
1943 if (ipv4_is_zeronet(daddr))
1944 goto martian_destination;
1945
1946 /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1947 * and call it once if daddr or/and saddr are loopback addresses
1948 */
1949 if (ipv4_is_loopback(daddr)) {
1950 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1951 goto martian_destination;
1952 } else if (ipv4_is_loopback(saddr)) {
1953 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1954 goto martian_source;
1955 }
1956
1957 /*
1958 * Now we are ready to route packet.
1959 */
1960 fl4.flowi4_oif = 0;
1961 fl4.flowi4_iif = dev->ifindex;
1962 fl4.flowi4_mark = skb->mark;
1963 fl4.flowi4_tos = tos;
1964 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1965 fl4.flowi4_flags = 0;
1966 fl4.daddr = daddr;
1967 fl4.saddr = saddr;
1968 fl4.flowi4_uid = sock_net_uid(net, NULL);
1969 err = fib_lookup(net, &fl4, res, 0);
1970 if (err != 0) {
1971 if (!IN_DEV_FORWARD(in_dev))
1972 err = -EHOSTUNREACH;
1973 goto no_route;
1974 }
1975
1976 if (res->type == RTN_BROADCAST)
1977 goto brd_input;
1978
1979 if (res->type == RTN_LOCAL) {
1980 err = fib_validate_source(skb, saddr, daddr, tos,
1981 0, dev, in_dev, &itag);
1982 if (err < 0)
1983 goto martian_source;
1984 goto local_input;
1985 }
1986
1987 if (!IN_DEV_FORWARD(in_dev)) {
1988 err = -EHOSTUNREACH;
1989 goto no_route;
1990 }
1991 if (res->type != RTN_UNICAST)
1992 goto martian_destination;
1993
1994 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1995 out: return err;
1996
1997 brd_input:
1998 if (skb->protocol != htons(ETH_P_IP))
1999 goto e_inval;
2000
2001 if (!ipv4_is_zeronet(saddr)) {
2002 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2003 in_dev, &itag);
2004 if (err < 0)
2005 goto martian_source;
2006 }
2007 flags |= RTCF_BROADCAST;
2008 res->type = RTN_BROADCAST;
2009 RT_CACHE_STAT_INC(in_brd);
2010
2011 local_input:
2012 do_cache = false;
2013 if (res->fi) {
2014 if (!itag) {
2015 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2016 if (rt_cache_valid(rth)) {
2017 skb_dst_set_noref(skb, &rth->dst);
2018 err = 0;
2019 goto out;
2020 }
2021 do_cache = true;
2022 }
2023 }
2024
2025 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2026 flags | RTCF_LOCAL, res->type,
2027 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2028 if (!rth)
2029 goto e_nobufs;
2030
2031 rth->dst.output= ip_rt_bug;
2032 #ifdef CONFIG_IP_ROUTE_CLASSID
2033 rth->dst.tclassid = itag;
2034 #endif
2035 rth->rt_is_input = 1;
2036 if (res->table)
2037 rth->rt_table_id = res->table->tb_id;
2038
2039 RT_CACHE_STAT_INC(in_slow_tot);
2040 if (res->type == RTN_UNREACHABLE) {
2041 rth->dst.input= ip_error;
2042 rth->dst.error= -err;
2043 rth->rt_flags &= ~RTCF_LOCAL;
2044 }
2045
2046 if (do_cache) {
2047 struct fib_nh *nh = &FIB_RES_NH(*res);
2048
2049 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2050 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2051 WARN_ON(rth->dst.input == lwtunnel_input);
2052 rth->dst.lwtstate->orig_input = rth->dst.input;
2053 rth->dst.input = lwtunnel_input;
2054 }
2055
2056 if (unlikely(!rt_cache_route(nh, rth)))
2057 rt_add_uncached_list(rth);
2058 }
2059 skb_dst_set(skb, &rth->dst);
2060 err = 0;
2061 goto out;
2062
2063 no_route:
2064 RT_CACHE_STAT_INC(in_no_route);
2065 res->type = RTN_UNREACHABLE;
2066 res->fi = NULL;
2067 res->table = NULL;
2068 goto local_input;
2069
2070 /*
2071 * Do not cache martian addresses: they should be logged (RFC1812)
2072 */
2073 martian_destination:
2074 RT_CACHE_STAT_INC(in_martian_dst);
2075 #ifdef CONFIG_IP_ROUTE_VERBOSE
2076 if (IN_DEV_LOG_MARTIANS(in_dev))
2077 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2078 &daddr, &saddr, dev->name);
2079 #endif
2080
2081 e_inval:
2082 err = -EINVAL;
2083 goto out;
2084
2085 e_nobufs:
2086 err = -ENOBUFS;
2087 goto out;
2088
2089 martian_source:
2090 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2091 goto out;
2092 }
2093
2094 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2095 u8 tos, struct net_device *dev)
2096 {
2097 struct fib_result res;
2098 int err;
2099
2100 tos &= IPTOS_RT_MASK;
2101 rcu_read_lock();
2102 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2103 rcu_read_unlock();
2104
2105 return err;
2106 }
2107 EXPORT_SYMBOL(ip_route_input_noref);
2108
2109 /* called with rcu_read_lock held */
2110 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2111 u8 tos, struct net_device *dev, struct fib_result *res)
2112 {
2113 /* Multicast recognition logic is moved from route cache to here.
2114 The problem was that too many Ethernet cards have broken/missing
2115 hardware multicast filters :-( As result the host on multicasting
2116 network acquires a lot of useless route cache entries, sort of
2117 SDR messages from all the world. Now we try to get rid of them.
2118 Really, provided software IP multicast filter is organized
2119 reasonably (at least, hashed), it does not result in a slowdown
2120 comparing with route cache reject entries.
2121 Note, that multicast routers are not affected, because
2122 route cache entry is created eventually.
2123 */
2124 if (ipv4_is_multicast(daddr)) {
2125 struct in_device *in_dev = __in_dev_get_rcu(dev);
2126 int our = 0;
2127 int err = -EINVAL;
2128
2129 if (in_dev)
2130 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2131 ip_hdr(skb)->protocol);
2132
2133 /* check l3 master if no match yet */
2134 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2135 struct in_device *l3_in_dev;
2136
2137 l3_in_dev = __in_dev_get_rcu(skb->dev);
2138 if (l3_in_dev)
2139 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2140 ip_hdr(skb)->protocol);
2141 }
2142
2143 if (our
2144 #ifdef CONFIG_IP_MROUTE
2145 ||
2146 (!ipv4_is_local_multicast(daddr) &&
2147 IN_DEV_MFORWARD(in_dev))
2148 #endif
2149 ) {
2150 err = ip_route_input_mc(skb, daddr, saddr,
2151 tos, dev, our);
2152 }
2153 return err;
2154 }
2155
2156 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2157 }
2158
2159 /* called with rcu_read_lock() */
2160 static struct rtable *__mkroute_output(const struct fib_result *res,
2161 const struct flowi4 *fl4, int orig_oif,
2162 struct net_device *dev_out,
2163 unsigned int flags)
2164 {
2165 struct fib_info *fi = res->fi;
2166 struct fib_nh_exception *fnhe;
2167 struct in_device *in_dev;
2168 u16 type = res->type;
2169 struct rtable *rth;
2170 bool do_cache;
2171
2172 in_dev = __in_dev_get_rcu(dev_out);
2173 if (!in_dev)
2174 return ERR_PTR(-EINVAL);
2175
2176 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2177 if (ipv4_is_loopback(fl4->saddr) &&
2178 !(dev_out->flags & IFF_LOOPBACK) &&
2179 !netif_is_l3_master(dev_out))
2180 return ERR_PTR(-EINVAL);
2181
2182 if (ipv4_is_lbcast(fl4->daddr))
2183 type = RTN_BROADCAST;
2184 else if (ipv4_is_multicast(fl4->daddr))
2185 type = RTN_MULTICAST;
2186 else if (ipv4_is_zeronet(fl4->daddr))
2187 return ERR_PTR(-EINVAL);
2188
2189 if (dev_out->flags & IFF_LOOPBACK)
2190 flags |= RTCF_LOCAL;
2191
2192 do_cache = true;
2193 if (type == RTN_BROADCAST) {
2194 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2195 fi = NULL;
2196 } else if (type == RTN_MULTICAST) {
2197 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2198 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2199 fl4->flowi4_proto))
2200 flags &= ~RTCF_LOCAL;
2201 else
2202 do_cache = false;
2203 /* If multicast route do not exist use
2204 * default one, but do not gateway in this case.
2205 * Yes, it is hack.
2206 */
2207 if (fi && res->prefixlen < 4)
2208 fi = NULL;
2209 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2210 (orig_oif != dev_out->ifindex)) {
2211 /* For local routes that require a particular output interface
2212 * we do not want to cache the result. Caching the result
2213 * causes incorrect behaviour when there are multiple source
2214 * addresses on the interface, the end result being that if the
2215 * intended recipient is waiting on that interface for the
2216 * packet he won't receive it because it will be delivered on
2217 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2218 * be set to the loopback interface as well.
2219 */
2220 do_cache = false;
2221 }
2222
2223 fnhe = NULL;
2224 do_cache &= fi != NULL;
2225 if (fi) {
2226 struct rtable __rcu **prth;
2227 struct fib_nh *nh = &FIB_RES_NH(*res);
2228
2229 fnhe = find_exception(nh, fl4->daddr);
2230 if (!do_cache)
2231 goto add;
2232 if (fnhe) {
2233 prth = &fnhe->fnhe_rth_output;
2234 } else {
2235 if (unlikely(fl4->flowi4_flags &
2236 FLOWI_FLAG_KNOWN_NH &&
2237 !(nh->nh_gw &&
2238 nh->nh_scope == RT_SCOPE_LINK))) {
2239 do_cache = false;
2240 goto add;
2241 }
2242 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2243 }
2244 rth = rcu_dereference(*prth);
2245 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2246 return rth;
2247 }
2248
2249 add:
2250 rth = rt_dst_alloc(dev_out, flags, type,
2251 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2252 IN_DEV_CONF_GET(in_dev, NOXFRM),
2253 do_cache);
2254 if (!rth)
2255 return ERR_PTR(-ENOBUFS);
2256
2257 rth->rt_iif = orig_oif;
2258 if (res->table)
2259 rth->rt_table_id = res->table->tb_id;
2260
2261 RT_CACHE_STAT_INC(out_slow_tot);
2262
2263 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2264 if (flags & RTCF_LOCAL &&
2265 !(dev_out->flags & IFF_LOOPBACK)) {
2266 rth->dst.output = ip_mc_output;
2267 RT_CACHE_STAT_INC(out_slow_mc);
2268 }
2269 #ifdef CONFIG_IP_MROUTE
2270 if (type == RTN_MULTICAST) {
2271 if (IN_DEV_MFORWARD(in_dev) &&
2272 !ipv4_is_local_multicast(fl4->daddr)) {
2273 rth->dst.input = ip_mr_input;
2274 rth->dst.output = ip_mc_output;
2275 }
2276 }
2277 #endif
2278 }
2279
2280 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2281 set_lwt_redirect(rth);
2282
2283 return rth;
2284 }
2285
2286 /*
2287 * Major route resolver routine.
2288 */
2289
2290 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2291 const struct sk_buff *skb)
2292 {
2293 __u8 tos = RT_FL_TOS(fl4);
2294 struct fib_result res = {
2295 .type = RTN_UNSPEC,
2296 .fi = NULL,
2297 .table = NULL,
2298 .tclassid = 0,
2299 };
2300 struct rtable *rth;
2301
2302 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2303 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2304 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2305 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2306
2307 rcu_read_lock();
2308 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2309 rcu_read_unlock();
2310
2311 return rth;
2312 }
2313 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2314
/*
 * Worker for output route resolution.
 *
 * Validates the source address, chooses an output device and source
 * address where the flow leaves them unset, consults fib_lookup() and
 * finally builds the route through __mkroute_output().
 *
 * @net: namespace used for device and FIB lookups
 * @fl4: flow key; saddr, daddr and flowi4_oif may be filled in here
 * @res: FIB result storage supplied by the caller, updated in place
 * @skb: passed through to fib_select_path()
 *
 * Returns the result of __mkroute_output(), or an ERR_PTR():
 *   -EINVAL      multicast/limited-broadcast/zeronet source, or a source
 *                address for which __ip_dev_find() finds no device
 *   -ENODEV      flowi4_oif does not name an existing device
 *   -ENETUNREACH the requested device is down or has no in_device
 *   other        the error returned by fib_lookup()
 *
 * NOTE(review): both callers visible in this file invoke it between
 * rcu_read_lock() and rcu_read_unlock().
 */
2315 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2316 struct fib_result *res,
2317 const struct sk_buff *skb)
2318 {
2319 struct net_device *dev_out = NULL;
2320 int orig_oif = fl4->flowi4_oif;
2321 unsigned int flags = 0;
2322 struct rtable *rth;
2323 int err = -ENETUNREACH;
2324
/* A caller-supplied source address is sanity-checked before anything else. */
2325 if (fl4->saddr) {
2326 rth = ERR_PTR(-EINVAL);
2327 if (ipv4_is_multicast(fl4->saddr) ||
2328 ipv4_is_lbcast(fl4->saddr) ||
2329 ipv4_is_zeronet(fl4->saddr))
2330 goto out;
2331
2332 /* I removed check for oif == dev_out->oif here.
2333 It was wrong for two reasons:
2334 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2335 is assigned to multiple interfaces.
2336 2. Moreover, we are allowed to send packets with saddr
2337 of another iface. --ANK
2338 */
2339
2340 if (fl4->flowi4_oif == 0 &&
2341 (ipv4_is_multicast(fl4->daddr) ||
2342 ipv4_is_lbcast(fl4->daddr))) {
2343 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2344 dev_out = __ip_dev_find(net, fl4->saddr, false);
2345 if (!dev_out)
2346 goto out;
2347
2348 /* Special hack: user can direct multicasts
2349 and limited broadcast via necessary interface
2350 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2351 This hack is not just for fun, it allows
2352 vic,vat and friends to work.
2353 They bind socket to loopback, set ttl to zero
2354 and expect that it will work.
2355 From the viewpoint of routing cache they are broken,
2356 because we are not allowed to build multicast path
2357 with loopback source addr (look, routing cache
2358 cannot know, that ttl is zero, so that packet
2359 will not leave this host and route is valid).
2360 Luckily, this hack is good workaround.
2361 */
2362
2363 fl4->flowi4_oif = dev_out->ifindex;
2364 goto make_route;
2365 }
2366
/* Unless FLOWI_FLAG_ANYSRC is set, the source must belong to a local device. */
2367 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2368 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2369 if (!__ip_dev_find(net, fl4->saddr, false))
2370 goto out;
2371 }
2372 }
2373
2374
/* An explicit output interface must exist, be up and have an in_device. */
2375 if (fl4->flowi4_oif) {
2376 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2377 rth = ERR_PTR(-ENODEV);
2378 if (!dev_out)
2379 goto out;
2380
2381 /* RACE: Check return value of inet_select_addr instead. */
2382 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2383 rth = ERR_PTR(-ENETUNREACH);
2384 goto out;
2385 }
/* Link-local multicast, limited broadcast and IGMP skip the FIB lookup. */
2386 if (ipv4_is_local_multicast(fl4->daddr) ||
2387 ipv4_is_lbcast(fl4->daddr) ||
2388 fl4->flowi4_proto == IPPROTO_IGMP) {
2389 if (!fl4->saddr)
2390 fl4->saddr = inet_select_addr(dev_out, 0,
2391 RT_SCOPE_LINK);
2392 goto make_route;
2393 }
2394 if (!fl4->saddr) {
2395 if (ipv4_is_multicast(fl4->daddr))
2396 fl4->saddr = inet_select_addr(dev_out, 0,
2397 fl4->flowi4_scope);
2398 else if (!fl4->daddr)
2399 fl4->saddr = inet_select_addr(dev_out, 0,
2400 RT_SCOPE_HOST);
2401 }
2402 }
2403
/*
 * No destination given: target the source address (or INADDR_LOOPBACK
 * when that is unset too) and route via the loopback device.
 */
2404 if (!fl4->daddr) {
2405 fl4->daddr = fl4->saddr;
2406 if (!fl4->daddr)
2407 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2408 dev_out = net->loopback_dev;
2409 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2410 res->type = RTN_LOCAL;
2411 flags |= RTCF_LOCAL;
2412 goto make_route;
2413 }
2414
2415 err = fib_lookup(net, fl4, res, 0);
2416 if (err) {
/* Lookup failed: clear the result so no stale fi/table is used below. */
2417 res->fi = NULL;
2418 res->table = NULL;
2419 if (fl4->flowi4_oif &&
2420 (ipv4_is_multicast(fl4->daddr) ||
2421 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2422 /* Apparently, routing tables are wrong. Assume,
2423 that the destination is on link.
2424
2425 WHY? DW.
2426 Because we are allowed to send to iface
2427 even if it has NO routes and NO assigned
2428 addresses. When oif is specified, routing
2429 tables are looked up with only one purpose:
2430 to catch if destination is gatewayed, rather than
2431 direct. Moreover, if MSG_DONTROUTE is set,
2432 we send packet, ignoring both routing tables
2433 and ifaddr state. --ANK
2434
2435
2436 We could make it even if oif is unknown,
2437 likely IPv6, but we do not.
2438 */
2439
2440 if (fl4->saddr == 0)
2441 fl4->saddr = inet_select_addr(dev_out, 0,
2442 RT_SCOPE_LINK);
2443 res->type = RTN_UNICAST;
2444 goto make_route;
2445 }
2446 rth = ERR_PTR(err);
2447 goto out;
2448 }
2449
/* Locally delivered destination: pick a source and loop the route back. */
2450 if (res->type == RTN_LOCAL) {
2451 if (!fl4->saddr) {
2452 if (res->fi->fib_prefsrc)
2453 fl4->saddr = res->fi->fib_prefsrc;
2454 else
2455 fl4->saddr = fl4->daddr;
2456 }
2457
2458 /* L3 master device is the loopback for that domain */
2459 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2460 net->loopback_dev;
2461
2462 /* make sure orig_oif points to fib result device even
2463 * though packet rx/tx happens over loopback or l3mdev
2464 */
2465 orig_oif = FIB_RES_OIF(*res);
2466
2467 fl4->flowi4_oif = dev_out->ifindex;
2468 flags |= RTCF_LOCAL;
2469 goto make_route;
2470 }
2471
/* Remaining case: let fib_select_path() settle the result, then use its device. */
2472 fib_select_path(net, res, fl4, skb);
2473
2474 dev_out = FIB_RES_DEV(*res);
2475 fl4->flowi4_oif = dev_out->ifindex;
2476
2477
2478 make_route:
2479 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2480
2481 out:
2482 return rth;
2483 }
2484
2485 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2486 {
2487 return NULL;
2488 }
2489
2490 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2491 {
2492 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2493
2494 return mtu ? : dst->dev->mtu;
2495 }
2496
2497 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2498 struct sk_buff *skb, u32 mtu)
2499 {
2500 }
2501
/* .redirect callback of ipv4_dst_blackhole_ops: intentionally a no-op. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
				       struct sock *sk,
				       struct sk_buff *skb)
{
	/* nothing to do */
}
2506
2507 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2508 unsigned long old)
2509 {
2510 return NULL;
2511 }
2512
2513 static struct dst_ops ipv4_dst_blackhole_ops = {
2514 .family = AF_INET,
2515 .check = ipv4_blackhole_dst_check,
2516 .mtu = ipv4_blackhole_mtu,
2517 .default_advmss = ipv4_default_advmss,
2518 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2519 .redirect = ipv4_rt_blackhole_redirect,
2520 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2521 .neigh_lookup = ipv4_neigh_lookup,
2522 };
2523
2524 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2525 {
2526 struct rtable *ort = (struct rtable *) dst_orig;
2527 struct rtable *rt;
2528
2529 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2530 if (rt) {
2531 struct dst_entry *new = &rt->dst;
2532
2533 new->__use = 1;
2534 new->input = dst_discard;
2535 new->output = dst_discard_out;
2536
2537 new->dev = net->loopback_dev;
2538 if (new->dev)
2539 dev_hold(new->dev);
2540
2541 rt->rt_is_input = ort->rt_is_input;
2542 rt->rt_iif = ort->rt_iif;
2543 rt->rt_pmtu = ort->rt_pmtu;
2544 rt->rt_mtu_locked = ort->rt_mtu_locked;
2545
2546 rt->rt_genid = rt_genid_ipv4(net);
2547 rt->rt_flags = ort->rt_flags;
2548 rt->rt_type = ort->rt_type;
2549 rt->rt_gateway = ort->rt_gateway;
2550 rt->rt_uses_gateway = ort->rt_uses_gateway;
2551
2552 INIT_LIST_HEAD(&rt->rt_uncached);
2553 }
2554
2555 dst_release(dst_orig);
2556
2557 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2558 }
2559
2560 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2561 const struct sock *sk)
2562 {
2563 struct rtable *rt = __ip_route_output_key(net, flp4);
2564
2565 if (IS_ERR(rt))
2566 return rt;
2567
2568 if (flp4->flowi4_proto)
2569 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2570 flowi4_to_flowi(flp4),
2571 sk, 0);
2572
2573 return rt;
2574 }
2575 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2576
/*
 * Append an RTM_NEWROUTE netlink message to @skb describing the route
 * attached to that same skb (skb_rtable(skb)).
 *
 * @net:      namespace, used for the multicast-forwarding check
 * @dst:      destination address reported as RTA_DST
 * @src:      source address; reported as RTA_SRC when non-zero
 * @table_id: table id reported via rtm_table and RTA_TABLE
 * @fl4:      flow key used for the lookup (tos, saddr, mark, uid)
 * @skb:      message buffer; also the holder of the route
 * @portid:   netlink port id for the message header
 * @seq:      netlink sequence number for the message header
 *
 * Returns 0 on success and -EMSGSIZE when the header or any attribute
 * does not fit; in the latter case the partial message is cancelled.
 * When ipmr_get_route() returns 0 this function returns 0 immediately,
 * without reaching nlmsg_end().
 */
2577 /* called with rcu_read_lock held */
2578 static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
2579 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2580 u32 seq)
2581 {
2582 struct rtable *rt = skb_rtable(skb);
2583 struct rtmsg *r;
2584 struct nlmsghdr *nlh;
2585 unsigned long expires = 0;
2586 u32 error;
2587 u32 metrics[RTAX_MAX];
2588
2589 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2590 if (!nlh)
2591 return -EMSGSIZE;
2592
2593 r = nlmsg_data(nlh);
2594 r->rtm_family = AF_INET;
2595 r->rtm_dst_len = 32;
2596 r->rtm_src_len = 0;
2597 r->rtm_tos = fl4->flowi4_tos;
/* Ids of 256 and above are reported as RT_TABLE_COMPAT; RTA_TABLE carries the full id. */
2598 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2599 if (nla_put_u32(skb, RTA_TABLE, table_id))
2600 goto nla_put_failure;
2601 r->rtm_type = rt->rt_type;
2602 r->rtm_scope = RT_SCOPE_UNIVERSE;
2603 r->rtm_protocol = RTPROT_UNSPEC;
2604 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2605 if (rt->rt_flags & RTCF_NOTIFY)
2606 r->rtm_flags |= RTM_F_NOTIFY;
2607 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2608 r->rtm_flags |= RTCF_DOREDIRECT;
2609
2610 if (nla_put_in_addr(skb, RTA_DST, dst))
2611 goto nla_put_failure;
2612 if (src) {
2613 r->rtm_src_len = 32;
2614 if (nla_put_in_addr(skb, RTA_SRC, src))
2615 goto nla_put_failure;
2616 }
2617 if (rt->dst.dev &&
2618 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2619 goto nla_put_failure;
2620 #ifdef CONFIG_IP_ROUTE_CLASSID
2621 if (rt->dst.tclassid &&
2622 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2623 goto nla_put_failure;
2624 #endif
/* For output routes, report the chosen source when it differs from the requested one. */
2625 if (!rt_is_input_route(rt) &&
2626 fl4->saddr != src) {
2627 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2628 goto nla_put_failure;
2629 }
2630 if (rt->rt_uses_gateway &&
2631 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2632 goto nla_put_failure;
2633
/* Turn the absolute expiry into time remaining; 0 once it has passed. */
2634 expires = rt->dst.expires;
2635 if (expires) {
2636 unsigned long now = jiffies;
2637
2638 if (time_before(now, expires))
2639 expires -= now;
2640 else
2641 expires = 0;
2642 }
2643
/*
 * Work on a local copy of the metrics so that an unexpired PMTU and its
 * lock bit can be overlaid without touching the dst's own metrics.
 */
2644 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2645 if (rt->rt_pmtu && expires)
2646 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2647 if (rt->rt_mtu_locked && expires)
2648 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2649 if (rtnetlink_put_metrics(skb, metrics) < 0)
2650 goto nla_put_failure;
2651
2652 if (fl4->flowi4_mark &&
2653 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2654 goto nla_put_failure;
2655
2656 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2657 nla_put_u32(skb, RTA_UID,
2658 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2659 goto nla_put_failure;
2660
2661 error = rt->dst.error;
2662
/*
 * Input routes: non-local multicast destinations with MC_FORWARDING
 * enabled are delegated to ipmr_get_route(); all others get RTA_IIF.
 */
2663 if (rt_is_input_route(rt)) {
2664 #ifdef CONFIG_IP_MROUTE
2665 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2666 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2667 int err = ipmr_get_route(net, skb,
2668 fl4->saddr, fl4->daddr,
2669 r, portid);
2670
2671 if (err <= 0) {
2672 if (err == 0)
2673 return 0;
2674 goto nla_put_failure;
2675 }
2676 } else
2677 #endif
2678 if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2679 goto nla_put_failure;
2680 }
2681
2682 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2683 goto nla_put_failure;
2684
2685 nlmsg_end(skb, nlh);
2686 return 0;
2687
2688 nla_put_failure:
2689 nlmsg_cancel(skb, nlh);
2690 return -EMSGSIZE;
2691 }
2692
/*
 * Handler registered for PF_INET RTM_GETROUTE requests.
 *
 * Parses the request attributes, performs a route lookup on a freshly
 * allocated skb (input lookup when RTA_IIF is given, output lookup
 * otherwise), writes the answer into that skb and unicasts it to the
 * requesting port.
 *
 * @in_skb: request skb; supplies the socket/namespace and reply port id
 * @nlh:    request netlink header
 * @extack: extended ack passed to nlmsg_parse()
 *
 * Returns the result of rtnl_unicast() on success, or a negative errno:
 * the nlmsg_parse() error, -ENOBUFS when the skb cannot be allocated,
 * -ENODEV for an unknown RTA_IIF, the lookup error, or the error from
 * building the reply. On every failure after allocation the skb is freed.
 */
2693 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2694 struct netlink_ext_ack *extack)
2695 {
2696 struct net *net = sock_net(in_skb->sk);
2697 struct rtmsg *rtm;
2698 struct nlattr *tb[RTA_MAX+1];
2699 struct fib_result res = {};
2700 struct rtable *rt = NULL;
2701 struct flowi4 fl4;
2702 __be32 dst = 0;
2703 __be32 src = 0;
2704 u32 iif;
2705 int err;
2706 int mark;
2707 struct sk_buff *skb;
2708 u32 table_id = RT_TABLE_MAIN;
2709 kuid_t uid;
2710
2711 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2712 extack);
2713 if (err < 0)
2714 goto errout;
2715
2716 rtm = nlmsg_data(nlh);
2717
2718 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2719 if (!skb) {
2720 err = -ENOBUFS;
2721 goto errout;
2722 }
2723
2724 /* Reserve room for dummy headers, this skb can pass
2725 through good chunk of routing engine.
2726 */
2727 skb_reset_mac_header(skb);
2728 skb_reset_network_header(skb);
2729
2730 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2731 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2732 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2733 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
/* Without RTA_UID: input lookups use INVALID_UID, output lookups the caller's uid. */
2734 if (tb[RTA_UID])
2735 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2736 else
2737 uid = (iif ? INVALID_UID : current_uid());
2738
2739 /* Bugfix: need to give ip_route_input enough of an IP header to
2740 * not gag.
2741 */
2742 ip_hdr(skb)->protocol = IPPROTO_UDP;
2743 ip_hdr(skb)->saddr = src;
2744 ip_hdr(skb)->daddr = dst;
2745
2746 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2747
2748 memset(&fl4, 0, sizeof(fl4));
2749 fl4.daddr = dst;
2750 fl4.saddr = src;
2751 fl4.flowi4_tos = rtm->rtm_tos;
2752 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2753 fl4.flowi4_mark = mark;
2754 fl4.flowi4_uid = uid;
2755
2756 rcu_read_lock();
2757
/* RTA_IIF given: look the packet up as if it had arrived on that device. */
2758 if (iif) {
2759 struct net_device *dev;
2760
2761 dev = dev_get_by_index_rcu(net, iif);
2762 if (!dev) {
2763 err = -ENODEV;
2764 goto errout_free;
2765 }
2766
2767 skb->protocol = htons(ETH_P_IP);
2768 skb->dev = dev;
2769 skb->mark = mark;
2770 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2771 dev, &res);
2772
2773 rt = skb_rtable(skb);
/* A successful lookup may still yield a route that carries an error. */
2774 if (err == 0 && rt->dst.error)
2775 err = -rt->dst.error;
2776 } else {
/* No input interface: locally originated (output) lookup. */
2777 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2778 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2779 err = 0;
2780 if (IS_ERR(rt))
2781 err = PTR_ERR(rt);
2782 else
2783 skb_dst_set(skb, &rt->dst);
2784 }
2785
2786 if (err)
2787 goto errout_free;
2788
2789 if (rtm->rtm_flags & RTM_F_NOTIFY)
2790 rt->rt_flags |= RTCF_NOTIFY;
2791
2792 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2793 table_id = rt->rt_table_id;
2794
/* RTM_F_FIB_MATCH: dump the matching FIB entry rather than the resolved route. */
2795 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2796 if (!res.fi) {
2797 err = fib_props[res.type].error;
2798 if (!err)
2799 err = -EHOSTUNREACH;
2800 goto errout_free;
2801 }
2802 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2803 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2804 rt->rt_type, res.prefix, res.prefixlen,
2805 fl4.flowi4_tos, res.fi, 0);
2806 } else {
2807 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2808 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2809 }
2810 if (err < 0)
2811 goto errout_free;
2812
2813 rcu_read_unlock();
2814
2815 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2816 errout:
2817 return err;
2818
/* Failure after the skb was allocated: leave the RCU section and free it. */
2819 errout_free:
2820 rcu_read_unlock();
2821 kfree_skb(skb);
2822 goto errout;
2823 }
2824
2825 void ip_rt_multicast_event(struct in_device *in_dev)
2826 {
2827 rt_cache_flush(dev_net(in_dev->dev));
2828 }
2829
2830 #ifdef CONFIG_SYSCTL
2831 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
2832 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
2833 static int ip_rt_gc_elasticity __read_mostly = 8;
2834
2835 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2836 void __user *buffer,
2837 size_t *lenp, loff_t *ppos)
2838 {
2839 struct net *net = (struct net *)__ctl->extra1;
2840
2841 if (write) {
2842 rt_cache_flush(net);
2843 fnhe_genid_bump(net);
2844 return 0;
2845 }
2846
2847 return -EINVAL;
2848 }
2849
2850 static struct ctl_table ipv4_route_table[] = {
2851 {
2852 .procname = "gc_thresh",
2853 .data = &ipv4_dst_ops.gc_thresh,
2854 .maxlen = sizeof(int),
2855 .mode = 0644,
2856 .proc_handler = proc_dointvec,
2857 },
2858 {
2859 .procname = "max_size",
2860 .data = &ip_rt_max_size,
2861 .maxlen = sizeof(int),
2862 .mode = 0644,
2863 .proc_handler = proc_dointvec,
2864 },
2865 {
2866 /* Deprecated. Use gc_min_interval_ms */
2867
2868 .procname = "gc_min_interval",
2869 .data = &ip_rt_gc_min_interval,
2870 .maxlen = sizeof(int),
2871 .mode = 0644,
2872 .proc_handler = proc_dointvec_jiffies,
2873 },
2874 {
2875 .procname = "gc_min_interval_ms",
2876 .data = &ip_rt_gc_min_interval,
2877 .maxlen = sizeof(int),
2878 .mode = 0644,
2879 .proc_handler = proc_dointvec_ms_jiffies,
2880 },
2881 {
2882 .procname = "gc_timeout",
2883 .data = &ip_rt_gc_timeout,
2884 .maxlen = sizeof(int),
2885 .mode = 0644,
2886 .proc_handler = proc_dointvec_jiffies,
2887 },
2888 {
2889 .procname = "gc_interval",
2890 .data = &ip_rt_gc_interval,
2891 .maxlen = sizeof(int),
2892 .mode = 0644,
2893 .proc_handler = proc_dointvec_jiffies,
2894 },
2895 {
2896 .procname = "redirect_load",
2897 .data = &ip_rt_redirect_load,
2898 .maxlen = sizeof(int),
2899 .mode = 0644,
2900 .proc_handler = proc_dointvec,
2901 },
2902 {
2903 .procname = "redirect_number",
2904 .data = &ip_rt_redirect_number,
2905 .maxlen = sizeof(int),
2906 .mode = 0644,
2907 .proc_handler = proc_dointvec,
2908 },
2909 {
2910 .procname = "redirect_silence",
2911 .data = &ip_rt_redirect_silence,
2912 .maxlen = sizeof(int),
2913 .mode = 0644,
2914 .proc_handler = proc_dointvec,
2915 },
2916 {
2917 .procname = "error_cost",
2918 .data = &ip_rt_error_cost,
2919 .maxlen = sizeof(int),
2920 .mode = 0644,
2921 .proc_handler = proc_dointvec,
2922 },
2923 {
2924 .procname = "error_burst",
2925 .data = &ip_rt_error_burst,
2926 .maxlen = sizeof(int),
2927 .mode = 0644,
2928 .proc_handler = proc_dointvec,
2929 },
2930 {
2931 .procname = "gc_elasticity",
2932 .data = &ip_rt_gc_elasticity,
2933 .maxlen = sizeof(int),
2934 .mode = 0644,
2935 .proc_handler = proc_dointvec,
2936 },
2937 {
2938 .procname = "mtu_expires",
2939 .data = &ip_rt_mtu_expires,
2940 .maxlen = sizeof(int),
2941 .mode = 0644,
2942 .proc_handler = proc_dointvec_jiffies,
2943 },
2944 {
2945 .procname = "min_pmtu",
2946 .data = &ip_rt_min_pmtu,
2947 .maxlen = sizeof(int),
2948 .mode = 0644,
2949 .proc_handler = proc_dointvec_minmax,
2950 .extra1 = &ip_min_valid_pmtu,
2951 },
2952 {
2953 .procname = "min_adv_mss",
2954 .data = &ip_rt_min_advmss,
2955 .maxlen = sizeof(int),
2956 .mode = 0644,
2957 .proc_handler = proc_dointvec,
2958 },
2959 { }
2960 };
2961
2962 static struct ctl_table ipv4_route_flush_table[] = {
2963 {
2964 .procname = "flush",
2965 .maxlen = sizeof(int),
2966 .mode = 0200,
2967 .proc_handler = ipv4_sysctl_rtcache_flush,
2968 },
2969 { },
2970 };
2971
2972 static __net_init int sysctl_route_net_init(struct net *net)
2973 {
2974 struct ctl_table *tbl;
2975
2976 tbl = ipv4_route_flush_table;
2977 if (!net_eq(net, &init_net)) {
2978 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2979 if (!tbl)
2980 goto err_dup;
2981
2982 /* Don't export sysctls to unprivileged users */
2983 if (net->user_ns != &init_user_ns)
2984 tbl[0].procname = NULL;
2985 }
2986 tbl[0].extra1 = net;
2987
2988 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2989 if (!net->ipv4.route_hdr)
2990 goto err_reg;
2991 return 0;
2992
2993 err_reg:
2994 if (tbl != ipv4_route_flush_table)
2995 kfree(tbl);
2996 err_dup:
2997 return -ENOMEM;
2998 }
2999
3000 static __net_exit void sysctl_route_net_exit(struct net *net)
3001 {
3002 struct ctl_table *tbl;
3003
3004 tbl = net->ipv4.route_hdr->ctl_table_arg;
3005 unregister_net_sysctl_table(net->ipv4.route_hdr);
3006 BUG_ON(tbl == ipv4_route_flush_table);
3007 kfree(tbl);
3008 }
3009
3010 static __net_initdata struct pernet_operations sysctl_route_ops = {
3011 .init = sysctl_route_net_init,
3012 .exit = sysctl_route_net_exit,
3013 };
3014 #endif
3015
3016 static __net_init int rt_genid_init(struct net *net)
3017 {
3018 atomic_set(&net->ipv4.rt_genid, 0);
3019 atomic_set(&net->fnhe_genid, 0);
3020 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3021 return 0;
3022 }
3023
3024 static __net_initdata struct pernet_operations rt_genid_ops = {
3025 .init = rt_genid_init,
3026 };
3027
3028 static int __net_init ipv4_inetpeer_init(struct net *net)
3029 {
3030 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3031
3032 if (!bp)
3033 return -ENOMEM;
3034 inet_peer_base_init(bp);
3035 net->ipv4.peers = bp;
3036 return 0;
3037 }
3038
3039 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3040 {
3041 struct inet_peer_base *bp = net->ipv4.peers;
3042
3043 net->ipv4.peers = NULL;
3044 inetpeer_invalidate_tree(bp);
3045 kfree(bp);
3046 }
3047
3048 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3049 .init = ipv4_inetpeer_init,
3050 .exit = ipv4_inetpeer_exit,
3051 };
3052
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU accounting area; allocated in ip_rt_init() with 256 entries. */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3056
3057 int __init ip_rt_init(void)
3058 {
3059 int cpu;
3060
3061 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3062 if (!ip_idents)
3063 panic("IP: failed to allocate ip_idents\n");
3064
3065 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3066
3067 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3068 if (!ip_tstamps)
3069 panic("IP: failed to allocate ip_tstamps\n");
3070
3071 for_each_possible_cpu(cpu) {
3072 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3073
3074 INIT_LIST_HEAD(&ul->head);
3075 spin_lock_init(&ul->lock);
3076 }
3077 #ifdef CONFIG_IP_ROUTE_CLASSID
3078 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3079 if (!ip_rt_acct)
3080 panic("IP: failed to allocate ip_rt_acct\n");
3081 #endif
3082
3083 ipv4_dst_ops.kmem_cachep =
3084 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3085 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3086
3087 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3088
3089 if (dst_entries_init(&ipv4_dst_ops) < 0)
3090 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3091
3092 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3093 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3094
3095 ipv4_dst_ops.gc_thresh = ~0;
3096 ip_rt_max_size = INT_MAX;
3097
3098 devinet_init();
3099 ip_fib_init();
3100
3101 if (ip_rt_proc_init())
3102 pr_err("Unable to create route proc files\n");
3103 #ifdef CONFIG_XFRM
3104 xfrm_init();
3105 xfrm4_init();
3106 #endif
3107 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3108 RTNL_FLAG_DOIT_UNLOCKED);
3109
3110 #ifdef CONFIG_SYSCTL
3111 register_pernet_subsys(&sysctl_route_ops);
3112 #endif
3113 register_pernet_subsys(&rt_genid_ops);
3114 register_pernet_subsys(&ipv4_inetpeer_ops);
3115 return 0;
3116 }
3117
3118 #ifdef CONFIG_SYSCTL
3119 /*
3120 * We really need to sanitize the damn ipv4 init order, then all
3121 * this nonsense will go away.
3122 */
3123 void __init ip_static_sysctl_init(void)
3124 {
3125 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3126 }
3127 #endif