net/ipv4/route.c from mirror_ubuntu-bionic-kernel.git (tag Ubuntu-4.15.0-96.97), git.proxmox.com mirror
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
39 *
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <linux/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #include <linux/kmemleak.h>
112 #endif
113 #include <net/secure_seq.h>
114 #include <net/ip_tunnels.h>
115 #include <net/l3mdev.h>
116
117 #include "fib_lookup.h"
118
119 #define RT_FL_TOS(oldflp4) \
120 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
121
122 #define RT_GC_TIMEOUT (300*HZ)
123
124 static int ip_rt_max_size;
125 static int ip_rt_redirect_number __read_mostly = 9;
126 static int ip_rt_redirect_load __read_mostly = HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly = HZ;
129 static int ip_rt_error_burst __read_mostly = 5 * HZ;
130 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
131 static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly = 256;
133
134 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
135
136 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
137
138 /*
139 * Interface to generic destination cache.
140 */
141
142 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
143 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
144 static unsigned int ipv4_mtu(const struct dst_entry *dst);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void ipv4_link_failure(struct sk_buff *skb);
147 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
148 struct sk_buff *skb, u32 mtu,
149 bool confirm_neigh);
150 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
151 struct sk_buff *skb);
152 static void ipv4_dst_destroy(struct dst_entry *dst);
153
154 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
155 {
156 WARN_ON(1);
157 return NULL;
158 }
159
160 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
161 struct sk_buff *skb,
162 const void *daddr);
163 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
164
165 static struct dst_ops ipv4_dst_ops = {
166 .family = AF_INET,
167 .check = ipv4_dst_check,
168 .default_advmss = ipv4_default_advmss,
169 .mtu = ipv4_mtu,
170 .cow_metrics = ipv4_cow_metrics,
171 .destroy = ipv4_dst_destroy,
172 .negative_advice = ipv4_negative_advice,
173 .link_failure = ipv4_link_failure,
174 .update_pmtu = ip_rt_update_pmtu,
175 .redirect = ip_do_redirect,
176 .local_out = __ip_local_out,
177 .neigh_lookup = ipv4_neigh_lookup,
178 .confirm_neigh = ipv4_confirm_neigh,
179 };
180
181 #define ECN_OR_COST(class) TC_PRIO_##class
182
183 const __u8 ip_tos2prio[16] = {
184 TC_PRIO_BESTEFFORT,
185 ECN_OR_COST(BESTEFFORT),
186 TC_PRIO_BESTEFFORT,
187 ECN_OR_COST(BESTEFFORT),
188 TC_PRIO_BULK,
189 ECN_OR_COST(BULK),
190 TC_PRIO_BULK,
191 ECN_OR_COST(BULK),
192 TC_PRIO_INTERACTIVE,
193 ECN_OR_COST(INTERACTIVE),
194 TC_PRIO_INTERACTIVE,
195 ECN_OR_COST(INTERACTIVE),
196 TC_PRIO_INTERACTIVE_BULK,
197 ECN_OR_COST(INTERACTIVE_BULK),
198 TC_PRIO_INTERACTIVE_BULK,
199 ECN_OR_COST(INTERACTIVE_BULK)
200 };
201 EXPORT_SYMBOL(ip_tos2prio);
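
For reference, this table is indexed with the RFC 1349 TOS bits shifted right by one; the rt_tos2priority() helper in include/net/route.h performs the equivalent ip_tos2prio[IPTOS_TOS(tos) >> 1] lookup. A minimal user-space sketch of that mapping, with the TC_PRIO_* values copied out as plain integers purely for illustration:

#include <stdio.h>

#define IPTOS_TOS_MASK	0x1E
#define IPTOS_TOS(tos)	((tos) & IPTOS_TOS_MASK)

/* Assumption: these mirror TC_PRIO_BESTEFFORT=0, TC_PRIO_BULK=2,
 * TC_PRIO_INTERACTIVE=6 and TC_PRIO_INTERACTIVE_BULK=4 from pkt_sched.h.
 */
static const unsigned char tos2prio[16] = {
	0, 0, 0, 0,	/* best effort */
	2, 2, 2, 2,	/* bulk */
	6, 6, 6, 6,	/* interactive */
	4, 4, 4, 4,	/* interactive bulk */
};

int main(void)
{
	unsigned char tos = 0x10;	/* IPTOS_LOWDELAY */

	printf("TOS 0x%02x -> skb priority %u\n",
	       tos, tos2prio[IPTOS_TOS(tos) >> 1]);
	return 0;
}
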
202
203 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
204 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
205
206 #ifdef CONFIG_PROC_FS
207 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
208 {
209 if (*pos)
210 return NULL;
211 return SEQ_START_TOKEN;
212 }
213
214 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
215 {
216 ++*pos;
217 return NULL;
218 }
219
220 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
221 {
222 }
223
224 static int rt_cache_seq_show(struct seq_file *seq, void *v)
225 {
226 if (v == SEQ_START_TOKEN)
227 seq_printf(seq, "%-127s\n",
228 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
229 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
230 "HHUptod\tSpecDst");
231 return 0;
232 }
233
234 static const struct seq_operations rt_cache_seq_ops = {
235 .start = rt_cache_seq_start,
236 .next = rt_cache_seq_next,
237 .stop = rt_cache_seq_stop,
238 .show = rt_cache_seq_show,
239 };
240
241 static int rt_cache_seq_open(struct inode *inode, struct file *file)
242 {
243 return seq_open(file, &rt_cache_seq_ops);
244 }
245
246 static const struct file_operations rt_cache_seq_fops = {
247 .owner = THIS_MODULE,
248 .open = rt_cache_seq_open,
249 .read = seq_read,
250 .llseek = seq_lseek,
251 .release = seq_release,
252 };
253
254
255 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
256 {
257 int cpu;
258
259 if (*pos == 0)
260 return SEQ_START_TOKEN;
261
262 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
263 if (!cpu_possible(cpu))
264 continue;
265 *pos = cpu+1;
266 return &per_cpu(rt_cache_stat, cpu);
267 }
268 return NULL;
269 }
270
271 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
272 {
273 int cpu;
274
275 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
276 if (!cpu_possible(cpu))
277 continue;
278 *pos = cpu+1;
279 return &per_cpu(rt_cache_stat, cpu);
280 }
281 return NULL;
282
283 }
284
285 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
286 {
287
288 }
289
290 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
291 {
292 struct rt_cache_stat *st = v;
293
294 if (v == SEQ_START_TOKEN) {
295 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
296 return 0;
297 }
298
299 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
300 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
301 dst_entries_get_slow(&ipv4_dst_ops),
302 0, /* st->in_hit */
303 st->in_slow_tot,
304 st->in_slow_mc,
305 st->in_no_route,
306 st->in_brd,
307 st->in_martian_dst,
308 st->in_martian_src,
309
310 0, /* st->out_hit */
311 st->out_slow_tot,
312 st->out_slow_mc,
313
314 0, /* st->gc_total */
315 0, /* st->gc_ignored */
316 0, /* st->gc_goal_miss */
317 0, /* st->gc_dst_overflow */
318 0, /* st->in_hlist_search */
319 0 /* st->out_hlist_search */
320 );
321 return 0;
322 }
323
324 static const struct seq_operations rt_cpu_seq_ops = {
325 .start = rt_cpu_seq_start,
326 .next = rt_cpu_seq_next,
327 .stop = rt_cpu_seq_stop,
328 .show = rt_cpu_seq_show,
329 };
330
331
332 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
333 {
334 return seq_open(file, &rt_cpu_seq_ops);
335 }
336
337 static const struct file_operations rt_cpu_seq_fops = {
338 .owner = THIS_MODULE,
339 .open = rt_cpu_seq_open,
340 .read = seq_read,
341 .llseek = seq_lseek,
342 .release = seq_release,
343 };
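
Both seq_file interfaces above are plain read-only proc files: /proc/net/rt_cache now emits only the legacy header line (the per-flow route cache itself is gone), while /proc/net/stat/rt_cache emits one line of hex counters per possible CPU in the column order printed by rt_cpu_seq_show(). A minimal user-space reader, assuming both files exist on the running kernel:

#include <stdio.h>

static void dump(const char *path)
{
	char line[512];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	printf("== %s ==\n", path);
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
}

int main(void)
{
	dump("/proc/net/rt_cache");		/* header line only */
	dump("/proc/net/stat/rt_cache");	/* per-CPU hex counters */
	return 0;
}
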
344
345 #ifdef CONFIG_IP_ROUTE_CLASSID
346 static int rt_acct_proc_show(struct seq_file *m, void *v)
347 {
348 struct ip_rt_acct *dst, *src;
349 unsigned int i, j;
350
351 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
352 if (!dst)
353 return -ENOMEM;
354
355 for_each_possible_cpu(i) {
356 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
357 for (j = 0; j < 256; j++) {
358 dst[j].o_bytes += src[j].o_bytes;
359 dst[j].o_packets += src[j].o_packets;
360 dst[j].i_bytes += src[j].i_bytes;
361 dst[j].i_packets += src[j].i_packets;
362 }
363 }
364
365 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
366 kfree(dst);
367 return 0;
368 }
369
370 static int rt_acct_proc_open(struct inode *inode, struct file *file)
371 {
372 return single_open(file, rt_acct_proc_show, NULL);
373 }
374
375 static const struct file_operations rt_acct_proc_fops = {
376 .owner = THIS_MODULE,
377 .open = rt_acct_proc_open,
378 .read = seq_read,
379 .llseek = seq_lseek,
380 .release = single_release,
381 };
382 #endif
383
384 static int __net_init ip_rt_do_proc_init(struct net *net)
385 {
386 struct proc_dir_entry *pde;
387
388 pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
389 &rt_cache_seq_fops);
390 if (!pde)
391 goto err1;
392
393 pde = proc_create("rt_cache", S_IRUGO,
394 net->proc_net_stat, &rt_cpu_seq_fops);
395 if (!pde)
396 goto err2;
397
398 #ifdef CONFIG_IP_ROUTE_CLASSID
399 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
400 if (!pde)
401 goto err3;
402 #endif
403 return 0;
404
405 #ifdef CONFIG_IP_ROUTE_CLASSID
406 err3:
407 remove_proc_entry("rt_cache", net->proc_net_stat);
408 #endif
409 err2:
410 remove_proc_entry("rt_cache", net->proc_net);
411 err1:
412 return -ENOMEM;
413 }
414
415 static void __net_exit ip_rt_do_proc_exit(struct net *net)
416 {
417 remove_proc_entry("rt_cache", net->proc_net_stat);
418 remove_proc_entry("rt_cache", net->proc_net);
419 #ifdef CONFIG_IP_ROUTE_CLASSID
420 remove_proc_entry("rt_acct", net->proc_net);
421 #endif
422 }
423
424 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
425 .init = ip_rt_do_proc_init,
426 .exit = ip_rt_do_proc_exit,
427 };
428
429 static int __init ip_rt_proc_init(void)
430 {
431 return register_pernet_subsys(&ip_rt_proc_ops);
432 }
433
434 #else
435 static inline int ip_rt_proc_init(void)
436 {
437 return 0;
438 }
439 #endif /* CONFIG_PROC_FS */
440
441 static inline bool rt_is_expired(const struct rtable *rth)
442 {
443 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
444 }
445
446 void rt_cache_flush(struct net *net)
447 {
448 rt_genid_bump_ipv4(net);
449 }
450
451 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
452 struct sk_buff *skb,
453 const void *daddr)
454 {
455 struct net_device *dev = dst->dev;
456 const __be32 *pkey = daddr;
457 const struct rtable *rt;
458 struct neighbour *n;
459
460 rt = (const struct rtable *) dst;
461 if (rt->rt_gateway)
462 pkey = (const __be32 *) &rt->rt_gateway;
463 else if (skb)
464 pkey = &ip_hdr(skb)->daddr;
465
466 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
467 if (n)
468 return n;
469 return neigh_create(&arp_tbl, pkey, dev);
470 }
471
472 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
473 {
474 struct net_device *dev = dst->dev;
475 const __be32 *pkey = daddr;
476 const struct rtable *rt;
477
478 rt = (const struct rtable *)dst;
479 if (rt->rt_gateway)
480 pkey = (const __be32 *)&rt->rt_gateway;
481 else if (!daddr ||
482 (rt->rt_flags &
483 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
484 return;
485
486 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
487 }
488
489 #define IP_IDENTS_SZ 2048u
490
491 static atomic_t *ip_idents __read_mostly;
492 static u32 *ip_tstamps __read_mostly;
493
494 /* In order to protect privacy, we add a perturbation to identifiers
495 * if one generator is seldom used. This makes it hard for an attacker
496 * to infer how many packets were sent between two points in time.
497 */
498 u32 ip_idents_reserve(u32 hash, int segs)
499 {
500 u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
501 atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
502 u32 old = READ_ONCE(*p_tstamp);
503 u32 now = (u32)jiffies;
504 u32 new, delta = 0;
505
506 if (old != now && cmpxchg(p_tstamp, old, now) == old)
507 delta = prandom_u32_max(now - old);
508
509 /* Do not use atomic_add_return() as it makes UBSAN unhappy */
510 do {
511 old = (u32)atomic_read(p_id);
512 new = old + delta + segs;
513 } while (atomic_cmpxchg(p_id, old, new) != old);
514
515 return new - segs;
516 }
517 EXPORT_SYMBOL(ip_idents_reserve);
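
A single-threaded user-space model may make the perturbation easier to follow: whenever a bucket has been idle (its stored timestamp differs from "now"), a random delta bounded by the idle time is folded into the counter, so consecutive IDs no longer reveal how many packets were emitted in between. This is only a sketch; the kernel version works on hashed per-bucket atomics and uses jiffies as the clock.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static uint32_t id_state;	/* models one ip_idents[] bucket */
static uint32_t tstamp;		/* models the matching ip_tstamps[] slot */

static uint32_t reserve_ids(uint32_t now, uint32_t segs)
{
	uint32_t delta = 0;

	if (tstamp != now) {		/* bucket was idle: add a random gap */
		delta = (uint32_t)rand() % (now - tstamp);
		tstamp = now;
	}
	id_state += delta + segs;
	return id_state - segs;		/* first ID of the reserved block */
}

int main(void)
{
	srand((unsigned int)time(NULL));
	printf("burst of 3 IDs starts at %u\n", reserve_ids(100, 3));
	printf("later burst starts at %u\n", reserve_ids(250, 1));
	return 0;
}
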
518
519 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
520 {
521 u32 hash, id;
522
523 /* Note the following code is not safe, but this is okay. */
524 if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
525 get_random_bytes(&net->ipv4.ip_id_key,
526 sizeof(net->ipv4.ip_id_key));
527
528 hash = siphash_3u32((__force u32)iph->daddr,
529 (__force u32)iph->saddr,
530 iph->protocol,
531 &net->ipv4.ip_id_key);
532 id = ip_idents_reserve(hash, segs);
533 iph->id = htons(id);
534 }
535 EXPORT_SYMBOL(__ip_select_ident);
536
537 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
538 const struct sock *sk,
539 const struct iphdr *iph,
540 int oif, u8 tos,
541 u8 prot, u32 mark, int flow_flags)
542 {
543 if (sk) {
544 const struct inet_sock *inet = inet_sk(sk);
545
546 oif = sk->sk_bound_dev_if;
547 mark = sk->sk_mark;
548 tos = RT_CONN_FLAGS(sk);
549 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
550 }
551 flowi4_init_output(fl4, oif, mark, tos,
552 RT_SCOPE_UNIVERSE, prot,
553 flow_flags,
554 iph->daddr, iph->saddr, 0, 0,
555 sock_net_uid(net, sk));
556 }
557
558 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
559 const struct sock *sk)
560 {
561 const struct net *net = dev_net(skb->dev);
562 const struct iphdr *iph = ip_hdr(skb);
563 int oif = skb->dev->ifindex;
564 u8 tos = RT_TOS(iph->tos);
565 u8 prot = iph->protocol;
566 u32 mark = skb->mark;
567
568 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
569 }
570
571 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
572 {
573 const struct inet_sock *inet = inet_sk(sk);
574 const struct ip_options_rcu *inet_opt;
575 __be32 daddr = inet->inet_daddr;
576
577 rcu_read_lock();
578 inet_opt = rcu_dereference(inet->inet_opt);
579 if (inet_opt && inet_opt->opt.srr)
580 daddr = inet_opt->opt.faddr;
581 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
582 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
583 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
584 inet_sk_flowi_flags(sk),
585 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
586 rcu_read_unlock();
587 }
588
589 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
590 const struct sk_buff *skb)
591 {
592 if (skb)
593 build_skb_flow_key(fl4, skb, sk);
594 else
595 build_sk_flow_key(fl4, sk);
596 }
597
598 static DEFINE_SPINLOCK(fnhe_lock);
599
600 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
601 {
602 struct rtable *rt;
603
604 rt = rcu_dereference(fnhe->fnhe_rth_input);
605 if (rt) {
606 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
607 dst_dev_put(&rt->dst);
608 dst_release(&rt->dst);
609 }
610 rt = rcu_dereference(fnhe->fnhe_rth_output);
611 if (rt) {
612 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
613 dst_dev_put(&rt->dst);
614 dst_release(&rt->dst);
615 }
616 }
617
618 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
619 {
620 struct fib_nh_exception *fnhe, *oldest;
621
622 oldest = rcu_dereference(hash->chain);
623 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
624 fnhe = rcu_dereference(fnhe->fnhe_next)) {
625 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
626 oldest = fnhe;
627 }
628 fnhe_flush_routes(oldest);
629 return oldest;
630 }
631
632 static inline u32 fnhe_hashfun(__be32 daddr)
633 {
634 static u32 fnhe_hashrnd __read_mostly;
635 u32 hval;
636
637 net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
638 hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
639 return hash_32(hval, FNHE_HASH_SHIFT);
640 }
641
642 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
643 {
644 rt->rt_pmtu = fnhe->fnhe_pmtu;
645 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
646 rt->dst.expires = fnhe->fnhe_expires;
647
648 if (fnhe->fnhe_gw) {
649 rt->rt_flags |= RTCF_REDIRECTED;
650 rt->rt_gateway = fnhe->fnhe_gw;
651 rt->rt_uses_gateway = 1;
652 }
653 }
654
655 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
656 u32 pmtu, bool lock, unsigned long expires)
657 {
658 struct fnhe_hash_bucket *hash;
659 struct fib_nh_exception *fnhe;
660 struct rtable *rt;
661 u32 genid, hval;
662 unsigned int i;
663 int depth;
664
665 genid = fnhe_genid(dev_net(nh->nh_dev));
666 hval = fnhe_hashfun(daddr);
667
668 spin_lock_bh(&fnhe_lock);
669
670 hash = rcu_dereference(nh->nh_exceptions);
671 if (!hash) {
672 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
673 if (!hash)
674 goto out_unlock;
675 rcu_assign_pointer(nh->nh_exceptions, hash);
676 }
677
678 hash += hval;
679
680 depth = 0;
681 for (fnhe = rcu_dereference(hash->chain); fnhe;
682 fnhe = rcu_dereference(fnhe->fnhe_next)) {
683 if (fnhe->fnhe_daddr == daddr)
684 break;
685 depth++;
686 }
687
688 if (fnhe) {
689 if (fnhe->fnhe_genid != genid)
690 fnhe->fnhe_genid = genid;
691 if (gw)
692 fnhe->fnhe_gw = gw;
693 if (pmtu) {
694 fnhe->fnhe_pmtu = pmtu;
695 fnhe->fnhe_mtu_locked = lock;
696 }
697 fnhe->fnhe_expires = max(1UL, expires);
698 /* Update all cached dsts too */
699 rt = rcu_dereference(fnhe->fnhe_rth_input);
700 if (rt)
701 fill_route_from_fnhe(rt, fnhe);
702 rt = rcu_dereference(fnhe->fnhe_rth_output);
703 if (rt)
704 fill_route_from_fnhe(rt, fnhe);
705 } else {
706 if (depth > FNHE_RECLAIM_DEPTH)
707 fnhe = fnhe_oldest(hash);
708 else {
709 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
710 if (!fnhe)
711 goto out_unlock;
712
713 fnhe->fnhe_next = hash->chain;
714 rcu_assign_pointer(hash->chain, fnhe);
715 }
716 fnhe->fnhe_genid = genid;
717 fnhe->fnhe_daddr = daddr;
718 fnhe->fnhe_gw = gw;
719 fnhe->fnhe_pmtu = pmtu;
720 fnhe->fnhe_mtu_locked = lock;
721 fnhe->fnhe_expires = max(1UL, expires);
722
723 /* Exception created; mark the cached routes for the nexthop
724 * stale, so that anyone holding them rechecks whether this exception
725 * applies.
726 */
727 rt = rcu_dereference(nh->nh_rth_input);
728 if (rt)
729 rt->dst.obsolete = DST_OBSOLETE_KILL;
730
731 for_each_possible_cpu(i) {
732 struct rtable __rcu **prt;
733 prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
734 rt = rcu_dereference(*prt);
735 if (rt)
736 rt->dst.obsolete = DST_OBSOLETE_KILL;
737 }
738 }
739
740 fnhe->fnhe_stamp = jiffies;
741
742 out_unlock:
743 spin_unlock_bh(&fnhe_lock);
744 }
745
746 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
747 bool kill_route)
748 {
749 __be32 new_gw = icmp_hdr(skb)->un.gateway;
750 __be32 old_gw = ip_hdr(skb)->saddr;
751 struct net_device *dev = skb->dev;
752 struct in_device *in_dev;
753 struct fib_result res;
754 struct neighbour *n;
755 struct net *net;
756
757 switch (icmp_hdr(skb)->code & 7) {
758 case ICMP_REDIR_NET:
759 case ICMP_REDIR_NETTOS:
760 case ICMP_REDIR_HOST:
761 case ICMP_REDIR_HOSTTOS:
762 break;
763
764 default:
765 return;
766 }
767
768 if (rt->rt_gateway != old_gw)
769 return;
770
771 in_dev = __in_dev_get_rcu(dev);
772 if (!in_dev)
773 return;
774
775 net = dev_net(dev);
776 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
777 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
778 ipv4_is_zeronet(new_gw))
779 goto reject_redirect;
780
781 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
782 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
783 goto reject_redirect;
784 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
785 goto reject_redirect;
786 } else {
787 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
788 goto reject_redirect;
789 }
790
791 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
792 if (!n)
793 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
794 if (!IS_ERR(n)) {
795 if (!(n->nud_state & NUD_VALID)) {
796 neigh_event_send(n, NULL);
797 } else {
798 if (fib_lookup(net, fl4, &res, 0) == 0) {
799 struct fib_nh *nh = &FIB_RES_NH(res);
800
801 update_or_create_fnhe(nh, fl4->daddr, new_gw,
802 0, false,
803 jiffies + ip_rt_gc_timeout);
804 }
805 if (kill_route)
806 rt->dst.obsolete = DST_OBSOLETE_KILL;
807 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
808 }
809 neigh_release(n);
810 }
811 return;
812
813 reject_redirect:
814 #ifdef CONFIG_IP_ROUTE_VERBOSE
815 if (IN_DEV_LOG_MARTIANS(in_dev)) {
816 const struct iphdr *iph = (const struct iphdr *) skb->data;
817 __be32 daddr = iph->daddr;
818 __be32 saddr = iph->saddr;
819
820 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
821 " Advised path = %pI4 -> %pI4\n",
822 &old_gw, dev->name, &new_gw,
823 &saddr, &daddr);
824 }
825 #endif
826 ;
827 }
828
829 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
830 {
831 struct rtable *rt;
832 struct flowi4 fl4;
833 const struct iphdr *iph = (const struct iphdr *) skb->data;
834 struct net *net = dev_net(skb->dev);
835 int oif = skb->dev->ifindex;
836 u8 tos = RT_TOS(iph->tos);
837 u8 prot = iph->protocol;
838 u32 mark = skb->mark;
839
840 rt = (struct rtable *) dst;
841
842 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
843 __ip_do_redirect(rt, skb, &fl4, true);
844 }
845
846 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
847 {
848 struct rtable *rt = (struct rtable *)dst;
849 struct dst_entry *ret = dst;
850
851 if (rt) {
852 if (dst->obsolete > 0) {
853 ip_rt_put(rt);
854 ret = NULL;
855 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
856 rt->dst.expires) {
857 ip_rt_put(rt);
858 ret = NULL;
859 }
860 }
861 return ret;
862 }
863
864 /*
865 * Algorithm:
866 * 1. The first ip_rt_redirect_number redirects are sent
867 * with exponential backoff, then we stop sending them at all,
868 * assuming that the host ignores our redirects.
869 * 2. If we did not see packets requiring redirects
870 * during ip_rt_redirect_silence, we assume that the host
871 * forgot the redirected route and start sending redirects again.
872 *
873 * This algorithm is much cheaper and more intelligent than dumb load limiting
874 * in icmp.c.
875 *
876 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
877 * and "frag. need" (breaks PMTU discovery) in icmp.c.
878 */
879
880 void ip_rt_send_redirect(struct sk_buff *skb)
881 {
882 struct rtable *rt = skb_rtable(skb);
883 struct in_device *in_dev;
884 struct inet_peer *peer;
885 struct net *net;
886 int log_martians;
887 int vif;
888
889 rcu_read_lock();
890 in_dev = __in_dev_get_rcu(rt->dst.dev);
891 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
892 rcu_read_unlock();
893 return;
894 }
895 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
896 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
897 rcu_read_unlock();
898
899 net = dev_net(rt->dst.dev);
900 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
901 if (!peer) {
902 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
903 rt_nexthop(rt, ip_hdr(skb)->daddr));
904 return;
905 }
906
907 /* No redirected packets during ip_rt_redirect_silence;
908 * reset the algorithm.
909 */
910 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
911 peer->rate_tokens = 0;
912 peer->n_redirects = 0;
913 }
914
915 /* Too many ignored redirects; do not send anything and just
916 * set peer->rate_last to the time of the last seen redirected packet.
917 */
918 if (peer->n_redirects >= ip_rt_redirect_number) {
919 peer->rate_last = jiffies;
920 goto out_put_peer;
921 }
922
923 /* Check for load limit; set rate_last to the latest sent
924 * redirect.
925 */
926 if (peer->rate_tokens == 0 ||
927 time_after(jiffies,
928 (peer->rate_last +
929 (ip_rt_redirect_load << peer->n_redirects)))) {
930 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
931
932 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
933 peer->rate_last = jiffies;
934 ++peer->n_redirects;
935 #ifdef CONFIG_IP_ROUTE_VERBOSE
936 if (log_martians &&
937 peer->n_redirects == ip_rt_redirect_number)
938 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
939 &ip_hdr(skb)->saddr, inet_iif(skb),
940 &ip_hdr(skb)->daddr, &gw);
941 #endif
942 }
943 out_put_peer:
944 inet_putpeer(peer);
945 }
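
The load-limit term in the test above, ip_rt_redirect_load << n_redirects, doubles with every redirect already sent, so a peer that keeps ignoring redirects is contacted less and less often until ip_rt_redirect_number is reached. A small user-space sketch of that schedule; the HZ value is an assumption (CONFIG_HZ=250), not something this file fixes:

#include <stdio.h>

int main(void)
{
	const unsigned int hz = 250;			/* assumed CONFIG_HZ */
	const unsigned int redirect_load = hz / 50;	/* jiffies, ~20 ms */
	const unsigned int redirect_number = 9;
	unsigned int sent;

	/* After `sent` redirects, the next one is load-limited until
	 * rate_last + (redirect_load << sent) jiffies have elapsed.
	 */
	for (sent = 1; sent < redirect_number; sent++)
		printf("gap before redirect %u: >= %u ms\n",
		       sent + 1, (redirect_load << sent) * 1000 / hz);
	return 0;
}
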
946
947 static int ip_error(struct sk_buff *skb)
948 {
949 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
950 struct rtable *rt = skb_rtable(skb);
951 struct inet_peer *peer;
952 unsigned long now;
953 struct net *net;
954 bool send;
955 int code;
956
957 /* IP on this device is disabled. */
958 if (!in_dev)
959 goto out;
960
961 net = dev_net(rt->dst.dev);
962 if (!IN_DEV_FORWARD(in_dev)) {
963 switch (rt->dst.error) {
964 case EHOSTUNREACH:
965 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
966 break;
967
968 case ENETUNREACH:
969 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
970 break;
971 }
972 goto out;
973 }
974
975 switch (rt->dst.error) {
976 case EINVAL:
977 default:
978 goto out;
979 case EHOSTUNREACH:
980 code = ICMP_HOST_UNREACH;
981 break;
982 case ENETUNREACH:
983 code = ICMP_NET_UNREACH;
984 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
985 break;
986 case EACCES:
987 code = ICMP_PKT_FILTERED;
988 break;
989 }
990
991 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
992 l3mdev_master_ifindex(skb->dev), 1);
993
994 send = true;
995 if (peer) {
996 now = jiffies;
997 peer->rate_tokens += now - peer->rate_last;
998 if (peer->rate_tokens > ip_rt_error_burst)
999 peer->rate_tokens = ip_rt_error_burst;
1000 peer->rate_last = now;
1001 if (peer->rate_tokens >= ip_rt_error_cost)
1002 peer->rate_tokens -= ip_rt_error_cost;
1003 else
1004 send = false;
1005 inet_putpeer(peer);
1006 }
1007 if (send)
1008 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1009
1010 out: kfree_skb(skb);
1011 return 0;
1012 }
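
The peer rate limiting above is a plain token bucket: tokens accrue one per elapsed jiffy, are capped at ip_rt_error_burst (5 * HZ), and each ICMP error sent costs ip_rt_error_cost (HZ), so at most five errors go out back-to-back before the rate settles to roughly one per second. A user-space model of the same bookkeeping, again assuming HZ=250:

#include <stdbool.h>
#include <stdio.h>

#define HZ		250		/* assumed CONFIG_HZ */
#define ERROR_BURST	(5 * HZ)	/* ip_rt_error_burst */
#define ERROR_COST	HZ		/* ip_rt_error_cost */

static unsigned long rate_tokens;
static unsigned long rate_last;

static bool may_send_error(unsigned long now)
{
	rate_tokens += now - rate_last;
	if (rate_tokens > ERROR_BURST)
		rate_tokens = ERROR_BURST;
	rate_last = now;
	if (rate_tokens >= ERROR_COST) {
		rate_tokens -= ERROR_COST;
		return true;
	}
	return false;
}

int main(void)
{
	int i;

	/* Eight errors within the same jiffy: only the stored burst
	 * allowance (five costs' worth of tokens) gets through.
	 */
	for (i = 0; i < 8; i++)
		printf("error %d: %s\n", i,
		       may_send_error(5 * HZ) ? "sent" : "suppressed");
	return 0;
}
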
1013
1014 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1015 {
1016 struct dst_entry *dst = &rt->dst;
1017 u32 old_mtu = ipv4_mtu(dst);
1018 struct fib_result res;
1019 bool lock = false;
1020
1021 if (ip_mtu_locked(dst))
1022 return;
1023
1024 if (old_mtu < mtu)
1025 return;
1026
1027 if (mtu < ip_rt_min_pmtu) {
1028 lock = true;
1029 mtu = min(old_mtu, ip_rt_min_pmtu);
1030 }
1031
1032 if (rt->rt_pmtu == mtu && !lock &&
1033 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1034 return;
1035
1036 rcu_read_lock();
1037 if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1038 struct fib_nh *nh = &FIB_RES_NH(res);
1039
1040 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1041 jiffies + ip_rt_mtu_expires);
1042 }
1043 rcu_read_unlock();
1044 }
1045
1046 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1047 struct sk_buff *skb, u32 mtu,
1048 bool confirm_neigh)
1049 {
1050 struct rtable *rt = (struct rtable *) dst;
1051 struct flowi4 fl4;
1052
1053 ip_rt_build_flow_key(&fl4, sk, skb);
1054 __ip_rt_update_pmtu(rt, &fl4, mtu);
1055 }
1056
1057 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1058 int oif, u32 mark, u8 protocol, int flow_flags)
1059 {
1060 const struct iphdr *iph = (const struct iphdr *) skb->data;
1061 struct flowi4 fl4;
1062 struct rtable *rt;
1063
1064 if (!mark)
1065 mark = IP4_REPLY_MARK(net, skb->mark);
1066
1067 __build_flow_key(net, &fl4, NULL, iph, oif,
1068 RT_TOS(iph->tos), protocol, mark, flow_flags);
1069 rt = __ip_route_output_key(net, &fl4);
1070 if (!IS_ERR(rt)) {
1071 __ip_rt_update_pmtu(rt, &fl4, mtu);
1072 ip_rt_put(rt);
1073 }
1074 }
1075 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1076
1077 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1078 {
1079 const struct iphdr *iph = (const struct iphdr *) skb->data;
1080 struct flowi4 fl4;
1081 struct rtable *rt;
1082
1083 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1084
1085 if (!fl4.flowi4_mark)
1086 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1087
1088 rt = __ip_route_output_key(sock_net(sk), &fl4);
1089 if (!IS_ERR(rt)) {
1090 __ip_rt_update_pmtu(rt, &fl4, mtu);
1091 ip_rt_put(rt);
1092 }
1093 }
1094
1095 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1096 {
1097 const struct iphdr *iph = (const struct iphdr *) skb->data;
1098 struct flowi4 fl4;
1099 struct rtable *rt;
1100 struct dst_entry *odst = NULL;
1101 bool new = false;
1102 struct net *net = sock_net(sk);
1103
1104 bh_lock_sock(sk);
1105
1106 if (!ip_sk_accept_pmtu(sk))
1107 goto out;
1108
1109 odst = sk_dst_get(sk);
1110
1111 if (sock_owned_by_user(sk) || !odst) {
1112 __ipv4_sk_update_pmtu(skb, sk, mtu);
1113 goto out;
1114 }
1115
1116 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1117
1118 rt = (struct rtable *)odst;
1119 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1120 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1121 if (IS_ERR(rt))
1122 goto out;
1123
1124 new = true;
1125 }
1126
1127 __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1128
1129 if (!dst_check(&rt->dst, 0)) {
1130 if (new)
1131 dst_release(&rt->dst);
1132
1133 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1134 if (IS_ERR(rt))
1135 goto out;
1136
1137 new = true;
1138 }
1139
1140 if (new)
1141 sk_dst_set(sk, &rt->dst);
1142
1143 out:
1144 bh_unlock_sock(sk);
1145 dst_release(odst);
1146 }
1147 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1148
1149 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1150 int oif, u32 mark, u8 protocol, int flow_flags)
1151 {
1152 const struct iphdr *iph = (const struct iphdr *) skb->data;
1153 struct flowi4 fl4;
1154 struct rtable *rt;
1155
1156 __build_flow_key(net, &fl4, NULL, iph, oif,
1157 RT_TOS(iph->tos), protocol, mark, flow_flags);
1158 rt = __ip_route_output_key(net, &fl4);
1159 if (!IS_ERR(rt)) {
1160 __ip_do_redirect(rt, skb, &fl4, false);
1161 ip_rt_put(rt);
1162 }
1163 }
1164 EXPORT_SYMBOL_GPL(ipv4_redirect);
1165
1166 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1167 {
1168 const struct iphdr *iph = (const struct iphdr *) skb->data;
1169 struct flowi4 fl4;
1170 struct rtable *rt;
1171 struct net *net = sock_net(sk);
1172
1173 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1174 rt = __ip_route_output_key(net, &fl4);
1175 if (!IS_ERR(rt)) {
1176 __ip_do_redirect(rt, skb, &fl4, false);
1177 ip_rt_put(rt);
1178 }
1179 }
1180 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1181
1182 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1183 {
1184 struct rtable *rt = (struct rtable *) dst;
1185
1186 /* All IPV4 dsts are created with ->obsolete set to the value
1187 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1188 * into this function always.
1189 *
1190 * When a PMTU/redirect information update invalidates a route,
1191 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1192 * DST_OBSOLETE_DEAD by dst_free().
1193 */
1194 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1195 return NULL;
1196 return dst;
1197 }
1198
1199 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1200 {
1201 struct ip_options opt;
1202 int res;
1203
1204 /* Recompile ip options since IPCB may not be valid anymore.
1205 * Also check we have a reasonable ipv4 header.
1206 */
1207 if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1208 ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1209 return;
1210
1211 memset(&opt, 0, sizeof(opt));
1212 if (ip_hdr(skb)->ihl > 5) {
1213 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1214 return;
1215 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1216
1217 rcu_read_lock();
1218 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1219 rcu_read_unlock();
1220
1221 if (res)
1222 return;
1223 }
1224 __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1225 }
1226
1227 static void ipv4_link_failure(struct sk_buff *skb)
1228 {
1229 struct rtable *rt;
1230
1231 ipv4_send_dest_unreach(skb);
1232
1233 rt = skb_rtable(skb);
1234 if (rt)
1235 dst_set_expires(&rt->dst, 0);
1236 }
1237
1238 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1239 {
1240 pr_debug("%s: %pI4 -> %pI4, %s\n",
1241 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1242 skb->dev ? skb->dev->name : "?");
1243 kfree_skb(skb);
1244 WARN_ON(1);
1245 return 0;
1246 }
1247
1248 /*
1249 We do not cache the source address of the outgoing interface,
1250 because it is used only by the IP RR, TS and SRR options,
1251 so it stays out of the fast path.
1252
1253 BTW remember: "addr" is allowed to be unaligned
1254 in IP options!
1255 */
1256
1257 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1258 {
1259 __be32 src;
1260
1261 if (rt_is_output_route(rt))
1262 src = ip_hdr(skb)->saddr;
1263 else {
1264 struct fib_result res;
1265 struct flowi4 fl4;
1266 struct iphdr *iph;
1267
1268 iph = ip_hdr(skb);
1269
1270 memset(&fl4, 0, sizeof(fl4));
1271 fl4.daddr = iph->daddr;
1272 fl4.saddr = iph->saddr;
1273 fl4.flowi4_tos = RT_TOS(iph->tos);
1274 fl4.flowi4_oif = rt->dst.dev->ifindex;
1275 fl4.flowi4_iif = skb->dev->ifindex;
1276 fl4.flowi4_mark = skb->mark;
1277
1278 rcu_read_lock();
1279 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1280 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1281 else
1282 src = inet_select_addr(rt->dst.dev,
1283 rt_nexthop(rt, iph->daddr),
1284 RT_SCOPE_UNIVERSE);
1285 rcu_read_unlock();
1286 }
1287 memcpy(addr, &src, 4);
1288 }
1289
1290 #ifdef CONFIG_IP_ROUTE_CLASSID
1291 static void set_class_tag(struct rtable *rt, u32 tag)
1292 {
1293 if (!(rt->dst.tclassid & 0xFFFF))
1294 rt->dst.tclassid |= tag & 0xFFFF;
1295 if (!(rt->dst.tclassid & 0xFFFF0000))
1296 rt->dst.tclassid |= tag & 0xFFFF0000;
1297 }
1298 #endif
1299
1300 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1301 {
1302 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1303 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1304 ip_rt_min_advmss);
1305
1306 return min(advmss, IPV4_MAX_PMTU - header_size);
1307 }
1308
1309 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1310 {
1311 const struct rtable *rt = (const struct rtable *) dst;
1312 unsigned int mtu = rt->rt_pmtu;
1313
1314 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1315 mtu = dst_metric_raw(dst, RTAX_MTU);
1316
1317 if (mtu)
1318 return mtu;
1319
1320 mtu = READ_ONCE(dst->dev->mtu);
1321
1322 if (unlikely(ip_mtu_locked(dst))) {
1323 if (rt->rt_uses_gateway && mtu > 576)
1324 mtu = 576;
1325 }
1326
1327 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1328
1329 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1330 }
1331
1332 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1333 {
1334 struct fnhe_hash_bucket *hash;
1335 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1336 u32 hval = fnhe_hashfun(daddr);
1337
1338 spin_lock_bh(&fnhe_lock);
1339
1340 hash = rcu_dereference_protected(nh->nh_exceptions,
1341 lockdep_is_held(&fnhe_lock));
1342 hash += hval;
1343
1344 fnhe_p = &hash->chain;
1345 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1346 while (fnhe) {
1347 if (fnhe->fnhe_daddr == daddr) {
1348 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1349 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1350 /* set fnhe_daddr to 0 to ensure it won't bind with
1351 * new dsts in rt_bind_exception().
1352 */
1353 fnhe->fnhe_daddr = 0;
1354 fnhe_flush_routes(fnhe);
1355 kfree_rcu(fnhe, rcu);
1356 break;
1357 }
1358 fnhe_p = &fnhe->fnhe_next;
1359 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1360 lockdep_is_held(&fnhe_lock));
1361 }
1362
1363 spin_unlock_bh(&fnhe_lock);
1364 }
1365
1366 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1367 {
1368 struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1369 struct fib_nh_exception *fnhe;
1370 u32 hval;
1371
1372 if (!hash)
1373 return NULL;
1374
1375 hval = fnhe_hashfun(daddr);
1376
1377 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1378 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1379 if (fnhe->fnhe_daddr == daddr) {
1380 if (fnhe->fnhe_expires &&
1381 time_after(jiffies, fnhe->fnhe_expires)) {
1382 ip_del_fnhe(nh, daddr);
1383 break;
1384 }
1385 return fnhe;
1386 }
1387 }
1388 return NULL;
1389 }
1390
1391 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1392 __be32 daddr, const bool do_cache)
1393 {
1394 bool ret = false;
1395
1396 spin_lock_bh(&fnhe_lock);
1397
1398 if (daddr == fnhe->fnhe_daddr) {
1399 struct rtable __rcu **porig;
1400 struct rtable *orig;
1401 int genid = fnhe_genid(dev_net(rt->dst.dev));
1402
1403 if (rt_is_input_route(rt))
1404 porig = &fnhe->fnhe_rth_input;
1405 else
1406 porig = &fnhe->fnhe_rth_output;
1407 orig = rcu_dereference(*porig);
1408
1409 if (fnhe->fnhe_genid != genid) {
1410 fnhe->fnhe_genid = genid;
1411 fnhe->fnhe_gw = 0;
1412 fnhe->fnhe_pmtu = 0;
1413 fnhe->fnhe_expires = 0;
1414 fnhe_flush_routes(fnhe);
1415 orig = NULL;
1416 }
1417 fill_route_from_fnhe(rt, fnhe);
1418 if (!rt->rt_gateway)
1419 rt->rt_gateway = daddr;
1420
1421 if (do_cache) {
1422 dst_hold(&rt->dst);
1423 rcu_assign_pointer(*porig, rt);
1424 if (orig) {
1425 dst_dev_put(&orig->dst);
1426 dst_release(&orig->dst);
1427 }
1428 ret = true;
1429 }
1430
1431 fnhe->fnhe_stamp = jiffies;
1432 }
1433 spin_unlock_bh(&fnhe_lock);
1434
1435 return ret;
1436 }
1437
1438 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1439 {
1440 struct rtable *orig, *prev, **p;
1441 bool ret = true;
1442
1443 if (rt_is_input_route(rt)) {
1444 p = (struct rtable **)&nh->nh_rth_input;
1445 } else {
1446 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1447 }
1448 orig = *p;
1449
1450 /* hold dst before doing cmpxchg() to avoid race condition
1451 * on this dst
1452 */
1453 dst_hold(&rt->dst);
1454 prev = cmpxchg(p, orig, rt);
1455 if (prev == orig) {
1456 if (orig) {
1457 rt_add_uncached_list(orig);
1458 dst_release(&orig->dst);
1459 }
1460 } else {
1461 dst_release(&rt->dst);
1462 ret = false;
1463 }
1464
1465 return ret;
1466 }
1467
1468 struct uncached_list {
1469 spinlock_t lock;
1470 struct list_head head;
1471 };
1472
1473 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1474
1475 void rt_add_uncached_list(struct rtable *rt)
1476 {
1477 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1478
1479 rt->rt_uncached_list = ul;
1480
1481 spin_lock_bh(&ul->lock);
1482 list_add_tail(&rt->rt_uncached, &ul->head);
1483 spin_unlock_bh(&ul->lock);
1484 }
1485
1486 void rt_del_uncached_list(struct rtable *rt)
1487 {
1488 if (!list_empty(&rt->rt_uncached)) {
1489 struct uncached_list *ul = rt->rt_uncached_list;
1490
1491 spin_lock_bh(&ul->lock);
1492 list_del(&rt->rt_uncached);
1493 spin_unlock_bh(&ul->lock);
1494 }
1495 }
1496
1497 static void ipv4_dst_destroy(struct dst_entry *dst)
1498 {
1499 struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1500 struct rtable *rt = (struct rtable *)dst;
1501
1502 if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1503 kfree(p);
1504
1505 rt_del_uncached_list(rt);
1506 }
1507
1508 void rt_flush_dev(struct net_device *dev)
1509 {
1510 struct net *net = dev_net(dev);
1511 struct rtable *rt;
1512 int cpu;
1513
1514 for_each_possible_cpu(cpu) {
1515 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1516
1517 spin_lock_bh(&ul->lock);
1518 list_for_each_entry(rt, &ul->head, rt_uncached) {
1519 if (rt->dst.dev != dev)
1520 continue;
1521 rt->dst.dev = net->loopback_dev;
1522 dev_hold(rt->dst.dev);
1523 dev_put(dev);
1524 }
1525 spin_unlock_bh(&ul->lock);
1526 }
1527 }
1528
1529 static bool rt_cache_valid(const struct rtable *rt)
1530 {
1531 return rt &&
1532 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1533 !rt_is_expired(rt);
1534 }
1535
1536 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1537 const struct fib_result *res,
1538 struct fib_nh_exception *fnhe,
1539 struct fib_info *fi, u16 type, u32 itag,
1540 const bool do_cache)
1541 {
1542 bool cached = false;
1543
1544 if (fi) {
1545 struct fib_nh *nh = &FIB_RES_NH(*res);
1546
1547 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1548 rt->rt_gateway = nh->nh_gw;
1549 rt->rt_uses_gateway = 1;
1550 }
1551 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1552 if (fi->fib_metrics != &dst_default_metrics) {
1553 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1554 refcount_inc(&fi->fib_metrics->refcnt);
1555 }
1556 #ifdef CONFIG_IP_ROUTE_CLASSID
1557 rt->dst.tclassid = nh->nh_tclassid;
1558 #endif
1559 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1560 if (unlikely(fnhe))
1561 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1562 else if (do_cache)
1563 cached = rt_cache_route(nh, rt);
1564 if (unlikely(!cached)) {
1565 /* Routes we intend to cache in nexthop exception or
1566 * FIB nexthop have the DST_NOCACHE bit clear.
1567 * However, if we are unsuccessful at storing this
1568 * route into the cache we really need to set it.
1569 */
1570 if (!rt->rt_gateway)
1571 rt->rt_gateway = daddr;
1572 rt_add_uncached_list(rt);
1573 }
1574 } else
1575 rt_add_uncached_list(rt);
1576
1577 #ifdef CONFIG_IP_ROUTE_CLASSID
1578 #ifdef CONFIG_IP_MULTIPLE_TABLES
1579 set_class_tag(rt, res->tclassid);
1580 #endif
1581 set_class_tag(rt, itag);
1582 #endif
1583 }
1584
1585 struct rtable *rt_dst_alloc(struct net_device *dev,
1586 unsigned int flags, u16 type,
1587 bool nopolicy, bool noxfrm, bool will_cache)
1588 {
1589 struct rtable *rt;
1590
1591 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1592 (will_cache ? 0 : DST_HOST) |
1593 (nopolicy ? DST_NOPOLICY : 0) |
1594 (noxfrm ? DST_NOXFRM : 0));
1595
1596 if (rt) {
1597 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1598 rt->rt_flags = flags;
1599 rt->rt_type = type;
1600 rt->rt_is_input = 0;
1601 rt->rt_iif = 0;
1602 rt->rt_pmtu = 0;
1603 rt->rt_mtu_locked = 0;
1604 rt->rt_gateway = 0;
1605 rt->rt_uses_gateway = 0;
1606 rt->rt_table_id = 0;
1607 INIT_LIST_HEAD(&rt->rt_uncached);
1608
1609 rt->dst.output = ip_output;
1610 if (flags & RTCF_LOCAL)
1611 rt->dst.input = ip_local_deliver;
1612 }
1613
1614 return rt;
1615 }
1616 EXPORT_SYMBOL(rt_dst_alloc);
1617
1618 /* called in rcu_read_lock() section */
1619 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1620 u8 tos, struct net_device *dev,
1621 struct in_device *in_dev, u32 *itag)
1622 {
1623 int err;
1624
1625 /* Primary sanity checks. */
1626 if (!in_dev)
1627 return -EINVAL;
1628
1629 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1630 skb->protocol != htons(ETH_P_IP))
1631 return -EINVAL;
1632
1633 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1634 return -EINVAL;
1635
1636 if (ipv4_is_zeronet(saddr)) {
1637 if (!ipv4_is_local_multicast(daddr))
1638 return -EINVAL;
1639 } else {
1640 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1641 in_dev, itag);
1642 if (err < 0)
1643 return err;
1644 }
1645 return 0;
1646 }
1647
1648 /* called in rcu_read_lock() section */
1649 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1650 u8 tos, struct net_device *dev, int our)
1651 {
1652 struct in_device *in_dev = __in_dev_get_rcu(dev);
1653 unsigned int flags = RTCF_MULTICAST;
1654 struct rtable *rth;
1655 u32 itag = 0;
1656 int err;
1657
1658 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1659 if (err)
1660 return err;
1661
1662 if (our)
1663 flags |= RTCF_LOCAL;
1664
1665 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1666 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1667 if (!rth)
1668 return -ENOBUFS;
1669
1670 #ifdef CONFIG_IP_ROUTE_CLASSID
1671 rth->dst.tclassid = itag;
1672 #endif
1673 rth->dst.output = ip_rt_bug;
1674 rth->rt_is_input= 1;
1675
1676 #ifdef CONFIG_IP_MROUTE
1677 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1678 rth->dst.input = ip_mr_input;
1679 #endif
1680 RT_CACHE_STAT_INC(in_slow_mc);
1681
1682 skb_dst_set(skb, &rth->dst);
1683 return 0;
1684 }
1685
1686
1687 static void ip_handle_martian_source(struct net_device *dev,
1688 struct in_device *in_dev,
1689 struct sk_buff *skb,
1690 __be32 daddr,
1691 __be32 saddr)
1692 {
1693 RT_CACHE_STAT_INC(in_martian_src);
1694 #ifdef CONFIG_IP_ROUTE_VERBOSE
1695 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1696 /*
1697 * RFC 1812 recommendation: if the source is martian,
1698 * the only hint is the MAC header.
1699 */
1700 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1701 &daddr, &saddr, dev->name);
1702 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1703 print_hex_dump(KERN_WARNING, "ll header: ",
1704 DUMP_PREFIX_OFFSET, 16, 1,
1705 skb_mac_header(skb),
1706 dev->hard_header_len, true);
1707 }
1708 }
1709 #endif
1710 }
1711
1712 static void set_lwt_redirect(struct rtable *rth)
1713 {
1714 if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1715 rth->dst.lwtstate->orig_output = rth->dst.output;
1716 rth->dst.output = lwtunnel_output;
1717 }
1718
1719 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1720 rth->dst.lwtstate->orig_input = rth->dst.input;
1721 rth->dst.input = lwtunnel_input;
1722 }
1723 }
1724
1725 /* called in rcu_read_lock() section */
1726 static int __mkroute_input(struct sk_buff *skb,
1727 const struct fib_result *res,
1728 struct in_device *in_dev,
1729 __be32 daddr, __be32 saddr, u32 tos)
1730 {
1731 struct fib_nh_exception *fnhe;
1732 struct rtable *rth;
1733 int err;
1734 struct in_device *out_dev;
1735 bool do_cache;
1736 u32 itag = 0;
1737
1738 /* get a working reference to the output device */
1739 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1740 if (!out_dev) {
1741 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1742 return -EINVAL;
1743 }
1744
1745 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1746 in_dev->dev, in_dev, &itag);
1747 if (err < 0) {
1748 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1749 saddr);
1750
1751 goto cleanup;
1752 }
1753
1754 do_cache = res->fi && !itag;
1755 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1756 skb->protocol == htons(ETH_P_IP) &&
1757 (IN_DEV_SHARED_MEDIA(out_dev) ||
1758 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1759 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1760
1761 if (skb->protocol != htons(ETH_P_IP)) {
1762 /* Not IP (i.e. ARP). Do not create a route if it is
1763 * invalid for proxy ARP. DNAT routes are always valid.
1764 *
1765 * The proxy ARP feature has been extended to allow ARP
1766 * replies back on the same interface, to support
1767 * Private VLAN switch technologies. See arp.c.
1768 */
1769 if (out_dev == in_dev &&
1770 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1771 err = -EINVAL;
1772 goto cleanup;
1773 }
1774 }
1775
1776 fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1777 if (do_cache) {
1778 if (fnhe)
1779 rth = rcu_dereference(fnhe->fnhe_rth_input);
1780 else
1781 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1782 if (rt_cache_valid(rth)) {
1783 skb_dst_set_noref(skb, &rth->dst);
1784 goto out;
1785 }
1786 }
1787
1788 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1789 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1790 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1791 if (!rth) {
1792 err = -ENOBUFS;
1793 goto cleanup;
1794 }
1795
1796 rth->rt_is_input = 1;
1797 if (res->table)
1798 rth->rt_table_id = res->table->tb_id;
1799 RT_CACHE_STAT_INC(in_slow_tot);
1800
1801 rth->dst.input = ip_forward;
1802
1803 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1804 do_cache);
1805 set_lwt_redirect(rth);
1806 skb_dst_set(skb, &rth->dst);
1807 out:
1808 err = 0;
1809 cleanup:
1810 return err;
1811 }
1812
1813 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1814 /* To make ICMP packets follow the right flow, the multipath hash is
1815 * calculated from the inner IP addresses.
1816 */
1817 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1818 struct flow_keys *hash_keys)
1819 {
1820 const struct iphdr *outer_iph = ip_hdr(skb);
1821 const struct iphdr *inner_iph;
1822 const struct icmphdr *icmph;
1823 struct iphdr _inner_iph;
1824 struct icmphdr _icmph;
1825
1826 hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1827 hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1828 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1829 return;
1830
1831 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1832 return;
1833
1834 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1835 &_icmph);
1836 if (!icmph)
1837 return;
1838
1839 if (icmph->type != ICMP_DEST_UNREACH &&
1840 icmph->type != ICMP_REDIRECT &&
1841 icmph->type != ICMP_TIME_EXCEEDED &&
1842 icmph->type != ICMP_PARAMETERPROB)
1843 return;
1844
1845 inner_iph = skb_header_pointer(skb,
1846 outer_iph->ihl * 4 + sizeof(_icmph),
1847 sizeof(_inner_iph), &_inner_iph);
1848 if (!inner_iph)
1849 return;
1850 hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1851 hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1852 }
1853
1854 /* if skb is set it will be used and fl4 can be NULL */
1855 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1856 const struct sk_buff *skb)
1857 {
1858 struct net *net = fi->fib_net;
1859 struct flow_keys hash_keys;
1860 u32 mhash;
1861
1862 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1863 case 0:
1864 memset(&hash_keys, 0, sizeof(hash_keys));
1865 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1866 if (skb) {
1867 ip_multipath_l3_keys(skb, &hash_keys);
1868 } else {
1869 hash_keys.addrs.v4addrs.src = fl4->saddr;
1870 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1871 }
1872 break;
1873 case 1:
1874 /* skb is currently provided only when forwarding */
1875 if (skb) {
1876 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1877 struct flow_keys keys;
1878
1879 /* short-circuit if we already have L4 hash present */
1880 if (skb->l4_hash)
1881 return skb_get_hash_raw(skb) >> 1;
1882 memset(&hash_keys, 0, sizeof(hash_keys));
1883 skb_flow_dissect_flow_keys(skb, &keys, flag);
1884
1885 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1886 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1887 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1888 hash_keys.ports.src = keys.ports.src;
1889 hash_keys.ports.dst = keys.ports.dst;
1890 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1891 } else {
1892 memset(&hash_keys, 0, sizeof(hash_keys));
1893 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1894 hash_keys.addrs.v4addrs.src = fl4->saddr;
1895 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1896 hash_keys.ports.src = fl4->fl4_sport;
1897 hash_keys.ports.dst = fl4->fl4_dport;
1898 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1899 }
1900 break;
1901 }
1902 mhash = flow_hash_from_keys(&hash_keys);
1903
1904 return mhash >> 1;
1905 }
1906 EXPORT_SYMBOL_GPL(fib_multipath_hash);
1907 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
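
Which branch of the switch above is taken is controlled per network namespace by the net.ipv4.fib_multipath_hash_policy sysctl (0 selects the L3 source/destination hash, 1 the L4 five-tuple hash). A minimal sketch that flips it through procfs; it assumes a kernel built with CONFIG_IP_ROUTE_MULTIPATH and needs root:

#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/net/ipv4/fib_multipath_hash_policy";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fputs("1\n", f);	/* 1 = hash on the L4 five-tuple */
	fclose(f);
	return 0;
}
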
1908
1909 static int ip_mkroute_input(struct sk_buff *skb,
1910 struct fib_result *res,
1911 struct in_device *in_dev,
1912 __be32 daddr, __be32 saddr, u32 tos)
1913 {
1914 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1915 if (res->fi && res->fi->fib_nhs > 1) {
1916 int h = fib_multipath_hash(res->fi, NULL, skb);
1917
1918 fib_select_multipath(res, h);
1919 }
1920 #endif
1921
1922 /* create a routing cache entry */
1923 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1924 }
1925
1926 /*
1927 * NOTE. We drop all packets that have local source
1928 * addresses, because every properly looped-back packet
1929 * must already have the correct destination attached by the output routine.
1930 *
1931 * This approach solves two big problems:
1932 * 1. Non-simplex devices are handled properly.
1933 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1934 * Called with rcu_read_lock().
1935 */
1936
1937 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1938 u8 tos, struct net_device *dev,
1939 struct fib_result *res)
1940 {
1941 struct in_device *in_dev = __in_dev_get_rcu(dev);
1942 struct ip_tunnel_info *tun_info;
1943 struct flowi4 fl4;
1944 unsigned int flags = 0;
1945 u32 itag = 0;
1946 struct rtable *rth;
1947 int err = -EINVAL;
1948 struct net *net = dev_net(dev);
1949 bool do_cache;
1950
1951 /* IP on this device is disabled. */
1952
1953 if (!in_dev)
1954 goto out;
1955
1956 /* Check for the most weird martians, which cannot be detected
1957 by fib_lookup.
1958 */
1959
1960 tun_info = skb_tunnel_info(skb);
1961 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1962 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1963 else
1964 fl4.flowi4_tun_key.tun_id = 0;
1965 skb_dst_drop(skb);
1966
1967 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1968 goto martian_source;
1969
1970 res->fi = NULL;
1971 res->table = NULL;
1972 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1973 goto brd_input;
1974
1975 /* Accept zero addresses only toward the limited broadcast address;
1976 * I do not even know whether to fix it or not. Waiting for complaints :-)
1977 */
1978 if (ipv4_is_zeronet(saddr))
1979 goto martian_source;
1980
1981 if (ipv4_is_zeronet(daddr))
1982 goto martian_destination;
1983
1984 /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1985 * and calls it at most once when daddr and/or saddr is a loopback address.
1986 */
1987 if (ipv4_is_loopback(daddr)) {
1988 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1989 goto martian_destination;
1990 } else if (ipv4_is_loopback(saddr)) {
1991 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1992 goto martian_source;
1993 }
1994
1995 /*
1996 * Now we are ready to route packet.
1997 */
1998 fl4.flowi4_oif = 0;
1999 fl4.flowi4_iif = dev->ifindex;
2000 fl4.flowi4_mark = skb->mark;
2001 fl4.flowi4_tos = tos;
2002 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2003 fl4.flowi4_flags = 0;
2004 fl4.daddr = daddr;
2005 fl4.saddr = saddr;
2006 fl4.flowi4_uid = sock_net_uid(net, NULL);
2007 err = fib_lookup(net, &fl4, res, 0);
2008 if (err != 0) {
2009 if (!IN_DEV_FORWARD(in_dev))
2010 err = -EHOSTUNREACH;
2011 goto no_route;
2012 }
2013
2014 if (res->type == RTN_BROADCAST)
2015 goto brd_input;
2016
2017 if (res->type == RTN_LOCAL) {
2018 err = fib_validate_source(skb, saddr, daddr, tos,
2019 0, dev, in_dev, &itag);
2020 if (err < 0)
2021 goto martian_source;
2022 goto local_input;
2023 }
2024
2025 if (!IN_DEV_FORWARD(in_dev)) {
2026 err = -EHOSTUNREACH;
2027 goto no_route;
2028 }
2029 if (res->type != RTN_UNICAST)
2030 goto martian_destination;
2031
2032 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2033 out: return err;
2034
2035 brd_input:
2036 if (skb->protocol != htons(ETH_P_IP))
2037 goto e_inval;
2038
2039 if (!ipv4_is_zeronet(saddr)) {
2040 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2041 in_dev, &itag);
2042 if (err < 0)
2043 goto martian_source;
2044 }
2045 flags |= RTCF_BROADCAST;
2046 res->type = RTN_BROADCAST;
2047 RT_CACHE_STAT_INC(in_brd);
2048
2049 local_input:
2050 do_cache = false;
2051 if (res->fi) {
2052 if (!itag) {
2053 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2054 if (rt_cache_valid(rth)) {
2055 skb_dst_set_noref(skb, &rth->dst);
2056 err = 0;
2057 goto out;
2058 }
2059 do_cache = true;
2060 }
2061 }
2062
2063 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2064 flags | RTCF_LOCAL, res->type,
2065 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2066 if (!rth)
2067 goto e_nobufs;
2068
2069 rth->dst.output = ip_rt_bug;
2070 #ifdef CONFIG_IP_ROUTE_CLASSID
2071 rth->dst.tclassid = itag;
2072 #endif
2073 rth->rt_is_input = 1;
2074 if (res->table)
2075 rth->rt_table_id = res->table->tb_id;
2076
2077 RT_CACHE_STAT_INC(in_slow_tot);
2078 if (res->type == RTN_UNREACHABLE) {
2079 rth->dst.input = ip_error;
2080 rth->dst.error = -err;
2081 rth->rt_flags &= ~RTCF_LOCAL;
2082 }
2083
2084 if (do_cache) {
2085 struct fib_nh *nh = &FIB_RES_NH(*res);
2086
2087 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2088 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2089 WARN_ON(rth->dst.input == lwtunnel_input);
2090 rth->dst.lwtstate->orig_input = rth->dst.input;
2091 rth->dst.input = lwtunnel_input;
2092 }
2093
2094 if (unlikely(!rt_cache_route(nh, rth)))
2095 rt_add_uncached_list(rth);
2096 }
2097 skb_dst_set(skb, &rth->dst);
2098 err = 0;
2099 goto out;
2100
2101 no_route:
2102 RT_CACHE_STAT_INC(in_no_route);
2103 res->type = RTN_UNREACHABLE;
2104 res->fi = NULL;
2105 res->table = NULL;
2106 goto local_input;
2107
2108 /*
2109 * Do not cache martian addresses: they should be logged (RFC1812)
2110 */
2111 martian_destination:
2112 RT_CACHE_STAT_INC(in_martian_dst);
2113 #ifdef CONFIG_IP_ROUTE_VERBOSE
2114 if (IN_DEV_LOG_MARTIANS(in_dev))
2115 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2116 &daddr, &saddr, dev->name);
2117 #endif
2118
2119 e_inval:
2120 err = -EINVAL;
2121 goto out;
2122
2123 e_nobufs:
2124 err = -ENOBUFS;
2125 goto out;
2126
2127 martian_source:
2128 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2129 goto out;
2130 }
2131
2132 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2133 u8 tos, struct net_device *dev)
2134 {
2135 struct fib_result res;
2136 int err;
2137
2138 tos &= IPTOS_RT_MASK;
2139 rcu_read_lock();
2140 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2141 rcu_read_unlock();
2142
2143 return err;
2144 }
2145 EXPORT_SYMBOL(ip_route_input_noref);
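/*
 * Editor's note: an illustrative sketch, not part of the original route.c.
 * This is the shape of a typical ip_route_input_noref() caller (the IPv4
 * receive path does essentially this): resolve an input route for a freshly
 * received skb, then hand the skb to whatever dst.input handler the lookup
 * attached (ip_local_deliver, ip_forward, ip_error, ...). The function name
 * is an assumption for the example.
 */
static int example_resolve_input_route(struct sk_buff *skb,
				       struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err;

	err = ip_route_input_noref(skb, iph->daddr, iph->saddr, iph->tos, dev);
	if (err)
		return err;		/* e.g. -EHOSTUNREACH or -EINVAL */

	return dst_input(skb);		/* invokes skb_dst(skb)->input() */
}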
2146
2147 /* called with rcu_read_lock held */
2148 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2149 u8 tos, struct net_device *dev, struct fib_result *res)
2150 {
2151 /* Multicast recognition logic was moved from the route cache to here.
2152 The problem was that too many Ethernet cards have broken/missing
2153 hardware multicast filters :-( As a result, a host on a multicast
2154 network acquires a lot of useless route cache entries, e.g. from
2155 SDR messages from all over the world. Now we try to get rid of them.
2156 Really, provided the software IP multicast filter is organized
2157 reasonably (at least, hashed), it does not result in a slowdown
2158 compared with route cache reject entries.
2159 Note that multicast routers are not affected, because a
2160 route cache entry is created eventually.
2161 */
2162 if (ipv4_is_multicast(daddr)) {
2163 struct in_device *in_dev = __in_dev_get_rcu(dev);
2164 int our = 0;
2165 int err = -EINVAL;
2166
2167 if (!in_dev)
2168 return err;
2169 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2170 ip_hdr(skb)->protocol);
2171
2172 /* check l3 master if no match yet */
2173 if (!our && netif_is_l3_slave(dev)) {
2174 struct in_device *l3_in_dev;
2175
2176 l3_in_dev = __in_dev_get_rcu(skb->dev);
2177 if (l3_in_dev)
2178 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2179 ip_hdr(skb)->protocol);
2180 }
2181
2182 if (our
2183 #ifdef CONFIG_IP_MROUTE
2184 ||
2185 (!ipv4_is_local_multicast(daddr) &&
2186 IN_DEV_MFORWARD(in_dev))
2187 #endif
2188 ) {
2189 err = ip_route_input_mc(skb, daddr, saddr,
2190 tos, dev, our);
2191 }
2192 return err;
2193 }
2194
2195 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2196 }
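/*
 * Editor's note: an illustrative sketch, not part of the original route.c.
 * The multicast branch above boils down to one predicate: deliver the packet
 * locally when the group is joined on the receiving device (or its l3
 * master), or, with CONFIG_IP_MROUTE, hand it to the multicast forwarding
 * engine when mc_forwarding is enabled and the group is not link-local
 * (224.0.0.0/24). The helper name below is an assumption for the example.
 */
static inline bool toy_accept_multicast(bool joined_on_dev,
					bool mforward_enabled,
					bool link_local_group)
{
	return joined_on_dev || (mforward_enabled && !link_local_group);
}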
2197
2198 /* called with rcu_read_lock() */
2199 static struct rtable *__mkroute_output(const struct fib_result *res,
2200 const struct flowi4 *fl4, int orig_oif,
2201 struct net_device *dev_out,
2202 unsigned int flags)
2203 {
2204 struct fib_info *fi = res->fi;
2205 struct fib_nh_exception *fnhe;
2206 struct in_device *in_dev;
2207 u16 type = res->type;
2208 struct rtable *rth;
2209 bool do_cache;
2210
2211 in_dev = __in_dev_get_rcu(dev_out);
2212 if (!in_dev)
2213 return ERR_PTR(-EINVAL);
2214
2215 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2216 if (ipv4_is_loopback(fl4->saddr) &&
2217 !(dev_out->flags & IFF_LOOPBACK) &&
2218 !netif_is_l3_master(dev_out))
2219 return ERR_PTR(-EINVAL);
2220
2221 if (ipv4_is_lbcast(fl4->daddr))
2222 type = RTN_BROADCAST;
2223 else if (ipv4_is_multicast(fl4->daddr))
2224 type = RTN_MULTICAST;
2225 else if (ipv4_is_zeronet(fl4->daddr))
2226 return ERR_PTR(-EINVAL);
2227
2228 if (dev_out->flags & IFF_LOOPBACK)
2229 flags |= RTCF_LOCAL;
2230
2231 do_cache = true;
2232 if (type == RTN_BROADCAST) {
2233 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2234 fi = NULL;
2235 } else if (type == RTN_MULTICAST) {
2236 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2237 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2238 fl4->flowi4_proto))
2239 flags &= ~RTCF_LOCAL;
2240 else
2241 do_cache = false;
2242 /* If no multicast route exists, use the
2243 * default one, but do not use a gateway in this case.
2244 * Yes, it is a hack.
2245 */
2246 if (fi && res->prefixlen < 4)
2247 fi = NULL;
2248 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2249 (orig_oif != dev_out->ifindex)) {
2250 /* For local routes that require a particular output interface
2251 * we do not want to cache the result. Caching the result
2252 * causes incorrect behaviour when there are multiple source
2253 * addresses on the interface: if the intended recipient is
2254 * waiting on that interface for the packet, it will not receive
2255 * it, because the packet will be delivered on the loopback
2256 * interface and the IP_PKTINFO ipi_ifindex will be set to
2257 * the loopback interface as well.
2258 */
2259 do_cache = false;
2260 }
2261
2262 fnhe = NULL;
2263 do_cache &= fi != NULL;
2264 if (fi) {
2265 struct rtable __rcu **prth;
2266 struct fib_nh *nh = &FIB_RES_NH(*res);
2267
2268 fnhe = find_exception(nh, fl4->daddr);
2269 if (!do_cache)
2270 goto add;
2271 if (fnhe) {
2272 prth = &fnhe->fnhe_rth_output;
2273 } else {
2274 if (unlikely(fl4->flowi4_flags &
2275 FLOWI_FLAG_KNOWN_NH &&
2276 !(nh->nh_gw &&
2277 nh->nh_scope == RT_SCOPE_LINK))) {
2278 do_cache = false;
2279 goto add;
2280 }
2281 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2282 }
2283 rth = rcu_dereference(*prth);
2284 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2285 return rth;
2286 }
2287
2288 add:
2289 rth = rt_dst_alloc(dev_out, flags, type,
2290 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2291 IN_DEV_CONF_GET(in_dev, NOXFRM),
2292 do_cache);
2293 if (!rth)
2294 return ERR_PTR(-ENOBUFS);
2295
2296 rth->rt_iif = orig_oif;
2297 if (res->table)
2298 rth->rt_table_id = res->table->tb_id;
2299
2300 RT_CACHE_STAT_INC(out_slow_tot);
2301
2302 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2303 if (flags & RTCF_LOCAL &&
2304 !(dev_out->flags & IFF_LOOPBACK)) {
2305 rth->dst.output = ip_mc_output;
2306 RT_CACHE_STAT_INC(out_slow_mc);
2307 }
2308 #ifdef CONFIG_IP_MROUTE
2309 if (type == RTN_MULTICAST) {
2310 if (IN_DEV_MFORWARD(in_dev) &&
2311 !ipv4_is_local_multicast(fl4->daddr)) {
2312 rth->dst.input = ip_mr_input;
2313 rth->dst.output = ip_mc_output;
2314 }
2315 }
2316 #endif
2317 }
2318
2319 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2320 set_lwt_redirect(rth);
2321
2322 return rth;
2323 }
2324
2325 /*
2326 * Major route resolver routine.
2327 */
2328
2329 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2330 const struct sk_buff *skb)
2331 {
2332 __u8 tos = RT_FL_TOS(fl4);
2333 struct fib_result res = {
2334 .type = RTN_UNSPEC,
2335 .fi = NULL,
2336 .table = NULL,
2337 .tclassid = 0,
2338 };
2339 struct rtable *rth;
2340
2341 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2342 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2343 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2344 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2345
2346 rcu_read_lock();
2347 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2348 rcu_read_unlock();
2349
2350 return rth;
2351 }
2352 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2353
2354 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2355 struct fib_result *res,
2356 const struct sk_buff *skb)
2357 {
2358 struct net_device *dev_out = NULL;
2359 int orig_oif = fl4->flowi4_oif;
2360 unsigned int flags = 0;
2361 struct rtable *rth;
2362 int err;
2363
2364 if (fl4->saddr) {
2365 if (ipv4_is_multicast(fl4->saddr) ||
2366 ipv4_is_lbcast(fl4->saddr) ||
2367 ipv4_is_zeronet(fl4->saddr)) {
2368 rth = ERR_PTR(-EINVAL);
2369 goto out;
2370 }
2371
2372 rth = ERR_PTR(-ENETUNREACH);
2373
2374 /* I removed the check for oif == dev_out->oif here.
2375 It was wrong for two reasons:
2376 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2377 is assigned to multiple interfaces.
2378 2. Moreover, we are allowed to send packets with the saddr
2379 of another iface. --ANK
2380 */
2381
2382 if (fl4->flowi4_oif == 0 &&
2383 (ipv4_is_multicast(fl4->daddr) ||
2384 ipv4_is_lbcast(fl4->daddr))) {
2385 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2386 dev_out = __ip_dev_find(net, fl4->saddr, false);
2387 if (!dev_out)
2388 goto out;
2389
2390 /* Special hack: the user can direct multicasts
2391 and limited broadcast via the necessary interface
2392 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2393 This hack is not just for fun, it allows
2394 vic, vat and friends to work.
2395 They bind the socket to loopback, set the ttl to zero
2396 and expect that it will work.
2397 From the viewpoint of the routing cache they are broken,
2398 because we are not allowed to build a multicast path
2399 with a loopback source addr (the routing cache
2400 cannot know that the ttl is zero, so that the packet
2401 will not leave this host and the route is valid).
2402 Luckily, this hack is a good workaround.
2403 */
2404
2405 fl4->flowi4_oif = dev_out->ifindex;
2406 goto make_route;
2407 }
2408
2409 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2410 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2411 if (!__ip_dev_find(net, fl4->saddr, false))
2412 goto out;
2413 }
2414 }
2415
2416
2417 if (fl4->flowi4_oif) {
2418 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2419 rth = ERR_PTR(-ENODEV);
2420 if (!dev_out)
2421 goto out;
2422
2423 /* RACE: Check return value of inet_select_addr instead. */
2424 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2425 rth = ERR_PTR(-ENETUNREACH);
2426 goto out;
2427 }
2428 if (ipv4_is_local_multicast(fl4->daddr) ||
2429 ipv4_is_lbcast(fl4->daddr) ||
2430 fl4->flowi4_proto == IPPROTO_IGMP) {
2431 if (!fl4->saddr)
2432 fl4->saddr = inet_select_addr(dev_out, 0,
2433 RT_SCOPE_LINK);
2434 goto make_route;
2435 }
2436 if (!fl4->saddr) {
2437 if (ipv4_is_multicast(fl4->daddr))
2438 fl4->saddr = inet_select_addr(dev_out, 0,
2439 fl4->flowi4_scope);
2440 else if (!fl4->daddr)
2441 fl4->saddr = inet_select_addr(dev_out, 0,
2442 RT_SCOPE_HOST);
2443 }
2444 }
2445
2446 if (!fl4->daddr) {
2447 fl4->daddr = fl4->saddr;
2448 if (!fl4->daddr)
2449 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2450 dev_out = net->loopback_dev;
2451 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2452 res->type = RTN_LOCAL;
2453 flags |= RTCF_LOCAL;
2454 goto make_route;
2455 }
2456
2457 err = fib_lookup(net, fl4, res, 0);
2458 if (err) {
2459 res->fi = NULL;
2460 res->table = NULL;
2461 if (fl4->flowi4_oif &&
2462 (ipv4_is_multicast(fl4->daddr) ||
2463 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2464 /* Apparently, the routing tables are wrong. Assume
2465 that the destination is on-link.
2466
2467 WHY? DW.
2468 Because we are allowed to send to an iface
2469 even if it has NO routes and NO assigned
2470 addresses. When oif is specified, the routing
2471 tables are looked up with only one purpose:
2472 to check whether the destination is gatewayed, rather than
2473 direct. Moreover, if MSG_DONTROUTE is set,
2474 we send the packet, ignoring both the routing tables
2475 and the ifaddr state. --ANK
2476
2477
2478 We could do this even when oif is unknown,
2479 as IPv6 likely does, but we do not.
2480 */
2481
2482 if (fl4->saddr == 0)
2483 fl4->saddr = inet_select_addr(dev_out, 0,
2484 RT_SCOPE_LINK);
2485 res->type = RTN_UNICAST;
2486 goto make_route;
2487 }
2488 rth = ERR_PTR(err);
2489 goto out;
2490 }
2491
2492 if (res->type == RTN_LOCAL) {
2493 if (!fl4->saddr) {
2494 if (res->fi->fib_prefsrc)
2495 fl4->saddr = res->fi->fib_prefsrc;
2496 else
2497 fl4->saddr = fl4->daddr;
2498 }
2499
2500 /* L3 master device is the loopback for that domain */
2501 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2502 net->loopback_dev;
2503
2504 /* make sure orig_oif points to fib result device even
2505 * though packet rx/tx happens over loopback or l3mdev
2506 */
2507 orig_oif = FIB_RES_OIF(*res);
2508
2509 fl4->flowi4_oif = dev_out->ifindex;
2510 flags |= RTCF_LOCAL;
2511 goto make_route;
2512 }
2513
2514 fib_select_path(net, res, fl4, skb);
2515
2516 dev_out = FIB_RES_DEV(*res);
2517 fl4->flowi4_oif = dev_out->ifindex;
2518
2519
2520 make_route:
2521 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2522
2523 out:
2524 return rth;
2525 }
2526
2527 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2528 {
2529 return NULL;
2530 }
2531
2532 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2533 {
2534 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2535
2536 return mtu ? : dst->dev->mtu;
2537 }
2538
2539 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2540 struct sk_buff *skb, u32 mtu,
2541 bool confirm_neigh)
2542 {
2543 }
2544
2545 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2546 struct sk_buff *skb)
2547 {
2548 }
2549
2550 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2551 unsigned long old)
2552 {
2553 return NULL;
2554 }
2555
2556 static struct dst_ops ipv4_dst_blackhole_ops = {
2557 .family = AF_INET,
2558 .check = ipv4_blackhole_dst_check,
2559 .mtu = ipv4_blackhole_mtu,
2560 .default_advmss = ipv4_default_advmss,
2561 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2562 .redirect = ipv4_rt_blackhole_redirect,
2563 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2564 .neigh_lookup = ipv4_neigh_lookup,
2565 };
2566
2567 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2568 {
2569 struct rtable *ort = (struct rtable *) dst_orig;
2570 struct rtable *rt;
2571
2572 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2573 if (rt) {
2574 struct dst_entry *new = &rt->dst;
2575
2576 new->__use = 1;
2577 new->input = dst_discard;
2578 new->output = dst_discard_out;
2579
2580 new->dev = net->loopback_dev;
2581 if (new->dev)
2582 dev_hold(new->dev);
2583
2584 rt->rt_is_input = ort->rt_is_input;
2585 rt->rt_iif = ort->rt_iif;
2586 rt->rt_pmtu = ort->rt_pmtu;
2587 rt->rt_mtu_locked = ort->rt_mtu_locked;
2588
2589 rt->rt_genid = rt_genid_ipv4(net);
2590 rt->rt_flags = ort->rt_flags;
2591 rt->rt_type = ort->rt_type;
2592 rt->rt_gateway = ort->rt_gateway;
2593 rt->rt_uses_gateway = ort->rt_uses_gateway;
2594
2595 INIT_LIST_HEAD(&rt->rt_uncached);
2596 }
2597
2598 dst_release(dst_orig);
2599
2600 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2601 }
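/*
 * Editor's note (descriptive comment, not part of the original route.c):
 * ipv4_blackhole_route() clones the routing parameters of dst_orig into a
 * dst whose input/output handlers are dst_discard/dst_discard_out and whose
 * dst_ops are all no-ops, so the caller keeps a valid-looking route that
 * silently drops traffic; the xfrm code, for instance, uses such blackhole
 * routes while a flow must be held back until its state is resolved.
 */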
2602
2603 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2604 const struct sock *sk)
2605 {
2606 struct rtable *rt = __ip_route_output_key(net, flp4);
2607
2608 if (IS_ERR(rt))
2609 return rt;
2610
2611 if (flp4->flowi4_proto)
2612 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2613 flowi4_to_flowi(flp4),
2614 sk, 0);
2615
2616 return rt;
2617 }
2618 EXPORT_SYMBOL_GPL(ip_route_output_flow);
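/*
 * Editor's note: an illustrative sketch, not part of the original route.c.
 * A typical output lookup fills a struct flowi4 (much as inet_rtm_getroute()
 * below does for RTM_GETROUTE requests), resolves it with
 * ip_route_output_flow(), and drops the reference with ip_rt_put() when done.
 * The function name is an assumption for the example.
 */
static int example_output_lookup(struct net *net, __be32 daddr, __be32 saddr)
{
	struct rtable *rt;
	struct flowi4 fl4;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	fl4.flowi4_proto = IPPROTO_UDP;

	rt = ip_route_output_flow(net, &fl4, NULL);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	/* ... transmit via rt->dst ... */

	ip_rt_put(rt);
	return 0;
}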
2619
2620 /* called with rcu_read_lock held */
2621 static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
2622 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2623 u32 seq)
2624 {
2625 struct rtable *rt = skb_rtable(skb);
2626 struct rtmsg *r;
2627 struct nlmsghdr *nlh;
2628 unsigned long expires = 0;
2629 u32 error;
2630 u32 metrics[RTAX_MAX];
2631
2632 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2633 if (!nlh)
2634 return -EMSGSIZE;
2635
2636 r = nlmsg_data(nlh);
2637 r->rtm_family = AF_INET;
2638 r->rtm_dst_len = 32;
2639 r->rtm_src_len = 0;
2640 r->rtm_tos = fl4->flowi4_tos;
2641 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2642 if (nla_put_u32(skb, RTA_TABLE, table_id))
2643 goto nla_put_failure;
2644 r->rtm_type = rt->rt_type;
2645 r->rtm_scope = RT_SCOPE_UNIVERSE;
2646 r->rtm_protocol = RTPROT_UNSPEC;
2647 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2648 if (rt->rt_flags & RTCF_NOTIFY)
2649 r->rtm_flags |= RTM_F_NOTIFY;
2650 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2651 r->rtm_flags |= RTCF_DOREDIRECT;
2652
2653 if (nla_put_in_addr(skb, RTA_DST, dst))
2654 goto nla_put_failure;
2655 if (src) {
2656 r->rtm_src_len = 32;
2657 if (nla_put_in_addr(skb, RTA_SRC, src))
2658 goto nla_put_failure;
2659 }
2660 if (rt->dst.dev &&
2661 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2662 goto nla_put_failure;
2663 #ifdef CONFIG_IP_ROUTE_CLASSID
2664 if (rt->dst.tclassid &&
2665 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2666 goto nla_put_failure;
2667 #endif
2668 if (!rt_is_input_route(rt) &&
2669 fl4->saddr != src) {
2670 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2671 goto nla_put_failure;
2672 }
2673 if (rt->rt_uses_gateway &&
2674 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2675 goto nla_put_failure;
2676
2677 expires = rt->dst.expires;
2678 if (expires) {
2679 unsigned long now = jiffies;
2680
2681 if (time_before(now, expires))
2682 expires -= now;
2683 else
2684 expires = 0;
2685 }
2686
2687 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2688 if (rt->rt_pmtu && expires)
2689 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2690 if (rt->rt_mtu_locked && expires)
2691 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2692 if (rtnetlink_put_metrics(skb, metrics) < 0)
2693 goto nla_put_failure;
2694
2695 if (fl4->flowi4_mark &&
2696 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2697 goto nla_put_failure;
2698
2699 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2700 nla_put_u32(skb, RTA_UID,
2701 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2702 goto nla_put_failure;
2703
2704 error = rt->dst.error;
2705
2706 if (rt_is_input_route(rt)) {
2707 #ifdef CONFIG_IP_MROUTE
2708 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2709 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2710 int err = ipmr_get_route(net, skb,
2711 fl4->saddr, fl4->daddr,
2712 r, portid);
2713
2714 if (err <= 0) {
2715 if (err == 0)
2716 return 0;
2717 goto nla_put_failure;
2718 }
2719 } else
2720 #endif
2721 if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2722 goto nla_put_failure;
2723 }
2724
2725 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2726 goto nla_put_failure;
2727
2728 nlmsg_end(skb, nlh);
2729 return 0;
2730
2731 nla_put_failure:
2732 nlmsg_cancel(skb, nlh);
2733 return -EMSGSIZE;
2734 }
2735
2736 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2737 struct netlink_ext_ack *extack)
2738 {
2739 struct net *net = sock_net(in_skb->sk);
2740 struct rtmsg *rtm;
2741 struct nlattr *tb[RTA_MAX+1];
2742 struct fib_result res = {};
2743 struct rtable *rt = NULL;
2744 struct flowi4 fl4;
2745 __be32 dst = 0;
2746 __be32 src = 0;
2747 u32 iif;
2748 int err;
2749 int mark;
2750 struct sk_buff *skb;
2751 u32 table_id = RT_TABLE_MAIN;
2752 kuid_t uid;
2753
2754 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2755 extack);
2756 if (err < 0)
2757 goto errout;
2758
2759 rtm = nlmsg_data(nlh);
2760
2761 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2762 if (!skb) {
2763 err = -ENOBUFS;
2764 goto errout;
2765 }
2766
2767 /* Reserve room for dummy headers; this skb can pass
2768 through a good chunk of the routing engine.
2769 */
2770 skb_reset_mac_header(skb);
2771 skb_reset_network_header(skb);
2772
2773 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2774 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2775 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2776 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2777 if (tb[RTA_UID])
2778 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2779 else
2780 uid = (iif ? INVALID_UID : current_uid());
2781
2782 /* Bugfix: need to give ip_route_input enough of an IP header to
2783 * not gag.
2784 */
2785 ip_hdr(skb)->protocol = IPPROTO_UDP;
2786 ip_hdr(skb)->saddr = src;
2787 ip_hdr(skb)->daddr = dst;
2788
2789 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2790
2791 memset(&fl4, 0, sizeof(fl4));
2792 fl4.daddr = dst;
2793 fl4.saddr = src;
2794 fl4.flowi4_tos = rtm->rtm_tos;
2795 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2796 fl4.flowi4_mark = mark;
2797 fl4.flowi4_uid = uid;
2798
2799 rcu_read_lock();
2800
2801 if (iif) {
2802 struct net_device *dev;
2803
2804 dev = dev_get_by_index_rcu(net, iif);
2805 if (!dev) {
2806 err = -ENODEV;
2807 goto errout_free;
2808 }
2809
2810 skb->protocol = htons(ETH_P_IP);
2811 skb->dev = dev;
2812 skb->mark = mark;
2813 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2814 dev, &res);
2815
2816 rt = skb_rtable(skb);
2817 if (err == 0 && rt->dst.error)
2818 err = -rt->dst.error;
2819 } else {
2820 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2821 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2822 err = 0;
2823 if (IS_ERR(rt))
2824 err = PTR_ERR(rt);
2825 else
2826 skb_dst_set(skb, &rt->dst);
2827 }
2828
2829 if (err)
2830 goto errout_free;
2831
2832 if (rtm->rtm_flags & RTM_F_NOTIFY)
2833 rt->rt_flags |= RTCF_NOTIFY;
2834
2835 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2836 table_id = rt->rt_table_id;
2837
2838 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2839 if (!res.fi) {
2840 err = fib_props[res.type].error;
2841 if (!err)
2842 err = -EHOSTUNREACH;
2843 goto errout_free;
2844 }
2845 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2846 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2847 rt->rt_type, res.prefix, res.prefixlen,
2848 fl4.flowi4_tos, res.fi, 0);
2849 } else {
2850 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2851 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2852 }
2853 if (err < 0)
2854 goto errout_free;
2855
2856 rcu_read_unlock();
2857
2858 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2859 errout:
2860 return err;
2861
2862 errout_free:
2863 rcu_read_unlock();
2864 kfree_skb(skb);
2865 goto errout;
2866 }
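/*
 * Editor's note: an illustrative user-space sketch, not part of the original
 * route.c. inet_rtm_getroute() above is what answers an RTM_GETROUTE request
 * (this is how "ip route get" works). A minimal query for 8.8.8.8 might look
 * like the program below; error handling is omitted and all names are
 * assumptions for the example. It is wrapped in #if 0 because it is
 * user-space code, not kernel code.
 */
#if 0
#include <arpa/inet.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg rtm;
		char attrs[64];
	} req;
	struct rtattr *rta;
	struct in_addr dst;
	char reply[4096];
	int fd;

	inet_pton(AF_INET, "8.8.8.8", &dst);

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family = AF_INET;
	req.rtm.rtm_dst_len = 32;

	/* append an RTA_DST attribute carrying the destination address */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len = RTA_LENGTH(sizeof(dst));
	memcpy(RTA_DATA(rta), &dst, sizeof(dst));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_LENGTH(sizeof(dst));

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	send(fd, &req, req.nlh.nlmsg_len, 0);
	/* the RTM_NEWROUTE reply is the message built by rt_fill_info() */
	recv(fd, reply, sizeof(reply), 0);
	close(fd);
	return 0;
}
#endif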
2867
2868 void ip_rt_multicast_event(struct in_device *in_dev)
2869 {
2870 rt_cache_flush(dev_net(in_dev->dev));
2871 }
2872
2873 #ifdef CONFIG_SYSCTL
2874 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
2875 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
2876 static int ip_rt_gc_elasticity __read_mostly = 8;
2877
2878 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2879 void __user *buffer,
2880 size_t *lenp, loff_t *ppos)
2881 {
2882 struct net *net = (struct net *)__ctl->extra1;
2883
2884 if (write) {
2885 rt_cache_flush(net);
2886 fnhe_genid_bump(net);
2887 return 0;
2888 }
2889
2890 return -EINVAL;
2891 }
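/*
 * Editor's note: an illustrative user-space sketch, not part of the original
 * route.c. The handler above serves the write-only (mode 0200) "flush" entry
 * registered further below, i.e. /proc/sys/net/ipv4/route/flush: any write
 * flushes the route cache and bumps the fnhe genid. Wrapped in #if 0 because
 * it is user-space code.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

static int flush_ipv4_route_cache(void)
{
	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, "1", 1) != 1) {	/* the value written is ignored */
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}
#endif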
2892
2893 static struct ctl_table ipv4_route_table[] = {
2894 {
2895 .procname = "gc_thresh",
2896 .data = &ipv4_dst_ops.gc_thresh,
2897 .maxlen = sizeof(int),
2898 .mode = 0644,
2899 .proc_handler = proc_dointvec,
2900 },
2901 {
2902 .procname = "max_size",
2903 .data = &ip_rt_max_size,
2904 .maxlen = sizeof(int),
2905 .mode = 0644,
2906 .proc_handler = proc_dointvec,
2907 },
2908 {
2909 /* Deprecated. Use gc_min_interval_ms */
2910
2911 .procname = "gc_min_interval",
2912 .data = &ip_rt_gc_min_interval,
2913 .maxlen = sizeof(int),
2914 .mode = 0644,
2915 .proc_handler = proc_dointvec_jiffies,
2916 },
2917 {
2918 .procname = "gc_min_interval_ms",
2919 .data = &ip_rt_gc_min_interval,
2920 .maxlen = sizeof(int),
2921 .mode = 0644,
2922 .proc_handler = proc_dointvec_ms_jiffies,
2923 },
2924 {
2925 .procname = "gc_timeout",
2926 .data = &ip_rt_gc_timeout,
2927 .maxlen = sizeof(int),
2928 .mode = 0644,
2929 .proc_handler = proc_dointvec_jiffies,
2930 },
2931 {
2932 .procname = "gc_interval",
2933 .data = &ip_rt_gc_interval,
2934 .maxlen = sizeof(int),
2935 .mode = 0644,
2936 .proc_handler = proc_dointvec_jiffies,
2937 },
2938 {
2939 .procname = "redirect_load",
2940 .data = &ip_rt_redirect_load,
2941 .maxlen = sizeof(int),
2942 .mode = 0644,
2943 .proc_handler = proc_dointvec,
2944 },
2945 {
2946 .procname = "redirect_number",
2947 .data = &ip_rt_redirect_number,
2948 .maxlen = sizeof(int),
2949 .mode = 0644,
2950 .proc_handler = proc_dointvec,
2951 },
2952 {
2953 .procname = "redirect_silence",
2954 .data = &ip_rt_redirect_silence,
2955 .maxlen = sizeof(int),
2956 .mode = 0644,
2957 .proc_handler = proc_dointvec,
2958 },
2959 {
2960 .procname = "error_cost",
2961 .data = &ip_rt_error_cost,
2962 .maxlen = sizeof(int),
2963 .mode = 0644,
2964 .proc_handler = proc_dointvec,
2965 },
2966 {
2967 .procname = "error_burst",
2968 .data = &ip_rt_error_burst,
2969 .maxlen = sizeof(int),
2970 .mode = 0644,
2971 .proc_handler = proc_dointvec,
2972 },
2973 {
2974 .procname = "gc_elasticity",
2975 .data = &ip_rt_gc_elasticity,
2976 .maxlen = sizeof(int),
2977 .mode = 0644,
2978 .proc_handler = proc_dointvec,
2979 },
2980 {
2981 .procname = "mtu_expires",
2982 .data = &ip_rt_mtu_expires,
2983 .maxlen = sizeof(int),
2984 .mode = 0644,
2985 .proc_handler = proc_dointvec_jiffies,
2986 },
2987 {
2988 .procname = "min_pmtu",
2989 .data = &ip_rt_min_pmtu,
2990 .maxlen = sizeof(int),
2991 .mode = 0644,
2992 .proc_handler = proc_dointvec_minmax,
2993 .extra1 = &ip_min_valid_pmtu,
2994 },
2995 {
2996 .procname = "min_adv_mss",
2997 .data = &ip_rt_min_advmss,
2998 .maxlen = sizeof(int),
2999 .mode = 0644,
3000 .proc_handler = proc_dointvec,
3001 },
3002 { }
3003 };
3004
3005 static struct ctl_table ipv4_route_flush_table[] = {
3006 {
3007 .procname = "flush",
3008 .maxlen = sizeof(int),
3009 .mode = 0200,
3010 .proc_handler = ipv4_sysctl_rtcache_flush,
3011 },
3012 { },
3013 };
3014
3015 static __net_init int sysctl_route_net_init(struct net *net)
3016 {
3017 struct ctl_table *tbl;
3018
3019 tbl = ipv4_route_flush_table;
3020 if (!net_eq(net, &init_net)) {
3021 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3022 if (!tbl)
3023 goto err_dup;
3024
3025 /* Don't export sysctls to unprivileged users */
3026 if (net->user_ns != &init_user_ns)
3027 tbl[0].procname = NULL;
3028 }
3029 tbl[0].extra1 = net;
3030
3031 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3032 if (!net->ipv4.route_hdr)
3033 goto err_reg;
3034 return 0;
3035
3036 err_reg:
3037 if (tbl != ipv4_route_flush_table)
3038 kfree(tbl);
3039 err_dup:
3040 return -ENOMEM;
3041 }
3042
3043 static __net_exit void sysctl_route_net_exit(struct net *net)
3044 {
3045 struct ctl_table *tbl;
3046
3047 tbl = net->ipv4.route_hdr->ctl_table_arg;
3048 unregister_net_sysctl_table(net->ipv4.route_hdr);
3049 BUG_ON(tbl == ipv4_route_flush_table);
3050 kfree(tbl);
3051 }
3052
3053 static __net_initdata struct pernet_operations sysctl_route_ops = {
3054 .init = sysctl_route_net_init,
3055 .exit = sysctl_route_net_exit,
3056 };
3057 #endif
3058
3059 static __net_init int rt_genid_init(struct net *net)
3060 {
3061 atomic_set(&net->ipv4.rt_genid, 0);
3062 atomic_set(&net->fnhe_genid, 0);
3063 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3064 return 0;
3065 }
3066
3067 static __net_initdata struct pernet_operations rt_genid_ops = {
3068 .init = rt_genid_init,
3069 };
3070
3071 static int __net_init ipv4_inetpeer_init(struct net *net)
3072 {
3073 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3074
3075 if (!bp)
3076 return -ENOMEM;
3077 inet_peer_base_init(bp);
3078 net->ipv4.peers = bp;
3079 return 0;
3080 }
3081
3082 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3083 {
3084 struct inet_peer_base *bp = net->ipv4.peers;
3085
3086 net->ipv4.peers = NULL;
3087 inetpeer_invalidate_tree(bp);
3088 kfree(bp);
3089 }
3090
3091 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3092 .init = ipv4_inetpeer_init,
3093 .exit = ipv4_inetpeer_exit,
3094 };
3095
3096 #ifdef CONFIG_IP_ROUTE_CLASSID
3097 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3098 #endif /* CONFIG_IP_ROUTE_CLASSID */
3099
3100 int __init ip_rt_init(void)
3101 {
3102 int cpu;
3103
3104 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3105 if (!ip_idents)
3106 panic("IP: failed to allocate ip_idents\n");
3107
3108 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3109
3110 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3111 if (!ip_tstamps)
3112 panic("IP: failed to allocate ip_tstamps\n");
3113
3114 for_each_possible_cpu(cpu) {
3115 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3116
3117 INIT_LIST_HEAD(&ul->head);
3118 spin_lock_init(&ul->lock);
3119 }
3120 #ifdef CONFIG_IP_ROUTE_CLASSID
3121 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3122 if (!ip_rt_acct)
3123 panic("IP: failed to allocate ip_rt_acct\n");
3124 #endif
3125
3126 ipv4_dst_ops.kmem_cachep =
3127 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3128 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3129
3130 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3131
3132 if (dst_entries_init(&ipv4_dst_ops) < 0)
3133 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3134
3135 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3136 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3137
3138 ipv4_dst_ops.gc_thresh = ~0;
3139 ip_rt_max_size = INT_MAX;
3140
3141 devinet_init();
3142 ip_fib_init();
3143
3144 if (ip_rt_proc_init())
3145 pr_err("Unable to create route proc files\n");
3146 #ifdef CONFIG_XFRM
3147 xfrm_init();
3148 xfrm4_init();
3149 #endif
3150 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3151 RTNL_FLAG_DOIT_UNLOCKED);
3152
3153 #ifdef CONFIG_SYSCTL
3154 register_pernet_subsys(&sysctl_route_ops);
3155 #endif
3156 register_pernet_subsys(&rt_genid_ops);
3157 register_pernet_subsys(&ipv4_inetpeer_ops);
3158 return 0;
3159 }
3160
3161 #ifdef CONFIG_SYSCTL
3162 /*
3163 * We really need to sanitize the damn ipv4 init order, then all
3164 * this nonsense will go away.
3165 */
3166 void __init ip_static_sysctl_init(void)
3167 {
3168 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3169 }
3170 #endif