1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * ROUTE - implementation of the IP router.
8 *
9 * Authors: Ross Biro
10 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 * Alan Cox, <gw4pts@gw4pts.ampr.org>
12 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14 *
15 * Fixes:
16 * Alan Cox : Verify area fixes.
17 * Alan Cox : cli() protects routing changes
18 * Rui Oliveira : ICMP routing table updates
19 * (rco@di.uminho.pt) Routing table insertion and update
20 * Linus Torvalds : Rewrote bits to be sensible
21 * Alan Cox : Added BSD route gw semantics
22 * Alan Cox : Super /proc >4K
23 * Alan Cox : MTU in route table
24 * Alan Cox : MSS actually. Also added the window
25 * clamper.
26 * Sam Lantinga : Fixed route matching in rt_del()
27 * Alan Cox : Routing cache support.
28 * Alan Cox : Removed compatibility cruft.
29 * Alan Cox : RTF_REJECT support.
30 * Alan Cox : TCP irtt support.
31 * Jonathan Naylor : Added Metric support.
32 * Miquel van Smoorenburg : BSD API fixes.
33 * Miquel van Smoorenburg : Metrics.
34 * Alan Cox : Use __u32 properly
35 * Alan Cox : Aligned routing errors more closely with BSD
36 * our system is still very different.
37 * Alan Cox : Faster /proc handling
38 * Alexey Kuznetsov : Massive rework to support tree based routing,
39 * routing caches and better behaviour.
40 *
41 * Olaf Erb : irtt wasn't being copied right.
42 * Bjorn Ekwall : Kerneld route support.
43 * Alan Cox : Multicast fixed (I hope)
44 * Pavel Krauz : Limited broadcast fixed
45 * Mike McLagan : Routing by source
46 * Alexey Kuznetsov : End of old history. Split to fib.c and
47 * route.c and rewritten from scratch.
48 * Andi Kleen : Load-limit warning messages.
49 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
50 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
51 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
52 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
53 * Marc Boucher : routing by fwmark
54 * Robert Olsson : Added rt_cache statistics
55 * Arnaldo C. Melo : Convert proc stuff to seq_file
56 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
58 * Ilia Sotnikov : Removed TOS from hash calculations
59 */
60
61 #define pr_fmt(fmt) "IPv4: " fmt
62
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/string.h>
70 #include <linux/socket.h>
71 #include <linux/sockios.h>
72 #include <linux/errno.h>
73 #include <linux/in.h>
74 #include <linux/inet.h>
75 #include <linux/netdevice.h>
76 #include <linux/proc_fs.h>
77 #include <linux/init.h>
78 #include <linux/skbuff.h>
79 #include <linux/inetdevice.h>
80 #include <linux/igmp.h>
81 #include <linux/pkt_sched.h>
82 #include <linux/mroute.h>
83 #include <linux/netfilter_ipv4.h>
84 #include <linux/random.h>
85 #include <linux/rcupdate.h>
86 #include <linux/times.h>
87 #include <linux/slab.h>
88 #include <linux/jhash.h>
89 #include <net/dst.h>
90 #include <net/dst_metadata.h>
91 #include <net/net_namespace.h>
92 #include <net/protocol.h>
93 #include <net/ip.h>
94 #include <net/route.h>
95 #include <net/inetpeer.h>
96 #include <net/sock.h>
97 #include <net/ip_fib.h>
98 #include <net/arp.h>
99 #include <net/tcp.h>
100 #include <net/icmp.h>
101 #include <net/xfrm.h>
102 #include <net/lwtunnel.h>
103 #include <net/netevent.h>
104 #include <net/rtnetlink.h>
105 #ifdef CONFIG_SYSCTL
106 #include <linux/sysctl.h>
107 #endif
108 #include <net/secure_seq.h>
109 #include <net/ip_tunnels.h>
110 #include <net/l3mdev.h>
111
112 #include "fib_lookup.h"
113
114 #define RT_FL_TOS(oldflp4) \
115 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
116
117 #define RT_GC_TIMEOUT (300*HZ)
118
119 static int ip_rt_max_size;
120 static int ip_rt_redirect_number __read_mostly = 9;
121 static int ip_rt_redirect_load __read_mostly = HZ / 50;
122 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
123 static int ip_rt_error_cost __read_mostly = HZ;
124 static int ip_rt_error_burst __read_mostly = 5 * HZ;
125 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
126 static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
127 static int ip_rt_min_advmss __read_mostly = 256;
128
129 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
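/* Defaults for the route-related tunables above. In mainline these are
 * normally exposed as the net.ipv4.route.* sysctls (redirect_number,
 * redirect_load, redirect_silence, error_cost, error_burst, mtu_expires,
 * min_pmtu, min_adv_mss, gc_timeout); the sysctl table itself appears
 * further down in this file, under CONFIG_SYSCTL.
 */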
130
131 /*
132 * Interface to generic destination cache.
133 */
134
135 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
136 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
137 static unsigned int ipv4_mtu(const struct dst_entry *dst);
138 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
139 static void ipv4_link_failure(struct sk_buff *skb);
140 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
141 struct sk_buff *skb, u32 mtu);
142 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
143 struct sk_buff *skb);
144 static void ipv4_dst_destroy(struct dst_entry *dst);
145
146 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
147 {
148 WARN_ON(1);
149 return NULL;
150 }
151
152 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
153 struct sk_buff *skb,
154 const void *daddr);
155 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
156
157 static struct dst_ops ipv4_dst_ops = {
158 .family = AF_INET,
159 .check = ipv4_dst_check,
160 .default_advmss = ipv4_default_advmss,
161 .mtu = ipv4_mtu,
162 .cow_metrics = ipv4_cow_metrics,
163 .destroy = ipv4_dst_destroy,
164 .negative_advice = ipv4_negative_advice,
165 .link_failure = ipv4_link_failure,
166 .update_pmtu = ip_rt_update_pmtu,
167 .redirect = ip_do_redirect,
168 .local_out = __ip_local_out,
169 .neigh_lookup = ipv4_neigh_lookup,
170 .confirm_neigh = ipv4_confirm_neigh,
171 };
172
173 #define ECN_OR_COST(class) TC_PRIO_##class
174
175 const __u8 ip_tos2prio[16] = {
176 TC_PRIO_BESTEFFORT,
177 ECN_OR_COST(BESTEFFORT),
178 TC_PRIO_BESTEFFORT,
179 ECN_OR_COST(BESTEFFORT),
180 TC_PRIO_BULK,
181 ECN_OR_COST(BULK),
182 TC_PRIO_BULK,
183 ECN_OR_COST(BULK),
184 TC_PRIO_INTERACTIVE,
185 ECN_OR_COST(INTERACTIVE),
186 TC_PRIO_INTERACTIVE,
187 ECN_OR_COST(INTERACTIVE),
188 TC_PRIO_INTERACTIVE_BULK,
189 ECN_OR_COST(INTERACTIVE_BULK),
190 TC_PRIO_INTERACTIVE_BULK,
191 ECN_OR_COST(INTERACTIVE_BULK)
192 };
193 EXPORT_SYMBOL(ip_tos2prio);
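/* ip_tos2prio[] maps the legacy 4-bit IPv4 TOS value (the RFC 1349
 * lowdelay/throughput/reliability/mincost bits) to a packet scheduler
 * priority band; rt_tos2priority() in include/net/route.h indexes it with
 * IPTOS_TOS(tos) >> 1. Each pair of entries is identical because
 * ECN_OR_COST(class) expands to the same TC_PRIO_##class value, so the
 * low-order "mincost" bit does not change the selected band.
 */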
194
195 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
196 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
197
198 #ifdef CONFIG_PROC_FS
199 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
200 {
201 if (*pos)
202 return NULL;
203 return SEQ_START_TOKEN;
204 }
205
206 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
207 {
208 ++*pos;
209 return NULL;
210 }
211
212 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
213 {
214 }
215
216 static int rt_cache_seq_show(struct seq_file *seq, void *v)
217 {
218 if (v == SEQ_START_TOKEN)
219 seq_printf(seq, "%-127s\n",
220 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
221 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
222 "HHUptod\tSpecDst");
223 return 0;
224 }
225
226 static const struct seq_operations rt_cache_seq_ops = {
227 .start = rt_cache_seq_start,
228 .next = rt_cache_seq_next,
229 .stop = rt_cache_seq_stop,
230 .show = rt_cache_seq_show,
231 };
232
233 static int rt_cache_seq_open(struct inode *inode, struct file *file)
234 {
235 return seq_open(file, &rt_cache_seq_ops);
236 }
237
238 static const struct file_operations rt_cache_seq_fops = {
239 .open = rt_cache_seq_open,
240 .read = seq_read,
241 .llseek = seq_lseek,
242 .release = seq_release,
243 };
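/* Since the removal of the IPv4 routing cache, /proc/net/rt_cache has no
 * entries to report: rt_cache_seq_show() above only emits the header line.
 * The file is apparently kept so existing userspace tools that read it do
 * not break.
 */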
244
245
246 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
247 {
248 int cpu;
249
250 if (*pos == 0)
251 return SEQ_START_TOKEN;
252
253 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
254 if (!cpu_possible(cpu))
255 continue;
256 *pos = cpu+1;
257 return &per_cpu(rt_cache_stat, cpu);
258 }
259 return NULL;
260 }
261
262 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
263 {
264 int cpu;
265
266 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
267 if (!cpu_possible(cpu))
268 continue;
269 *pos = cpu+1;
270 return &per_cpu(rt_cache_stat, cpu);
271 }
272 return NULL;
273
274 }
275
276 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
277 {
278
279 }
280
281 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
282 {
283 struct rt_cache_stat *st = v;
284
285 if (v == SEQ_START_TOKEN) {
286 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
287 return 0;
288 }
289
290 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
291 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
292 dst_entries_get_slow(&ipv4_dst_ops),
293 0, /* st->in_hit */
294 st->in_slow_tot,
295 st->in_slow_mc,
296 st->in_no_route,
297 st->in_brd,
298 st->in_martian_dst,
299 st->in_martian_src,
300
301 0, /* st->out_hit */
302 st->out_slow_tot,
303 st->out_slow_mc,
304
305 0, /* st->gc_total */
306 0, /* st->gc_ignored */
307 0, /* st->gc_goal_miss */
308 0, /* st->gc_dst_overflow */
309 0, /* st->in_hlist_search */
310 0 /* st->out_hlist_search */
311 );
312 return 0;
313 }
314
315 static const struct seq_operations rt_cpu_seq_ops = {
316 .start = rt_cpu_seq_start,
317 .next = rt_cpu_seq_next,
318 .stop = rt_cpu_seq_stop,
319 .show = rt_cpu_seq_show,
320 };
321
322
323 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
324 {
325 return seq_open(file, &rt_cpu_seq_ops);
326 }
327
328 static const struct file_operations rt_cpu_seq_fops = {
329 .open = rt_cpu_seq_open,
330 .read = seq_read,
331 .llseek = seq_lseek,
332 .release = seq_release,
333 };
334
335 #ifdef CONFIG_IP_ROUTE_CLASSID
336 static int rt_acct_proc_show(struct seq_file *m, void *v)
337 {
338 struct ip_rt_acct *dst, *src;
339 unsigned int i, j;
340
341 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
342 if (!dst)
343 return -ENOMEM;
344
345 for_each_possible_cpu(i) {
346 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
347 for (j = 0; j < 256; j++) {
348 dst[j].o_bytes += src[j].o_bytes;
349 dst[j].o_packets += src[j].o_packets;
350 dst[j].i_bytes += src[j].i_bytes;
351 dst[j].i_packets += src[j].i_packets;
352 }
353 }
354
355 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
356 kfree(dst);
357 return 0;
358 }
359 #endif
360
361 static int __net_init ip_rt_do_proc_init(struct net *net)
362 {
363 struct proc_dir_entry *pde;
364
365 pde = proc_create("rt_cache", 0444, net->proc_net,
366 &rt_cache_seq_fops);
367 if (!pde)
368 goto err1;
369
370 pde = proc_create("rt_cache", 0444,
371 net->proc_net_stat, &rt_cpu_seq_fops);
372 if (!pde)
373 goto err2;
374
375 #ifdef CONFIG_IP_ROUTE_CLASSID
376 pde = proc_create_single("rt_acct", 0, net->proc_net,
377 rt_acct_proc_show);
378 if (!pde)
379 goto err3;
380 #endif
381 return 0;
382
383 #ifdef CONFIG_IP_ROUTE_CLASSID
384 err3:
385 remove_proc_entry("rt_cache", net->proc_net_stat);
386 #endif
387 err2:
388 remove_proc_entry("rt_cache", net->proc_net);
389 err1:
390 return -ENOMEM;
391 }
392
393 static void __net_exit ip_rt_do_proc_exit(struct net *net)
394 {
395 remove_proc_entry("rt_cache", net->proc_net_stat);
396 remove_proc_entry("rt_cache", net->proc_net);
397 #ifdef CONFIG_IP_ROUTE_CLASSID
398 remove_proc_entry("rt_acct", net->proc_net);
399 #endif
400 }
401
402 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
403 .init = ip_rt_do_proc_init,
404 .exit = ip_rt_do_proc_exit,
405 };
406
407 static int __init ip_rt_proc_init(void)
408 {
409 return register_pernet_subsys(&ip_rt_proc_ops);
410 }
411
412 #else
413 static inline int ip_rt_proc_init(void)
414 {
415 return 0;
416 }
417 #endif /* CONFIG_PROC_FS */
418
419 static inline bool rt_is_expired(const struct rtable *rth)
420 {
421 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
422 }
423
424 void rt_cache_flush(struct net *net)
425 {
426 rt_genid_bump_ipv4(net);
427 }
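/* Flushing is O(1): bumping the per-netns generation id makes every cached
 * rtable fail the rt_is_expired() check above, so stale entries are simply
 * skipped and re-created on the next lookup rather than torn down
 * synchronously.
 */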
428
429 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
430 struct sk_buff *skb,
431 const void *daddr)
432 {
433 const struct rtable *rt = container_of(dst, struct rtable, dst);
434 struct net_device *dev = dst->dev;
435 struct neighbour *n;
436
437 rcu_read_lock_bh();
438
439 if (likely(rt->rt_gw_family == AF_INET)) {
440 n = ip_neigh_gw4(dev, rt->rt_gw4);
441 } else if (rt->rt_gw_family == AF_INET6) {
442 n = ip_neigh_gw6(dev, &rt->rt_gw6);
443 } else {
444 __be32 pkey;
445
446 pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
447 n = ip_neigh_gw4(dev, pkey);
448 }
449
450 if (n && !refcount_inc_not_zero(&n->refcnt))
451 n = NULL;
452
453 rcu_read_unlock_bh();
454
455 return n;
456 }
457
458 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
459 {
460 const struct rtable *rt = container_of(dst, struct rtable, dst);
461 struct net_device *dev = dst->dev;
462 const __be32 *pkey = daddr;
463
464 if (rt->rt_gw_family == AF_INET) {
465 pkey = (const __be32 *)&rt->rt_gw4;
466 } else if (rt->rt_gw_family == AF_INET6) {
467 return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
468 } else if (!daddr ||
469 (rt->rt_flags &
470 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
471 return;
472 }
473 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
474 }
475
476 #define IP_IDENTS_SZ 2048u
477
478 static atomic_t *ip_idents __read_mostly;
479 static u32 *ip_tstamps __read_mostly;
480
481 /* In order to protect privacy, we add a perturbation to identifiers
482 * if one generator is seldom used. This makes it hard for an attacker
483 * to infer how many packets were sent between two points in time.
484 */
485 u32 ip_idents_reserve(u32 hash, int segs)
486 {
487 u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
488 atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
489 u32 old = READ_ONCE(*p_tstamp);
490 u32 now = (u32)jiffies;
491 u32 new, delta = 0;
492
493 if (old != now && cmpxchg(p_tstamp, old, now) == old)
494 delta = prandom_u32_max(now - old);
495
496 /* Do not use atomic_add_return() as it makes UBSAN unhappy */
497 do {
498 old = (u32)atomic_read(p_id);
499 new = old + delta + segs;
500 } while (atomic_cmpxchg(p_id, old, new) != old);
501
502 return new - segs;
503 }
504 EXPORT_SYMBOL(ip_idents_reserve);
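/* A rough sketch of the scheme above: IDs come from IP_IDENTS_SZ per-bucket
 * atomic counters selected by a hash of the flow. ip_tstamps[] remembers the
 * last jiffy each bucket was used; if a bucket has been idle, a random delta
 * bounded by the idle time is mixed in, so an observer cannot recover the
 * exact number of packets sent from the gap between two observed IDs. The
 * function reserves 'segs' consecutive IDs and returns the first one, which
 * __ip_select_ident() below stores into iph->id.
 */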
505
506 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
507 {
508 u32 hash, id;
509
510 /* Note the following code is not safe, but this is okay. */
511 if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
512 get_random_bytes(&net->ipv4.ip_id_key,
513 sizeof(net->ipv4.ip_id_key));
514
515 hash = siphash_3u32((__force u32)iph->daddr,
516 (__force u32)iph->saddr,
517 iph->protocol,
518 &net->ipv4.ip_id_key);
519 id = ip_idents_reserve(hash, segs);
520 iph->id = htons(id);
521 }
522 EXPORT_SYMBOL(__ip_select_ident);
523
524 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
525 const struct sock *sk,
526 const struct iphdr *iph,
527 int oif, u8 tos,
528 u8 prot, u32 mark, int flow_flags)
529 {
530 if (sk) {
531 const struct inet_sock *inet = inet_sk(sk);
532
533 oif = sk->sk_bound_dev_if;
534 mark = sk->sk_mark;
535 tos = RT_CONN_FLAGS(sk);
536 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
537 }
538 flowi4_init_output(fl4, oif, mark, tos,
539 RT_SCOPE_UNIVERSE, prot,
540 flow_flags,
541 iph->daddr, iph->saddr, 0, 0,
542 sock_net_uid(net, sk));
543 }
544
545 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
546 const struct sock *sk)
547 {
548 const struct net *net = dev_net(skb->dev);
549 const struct iphdr *iph = ip_hdr(skb);
550 int oif = skb->dev->ifindex;
551 u8 tos = RT_TOS(iph->tos);
552 u8 prot = iph->protocol;
553 u32 mark = skb->mark;
554
555 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
556 }
557
558 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
559 {
560 const struct inet_sock *inet = inet_sk(sk);
561 const struct ip_options_rcu *inet_opt;
562 __be32 daddr = inet->inet_daddr;
563
564 rcu_read_lock();
565 inet_opt = rcu_dereference(inet->inet_opt);
566 if (inet_opt && inet_opt->opt.srr)
567 daddr = inet_opt->opt.faddr;
568 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
569 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
570 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
571 inet_sk_flowi_flags(sk),
572 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
573 rcu_read_unlock();
574 }
575
576 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
577 const struct sk_buff *skb)
578 {
579 if (skb)
580 build_skb_flow_key(fl4, skb, sk);
581 else
582 build_sk_flow_key(fl4, sk);
583 }
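/* The three helpers above build the flowi4 used for route lookups on the
 * PMTU and redirect paths below: when a socket is supplied, its bound
 * device, mark, TOS and protocol take precedence over the values taken from
 * the packet, and build_sk_flow_key() honours a source-routing (SRR)
 * first-hop address from the socket's IP options.
 */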
584
585 static DEFINE_SPINLOCK(fnhe_lock);
586
587 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
588 {
589 struct rtable *rt;
590
591 rt = rcu_dereference(fnhe->fnhe_rth_input);
592 if (rt) {
593 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
594 dst_dev_put(&rt->dst);
595 dst_release(&rt->dst);
596 }
597 rt = rcu_dereference(fnhe->fnhe_rth_output);
598 if (rt) {
599 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
600 dst_dev_put(&rt->dst);
601 dst_release(&rt->dst);
602 }
603 }
604
605 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
606 {
607 struct fib_nh_exception *fnhe, *oldest;
608
609 oldest = rcu_dereference(hash->chain);
610 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
611 fnhe = rcu_dereference(fnhe->fnhe_next)) {
612 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
613 oldest = fnhe;
614 }
615 fnhe_flush_routes(oldest);
616 return oldest;
617 }
618
619 static inline u32 fnhe_hashfun(__be32 daddr)
620 {
621 static u32 fnhe_hashrnd __read_mostly;
622 u32 hval;
623
624 net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
625 hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
626 return hash_32(hval, FNHE_HASH_SHIFT);
627 }
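/* Next-hop exceptions (fnhe) record per-destination state learned from ICMP
 * and PMTU discovery (a redirected gateway, a reduced PMTU and its expiry),
 * hanging off the fib_nh_common in a small hash table of FNHE_HASH_SIZE
 * buckets keyed by fnhe_hashfun(daddr). Updates are serialized by fnhe_lock;
 * when a chain grows past FNHE_RECLAIM_DEPTH, fnhe_oldest() above recycles
 * the entry with the oldest fnhe_stamp.
 */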
628
629 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
630 {
631 rt->rt_pmtu = fnhe->fnhe_pmtu;
632 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
633 rt->dst.expires = fnhe->fnhe_expires;
634
635 if (fnhe->fnhe_gw) {
636 rt->rt_flags |= RTCF_REDIRECTED;
637 rt->rt_gw_family = AF_INET;
638 rt->rt_gw4 = fnhe->fnhe_gw;
639 }
640 }
641
642 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
643 __be32 gw, u32 pmtu, bool lock,
644 unsigned long expires)
645 {
646 struct fnhe_hash_bucket *hash;
647 struct fib_nh_exception *fnhe;
648 struct rtable *rt;
649 u32 genid, hval;
650 unsigned int i;
651 int depth;
652
653 genid = fnhe_genid(dev_net(nhc->nhc_dev));
654 hval = fnhe_hashfun(daddr);
655
656 spin_lock_bh(&fnhe_lock);
657
658 hash = rcu_dereference(nhc->nhc_exceptions);
659 if (!hash) {
660 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
661 if (!hash)
662 goto out_unlock;
663 rcu_assign_pointer(nhc->nhc_exceptions, hash);
664 }
665
666 hash += hval;
667
668 depth = 0;
669 for (fnhe = rcu_dereference(hash->chain); fnhe;
670 fnhe = rcu_dereference(fnhe->fnhe_next)) {
671 if (fnhe->fnhe_daddr == daddr)
672 break;
673 depth++;
674 }
675
676 if (fnhe) {
677 if (fnhe->fnhe_genid != genid)
678 fnhe->fnhe_genid = genid;
679 if (gw)
680 fnhe->fnhe_gw = gw;
681 if (pmtu) {
682 fnhe->fnhe_pmtu = pmtu;
683 fnhe->fnhe_mtu_locked = lock;
684 }
685 fnhe->fnhe_expires = max(1UL, expires);
686 /* Update all cached dsts too */
687 rt = rcu_dereference(fnhe->fnhe_rth_input);
688 if (rt)
689 fill_route_from_fnhe(rt, fnhe);
690 rt = rcu_dereference(fnhe->fnhe_rth_output);
691 if (rt)
692 fill_route_from_fnhe(rt, fnhe);
693 } else {
694 if (depth > FNHE_RECLAIM_DEPTH)
695 fnhe = fnhe_oldest(hash);
696 else {
697 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
698 if (!fnhe)
699 goto out_unlock;
700
701 fnhe->fnhe_next = hash->chain;
702 rcu_assign_pointer(hash->chain, fnhe);
703 }
704 fnhe->fnhe_genid = genid;
705 fnhe->fnhe_daddr = daddr;
706 fnhe->fnhe_gw = gw;
707 fnhe->fnhe_pmtu = pmtu;
708 fnhe->fnhe_mtu_locked = lock;
709 fnhe->fnhe_expires = max(1UL, expires);
710
711 /* Exception created; mark the cached routes for the nexthop
712 * stale, so anyone caching it rechecks if this exception
713 * applies to them.
714 */
715 rt = rcu_dereference(nhc->nhc_rth_input);
716 if (rt)
717 rt->dst.obsolete = DST_OBSOLETE_KILL;
718
719 for_each_possible_cpu(i) {
720 struct rtable __rcu **prt;
721 prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
722 rt = rcu_dereference(*prt);
723 if (rt)
724 rt->dst.obsolete = DST_OBSOLETE_KILL;
725 }
726 }
727
728 fnhe->fnhe_stamp = jiffies;
729
730 out_unlock:
731 spin_unlock_bh(&fnhe_lock);
732 }
733
734 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
735 bool kill_route)
736 {
737 __be32 new_gw = icmp_hdr(skb)->un.gateway;
738 __be32 old_gw = ip_hdr(skb)->saddr;
739 struct net_device *dev = skb->dev;
740 struct in_device *in_dev;
741 struct fib_result res;
742 struct neighbour *n;
743 struct net *net;
744
745 switch (icmp_hdr(skb)->code & 7) {
746 case ICMP_REDIR_NET:
747 case ICMP_REDIR_NETTOS:
748 case ICMP_REDIR_HOST:
749 case ICMP_REDIR_HOSTTOS:
750 break;
751
752 default:
753 return;
754 }
755
756 if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
757 return;
758
759 in_dev = __in_dev_get_rcu(dev);
760 if (!in_dev)
761 return;
762
763 net = dev_net(dev);
764 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
765 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
766 ipv4_is_zeronet(new_gw))
767 goto reject_redirect;
768
769 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
770 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
771 goto reject_redirect;
772 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
773 goto reject_redirect;
774 } else {
775 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
776 goto reject_redirect;
777 }
778
779 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
780 if (!n)
781 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
782 if (!IS_ERR(n)) {
783 if (!(n->nud_state & NUD_VALID)) {
784 neigh_event_send(n, NULL);
785 } else {
786 if (fib_lookup(net, fl4, &res, 0) == 0) {
787 struct fib_nh_common *nhc = FIB_RES_NHC(res);
788
789 update_or_create_fnhe(nhc, fl4->daddr, new_gw,
790 0, false,
791 jiffies + ip_rt_gc_timeout);
792 }
793 if (kill_route)
794 rt->dst.obsolete = DST_OBSOLETE_KILL;
795 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
796 }
797 neigh_release(n);
798 }
799 return;
800
801 reject_redirect:
802 #ifdef CONFIG_IP_ROUTE_VERBOSE
803 if (IN_DEV_LOG_MARTIANS(in_dev)) {
804 const struct iphdr *iph = (const struct iphdr *) skb->data;
805 __be32 daddr = iph->daddr;
806 __be32 saddr = iph->saddr;
807
808 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
809 " Advised path = %pI4 -> %pI4\n",
810 &old_gw, dev->name, &new_gw,
811 &saddr, &daddr);
812 }
813 #endif
814 ;
815 }
816
817 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
818 {
819 struct rtable *rt;
820 struct flowi4 fl4;
821 const struct iphdr *iph = (const struct iphdr *) skb->data;
822 struct net *net = dev_net(skb->dev);
823 int oif = skb->dev->ifindex;
824 u8 tos = RT_TOS(iph->tos);
825 u8 prot = iph->protocol;
826 u32 mark = skb->mark;
827
828 rt = (struct rtable *) dst;
829
830 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
831 __ip_do_redirect(rt, skb, &fl4, true);
832 }
833
834 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
835 {
836 struct rtable *rt = (struct rtable *)dst;
837 struct dst_entry *ret = dst;
838
839 if (rt) {
840 if (dst->obsolete > 0) {
841 ip_rt_put(rt);
842 ret = NULL;
843 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
844 rt->dst.expires) {
845 ip_rt_put(rt);
846 ret = NULL;
847 }
848 }
849 return ret;
850 }
851
852 /*
853 * Algorithm:
854 * 1. The first ip_rt_redirect_number redirects are sent
855 * with exponential backoff, then we stop sending them at all,
856 * assuming that the host ignores our redirects.
857 * 2. If we did not see packets requiring redirects
858 * during ip_rt_redirect_silence, we assume that the host
859 * forgot the redirected route and we start sending redirects again.
860 *
861 * This algorithm is much cheaper and more intelligent than dumb load limiting
862 * in icmp.c.
863 *
864 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
865 * and "frag. need" (breaks PMTU discovery) in icmp.c.
866 */
867
868 void ip_rt_send_redirect(struct sk_buff *skb)
869 {
870 struct rtable *rt = skb_rtable(skb);
871 struct in_device *in_dev;
872 struct inet_peer *peer;
873 struct net *net;
874 int log_martians;
875 int vif;
876
877 rcu_read_lock();
878 in_dev = __in_dev_get_rcu(rt->dst.dev);
879 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
880 rcu_read_unlock();
881 return;
882 }
883 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
884 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
885 rcu_read_unlock();
886
887 net = dev_net(rt->dst.dev);
888 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
889 if (!peer) {
890 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
891 rt_nexthop(rt, ip_hdr(skb)->daddr));
892 return;
893 }
894
895 /* No redirected packets during ip_rt_redirect_silence;
896 * reset the algorithm.
897 */
898 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
899 peer->rate_tokens = 0;
900 peer->n_redirects = 0;
901 }
902
903 /* Too many ignored redirects; do not send anything.
904 * Set peer->rate_last to the last seen redirected packet.
905 */
906 if (peer->n_redirects >= ip_rt_redirect_number) {
907 peer->rate_last = jiffies;
908 goto out_put_peer;
909 }
910
911 /* Check for load limit; set rate_last to the latest sent
912 * redirect.
913 */
914 if (peer->rate_tokens == 0 ||
915 time_after(jiffies,
916 (peer->rate_last +
917 (ip_rt_redirect_load << peer->rate_tokens)))) {
918 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
919
920 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
921 peer->rate_last = jiffies;
922 ++peer->rate_tokens;
923 ++peer->n_redirects;
924 #ifdef CONFIG_IP_ROUTE_VERBOSE
925 if (log_martians &&
926 peer->rate_tokens == ip_rt_redirect_number)
927 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
928 &ip_hdr(skb)->saddr, inet_iif(skb),
929 &ip_hdr(skb)->daddr, &gw);
930 #endif
931 }
932 out_put_peer:
933 inet_putpeer(peer);
934 }
935
936 static int ip_error(struct sk_buff *skb)
937 {
938 struct rtable *rt = skb_rtable(skb);
939 struct net_device *dev = skb->dev;
940 struct in_device *in_dev;
941 struct inet_peer *peer;
942 unsigned long now;
943 struct net *net;
944 bool send;
945 int code;
946
947 if (netif_is_l3_master(skb->dev)) {
948 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
949 if (!dev)
950 goto out;
951 }
952
953 in_dev = __in_dev_get_rcu(dev);
954
955 /* IP on this device is disabled. */
956 if (!in_dev)
957 goto out;
958
959 net = dev_net(rt->dst.dev);
960 if (!IN_DEV_FORWARD(in_dev)) {
961 switch (rt->dst.error) {
962 case EHOSTUNREACH:
963 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
964 break;
965
966 case ENETUNREACH:
967 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
968 break;
969 }
970 goto out;
971 }
972
973 switch (rt->dst.error) {
974 case EINVAL:
975 default:
976 goto out;
977 case EHOSTUNREACH:
978 code = ICMP_HOST_UNREACH;
979 break;
980 case ENETUNREACH:
981 code = ICMP_NET_UNREACH;
982 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
983 break;
984 case EACCES:
985 code = ICMP_PKT_FILTERED;
986 break;
987 }
988
989 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
990 l3mdev_master_ifindex(skb->dev), 1);
991
992 send = true;
993 if (peer) {
994 now = jiffies;
995 peer->rate_tokens += now - peer->rate_last;
996 if (peer->rate_tokens > ip_rt_error_burst)
997 peer->rate_tokens = ip_rt_error_burst;
998 peer->rate_last = now;
999 if (peer->rate_tokens >= ip_rt_error_cost)
1000 peer->rate_tokens -= ip_rt_error_cost;
1001 else
1002 send = false;
1003 inet_putpeer(peer);
1004 }
1005 if (send)
1006 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1007
1008 out: kfree_skb(skb);
1009 return 0;
1010 }
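/* The peer-based limiter above is a simple token bucket: tokens accumulate
 * at one per jiffy of elapsed time, are capped at ip_rt_error_burst, and
 * each ICMP destination-unreachable sent costs ip_rt_error_cost tokens.
 */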
1011
1012 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1013 {
1014 struct dst_entry *dst = &rt->dst;
1015 u32 old_mtu = ipv4_mtu(dst);
1016 struct fib_result res;
1017 bool lock = false;
1018
1019 if (ip_mtu_locked(dst))
1020 return;
1021
1022 if (old_mtu < mtu)
1023 return;
1024
1025 if (mtu < ip_rt_min_pmtu) {
1026 lock = true;
1027 mtu = min(old_mtu, ip_rt_min_pmtu);
1028 }
1029
1030 if (rt->rt_pmtu == mtu && !lock &&
1031 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1032 return;
1033
1034 rcu_read_lock();
1035 if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1036 struct fib_nh_common *nhc = FIB_RES_NHC(res);
1037
1038 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1039 jiffies + ip_rt_mtu_expires);
1040 }
1041 rcu_read_unlock();
1042 }
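/* A note on the clamping above: a learned PMTU is never allowed to grow
 * (old_mtu < mtu returns early), and an advertised value below
 * ip_rt_min_pmtu is not honoured directly; the route is clamped to the
 * minimum and marked mtu_locked instead. The new value is stored as a
 * next-hop exception that expires after ip_rt_mtu_expires.
 */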
1043
1044 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1045 struct sk_buff *skb, u32 mtu)
1046 {
1047 struct rtable *rt = (struct rtable *) dst;
1048 struct flowi4 fl4;
1049
1050 ip_rt_build_flow_key(&fl4, sk, skb);
1051 __ip_rt_update_pmtu(rt, &fl4, mtu);
1052 }
1053
1054 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1055 int oif, u8 protocol)
1056 {
1057 const struct iphdr *iph = (const struct iphdr *) skb->data;
1058 struct flowi4 fl4;
1059 struct rtable *rt;
1060 u32 mark = IP4_REPLY_MARK(net, skb->mark);
1061
1062 __build_flow_key(net, &fl4, NULL, iph, oif,
1063 RT_TOS(iph->tos), protocol, mark, 0);
1064 rt = __ip_route_output_key(net, &fl4);
1065 if (!IS_ERR(rt)) {
1066 __ip_rt_update_pmtu(rt, &fl4, mtu);
1067 ip_rt_put(rt);
1068 }
1069 }
1070 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1071
1072 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1073 {
1074 const struct iphdr *iph = (const struct iphdr *) skb->data;
1075 struct flowi4 fl4;
1076 struct rtable *rt;
1077
1078 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1079
1080 if (!fl4.flowi4_mark)
1081 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1082
1083 rt = __ip_route_output_key(sock_net(sk), &fl4);
1084 if (!IS_ERR(rt)) {
1085 __ip_rt_update_pmtu(rt, &fl4, mtu);
1086 ip_rt_put(rt);
1087 }
1088 }
1089
1090 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1091 {
1092 const struct iphdr *iph = (const struct iphdr *) skb->data;
1093 struct flowi4 fl4;
1094 struct rtable *rt;
1095 struct dst_entry *odst = NULL;
1096 bool new = false;
1097 struct net *net = sock_net(sk);
1098
1099 bh_lock_sock(sk);
1100
1101 if (!ip_sk_accept_pmtu(sk))
1102 goto out;
1103
1104 odst = sk_dst_get(sk);
1105
1106 if (sock_owned_by_user(sk) || !odst) {
1107 __ipv4_sk_update_pmtu(skb, sk, mtu);
1108 goto out;
1109 }
1110
1111 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1112
1113 rt = (struct rtable *)odst;
1114 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1115 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1116 if (IS_ERR(rt))
1117 goto out;
1118
1119 new = true;
1120 }
1121
1122 __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1123
1124 if (!dst_check(&rt->dst, 0)) {
1125 if (new)
1126 dst_release(&rt->dst);
1127
1128 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1129 if (IS_ERR(rt))
1130 goto out;
1131
1132 new = true;
1133 }
1134
1135 if (new)
1136 sk_dst_set(sk, &rt->dst);
1137
1138 out:
1139 bh_unlock_sock(sk);
1140 dst_release(odst);
1141 }
1142 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1143
1144 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1145 int oif, u8 protocol)
1146 {
1147 const struct iphdr *iph = (const struct iphdr *) skb->data;
1148 struct flowi4 fl4;
1149 struct rtable *rt;
1150
1151 __build_flow_key(net, &fl4, NULL, iph, oif,
1152 RT_TOS(iph->tos), protocol, 0, 0);
1153 rt = __ip_route_output_key(net, &fl4);
1154 if (!IS_ERR(rt)) {
1155 __ip_do_redirect(rt, skb, &fl4, false);
1156 ip_rt_put(rt);
1157 }
1158 }
1159 EXPORT_SYMBOL_GPL(ipv4_redirect);
1160
1161 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1162 {
1163 const struct iphdr *iph = (const struct iphdr *) skb->data;
1164 struct flowi4 fl4;
1165 struct rtable *rt;
1166 struct net *net = sock_net(sk);
1167
1168 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1169 rt = __ip_route_output_key(net, &fl4);
1170 if (!IS_ERR(rt)) {
1171 __ip_do_redirect(rt, skb, &fl4, false);
1172 ip_rt_put(rt);
1173 }
1174 }
1175 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1176
1177 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1178 {
1179 struct rtable *rt = (struct rtable *) dst;
1180
1181 /* All IPV4 dsts are created with ->obsolete set to the value
1182 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1183 * into this function always.
1184 *
1185 * When a PMTU/redirect information update invalidates a route,
1186 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1187 * DST_OBSOLETE_DEAD.
1188 */
1189 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1190 return NULL;
1191 return dst;
1192 }
1193
1194 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1195 {
1196 struct ip_options opt;
1197 int res;
1198
1199 /* Recompile ip options since IPCB may not be valid anymore.
1200 * Also check we have a reasonable ipv4 header.
1201 */
1202 if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1203 ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1204 return;
1205
1206 memset(&opt, 0, sizeof(opt));
1207 if (ip_hdr(skb)->ihl > 5) {
1208 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1209 return;
1210 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1211
1212 rcu_read_lock();
1213 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1214 rcu_read_unlock();
1215
1216 if (res)
1217 return;
1218 }
1219 __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1220 }
1221
1222 static void ipv4_link_failure(struct sk_buff *skb)
1223 {
1224 struct rtable *rt;
1225
1226 ipv4_send_dest_unreach(skb);
1227
1228 rt = skb_rtable(skb);
1229 if (rt)
1230 dst_set_expires(&rt->dst, 0);
1231 }
1232
1233 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1234 {
1235 pr_debug("%s: %pI4 -> %pI4, %s\n",
1236 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1237 skb->dev ? skb->dev->name : "?");
1238 kfree_skb(skb);
1239 WARN_ON(1);
1240 return 0;
1241 }
1242
1243 /*
1244 We do not cache the source address of the outgoing interface,
1245 because it is used only by the IP RR, TS and SRR options,
1246 so it is out of the fast path.
1247
1248 BTW remember: "addr" is allowed to be not aligned
1249 in IP options!
1250 */
1251
1252 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1253 {
1254 __be32 src;
1255
1256 if (rt_is_output_route(rt))
1257 src = ip_hdr(skb)->saddr;
1258 else {
1259 struct fib_result res;
1260 struct iphdr *iph = ip_hdr(skb);
1261 struct flowi4 fl4 = {
1262 .daddr = iph->daddr,
1263 .saddr = iph->saddr,
1264 .flowi4_tos = RT_TOS(iph->tos),
1265 .flowi4_oif = rt->dst.dev->ifindex,
1266 .flowi4_iif = skb->dev->ifindex,
1267 .flowi4_mark = skb->mark,
1268 };
1269
1270 rcu_read_lock();
1271 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1272 src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1273 else
1274 src = inet_select_addr(rt->dst.dev,
1275 rt_nexthop(rt, iph->daddr),
1276 RT_SCOPE_UNIVERSE);
1277 rcu_read_unlock();
1278 }
1279 memcpy(addr, &src, 4);
1280 }
1281
1282 #ifdef CONFIG_IP_ROUTE_CLASSID
1283 static void set_class_tag(struct rtable *rt, u32 tag)
1284 {
1285 if (!(rt->dst.tclassid & 0xFFFF))
1286 rt->dst.tclassid |= tag & 0xFFFF;
1287 if (!(rt->dst.tclassid & 0xFFFF0000))
1288 rt->dst.tclassid |= tag & 0xFFFF0000;
1289 }
1290 #endif
1291
1292 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1293 {
1294 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1295 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1296 ip_rt_min_advmss);
1297
1298 return min(advmss, IPV4_MAX_PMTU - header_size);
1299 }
1300
1301 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1302 {
1303 const struct rtable *rt = (const struct rtable *) dst;
1304 unsigned int mtu = rt->rt_pmtu;
1305
1306 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1307 mtu = dst_metric_raw(dst, RTAX_MTU);
1308
1309 if (mtu)
1310 return mtu;
1311
1312 mtu = READ_ONCE(dst->dev->mtu);
1313
1314 if (unlikely(ip_mtu_locked(dst))) {
1315 if (rt->rt_gw_family && mtu > 576)
1316 mtu = 576;
1317 }
1318
1319 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1320
1321 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1322 }
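/* MTU resolution order in ipv4_mtu(): a still-valid learned PMTU on the
 * route, then the RTAX_MTU metric, and only then the output device MTU.
 * The device-MTU fallback is limited to 576 for locked routes that go via a
 * gateway, capped at IP_MAX_MTU, and reduced by any lightweight-tunnel
 * encapsulation headroom.
 */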
1323
1324 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1325 {
1326 struct fnhe_hash_bucket *hash;
1327 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1328 u32 hval = fnhe_hashfun(daddr);
1329
1330 spin_lock_bh(&fnhe_lock);
1331
1332 hash = rcu_dereference_protected(nhc->nhc_exceptions,
1333 lockdep_is_held(&fnhe_lock));
1334 hash += hval;
1335
1336 fnhe_p = &hash->chain;
1337 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1338 while (fnhe) {
1339 if (fnhe->fnhe_daddr == daddr) {
1340 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1341 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1342 /* set fnhe_daddr to 0 to ensure it won't bind with
1343 * new dsts in rt_bind_exception().
1344 */
1345 fnhe->fnhe_daddr = 0;
1346 fnhe_flush_routes(fnhe);
1347 kfree_rcu(fnhe, rcu);
1348 break;
1349 }
1350 fnhe_p = &fnhe->fnhe_next;
1351 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1352 lockdep_is_held(&fnhe_lock));
1353 }
1354
1355 spin_unlock_bh(&fnhe_lock);
1356 }
1357
1358 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1359 __be32 daddr)
1360 {
1361 struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1362 struct fib_nh_exception *fnhe;
1363 u32 hval;
1364
1365 if (!hash)
1366 return NULL;
1367
1368 hval = fnhe_hashfun(daddr);
1369
1370 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1371 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1372 if (fnhe->fnhe_daddr == daddr) {
1373 if (fnhe->fnhe_expires &&
1374 time_after(jiffies, fnhe->fnhe_expires)) {
1375 ip_del_fnhe(nhc, daddr);
1376 break;
1377 }
1378 return fnhe;
1379 }
1380 }
1381 return NULL;
1382 }
1383
1384 /* MTU selection:
1385 * 1. mtu on route is locked - use it
1386 * 2. mtu from nexthop exception
1387 * 3. mtu from egress device
1388 */
1389
1390 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1391 {
1392 struct fib_nh_common *nhc = res->nhc;
1393 struct net_device *dev = nhc->nhc_dev;
1394 struct fib_info *fi = res->fi;
1395 u32 mtu = 0;
1396
1397 if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1398 fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1399 mtu = fi->fib_mtu;
1400
1401 if (likely(!mtu)) {
1402 struct fib_nh_exception *fnhe;
1403
1404 fnhe = find_exception(nhc, daddr);
1405 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1406 mtu = fnhe->fnhe_pmtu;
1407 }
1408
1409 if (likely(!mtu))
1410 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1411
1412 return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1413 }
1414
1415 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1416 __be32 daddr, const bool do_cache)
1417 {
1418 bool ret = false;
1419
1420 spin_lock_bh(&fnhe_lock);
1421
1422 if (daddr == fnhe->fnhe_daddr) {
1423 struct rtable __rcu **porig;
1424 struct rtable *orig;
1425 int genid = fnhe_genid(dev_net(rt->dst.dev));
1426
1427 if (rt_is_input_route(rt))
1428 porig = &fnhe->fnhe_rth_input;
1429 else
1430 porig = &fnhe->fnhe_rth_output;
1431 orig = rcu_dereference(*porig);
1432
1433 if (fnhe->fnhe_genid != genid) {
1434 fnhe->fnhe_genid = genid;
1435 fnhe->fnhe_gw = 0;
1436 fnhe->fnhe_pmtu = 0;
1437 fnhe->fnhe_expires = 0;
1438 fnhe->fnhe_mtu_locked = false;
1439 fnhe_flush_routes(fnhe);
1440 orig = NULL;
1441 }
1442 fill_route_from_fnhe(rt, fnhe);
1443 if (!rt->rt_gw4) {
1444 rt->rt_gw4 = daddr;
1445 rt->rt_gw_family = AF_INET;
1446 }
1447
1448 if (do_cache) {
1449 dst_hold(&rt->dst);
1450 rcu_assign_pointer(*porig, rt);
1451 if (orig) {
1452 dst_dev_put(&orig->dst);
1453 dst_release(&orig->dst);
1454 }
1455 ret = true;
1456 }
1457
1458 fnhe->fnhe_stamp = jiffies;
1459 }
1460 spin_unlock_bh(&fnhe_lock);
1461
1462 return ret;
1463 }
1464
1465 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1466 {
1467 struct rtable *orig, *prev, **p;
1468 bool ret = true;
1469
1470 if (rt_is_input_route(rt)) {
1471 p = (struct rtable **)&nhc->nhc_rth_input;
1472 } else {
1473 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1474 }
1475 orig = *p;
1476
1477 /* hold dst before doing cmpxchg() to avoid race condition
1478 * on this dst
1479 */
1480 dst_hold(&rt->dst);
1481 prev = cmpxchg(p, orig, rt);
1482 if (prev == orig) {
1483 if (orig) {
1484 dst_dev_put(&orig->dst);
1485 dst_release(&orig->dst);
1486 }
1487 } else {
1488 dst_release(&rt->dst);
1489 ret = false;
1490 }
1491
1492 return ret;
1493 }
1494
1495 struct uncached_list {
1496 spinlock_t lock;
1497 struct list_head head;
1498 };
1499
1500 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
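/* Routes that could not be cached on a FIB nexthop (see rt_cache_route() and
 * rt_set_nexthop()) are tracked on these per-cpu lists instead, so that
 * rt_flush_dev() below can still find them and re-point dst.dev at the
 * loopback device when their output device is being unregistered.
 */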
1501
1502 void rt_add_uncached_list(struct rtable *rt)
1503 {
1504 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1505
1506 rt->rt_uncached_list = ul;
1507
1508 spin_lock_bh(&ul->lock);
1509 list_add_tail(&rt->rt_uncached, &ul->head);
1510 spin_unlock_bh(&ul->lock);
1511 }
1512
1513 void rt_del_uncached_list(struct rtable *rt)
1514 {
1515 if (!list_empty(&rt->rt_uncached)) {
1516 struct uncached_list *ul = rt->rt_uncached_list;
1517
1518 spin_lock_bh(&ul->lock);
1519 list_del(&rt->rt_uncached);
1520 spin_unlock_bh(&ul->lock);
1521 }
1522 }
1523
1524 static void ipv4_dst_destroy(struct dst_entry *dst)
1525 {
1526 struct rtable *rt = (struct rtable *)dst;
1527
1528 ip_dst_metrics_put(dst);
1529 rt_del_uncached_list(rt);
1530 }
1531
1532 void rt_flush_dev(struct net_device *dev)
1533 {
1534 struct net *net = dev_net(dev);
1535 struct rtable *rt;
1536 int cpu;
1537
1538 for_each_possible_cpu(cpu) {
1539 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1540
1541 spin_lock_bh(&ul->lock);
1542 list_for_each_entry(rt, &ul->head, rt_uncached) {
1543 if (rt->dst.dev != dev)
1544 continue;
1545 rt->dst.dev = net->loopback_dev;
1546 dev_hold(rt->dst.dev);
1547 dev_put(dev);
1548 }
1549 spin_unlock_bh(&ul->lock);
1550 }
1551 }
1552
1553 static bool rt_cache_valid(const struct rtable *rt)
1554 {
1555 return rt &&
1556 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1557 !rt_is_expired(rt);
1558 }
1559
1560 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1561 const struct fib_result *res,
1562 struct fib_nh_exception *fnhe,
1563 struct fib_info *fi, u16 type, u32 itag,
1564 const bool do_cache)
1565 {
1566 bool cached = false;
1567
1568 if (fi) {
1569 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1570
1571 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1572 rt->rt_gw_family = nhc->nhc_gw_family;
1573 /* only INET and INET6 are supported */
1574 if (likely(nhc->nhc_gw_family == AF_INET))
1575 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1576 else
1577 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1578 }
1579
1580 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1581
1582 #ifdef CONFIG_IP_ROUTE_CLASSID
1583 {
1584 struct fib_nh *nh;
1585
1586 nh = container_of(nhc, struct fib_nh, nh_common);
1587 rt->dst.tclassid = nh->nh_tclassid;
1588 }
1589 #endif
1590 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1591 if (unlikely(fnhe))
1592 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1593 else if (do_cache)
1594 cached = rt_cache_route(nhc, rt);
1595 if (unlikely(!cached)) {
1596 /* Routes we intend to cache in nexthop exception or
1597 * FIB nexthop have the DST_NOCACHE bit clear.
1598 * However, if we are unsuccessful at storing this
1599 * route into the cache we really need to set it.
1600 */
1601 if (!rt->rt_gw4) {
1602 rt->rt_gw_family = AF_INET;
1603 rt->rt_gw4 = daddr;
1604 }
1605 rt_add_uncached_list(rt);
1606 }
1607 } else
1608 rt_add_uncached_list(rt);
1609
1610 #ifdef CONFIG_IP_ROUTE_CLASSID
1611 #ifdef CONFIG_IP_MULTIPLE_TABLES
1612 set_class_tag(rt, res->tclassid);
1613 #endif
1614 set_class_tag(rt, itag);
1615 #endif
1616 }
1617
1618 struct rtable *rt_dst_alloc(struct net_device *dev,
1619 unsigned int flags, u16 type,
1620 bool nopolicy, bool noxfrm, bool will_cache)
1621 {
1622 struct rtable *rt;
1623
1624 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1625 (will_cache ? 0 : DST_HOST) |
1626 (nopolicy ? DST_NOPOLICY : 0) |
1627 (noxfrm ? DST_NOXFRM : 0));
1628
1629 if (rt) {
1630 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1631 rt->rt_flags = flags;
1632 rt->rt_type = type;
1633 rt->rt_is_input = 0;
1634 rt->rt_iif = 0;
1635 rt->rt_pmtu = 0;
1636 rt->rt_mtu_locked = 0;
1637 rt->rt_gw_family = 0;
1638 rt->rt_gw4 = 0;
1639 INIT_LIST_HEAD(&rt->rt_uncached);
1640
1641 rt->dst.output = ip_output;
1642 if (flags & RTCF_LOCAL)
1643 rt->dst.input = ip_local_deliver;
1644 }
1645
1646 return rt;
1647 }
1648 EXPORT_SYMBOL(rt_dst_alloc);
1649
1650 /* called in rcu_read_lock() section */
1651 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1652 u8 tos, struct net_device *dev,
1653 struct in_device *in_dev, u32 *itag)
1654 {
1655 int err;
1656
1657 /* Primary sanity checks. */
1658 if (!in_dev)
1659 return -EINVAL;
1660
1661 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1662 skb->protocol != htons(ETH_P_IP))
1663 return -EINVAL;
1664
1665 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1666 return -EINVAL;
1667
1668 if (ipv4_is_zeronet(saddr)) {
1669 if (!ipv4_is_local_multicast(daddr) &&
1670 ip_hdr(skb)->protocol != IPPROTO_IGMP)
1671 return -EINVAL;
1672 } else {
1673 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1674 in_dev, itag);
1675 if (err < 0)
1676 return err;
1677 }
1678 return 0;
1679 }
1680
1681 /* called in rcu_read_lock() section */
1682 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1683 u8 tos, struct net_device *dev, int our)
1684 {
1685 struct in_device *in_dev = __in_dev_get_rcu(dev);
1686 unsigned int flags = RTCF_MULTICAST;
1687 struct rtable *rth;
1688 u32 itag = 0;
1689 int err;
1690
1691 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1692 if (err)
1693 return err;
1694
1695 if (our)
1696 flags |= RTCF_LOCAL;
1697
1698 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1699 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1700 if (!rth)
1701 return -ENOBUFS;
1702
1703 #ifdef CONFIG_IP_ROUTE_CLASSID
1704 rth->dst.tclassid = itag;
1705 #endif
1706 rth->dst.output = ip_rt_bug;
1707 rth->rt_is_input= 1;
1708
1709 #ifdef CONFIG_IP_MROUTE
1710 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1711 rth->dst.input = ip_mr_input;
1712 #endif
1713 RT_CACHE_STAT_INC(in_slow_mc);
1714
1715 skb_dst_set(skb, &rth->dst);
1716 return 0;
1717 }
1718
1719
1720 static void ip_handle_martian_source(struct net_device *dev,
1721 struct in_device *in_dev,
1722 struct sk_buff *skb,
1723 __be32 daddr,
1724 __be32 saddr)
1725 {
1726 RT_CACHE_STAT_INC(in_martian_src);
1727 #ifdef CONFIG_IP_ROUTE_VERBOSE
1728 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1729 /*
1730 * RFC1812 recommendation: if the source is martian,
1731 * the only hint is the MAC header.
1732 */
1733 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1734 &daddr, &saddr, dev->name);
1735 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1736 print_hex_dump(KERN_WARNING, "ll header: ",
1737 DUMP_PREFIX_OFFSET, 16, 1,
1738 skb_mac_header(skb),
1739 dev->hard_header_len, false);
1740 }
1741 }
1742 #endif
1743 }
1744
1745 /* called in rcu_read_lock() section */
1746 static int __mkroute_input(struct sk_buff *skb,
1747 const struct fib_result *res,
1748 struct in_device *in_dev,
1749 __be32 daddr, __be32 saddr, u32 tos)
1750 {
1751 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1752 struct net_device *dev = nhc->nhc_dev;
1753 struct fib_nh_exception *fnhe;
1754 struct rtable *rth;
1755 int err;
1756 struct in_device *out_dev;
1757 bool do_cache;
1758 u32 itag = 0;
1759
1760 /* get a working reference to the output device */
1761 out_dev = __in_dev_get_rcu(dev);
1762 if (!out_dev) {
1763 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1764 return -EINVAL;
1765 }
1766
1767 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1768 in_dev->dev, in_dev, &itag);
1769 if (err < 0) {
1770 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1771 saddr);
1772
1773 goto cleanup;
1774 }
1775
1776 do_cache = res->fi && !itag;
1777 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1778 skb->protocol == htons(ETH_P_IP)) {
1779 __be32 gw;
1780
1781 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1782 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1783 inet_addr_onlink(out_dev, saddr, gw))
1784 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1785 }
1786
1787 if (skb->protocol != htons(ETH_P_IP)) {
1788 /* Not IP (i.e. ARP). Do not create a route if it is
1789 * invalid for proxy arp. DNAT routes are always valid.
1790 *
1791 * The proxy arp feature has been extended to allow ARP
1792 * replies back on the same interface, to support
1793 * Private VLAN switch technologies. See arp.c.
1794 */
1795 if (out_dev == in_dev &&
1796 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1797 err = -EINVAL;
1798 goto cleanup;
1799 }
1800 }
1801
1802 fnhe = find_exception(nhc, daddr);
1803 if (do_cache) {
1804 if (fnhe)
1805 rth = rcu_dereference(fnhe->fnhe_rth_input);
1806 else
1807 rth = rcu_dereference(nhc->nhc_rth_input);
1808 if (rt_cache_valid(rth)) {
1809 skb_dst_set_noref(skb, &rth->dst);
1810 goto out;
1811 }
1812 }
1813
1814 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1815 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1816 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1817 if (!rth) {
1818 err = -ENOBUFS;
1819 goto cleanup;
1820 }
1821
1822 rth->rt_is_input = 1;
1823 RT_CACHE_STAT_INC(in_slow_tot);
1824
1825 rth->dst.input = ip_forward;
1826
1827 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1828 do_cache);
1829 lwtunnel_set_redirect(&rth->dst);
1830 skb_dst_set(skb, &rth->dst);
1831 out:
1832 err = 0;
1833 cleanup:
1834 return err;
1835 }
1836
1837 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1838 /* To make ICMP packets follow the right flow, the multipath hash is
1839 * calculated from the inner IP addresses.
1840 */
1841 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1842 struct flow_keys *hash_keys)
1843 {
1844 const struct iphdr *outer_iph = ip_hdr(skb);
1845 const struct iphdr *key_iph = outer_iph;
1846 const struct iphdr *inner_iph;
1847 const struct icmphdr *icmph;
1848 struct iphdr _inner_iph;
1849 struct icmphdr _icmph;
1850
1851 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1852 goto out;
1853
1854 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1855 goto out;
1856
1857 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1858 &_icmph);
1859 if (!icmph)
1860 goto out;
1861
1862 if (icmph->type != ICMP_DEST_UNREACH &&
1863 icmph->type != ICMP_REDIRECT &&
1864 icmph->type != ICMP_TIME_EXCEEDED &&
1865 icmph->type != ICMP_PARAMETERPROB)
1866 goto out;
1867
1868 inner_iph = skb_header_pointer(skb,
1869 outer_iph->ihl * 4 + sizeof(_icmph),
1870 sizeof(_inner_iph), &_inner_iph);
1871 if (!inner_iph)
1872 goto out;
1873
1874 key_iph = inner_iph;
1875 out:
1876 hash_keys->addrs.v4addrs.src = key_iph->saddr;
1877 hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1878 }
1879
1880 /* if skb is set it will be used and fl4 can be NULL */
1881 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1882 const struct sk_buff *skb, struct flow_keys *flkeys)
1883 {
1884 u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1885 struct flow_keys hash_keys;
1886 u32 mhash;
1887
1888 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1889 case 0:
1890 memset(&hash_keys, 0, sizeof(hash_keys));
1891 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1892 if (skb) {
1893 ip_multipath_l3_keys(skb, &hash_keys);
1894 } else {
1895 hash_keys.addrs.v4addrs.src = fl4->saddr;
1896 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1897 }
1898 break;
1899 case 1:
1900 /* skb is currently provided only when forwarding */
1901 if (skb) {
1902 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1903 struct flow_keys keys;
1904
1905 /* short-circuit if we already have L4 hash present */
1906 if (skb->l4_hash)
1907 return skb_get_hash_raw(skb) >> 1;
1908
1909 memset(&hash_keys, 0, sizeof(hash_keys));
1910
1911 if (!flkeys) {
1912 skb_flow_dissect_flow_keys(skb, &keys, flag);
1913 flkeys = &keys;
1914 }
1915
1916 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1917 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1918 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1919 hash_keys.ports.src = flkeys->ports.src;
1920 hash_keys.ports.dst = flkeys->ports.dst;
1921 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1922 } else {
1923 memset(&hash_keys, 0, sizeof(hash_keys));
1924 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1925 hash_keys.addrs.v4addrs.src = fl4->saddr;
1926 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1927 hash_keys.ports.src = fl4->fl4_sport;
1928 hash_keys.ports.dst = fl4->fl4_dport;
1929 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1930 }
1931 break;
1932 }
1933 mhash = flow_hash_from_keys(&hash_keys);
1934
1935 if (multipath_hash)
1936 mhash = jhash_2words(mhash, multipath_hash, 0);
1937
1938 return mhash >> 1;
1939 }
1940 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
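/* Summary of the multipath hash policies handled above: policy 0 hashes the
 * source and destination addresses only (using the inner addresses of ICMP
 * errors so that errors follow the flow that triggered them), while policy 1
 * hashes the L4 five-tuple, reusing a precomputed skb hash or flow dissector
 * keys when available. The result is mixed with fl4->flowi4_multipath_hash
 * when one is set, and the final value is shifted right by one before being
 * used to pick a nexthop in fib_select_multipath().
 */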
1941
1942 static int ip_mkroute_input(struct sk_buff *skb,
1943 struct fib_result *res,
1944 struct in_device *in_dev,
1945 __be32 daddr, __be32 saddr, u32 tos,
1946 struct flow_keys *hkeys)
1947 {
1948 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1949 if (res->fi && res->fi->fib_nhs > 1) {
1950 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1951
1952 fib_select_multipath(res, h);
1953 }
1954 #endif
1955
1956 /* create a routing cache entry */
1957 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1958 }
1959
1960 /*
1961 * NOTE. We drop all packets that have local source
1962 * addresses, because every properly looped-back packet
1963 * must already have the correct destination attached by the output routine.
1964 *
1965 * Such an approach solves two big problems:
1966 * 1. Non-simplex devices are handled properly.
1967 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1968 * called with rcu_read_lock()
1969 */
1970
1971 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1972 u8 tos, struct net_device *dev,
1973 struct fib_result *res)
1974 {
1975 struct in_device *in_dev = __in_dev_get_rcu(dev);
1976 struct flow_keys *flkeys = NULL, _flkeys;
1977 struct net *net = dev_net(dev);
1978 struct ip_tunnel_info *tun_info;
1979 int err = -EINVAL;
1980 unsigned int flags = 0;
1981 u32 itag = 0;
1982 struct rtable *rth;
1983 struct flowi4 fl4;
1984 bool do_cache = true;
1985
1986 /* IP on this device is disabled. */
1987
1988 if (!in_dev)
1989 goto out;
1990
1991 /* Check for the most weird martians, which cannot be detected
1992 by fib_lookup.
1993 */
1994
1995 tun_info = skb_tunnel_info(skb);
1996 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1997 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1998 else
1999 fl4.flowi4_tun_key.tun_id = 0;
2000 skb_dst_drop(skb);
2001
2002 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2003 goto martian_source;
2004
2005 res->fi = NULL;
2006 res->table = NULL;
2007 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2008 goto brd_input;
2009
2010 /* Accept zero addresses only to limited broadcast;
2011 * I do not even know whether to fix it or not. Waiting for complaints :-)
2012 */
2013 if (ipv4_is_zeronet(saddr))
2014 goto martian_source;
2015
2016 if (ipv4_is_zeronet(daddr))
2017 goto martian_destination;
2018
2019 /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2020 * and calls it at most once when daddr and/or saddr are loopback addresses
2021 */
2022 if (ipv4_is_loopback(daddr)) {
2023 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2024 goto martian_destination;
2025 } else if (ipv4_is_loopback(saddr)) {
2026 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2027 goto martian_source;
2028 }
2029
2030 /*
2031 * Now we are ready to route packet.
2032 */
2033 fl4.flowi4_oif = 0;
2034 fl4.flowi4_iif = dev->ifindex;
2035 fl4.flowi4_mark = skb->mark;
2036 fl4.flowi4_tos = tos;
2037 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2038 fl4.flowi4_flags = 0;
2039 fl4.daddr = daddr;
2040 fl4.saddr = saddr;
2041 fl4.flowi4_uid = sock_net_uid(net, NULL);
2042
2043 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2044 flkeys = &_flkeys;
2045 } else {
2046 fl4.flowi4_proto = 0;
2047 fl4.fl4_sport = 0;
2048 fl4.fl4_dport = 0;
2049 }
2050
2051 err = fib_lookup(net, &fl4, res, 0);
2052 if (err != 0) {
2053 if (!IN_DEV_FORWARD(in_dev))
2054 err = -EHOSTUNREACH;
2055 goto no_route;
2056 }
2057
2058 if (res->type == RTN_BROADCAST) {
2059 if (IN_DEV_BFORWARD(in_dev))
2060 goto make_route;
2061 /* do not cache if bc_forwarding is enabled */
2062 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2063 do_cache = false;
2064 goto brd_input;
2065 }
2066
2067 if (res->type == RTN_LOCAL) {
2068 err = fib_validate_source(skb, saddr, daddr, tos,
2069 0, dev, in_dev, &itag);
2070 if (err < 0)
2071 goto martian_source;
2072 goto local_input;
2073 }
2074
2075 if (!IN_DEV_FORWARD(in_dev)) {
2076 err = -EHOSTUNREACH;
2077 goto no_route;
2078 }
2079 if (res->type != RTN_UNICAST)
2080 goto martian_destination;
2081
2082 make_route:
2083 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2084 out: return err;
2085
2086 brd_input:
2087 if (skb->protocol != htons(ETH_P_IP))
2088 goto e_inval;
2089
2090 if (!ipv4_is_zeronet(saddr)) {
2091 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2092 in_dev, &itag);
2093 if (err < 0)
2094 goto martian_source;
2095 }
2096 flags |= RTCF_BROADCAST;
2097 res->type = RTN_BROADCAST;
2098 RT_CACHE_STAT_INC(in_brd);
2099
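/* Local delivery: reuse a valid cached input dst if we have one,
 * otherwise allocate a fresh rtable bound to the loopback (or l3mdev
 * master) device.
 */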
2100 local_input:
2101 do_cache &= res->fi && !itag;
2102 if (do_cache) {
2103 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2104
2105 rth = rcu_dereference(nhc->nhc_rth_input);
2106 if (rt_cache_valid(rth)) {
2107 skb_dst_set_noref(skb, &rth->dst);
2108 err = 0;
2109 goto out;
2110 }
2111 }
2112
2113 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2114 flags | RTCF_LOCAL, res->type,
2115 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2116 if (!rth)
2117 goto e_nobufs;
2118
2119 rth->dst.output = ip_rt_bug;
2120 #ifdef CONFIG_IP_ROUTE_CLASSID
2121 rth->dst.tclassid = itag;
2122 #endif
2123 rth->rt_is_input = 1;
2124
2125 RT_CACHE_STAT_INC(in_slow_tot);
2126 if (res->type == RTN_UNREACHABLE) {
2127 rth->dst.input = ip_error;
2128 rth->dst.error = -err;
2129 rth->rt_flags &= ~RTCF_LOCAL;
2130 }
2131
2132 if (do_cache) {
2133 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2134
2135 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2136 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2137 WARN_ON(rth->dst.input == lwtunnel_input);
2138 rth->dst.lwtstate->orig_input = rth->dst.input;
2139 rth->dst.input = lwtunnel_input;
2140 }
2141
2142 if (unlikely(!rt_cache_route(nhc, rth)))
2143 rt_add_uncached_list(rth);
2144 }
2145 skb_dst_set(skb, &rth->dst);
2146 err = 0;
2147 goto out;
2148
2149 no_route:
2150 RT_CACHE_STAT_INC(in_no_route);
2151 res->type = RTN_UNREACHABLE;
2152 res->fi = NULL;
2153 res->table = NULL;
2154 goto local_input;
2155
2156 /*
2157 * Do not cache martian addresses: they should be logged (RFC1812)
2158 */
2159 martian_destination:
2160 RT_CACHE_STAT_INC(in_martian_dst);
2161 #ifdef CONFIG_IP_ROUTE_VERBOSE
2162 if (IN_DEV_LOG_MARTIANS(in_dev))
2163 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2164 &daddr, &saddr, dev->name);
2165 #endif
2166
2167 e_inval:
2168 err = -EINVAL;
2169 goto out;
2170
2171 e_nobufs:
2172 err = -ENOBUFS;
2173 goto out;
2174
2175 martian_source:
2176 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2177 goto out;
2178 }
2179
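/* Entry point for input route lookups that do not take a dst reference;
 * simply wraps ip_route_input_rcu() in rcu_read_lock().
 */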
2180 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2181 u8 tos, struct net_device *dev)
2182 {
2183 struct fib_result res;
2184 int err;
2185
2186 tos &= IPTOS_RT_MASK;
2187 rcu_read_lock();
2188 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2189 rcu_read_unlock();
2190
2191 return err;
2192 }
2193 EXPORT_SYMBOL(ip_route_input_noref);
2194
2195 /* called with rcu_read_lock held */
2196 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2197 u8 tos, struct net_device *dev, struct fib_result *res)
2198 {
2199 /* Multicast recognition logic was moved from the route cache to here.
2200 The problem was that too many Ethernet cards have broken/missing
2201 hardware multicast filters :-( As a result, a host on a multicast
2202 network acquires a lot of useless route cache entries, e.g. from
2203 SDR messages from all over the world. Now we try to get rid of them.
2204 Really, provided the software IP multicast filter is organized
2205 reasonably (at least, hashed), it does not result in a slowdown
2206 compared with route cache reject entries.
2207 Note that multicast routers are not affected, because a
2208 route cache entry is created eventually.
2209 */
2210 if (ipv4_is_multicast(daddr)) {
2211 struct in_device *in_dev = __in_dev_get_rcu(dev);
2212 int our = 0;
2213 int err = -EINVAL;
2214
2215 if (!in_dev)
2216 return err;
2217 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2218 ip_hdr(skb)->protocol);
2219
2220 /* check l3 master if no match yet */
2221 if (!our && netif_is_l3_slave(dev)) {
2222 struct in_device *l3_in_dev;
2223
2224 l3_in_dev = __in_dev_get_rcu(skb->dev);
2225 if (l3_in_dev)
2226 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2227 ip_hdr(skb)->protocol);
2228 }
2229
2230 if (our
2231 #ifdef CONFIG_IP_MROUTE
2232 ||
2233 (!ipv4_is_local_multicast(daddr) &&
2234 IN_DEV_MFORWARD(in_dev))
2235 #endif
2236 ) {
2237 err = ip_route_input_mc(skb, daddr, saddr,
2238 tos, dev, our);
2239 }
2240 return err;
2241 }
2242
2243 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2244 }
2245
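/* Build an output rtable for the FIB result in @res, reusing a cached
 * per-nexthop or per-exception dst when possible.
 */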
2246 /* called with rcu_read_lock() */
2247 static struct rtable *__mkroute_output(const struct fib_result *res,
2248 const struct flowi4 *fl4, int orig_oif,
2249 struct net_device *dev_out,
2250 unsigned int flags)
2251 {
2252 struct fib_info *fi = res->fi;
2253 struct fib_nh_exception *fnhe;
2254 struct in_device *in_dev;
2255 u16 type = res->type;
2256 struct rtable *rth;
2257 bool do_cache;
2258
2259 in_dev = __in_dev_get_rcu(dev_out);
2260 if (!in_dev)
2261 return ERR_PTR(-EINVAL);
2262
2263 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2264 if (ipv4_is_loopback(fl4->saddr) &&
2265 !(dev_out->flags & IFF_LOOPBACK) &&
2266 !netif_is_l3_master(dev_out))
2267 return ERR_PTR(-EINVAL);
2268
2269 if (ipv4_is_lbcast(fl4->daddr))
2270 type = RTN_BROADCAST;
2271 else if (ipv4_is_multicast(fl4->daddr))
2272 type = RTN_MULTICAST;
2273 else if (ipv4_is_zeronet(fl4->daddr))
2274 return ERR_PTR(-EINVAL);
2275
2276 if (dev_out->flags & IFF_LOOPBACK)
2277 flags |= RTCF_LOCAL;
2278
2279 do_cache = true;
2280 if (type == RTN_BROADCAST) {
2281 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2282 fi = NULL;
2283 } else if (type == RTN_MULTICAST) {
2284 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2285 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2286 fl4->flowi4_proto))
2287 flags &= ~RTCF_LOCAL;
2288 else
2289 do_cache = false;
2290 /* If a multicast route does not exist, use the
2291 * default one, but do not use a gateway in this case.
2292 * Yes, it is a hack.
2293 */
2294 if (fi && res->prefixlen < 4)
2295 fi = NULL;
2296 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2297 (orig_oif != dev_out->ifindex)) {
2298 /* For local routes that require a particular output interface
2299 * we do not want to cache the result. Caching the result
2300 * causes incorrect behaviour when there are multiple source
2301 * addresses on the interface, the end result being that if the
2302 * intended recipient is waiting on that interface for the
2303 * packet, it won't receive it because it will be delivered on
2304 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2305 * be set to the loopback interface as well.
2306 */
2307 do_cache = false;
2308 }
2309
2310 fnhe = NULL;
2311 do_cache &= fi != NULL;
2312 if (fi) {
2313 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2314 struct rtable __rcu **prth;
2315
2316 fnhe = find_exception(nhc, fl4->daddr);
2317 if (!do_cache)
2318 goto add;
2319 if (fnhe) {
2320 prth = &fnhe->fnhe_rth_output;
2321 } else {
2322 if (unlikely(fl4->flowi4_flags &
2323 FLOWI_FLAG_KNOWN_NH &&
2324 !(nhc->nhc_gw_family &&
2325 nhc->nhc_scope == RT_SCOPE_LINK))) {
2326 do_cache = false;
2327 goto add;
2328 }
2329 prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2330 }
2331 rth = rcu_dereference(*prth);
2332 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2333 return rth;
2334 }
2335
2336 add:
2337 rth = rt_dst_alloc(dev_out, flags, type,
2338 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2339 IN_DEV_CONF_GET(in_dev, NOXFRM),
2340 do_cache);
2341 if (!rth)
2342 return ERR_PTR(-ENOBUFS);
2343
2344 rth->rt_iif = orig_oif;
2345
2346 RT_CACHE_STAT_INC(out_slow_tot);
2347
2348 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2349 if (flags & RTCF_LOCAL &&
2350 !(dev_out->flags & IFF_LOOPBACK)) {
2351 rth->dst.output = ip_mc_output;
2352 RT_CACHE_STAT_INC(out_slow_mc);
2353 }
2354 #ifdef CONFIG_IP_MROUTE
2355 if (type == RTN_MULTICAST) {
2356 if (IN_DEV_MFORWARD(in_dev) &&
2357 !ipv4_is_local_multicast(fl4->daddr)) {
2358 rth->dst.input = ip_mr_input;
2359 rth->dst.output = ip_mc_output;
2360 }
2361 }
2362 #endif
2363 }
2364
2365 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2366 lwtunnel_set_redirect(&rth->dst);
2367
2368 return rth;
2369 }
2370
2371 /*
2372 * Major route resolver routine.
2373 */
2374
2375 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2376 const struct sk_buff *skb)
2377 {
2378 __u8 tos = RT_FL_TOS(fl4);
2379 struct fib_result res = {
2380 .type = RTN_UNSPEC,
2381 .fi = NULL,
2382 .table = NULL,
2383 .tclassid = 0,
2384 };
2385 struct rtable *rth;
2386
2387 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2388 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2389 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2390 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2391
2392 rcu_read_lock();
2393 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2394 rcu_read_unlock();
2395
2396 return rth;
2397 }
2398 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2399
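/* Core of the output route resolver, called with rcu_read_lock() held:
 * resolve the source address and output device, do the FIB lookup and
 * hand the result to __mkroute_output().
 */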
2400 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2401 struct fib_result *res,
2402 const struct sk_buff *skb)
2403 {
2404 struct net_device *dev_out = NULL;
2405 int orig_oif = fl4->flowi4_oif;
2406 unsigned int flags = 0;
2407 struct rtable *rth;
2408 int err = -ENETUNREACH;
2409
2410 if (fl4->saddr) {
2411 rth = ERR_PTR(-EINVAL);
2412 if (ipv4_is_multicast(fl4->saddr) ||
2413 ipv4_is_lbcast(fl4->saddr) ||
2414 ipv4_is_zeronet(fl4->saddr))
2415 goto out;
2416
2417 /* I removed the check for oif == dev_out->oif here.
2418 It was wrong for two reasons:
2419 1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2420 is assigned to multiple interfaces.
2421 2. Moreover, we are allowed to send packets with the saddr
2422 of another iface. --ANK
2423 */
2424
2425 if (fl4->flowi4_oif == 0 &&
2426 (ipv4_is_multicast(fl4->daddr) ||
2427 ipv4_is_lbcast(fl4->daddr))) {
2428 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2429 dev_out = __ip_dev_find(net, fl4->saddr, false);
2430 if (!dev_out)
2431 goto out;
2432
2433 /* Special hack: the user can direct multicasts
2434 and limited broadcasts via the desired interface
2435 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2436 This hack is not just for fun, it allows
2437 vic, vat and friends to work.
2438 They bind a socket to loopback, set ttl to zero
2439 and expect that it will work.
2440 From the viewpoint of the routing cache they are broken,
2441 because we are not allowed to build a multicast path
2442 with a loopback source addr (the routing cache
2443 cannot know that ttl is zero, so the packet
2444 will not leave this host and the route is valid).
2445 Luckily, this hack is a good workaround.
2446 */
2447
2448 fl4->flowi4_oif = dev_out->ifindex;
2449 goto make_route;
2450 }
2451
2452 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2453 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2454 if (!__ip_dev_find(net, fl4->saddr, false))
2455 goto out;
2456 }
2457 }
2458
2459
2460 if (fl4->flowi4_oif) {
2461 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2462 rth = ERR_PTR(-ENODEV);
2463 if (!dev_out)
2464 goto out;
2465
2466 /* RACE: Check return value of inet_select_addr instead. */
2467 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2468 rth = ERR_PTR(-ENETUNREACH);
2469 goto out;
2470 }
2471 if (ipv4_is_local_multicast(fl4->daddr) ||
2472 ipv4_is_lbcast(fl4->daddr) ||
2473 fl4->flowi4_proto == IPPROTO_IGMP) {
2474 if (!fl4->saddr)
2475 fl4->saddr = inet_select_addr(dev_out, 0,
2476 RT_SCOPE_LINK);
2477 goto make_route;
2478 }
2479 if (!fl4->saddr) {
2480 if (ipv4_is_multicast(fl4->daddr))
2481 fl4->saddr = inet_select_addr(dev_out, 0,
2482 fl4->flowi4_scope);
2483 else if (!fl4->daddr)
2484 fl4->saddr = inet_select_addr(dev_out, 0,
2485 RT_SCOPE_HOST);
2486 }
2487 }
2488
2489 if (!fl4->daddr) {
2490 fl4->daddr = fl4->saddr;
2491 if (!fl4->daddr)
2492 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2493 dev_out = net->loopback_dev;
2494 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2495 res->type = RTN_LOCAL;
2496 flags |= RTCF_LOCAL;
2497 goto make_route;
2498 }
2499
2500 err = fib_lookup(net, fl4, res, 0);
2501 if (err) {
2502 res->fi = NULL;
2503 res->table = NULL;
2504 if (fl4->flowi4_oif &&
2505 (ipv4_is_multicast(fl4->daddr) ||
2506 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2507 /* Apparently, the routing tables are wrong. Assume
2508 that the destination is on-link.
2509
2510 WHY? DW.
2511 Because we are allowed to send to an iface
2512 even if it has NO routes and NO assigned
2513 addresses. When oif is specified, the routing
2514 tables are looked up with only one purpose:
2515 to catch whether the destination is gatewayed, rather than
2516 direct. Moreover, if MSG_DONTROUTE is set,
2517 we send the packet, ignoring both the routing tables
2518 and the ifaddr state. --ANK
2519
2520
2521 We could do this even if oif is unknown,
2522 likely IPv6, but we do not.
2523 */
2524
2525 if (fl4->saddr == 0)
2526 fl4->saddr = inet_select_addr(dev_out, 0,
2527 RT_SCOPE_LINK);
2528 res->type = RTN_UNICAST;
2529 goto make_route;
2530 }
2531 rth = ERR_PTR(err);
2532 goto out;
2533 }
2534
2535 if (res->type == RTN_LOCAL) {
2536 if (!fl4->saddr) {
2537 if (res->fi->fib_prefsrc)
2538 fl4->saddr = res->fi->fib_prefsrc;
2539 else
2540 fl4->saddr = fl4->daddr;
2541 }
2542
2543 /* L3 master device is the loopback for that domain */
2544 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2545 net->loopback_dev;
2546
2547 /* make sure orig_oif points to fib result device even
2548 * though packet rx/tx happens over loopback or l3mdev
2549 */
2550 orig_oif = FIB_RES_OIF(*res);
2551
2552 fl4->flowi4_oif = dev_out->ifindex;
2553 flags |= RTCF_LOCAL;
2554 goto make_route;
2555 }
2556
2557 fib_select_path(net, res, fl4, skb);
2558
2559 dev_out = FIB_RES_DEV(*res);
2560 fl4->flowi4_oif = dev_out->ifindex;
2561
2562
2563 make_route:
2564 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2565
2566 out:
2567 return rth;
2568 }
2569
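/* dst_ops for blackhole routes: the check callback always reports the
 * dst as invalid, PMTU updates and redirects are ignored, and metrics
 * are never COWed.
 */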
2570 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2571 {
2572 return NULL;
2573 }
2574
2575 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2576 {
2577 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2578
2579 return mtu ? : dst->dev->mtu;
2580 }
2581
2582 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2583 struct sk_buff *skb, u32 mtu)
2584 {
2585 }
2586
2587 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2588 struct sk_buff *skb)
2589 {
2590 }
2591
2592 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2593 unsigned long old)
2594 {
2595 return NULL;
2596 }
2597
2598 static struct dst_ops ipv4_dst_blackhole_ops = {
2599 .family = AF_INET,
2600 .check = ipv4_blackhole_dst_check,
2601 .mtu = ipv4_blackhole_mtu,
2602 .default_advmss = ipv4_default_advmss,
2603 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2604 .redirect = ipv4_rt_blackhole_redirect,
2605 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2606 .neigh_lookup = ipv4_neigh_lookup,
2607 };
2608
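/* Clone @dst_orig into a blackhole route whose input and output handlers
 * silently discard traffic; used by the xfrm code when a usable route
 * cannot be returned.
 */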
2609 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2610 {
2611 struct rtable *ort = (struct rtable *) dst_orig;
2612 struct rtable *rt;
2613
2614 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2615 if (rt) {
2616 struct dst_entry *new = &rt->dst;
2617
2618 new->__use = 1;
2619 new->input = dst_discard;
2620 new->output = dst_discard_out;
2621
2622 new->dev = net->loopback_dev;
2623 if (new->dev)
2624 dev_hold(new->dev);
2625
2626 rt->rt_is_input = ort->rt_is_input;
2627 rt->rt_iif = ort->rt_iif;
2628 rt->rt_pmtu = ort->rt_pmtu;
2629 rt->rt_mtu_locked = ort->rt_mtu_locked;
2630
2631 rt->rt_genid = rt_genid_ipv4(net);
2632 rt->rt_flags = ort->rt_flags;
2633 rt->rt_type = ort->rt_type;
2634 rt->rt_gw_family = ort->rt_gw_family;
2635 if (rt->rt_gw_family == AF_INET)
2636 rt->rt_gw4 = ort->rt_gw4;
2637 else if (rt->rt_gw_family == AF_INET6)
2638 rt->rt_gw6 = ort->rt_gw6;
2639
2640 INIT_LIST_HEAD(&rt->rt_uncached);
2641 }
2642
2643 dst_release(dst_orig);
2644
2645 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2646 }
2647
2648 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2649 const struct sock *sk)
2650 {
2651 struct rtable *rt = __ip_route_output_key(net, flp4);
2652
2653 if (IS_ERR(rt))
2654 return rt;
2655
2656 if (flp4->flowi4_proto)
2657 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2658 flowi4_to_flowi(flp4),
2659 sk, 0);
2660
2661 return rt;
2662 }
2663 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2664
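/* Fill a netlink RTM_NEWROUTE message describing @rt, used to answer
 * RTM_GETROUTE requests.
 */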
2665 /* called with rcu_read_lock held */
2666 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2667 struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2668 struct sk_buff *skb, u32 portid, u32 seq)
2669 {
2670 struct rtmsg *r;
2671 struct nlmsghdr *nlh;
2672 unsigned long expires = 0;
2673 u32 error;
2674 u32 metrics[RTAX_MAX];
2675
2676 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2677 if (!nlh)
2678 return -EMSGSIZE;
2679
2680 r = nlmsg_data(nlh);
2681 r->rtm_family = AF_INET;
2682 r->rtm_dst_len = 32;
2683 r->rtm_src_len = 0;
2684 r->rtm_tos = fl4->flowi4_tos;
2685 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2686 if (nla_put_u32(skb, RTA_TABLE, table_id))
2687 goto nla_put_failure;
2688 r->rtm_type = rt->rt_type;
2689 r->rtm_scope = RT_SCOPE_UNIVERSE;
2690 r->rtm_protocol = RTPROT_UNSPEC;
2691 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2692 if (rt->rt_flags & RTCF_NOTIFY)
2693 r->rtm_flags |= RTM_F_NOTIFY;
2694 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2695 r->rtm_flags |= RTCF_DOREDIRECT;
2696
2697 if (nla_put_in_addr(skb, RTA_DST, dst))
2698 goto nla_put_failure;
2699 if (src) {
2700 r->rtm_src_len = 32;
2701 if (nla_put_in_addr(skb, RTA_SRC, src))
2702 goto nla_put_failure;
2703 }
2704 if (rt->dst.dev &&
2705 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2706 goto nla_put_failure;
2707 #ifdef CONFIG_IP_ROUTE_CLASSID
2708 if (rt->dst.tclassid &&
2709 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2710 goto nla_put_failure;
2711 #endif
2712 if (!rt_is_input_route(rt) &&
2713 fl4->saddr != src) {
2714 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2715 goto nla_put_failure;
2716 }
2717 if (rt->rt_gw_family == AF_INET &&
2718 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2719 goto nla_put_failure;
2720 } else if (rt->rt_gw_family == AF_INET6) {
2721 int alen = sizeof(struct in6_addr);
2722 struct nlattr *nla;
2723 struct rtvia *via;
2724
2725 nla = nla_reserve(skb, RTA_VIA, alen + 2);
2726 if (!nla)
2727 goto nla_put_failure;
2728
2729 via = nla_data(nla);
2730 via->rtvia_family = AF_INET6;
2731 memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2732 }
2733
2734 expires = rt->dst.expires;
2735 if (expires) {
2736 unsigned long now = jiffies;
2737
2738 if (time_before(now, expires))
2739 expires -= now;
2740 else
2741 expires = 0;
2742 }
2743
2744 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2745 if (rt->rt_pmtu && expires)
2746 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2747 if (rt->rt_mtu_locked && expires)
2748 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2749 if (rtnetlink_put_metrics(skb, metrics) < 0)
2750 goto nla_put_failure;
2751
2752 if (fl4->flowi4_mark &&
2753 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2754 goto nla_put_failure;
2755
2756 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2757 nla_put_u32(skb, RTA_UID,
2758 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2759 goto nla_put_failure;
2760
2761 error = rt->dst.error;
2762
2763 if (rt_is_input_route(rt)) {
2764 #ifdef CONFIG_IP_MROUTE
2765 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2766 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2767 int err = ipmr_get_route(net, skb,
2768 fl4->saddr, fl4->daddr,
2769 r, portid);
2770
2771 if (err <= 0) {
2772 if (err == 0)
2773 return 0;
2774 goto nla_put_failure;
2775 }
2776 } else
2777 #endif
2778 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2779 goto nla_put_failure;
2780 }
2781
2782 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2783 goto nla_put_failure;
2784
2785 nlmsg_end(skb, nlh);
2786 return 0;
2787
2788 nla_put_failure:
2789 nlmsg_cancel(skb, nlh);
2790 return -EMSGSIZE;
2791 }
2792
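/* Build a dummy skb with minimal IPv4 and transport headers so the
 * RTM_GETROUTE lookup can run through the same code paths (e.g. flow
 * dissection for multipath hashing) as real traffic.
 */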
2793 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2794 u8 ip_proto, __be16 sport,
2795 __be16 dport)
2796 {
2797 struct sk_buff *skb;
2798 struct iphdr *iph;
2799
2800 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2801 if (!skb)
2802 return NULL;
2803
2804 /* Reserve room for dummy headers; this skb can pass
2805 * through a good chunk of the routing engine.
2806 */
2807 skb_reset_mac_header(skb);
2808 skb_reset_network_header(skb);
2809 skb->protocol = htons(ETH_P_IP);
2810 iph = skb_put(skb, sizeof(struct iphdr));
2811 iph->protocol = ip_proto;
2812 iph->saddr = src;
2813 iph->daddr = dst;
2814 iph->version = 0x4;
2815 iph->frag_off = 0;
2816 iph->ihl = 0x5;
2817 skb_set_transport_header(skb, skb->len);
2818
2819 switch (iph->protocol) {
2820 case IPPROTO_UDP: {
2821 struct udphdr *udph;
2822
2823 udph = skb_put_zero(skb, sizeof(struct udphdr));
2824 udph->source = sport;
2825 udph->dest = dport;
2826 udph->len = sizeof(struct udphdr);
2827 udph->check = 0;
2828 break;
2829 }
2830 case IPPROTO_TCP: {
2831 struct tcphdr *tcph;
2832
2833 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2834 tcph->source = sport;
2835 tcph->dest = dport;
2836 tcph->doff = sizeof(struct tcphdr) / 4;
2837 tcph->rst = 1;
2838 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2839 src, dst, 0);
2840 break;
2841 }
2842 case IPPROTO_ICMP: {
2843 struct icmphdr *icmph;
2844
2845 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2846 icmph->type = ICMP_ECHO;
2847 icmph->code = 0;
2848 }
2849 }
2850
2851 return skb;
2852 }
2853
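/* Validate an RTM_GETROUTE request; with strict checking enabled only
 * the header fields and attributes this handler actually uses are
 * accepted.
 */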
2854 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
2855 const struct nlmsghdr *nlh,
2856 struct nlattr **tb,
2857 struct netlink_ext_ack *extack)
2858 {
2859 struct rtmsg *rtm;
2860 int i, err;
2861
2862 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
2863 NL_SET_ERR_MSG(extack,
2864 "ipv4: Invalid header for route get request");
2865 return -EINVAL;
2866 }
2867
2868 if (!netlink_strict_get_check(skb))
2869 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
2870 rtm_ipv4_policy, extack);
2871
2872 rtm = nlmsg_data(nlh);
2873 if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
2874 (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
2875 rtm->rtm_table || rtm->rtm_protocol ||
2876 rtm->rtm_scope || rtm->rtm_type) {
2877 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
2878 return -EINVAL;
2879 }
2880
2881 if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
2882 RTM_F_LOOKUP_TABLE |
2883 RTM_F_FIB_MATCH)) {
2884 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
2885 return -EINVAL;
2886 }
2887
2888 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
2889 rtm_ipv4_policy, extack);
2890 if (err)
2891 return err;
2892
2893 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
2894 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
2895 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
2896 return -EINVAL;
2897 }
2898
2899 for (i = 0; i <= RTA_MAX; i++) {
2900 if (!tb[i])
2901 continue;
2902
2903 switch (i) {
2904 case RTA_IIF:
2905 case RTA_OIF:
2906 case RTA_SRC:
2907 case RTA_DST:
2908 case RTA_IP_PROTO:
2909 case RTA_SPORT:
2910 case RTA_DPORT:
2911 case RTA_MARK:
2912 case RTA_UID:
2913 break;
2914 default:
2915 NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
2916 return -EINVAL;
2917 }
2918 }
2919
2920 return 0;
2921 }
2922
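/* RTM_GETROUTE handler (e.g. "ip route get"): perform either an input
 * lookup (when RTA_IIF is given) or an output lookup, and return the
 * resulting route to user space.
 */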
2923 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2924 struct netlink_ext_ack *extack)
2925 {
2926 struct net *net = sock_net(in_skb->sk);
2927 struct nlattr *tb[RTA_MAX+1];
2928 u32 table_id = RT_TABLE_MAIN;
2929 __be16 sport = 0, dport = 0;
2930 struct fib_result res = {};
2931 u8 ip_proto = IPPROTO_UDP;
2932 struct rtable *rt = NULL;
2933 struct sk_buff *skb;
2934 struct rtmsg *rtm;
2935 struct flowi4 fl4 = {};
2936 __be32 dst = 0;
2937 __be32 src = 0;
2938 kuid_t uid;
2939 u32 iif;
2940 int err;
2941 int mark;
2942
2943 err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
2944 if (err < 0)
2945 return err;
2946
2947 rtm = nlmsg_data(nlh);
2948 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2949 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2950 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2951 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2952 if (tb[RTA_UID])
2953 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2954 else
2955 uid = (iif ? INVALID_UID : current_uid());
2956
2957 if (tb[RTA_IP_PROTO]) {
2958 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
2959 &ip_proto, AF_INET, extack);
2960 if (err)
2961 return err;
2962 }
2963
2964 if (tb[RTA_SPORT])
2965 sport = nla_get_be16(tb[RTA_SPORT]);
2966
2967 if (tb[RTA_DPORT])
2968 dport = nla_get_be16(tb[RTA_DPORT]);
2969
2970 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
2971 if (!skb)
2972 return -ENOBUFS;
2973
2974 fl4.daddr = dst;
2975 fl4.saddr = src;
2976 fl4.flowi4_tos = rtm->rtm_tos;
2977 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2978 fl4.flowi4_mark = mark;
2979 fl4.flowi4_uid = uid;
2980 if (sport)
2981 fl4.fl4_sport = sport;
2982 if (dport)
2983 fl4.fl4_dport = dport;
2984 fl4.flowi4_proto = ip_proto;
2985
2986 rcu_read_lock();
2987
2988 if (iif) {
2989 struct net_device *dev;
2990
2991 dev = dev_get_by_index_rcu(net, iif);
2992 if (!dev) {
2993 err = -ENODEV;
2994 goto errout_rcu;
2995 }
2996
2997 fl4.flowi4_iif = iif; /* for rt_fill_info */
2998 skb->dev = dev;
2999 skb->mark = mark;
3000 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
3001 dev, &res);
3002
3003 rt = skb_rtable(skb);
3004 if (err == 0 && rt->dst.error)
3005 err = -rt->dst.error;
3006 } else {
3007 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3008 skb->dev = net->loopback_dev;
3009 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3010 err = 0;
3011 if (IS_ERR(rt))
3012 err = PTR_ERR(rt);
3013 else
3014 skb_dst_set(skb, &rt->dst);
3015 }
3016
3017 if (err)
3018 goto errout_rcu;
3019
3020 if (rtm->rtm_flags & RTM_F_NOTIFY)
3021 rt->rt_flags |= RTCF_NOTIFY;
3022
3023 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3024 table_id = res.table ? res.table->tb_id : 0;
3025
3026 /* reset skb for netlink reply msg */
3027 skb_trim(skb, 0);
3028 skb_reset_network_header(skb);
3029 skb_reset_transport_header(skb);
3030 skb_reset_mac_header(skb);
3031
3032 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3033 if (!res.fi) {
3034 err = fib_props[res.type].error;
3035 if (!err)
3036 err = -EHOSTUNREACH;
3037 goto errout_rcu;
3038 }
3039 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3040 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
3041 rt->rt_type, res.prefix, res.prefixlen,
3042 fl4.flowi4_tos, res.fi, 0);
3043 } else {
3044 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3045 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
3046 }
3047 if (err < 0)
3048 goto errout_rcu;
3049
3050 rcu_read_unlock();
3051
3052 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3053
3054 errout_free:
3055 return err;
3056 errout_rcu:
3057 rcu_read_unlock();
3058 kfree_skb(skb);
3059 goto errout_free;
3060 }
3061
3062 void ip_rt_multicast_event(struct in_device *in_dev)
3063 {
3064 rt_cache_flush(dev_net(in_dev->dev));
3065 }
3066
3067 #ifdef CONFIG_SYSCTL
3068 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
3069 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
3070 static int ip_rt_gc_elasticity __read_mostly = 8;
3071 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
3072
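/* Handler for /proc/sys/net/ipv4/route/flush: any write (for example
 * "echo 1 > /proc/sys/net/ipv4/route/flush") invalidates cached routes
 * and bumps the fnhe genid; reads are rejected.
 */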
3073 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3074 void __user *buffer,
3075 size_t *lenp, loff_t *ppos)
3076 {
3077 struct net *net = (struct net *)__ctl->extra1;
3078
3079 if (write) {
3080 rt_cache_flush(net);
3081 fnhe_genid_bump(net);
3082 return 0;
3083 }
3084
3085 return -EINVAL;
3086 }
3087
3088 static struct ctl_table ipv4_route_table[] = {
3089 {
3090 .procname = "gc_thresh",
3091 .data = &ipv4_dst_ops.gc_thresh,
3092 .maxlen = sizeof(int),
3093 .mode = 0644,
3094 .proc_handler = proc_dointvec,
3095 },
3096 {
3097 .procname = "max_size",
3098 .data = &ip_rt_max_size,
3099 .maxlen = sizeof(int),
3100 .mode = 0644,
3101 .proc_handler = proc_dointvec,
3102 },
3103 {
3104 /* Deprecated. Use gc_min_interval_ms */
3105
3106 .procname = "gc_min_interval",
3107 .data = &ip_rt_gc_min_interval,
3108 .maxlen = sizeof(int),
3109 .mode = 0644,
3110 .proc_handler = proc_dointvec_jiffies,
3111 },
3112 {
3113 .procname = "gc_min_interval_ms",
3114 .data = &ip_rt_gc_min_interval,
3115 .maxlen = sizeof(int),
3116 .mode = 0644,
3117 .proc_handler = proc_dointvec_ms_jiffies,
3118 },
3119 {
3120 .procname = "gc_timeout",
3121 .data = &ip_rt_gc_timeout,
3122 .maxlen = sizeof(int),
3123 .mode = 0644,
3124 .proc_handler = proc_dointvec_jiffies,
3125 },
3126 {
3127 .procname = "gc_interval",
3128 .data = &ip_rt_gc_interval,
3129 .maxlen = sizeof(int),
3130 .mode = 0644,
3131 .proc_handler = proc_dointvec_jiffies,
3132 },
3133 {
3134 .procname = "redirect_load",
3135 .data = &ip_rt_redirect_load,
3136 .maxlen = sizeof(int),
3137 .mode = 0644,
3138 .proc_handler = proc_dointvec,
3139 },
3140 {
3141 .procname = "redirect_number",
3142 .data = &ip_rt_redirect_number,
3143 .maxlen = sizeof(int),
3144 .mode = 0644,
3145 .proc_handler = proc_dointvec,
3146 },
3147 {
3148 .procname = "redirect_silence",
3149 .data = &ip_rt_redirect_silence,
3150 .maxlen = sizeof(int),
3151 .mode = 0644,
3152 .proc_handler = proc_dointvec,
3153 },
3154 {
3155 .procname = "error_cost",
3156 .data = &ip_rt_error_cost,
3157 .maxlen = sizeof(int),
3158 .mode = 0644,
3159 .proc_handler = proc_dointvec,
3160 },
3161 {
3162 .procname = "error_burst",
3163 .data = &ip_rt_error_burst,
3164 .maxlen = sizeof(int),
3165 .mode = 0644,
3166 .proc_handler = proc_dointvec,
3167 },
3168 {
3169 .procname = "gc_elasticity",
3170 .data = &ip_rt_gc_elasticity,
3171 .maxlen = sizeof(int),
3172 .mode = 0644,
3173 .proc_handler = proc_dointvec,
3174 },
3175 {
3176 .procname = "mtu_expires",
3177 .data = &ip_rt_mtu_expires,
3178 .maxlen = sizeof(int),
3179 .mode = 0644,
3180 .proc_handler = proc_dointvec_jiffies,
3181 },
3182 {
3183 .procname = "min_pmtu",
3184 .data = &ip_rt_min_pmtu,
3185 .maxlen = sizeof(int),
3186 .mode = 0644,
3187 .proc_handler = proc_dointvec_minmax,
3188 .extra1 = &ip_min_valid_pmtu,
3189 },
3190 {
3191 .procname = "min_adv_mss",
3192 .data = &ip_rt_min_advmss,
3193 .maxlen = sizeof(int),
3194 .mode = 0644,
3195 .proc_handler = proc_dointvec,
3196 },
3197 { }
3198 };
3199
3200 static struct ctl_table ipv4_route_flush_table[] = {
3201 {
3202 .procname = "flush",
3203 .maxlen = sizeof(int),
3204 .mode = 0200,
3205 .proc_handler = ipv4_sysctl_rtcache_flush,
3206 },
3207 { },
3208 };
3209
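/* Register the per-netns "flush" sysctl; namespaces other than init_net
 * get their own copy of the table so that extra1 can point at the right
 * struct net.
 */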
3210 static __net_init int sysctl_route_net_init(struct net *net)
3211 {
3212 struct ctl_table *tbl;
3213
3214 tbl = ipv4_route_flush_table;
3215 if (!net_eq(net, &init_net)) {
3216 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3217 if (!tbl)
3218 goto err_dup;
3219
3220 /* Don't export sysctls to unprivileged users */
3221 if (net->user_ns != &init_user_ns)
3222 tbl[0].procname = NULL;
3223 }
3224 tbl[0].extra1 = net;
3225
3226 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3227 if (!net->ipv4.route_hdr)
3228 goto err_reg;
3229 return 0;
3230
3231 err_reg:
3232 if (tbl != ipv4_route_flush_table)
3233 kfree(tbl);
3234 err_dup:
3235 return -ENOMEM;
3236 }
3237
3238 static __net_exit void sysctl_route_net_exit(struct net *net)
3239 {
3240 struct ctl_table *tbl;
3241
3242 tbl = net->ipv4.route_hdr->ctl_table_arg;
3243 unregister_net_sysctl_table(net->ipv4.route_hdr);
3244 BUG_ON(tbl == ipv4_route_flush_table);
3245 kfree(tbl);
3246 }
3247
3248 static __net_initdata struct pernet_operations sysctl_route_ops = {
3249 .init = sysctl_route_net_init,
3250 .exit = sysctl_route_net_exit,
3251 };
3252 #endif
3253
3254 static __net_init int rt_genid_init(struct net *net)
3255 {
3256 atomic_set(&net->ipv4.rt_genid, 0);
3257 atomic_set(&net->fnhe_genid, 0);
3258 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3259 return 0;
3260 }
3261
3262 static __net_initdata struct pernet_operations rt_genid_ops = {
3263 .init = rt_genid_init,
3264 };
3265
3266 static int __net_init ipv4_inetpeer_init(struct net *net)
3267 {
3268 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3269
3270 if (!bp)
3271 return -ENOMEM;
3272 inet_peer_base_init(bp);
3273 net->ipv4.peers = bp;
3274 return 0;
3275 }
3276
3277 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3278 {
3279 struct inet_peer_base *bp = net->ipv4.peers;
3280
3281 net->ipv4.peers = NULL;
3282 inetpeer_invalidate_tree(bp);
3283 kfree(bp);
3284 }
3285
3286 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3287 .init = ipv4_inetpeer_init,
3288 .exit = ipv4_inetpeer_exit,
3289 };
3290
3291 #ifdef CONFIG_IP_ROUTE_CLASSID
3292 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3293 #endif /* CONFIG_IP_ROUTE_CLASSID */
3294
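/* Boot-time initialization of the IPv4 routing layer: IP ID state,
 * per-cpu uncached route lists, dst caches, FIB/devinet setup, proc and
 * sysctl entries, and the RTM_GETROUTE handler.
 */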
3295 int __init ip_rt_init(void)
3296 {
3297 int cpu;
3298
3299 ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3300 GFP_KERNEL);
3301 if (!ip_idents)
3302 panic("IP: failed to allocate ip_idents\n");
3303
3304 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3305
3306 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3307 if (!ip_tstamps)
3308 panic("IP: failed to allocate ip_tstamps\n");
3309
3310 for_each_possible_cpu(cpu) {
3311 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3312
3313 INIT_LIST_HEAD(&ul->head);
3314 spin_lock_init(&ul->lock);
3315 }
3316 #ifdef CONFIG_IP_ROUTE_CLASSID
3317 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3318 if (!ip_rt_acct)
3319 panic("IP: failed to allocate ip_rt_acct\n");
3320 #endif
3321
3322 ipv4_dst_ops.kmem_cachep =
3323 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3324 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3325
3326 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3327
3328 if (dst_entries_init(&ipv4_dst_ops) < 0)
3329 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3330
3331 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3332 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3333
3334 ipv4_dst_ops.gc_thresh = ~0;
3335 ip_rt_max_size = INT_MAX;
3336
3337 devinet_init();
3338 ip_fib_init();
3339
3340 if (ip_rt_proc_init())
3341 pr_err("Unable to create route proc files\n");
3342 #ifdef CONFIG_XFRM
3343 xfrm_init();
3344 xfrm4_init();
3345 #endif
3346 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3347 RTNL_FLAG_DOIT_UNLOCKED);
3348
3349 #ifdef CONFIG_SYSCTL
3350 register_pernet_subsys(&sysctl_route_ops);
3351 #endif
3352 register_pernet_subsys(&rt_genid_ops);
3353 register_pernet_subsys(&ipv4_inetpeer_ops);
3354 return 0;
3355 }
3356
3357 #ifdef CONFIG_SYSCTL
3358 /*
3359 * We really need to sanitize the damn ipv4 init order, then all
3360 * this nonsense will go away.
3361 */
3362 void __init ip_static_sysctl_init(void)
3363 {
3364 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3365 }
3366 #endif