]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blame - net/ipv4/route.c
userns: Make seq_file's user namespace accessible
[mirror_ubuntu-bionic-kernel.git] / net / ipv4 / route.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
e905a9ed 21 * Alan Cox : Super /proc >4K
1da177e4
LT
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
e905a9ed 39 *
1da177e4
LT
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
afd46503
JP
65#define pr_fmt(fmt) "IPv4: " fmt
66
1da177e4
LT
67#include <linux/module.h>
68#include <asm/uaccess.h>
1da177e4
LT
69#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
1da177e4 72#include <linux/mm.h>
424c4b70 73#include <linux/bootmem.h>
1da177e4
LT
74#include <linux/string.h>
75#include <linux/socket.h>
76#include <linux/sockios.h>
77#include <linux/errno.h>
78#include <linux/in.h>
79#include <linux/inet.h>
80#include <linux/netdevice.h>
81#include <linux/proc_fs.h>
82#include <linux/init.h>
39c90ece 83#include <linux/workqueue.h>
1da177e4 84#include <linux/skbuff.h>
1da177e4
LT
85#include <linux/inetdevice.h>
86#include <linux/igmp.h>
87#include <linux/pkt_sched.h>
88#include <linux/mroute.h>
89#include <linux/netfilter_ipv4.h>
90#include <linux/random.h>
91#include <linux/jhash.h>
92#include <linux/rcupdate.h>
93#include <linux/times.h>
5a0e3ad6 94#include <linux/slab.h>
b9eda06f 95#include <linux/prefetch.h>
352e512c 96#include <net/dst.h>
457c4cbc 97#include <net/net_namespace.h>
1da177e4
LT
98#include <net/protocol.h>
99#include <net/ip.h>
100#include <net/route.h>
101#include <net/inetpeer.h>
102#include <net/sock.h>
103#include <net/ip_fib.h>
104#include <net/arp.h>
105#include <net/tcp.h>
106#include <net/icmp.h>
107#include <net/xfrm.h>
8d71740c 108#include <net/netevent.h>
63f3444f 109#include <net/rtnetlink.h>
1da177e4
LT
110#ifdef CONFIG_SYSCTL
111#include <linux/sysctl.h>
7426a564 112#include <linux/kmemleak.h>
1da177e4 113#endif
6e5714ea 114#include <net/secure_seq.h>
1da177e4 115
68a5e3dd 116#define RT_FL_TOS(oldflp4) \
f61759e6 117 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
1da177e4
LT
118
119#define IP_MAX_MTU 0xFFF0
120
121#define RT_GC_TIMEOUT (300*HZ)
122
1da177e4 123static int ip_rt_max_size;
817bc4db 124static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
9f28a2fc 125static int ip_rt_gc_interval __read_mostly = 60 * HZ;
817bc4db
SH
126static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
127static int ip_rt_redirect_number __read_mostly = 9;
128static int ip_rt_redirect_load __read_mostly = HZ / 50;
129static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
130static int ip_rt_error_cost __read_mostly = HZ;
131static int ip_rt_error_burst __read_mostly = 5 * HZ;
132static int ip_rt_gc_elasticity __read_mostly = 8;
133static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
134static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
135static int ip_rt_min_advmss __read_mostly = 256;
9f28a2fc 136
1da177e4
LT
137/*
138 * Interface to generic destination cache.
139 */
140
141static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
0dbaee3b 142static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
ebb762f2 143static unsigned int ipv4_mtu(const struct dst_entry *dst);
1da177e4
LT
144static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
145static void ipv4_link_failure(struct sk_buff *skb);
6700c270
DM
146static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
147 struct sk_buff *skb, u32 mtu);
148static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
149 struct sk_buff *skb);
caacf05e 150static void ipv4_dst_destroy(struct dst_entry *dst);
1da177e4 151
72cdd1d9
ED
152static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
153 int how)
154{
155}
1da177e4 156
62fa8a84
DM
157static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
158{
31248731
DM
159 WARN_ON(1);
160 return NULL;
62fa8a84
DM
161}
162
f894cbf8
DM
163static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
164 struct sk_buff *skb,
165 const void *daddr);
d3aaeb38 166
1da177e4
LT
167static struct dst_ops ipv4_dst_ops = {
168 .family = AF_INET,
09640e63 169 .protocol = cpu_to_be16(ETH_P_IP),
1da177e4 170 .check = ipv4_dst_check,
0dbaee3b 171 .default_advmss = ipv4_default_advmss,
ebb762f2 172 .mtu = ipv4_mtu,
62fa8a84 173 .cow_metrics = ipv4_cow_metrics,
caacf05e 174 .destroy = ipv4_dst_destroy,
1da177e4
LT
175 .ifdown = ipv4_dst_ifdown,
176 .negative_advice = ipv4_negative_advice,
177 .link_failure = ipv4_link_failure,
178 .update_pmtu = ip_rt_update_pmtu,
e47a185b 179 .redirect = ip_do_redirect,
1ac06e03 180 .local_out = __ip_local_out,
d3aaeb38 181 .neigh_lookup = ipv4_neigh_lookup,
1da177e4
LT
182};
183
184#define ECN_OR_COST(class) TC_PRIO_##class
185
4839c52b 186const __u8 ip_tos2prio[16] = {
1da177e4 187 TC_PRIO_BESTEFFORT,
4a2b9c37 188 ECN_OR_COST(BESTEFFORT),
1da177e4
LT
189 TC_PRIO_BESTEFFORT,
190 ECN_OR_COST(BESTEFFORT),
191 TC_PRIO_BULK,
192 ECN_OR_COST(BULK),
193 TC_PRIO_BULK,
194 ECN_OR_COST(BULK),
195 TC_PRIO_INTERACTIVE,
196 ECN_OR_COST(INTERACTIVE),
197 TC_PRIO_INTERACTIVE,
198 ECN_OR_COST(INTERACTIVE),
199 TC_PRIO_INTERACTIVE_BULK,
200 ECN_OR_COST(INTERACTIVE_BULK),
201 TC_PRIO_INTERACTIVE_BULK,
202 ECN_OR_COST(INTERACTIVE_BULK)
203};
d4a96865 204EXPORT_SYMBOL(ip_tos2prio);
1da177e4 205
2f970d83 206static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
27f39c73 207#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
1da177e4 208
e84f84f2
DL
209static inline int rt_genid(struct net *net)
210{
211 return atomic_read(&net->ipv4.rt_genid);
212}
213
1da177e4 214#ifdef CONFIG_PROC_FS
1da177e4
LT
215static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
216{
29e75252 217 if (*pos)
89aef892 218 return NULL;
29e75252 219 return SEQ_START_TOKEN;
1da177e4
LT
220}
221
222static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
223{
1da177e4 224 ++*pos;
89aef892 225 return NULL;
1da177e4
LT
226}
227
228static void rt_cache_seq_stop(struct seq_file *seq, void *v)
229{
1da177e4
LT
230}
231
232static int rt_cache_seq_show(struct seq_file *seq, void *v)
233{
234 if (v == SEQ_START_TOKEN)
235 seq_printf(seq, "%-127s\n",
236 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
237 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
238 "HHUptod\tSpecDst");
e905a9ed 239 return 0;
1da177e4
LT
240}
241
f690808e 242static const struct seq_operations rt_cache_seq_ops = {
1da177e4
LT
243 .start = rt_cache_seq_start,
244 .next = rt_cache_seq_next,
245 .stop = rt_cache_seq_stop,
246 .show = rt_cache_seq_show,
247};
248
249static int rt_cache_seq_open(struct inode *inode, struct file *file)
250{
89aef892 251 return seq_open(file, &rt_cache_seq_ops);
1da177e4
LT
252}
253
9a32144e 254static const struct file_operations rt_cache_seq_fops = {
1da177e4
LT
255 .owner = THIS_MODULE,
256 .open = rt_cache_seq_open,
257 .read = seq_read,
258 .llseek = seq_lseek,
89aef892 259 .release = seq_release,
1da177e4
LT
260};
261
262
263static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
264{
265 int cpu;
266
267 if (*pos == 0)
268 return SEQ_START_TOKEN;
269
0f23174a 270 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
271 if (!cpu_possible(cpu))
272 continue;
273 *pos = cpu+1;
2f970d83 274 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
275 }
276 return NULL;
277}
278
279static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
280{
281 int cpu;
282
0f23174a 283 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
284 if (!cpu_possible(cpu))
285 continue;
286 *pos = cpu+1;
2f970d83 287 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
288 }
289 return NULL;
e905a9ed 290
1da177e4
LT
291}
292
293static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
294{
295
296}
297
298static int rt_cpu_seq_show(struct seq_file *seq, void *v)
299{
300 struct rt_cache_stat *st = v;
301
302 if (v == SEQ_START_TOKEN) {
5bec0039 303 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
1da177e4
LT
304 return 0;
305 }
e905a9ed 306
1da177e4
LT
307 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
308 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
fc66f95c 309 dst_entries_get_slow(&ipv4_dst_ops),
1da177e4
LT
310 st->in_hit,
311 st->in_slow_tot,
312 st->in_slow_mc,
313 st->in_no_route,
314 st->in_brd,
315 st->in_martian_dst,
316 st->in_martian_src,
317
318 st->out_hit,
319 st->out_slow_tot,
e905a9ed 320 st->out_slow_mc,
1da177e4
LT
321
322 st->gc_total,
323 st->gc_ignored,
324 st->gc_goal_miss,
325 st->gc_dst_overflow,
326 st->in_hlist_search,
327 st->out_hlist_search
328 );
329 return 0;
330}
331
f690808e 332static const struct seq_operations rt_cpu_seq_ops = {
1da177e4
LT
333 .start = rt_cpu_seq_start,
334 .next = rt_cpu_seq_next,
335 .stop = rt_cpu_seq_stop,
336 .show = rt_cpu_seq_show,
337};
338
339
340static int rt_cpu_seq_open(struct inode *inode, struct file *file)
341{
342 return seq_open(file, &rt_cpu_seq_ops);
343}
344
9a32144e 345static const struct file_operations rt_cpu_seq_fops = {
1da177e4
LT
346 .owner = THIS_MODULE,
347 .open = rt_cpu_seq_open,
348 .read = seq_read,
349 .llseek = seq_lseek,
350 .release = seq_release,
351};
352
c7066f70 353#ifdef CONFIG_IP_ROUTE_CLASSID
a661c419 354static int rt_acct_proc_show(struct seq_file *m, void *v)
78c686e9 355{
a661c419
AD
356 struct ip_rt_acct *dst, *src;
357 unsigned int i, j;
358
359 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
360 if (!dst)
361 return -ENOMEM;
362
363 for_each_possible_cpu(i) {
364 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
365 for (j = 0; j < 256; j++) {
366 dst[j].o_bytes += src[j].o_bytes;
367 dst[j].o_packets += src[j].o_packets;
368 dst[j].i_bytes += src[j].i_bytes;
369 dst[j].i_packets += src[j].i_packets;
370 }
78c686e9
PE
371 }
372
a661c419
AD
373 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
374 kfree(dst);
375 return 0;
376}
78c686e9 377
a661c419
AD
378static int rt_acct_proc_open(struct inode *inode, struct file *file)
379{
380 return single_open(file, rt_acct_proc_show, NULL);
78c686e9 381}
a661c419
AD
382
383static const struct file_operations rt_acct_proc_fops = {
384 .owner = THIS_MODULE,
385 .open = rt_acct_proc_open,
386 .read = seq_read,
387 .llseek = seq_lseek,
388 .release = single_release,
389};
78c686e9 390#endif
107f1634 391
73b38711 392static int __net_init ip_rt_do_proc_init(struct net *net)
107f1634
PE
393{
394 struct proc_dir_entry *pde;
395
396 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
397 &rt_cache_seq_fops);
398 if (!pde)
399 goto err1;
400
77020720
WC
401 pde = proc_create("rt_cache", S_IRUGO,
402 net->proc_net_stat, &rt_cpu_seq_fops);
107f1634
PE
403 if (!pde)
404 goto err2;
405
c7066f70 406#ifdef CONFIG_IP_ROUTE_CLASSID
a661c419 407 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
107f1634
PE
408 if (!pde)
409 goto err3;
410#endif
411 return 0;
412
c7066f70 413#ifdef CONFIG_IP_ROUTE_CLASSID
107f1634
PE
414err3:
415 remove_proc_entry("rt_cache", net->proc_net_stat);
416#endif
417err2:
418 remove_proc_entry("rt_cache", net->proc_net);
419err1:
420 return -ENOMEM;
421}
73b38711
DL
422
423static void __net_exit ip_rt_do_proc_exit(struct net *net)
424{
425 remove_proc_entry("rt_cache", net->proc_net_stat);
426 remove_proc_entry("rt_cache", net->proc_net);
c7066f70 427#ifdef CONFIG_IP_ROUTE_CLASSID
73b38711 428 remove_proc_entry("rt_acct", net->proc_net);
0a931acf 429#endif
73b38711
DL
430}
431
432static struct pernet_operations ip_rt_proc_ops __net_initdata = {
433 .init = ip_rt_do_proc_init,
434 .exit = ip_rt_do_proc_exit,
435};
436
437static int __init ip_rt_proc_init(void)
438{
439 return register_pernet_subsys(&ip_rt_proc_ops);
440}
441
107f1634 442#else
73b38711 443static inline int ip_rt_proc_init(void)
107f1634
PE
444{
445 return 0;
446}
1da177e4 447#endif /* CONFIG_PROC_FS */
e905a9ed 448
4331debc 449static inline bool rt_is_expired(const struct rtable *rth)
e84f84f2 450{
d8d1f30b 451 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
e84f84f2
DL
452}
453
29e75252 454/*
25985edc 455 * Perturbation of rt_genid by a small quantity [1..256]
29e75252
ED
456 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
457 * many times (2^24) without giving recent rt_genid.
458 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
1da177e4 459 */
86c657f6 460static void rt_cache_invalidate(struct net *net)
1da177e4 461{
29e75252 462 unsigned char shuffle;
1da177e4 463
29e75252 464 get_random_bytes(&shuffle, sizeof(shuffle));
e84f84f2 465 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
1da177e4
LT
466}
467
29e75252
ED
468/*
469 * delay < 0 : invalidate cache (fast : entries will be deleted later)
470 * delay >= 0 : invalidate & flush cache (can be long)
471 */
76e6ebfb 472void rt_cache_flush(struct net *net, int delay)
1da177e4 473{
86c657f6 474 rt_cache_invalidate(net);
98376387
ED
475}
476
f894cbf8
DM
477static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
478 struct sk_buff *skb,
479 const void *daddr)
3769cffb 480{
d3aaeb38
DM
481 struct net_device *dev = dst->dev;
482 const __be32 *pkey = daddr;
39232973 483 const struct rtable *rt;
3769cffb
DM
484 struct neighbour *n;
485
39232973 486 rt = (const struct rtable *) dst;
a263b309 487 if (rt->rt_gateway)
39232973 488 pkey = (const __be32 *) &rt->rt_gateway;
f894cbf8
DM
489 else if (skb)
490 pkey = &ip_hdr(skb)->daddr;
d3aaeb38 491
80703d26 492 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
d3aaeb38
DM
493 if (n)
494 return n;
32092ecf 495 return neigh_create(&arp_tbl, pkey, dev);
d3aaeb38
DM
496}
497
1da177e4
LT
498/*
499 * Peer allocation may fail only in serious out-of-memory conditions. However
500 * we still can generate some output.
501 * Random ID selection looks a bit dangerous because we have no chances to
502 * select ID being unique in a reasonable period of time.
503 * But broken packet identifier may be better than no packet at all.
504 */
505static void ip_select_fb_ident(struct iphdr *iph)
506{
507 static DEFINE_SPINLOCK(ip_fb_id_lock);
508 static u32 ip_fallback_id;
509 u32 salt;
510
511 spin_lock_bh(&ip_fb_id_lock);
e448515c 512 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1da177e4
LT
513 iph->id = htons(salt & 0xFFFF);
514 ip_fallback_id = salt;
515 spin_unlock_bh(&ip_fb_id_lock);
516}
517
518void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
519{
1d861aa4
DM
520 struct net *net = dev_net(dst->dev);
521 struct inet_peer *peer;
1da177e4 522
1d861aa4
DM
523 peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
524 if (peer) {
525 iph->id = htons(inet_getid(peer, more));
526 inet_putpeer(peer);
527 return;
528 }
1da177e4
LT
529
530 ip_select_fb_ident(iph);
531}
4bc2f18b 532EXPORT_SYMBOL(__ip_select_ident);
1da177e4 533
5abf7f7e 534static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
4895c771
DM
535 const struct iphdr *iph,
536 int oif, u8 tos,
537 u8 prot, u32 mark, int flow_flags)
538{
539 if (sk) {
540 const struct inet_sock *inet = inet_sk(sk);
541
542 oif = sk->sk_bound_dev_if;
543 mark = sk->sk_mark;
544 tos = RT_CONN_FLAGS(sk);
545 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
546 }
547 flowi4_init_output(fl4, oif, mark, tos,
548 RT_SCOPE_UNIVERSE, prot,
549 flow_flags,
550 iph->daddr, iph->saddr, 0, 0);
551}
552
5abf7f7e
ED
553static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
554 const struct sock *sk)
4895c771
DM
555{
556 const struct iphdr *iph = ip_hdr(skb);
557 int oif = skb->dev->ifindex;
558 u8 tos = RT_TOS(iph->tos);
559 u8 prot = iph->protocol;
560 u32 mark = skb->mark;
561
562 __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
563}
564
5abf7f7e 565static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
4895c771
DM
566{
567 const struct inet_sock *inet = inet_sk(sk);
5abf7f7e 568 const struct ip_options_rcu *inet_opt;
4895c771
DM
569 __be32 daddr = inet->inet_daddr;
570
571 rcu_read_lock();
572 inet_opt = rcu_dereference(inet->inet_opt);
573 if (inet_opt && inet_opt->opt.srr)
574 daddr = inet_opt->opt.faddr;
575 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
576 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
577 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
578 inet_sk_flowi_flags(sk),
579 daddr, inet->inet_saddr, 0, 0);
580 rcu_read_unlock();
581}
582
5abf7f7e
ED
583static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
584 const struct sk_buff *skb)
4895c771
DM
585{
586 if (skb)
587 build_skb_flow_key(fl4, skb, sk);
588 else
589 build_sk_flow_key(fl4, sk);
590}
591
c5038a83
DM
592static inline void rt_free(struct rtable *rt)
593{
594 call_rcu(&rt->dst.rcu_head, dst_rcu_free);
595}
596
597static DEFINE_SPINLOCK(fnhe_lock);
4895c771 598
aee06da6 599static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
4895c771
DM
600{
601 struct fib_nh_exception *fnhe, *oldest;
c5038a83 602 struct rtable *orig;
4895c771
DM
603
604 oldest = rcu_dereference(hash->chain);
605 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
606 fnhe = rcu_dereference(fnhe->fnhe_next)) {
607 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
608 oldest = fnhe;
609 }
c5038a83
DM
610 orig = rcu_dereference(oldest->fnhe_rth);
611 if (orig) {
612 RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
613 rt_free(orig);
614 }
4895c771
DM
615 return oldest;
616}
617
d3a25c98
DM
618static inline u32 fnhe_hashfun(__be32 daddr)
619{
620 u32 hval;
621
622 hval = (__force u32) daddr;
623 hval ^= (hval >> 11) ^ (hval >> 22);
624
625 return hval & (FNHE_HASH_SIZE - 1);
626}
627
aee06da6
JA
628static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
629 u32 pmtu, unsigned long expires)
4895c771 630{
aee06da6 631 struct fnhe_hash_bucket *hash;
4895c771
DM
632 struct fib_nh_exception *fnhe;
633 int depth;
aee06da6
JA
634 u32 hval = fnhe_hashfun(daddr);
635
c5038a83 636 spin_lock_bh(&fnhe_lock);
4895c771 637
aee06da6 638 hash = nh->nh_exceptions;
4895c771 639 if (!hash) {
aee06da6 640 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
4895c771 641 if (!hash)
aee06da6
JA
642 goto out_unlock;
643 nh->nh_exceptions = hash;
4895c771
DM
644 }
645
4895c771
DM
646 hash += hval;
647
648 depth = 0;
649 for (fnhe = rcu_dereference(hash->chain); fnhe;
650 fnhe = rcu_dereference(fnhe->fnhe_next)) {
651 if (fnhe->fnhe_daddr == daddr)
aee06da6 652 break;
4895c771
DM
653 depth++;
654 }
655
aee06da6
JA
656 if (fnhe) {
657 if (gw)
658 fnhe->fnhe_gw = gw;
659 if (pmtu) {
660 fnhe->fnhe_pmtu = pmtu;
661 fnhe->fnhe_expires = expires;
662 }
663 } else {
664 if (depth > FNHE_RECLAIM_DEPTH)
665 fnhe = fnhe_oldest(hash);
666 else {
667 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
668 if (!fnhe)
669 goto out_unlock;
670
671 fnhe->fnhe_next = hash->chain;
672 rcu_assign_pointer(hash->chain, fnhe);
673 }
674 fnhe->fnhe_daddr = daddr;
675 fnhe->fnhe_gw = gw;
676 fnhe->fnhe_pmtu = pmtu;
677 fnhe->fnhe_expires = expires;
4895c771 678 }
4895c771 679
4895c771 680 fnhe->fnhe_stamp = jiffies;
aee06da6
JA
681
682out_unlock:
c5038a83 683 spin_unlock_bh(&fnhe_lock);
aee06da6 684 return;
4895c771
DM
685}
686
ceb33206
DM
687static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
688 bool kill_route)
1da177e4 689{
e47a185b 690 __be32 new_gw = icmp_hdr(skb)->un.gateway;
94206125 691 __be32 old_gw = ip_hdr(skb)->saddr;
e47a185b 692 struct net_device *dev = skb->dev;
e47a185b 693 struct in_device *in_dev;
4895c771 694 struct fib_result res;
e47a185b 695 struct neighbour *n;
317805b8 696 struct net *net;
1da177e4 697
94206125
DM
698 switch (icmp_hdr(skb)->code & 7) {
699 case ICMP_REDIR_NET:
700 case ICMP_REDIR_NETTOS:
701 case ICMP_REDIR_HOST:
702 case ICMP_REDIR_HOSTTOS:
703 break;
704
705 default:
706 return;
707 }
708
e47a185b
DM
709 if (rt->rt_gateway != old_gw)
710 return;
711
712 in_dev = __in_dev_get_rcu(dev);
713 if (!in_dev)
714 return;
715
c346dca1 716 net = dev_net(dev);
9d4fb27d
JP
717 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
718 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
719 ipv4_is_zeronet(new_gw))
1da177e4
LT
720 goto reject_redirect;
721
722 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
723 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
724 goto reject_redirect;
725 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
726 goto reject_redirect;
727 } else {
317805b8 728 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1da177e4
LT
729 goto reject_redirect;
730 }
731
4895c771 732 n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
e47a185b
DM
733 if (n) {
734 if (!(n->nud_state & NUD_VALID)) {
735 neigh_event_send(n, NULL);
736 } else {
4895c771
DM
737 if (fib_lookup(net, fl4, &res) == 0) {
738 struct fib_nh *nh = &FIB_RES_NH(res);
4895c771 739
aee06da6
JA
740 update_or_create_fnhe(nh, fl4->daddr, new_gw,
741 0, 0);
4895c771 742 }
ceb33206
DM
743 if (kill_route)
744 rt->dst.obsolete = DST_OBSOLETE_KILL;
e47a185b
DM
745 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
746 }
747 neigh_release(n);
748 }
749 return;
750
751reject_redirect:
752#ifdef CONFIG_IP_ROUTE_VERBOSE
99ee038d
DM
753 if (IN_DEV_LOG_MARTIANS(in_dev)) {
754 const struct iphdr *iph = (const struct iphdr *) skb->data;
755 __be32 daddr = iph->daddr;
756 __be32 saddr = iph->saddr;
757
e47a185b
DM
758 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
759 " Advised path = %pI4 -> %pI4\n",
760 &old_gw, dev->name, &new_gw,
761 &saddr, &daddr);
99ee038d 762 }
e47a185b
DM
763#endif
764 ;
765}
766
4895c771
DM
767static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
768{
769 struct rtable *rt;
770 struct flowi4 fl4;
771
772 rt = (struct rtable *) dst;
773
774 ip_rt_build_flow_key(&fl4, sk, skb);
ceb33206 775 __ip_do_redirect(rt, skb, &fl4, true);
4895c771
DM
776}
777
1da177e4
LT
778static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
779{
ee6b9673 780 struct rtable *rt = (struct rtable *)dst;
1da177e4
LT
781 struct dst_entry *ret = dst;
782
783 if (rt) {
d11a4dc1 784 if (dst->obsolete > 0) {
1da177e4
LT
785 ip_rt_put(rt);
786 ret = NULL;
5943634f
DM
787 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
788 rt->dst.expires) {
89aef892 789 ip_rt_put(rt);
1da177e4
LT
790 ret = NULL;
791 }
792 }
793 return ret;
794}
795
796/*
797 * Algorithm:
798 * 1. The first ip_rt_redirect_number redirects are sent
799 * with exponential backoff, then we stop sending them at all,
800 * assuming that the host ignores our redirects.
801 * 2. If we did not see packets requiring redirects
802 * during ip_rt_redirect_silence, we assume that the host
803 * forgot redirected route and start to send redirects again.
804 *
805 * This algorithm is much cheaper and more intelligent than dumb load limiting
806 * in icmp.c.
807 *
808 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
809 * and "frag. need" (breaks PMTU discovery) in icmp.c.
810 */
811
812void ip_rt_send_redirect(struct sk_buff *skb)
813{
511c3f92 814 struct rtable *rt = skb_rtable(skb);
30038fc6 815 struct in_device *in_dev;
92d86829 816 struct inet_peer *peer;
1d861aa4 817 struct net *net;
30038fc6 818 int log_martians;
1da177e4 819
30038fc6 820 rcu_read_lock();
d8d1f30b 821 in_dev = __in_dev_get_rcu(rt->dst.dev);
30038fc6
ED
822 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
823 rcu_read_unlock();
1da177e4 824 return;
30038fc6
ED
825 }
826 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
827 rcu_read_unlock();
1da177e4 828
1d861aa4
DM
829 net = dev_net(rt->dst.dev);
830 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
92d86829
DM
831 if (!peer) {
832 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
833 return;
834 }
835
1da177e4
LT
836 /* No redirected packets during ip_rt_redirect_silence;
837 * reset the algorithm.
838 */
92d86829
DM
839 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
840 peer->rate_tokens = 0;
1da177e4
LT
841
842 /* Too many ignored redirects; do not send anything
d8d1f30b 843 * set dst.rate_last to the last seen redirected packet.
1da177e4 844 */
92d86829
DM
845 if (peer->rate_tokens >= ip_rt_redirect_number) {
846 peer->rate_last = jiffies;
1d861aa4 847 goto out_put_peer;
1da177e4
LT
848 }
849
850 /* Check for load limit; set rate_last to the latest sent
851 * redirect.
852 */
92d86829 853 if (peer->rate_tokens == 0 ||
14fb8a76 854 time_after(jiffies,
92d86829
DM
855 (peer->rate_last +
856 (ip_rt_redirect_load << peer->rate_tokens)))) {
1da177e4 857 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
92d86829
DM
858 peer->rate_last = jiffies;
859 ++peer->rate_tokens;
1da177e4 860#ifdef CONFIG_IP_ROUTE_VERBOSE
30038fc6 861 if (log_martians &&
e87cc472
JP
862 peer->rate_tokens == ip_rt_redirect_number)
863 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
92101b3b 864 &ip_hdr(skb)->saddr, inet_iif(skb),
f1ce3062 865 &ip_hdr(skb)->daddr, &rt->rt_gateway);
1da177e4
LT
866#endif
867 }
1d861aa4
DM
868out_put_peer:
869 inet_putpeer(peer);
1da177e4
LT
870}
871
872static int ip_error(struct sk_buff *skb)
873{
251da413 874 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
511c3f92 875 struct rtable *rt = skb_rtable(skb);
92d86829 876 struct inet_peer *peer;
1da177e4 877 unsigned long now;
251da413 878 struct net *net;
92d86829 879 bool send;
1da177e4
LT
880 int code;
881
251da413
DM
882 net = dev_net(rt->dst.dev);
883 if (!IN_DEV_FORWARD(in_dev)) {
884 switch (rt->dst.error) {
885 case EHOSTUNREACH:
886 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
887 break;
888
889 case ENETUNREACH:
890 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
891 break;
892 }
893 goto out;
894 }
895
d8d1f30b 896 switch (rt->dst.error) {
4500ebf8
JP
897 case EINVAL:
898 default:
899 goto out;
900 case EHOSTUNREACH:
901 code = ICMP_HOST_UNREACH;
902 break;
903 case ENETUNREACH:
904 code = ICMP_NET_UNREACH;
251da413 905 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
4500ebf8
JP
906 break;
907 case EACCES:
908 code = ICMP_PKT_FILTERED;
909 break;
1da177e4
LT
910 }
911
1d861aa4 912 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
92d86829
DM
913
914 send = true;
915 if (peer) {
916 now = jiffies;
917 peer->rate_tokens += now - peer->rate_last;
918 if (peer->rate_tokens > ip_rt_error_burst)
919 peer->rate_tokens = ip_rt_error_burst;
920 peer->rate_last = now;
921 if (peer->rate_tokens >= ip_rt_error_cost)
922 peer->rate_tokens -= ip_rt_error_cost;
923 else
924 send = false;
1d861aa4 925 inet_putpeer(peer);
1da177e4 926 }
92d86829
DM
927 if (send)
928 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1da177e4
LT
929
930out: kfree_skb(skb);
931 return 0;
e905a9ed 932}
1da177e4 933
ceb33206 934static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1da177e4 935{
4895c771 936 struct fib_result res;
2c8cec5c 937
5943634f
DM
938 if (mtu < ip_rt_min_pmtu)
939 mtu = ip_rt_min_pmtu;
2c8cec5c 940
4895c771
DM
941 if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
942 struct fib_nh *nh = &FIB_RES_NH(res);
4895c771 943
aee06da6
JA
944 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
945 jiffies + ip_rt_mtu_expires);
4895c771 946 }
ceb33206 947 return mtu;
1da177e4
LT
948}
949
4895c771
DM
950static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
951 struct sk_buff *skb, u32 mtu)
952{
953 struct rtable *rt = (struct rtable *) dst;
954 struct flowi4 fl4;
955
956 ip_rt_build_flow_key(&fl4, sk, skb);
ceb33206
DM
957 mtu = __ip_rt_update_pmtu(rt, &fl4, mtu);
958
959 if (!rt->rt_pmtu) {
960 dst->obsolete = DST_OBSOLETE_KILL;
961 } else {
962 rt->rt_pmtu = mtu;
963 dst_set_expires(&rt->dst, ip_rt_mtu_expires);
964 }
4895c771
DM
965}
966
36393395
DM
967void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
968 int oif, u32 mark, u8 protocol, int flow_flags)
969{
4895c771 970 const struct iphdr *iph = (const struct iphdr *) skb->data;
36393395
DM
971 struct flowi4 fl4;
972 struct rtable *rt;
973
4895c771
DM
974 __build_flow_key(&fl4, NULL, iph, oif,
975 RT_TOS(iph->tos), protocol, mark, flow_flags);
36393395
DM
976 rt = __ip_route_output_key(net, &fl4);
977 if (!IS_ERR(rt)) {
4895c771 978 __ip_rt_update_pmtu(rt, &fl4, mtu);
36393395
DM
979 ip_rt_put(rt);
980 }
981}
982EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
983
984void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
985{
4895c771
DM
986 const struct iphdr *iph = (const struct iphdr *) skb->data;
987 struct flowi4 fl4;
988 struct rtable *rt;
36393395 989
4895c771
DM
990 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
991 rt = __ip_route_output_key(sock_net(sk), &fl4);
992 if (!IS_ERR(rt)) {
993 __ip_rt_update_pmtu(rt, &fl4, mtu);
994 ip_rt_put(rt);
995 }
36393395
DM
996}
997EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
f39925db 998
/* Apply an ICMP redirect to the route for the destination in @skb's
 * IP header.  Mirrors ipv4_update_pmtu(): build a flow key, resolve
 * the output route, hand it to __ip_do_redirect() (kill_route=false),
 * then release the route reference.
 */
void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);
1015
/* Socket variant of ipv4_redirect(): flow key comes from @sk and the
 * lookup runs in the socket's network namespace.
 */
void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1030
/* dst_ops->check handler: return the dst if it is still usable,
 * NULL if the caller must relookup.  @cookie is unused for IPv4.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a
	 * route, this is indicated by setting obsolete to
	 * DST_OBSOLETE_KILL.
	 */
	if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
		return NULL;
	return dst;
}
1047
1da177e4
LT
/* dst_ops->link_failure handler: tell the sender the host is
 * unreachable and immediately expire the attached route (expiry 0)
 * so it is not reused.
 */
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}
1058
/* Placeholder dst->output/input handler for routes that must never be
 * transmitted through (e.g. local-delivery routes).  Reaching it is a
 * bug: log the flow, drop the packet and WARN once per hit.
 */
static int ip_rt_bug(struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}
1068
/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by the IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */
1077
/* Fill @addr (4 bytes, possibly unaligned - hence the memcpy) with the
 * source address this host would use on @rt for the packet in @skb.
 * Output routes simply reuse the packet's saddr; for input routes we
 * redo a FIB lookup to find the preferred source, falling back to
 * inet_select_addr() toward the next hop if the lookup fails.
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		/* Rebuild the reversed flow key from the packet headers. */
		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	/* "addr" may be unaligned inside IP options - copy bytewise. */
	memcpy(addr, &src, 4);
}
1110
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Merge @tag into the route's traffic-class id: each 16-bit half of
 * dst.tclassid is taken from the corresponding half of @tag, but only
 * if that half is still zero (already-set halves are preserved).
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 tclassid = rt->dst.tclassid;

	if (!(tclassid & 0xFFFF))
		tclassid |= tag & 0xFFFF;
	if (!(tclassid & 0xFFFF0000))
		tclassid |= tag & 0xFFFF0000;
	rt->dst.tclassid = tclassid;
}
#endif
1120
0dbaee3b
DM
1121static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1122{
1123 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1124
1125 if (advmss == 0) {
1126 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1127 ip_rt_min_advmss);
1128 if (advmss > 65535 - 40)
1129 advmss = 65535 - 40;
1130 }
1131 return advmss;
1132}
1133
/* dst_ops->mtu handler: effective MTU for this route.
 * Preference order: unexpired learned PMTU (rt_pmtu), then the
 * RTAX_MTU metric, then the device MTU.  A locked RTAX_MTU on a
 * gatewayed route is clamped down to 576 (the historical IPv4
 * minimum-reassembly default); the result never exceeds IP_MAX_MTU.
 */
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	/* A learned PMTU is ignored once it has expired. */
	if (mtu && time_after_eq(jiffies, rt->dst.expires))
		mtu = 0;

	if (!mtu)
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu && rt_is_output_route(rt))
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		if (rt->rt_gateway && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}
1160
/* Look up the next-hop exception (PMTU/redirect state) for @daddr in
 * @nh's exception hash table.  Returns NULL when the table does not
 * exist or holds no entry for @daddr.  Walks the chain under RCU, so
 * the caller must be in an rcu_read_lock() section.
 */
static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			return fnhe;
	}
	return NULL;
}
aee06da6 1179
/* Attach @rt to the next-hop exception @fnhe: copy any still-valid
 * PMTU/redirect state from the exception into the route and install
 * the route as the exception's cached rtable (replacing and freeing
 * the previous one).  Returns true when the route was cached.  All
 * exception mutation happens under fnhe_lock.
 */
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	/* Re-check under the lock: the exception may have been recycled
	 * for a different destination since the caller looked it up.
	 */
	if (daddr == fnhe->fnhe_daddr) {
		struct rtable *orig;

		if (fnhe->fnhe_pmtu) {
			unsigned long expires = fnhe->fnhe_expires;
			unsigned long diff = expires - jiffies;

			if (time_before(jiffies, expires)) {
				rt->rt_pmtu = fnhe->fnhe_pmtu;
				dst_set_expires(&rt->dst, diff);
			}
		}
		if (fnhe->fnhe_gw) {
			rt->rt_flags |= RTCF_REDIRECTED;
			rt->rt_gateway = fnhe->fnhe_gw;
		}

		/* Publish @rt as the cached route, free the old one. */
		orig = rcu_dereference(fnhe->fnhe_rth);
		rcu_assign_pointer(fnhe->fnhe_rth, rt);
		if (orig)
			rt_free(orig);

		fnhe->fnhe_stamp = jiffies;
		ret = true;
	} else {
		/* Routes we intend to cache in nexthop exception have
		 * the DST_NOCACHE bit clear.  However, if we are
		 * unsuccessful at storing this route into the cache
		 * we really need to set it.
		 */
		rt->dst.flags |= DST_NOCACHE;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}
1223
/* Try to cache @rt in @nh: input routes go into nh_rth_input, output
 * routes into this CPU's nh_pcpu_rth_output slot.  The slot is swapped
 * in with cmpxchg(); a lost race (or missing per-cpu storage) leaves
 * the route uncached and marked DST_NOCACHE.  Returns true on success.
 */
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		if (!nh->nh_pcpu_rth_output)
			goto nocache;
		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	/* Atomically install @rt; only free the old entry if we won. */
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig)
			rt_free(orig);
	} else {
		/* Routes we intend to cache in the FIB nexthop have
		 * the DST_NOCACHE bit clear.  However, if we are
		 * unsuccessful at storing this route into the cache
		 * we really need to set it.
		 */
nocache:
		rt->dst.flags |= DST_NOCACHE;
		ret = false;
	}

	return ret;
}
1255
/* Global list of routes that could not be cached in a FIB nexthop;
 * rt_flush_dev() walks it on device teardown.  Protected by
 * rt_uncached_lock (BH-safe spinlock).
 */
static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

/* Append @rt to the uncached-routes list under the list lock. */
static void rt_add_uncached_list(struct rtable *rt)
{
	spin_lock_bh(&rt_uncached_lock);
	list_add_tail(&rt->rt_uncached, &rt_uncached_list);
	spin_unlock_bh(&rt_uncached_lock);
}
1265
/* dst_ops->destroy handler: unlink an uncached route (DST_NOCACHE set
 * means it was placed on rt_uncached_list) before the dst is freed.
 */
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;

	if (dst->flags & DST_NOCACHE) {
		spin_lock_bh(&rt_uncached_lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&rt_uncached_lock);
	}
}
1276
/* Called when @dev goes away: repoint every uncached route that still
 * references it to the namespace's loopback device, transferring the
 * device reference (hold loopback, put @dev) so @dev can be released.
 */
void rt_flush_dev(struct net_device *dev)
{
	if (!list_empty(&rt_uncached_list)) {
		struct net *net = dev_net(dev);
		struct rtable *rt;

		spin_lock_bh(&rt_uncached_lock);
		list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&rt_uncached_lock);
	}
}
1294
4331debc 1295static bool rt_cache_valid(const struct rtable *rt)
d2d68ba9 1296{
4331debc
ED
1297 return rt &&
1298 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1299 !rt_is_expired(rt);
d2d68ba9
DM
1300}
1301
/* Finish constructing @rt from the FIB lookup result: copy the gateway
 * for link-scope nexthops, share the fib_info metrics, try to cache the
 * route (in the exception @fnhe when given, otherwise in the nexthop),
 * and fall back to the uncached list if caching failed.  Classid tags
 * are applied when CONFIG_IP_ROUTE_CLASSID is enabled.
 */
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = nh->nh_gw;
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr);
		else if (!(rt->dst.flags & DST_NOCACHE))
			cached = rt_cache_route(nh, rt);
	}
	/* Routes without fib_info, or that lost the caching race, must
	 * be tracked so rt_flush_dev() can find them.
	 */
	if (unlikely(!cached))
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}
1333
/* Allocate a fresh rtable on @dev with ->obsolete preset to
 * DST_OBSOLETE_FORCE_CHK.  Routes not intended for the nexthop cache
 * get DST_HOST|DST_NOCACHE up front; policy/xfrm bypass flags are set
 * from the boolean arguments.
 */
static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm, bool will_cache)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
			 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}
1342
/* Route an incoming multicast packet: validate the source, build a
 * multicast rtable on the namespace loopback device and attach it to
 * @skb.  @our is non-zero when this host is a member of the group, in
 * which case the packet is also delivered locally.
 * called in rcu_read_lock() section
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct rtable *rth;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	/* Loopback sources are martian unless route_localnet allows them. */
	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(saddr))
			goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		/* 0.0.0.0 sources are only valid for link-local groups. */
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	/* Multicast input routes must never be used for output. */
	rth->dst.output = ip_rt_bug;

	rth->rt_genid = rt_genid(dev_net(dev));
	rth->rt_flags = RTCF_MULTICAST;
	rth->rt_type = RTN_MULTICAST;
	rth->rt_is_input= 1;
	rth->rt_iif = 0;
	rth->rt_pmtu = 0;
	rth->rt_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
1413
1414
/* Account a packet with a martian (impossible) source address and,
 * when martian logging is enabled, rate-limitedly dump the flow plus
 * the link-layer header - the only forensic hint available (RFC 1812).
 */
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}
1439
/* Build (or reuse) a forwarding route for an input packet whose FIB
 * lookup result is @res.  Validates the source (martian handling),
 * decides whether to send an ICMP redirect, reuses the nexthop's
 * cached input route when valid, otherwise allocates a new rtable
 * wired for ip_forward()/ip_output() and attaches it to @skb.
 * called in rcu_read_lock() section
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	bool do_cache;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}


	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	/* Packet would leave on the interface it arrived on: candidate
	 * for an ICMP redirect (err > 0 here means strict rpf passed).
	 */
	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	/* Fast path: reuse the nexthop's cached input route. */
	do_cache = false;
	if (res->fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_is_input = 1;
	rth->rt_iif = 0;
	rth->rt_pmtu = 0;
	rth->rt_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
 cleanup:
	return err;
}
1da177e4 1529
/* Thin wrapper over __mkroute_input(): with multipath support, first
 * pick one of the multiple nexthops, then create the route entry.
 */
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
1544
/*
 * NOTE. We drop all packets that have a local source
 * address, because every properly looped-back packet
 * must already have the correct destination attached by the
 * output routine.
 *
 * Such an approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with a 100% guarantee.
 * called with rcu_read_lock()
 */
1555
/* Slow-path input routing: classify the packet (martian, broadcast,
 * local, forward), run the FIB lookup, and attach the resulting route
 * to @skb.  Local/broadcast deliveries may reuse the nexthop's cached
 * input route; otherwise a fresh rtable is built here or (for the
 * forwarding case) in ip_mkroute_input().
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4 fl4;
	unsigned int flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	int err = -EINVAL;
	struct net *net = dev_net(dev);
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res.fi = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* Loopback addresses on non-loopback devices are martian unless
	 * route_localnet is enabled on the interface.
	 */
	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
		if (ipv4_is_loopback(daddr))
			goto martian_destination;

		if (ipv4_is_loopback(saddr))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0)
		goto no_route;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto no_route;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	/* Try the nexthop's cached input route before allocating. */
	do_cache = false;
	if (res.fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	/* Local routes must never be used for output. */
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_genid = rt_genid(net);
	rth->rt_flags = flags|RTCF_LOCAL;
	rth->rt_type = res.type;
	rth->rt_is_input = 1;
	rth->rt_iif = 0;
	rth->rt_pmtu = 0;
	rth->rt_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}
	if (do_cache)
		rt_cache_route(&FIB_RES_NH(res), rth);
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
1728
/* Public input-routing entry point (does not take a dst reference on
 * the skb).  Multicast destinations are screened here against the
 * device's group membership before taking the multicast path; all
 * other traffic goes through ip_route_input_slow().  The whole lookup
 * runs under rcu_read_lock().
 */
int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			 u8 tos, struct net_device *dev)
{
	int res;

	rcu_read_lock();

	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As result the host on multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   comparing with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			/* Route it if we are a member, or (with MROUTE)
			 * if the device forwards non-link-local groups.
			 */
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_noref);
1da177e4 1774
/* Build (or reuse) an output route for the flow @fl4 given the FIB
 * lookup result @res.  Classifies the destination (broadcast /
 * multicast / unicast), tries the nexthop-exception or per-cpu cached
 * route first, and otherwise allocates and fully initializes a new
 * rtable.  Returns the route or an ERR_PTR.
 * called with rcu_read_lock()
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	/* Loopback sources may only leave via a loopback device unless
	 * route_localnet is enabled.
	 */
	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		/* Only deliver locally if we are a member of the group. */
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If multicast route do not exist use
		 * default one, but do not gateway in this case.
		 * Yes, it is hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	/* Fast path: reuse the exception's or this CPU's cached route. */
	fnhe = NULL;
	if (fi) {
		struct rtable __rcu **prth;

		fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
		if (fnhe)
			prth = &fnhe->fnhe_rth;
		else
			prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
		rth = rcu_dereference(*prth);
		if (rt_cache_valid(rth)) {
			dst_hold(&rth->dst);
			return rth;
		}
	}
	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   fi);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags = flags;
	rth->rt_type = type;
	rth->rt_is_input = 0;
	rth->rt_iif = orig_oif ? : 0;
	rth->rt_pmtu = 0;
	rth->rt_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL)
		rth->dst.input = ip_local_deliver;
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);

	return rth;
}
1879
/*
 * Major route resolver routine.
 *
 * Resolves an output route for flow @fl4 in namespace @net, filling in
 * any unspecified saddr/daddr/oif along the way, then hands off to
 * __mkroute_output().  Returns the route or an ERR_PTR; the whole
 * resolution runs under rcu_read_lock().
 */
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	__u8 tos = RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	int orig_oif;

	res.tclassid = 0;
	res.fi = NULL;
	res.table = NULL;

	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = net->loopback_dev->ifindex;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	/* RTO_ONLINK in the tos restricts the lookup to link scope. */
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic,vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	/* No destination at all: route to ourselves via loopback. */
	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		res.table = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, routing tables are wrong. Assume,
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;


make_route:
	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);

out:
	rcu_read_unlock();
	return rth;
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
2065
ae2688d5
JW
2066static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2067{
2068 return NULL;
2069}
2070
ebb762f2 2071static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 2072{
618f9bc7
SK
2073 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2074
2075 return mtu ? : dst->dev->mtu;
ec831ea7
RD
2076}
2077
6700c270
DM
2078static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2079 struct sk_buff *skb, u32 mtu)
14e50e57
DM
2080{
2081}
2082
/*
 * dst_ops.redirect for blackhole routes: deliberately a no-op —
 * ICMP redirects are ignored on a blackhole dst.
 */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}
2087
0972ddb2
HB
2088static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2089 unsigned long old)
2090{
2091 return NULL;
2092}
2093
/*
 * dst_ops vtable for IPv4 blackhole routes: every mutating hook is a
 * no-op stub so the dst can be handed out safely while discarding all
 * PMTU/redirect feedback. Only advmss and neigh lookup reuse the real
 * IPv4 implementations.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.redirect		=	ipv4_rt_blackhole_redirect,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
2105
2774c131 2106struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
14e50e57 2107{
2774c131 2108 struct rtable *ort = (struct rtable *) dst_orig;
f5b0a874 2109 struct rtable *rt;
14e50e57 2110
f5b0a874 2111 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
14e50e57 2112 if (rt) {
d8d1f30b 2113 struct dst_entry *new = &rt->dst;
14e50e57 2114
14e50e57 2115 new->__use = 1;
352e512c
HX
2116 new->input = dst_discard;
2117 new->output = dst_discard;
14e50e57 2118
d8d1f30b 2119 new->dev = ort->dst.dev;
14e50e57
DM
2120 if (new->dev)
2121 dev_hold(new->dev);
2122
9917e1e8 2123 rt->rt_is_input = ort->rt_is_input;
5e2b61f7 2124 rt->rt_iif = ort->rt_iif;
5943634f 2125 rt->rt_pmtu = ort->rt_pmtu;
14e50e57 2126
e84f84f2 2127 rt->rt_genid = rt_genid(net);
14e50e57
DM
2128 rt->rt_flags = ort->rt_flags;
2129 rt->rt_type = ort->rt_type;
14e50e57 2130 rt->rt_gateway = ort->rt_gateway;
14e50e57 2131
caacf05e
DM
2132 INIT_LIST_HEAD(&rt->rt_uncached);
2133
14e50e57
DM
2134 dst_free(new);
2135 }
2136
2774c131
DM
2137 dst_release(dst_orig);
2138
2139 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
14e50e57
DM
2140}
2141
9d6ec938 2142struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
b23dd4fe 2143 struct sock *sk)
1da177e4 2144{
9d6ec938 2145 struct rtable *rt = __ip_route_output_key(net, flp4);
1da177e4 2146
b23dd4fe
DM
2147 if (IS_ERR(rt))
2148 return rt;
1da177e4 2149
56157872 2150 if (flp4->flowi4_proto)
9d6ec938
DM
2151 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2152 flowi4_to_flowi(flp4),
2153 sk, 0);
1da177e4 2154
b23dd4fe 2155 return rt;
1da177e4 2156}
d8c97a94
ACM
2157EXPORT_SYMBOL_GPL(ip_route_output_flow);
2158
/*
 * Build an RTM_* netlink message describing the route attached to @skb
 * (skb_rtable) into that same skb. @dst/@src are the addresses from the
 * request; @fl4 is the flow used for the lookup. Returns the result of
 * nlmsg_end() on success or -EMSGSIZE if the message did not fit.
 * NOTE(review): the @nowait parameter is unused in this body.
 */
static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
			struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
			u32 seq, int event, int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	/* Fixed rtmsg header fields for a cloned (cache) route. */
	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4->flowi4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	if (nla_put_be32(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		/* A source address was requested: report a /32 source. */
		r->rtm_src_len = 32;
		if (nla_put_be32(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	/* For output routes, report the preferred source if it differs
	 * from the requested source address.
	 */
	if (!rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_gateway &&
	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	/* Copy the metrics, letting a cached PMTU override RTAX_MTU. */
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4->flowi4_mark &&
	    nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
		goto nla_put_failure;

	error = rt->dst.error;
	/* Convert the absolute expiry (jiffies) into a remaining delta;
	 * an already-expired entry is reported as 0.
	 */
	expires = rt->dst.expires;
	if (expires) {
		if (time_before(jiffies, expires))
			expires -= jiffies;
		else
			expires = 0;
	}

	if (rt_is_input_route(rt)) {
		if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
			goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	/* Roll back the partially-built message. */
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2246
/*
 * RTM_GETROUTE handler: resolve a single route for the addresses given
 * in the request and unicast the answer back to the sender. When an
 * input interface (RTA_IIF) is supplied, the route is resolved via the
 * input path (ip_route_input) on a dummy skb; otherwise via the output
 * path (ip_route_output_key).
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	/* Optional attributes default to 0 when absent. */
	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		/* Simulate reception of the packet on @dev. */
		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		/* A dst-level error (e.g. unreachable) becomes the result. */
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	/* Serialize the resolved route into the reply skb. */
	err = rt_fill_info(net, dst, src, &fl4, skb,
			   NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
2343
2344int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2345{
1da177e4
LT
2346 return skb->len;
2347}
2348
2349void ip_rt_multicast_event(struct in_device *in_dev)
2350{
76e6ebfb 2351 rt_cache_flush(dev_net(in_dev->dev), 0);
1da177e4
LT
2352}
2353
2354#ifdef CONFIG_SYSCTL
81c684d1 2355static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
8d65af78 2356 void __user *buffer,
1da177e4
LT
2357 size_t *lenp, loff_t *ppos)
2358{
2359 if (write) {
639e104f 2360 int flush_delay;
81c684d1 2361 ctl_table ctl;
39a23e75 2362 struct net *net;
639e104f 2363
81c684d1
DL
2364 memcpy(&ctl, __ctl, sizeof(ctl));
2365 ctl.data = &flush_delay;
8d65af78 2366 proc_dointvec(&ctl, write, buffer, lenp, ppos);
639e104f 2367
81c684d1 2368 net = (struct net *)__ctl->extra1;
39a23e75 2369 rt_cache_flush(net, flush_delay);
1da177e4 2370 return 0;
e905a9ed 2371 }
1da177e4
LT
2372
2373 return -EINVAL;
2374}
2375
/*
 * Static sysctl table for net.ipv4.route.*: tunables for route cache
 * GC, ICMP redirect rate limiting and PMTU handling. Interval/timeout
 * entries use the jiffies conversion handlers; plain integers use
 * proc_dointvec.
 */
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
39a23e75 2486
/*
 * Per-namespace sysctl table holding only the write-only "flush" entry;
 * .extra1 is filled with the owning struct net in sysctl_route_net_init.
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
2496
2497static __net_init int sysctl_route_net_init(struct net *net)
2498{
2499 struct ctl_table *tbl;
2500
2501 tbl = ipv4_route_flush_table;
09ad9bc7 2502 if (!net_eq(net, &init_net)) {
39a23e75
DL
2503 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2504 if (tbl == NULL)
2505 goto err_dup;
2506 }
2507 tbl[0].extra1 = net;
2508
ec8f23ce 2509 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
39a23e75
DL
2510 if (net->ipv4.route_hdr == NULL)
2511 goto err_reg;
2512 return 0;
2513
2514err_reg:
2515 if (tbl != ipv4_route_flush_table)
2516 kfree(tbl);
2517err_dup:
2518 return -ENOMEM;
2519}
2520
2521static __net_exit void sysctl_route_net_exit(struct net *net)
2522{
2523 struct ctl_table *tbl;
2524
2525 tbl = net->ipv4.route_hdr->ctl_table_arg;
2526 unregister_net_sysctl_table(net->ipv4.route_hdr);
2527 BUG_ON(tbl == ipv4_route_flush_table);
2528 kfree(tbl);
2529}
2530
/* pernet hooks wiring the route sysctls into namespace create/destroy. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
1da177e4
LT
2535#endif
2536
3ee94372 2537static __net_init int rt_genid_init(struct net *net)
9f5e97e5 2538{
3ee94372
NH
2539 get_random_bytes(&net->ipv4.rt_genid,
2540 sizeof(net->ipv4.rt_genid));
436c3b66
DM
2541 get_random_bytes(&net->ipv4.dev_addr_genid,
2542 sizeof(net->ipv4.dev_addr_genid));
9f5e97e5
DL
2543 return 0;
2544}
2545
/* pernet hook: only namespace init is needed for the genid seeds. */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
2549
c3426b47
DM
2550static int __net_init ipv4_inetpeer_init(struct net *net)
2551{
2552 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2553
2554 if (!bp)
2555 return -ENOMEM;
2556 inet_peer_base_init(bp);
2557 net->ipv4.peers = bp;
2558 return 0;
2559}
2560
2561static void __net_exit ipv4_inetpeer_exit(struct net *net)
2562{
2563 struct inet_peer_base *bp = net->ipv4.peers;
2564
2565 net->ipv4.peers = NULL;
56a6b248 2566 inetpeer_invalidate_tree(bp);
c3426b47
DM
2567 kfree(bp);
2568}
2569
/* pernet hooks for per-namespace inetpeer base lifetime. */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};
9f5e97e5 2574
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu route classid accounting array; allocated in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4 2578
/*
 * Boot-time initialization of the IPv4 routing layer: allocates the dst
 * slab and entry counters, initializes devinet/fib, registers the proc
 * files, the RTM_GETROUTE handler and the pernet subsystems. Failures
 * in the allocation steps are fatal (panic); the proc failure is only
 * logged. Ordering here is deliberate — later steps depend on earlier
 * state (e.g. the blackhole ops share the dst kmem cache).
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	/* One 256-entry accounting array per cpu. */
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole routes share the same slab cache. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/* Effectively disable dst GC thresholds at this stage. */
	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}
2622
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
/* Registers the static (non-pernet) net.ipv4.route sysctl table early. */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif