]> git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/blame - net/ipv4/route.c
Merge tag 'omap-fixes-a-for-3.10-rc' of git://git.kernel.org/pub/scm/linux/kernel...
[mirror_ubuntu-artful-kernel.git] / net / ipv4 / route.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
e905a9ed 21 * Alan Cox : Super /proc >4K
1da177e4
LT
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
e905a9ed 39 *
1da177e4
LT
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
afd46503
JP
65#define pr_fmt(fmt) "IPv4: " fmt
66
1da177e4
LT
67#include <linux/module.h>
68#include <asm/uaccess.h>
1da177e4
LT
69#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
1da177e4
LT
72#include <linux/mm.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/skbuff.h>
1da177e4
LT
83#include <linux/inetdevice.h>
84#include <linux/igmp.h>
85#include <linux/pkt_sched.h>
86#include <linux/mroute.h>
87#include <linux/netfilter_ipv4.h>
88#include <linux/random.h>
1da177e4
LT
89#include <linux/rcupdate.h>
90#include <linux/times.h>
5a0e3ad6 91#include <linux/slab.h>
352e512c 92#include <net/dst.h>
457c4cbc 93#include <net/net_namespace.h>
1da177e4
LT
94#include <net/protocol.h>
95#include <net/ip.h>
96#include <net/route.h>
97#include <net/inetpeer.h>
98#include <net/sock.h>
99#include <net/ip_fib.h>
100#include <net/arp.h>
101#include <net/tcp.h>
102#include <net/icmp.h>
103#include <net/xfrm.h>
8d71740c 104#include <net/netevent.h>
63f3444f 105#include <net/rtnetlink.h>
1da177e4
LT
106#ifdef CONFIG_SYSCTL
107#include <linux/sysctl.h>
7426a564 108#include <linux/kmemleak.h>
1da177e4 109#endif
6e5714ea 110#include <net/secure_seq.h>
1da177e4 111
68a5e3dd 112#define RT_FL_TOS(oldflp4) \
f61759e6 113 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
1da177e4
LT
114
115#define IP_MAX_MTU 0xFFF0
116
117#define RT_GC_TIMEOUT (300*HZ)
118
1da177e4 119static int ip_rt_max_size;
817bc4db
SH
120static int ip_rt_redirect_number __read_mostly = 9;
121static int ip_rt_redirect_load __read_mostly = HZ / 50;
122static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
123static int ip_rt_error_cost __read_mostly = HZ;
124static int ip_rt_error_burst __read_mostly = 5 * HZ;
817bc4db
SH
125static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
126static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
127static int ip_rt_min_advmss __read_mostly = 256;
9f28a2fc 128
1da177e4
LT
129/*
130 * Interface to generic destination cache.
131 */
132
133static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
0dbaee3b 134static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
ebb762f2 135static unsigned int ipv4_mtu(const struct dst_entry *dst);
1da177e4
LT
136static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
137static void ipv4_link_failure(struct sk_buff *skb);
6700c270
DM
138static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
139 struct sk_buff *skb, u32 mtu);
140static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
141 struct sk_buff *skb);
caacf05e 142static void ipv4_dst_destroy(struct dst_entry *dst);
1da177e4 143
72cdd1d9
ED
/* dst_ops->ifdown hook: nothing to do for IPv4 routes; cleanup happens
 * via the generic dst/genid machinery instead.
 */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
1da177e4 148
62fa8a84
DM
/* dst_ops->cow_metrics hook.  IPv4 routes never copy-on-write their
 * metrics through this path, so reaching it indicates a bug: warn once
 * and return NULL.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}
154
f894cbf8
DM
155static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
156 struct sk_buff *skb,
157 const void *daddr);
d3aaeb38 158
1da177e4
LT
159static struct dst_ops ipv4_dst_ops = {
160 .family = AF_INET,
09640e63 161 .protocol = cpu_to_be16(ETH_P_IP),
1da177e4 162 .check = ipv4_dst_check,
0dbaee3b 163 .default_advmss = ipv4_default_advmss,
ebb762f2 164 .mtu = ipv4_mtu,
62fa8a84 165 .cow_metrics = ipv4_cow_metrics,
caacf05e 166 .destroy = ipv4_dst_destroy,
1da177e4
LT
167 .ifdown = ipv4_dst_ifdown,
168 .negative_advice = ipv4_negative_advice,
169 .link_failure = ipv4_link_failure,
170 .update_pmtu = ip_rt_update_pmtu,
e47a185b 171 .redirect = ip_do_redirect,
1ac06e03 172 .local_out = __ip_local_out,
d3aaeb38 173 .neigh_lookup = ipv4_neigh_lookup,
1da177e4
LT
174};
175
176#define ECN_OR_COST(class) TC_PRIO_##class
177
4839c52b 178const __u8 ip_tos2prio[16] = {
1da177e4 179 TC_PRIO_BESTEFFORT,
4a2b9c37 180 ECN_OR_COST(BESTEFFORT),
1da177e4
LT
181 TC_PRIO_BESTEFFORT,
182 ECN_OR_COST(BESTEFFORT),
183 TC_PRIO_BULK,
184 ECN_OR_COST(BULK),
185 TC_PRIO_BULK,
186 ECN_OR_COST(BULK),
187 TC_PRIO_INTERACTIVE,
188 ECN_OR_COST(INTERACTIVE),
189 TC_PRIO_INTERACTIVE,
190 ECN_OR_COST(INTERACTIVE),
191 TC_PRIO_INTERACTIVE_BULK,
192 ECN_OR_COST(INTERACTIVE_BULK),
193 TC_PRIO_INTERACTIVE_BULK,
194 ECN_OR_COST(INTERACTIVE_BULK)
195};
d4a96865 196EXPORT_SYMBOL(ip_tos2prio);
1da177e4 197
2f970d83 198static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
27f39c73 199#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
1da177e4 200
1da177e4 201#ifdef CONFIG_PROC_FS
1da177e4
LT
202static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
203{
29e75252 204 if (*pos)
89aef892 205 return NULL;
29e75252 206 return SEQ_START_TOKEN;
1da177e4
LT
207}
208
209static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
210{
1da177e4 211 ++*pos;
89aef892 212 return NULL;
1da177e4
LT
213}
214
/* seq_file stop: no iteration state to release. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
218
/* Emit the legacy /proc/net/rt_cache header; there are no entry rows
 * any more, only the header is printed (padded to the historical
 * 127-column width for ABI compatibility).
 */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}
228
f690808e 229static const struct seq_operations rt_cache_seq_ops = {
1da177e4
LT
230 .start = rt_cache_seq_start,
231 .next = rt_cache_seq_next,
232 .stop = rt_cache_seq_stop,
233 .show = rt_cache_seq_show,
234};
235
/* open() for /proc/net/rt_cache: plain seq_file, no private state. */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}
240
9a32144e 241static const struct file_operations rt_cache_seq_fops = {
1da177e4
LT
242 .owner = THIS_MODULE,
243 .open = rt_cache_seq_open,
244 .read = seq_read,
245 .llseek = seq_lseek,
89aef892 246 .release = seq_release,
1da177e4
LT
247};
248
249
/* seq_file start for /proc/net/stat/rt_cache: position 0 is the header
 * (SEQ_START_TOKEN); positions >= 1 map to possible CPU ids offset by
 * one, skipping holes in the CPU numbering.
 */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	/* *pos - 1 is the first candidate CPU; advance past impossible ids */
	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
265
266static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
267{
268 int cpu;
269
0f23174a 270 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
271 if (!cpu_possible(cpu))
272 continue;
273 *pos = cpu+1;
2f970d83 274 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
275 }
276 return NULL;
e905a9ed 277
1da177e4
LT
278}
279
280static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281{
282
283}
284
/* Print one row of /proc/net/stat/rt_cache: either the header line or
 * the counters of one CPU.  The first column ("entries") is the global
 * dst count, repeated on every row by design.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
318
f690808e 319static const struct seq_operations rt_cpu_seq_ops = {
1da177e4
LT
320 .start = rt_cpu_seq_start,
321 .next = rt_cpu_seq_next,
322 .stop = rt_cpu_seq_stop,
323 .show = rt_cpu_seq_show,
324};
325
326
/* open() for /proc/net/stat/rt_cache. */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}
331
9a32144e 332static const struct file_operations rt_cpu_seq_fops = {
1da177e4
LT
333 .owner = THIS_MODULE,
334 .open = rt_cpu_seq_open,
335 .read = seq_read,
336 .llseek = seq_lseek,
337 .release = seq_release,
338};
339
c7066f70 340#ifdef CONFIG_IP_ROUTE_CLASSID
/* Sum the 256 per-class route-accounting counters across all possible
 * CPUs into a temporary array and dump it as raw binary to
 * /proc/net/rt_acct.  Only built with CONFIG_IP_ROUTE_CLASSID.
 */
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	/* binary dump, not text: userspace tools parse the raw structs */
	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
78c686e9 364
a661c419
AD
/* open() for /proc/net/rt_acct (single_open: whole file in one show). */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}
a661c419
AD
369
370static const struct file_operations rt_acct_proc_fops = {
371 .owner = THIS_MODULE,
372 .open = rt_acct_proc_open,
373 .read = seq_read,
374 .llseek = seq_lseek,
375 .release = single_release,
376};
78c686e9 377#endif
107f1634 378
/* Per-netns proc setup: creates /proc/net/rt_cache,
 * /proc/net/stat/rt_cache and (with CONFIG_IP_ROUTE_CLASSID)
 * /proc/net/rt_acct.  On failure, unwinds whatever was created and
 * returns -ENOMEM.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
			  &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
73b38711
DL
409
/* Per-netns proc teardown: mirror of ip_rt_do_proc_init(). */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
418
419static struct pernet_operations ip_rt_proc_ops __net_initdata = {
420 .init = ip_rt_do_proc_init,
421 .exit = ip_rt_do_proc_exit,
422};
423
/* Register the per-netns proc init/exit hooks at boot. */
static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
428
107f1634 429#else
/* !CONFIG_PROC_FS stub: nothing to register. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
1da177e4 434#endif /* CONFIG_PROC_FS */
e905a9ed 435
/* A cached route is stale when its generation id no longer matches the
 * namespace-wide route genid (bumped by rt_cache_flush()).
 */
static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
440
/* Invalidate all cached routes in @net by bumping the generation id;
 * stale entries are then rejected lazily by rt_is_expired().
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump(net);
}
445
f894cbf8
DM
/* dst_ops->neigh_lookup hook: resolve the ARP neighbour for a route.
 * Key preference: the route's gateway if set, else the packet's
 * destination address, else the raw @daddr the caller supplied.
 * Falls back to creating a new neighbour entry when none is cached.
 */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}
466
1da177e4
LT
467/*
468 * Peer allocation may fail only in serious out-of-memory conditions. However
469 * we still can generate some output.
470 * Random ID selection looks a bit dangerous because we have no chances to
471 * select ID being unique in a reasonable period of time.
472 * But broken packet identifier may be better than no packet at all.
473 */
/*
 * Peer allocation may fail only in serious out-of-memory conditions. However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chances to
 * select ID being unique in a reasonable period of time.
 * But broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	/* mix the previous id with the destination through the secure
	 * hash so ids are hard to predict across destinations */
	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
486
/* Choose the IP header identification for a locally generated packet:
 * prefer the per-destination counter kept in the inet_peer cache, and
 * fall back to the hashed global counter when no peer can be allocated.
 */
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct net *net = dev_net(dst->dev);
	struct inet_peer *peer;

	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
	if (peer) {
		iph->id = htons(inet_getid(peer, more));
		inet_putpeer(peer);
		return;
	}

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
1da177e4 502
/* Fill @fl4 for a route lookup.  If a socket is given, its bound
 * device, mark, TOS and protocol override the caller-supplied values so
 * the lookup matches what the socket would actually emit.
 */
static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0);
}
521
5abf7f7e
ED
/* Build a flow key from a received packet's IP header and input device. */
static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}
533
/* Build a flow key purely from socket state (no packet available).
 * Honours source routing: with an SRR option the first-hop address
 * replaces the connected destination.
 */
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0);
	rcu_read_unlock();
}
551
5abf7f7e
ED
/* Build a flow key from whichever source is available: the packet when
 * one exists, otherwise the socket's connection state.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb)
		build_sk_flow_key(fl4, sk);
	else
		build_skb_flow_key(fl4, skb, sk);
}
560
c5038a83
DM
/* Free a cached route after an RCU grace period (readers may still
 * hold rcu_read_lock references).
 */
static inline void rt_free(struct rtable *rt)
{
	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}
565
566static DEFINE_SPINLOCK(fnhe_lock);
4895c771 567
/* Pick the least-recently-stamped exception in a full hash chain so it
 * can be recycled.  Caller holds fnhe_lock and guarantees the chain is
 * non-empty (only called when depth > FNHE_RECLAIM_DEPTH).  Any cached
 * route attached to the victim is released first.
 */
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;
	struct rtable *orig;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	orig = rcu_dereference(oldest->fnhe_rth);
	if (orig) {
		RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
		rt_free(orig);
	}
	return oldest;
}
586
d3a25c98
DM
587static inline u32 fnhe_hashfun(__be32 daddr)
588{
589 u32 hval;
590
591 hval = (__force u32) daddr;
592 hval ^= (hval >> 11) ^ (hval >> 22);
593
594 return hval & (FNHE_HASH_SIZE - 1);
595}
596
aee06da6
JA
/* Record (or refresh) a per-destination next-hop exception learned from
 * an ICMP redirect (@gw) or PMTU update (@pmtu/@expires) on @nh.
 * Allocates the hash table lazily; recycles the oldest entry when a
 * chain exceeds FNHE_RECLAIM_DEPTH.  Serialised by fnhe_lock; GFP_ATOMIC
 * failures are silently ignored (exception is best-effort state).
 */
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	int depth;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = nh->nh_exceptions;
	if (!hash) {
		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		nh->nh_exceptions = hash;
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		/* existing entry: update only the fields the caller set */
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_expires = expires;
		}
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_expires = expires;
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
	return;
}
655
ceb33206
DM
/* Process an ICMP redirect for route @rt.  Validates the redirect
 * (code, matching old gateway, sysctl policy, sane new gateway), then
 * records the new gateway as a next-hop exception once the neighbour is
 * reachable.  @kill_route marks the current dst obsolete so lookups
 * revalidate.  Bogus redirects are logged (rate-limited) and dropped.
 */
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	/* only the gateway we are actually using may redirect us */
	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		/* new gateway must be directly reachable */
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
	if (n) {
		if (!(n->nud_state & NUD_VALID)) {
			/* not resolved yet: kick ARP, apply redirect later */
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res) == 0) {
				struct fib_nh *nh = &FIB_RES_NH(res);

				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						      0, 0);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}
735
4895c771
DM
736static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
737{
738 struct rtable *rt;
739 struct flowi4 fl4;
740
741 rt = (struct rtable *) dst;
742
743 ip_rt_build_flow_key(&fl4, sk, skb);
ceb33206 744 __ip_do_redirect(rt, skb, &fl4, true);
4895c771
DM
745}
746
1da177e4
LT
/* dst_ops->negative_advice hook: drop our reference and return NULL
 * when the route is obsolete, was learned via redirect, or carries an
 * expiry — the caller will then perform a fresh lookup.
 */
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}
764
765/*
766 * Algorithm:
767 * 1. The first ip_rt_redirect_number redirects are sent
768 * with exponential backoff, then we stop sending them at all,
769 * assuming that the host ignores our redirects.
770 * 2. If we did not see packets requiring redirects
771 * during ip_rt_redirect_silence, we assume that the host
772 * forgot redirected route and start to send redirects again.
773 *
774 * This algorithm is much cheaper and more intelligent than dumb load limiting
775 * in icmp.c.
776 *
777 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
778 * and "frag. need" (breaks PMTU discovery) in icmp.c.
779 */
780
/*
 * Algorithm:
 * 1. The first ip_rt_redirect_number redirects are sent
 *    with exponential backoff, then we stop sending them at all,
 *    assuming that the host ignores our redirects.
 * 2. If we did not see packets requiring redirects
 *    during ip_rt_redirect_silence, we assume that the host
 *    forgot redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
	if (!peer) {
		/* no peer state available: send unthrottled */
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}
843
844static int ip_error(struct sk_buff *skb)
845{
251da413 846 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
511c3f92 847 struct rtable *rt = skb_rtable(skb);
92d86829 848 struct inet_peer *peer;
1da177e4 849 unsigned long now;
251da413 850 struct net *net;
92d86829 851 bool send;
1da177e4
LT
852 int code;
853
251da413
DM
854 net = dev_net(rt->dst.dev);
855 if (!IN_DEV_FORWARD(in_dev)) {
856 switch (rt->dst.error) {
857 case EHOSTUNREACH:
858 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
859 break;
860
861 case ENETUNREACH:
862 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
863 break;
864 }
865 goto out;
866 }
867
d8d1f30b 868 switch (rt->dst.error) {
4500ebf8
JP
869 case EINVAL:
870 default:
871 goto out;
872 case EHOSTUNREACH:
873 code = ICMP_HOST_UNREACH;
874 break;
875 case ENETUNREACH:
876 code = ICMP_NET_UNREACH;
251da413 877 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
4500ebf8
JP
878 break;
879 case EACCES:
880 code = ICMP_PKT_FILTERED;
881 break;
1da177e4
LT
882 }
883
1d861aa4 884 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
92d86829
DM
885
886 send = true;
887 if (peer) {
888 now = jiffies;
889 peer->rate_tokens += now - peer->rate_last;
890 if (peer->rate_tokens > ip_rt_error_burst)
891 peer->rate_tokens = ip_rt_error_burst;
892 peer->rate_last = now;
893 if (peer->rate_tokens >= ip_rt_error_cost)
894 peer->rate_tokens -= ip_rt_error_cost;
895 else
896 send = false;
1d861aa4 897 inet_putpeer(peer);
1da177e4 898 }
92d86829
DM
899 if (send)
900 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1da177e4
LT
901
902out: kfree_skb(skb);
903 return 0;
e905a9ed 904}
1da177e4 905
/* Apply a learned path MTU to @rt: clamp to ip_rt_min_pmtu, ignore it
 * if the metric is locked or not smaller than the device MTU, then
 * store it as a next-hop exception so future lookups inherit it.
 * A route with no prior rt_pmtu is killed so callers re-lookup.
 */
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_result res;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	/* only a reduction below the device MTU is meaningful */
	if (dst->dev->mtu < mtu)
		return;

	if (mtu < ip_rt_min_pmtu)
		mtu = ip_rt_min_pmtu;

	if (!rt->rt_pmtu) {
		dst->obsolete = DST_OBSOLETE_KILL;
	} else {
		rt->rt_pmtu = mtu;
		dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
	}

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}
936
4895c771
DM
937static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
938 struct sk_buff *skb, u32 mtu)
939{
940 struct rtable *rt = (struct rtable *) dst;
941 struct flowi4 fl4;
942
943 ip_rt_build_flow_key(&fl4, sk, skb);
d851c12b 944 __ip_rt_update_pmtu(rt, &fl4, mtu);
4895c771
DM
945}
946
36393395
DM
/* Update the cached path MTU for the route an ICMP FRAG_NEEDED message
 * refers to; @skb->data must point at the embedded original IP header.
 */
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
963
/* Socket variant of ipv4_update_pmtu(): the route is looked up with the
 * socket's flow parameters instead of caller-supplied ones.
 */
static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
9cb3a50c
SK
977
/* Apply a PMTU update directly to a socket's cached route.  Falls back
 * to a plain lookup when the socket is owned by user context or has no
 * cached dst.  If the update kills the route, a fresh one is looked up
 * and installed on the socket; @new tracks whether we hold a reference
 * that must either be installed or released.
 */
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *dst;
	bool new = false;

	bh_lock_sock(sk);
	rt = (struct rtable *) __sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !rt) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!__sk_dst_check(sk, 0)) {
		/* cached dst is already invalid: get a fresh route */
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	/* update the underlying (possibly xfrm-wrapped) IPv4 route */
	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

	dst = dst_check(&rt->dst, 0);
	if (!dst) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		__sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
f39925db 1025
b42597e2
DM
/* Apply an ICMP redirect (carried in @skb) to the matching route,
 * without invalidating the dst (kill_route = false): used by callers
 * that only want the next-hop exception recorded.
 */
void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);
1042
1043void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1044{
4895c771
DM
1045 const struct iphdr *iph = (const struct iphdr *) skb->data;
1046 struct flowi4 fl4;
1047 struct rtable *rt;
b42597e2 1048
4895c771
DM
1049 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1050 rt = __ip_route_output_key(sock_net(sk), &fl4);
1051 if (!IS_ERR(rt)) {
ceb33206 1052 __ip_do_redirect(rt, skb, &fl4, false);
4895c771
DM
1053 ip_rt_put(rt);
1054 }
b42597e2
DM
1055}
1056EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1057
efbc368d
DM
1058static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1059{
1060 struct rtable *rt = (struct rtable *) dst;
1061
ceb33206
DM
1062 /* All IPV4 dsts are created with ->obsolete set to the value
1063 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1064 * into this function always.
1065 *
1066 * When a PMTU/redirect information update invalidates a
1067 * route, this is indicated by setting obsolete to
1068 * DST_OBSOLETE_KILL.
1069 */
1070 if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
efbc368d 1071 return NULL;
d11a4dc1 1072 return dst;
1da177e4
LT
1073}
1074
1da177e4
LT
1075static void ipv4_link_failure(struct sk_buff *skb)
1076{
1077 struct rtable *rt;
1078
1079 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1080
511c3f92 1081 rt = skb_rtable(skb);
5943634f
DM
1082 if (rt)
1083 dst_set_expires(&rt->dst, 0);
1da177e4
LT
1084}
1085
1086static int ip_rt_bug(struct sk_buff *skb)
1087{
91df42be
JP
1088 pr_debug("%s: %pI4 -> %pI4, %s\n",
1089 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1090 skb->dev ? skb->dev->name : "?");
1da177e4 1091 kfree_skb(skb);
c378a9c0 1092 WARN_ON(1);
1da177e4
LT
1093 return 0;
1094}
1095
1096/*
1097 We do not cache source address of outgoing interface,
1098 because it is used only by IP RR, TS and SRR options,
1099 so that it out of fast path.
1100
1101 BTW remember: "addr" is allowed to be not aligned
1102 in IP options!
1103 */
1104
8e36360a 1105void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1da177e4 1106{
a61ced5d 1107 __be32 src;
1da177e4 1108
c7537967 1109 if (rt_is_output_route(rt))
c5be24ff 1110 src = ip_hdr(skb)->saddr;
ebc0ffae 1111 else {
8e36360a
DM
1112 struct fib_result res;
1113 struct flowi4 fl4;
1114 struct iphdr *iph;
1115
1116 iph = ip_hdr(skb);
1117
1118 memset(&fl4, 0, sizeof(fl4));
1119 fl4.daddr = iph->daddr;
1120 fl4.saddr = iph->saddr;
b0fe4a31 1121 fl4.flowi4_tos = RT_TOS(iph->tos);
8e36360a
DM
1122 fl4.flowi4_oif = rt->dst.dev->ifindex;
1123 fl4.flowi4_iif = skb->dev->ifindex;
1124 fl4.flowi4_mark = skb->mark;
5e2b61f7 1125
ebc0ffae 1126 rcu_read_lock();
68a5e3dd 1127 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
436c3b66 1128 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
ebc0ffae 1129 else
f8126f1d
DM
1130 src = inet_select_addr(rt->dst.dev,
1131 rt_nexthop(rt, iph->daddr),
1132 RT_SCOPE_UNIVERSE);
ebc0ffae
ED
1133 rcu_read_unlock();
1134 }
1da177e4
LT
1135 memcpy(addr, &src, 4);
1136}
1137
c7066f70 1138#ifdef CONFIG_IP_ROUTE_CLASSID
1da177e4
LT
1139static void set_class_tag(struct rtable *rt, u32 tag)
1140{
d8d1f30b
CG
1141 if (!(rt->dst.tclassid & 0xFFFF))
1142 rt->dst.tclassid |= tag & 0xFFFF;
1143 if (!(rt->dst.tclassid & 0xFFFF0000))
1144 rt->dst.tclassid |= tag & 0xFFFF0000;
1da177e4
LT
1145}
1146#endif
1147
0dbaee3b
DM
1148static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1149{
1150 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1151
1152 if (advmss == 0) {
1153 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1154 ip_rt_min_advmss);
1155 if (advmss > 65535 - 40)
1156 advmss = 65535 - 40;
1157 }
1158 return advmss;
1159}
1160
ebb762f2 1161static unsigned int ipv4_mtu(const struct dst_entry *dst)
d33e4553 1162{
261663b0 1163 const struct rtable *rt = (const struct rtable *) dst;
5943634f
DM
1164 unsigned int mtu = rt->rt_pmtu;
1165
98d75c37 1166 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
5943634f 1167 mtu = dst_metric_raw(dst, RTAX_MTU);
618f9bc7 1168
38d523e2 1169 if (mtu)
618f9bc7
SK
1170 return mtu;
1171
1172 mtu = dst->dev->mtu;
d33e4553
DM
1173
1174 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
155e8336 1175 if (rt->rt_uses_gateway && mtu > 576)
d33e4553
DM
1176 mtu = 576;
1177 }
1178
1179 if (mtu > IP_MAX_MTU)
1180 mtu = IP_MAX_MTU;
1181
1182 return mtu;
1183}
1184
f2bb4bed 1185static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
4895c771
DM
1186{
1187 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1188 struct fib_nh_exception *fnhe;
1189 u32 hval;
1190
f2bb4bed
DM
1191 if (!hash)
1192 return NULL;
1193
d3a25c98 1194 hval = fnhe_hashfun(daddr);
4895c771
DM
1195
1196 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1197 fnhe = rcu_dereference(fnhe->fnhe_next)) {
f2bb4bed
DM
1198 if (fnhe->fnhe_daddr == daddr)
1199 return fnhe;
1200 }
1201 return NULL;
1202}
aee06da6 1203
caacf05e 1204static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
f2bb4bed
DM
1205 __be32 daddr)
1206{
caacf05e
DM
1207 bool ret = false;
1208
c5038a83 1209 spin_lock_bh(&fnhe_lock);
f2bb4bed 1210
c5038a83 1211 if (daddr == fnhe->fnhe_daddr) {
13d82bf5
SK
1212 struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
1213 if (orig && rt_is_expired(orig)) {
1214 fnhe->fnhe_gw = 0;
1215 fnhe->fnhe_pmtu = 0;
1216 fnhe->fnhe_expires = 0;
1217 }
c5038a83
DM
1218 if (fnhe->fnhe_pmtu) {
1219 unsigned long expires = fnhe->fnhe_expires;
1220 unsigned long diff = expires - jiffies;
1221
1222 if (time_before(jiffies, expires)) {
1223 rt->rt_pmtu = fnhe->fnhe_pmtu;
1224 dst_set_expires(&rt->dst, diff);
1225 }
1226 }
1227 if (fnhe->fnhe_gw) {
1228 rt->rt_flags |= RTCF_REDIRECTED;
1229 rt->rt_gateway = fnhe->fnhe_gw;
155e8336
JA
1230 rt->rt_uses_gateway = 1;
1231 } else if (!rt->rt_gateway)
1232 rt->rt_gateway = daddr;
f2bb4bed 1233
c5038a83
DM
1234 rcu_assign_pointer(fnhe->fnhe_rth, rt);
1235 if (orig)
1236 rt_free(orig);
1237
1238 fnhe->fnhe_stamp = jiffies;
caacf05e 1239 ret = true;
c5038a83
DM
1240 }
1241 spin_unlock_bh(&fnhe_lock);
caacf05e
DM
1242
1243 return ret;
54764bb6
ED
1244}
1245
caacf05e 1246static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
f2bb4bed 1247{
d26b3a7c 1248 struct rtable *orig, *prev, **p;
caacf05e 1249 bool ret = true;
f2bb4bed 1250
d26b3a7c 1251 if (rt_is_input_route(rt)) {
54764bb6 1252 p = (struct rtable **)&nh->nh_rth_input;
d26b3a7c 1253 } else {
d26b3a7c
ED
1254 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1255 }
f2bb4bed
DM
1256 orig = *p;
1257
1258 prev = cmpxchg(p, orig, rt);
1259 if (prev == orig) {
f2bb4bed 1260 if (orig)
54764bb6 1261 rt_free(orig);
155e8336 1262 } else
caacf05e 1263 ret = false;
caacf05e
DM
1264
1265 return ret;
1266}
1267
1268static DEFINE_SPINLOCK(rt_uncached_lock);
1269static LIST_HEAD(rt_uncached_list);
1270
1271static void rt_add_uncached_list(struct rtable *rt)
1272{
1273 spin_lock_bh(&rt_uncached_lock);
1274 list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1275 spin_unlock_bh(&rt_uncached_lock);
1276}
1277
1278static void ipv4_dst_destroy(struct dst_entry *dst)
1279{
1280 struct rtable *rt = (struct rtable *) dst;
1281
78df76a0 1282 if (!list_empty(&rt->rt_uncached)) {
caacf05e
DM
1283 spin_lock_bh(&rt_uncached_lock);
1284 list_del(&rt->rt_uncached);
1285 spin_unlock_bh(&rt_uncached_lock);
1286 }
1287}
1288
1289void rt_flush_dev(struct net_device *dev)
1290{
1291 if (!list_empty(&rt_uncached_list)) {
1292 struct net *net = dev_net(dev);
1293 struct rtable *rt;
1294
1295 spin_lock_bh(&rt_uncached_lock);
1296 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1297 if (rt->dst.dev != dev)
1298 continue;
1299 rt->dst.dev = net->loopback_dev;
1300 dev_hold(rt->dst.dev);
1301 dev_put(dev);
1302 }
1303 spin_unlock_bh(&rt_uncached_lock);
4895c771
DM
1304 }
1305}
1306
4331debc 1307static bool rt_cache_valid(const struct rtable *rt)
d2d68ba9 1308{
4331debc
ED
1309 return rt &&
1310 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1311 !rt_is_expired(rt);
d2d68ba9
DM
1312}
1313
f2bb4bed 1314static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
5e2b61f7 1315 const struct fib_result *res,
f2bb4bed 1316 struct fib_nh_exception *fnhe,
982721f3 1317 struct fib_info *fi, u16 type, u32 itag)
1da177e4 1318{
caacf05e
DM
1319 bool cached = false;
1320
1da177e4 1321 if (fi) {
4895c771
DM
1322 struct fib_nh *nh = &FIB_RES_NH(*res);
1323
155e8336 1324 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
4895c771 1325 rt->rt_gateway = nh->nh_gw;
155e8336
JA
1326 rt->rt_uses_gateway = 1;
1327 }
2860583f 1328 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
c7066f70 1329#ifdef CONFIG_IP_ROUTE_CLASSID
f2bb4bed 1330 rt->dst.tclassid = nh->nh_tclassid;
1da177e4 1331#endif
c5038a83 1332 if (unlikely(fnhe))
caacf05e 1333 cached = rt_bind_exception(rt, fnhe, daddr);
c5038a83 1334 else if (!(rt->dst.flags & DST_NOCACHE))
caacf05e 1335 cached = rt_cache_route(nh, rt);
155e8336
JA
1336 if (unlikely(!cached)) {
1337 /* Routes we intend to cache in nexthop exception or
1338 * FIB nexthop have the DST_NOCACHE bit clear.
1339 * However, if we are unsuccessful at storing this
1340 * route into the cache we really need to set it.
1341 */
1342 rt->dst.flags |= DST_NOCACHE;
1343 if (!rt->rt_gateway)
1344 rt->rt_gateway = daddr;
1345 rt_add_uncached_list(rt);
1346 }
1347 } else
caacf05e 1348 rt_add_uncached_list(rt);
defb3519 1349
c7066f70 1350#ifdef CONFIG_IP_ROUTE_CLASSID
1da177e4 1351#ifdef CONFIG_IP_MULTIPLE_TABLES
85b91b03 1352 set_class_tag(rt, res->tclassid);
1da177e4
LT
1353#endif
1354 set_class_tag(rt, itag);
1355#endif
1da177e4
LT
1356}
1357
5c1e6aa3 1358static struct rtable *rt_dst_alloc(struct net_device *dev,
f2bb4bed 1359 bool nopolicy, bool noxfrm, bool will_cache)
0c4dcd58 1360{
f5b0a874 1361 return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
c6cffba4 1362 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
5c1e6aa3
DM
1363 (nopolicy ? DST_NOPOLICY : 0) |
1364 (noxfrm ? DST_NOXFRM : 0));
0c4dcd58
DM
1365}
1366
96d36220 1367/* called in rcu_read_lock() section */
9e12bb22 1368static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
1369 u8 tos, struct net_device *dev, int our)
1370{
1da177e4 1371 struct rtable *rth;
96d36220 1372 struct in_device *in_dev = __in_dev_get_rcu(dev);
1da177e4 1373 u32 itag = 0;
b5f7e755 1374 int err;
1da177e4
LT
1375
1376 /* Primary sanity checks. */
1377
1378 if (in_dev == NULL)
1379 return -EINVAL;
1380
1e637c74 1381 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
d0daebc3 1382 skb->protocol != htons(ETH_P_IP))
1da177e4
LT
1383 goto e_inval;
1384
d0daebc3
TG
1385 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1386 if (ipv4_is_loopback(saddr))
1387 goto e_inval;
1388
f97c1e0c
JP
1389 if (ipv4_is_zeronet(saddr)) {
1390 if (!ipv4_is_local_multicast(daddr))
1da177e4 1391 goto e_inval;
b5f7e755 1392 } else {
9e56e380
DM
1393 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1394 in_dev, &itag);
b5f7e755
ED
1395 if (err < 0)
1396 goto e_err;
1397 }
4e7b2f14 1398 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
f2bb4bed 1399 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1da177e4
LT
1400 if (!rth)
1401 goto e_nobufs;
1402
cf911662
DM
1403#ifdef CONFIG_IP_ROUTE_CLASSID
1404 rth->dst.tclassid = itag;
1405#endif
d8d1f30b 1406 rth->dst.output = ip_rt_bug;
1da177e4 1407
cf911662
DM
1408 rth->rt_genid = rt_genid(dev_net(dev));
1409 rth->rt_flags = RTCF_MULTICAST;
1410 rth->rt_type = RTN_MULTICAST;
9917e1e8 1411 rth->rt_is_input= 1;
13378cad 1412 rth->rt_iif = 0;
5943634f 1413 rth->rt_pmtu = 0;
f8126f1d 1414 rth->rt_gateway = 0;
155e8336 1415 rth->rt_uses_gateway = 0;
caacf05e 1416 INIT_LIST_HEAD(&rth->rt_uncached);
1da177e4 1417 if (our) {
d8d1f30b 1418 rth->dst.input= ip_local_deliver;
1da177e4
LT
1419 rth->rt_flags |= RTCF_LOCAL;
1420 }
1421
1422#ifdef CONFIG_IP_MROUTE
f97c1e0c 1423 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
d8d1f30b 1424 rth->dst.input = ip_mr_input;
1da177e4
LT
1425#endif
1426 RT_CACHE_STAT_INC(in_slow_mc);
1427
89aef892
DM
1428 skb_dst_set(skb, &rth->dst);
1429 return 0;
1da177e4
LT
1430
1431e_nobufs:
1da177e4 1432 return -ENOBUFS;
1da177e4 1433e_inval:
96d36220 1434 return -EINVAL;
b5f7e755 1435e_err:
b5f7e755 1436 return err;
1da177e4
LT
1437}
1438
1439
1440static void ip_handle_martian_source(struct net_device *dev,
1441 struct in_device *in_dev,
1442 struct sk_buff *skb,
9e12bb22
AV
1443 __be32 daddr,
1444 __be32 saddr)
1da177e4
LT
1445{
1446 RT_CACHE_STAT_INC(in_martian_src);
1447#ifdef CONFIG_IP_ROUTE_VERBOSE
1448 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1449 /*
1450 * RFC1812 recommendation, if source is martian,
1451 * the only hint is MAC header.
1452 */
058bd4d2 1453 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
673d57e7 1454 &daddr, &saddr, dev->name);
98e399f8 1455 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
058bd4d2
JP
1456 print_hex_dump(KERN_WARNING, "ll header: ",
1457 DUMP_PREFIX_OFFSET, 16, 1,
1458 skb_mac_header(skb),
1459 dev->hard_header_len, true);
1da177e4
LT
1460 }
1461 }
1462#endif
1463}
1464
47360228 1465/* called in rcu_read_lock() section */
5969f71d 1466static int __mkroute_input(struct sk_buff *skb,
982721f3 1467 const struct fib_result *res,
5969f71d 1468 struct in_device *in_dev,
c6cffba4 1469 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1470{
1da177e4
LT
1471 struct rtable *rth;
1472 int err;
1473 struct in_device *out_dev;
47360228 1474 unsigned int flags = 0;
d2d68ba9 1475 bool do_cache;
d9c9df8c 1476 u32 itag;
1da177e4
LT
1477
1478 /* get a working reference to the output device */
47360228 1479 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1da177e4 1480 if (out_dev == NULL) {
e87cc472 1481 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1da177e4
LT
1482 return -EINVAL;
1483 }
1484
5c04c819 1485 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
9e56e380 1486 in_dev->dev, in_dev, &itag);
1da177e4 1487 if (err < 0) {
e905a9ed 1488 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1da177e4 1489 saddr);
e905a9ed 1490
1da177e4
LT
1491 goto cleanup;
1492 }
1493
e81da0e1
JA
1494 do_cache = res->fi && !itag;
1495 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1da177e4 1496 (IN_DEV_SHARED_MEDIA(out_dev) ||
e81da0e1 1497 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
1da177e4 1498 flags |= RTCF_DOREDIRECT;
e81da0e1
JA
1499 do_cache = false;
1500 }
1da177e4
LT
1501
1502 if (skb->protocol != htons(ETH_P_IP)) {
1503 /* Not IP (i.e. ARP). Do not create route, if it is
1504 * invalid for proxy arp. DNAT routes are always valid.
65324144
JDB
1505 *
1506 * Proxy arp feature have been extended to allow, ARP
1507 * replies back to the same interface, to support
1508 * Private VLAN switch technologies. See arp.c.
1da177e4 1509 */
65324144
JDB
1510 if (out_dev == in_dev &&
1511 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1da177e4
LT
1512 err = -EINVAL;
1513 goto cleanup;
1514 }
1515 }
1516
e81da0e1
JA
1517 if (do_cache) {
1518 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1519 if (rt_cache_valid(rth)) {
1520 skb_dst_set_noref(skb, &rth->dst);
1521 goto out;
d2d68ba9
DM
1522 }
1523 }
f2bb4bed 1524
5c1e6aa3
DM
1525 rth = rt_dst_alloc(out_dev->dev,
1526 IN_DEV_CONF_GET(in_dev, NOPOLICY),
d2d68ba9 1527 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1da177e4
LT
1528 if (!rth) {
1529 err = -ENOBUFS;
1530 goto cleanup;
1531 }
1532
cf911662
DM
1533 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1534 rth->rt_flags = flags;
1535 rth->rt_type = res->type;
9917e1e8 1536 rth->rt_is_input = 1;
13378cad 1537 rth->rt_iif = 0;
5943634f 1538 rth->rt_pmtu = 0;
f8126f1d 1539 rth->rt_gateway = 0;
155e8336 1540 rth->rt_uses_gateway = 0;
caacf05e 1541 INIT_LIST_HEAD(&rth->rt_uncached);
1da177e4 1542
d8d1f30b
CG
1543 rth->dst.input = ip_forward;
1544 rth->dst.output = ip_output;
1da177e4 1545
d2d68ba9 1546 rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
c6cffba4 1547 skb_dst_set(skb, &rth->dst);
d2d68ba9 1548out:
1da177e4
LT
1549 err = 0;
1550 cleanup:
1da177e4 1551 return err;
e905a9ed 1552}
1da177e4 1553
5969f71d
SH
1554static int ip_mkroute_input(struct sk_buff *skb,
1555 struct fib_result *res,
68a5e3dd 1556 const struct flowi4 *fl4,
5969f71d
SH
1557 struct in_device *in_dev,
1558 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1559{
1da177e4 1560#ifdef CONFIG_IP_ROUTE_MULTIPATH
ff3fccb3 1561 if (res->fi && res->fi->fib_nhs > 1)
1b7fe593 1562 fib_select_multipath(res);
1da177e4
LT
1563#endif
1564
1565 /* create a routing cache entry */
c6cffba4 1566 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1da177e4
LT
1567}
1568
1da177e4
LT
1569/*
1570 * NOTE. We drop all the packets that has local source
1571 * addresses, because every properly looped back packet
1572 * must have correct destination already attached by output routine.
1573 *
1574 * Such approach solves two big problems:
1575 * 1. Not simplex devices are handled properly.
1576 * 2. IP spoofing attempts are filtered with 100% of guarantee.
ebc0ffae 1577 * called with rcu_read_lock()
1da177e4
LT
1578 */
1579
9e12bb22 1580static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
c10237e0 1581 u8 tos, struct net_device *dev)
1da177e4
LT
1582{
1583 struct fib_result res;
96d36220 1584 struct in_device *in_dev = __in_dev_get_rcu(dev);
68a5e3dd 1585 struct flowi4 fl4;
95c96174 1586 unsigned int flags = 0;
1da177e4 1587 u32 itag = 0;
95c96174 1588 struct rtable *rth;
1da177e4 1589 int err = -EINVAL;
5e73ea1a 1590 struct net *net = dev_net(dev);
d2d68ba9 1591 bool do_cache;
1da177e4
LT
1592
1593 /* IP on this device is disabled. */
1594
1595 if (!in_dev)
1596 goto out;
1597
1598 /* Check for the most weird martians, which can be not detected
1599 by fib_lookup.
1600 */
1601
d0daebc3 1602 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1da177e4
LT
1603 goto martian_source;
1604
d2d68ba9 1605 res.fi = NULL;
27a954bd 1606 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1da177e4
LT
1607 goto brd_input;
1608
1609 /* Accept zero addresses only to limited broadcast;
1610 * I even do not know to fix it or not. Waiting for complains :-)
1611 */
f97c1e0c 1612 if (ipv4_is_zeronet(saddr))
1da177e4
LT
1613 goto martian_source;
1614
d0daebc3 1615 if (ipv4_is_zeronet(daddr))
1da177e4
LT
1616 goto martian_destination;
1617
9eb43e76
ED
1618 /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1619 * and call it once if daddr or/and saddr are loopback addresses
1620 */
1621 if (ipv4_is_loopback(daddr)) {
1622 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3 1623 goto martian_destination;
9eb43e76
ED
1624 } else if (ipv4_is_loopback(saddr)) {
1625 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3
TG
1626 goto martian_source;
1627 }
1628
1da177e4
LT
1629 /*
1630 * Now we are ready to route packet.
1631 */
68a5e3dd
DM
1632 fl4.flowi4_oif = 0;
1633 fl4.flowi4_iif = dev->ifindex;
1634 fl4.flowi4_mark = skb->mark;
1635 fl4.flowi4_tos = tos;
1636 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1637 fl4.daddr = daddr;
1638 fl4.saddr = saddr;
1639 err = fib_lookup(net, &fl4, &res);
251da413 1640 if (err != 0)
1da177e4 1641 goto no_route;
1da177e4
LT
1642
1643 RT_CACHE_STAT_INC(in_slow_tot);
1644
1645 if (res.type == RTN_BROADCAST)
1646 goto brd_input;
1647
1648 if (res.type == RTN_LOCAL) {
5c04c819 1649 err = fib_validate_source(skb, saddr, daddr, tos,
1fb9489b 1650 LOOPBACK_IFINDEX,
9e56e380 1651 dev, in_dev, &itag);
b5f7e755
ED
1652 if (err < 0)
1653 goto martian_source_keep_err;
1da177e4
LT
1654 goto local_input;
1655 }
1656
1657 if (!IN_DEV_FORWARD(in_dev))
251da413 1658 goto no_route;
1da177e4
LT
1659 if (res.type != RTN_UNICAST)
1660 goto martian_destination;
1661
68a5e3dd 1662 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1da177e4
LT
1663out: return err;
1664
1665brd_input:
1666 if (skb->protocol != htons(ETH_P_IP))
1667 goto e_inval;
1668
41347dcd 1669 if (!ipv4_is_zeronet(saddr)) {
9e56e380
DM
1670 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1671 in_dev, &itag);
1da177e4 1672 if (err < 0)
b5f7e755 1673 goto martian_source_keep_err;
1da177e4
LT
1674 }
1675 flags |= RTCF_BROADCAST;
1676 res.type = RTN_BROADCAST;
1677 RT_CACHE_STAT_INC(in_brd);
1678
1679local_input:
d2d68ba9
DM
1680 do_cache = false;
1681 if (res.fi) {
fe3edf45 1682 if (!itag) {
54764bb6 1683 rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
d2d68ba9 1684 if (rt_cache_valid(rth)) {
c6cffba4
DM
1685 skb_dst_set_noref(skb, &rth->dst);
1686 err = 0;
1687 goto out;
d2d68ba9
DM
1688 }
1689 do_cache = true;
1690 }
1691 }
1692
5c1e6aa3 1693 rth = rt_dst_alloc(net->loopback_dev,
d2d68ba9 1694 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1da177e4
LT
1695 if (!rth)
1696 goto e_nobufs;
1697
cf911662 1698 rth->dst.input= ip_local_deliver;
d8d1f30b 1699 rth->dst.output= ip_rt_bug;
cf911662
DM
1700#ifdef CONFIG_IP_ROUTE_CLASSID
1701 rth->dst.tclassid = itag;
1702#endif
1da177e4 1703
cf911662
DM
1704 rth->rt_genid = rt_genid(net);
1705 rth->rt_flags = flags|RTCF_LOCAL;
1706 rth->rt_type = res.type;
9917e1e8 1707 rth->rt_is_input = 1;
13378cad 1708 rth->rt_iif = 0;
5943634f 1709 rth->rt_pmtu = 0;
f8126f1d 1710 rth->rt_gateway = 0;
155e8336 1711 rth->rt_uses_gateway = 0;
caacf05e 1712 INIT_LIST_HEAD(&rth->rt_uncached);
1da177e4 1713 if (res.type == RTN_UNREACHABLE) {
d8d1f30b
CG
1714 rth->dst.input= ip_error;
1715 rth->dst.error= -err;
1da177e4
LT
1716 rth->rt_flags &= ~RTCF_LOCAL;
1717 }
d2d68ba9
DM
1718 if (do_cache)
1719 rt_cache_route(&FIB_RES_NH(res), rth);
89aef892 1720 skb_dst_set(skb, &rth->dst);
b23dd4fe 1721 err = 0;
ebc0ffae 1722 goto out;
1da177e4
LT
1723
1724no_route:
1725 RT_CACHE_STAT_INC(in_no_route);
1da177e4 1726 res.type = RTN_UNREACHABLE;
7f53878d
MC
1727 if (err == -ESRCH)
1728 err = -ENETUNREACH;
1da177e4
LT
1729 goto local_input;
1730
1731 /*
1732 * Do not cache martian addresses: they should be logged (RFC1812)
1733 */
1734martian_destination:
1735 RT_CACHE_STAT_INC(in_martian_dst);
1736#ifdef CONFIG_IP_ROUTE_VERBOSE
e87cc472
JP
1737 if (IN_DEV_LOG_MARTIANS(in_dev))
1738 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1739 &daddr, &saddr, dev->name);
1da177e4 1740#endif
2c2910a4 1741
1da177e4
LT
1742e_inval:
1743 err = -EINVAL;
ebc0ffae 1744 goto out;
1da177e4
LT
1745
1746e_nobufs:
1747 err = -ENOBUFS;
ebc0ffae 1748 goto out;
1da177e4
LT
1749
1750martian_source:
b5f7e755
ED
1751 err = -EINVAL;
1752martian_source_keep_err:
1da177e4 1753 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
ebc0ffae 1754 goto out;
1da177e4
LT
1755}
1756
c6cffba4
DM
1757int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1758 u8 tos, struct net_device *dev)
1da177e4 1759{
96d36220 1760 int res;
1da177e4 1761
96d36220
ED
1762 rcu_read_lock();
1763
1da177e4
LT
1764 /* Multicast recognition logic is moved from route cache to here.
1765 The problem was that too many Ethernet cards have broken/missing
1766 hardware multicast filters :-( As result the host on multicasting
1767 network acquires a lot of useless route cache entries, sort of
1768 SDR messages from all the world. Now we try to get rid of them.
1769 Really, provided software IP multicast filter is organized
1770 reasonably (at least, hashed), it does not result in a slowdown
1771 comparing with route cache reject entries.
1772 Note, that multicast routers are not affected, because
1773 route cache entry is created eventually.
1774 */
f97c1e0c 1775 if (ipv4_is_multicast(daddr)) {
96d36220 1776 struct in_device *in_dev = __in_dev_get_rcu(dev);
1da177e4 1777
96d36220 1778 if (in_dev) {
dbdd9a52
DM
1779 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1780 ip_hdr(skb)->protocol);
1da177e4
LT
1781 if (our
1782#ifdef CONFIG_IP_MROUTE
9d4fb27d
JP
1783 ||
1784 (!ipv4_is_local_multicast(daddr) &&
1785 IN_DEV_MFORWARD(in_dev))
1da177e4 1786#endif
9d4fb27d 1787 ) {
96d36220
ED
1788 int res = ip_route_input_mc(skb, daddr, saddr,
1789 tos, dev, our);
1da177e4 1790 rcu_read_unlock();
96d36220 1791 return res;
1da177e4
LT
1792 }
1793 }
1794 rcu_read_unlock();
1795 return -EINVAL;
1796 }
c10237e0 1797 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
96d36220
ED
1798 rcu_read_unlock();
1799 return res;
1da177e4 1800}
c6cffba4 1801EXPORT_SYMBOL(ip_route_input_noref);
1da177e4 1802
ebc0ffae 1803/* called with rcu_read_lock() */
982721f3 1804static struct rtable *__mkroute_output(const struct fib_result *res,
1a00fee4 1805 const struct flowi4 *fl4, int orig_oif,
f61759e6 1806 struct net_device *dev_out,
5ada5527 1807 unsigned int flags)
1da177e4 1808{
982721f3 1809 struct fib_info *fi = res->fi;
f2bb4bed 1810 struct fib_nh_exception *fnhe;
5ada5527 1811 struct in_device *in_dev;
982721f3 1812 u16 type = res->type;
5ada5527 1813 struct rtable *rth;
c92b9655 1814 bool do_cache;
1da177e4 1815
d0daebc3
TG
1816 in_dev = __in_dev_get_rcu(dev_out);
1817 if (!in_dev)
5ada5527 1818 return ERR_PTR(-EINVAL);
1da177e4 1819
d0daebc3
TG
1820 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1821 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1822 return ERR_PTR(-EINVAL);
1823
68a5e3dd 1824 if (ipv4_is_lbcast(fl4->daddr))
982721f3 1825 type = RTN_BROADCAST;
68a5e3dd 1826 else if (ipv4_is_multicast(fl4->daddr))
982721f3 1827 type = RTN_MULTICAST;
68a5e3dd 1828 else if (ipv4_is_zeronet(fl4->daddr))
5ada5527 1829 return ERR_PTR(-EINVAL);
1da177e4
LT
1830
1831 if (dev_out->flags & IFF_LOOPBACK)
1832 flags |= RTCF_LOCAL;
1833
63617421 1834 do_cache = true;
982721f3 1835 if (type == RTN_BROADCAST) {
1da177e4 1836 flags |= RTCF_BROADCAST | RTCF_LOCAL;
982721f3
DM
1837 fi = NULL;
1838 } else if (type == RTN_MULTICAST) {
dd28d1a0 1839 flags |= RTCF_MULTICAST | RTCF_LOCAL;
813b3b5d
DM
1840 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1841 fl4->flowi4_proto))
1da177e4 1842 flags &= ~RTCF_LOCAL;
63617421
JA
1843 else
1844 do_cache = false;
1da177e4 1845 /* If multicast route do not exist use
dd28d1a0
ED
1846 * default one, but do not gateway in this case.
1847 * Yes, it is hack.
1da177e4 1848 */
982721f3
DM
1849 if (fi && res->prefixlen < 4)
1850 fi = NULL;
1da177e4
LT
1851 }
1852
f2bb4bed 1853 fnhe = NULL;
63617421
JA
1854 do_cache &= fi != NULL;
1855 if (do_cache) {
c5038a83 1856 struct rtable __rcu **prth;
c92b9655 1857 struct fib_nh *nh = &FIB_RES_NH(*res);
d26b3a7c 1858
c92b9655 1859 fnhe = find_exception(nh, fl4->daddr);
c5038a83
DM
1860 if (fnhe)
1861 prth = &fnhe->fnhe_rth;
c92b9655
JA
1862 else {
1863 if (unlikely(fl4->flowi4_flags &
1864 FLOWI_FLAG_KNOWN_NH &&
1865 !(nh->nh_gw &&
1866 nh->nh_scope == RT_SCOPE_LINK))) {
1867 do_cache = false;
1868 goto add;
1869 }
1870 prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1871 }
c5038a83
DM
1872 rth = rcu_dereference(*prth);
1873 if (rt_cache_valid(rth)) {
1874 dst_hold(&rth->dst);
1875 return rth;
f2bb4bed
DM
1876 }
1877 }
c92b9655
JA
1878
1879add:
5c1e6aa3
DM
1880 rth = rt_dst_alloc(dev_out,
1881 IN_DEV_CONF_GET(in_dev, NOPOLICY),
f2bb4bed 1882 IN_DEV_CONF_GET(in_dev, NOXFRM),
c92b9655 1883 do_cache);
8391d07b 1884 if (!rth)
5ada5527 1885 return ERR_PTR(-ENOBUFS);
8391d07b 1886
cf911662
DM
1887 rth->dst.output = ip_output;
1888
cf911662
DM
1889 rth->rt_genid = rt_genid(dev_net(dev_out));
1890 rth->rt_flags = flags;
1891 rth->rt_type = type;
9917e1e8 1892 rth->rt_is_input = 0;
13378cad 1893 rth->rt_iif = orig_oif ? : 0;
5943634f 1894 rth->rt_pmtu = 0;
f8126f1d 1895 rth->rt_gateway = 0;
155e8336 1896 rth->rt_uses_gateway = 0;
caacf05e 1897 INIT_LIST_HEAD(&rth->rt_uncached);
1da177e4
LT
1898
1899 RT_CACHE_STAT_INC(out_slow_tot);
1900
41347dcd 1901 if (flags & RTCF_LOCAL)
d8d1f30b 1902 rth->dst.input = ip_local_deliver;
1da177e4 1903 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
e905a9ed 1904 if (flags & RTCF_LOCAL &&
1da177e4 1905 !(dev_out->flags & IFF_LOOPBACK)) {
d8d1f30b 1906 rth->dst.output = ip_mc_output;
1da177e4
LT
1907 RT_CACHE_STAT_INC(out_slow_mc);
1908 }
1909#ifdef CONFIG_IP_MROUTE
982721f3 1910 if (type == RTN_MULTICAST) {
1da177e4 1911 if (IN_DEV_MFORWARD(in_dev) &&
813b3b5d 1912 !ipv4_is_local_multicast(fl4->daddr)) {
d8d1f30b
CG
1913 rth->dst.input = ip_mr_input;
1914 rth->dst.output = ip_mc_output;
1da177e4
LT
1915 }
1916 }
1917#endif
1918 }
1919
f2bb4bed 1920 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1da177e4 1921
5ada5527 1922 return rth;
1da177e4
LT
1923}
1924
1da177e4
LT
1925/*
1926 * Major route resolver routine.
1927 */
1928
89aef892 1929struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1da177e4 1930{
1da177e4 1931 struct net_device *dev_out = NULL;
f61759e6 1932 __u8 tos = RT_FL_TOS(fl4);
813b3b5d
DM
1933 unsigned int flags = 0;
1934 struct fib_result res;
5ada5527 1935 struct rtable *rth;
813b3b5d 1936 int orig_oif;
1da177e4 1937
85b91b03 1938 res.tclassid = 0;
1da177e4 1939 res.fi = NULL;
8b96d22d 1940 res.table = NULL;
1da177e4 1941
813b3b5d
DM
1942 orig_oif = fl4->flowi4_oif;
1943
1fb9489b 1944 fl4->flowi4_iif = LOOPBACK_IFINDEX;
813b3b5d
DM
1945 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1946 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1947 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
44713b67 1948
010c2708 1949 rcu_read_lock();
813b3b5d 1950 if (fl4->saddr) {
b23dd4fe 1951 rth = ERR_PTR(-EINVAL);
813b3b5d
DM
1952 if (ipv4_is_multicast(fl4->saddr) ||
1953 ipv4_is_lbcast(fl4->saddr) ||
1954 ipv4_is_zeronet(fl4->saddr))
1da177e4
LT
1955 goto out;
1956
1da177e4
LT
1957 /* I removed check for oif == dev_out->oif here.
1958 It was wrong for two reasons:
1ab35276
DL
1959 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1960 is assigned to multiple interfaces.
1da177e4
LT
1961 2. Moreover, we are allowed to send packets with saddr
1962 of another iface. --ANK
1963 */
1964
813b3b5d
DM
1965 if (fl4->flowi4_oif == 0 &&
1966 (ipv4_is_multicast(fl4->daddr) ||
1967 ipv4_is_lbcast(fl4->daddr))) {
a210d01a 1968 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 1969 dev_out = __ip_dev_find(net, fl4->saddr, false);
a210d01a
JA
1970 if (dev_out == NULL)
1971 goto out;
1972
1da177e4
LT
1973 /* Special hack: user can direct multicasts
1974 and limited broadcast via necessary interface
1975 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1976 This hack is not just for fun, it allows
1977 vic,vat and friends to work.
1978 They bind socket to loopback, set ttl to zero
1979 and expect that it will work.
1980 From the viewpoint of routing cache they are broken,
1981 because we are not allowed to build multicast path
1982 with loopback source addr (look, routing cache
1983 cannot know, that ttl is zero, so that packet
1984 will not leave this host and route is valid).
1985 Luckily, this hack is good workaround.
1986 */
1987
813b3b5d 1988 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
1989 goto make_route;
1990 }
a210d01a 1991
813b3b5d 1992 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
a210d01a 1993 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 1994 if (!__ip_dev_find(net, fl4->saddr, false))
a210d01a 1995 goto out;
a210d01a 1996 }
1da177e4
LT
1997 }
1998
1999
813b3b5d
DM
2000 if (fl4->flowi4_oif) {
2001 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
b23dd4fe 2002 rth = ERR_PTR(-ENODEV);
1da177e4
LT
2003 if (dev_out == NULL)
2004 goto out;
e5ed6399
HX
2005
2006 /* RACE: Check return value of inet_select_addr instead. */
fc75fc83 2007 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
b23dd4fe 2008 rth = ERR_PTR(-ENETUNREACH);
fc75fc83
ED
2009 goto out;
2010 }
813b3b5d
DM
2011 if (ipv4_is_local_multicast(fl4->daddr) ||
2012 ipv4_is_lbcast(fl4->daddr)) {
2013 if (!fl4->saddr)
2014 fl4->saddr = inet_select_addr(dev_out, 0,
2015 RT_SCOPE_LINK);
1da177e4
LT
2016 goto make_route;
2017 }
813b3b5d
DM
2018 if (fl4->saddr) {
2019 if (ipv4_is_multicast(fl4->daddr))
2020 fl4->saddr = inet_select_addr(dev_out, 0,
2021 fl4->flowi4_scope);
2022 else if (!fl4->daddr)
2023 fl4->saddr = inet_select_addr(dev_out, 0,
2024 RT_SCOPE_HOST);
1da177e4
LT
2025 }
2026 }
2027
813b3b5d
DM
2028 if (!fl4->daddr) {
2029 fl4->daddr = fl4->saddr;
2030 if (!fl4->daddr)
2031 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
b40afd0e 2032 dev_out = net->loopback_dev;
1fb9489b 2033 fl4->flowi4_oif = LOOPBACK_IFINDEX;
1da177e4
LT
2034 res.type = RTN_LOCAL;
2035 flags |= RTCF_LOCAL;
2036 goto make_route;
2037 }
2038
813b3b5d 2039 if (fib_lookup(net, fl4, &res)) {
1da177e4 2040 res.fi = NULL;
8b96d22d 2041 res.table = NULL;
813b3b5d 2042 if (fl4->flowi4_oif) {
1da177e4
LT
2043 /* Apparently, routing tables are wrong. Assume,
2044 that the destination is on link.
2045
2046 WHY? DW.
2047 Because we are allowed to send to iface
2048 even if it has NO routes and NO assigned
2049 addresses. When oif is specified, routing
2050 tables are looked up with only one purpose:
2051 to catch if destination is gatewayed, rather than
2052 direct. Moreover, if MSG_DONTROUTE is set,
2053 we send packet, ignoring both routing tables
2054 and ifaddr state. --ANK
2055
2056
2057 We could make it even if oif is unknown,
2058 likely IPv6, but we do not.
2059 */
2060
813b3b5d
DM
2061 if (fl4->saddr == 0)
2062 fl4->saddr = inet_select_addr(dev_out, 0,
2063 RT_SCOPE_LINK);
1da177e4
LT
2064 res.type = RTN_UNICAST;
2065 goto make_route;
2066 }
b23dd4fe 2067 rth = ERR_PTR(-ENETUNREACH);
1da177e4
LT
2068 goto out;
2069 }
1da177e4
LT
2070
2071 if (res.type == RTN_LOCAL) {
813b3b5d 2072 if (!fl4->saddr) {
9fc3bbb4 2073 if (res.fi->fib_prefsrc)
813b3b5d 2074 fl4->saddr = res.fi->fib_prefsrc;
9fc3bbb4 2075 else
813b3b5d 2076 fl4->saddr = fl4->daddr;
9fc3bbb4 2077 }
b40afd0e 2078 dev_out = net->loopback_dev;
813b3b5d 2079 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2080 flags |= RTCF_LOCAL;
2081 goto make_route;
2082 }
2083
2084#ifdef CONFIG_IP_ROUTE_MULTIPATH
813b3b5d 2085 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
1b7fe593 2086 fib_select_multipath(&res);
1da177e4
LT
2087 else
2088#endif
21d8c49e
DM
2089 if (!res.prefixlen &&
2090 res.table->tb_num_default > 1 &&
813b3b5d 2091 res.type == RTN_UNICAST && !fl4->flowi4_oif)
0c838ff1 2092 fib_select_default(&res);
1da177e4 2093
813b3b5d
DM
2094 if (!fl4->saddr)
2095 fl4->saddr = FIB_RES_PREFSRC(net, res);
1da177e4 2096
1da177e4 2097 dev_out = FIB_RES_DEV(res);
813b3b5d 2098 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2099
2100
2101make_route:
1a00fee4 2102 rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
1da177e4 2103
010c2708
DM
2104out:
2105 rcu_read_unlock();
b23dd4fe 2106 return rth;
1da177e4 2107}
d8c97a94
ACM
2108EXPORT_SYMBOL_GPL(__ip_route_output_key);
2109
/*
 * dst_ops.check callback for blackhole routes.  Always returns NULL,
 * i.e. the cached entry is never considered valid on recheck —
 * NOTE(review): per the dst_ops.check contract a NULL return forces the
 * caller to perform a fresh route lookup; confirm against include/net/dst_ops.h.
 */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2114
ebb762f2 2115static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 2116{
618f9bc7
SK
2117 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2118
2119 return mtu ? : dst->dev->mtu;
ec831ea7
RD
2120}
2121
/* Intentional no-op: blackhole routes ignore PMTU updates. */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu)
{
}
2126
/* Intentional no-op: blackhole routes ignore ICMP redirects. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}
2131
/*
 * dst_ops.cow_metrics callback for blackhole routes: never copy-on-write
 * the metrics array; returning NULL leaves the metrics read-only/shared.
 */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2137
/*
 * dst_ops used by ipv4_blackhole_route(): check always invalidates,
 * PMTU/redirect notifications are ignored, and metrics are never COWed.
 * default_advmss and neigh_lookup are shared with the normal IPv4 dst ops.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			= AF_INET,
	.protocol		= cpu_to_be16(ETH_P_IP),
	.check			= ipv4_blackhole_dst_check,
	.mtu			= ipv4_blackhole_mtu,
	.default_advmss		= ipv4_default_advmss,
	.update_pmtu		= ipv4_rt_blackhole_update_pmtu,
	.redirect		= ipv4_rt_blackhole_redirect,
	.cow_metrics		= ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		= ipv4_neigh_lookup,
};
2149
/*
 * ipv4_blackhole_route - clone @dst_orig into a "blackhole" route that
 * silently discards traffic (both input and output are dst_discard).
 *
 * The clone copies the identifying fields of the original rtable
 * (device, iif, pmtu, genid, flags, type, gateway) so it still looks
 * like the same route to readers, but uses ipv4_dst_blackhole_ops.
 *
 * Consumes a reference on @dst_orig (dst_release() on every path).
 * Returns the new dst on success or ERR_PTR(-ENOMEM) if dst_alloc()
 * failed.
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *ort = (struct rtable *) dst_orig;
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		/* Discard everything in both directions. */
		new->input = dst_discard;
		new->output = dst_discard;

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_uses_gateway = ort->rt_uses_gateway;

		INIT_LIST_HEAD(&rt->rt_uncached);

		/* NOTE(review): dst_free() here appears to schedule the dst
		 * for the normal free path while the caller keeps using the
		 * returned reference — confirm against dst_alloc()/dst_free()
		 * refcounting semantics in net/core/dst.c.
		 */
		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
2186
9d6ec938 2187struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
b23dd4fe 2188 struct sock *sk)
1da177e4 2189{
9d6ec938 2190 struct rtable *rt = __ip_route_output_key(net, flp4);
1da177e4 2191
b23dd4fe
DM
2192 if (IS_ERR(rt))
2193 return rt;
1da177e4 2194
56157872 2195 if (flp4->flowi4_proto)
9d6ec938
DM
2196 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2197 flowi4_to_flowi(flp4),
2198 sk, 0);
1da177e4 2199
b23dd4fe 2200 return rt;
1da177e4 2201}
d8c97a94
ACM
2202EXPORT_SYMBOL_GPL(ip_route_output_flow);
2203
/*
 * rt_fill_info - serialize the route attached to @skb (via skb_rtable())
 * into an rtnetlink message appended to that same @skb.
 *
 * @net:	namespace, used for the multicast-forwarding check
 * @dst, @src:	queried destination/source (emitted as RTA_DST/RTA_SRC)
 * @fl4:	flow key of the lookup (tos, prefsrc, mark)
 * @portid, @seq, @event, @flags: netlink header fields
 * @nowait:	forwarded to ipmr_get_route(); controls whether an
 *		unresolved multicast route may be reported asynchronously
 *
 * Returns the result of nlmsg_end() on success, 0 when ipmr_get_route()
 * fully consumed the request (err == 0, !nowait), or -EMSGSIZE when the
 * skb ran out of tailroom (the partial message is cancelled).
 */
static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
			u32 seq, int event, int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	/* Fixed header: a cloned cache entry in the main table. */
	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4->flowi4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	if (nla_put_be32(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		/* A specific source was queried: it is a full /32 match. */
		r->rtm_src_len = 32;
		if (nla_put_be32(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	/* For output routes, report the preferred source address chosen
	 * by the lookup if it differs from what was asked for.
	 */
	if (!rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway &&
	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	/* Convert the absolute expiry time into a remaining interval. */
	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	/* Overlay the learned PMTU onto the metrics while it is valid. */
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4->flowi4_mark &&
	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
		goto nla_put_failure;

	error = rt->dst.error;

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		/* Non-local multicast with forwarding enabled: delegate to
		 * the multicast routing code, which may answer later.
		 */
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 fl4->saddr, fl4->daddr,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
				goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2313
/*
 * inet_rtm_getroute - handler for RTM_GETROUTE requests.
 *
 * Builds a dummy skb carrying a minimal IP header, then resolves the
 * requested route: if RTA_IIF is given the input path is exercised via
 * ip_route_input() on the named device, otherwise an output lookup is
 * done with ip_route_output_key().  The result is serialized with
 * rt_fill_info() and unicast back to the requesting socket.
 *
 * Returns 0 on success or a negative errno; the dummy skb is freed on
 * every error path.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	/* Extract request attributes; all are optional and default to 0. */
	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;

	if (iif) {
		/* Input-route query: pretend the packet arrived on @iif. */
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		/* A successful call may still attach an error dst. */
		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, dst, src, &fl4, skb,
			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	/* Ownership of skb passes to rtnl_unicast(). */
	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
2410
/*
 * Stub dump handler: reports no routes and simply returns the current
 * skb length, which rtnetlink treats as "dump complete" —
 * NOTE(review): presumably kept only for ABI compatibility now that the
 * routing cache it used to walk is gone; confirm against callers.
 */
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	return skb->len;
}
2415
/* Flush cached routes in @in_dev's namespace on a multicast list change. */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}
2420
2421#ifdef CONFIG_SYSCTL
082c7ca4
G
2422static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
2423static int ip_rt_gc_interval __read_mostly = 60 * HZ;
2424static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
2425static int ip_rt_gc_elasticity __read_mostly = 8;
2426
81c684d1 2427static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
8d65af78 2428 void __user *buffer,
1da177e4
LT
2429 size_t *lenp, loff_t *ppos)
2430{
2431 if (write) {
4ccfe6d4 2432 rt_cache_flush((struct net *)__ctl->extra1);
1da177e4 2433 return 0;
e905a9ed 2434 }
1da177e4
LT
2435
2436 return -EINVAL;
2437}
2438
/*
 * Global (not per-namespace) route sysctls, registered for init_net as
 * "net/ipv4/route" by ip_static_sysctl_init().  Jiffies-valued entries
 * use the *_jiffies proc handlers so userspace sees seconds (or ms for
 * the _ms variant).
 */
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Same variable as above, exposed in milliseconds. */
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
39a23e75 2549
/*
 * Per-namespace sysctl table: a single write-only "flush" entry whose
 * handler flushes that namespace's route cache.  Template only —
 * non-init namespaces get a kmemdup'd copy in sysctl_route_net_init().
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
2559
2560static __net_init int sysctl_route_net_init(struct net *net)
2561{
2562 struct ctl_table *tbl;
2563
2564 tbl = ipv4_route_flush_table;
09ad9bc7 2565 if (!net_eq(net, &init_net)) {
39a23e75
DL
2566 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2567 if (tbl == NULL)
2568 goto err_dup;
464dc801
EB
2569
2570 /* Don't export sysctls to unprivileged users */
2571 if (net->user_ns != &init_user_ns)
2572 tbl[0].procname = NULL;
39a23e75
DL
2573 }
2574 tbl[0].extra1 = net;
2575
ec8f23ce 2576 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
39a23e75
DL
2577 if (net->ipv4.route_hdr == NULL)
2578 goto err_reg;
2579 return 0;
2580
2581err_reg:
2582 if (tbl != ipv4_route_flush_table)
2583 kfree(tbl);
2584err_dup:
2585 return -ENOMEM;
2586}
2587
/*
 * Unregister the namespace's route sysctls and free the duplicated
 * table.  The BUG_ON guards against freeing the shared static template
 * (only non-init namespaces are ever torn down with a kmemdup'd copy).
 */
static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}
2597
/* Pernet hooks for the per-namespace route sysctl registration above. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
1da177e4
LT
2602#endif
2603
/*
 * Per-namespace init: reset the route generation counter (bumping it
 * elsewhere invalidates cached dsts) and randomize dev_addr_genid.
 * Always succeeds.
 */
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->rt_genid, 0);
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
2615
c3426b47
DM
2616static int __net_init ipv4_inetpeer_init(struct net *net)
2617{
2618 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2619
2620 if (!bp)
2621 return -ENOMEM;
2622 inet_peer_base_init(bp);
2623 net->ipv4.peers = bp;
2624 return 0;
2625}
2626
/*
 * Tear down the namespace's inetpeer base: detach it first so no new
 * lookups see it, then invalidate the peer tree and free the base.
 */
static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
2635
/* Pernet hooks managing the per-namespace inetpeer base lifetime. */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	= ipv4_inetpeer_init,
	.exit	= ipv4_inetpeer_exit,
};
9f5e97e5 2640
c7066f70 2641#ifdef CONFIG_IP_ROUTE_CLASSID
7d720c3e 2642struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
c7066f70 2643#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4 2644
/*
 * ip_rt_init - boot-time initialization of the IPv4 routing layer.
 *
 * Sets up the rtable slab cache and dst entry counters, effectively
 * disables dst garbage collection (gc_thresh = ~0, max_size = INT_MAX),
 * initializes devinet and the FIB, creates procfs files, hooks up xfrm,
 * registers the RTM_GETROUTE handler and the pernet subsystems (sysctl,
 * genid, inetpeer).  Panics on allocation failure — the stack cannot
 * run without these.  Ordering matters; see ip_static_sysctl_init().
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	/* Per-CPU traffic-class accounting buffers. */
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole routes share the same slab as regular rtables. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/* Effectively disable dst gc and size limiting. */
	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}
2688
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
/* Register the global route sysctl table for init_net early in boot. */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif