]> git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/blame - net/ipv4/route.c
UBUNTU: SAUCE: media: uvcvideo: Support realtek's UVC 1.5 device
[mirror_ubuntu-artful-kernel.git] / net / ipv4 / route.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
e905a9ed 21 * Alan Cox : Super /proc >4K
1da177e4
LT
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
e905a9ed 39 *
1da177e4
LT
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
afd46503
JP
65#define pr_fmt(fmt) "IPv4: " fmt
66
1da177e4 67#include <linux/module.h>
7c0f6ba6 68#include <linux/uaccess.h>
1da177e4
LT
69#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
1da177e4
LT
72#include <linux/mm.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/skbuff.h>
1da177e4
LT
83#include <linux/inetdevice.h>
84#include <linux/igmp.h>
85#include <linux/pkt_sched.h>
86#include <linux/mroute.h>
87#include <linux/netfilter_ipv4.h>
88#include <linux/random.h>
1da177e4
LT
89#include <linux/rcupdate.h>
90#include <linux/times.h>
5a0e3ad6 91#include <linux/slab.h>
73f156a6 92#include <linux/jhash.h>
352e512c 93#include <net/dst.h>
1b7179d3 94#include <net/dst_metadata.h>
457c4cbc 95#include <net/net_namespace.h>
1da177e4
LT
96#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
571e7226 106#include <net/lwtunnel.h>
8d71740c 107#include <net/netevent.h>
63f3444f 108#include <net/rtnetlink.h>
1da177e4
LT
109#ifdef CONFIG_SYSCTL
110#include <linux/sysctl.h>
7426a564 111#include <linux/kmemleak.h>
1da177e4 112#endif
6e5714ea 113#include <net/secure_seq.h>
1b7179d3 114#include <net/ip_tunnels.h>
385add90 115#include <net/l3mdev.h>
1da177e4 116
b6179813
RP
117#include "fib_lookup.h"
118
68a5e3dd 119#define RT_FL_TOS(oldflp4) \
f61759e6 120 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
1da177e4 121
1da177e4
LT
122#define RT_GC_TIMEOUT (300*HZ)
123
1da177e4 124static int ip_rt_max_size;
817bc4db
SH
125static int ip_rt_redirect_number __read_mostly = 9;
126static int ip_rt_redirect_load __read_mostly = HZ / 50;
127static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128static int ip_rt_error_cost __read_mostly = HZ;
129static int ip_rt_error_burst __read_mostly = 5 * HZ;
817bc4db
SH
130static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
131static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
132static int ip_rt_min_advmss __read_mostly = 256;
9f28a2fc 133
deed49df 134static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
1da177e4
LT
135/*
136 * Interface to generic destination cache.
137 */
138
139static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
0dbaee3b 140static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
ebb762f2 141static unsigned int ipv4_mtu(const struct dst_entry *dst);
1da177e4
LT
142static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
143static void ipv4_link_failure(struct sk_buff *skb);
6700c270
DM
144static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
145 struct sk_buff *skb, u32 mtu);
146static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
147 struct sk_buff *skb);
caacf05e 148static void ipv4_dst_destroy(struct dst_entry *dst);
1da177e4 149
62fa8a84
DM
/* dst_ops.cow_metrics hook: IPv4 routes never copy-on-write their
 * metrics, so being called here indicates a bug (hence WARN_ON).
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}
155
f894cbf8
DM
156static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
157 struct sk_buff *skb,
158 const void *daddr);
63fca65d 159static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
d3aaeb38 160
1da177e4
LT
/* Destination-cache operations table for IPv4 routes. */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
	.confirm_neigh =	ipv4_confirm_neigh,
};
176
177#define ECN_OR_COST(class) TC_PRIO_##class
178
4839c52b 179const __u8 ip_tos2prio[16] = {
1da177e4 180 TC_PRIO_BESTEFFORT,
4a2b9c37 181 ECN_OR_COST(BESTEFFORT),
1da177e4
LT
182 TC_PRIO_BESTEFFORT,
183 ECN_OR_COST(BESTEFFORT),
184 TC_PRIO_BULK,
185 ECN_OR_COST(BULK),
186 TC_PRIO_BULK,
187 ECN_OR_COST(BULK),
188 TC_PRIO_INTERACTIVE,
189 ECN_OR_COST(INTERACTIVE),
190 TC_PRIO_INTERACTIVE,
191 ECN_OR_COST(INTERACTIVE),
192 TC_PRIO_INTERACTIVE_BULK,
193 ECN_OR_COST(INTERACTIVE_BULK),
194 TC_PRIO_INTERACTIVE_BULK,
195 ECN_OR_COST(INTERACTIVE_BULK)
196};
d4a96865 197EXPORT_SYMBOL(ip_tos2prio);
1da177e4 198
2f970d83 199static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
3ed66e91 200#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
1da177e4 201
1da177e4 202#ifdef CONFIG_PROC_FS
1da177e4
LT
203static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204{
29e75252 205 if (*pos)
89aef892 206 return NULL;
29e75252 207 return SEQ_START_TOKEN;
1da177e4
LT
208}
209
210static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
211{
1da177e4 212 ++*pos;
89aef892 213 return NULL;
1da177e4
LT
214}
215
/* Nothing was locked in ->start, so there is nothing to release. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
219
/* Emit only the legacy column header; the cache itself no longer exists. */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}
229
f690808e 230static const struct seq_operations rt_cache_seq_ops = {
1da177e4
LT
231 .start = rt_cache_seq_start,
232 .next = rt_cache_seq_next,
233 .stop = rt_cache_seq_stop,
234 .show = rt_cache_seq_show,
235};
236
/* open() handler for /proc/net/rt_cache: plain seq_file, no private state. */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}
241
9a32144e 242static const struct file_operations rt_cache_seq_fops = {
1da177e4
LT
243 .owner = THIS_MODULE,
244 .open = rt_cache_seq_open,
245 .read = seq_read,
246 .llseek = seq_lseek,
89aef892 247 .release = seq_release,
1da177e4
LT
248};
249
250
/* Start iteration over the per-CPU rt_cache_stat counters: emit the
 * header token at position 0, then the stats of each possible CPU.
 * *pos is stored as cpu+1 so position 0 stays reserved for the token.
 */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
266
/* Advance to the next possible CPU's stats (see rt_cpu_seq_start for
 * the cpu+1 position encoding); NULL ends the sequence.
 */
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}
280
/* Nothing acquired in ->start, nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
285
/* Print one CPU's rt_cache_stat line (or the header for the start token).
 * The fields commented as 0 belonged to the removed route cache and are
 * kept only to preserve the historical column layout.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,

		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}
319
f690808e 320static const struct seq_operations rt_cpu_seq_ops = {
1da177e4
LT
321 .start = rt_cpu_seq_start,
322 .next = rt_cpu_seq_next,
323 .stop = rt_cpu_seq_stop,
324 .show = rt_cpu_seq_show,
325};
326
327
/* open() handler for /proc/net/stat/rt_cache. */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}
332
9a32144e 333static const struct file_operations rt_cpu_seq_fops = {
1da177e4
LT
334 .owner = THIS_MODULE,
335 .open = rt_cpu_seq_open,
336 .read = seq_read,
337 .llseek = seq_lseek,
338 .release = seq_release,
339};
340
c7066f70 341#ifdef CONFIG_IP_ROUTE_CLASSID
a661c419 342static int rt_acct_proc_show(struct seq_file *m, void *v)
78c686e9 343{
a661c419
AD
344 struct ip_rt_acct *dst, *src;
345 unsigned int i, j;
346
347 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
348 if (!dst)
349 return -ENOMEM;
350
351 for_each_possible_cpu(i) {
352 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
353 for (j = 0; j < 256; j++) {
354 dst[j].o_bytes += src[j].o_bytes;
355 dst[j].o_packets += src[j].o_packets;
356 dst[j].i_bytes += src[j].i_bytes;
357 dst[j].i_packets += src[j].i_packets;
358 }
78c686e9
PE
359 }
360
a661c419
AD
361 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
362 kfree(dst);
363 return 0;
364}
78c686e9 365
a661c419
AD
/* open() handler for /proc/net/rt_acct (single-shot seq_file). */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}
a661c419
AD
370
/* file_operations for /proc/net/rt_acct. */
static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
78c686e9 378#endif
107f1634 379
73b38711 380static int __net_init ip_rt_do_proc_init(struct net *net)
107f1634
PE
381{
382 struct proc_dir_entry *pde;
383
d4beaa66
G
384 pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
385 &rt_cache_seq_fops);
107f1634
PE
386 if (!pde)
387 goto err1;
388
77020720
WC
389 pde = proc_create("rt_cache", S_IRUGO,
390 net->proc_net_stat, &rt_cpu_seq_fops);
107f1634
PE
391 if (!pde)
392 goto err2;
393
c7066f70 394#ifdef CONFIG_IP_ROUTE_CLASSID
a661c419 395 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
107f1634
PE
396 if (!pde)
397 goto err3;
398#endif
399 return 0;
400
c7066f70 401#ifdef CONFIG_IP_ROUTE_CLASSID
107f1634
PE
402err3:
403 remove_proc_entry("rt_cache", net->proc_net_stat);
404#endif
405err2:
406 remove_proc_entry("rt_cache", net->proc_net);
407err1:
408 return -ENOMEM;
409}
73b38711
DL
410
/* Per-namespace proc teardown: mirror of ip_rt_do_proc_init. */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
419
/* Hook the proc setup/teardown into network-namespace lifetime. */
static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};
424
/* Register the per-namespace proc entries at boot. */
static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
429
107f1634 430#else
73b38711 431static inline int ip_rt_proc_init(void)
107f1634
PE
432{
433 return 0;
434}
1da177e4 435#endif /* CONFIG_PROC_FS */
e905a9ed 436
4331debc 437static inline bool rt_is_expired(const struct rtable *rth)
e84f84f2 438{
ca4c3fc2 439 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
e84f84f2
DL
440}
441
4ccfe6d4 442void rt_cache_flush(struct net *net)
1da177e4 443{
ca4c3fc2 444 rt_genid_bump_ipv4(net);
98376387
ED
445}
446
f894cbf8
DM
/* dst_ops.neigh_lookup hook: resolve the next hop neighbour for a route.
 * Key preference: the route's gateway if set, else the skb's destination
 * address, else the caller-provided @daddr.  Creates the ARP entry when
 * no cached neighbour exists.
 */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}
467
63fca65d
JA
/* dst_ops.confirm_neigh hook: mark the next-hop neighbour entry as
 * recently confirmed.  Uses the gateway when set; otherwise bails out
 * for multicast/broadcast/local routes or when no @daddr was given,
 * since there is no meaningful unicast neighbour to confirm.
 */
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;

	rt = (const struct rtable *)dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *)&rt->rt_gateway;
	else if (!daddr ||
		 (rt->rt_flags &
		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
		return;

	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}
484
04ca6973 485#define IP_IDENTS_SZ 2048u
04ca6973 486
355b590c
ED
487static atomic_t *ip_idents __read_mostly;
488static u32 *ip_tstamps __read_mostly;
04ca6973
ED
489
490/* In order to protect privacy, we add a perturbation to identifiers
491 * if one generator is seldom used. This makes hard for an attacker
492 * to infer how many packets were sent between two points in time.
493 */
/* Reserve @segs consecutive IP IDs from the bucket selected by @hash.
 * Returns the first reserved ID.  If the bucket was idle (timestamp
 * older than one jiffy) a random perturbation is added so observers
 * cannot count packets sent in between (see comment above).
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = ACCESS_ONCE(*p_tstamp);
	u32 now = (u32)jiffies;
	u32 new, delta = 0;

	/* only the one CPU that wins the timestamp cmpxchg applies a delta */
	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
	do {
		old = (u32)atomic_read(p_id);
		new = old + delta + segs;
	} while (atomic_cmpxchg(p_id, old, new) != old);

	return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
1da177e4 514
b6a7719a 515void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
1da177e4 516{
73f156a6
ED
517 static u32 ip_idents_hashrnd __read_mostly;
518 u32 hash, id;
1da177e4 519
73f156a6 520 net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
1da177e4 521
04ca6973
ED
522 hash = jhash_3words((__force u32)iph->daddr,
523 (__force u32)iph->saddr,
b6a7719a 524 iph->protocol ^ net_hash_mix(net),
04ca6973 525 ip_idents_hashrnd);
73f156a6
ED
526 id = ip_idents_reserve(hash, segs);
527 iph->id = htons(id);
1da177e4 528}
4bc2f18b 529EXPORT_SYMBOL(__ip_select_ident);
1da177e4 530
e2d118a1
LC
/* Build an output flow key from an IP header plus per-packet metadata.
 * When a socket is supplied, its bound device, mark, TOS and protocol
 * override the packet-derived values.
 */
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}
551
5abf7f7e
ED
/* Derive a flow key from a received skb (header fields, incoming
 * device, mark), optionally refined by socket state.
 */
static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}
564
5abf7f7e 565static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
4895c771
DM
566{
567 const struct inet_sock *inet = inet_sk(sk);
5abf7f7e 568 const struct ip_options_rcu *inet_opt;
4895c771
DM
569 __be32 daddr = inet->inet_daddr;
570
571 rcu_read_lock();
572 inet_opt = rcu_dereference(inet->inet_opt);
573 if (inet_opt && inet_opt->opt.srr)
574 daddr = inet_opt->opt.faddr;
575 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
576 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
577 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
578 inet_sk_flowi_flags(sk),
e2d118a1 579 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
4895c771
DM
580 rcu_read_unlock();
581}
582
5abf7f7e
ED
/* Build a flow key from whichever context is available: the packet
 * headers when an skb exists, otherwise the socket's connection state.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb)
		build_sk_flow_key(fl4, sk);
	else
		build_skb_flow_key(fl4, skb, sk);
}
591
c5038a83 592static DEFINE_SPINLOCK(fnhe_lock);
4895c771 593
2ffae99d
TT
/* Drop the input and output routes cached on a next-hop exception:
 * unlink each under RCU, detach it from the device refcounting
 * (dst_dev_put) and release our reference.
 */
static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
}
611
aee06da6 612static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
4895c771
DM
613{
614 struct fib_nh_exception *fnhe, *oldest;
615
616 oldest = rcu_dereference(hash->chain);
617 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
618 fnhe = rcu_dereference(fnhe->fnhe_next)) {
619 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
620 oldest = fnhe;
621 }
2ffae99d 622 fnhe_flush_routes(oldest);
4895c771
DM
623 return oldest;
624}
625
d3a25c98
DM
/* Hash a destination address into the FNHE bucket index, keyed with a
 * boot-time random value so bucket placement is unpredictable.
 */
static inline u32 fnhe_hashfun(__be32 daddr)
{
	static u32 fnhe_hashrnd __read_mostly;
	u32 hval;

	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
	return hash_32(hval, FNHE_HASH_SHIFT);
}
635
387aa65a
TT
/* Copy a next-hop exception's learned state (PMTU, expiry, redirect
 * gateway) into a cached route.
 */
static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_gateway = fnhe->fnhe_gw;
		rt->rt_uses_gateway = 1;
	}
}
647
aee06da6
JA
/* Record a next-hop exception (redirect gateway @gw and/or PMTU @pmtu)
 * for destination @daddr on nexthop @nh.  Updates an existing entry in
 * place, or creates one — recycling the oldest entry when the bucket
 * chain exceeds FNHE_RECLAIM_DEPTH.  All writers serialize on
 * fnhe_lock; readers traverse the chain under RCU.
 */
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	unsigned int i;
	int depth;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	/* allocate the hash table lazily, on the first exception */
	hash = rcu_dereference(nh->nh_exceptions);
	if (!hash) {
		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nh->nh_exceptions, hash);
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		/* existing entry: merge the new gw/pmtu into it */
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_expires = max(1UL, expires);
		}
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			/* publish at the head of the chain */
			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_expires = expires;

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nh->nh_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;
			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}
731
ceb33206
DM
/* Process an ICMP redirect for route @rt.  Validates the advertised
 * gateway (sanity of the address, device policy, on-link check), and
 * only accepts the redirect once the new gateway's neighbour entry is
 * NUD_VALID; the result is stored as a next-hop exception.  When
 * @kill_route is set, the current cached route is marked obsolete so
 * callers re-resolve.
 */
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	/* only the four defined redirect codes are honoured */
	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	/* the redirect must come from our current gateway */
	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
	if (!n)
		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
	if (!IS_ERR(n)) {
		if (!(n->nud_state & NUD_VALID)) {
			/* kick resolution; the redirect is not applied yet */
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh *nh = &FIB_RES_NH(res);

				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						0, jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     " Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}
813
4895c771
DM
/* dst_ops.redirect hook: rebuild the flow key from the skb/socket and
 * apply the ICMP redirect, killing the current cached route.
 */
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct net *net = dev_net(skb->dev);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}
830
1da177e4
LT
/* dst_ops.negative_advice hook: drop our reference and return NULL when
 * the route is obsolete, was learned from a redirect, or carries an
 * expiry — forcing the caller to re-resolve.
 */
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}
848
849/*
850 * Algorithm:
851 * 1. The first ip_rt_redirect_number redirects are sent
852 * with exponential backoff, then we stop sending them at all,
853 * assuming that the host ignores our redirects.
854 * 2. If we did not see packets requiring redirects
855 * during ip_rt_redirect_silence, we assume that the host
856 * forgot redirected route and start to send redirects again.
857 *
858 * This algorithm is much cheaper and more intelligent than dumb load limiting
859 * in icmp.c.
860 *
861 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
862 * and "frag. need" (breaks PMTU discovery) in icmp.c.
863 */
864
/* Send an ICMP redirect for @skb, rate-limited per source host via the
 * inet_peer cache (exponential backoff, silenced after
 * ip_rt_redirect_number attempts — see the algorithm comment above).
 * NOTE(review): peer->rate_tokens serves both as the redirect counter
 * and as the backoff shift here; confirm this dual use is intended.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
	if (!peer) {
		/* no peer state available: send unconditionally */
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}
929
/* Input-path error route handler: account the failure, map dst.error to
 * an ICMP destination-unreachable code, and send it back rate-limited
 * by a token bucket kept in the per-source inet_peer.  Always consumes
 * the skb and returns 0.
 */
static int ip_error(struct sk_buff *skb)
{
	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		/* not forwarding: only count the error, no ICMP reply */
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	send = true;
	if (peer) {
		/* token bucket: refill by elapsed jiffies, cap at the burst,
		 * and spend ip_rt_error_cost per ICMP sent
		 */
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
1da177e4 996
/* Record a learned path MTU for the flow in @fl4 as a nexthop exception,
 * expiring ip_rt_mtu_expires from now.  Bails out when the route's MTU
 * metric is locked, when @mtu would not shrink the effective MTU, or when
 * the same value is already recorded with more than half its lifetime
 * remaining.  @mtu is clamped up to ip_rt_min_pmtu.
 */
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_result res;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	/* Only ever reduce the path MTU. */
	if (ipv4_mtu(dst) < mtu)
		return;

	if (mtu < ip_rt_min_pmtu)
		mtu = ip_rt_min_pmtu;

	/* Same MTU already stored and still fresh: nothing to do. */
	if (rt->rt_pmtu == mtu &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}
1024
4895c771
DM
1025static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1026 struct sk_buff *skb, u32 mtu)
1027{
1028 struct rtable *rt = (struct rtable *) dst;
1029 struct flowi4 fl4;
1030
1031 ip_rt_build_flow_key(&fl4, sk, skb);
d851c12b 1032 __ip_rt_update_pmtu(rt, &fl4, mtu);
4895c771
DM
1033}
1034
36393395
DM
1035void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1036 int oif, u32 mark, u8 protocol, int flow_flags)
1037{
4895c771 1038 const struct iphdr *iph = (const struct iphdr *) skb->data;
36393395
DM
1039 struct flowi4 fl4;
1040 struct rtable *rt;
1041
1b3c61dc
LC
1042 if (!mark)
1043 mark = IP4_REPLY_MARK(net, skb->mark);
1044
e2d118a1 1045 __build_flow_key(net, &fl4, NULL, iph, oif,
4895c771 1046 RT_TOS(iph->tos), protocol, mark, flow_flags);
36393395
DM
1047 rt = __ip_route_output_key(net, &fl4);
1048 if (!IS_ERR(rt)) {
4895c771 1049 __ip_rt_update_pmtu(rt, &fl4, mtu);
36393395
DM
1050 ip_rt_put(rt);
1051 }
1052}
1053EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1054
9cb3a50c 1055static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
36393395 1056{
4895c771
DM
1057 const struct iphdr *iph = (const struct iphdr *) skb->data;
1058 struct flowi4 fl4;
1059 struct rtable *rt;
36393395 1060
e2d118a1 1061 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1b3c61dc
LC
1062
1063 if (!fl4.flowi4_mark)
1064 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1065
4895c771
DM
1066 rt = __ip_route_output_key(sock_net(sk), &fl4);
1067 if (!IS_ERR(rt)) {
1068 __ip_rt_update_pmtu(rt, &fl4, mtu);
1069 ip_rt_put(rt);
1070 }
36393395 1071}
9cb3a50c
SK
1072
/* Socket-aware PMTU update.  If the socket is owned by the user or has no
 * cached route, fall back to a plain lookup; otherwise operate on the
 * socket's cached route, re-resolving it when it has become obsolete, and
 * install any freshly-resolved route back on the socket.  @odst keeps the
 * original cached dst alive (and is released) across the whole operation.
 */
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;	/* true when rt is a route we looked up here */
	struct net *net = sock_net(sk);

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		/* Cached route no longer valid: resolve a fresh one. */
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

	/* The update may have invalidated the route; re-resolve if so. */
	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
f39925db 1126
b42597e2
DM
1127void ipv4_redirect(struct sk_buff *skb, struct net *net,
1128 int oif, u32 mark, u8 protocol, int flow_flags)
1129{
4895c771 1130 const struct iphdr *iph = (const struct iphdr *) skb->data;
b42597e2
DM
1131 struct flowi4 fl4;
1132 struct rtable *rt;
1133
e2d118a1 1134 __build_flow_key(net, &fl4, NULL, iph, oif,
4895c771 1135 RT_TOS(iph->tos), protocol, mark, flow_flags);
b42597e2
DM
1136 rt = __ip_route_output_key(net, &fl4);
1137 if (!IS_ERR(rt)) {
ceb33206 1138 __ip_do_redirect(rt, skb, &fl4, false);
b42597e2
DM
1139 ip_rt_put(rt);
1140 }
1141}
1142EXPORT_SYMBOL_GPL(ipv4_redirect);
1143
1144void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1145{
4895c771
DM
1146 const struct iphdr *iph = (const struct iphdr *) skb->data;
1147 struct flowi4 fl4;
1148 struct rtable *rt;
e2d118a1 1149 struct net *net = sock_net(sk);
b42597e2 1150
e2d118a1
LC
1151 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1152 rt = __ip_route_output_key(net, &fl4);
4895c771 1153 if (!IS_ERR(rt)) {
ceb33206 1154 __ip_do_redirect(rt, skb, &fl4, false);
4895c771
DM
1155 ip_rt_put(rt);
1156 }
b42597e2
DM
1157}
1158EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1159
efbc368d
DM
/* dst_ops check handler: return @dst if it is still usable, NULL when the
 * caller must perform a fresh route lookup.  @cookie is not used here.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD by dst_free().
	 */
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
		return NULL;
	return dst;
}
1176
1da177e4
LT
/* Handle a link failure for @skb: tell the sender the host is
 * unreachable, then force the packet's route (if any) to expire
 * immediately so it is not reused.
 */
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}
1187
/* Sentinel output handler wired into routes that must never transmit
 * (see the rt->dst.output = ip_rt_bug assignments in this file): log the
 * addresses, drop the packet and warn.
 */
static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}
1197
/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by the IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned in IP options!
 */
1206
/* Copy into @addr (possibly unaligned, hence memcpy) the source address
 * our side would use for this route: the packet's own source for output
 * routes, otherwise the FIB preferred source for the reversed flow, or an
 * address selected from the output device when the lookup fails.
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		/* Build the flow key for the reverse (output) direction. */
		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
1239
c7066f70 1240#ifdef CONFIG_IP_ROUTE_CLASSID
1da177e4
LT
1241static void set_class_tag(struct rtable *rt, u32 tag)
1242{
d8d1f30b
CG
1243 if (!(rt->dst.tclassid & 0xFFFF))
1244 rt->dst.tclassid |= tag & 0xFFFF;
1245 if (!(rt->dst.tclassid & 0xFFFF0000))
1246 rt->dst.tclassid |= tag & 0xFFFF0000;
1da177e4
LT
1247}
1248#endif
1249
0dbaee3b
DM
1250static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1251{
7ed14d97
GF
1252 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1253 unsigned int advmss = max_t(unsigned int, dst->dev->mtu - header_size,
1254 ip_rt_min_advmss);
0dbaee3b 1255
7ed14d97 1256 return min(advmss, IPV4_MAX_PMTU - header_size);
0dbaee3b
DM
1257}
1258
/* dst_ops mtu handler: report the effective MTU for this route, in order
 * of preference: an unexpired learned PMTU, the RTAX_MTU metric, or the
 * device MTU (clamped to 576 for locked-MTU gatewayed routes), capped at
 * IP_MAX_MTU and reduced by any lwtunnel encapsulation headroom.
 */
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	/* A learned PMTU is only honoured until it expires. */
	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
1281
f2bb4bed 1282static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
4895c771 1283{
caa41527 1284 struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
4895c771
DM
1285 struct fib_nh_exception *fnhe;
1286 u32 hval;
1287
f2bb4bed
DM
1288 if (!hash)
1289 return NULL;
1290
d3a25c98 1291 hval = fnhe_hashfun(daddr);
4895c771
DM
1292
1293 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1294 fnhe = rcu_dereference(fnhe->fnhe_next)) {
f2bb4bed
DM
1295 if (fnhe->fnhe_daddr == daddr)
1296 return fnhe;
1297 }
1298 return NULL;
1299}
aee06da6 1300
/* Attach @rt to the nexthop exception @fnhe for destination @daddr.
 * Under fnhe_lock: refresh the exception if its genid is stale, copy the
 * exception data (gw/pmtu/expiry) into the route, and, when @do_cache,
 * publish @rt in the exception's input or output cache slot, releasing
 * the route it displaces.  Returns true iff @rt was cached.
 */
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr, const bool do_cache)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		/* Pick the cache slot matching the route's direction. */
		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		/* Generation changed: wipe stale exception state. */
		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		if (do_cache) {
			dst_hold(&rt->dst);
			rcu_assign_pointer(*porig, rt);
			if (orig) {
				dst_dev_put(&orig->dst);
				dst_release(&orig->dst);
			}
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}
1347
/* Try to install @rt as the cached route on @nh (the shared input slot
 * for input routes, this CPU's output slot otherwise).  Uses cmpxchg so
 * a concurrent update loses gracefully: on success the displaced route's
 * references are dropped, on failure our extra hold is undone.  Returns
 * true iff @rt is now cached.
 */
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	/* hold dst before doing cmpxchg() to avoid race condition
	 * on this dst
	 */
	dst_hold(&rt->dst);
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig) {
			dst_dev_put(&orig->dst);
			dst_release(&orig->dst);
		}
	} else {
		/* Lost the race: drop the hold we took above. */
		dst_release(&rt->dst);
		ret = false;
	}

	return ret;
}
1377
5055c371
ED
/* Per-CPU list of routes that could not be (or are not) stored in a
 * nexthop/exception cache slot, so rt_flush_dev() can still find and
 * retarget them when their device goes away.
 */
struct uncached_list {
	spinlock_t lock;
	struct list_head head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
caacf05e 1384
/* Add @rt to this CPU's uncached-route list, remembering which list it
 * went on so rt_del_uncached_list() can find it from any CPU.
 */
void rt_add_uncached_list(struct rtable *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

	rt->rt_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
1395
0fc845c7 1396void rt_del_uncached_list(struct rtable *rt)
caacf05e 1397{
78df76a0 1398 if (!list_empty(&rt->rt_uncached)) {
5055c371
ED
1399 struct uncached_list *ul = rt->rt_uncached_list;
1400
1401 spin_lock_bh(&ul->lock);
caacf05e 1402 list_del(&rt->rt_uncached);
5055c371 1403 spin_unlock_bh(&ul->lock);
caacf05e
DM
1404 }
1405}
1406
0fc845c7
XL
/* dst_ops destroy handler: drop our reference on any shared (refcounted)
 * metrics block and take the route off the uncached list.
 */
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
	struct rtable *rt = (struct rtable *)dst;

	/* dst_default_metrics is static and must never be freed. */
	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
		kfree(p);

	rt_del_uncached_list(rt);
}
1417
caacf05e
DM
/* Walk every CPU's uncached-route list and re-point routes that still
 * reference @dev at the netns loopback device, moving the device
 * reference (dev_hold on loopback, dev_put on @dev) along with it.
 */
void rt_flush_dev(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct rtable *rt;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&ul->lock);
	}
}
1438
4331debc 1439static bool rt_cache_valid(const struct rtable *rt)
d2d68ba9 1440{
4331debc
ED
1441 return rt &&
1442 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1443 !rt_is_expired(rt);
d2d68ba9
DM
1444}
1445
/* Finish constructing @rt from the FIB lookup result: copy gateway,
 * metrics, classid and lwtunnel state from the nexthop, then either bind
 * the route to the exception @fnhe, cache it on the nexthop, or fall back
 * to the uncached list when caching is disabled or fails.
 */
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag,
			   const bool do_cache)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
		if (fi->fib_metrics != &dst_default_metrics) {
			/* Shared metrics block: take a reference. */
			rt->dst._metrics |= DST_METRICS_REFCOUNTED;
			refcount_inc(&fi->fib_metrics->refcnt);
		}
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
		else if (do_cache)
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}
1494
9ab179d8
DA
/* Allocate and zero-initialize an IPv4 rtable bound to @dev.  Flags are
 * translated into dst allocation flags; output defaults to ip_output and
 * input to ip_local_deliver for RTCF_LOCAL routes.  Returns NULL on
 * allocation failure.
 */
struct rtable *rt_dst_alloc(struct net_device *dev,
			    unsigned int flags, u16 type,
			    bool nopolicy, bool noxfrm, bool will_cache)
{
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
		       (will_cache ? 0 : DST_HOST) |
		       (nopolicy ? DST_NOPOLICY : 0) |
		       (noxfrm ? DST_NOXFRM : 0));

	if (rt) {
		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		rt->rt_flags = flags;
		rt->rt_type = type;
		rt->rt_is_input = 0;
		rt->rt_iif = 0;
		rt->rt_pmtu = 0;
		rt->rt_gateway = 0;
		rt->rt_uses_gateway = 0;
		rt->rt_table_id = 0;
		INIT_LIST_HEAD(&rt->rt_uncached);

		rt->dst.output = ip_output;
		if (flags & RTCF_LOCAL)
			rt->dst.input = ip_local_deliver;
	}

	return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);
0c4dcd58 1526
96d36220 1527/* called in rcu_read_lock() section */
3b6ff83a
PA
/* Validate the addresses of a multicast packet received on @dev.
 * Returns 0 when acceptable (with *itag filled by FIB source
 * validation), -EINVAL or the fib_validate_source() error otherwise.
 */
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev,
			  struct in_device *in_dev, u32 *itag)
{
	int err;

	/* Primary sanity checks. */
	if (!in_dev)
		return -EINVAL;

	/* Source must be unicast and the payload must really be IP. */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		return -EINVAL;

	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
		return -EINVAL;

	if (ipv4_is_zeronet(saddr)) {
		/* A zero source is only allowed for local multicast. */
		if (!ipv4_is_local_multicast(daddr))
			return -EINVAL;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, itag);
		if (err < 0)
			return err;
	}
	return 0;
}
1556
/* called in rcu_read_lock() section */
/* Build and attach an input route for a multicast packet: validate the
 * source, then allocate an RTN_MULTICAST route (RTCF_LOCAL when @our),
 * wiring output to the never-transmit handler and input to ip_mr_input
 * when multicast forwarding applies.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	unsigned int flags = RTCF_MULTICAST;
	struct rtable *rth;
	u32 itag = 0;
	int err;

	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
	if (err)
		return err;

	if (our)
		flags |= RTCF_LOCAL;

	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		return -ENOBUFS;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	/* Multicast input routes must never be used for transmit. */
	rth->dst.output = ip_rt_bug;
	rth->rt_is_input = 1;

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;
}
1594
1595
/* Account a packet with a martian (unroutable/forged) source address and,
 * when martian logging is enabled, rate-limitedly dump the addresses plus
 * the link-layer header as the only forensic hint (RFC 1812).
 */
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * RFC1812 recommendation, if source is martian,
		 * the only hint is MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}
1620
deed49df
XL
/* Remove the nexthop exception for @daddr from @nh's exception hash (if
 * present), flushing its cached routes and freeing it after a grace
 * period.  Serialized against other fnhe updates by fnhe_lock.
 */
static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nh->nh_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	/* Walk the chain keeping a pointer to the link to patch. */
	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}
1650
efd85700
TG
1651static void set_lwt_redirect(struct rtable *rth)
1652{
1653 if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1654 rth->dst.lwtstate->orig_output = rth->dst.output;
1655 rth->dst.output = lwtunnel_output;
1656 }
1657
1658 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1659 rth->dst.lwtstate->orig_input = rth->dst.input;
1660 rth->dst.input = lwtunnel_input;
1661 }
1662}
1663
/* called in rcu_read_lock() section */
/* Build (or reuse from the nexthop/exception cache) the forwarding route
 * for an input packet whose FIB lookup result is @res, attach it to @skb
 * and return 0, or a negative errno on validation/allocation failure.
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (!out_dev) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	/* Hairpin forwarding back out the input device may warrant an
	 * ICMP redirect to the sender (err > 0 from source validation).
	 */
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		IPCB(skb)->flags |= IPSKB_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
	if (do_cache) {
		if (fnhe) {
			rth = rcu_dereference(fnhe->fnhe_rth_input);
			/* Expired exception: delete it and fall back to
			 * the plain nexthop cache below.
			 */
			if (rth && rth->dst.expires &&
			    time_after(jiffies, rth->dst.expires)) {
				ip_del_fnhe(&FIB_RES_NH(*res), daddr);
				fnhe = NULL;
			} else {
				goto rt_cache;
			}
		}

		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);

rt_cache:
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_is_input = 1;
	if (res->table)
		rth->rt_table_id = res->table->tb_id;
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
		       do_cache);
	set_lwt_redirect(rth);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
 cleanup:
	return err;
}
1da177e4 1761
79a13159 1762#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* To make ICMP packets follow the right flow, the multipath hash is
 * calculated from the inner IP addresses.
 */
/* Fill @hash_keys with the outer saddr/daddr; for relevant, unfragmented
 * ICMP errors carrying an inner IP header, substitute the inner
 * addresses so the error hashes onto the same path as the original flow.
 */
static void ip_multipath_l3_keys(const struct sk_buff *skb,
				 struct flow_keys *hash_keys)
{
	const struct iphdr *outer_iph = ip_hdr(skb);
	const struct iphdr *inner_iph;
	const struct icmphdr *icmph;
	struct iphdr _inner_iph;
	struct icmphdr _icmph;

	hash_keys->addrs.v4addrs.src = outer_iph->saddr;
	hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
	if (likely(outer_iph->protocol != IPPROTO_ICMP))
		return;

	/* Only the first fragment carries the ICMP header. */
	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
		return;

	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
				   &_icmph);
	if (!icmph)
		return;

	if (icmph->type != ICMP_DEST_UNREACH &&
	    icmph->type != ICMP_REDIRECT &&
	    icmph->type != ICMP_TIME_EXCEEDED &&
	    icmph->type != ICMP_PARAMETERPROB)
		return;

	inner_iph = skb_header_pointer(skb,
				       outer_iph->ihl * 4 + sizeof(_icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		return;
	hash_keys->addrs.v4addrs.src = inner_iph->saddr;
	hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
}
79a13159 1802
bf4e0a3d
NA
/* if skb is set it will be used and fl4 can be NULL */
/* Compute the multipath selection hash for a flow, according to the
 * netns fib_multipath_hash_policy sysctl: 0 = L3 (addresses only, with
 * ICMP inner-header handling), 1 = L4 five-tuple.  Returns a 31-bit
 * hash value.
 */
int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
		       const struct sk_buff *skb)
{
	struct net *net = fi->fib_net;
	struct flow_keys hash_keys;
	u32 mhash;

	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
	case 0:
		/* L3 policy: hash on source/destination address only. */
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
		if (skb) {
			ip_multipath_l3_keys(skb, &hash_keys);
		} else {
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
		}
		break;
	case 1:
		/* L4 policy: hash on the five-tuple. */
		/* skb is currently provided only when forwarding */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;
			memset(&hash_keys, 0, sizeof(hash_keys));
			skb_flow_dissect_flow_keys(skb, &keys, flag);
			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
			hash_keys.ports.src = keys.ports.src;
			hash_keys.ports.dst = keys.ports.dst;
			hash_keys.basic.ip_proto = keys.basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
			hash_keys.ports.src = fl4->fl4_sport;
			hash_keys.ports.dst = fl4->fl4_dport;
			hash_keys.basic.ip_proto = fl4->flowi4_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
EXPORT_SYMBOL_GPL(fib_multipath_hash);
79a13159
PN
1854#endif /* CONFIG_IP_ROUTE_MULTIPATH */
1855
5969f71d
SH
/* Select the nexthop (multipath-aware when configured and the fib_info
 * has more than one nexthop) and build the input route via
 * __mkroute_input().
 */
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1) {
		int h = fib_multipath_hash(res->fi, NULL, skb);

		fib_select_multipath(res, h);
	}
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
1872
1da177e4
LT
1873/*
1874 * NOTE. We drop all the packets that has local source
1875 * addresses, because every properly looped back packet
1876 * must have correct destination already attached by output routine.
1877 *
1878 * Such approach solves two big problems:
1879 * 1. Not simplex devices are handled properly.
1880 * 2. IP spoofing attempts are filtered with 100% of guarantee.
ebc0ffae 1881 * called with rcu_read_lock()
1da177e4
LT
1882 */
1883
9e12bb22 1884static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
5510cdf7
DA
1885 u8 tos, struct net_device *dev,
1886 struct fib_result *res)
1da177e4 1887{
96d36220 1888 struct in_device *in_dev = __in_dev_get_rcu(dev);
1b7179d3 1889 struct ip_tunnel_info *tun_info;
68a5e3dd 1890 struct flowi4 fl4;
95c96174 1891 unsigned int flags = 0;
1da177e4 1892 u32 itag = 0;
95c96174 1893 struct rtable *rth;
1da177e4 1894 int err = -EINVAL;
5e73ea1a 1895 struct net *net = dev_net(dev);
d2d68ba9 1896 bool do_cache;
1da177e4
LT
1897
1898 /* IP on this device is disabled. */
1899
1900 if (!in_dev)
1901 goto out;
1902
1903 /* Check for the most weird martians, which can be not detected
1904 by fib_lookup.
1905 */
1906
61adedf3 1907 tun_info = skb_tunnel_info(skb);
46fa062a 1908 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1b7179d3
TG
1909 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1910 else
1911 fl4.flowi4_tun_key.tun_id = 0;
f38a9eb1
TG
1912 skb_dst_drop(skb);
1913
d0daebc3 1914 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1da177e4
LT
1915 goto martian_source;
1916
5510cdf7
DA
1917 res->fi = NULL;
1918 res->table = NULL;
27a954bd 1919 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1da177e4
LT
1920 goto brd_input;
1921
1922 /* Accept zero addresses only to limited broadcast;
1923 * I even do not know to fix it or not. Waiting for complains :-)
1924 */
f97c1e0c 1925 if (ipv4_is_zeronet(saddr))
1da177e4
LT
1926 goto martian_source;
1927
d0daebc3 1928 if (ipv4_is_zeronet(daddr))
1da177e4
LT
1929 goto martian_destination;
1930
9eb43e76
ED
1931 /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1932 * and call it once if daddr or/and saddr are loopback addresses
1933 */
1934 if (ipv4_is_loopback(daddr)) {
1935 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3 1936 goto martian_destination;
9eb43e76
ED
1937 } else if (ipv4_is_loopback(saddr)) {
1938 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3
TG
1939 goto martian_source;
1940 }
1941
1da177e4
LT
1942 /*
1943 * Now we are ready to route packet.
1944 */
68a5e3dd 1945 fl4.flowi4_oif = 0;
e0d56fdd 1946 fl4.flowi4_iif = dev->ifindex;
68a5e3dd
DM
1947 fl4.flowi4_mark = skb->mark;
1948 fl4.flowi4_tos = tos;
1949 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
b84f7878 1950 fl4.flowi4_flags = 0;
68a5e3dd
DM
1951 fl4.daddr = daddr;
1952 fl4.saddr = saddr;
8bcfd092 1953 fl4.flowi4_uid = sock_net_uid(net, NULL);
5510cdf7 1954 err = fib_lookup(net, &fl4, res, 0);
cd0f0b95
DJ
1955 if (err != 0) {
1956 if (!IN_DEV_FORWARD(in_dev))
1957 err = -EHOSTUNREACH;
1da177e4 1958 goto no_route;
cd0f0b95 1959 }
1da177e4 1960
5510cdf7 1961 if (res->type == RTN_BROADCAST)
1da177e4
LT
1962 goto brd_input;
1963
5510cdf7 1964 if (res->type == RTN_LOCAL) {
5c04c819 1965 err = fib_validate_source(skb, saddr, daddr, tos,
0d5edc68 1966 0, dev, in_dev, &itag);
b5f7e755 1967 if (err < 0)
0d753960 1968 goto martian_source;
1da177e4
LT
1969 goto local_input;
1970 }
1971
cd0f0b95
DJ
1972 if (!IN_DEV_FORWARD(in_dev)) {
1973 err = -EHOSTUNREACH;
251da413 1974 goto no_route;
cd0f0b95 1975 }
5510cdf7 1976 if (res->type != RTN_UNICAST)
1da177e4
LT
1977 goto martian_destination;
1978
5510cdf7 1979 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1da177e4
LT
1980out: return err;
1981
1982brd_input:
1983 if (skb->protocol != htons(ETH_P_IP))
1984 goto e_inval;
1985
41347dcd 1986 if (!ipv4_is_zeronet(saddr)) {
9e56e380
DM
1987 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1988 in_dev, &itag);
1da177e4 1989 if (err < 0)
0d753960 1990 goto martian_source;
1da177e4
LT
1991 }
1992 flags |= RTCF_BROADCAST;
5510cdf7 1993 res->type = RTN_BROADCAST;
1da177e4
LT
1994 RT_CACHE_STAT_INC(in_brd);
1995
1996local_input:
d2d68ba9 1997 do_cache = false;
5510cdf7 1998 if (res->fi) {
fe3edf45 1999 if (!itag) {
5510cdf7 2000 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
d2d68ba9 2001 if (rt_cache_valid(rth)) {
c6cffba4
DM
2002 skb_dst_set_noref(skb, &rth->dst);
2003 err = 0;
2004 goto out;
d2d68ba9
DM
2005 }
2006 do_cache = true;
2007 }
2008 }
2009
f5a0aab8 2010 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
5510cdf7 2011 flags | RTCF_LOCAL, res->type,
d2d68ba9 2012 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1da177e4
LT
2013 if (!rth)
2014 goto e_nobufs;
2015
d8d1f30b 2016 rth->dst.output= ip_rt_bug;
cf911662
DM
2017#ifdef CONFIG_IP_ROUTE_CLASSID
2018 rth->dst.tclassid = itag;
2019#endif
9917e1e8 2020 rth->rt_is_input = 1;
5510cdf7
DA
2021 if (res->table)
2022 rth->rt_table_id = res->table->tb_id;
571e7226 2023
a6254864 2024 RT_CACHE_STAT_INC(in_slow_tot);
5510cdf7 2025 if (res->type == RTN_UNREACHABLE) {
d8d1f30b
CG
2026 rth->dst.input= ip_error;
2027 rth->dst.error= -err;
1da177e4
LT
2028 rth->rt_flags &= ~RTCF_LOCAL;
2029 }
efd85700 2030
dcdfdf56 2031 if (do_cache) {
5510cdf7 2032 struct fib_nh *nh = &FIB_RES_NH(*res);
efd85700
TG
2033
2034 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2035 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2036 WARN_ON(rth->dst.input == lwtunnel_input);
2037 rth->dst.lwtstate->orig_input = rth->dst.input;
2038 rth->dst.input = lwtunnel_input;
2039 }
2040
a4c2fd7f 2041 if (unlikely(!rt_cache_route(nh, rth)))
dcdfdf56 2042 rt_add_uncached_list(rth);
dcdfdf56 2043 }
89aef892 2044 skb_dst_set(skb, &rth->dst);
b23dd4fe 2045 err = 0;
ebc0ffae 2046 goto out;
1da177e4
LT
2047
2048no_route:
2049 RT_CACHE_STAT_INC(in_no_route);
5510cdf7
DA
2050 res->type = RTN_UNREACHABLE;
2051 res->fi = NULL;
2052 res->table = NULL;
1da177e4
LT
2053 goto local_input;
2054
2055 /*
2056 * Do not cache martian addresses: they should be logged (RFC1812)
2057 */
2058martian_destination:
2059 RT_CACHE_STAT_INC(in_martian_dst);
2060#ifdef CONFIG_IP_ROUTE_VERBOSE
e87cc472
JP
2061 if (IN_DEV_LOG_MARTIANS(in_dev))
2062 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2063 &daddr, &saddr, dev->name);
1da177e4 2064#endif
2c2910a4 2065
1da177e4
LT
2066e_inval:
2067 err = -EINVAL;
ebc0ffae 2068 goto out;
1da177e4
LT
2069
2070e_nobufs:
2071 err = -ENOBUFS;
ebc0ffae 2072 goto out;
1da177e4
LT
2073
2074martian_source:
2075 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
ebc0ffae 2076 goto out;
1da177e4
LT
2077}
2078
c6cffba4
DM
2079int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2080 u8 tos, struct net_device *dev)
1da177e4 2081{
5510cdf7
DA
2082 struct fib_result res;
2083 int err;
1da177e4 2084
6e28099d 2085 tos &= IPTOS_RT_MASK;
96d36220 2086 rcu_read_lock();
5510cdf7
DA
2087 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2088 rcu_read_unlock();
96d36220 2089
5510cdf7
DA
2090 return err;
2091}
2092EXPORT_SYMBOL(ip_route_input_noref);
2093
/* called with rcu_read_lock held */
int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		       u8 tos, struct net_device *dev, struct fib_result *res)
{
	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As result the host on multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   comparing with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);
		int our = 0;
		int err = -EINVAL;

		/* Is this host a member of the destination group on the
		 * ingress device?
		 */
		if (in_dev)
			our = ip_check_mc_rcu(in_dev, daddr, saddr,
					      ip_hdr(skb)->protocol);

		/* check l3 master if no match yet */
		if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
			struct in_device *l3_in_dev;

			/* skb->dev points at the l3mdev master here;
			 * retry the membership check against it.
			 */
			l3_in_dev = __in_dev_get_rcu(skb->dev);
			if (l3_in_dev)
				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
						      ip_hdr(skb)->protocol);
		}

		/* Accept if we are a member, or (with multicast routing
		 * compiled in) if the group is routable and the device
		 * forwards multicast.
		 */
		if (our
#ifdef CONFIG_IP_MROUTE
			||
		    (!ipv4_is_local_multicast(daddr) &&
		     IN_DEV_MFORWARD(in_dev))
#endif
		   ) {
			err = ip_route_input_mc(skb, daddr, saddr,
						tos, dev, our);
		}
		return err;
	}

	/* Unicast/broadcast: take the full slow path. */
	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
}
2143
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	/* Build (or fetch from the per-nexthop/fnhe cache) the rtable for
	 * an output route described by @res and the flow key @fl4, to be
	 * transmitted through @dev_out.  Returns the route or an ERR_PTR.
	 */
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	/* Loopback source addresses may only leave through loopback or an
	 * l3 master device, unless route_localnet is enabled.
	 */
	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) &&
		    !(dev_out->flags & IFF_LOOPBACK) &&
		    !netif_is_l3_master(dev_out))
			return ERR_PTR(-EINVAL);

	/* Reclassify by destination address class. */
	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If multicast route do not exist use
		 * default one, but do not gateway in this case.
		 * Yes, it is hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
		   (orig_oif != dev_out->ifindex)) {
		/* For local routes that require a particular output interface
		 * we do not want to cache the result.  Caching the result
		 * causes incorrect behaviour when there are multiple source
		 * addresses on the interface, the end result being that if the
		 * intended recipient is waiting on that interface for the
		 * packet he won't receive it because it will be delivered on
		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
		 * be set to the loopback interface as well.
		 */
		fi = NULL;
	}

	fnhe = NULL;
	/* Only routes backed by a fib_info can use the nexthop caches. */
	do_cache &= fi != NULL;
	if (do_cache) {
		struct rtable __rcu **prth;
		struct fib_nh *nh = &FIB_RES_NH(*res);

		/* Prefer a per-destination exception (PMTU/redirect) entry
		 * if one exists and has not expired.
		 */
		fnhe = find_exception(nh, fl4->daddr);
		if (fnhe) {
			prth = &fnhe->fnhe_rth_output;
			rth = rcu_dereference(*prth);
			if (rth && rth->dst.expires &&
			    time_after(jiffies, rth->dst.expires)) {
				/* Exception expired: drop it and fall back
				 * to the regular per-CPU cache below.
				 */
				ip_del_fnhe(nh, fl4->daddr);
				fnhe = NULL;
			} else {
				goto rt_cache;
			}
		}

		/* FLOWI_FLAG_KNOWN_NH without an on-link gateway means the
		 * caller supplies the nexthop; such routes must not be
		 * cached.
		 */
		if (unlikely(fl4->flowi4_flags &
			     FLOWI_FLAG_KNOWN_NH &&
			     !(nh->nh_gw &&
			       nh->nh_scope == RT_SCOPE_LINK))) {
			do_cache = false;
			goto add;
		}
		prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
		rth = rcu_dereference(*prth);

rt_cache:
		/* Reuse the cached route only if still valid and we can take
		 * a reference on it.
		 */
		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
			return rth;
	}

add:
	rth = rt_dst_alloc(dev_out, flags, type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   do_cache);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->rt_iif = orig_oif ? : 0;
	if (res->table)
		rth->rt_table_id = res->table->tb_id;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			/* Locally destined mcast/bcast still needs a copy on
			 * the wire: ip_mc_output handles the loop-back copy.
			 */
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
	set_lwt_redirect(rth);

	return rth;
}
2278
/*
 * Major route resolver routine.
 */

/* Normalize the flow key (iif, tos, scope) and resolve an output route
 * under rcu_read_lock() via ip_route_output_key_hash_rcu().
 * Returns the route or an ERR_PTR.
 */
struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
					const struct sk_buff *skb)
{
	__u8 tos = RT_FL_TOS(fl4);
	struct fib_result res;
	struct rtable *rth;

	res.tclassid = 0;
	res.fi = NULL;
	res.table = NULL;

	/* Output routes originate locally, hence the loopback iif. */
	fl4->flowi4_iif = LOOPBACK_IFINDEX;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	/* RTO_ONLINK in the tos byte restricts the lookup to on-link scope. */
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
	rcu_read_unlock();

	return rth;
}
EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
/* Core output route resolver; must be called with rcu_read_lock() held.
 * Validates the source address, selects the output device and source
 * address when unspecified, performs the FIB lookup, and finally builds
 * the route via __mkroute_output().  Returns the route or an ERR_PTR.
 */
struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
					    struct fib_result *res,
					    const struct sk_buff *skb)
{
	struct net_device *dev_out = NULL;
	int orig_oif = fl4->flowi4_oif;
	unsigned int flags = 0;
	struct rtable *rth;
	int err = -ENETUNREACH;

	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		/* Martian source addresses are rejected outright. */
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (!dev_out)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic,vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (!dev_out)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		/* Link-local multicast, limited broadcast and IGMP bypass
		 * the FIB: route directly out of the requested device.
		 */
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr) ||
		    fl4->flowi4_proto == IPPROTO_IGMP) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		/* No destination at all: route to ourselves via loopback. */
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = LOOPBACK_IFINDEX;
		res->type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	err = fib_lookup(net, fl4, res, 0);
	if (err) {
		res->fi = NULL;
		res->table = NULL;
		if (fl4->flowi4_oif &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
			/* Apparently, routing tables are wrong. Assume,
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res->type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(err);
		goto out;
	}

	if (res->type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res->fi->fib_prefsrc)
				fl4->saddr = res->fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}

		/* L3 master device is the loopback for that domain */
		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
			net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	/* Multipath/nexthop selection for ordinary unicast routes. */
	fib_select_path(net, res, fl4, skb);

	dev_out = FIB_RES_DEV(*res);
	fl4->flowi4_oif = dev_out->ifindex;


make_route:
	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);

out:
	return rth;
}
d8c97a94 2470
/* dst_ops->check for blackhole routes: always report the dst as invalid
 * (NULL) so cached blackhole entries are never revalidated for reuse.
 */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2475
ebb762f2 2476static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 2477{
618f9bc7
SK
2478 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2479
2480 return mtu ? : dst->dev->mtu;
ec831ea7
RD
2481}
2482
/* dst_ops->update_pmtu for blackhole routes: deliberately a no-op, since
 * a blackhole dst never forwards traffic and keeps no useful PMTU state.
 */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu)
{
}
2487
/* dst_ops->redirect for blackhole routes: deliberately a no-op; ICMP
 * redirects are ignored for a dst that discards everything anyway.
 */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}
2492
/* dst_ops->cow_metrics for blackhole routes: refuse to clone metrics
 * (return NULL), keeping the blackhole dst's metrics read-only.
 */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2498
/* dst_ops vtable for blackhole routes (see ipv4_blackhole_route below):
 * mostly no-op callbacks, reusing the regular advmss/neigh helpers.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.redirect		=	ipv4_rt_blackhole_redirect,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
2509
/* Clone @dst_orig into a "blackhole" route that silently discards all
 * traffic (input and output both set to dst_discard*), preserving the
 * original route's identifying fields.  Consumes the reference on
 * @dst_orig.  Returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *ort = (struct rtable *) dst_orig;
	struct rtable *rt;

	/* DST_OBSOLETE_DEAD: the blackhole dst is born obsolete, so
	 * ipv4_blackhole_dst_check() is consulted on every use.
	 */
	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		new->dev = net->loopback_dev;
		if (new->dev)
			dev_hold(new->dev);

		/* Copy the route identity so callers (e.g. xfrm) still see
		 * the original flow's attributes.
		 */
		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;

		rt->rt_genid = rt_genid_ipv4(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_uses_gateway = ort->rt_uses_gateway;

		INIT_LIST_HEAD(&rt->rt_uncached);
	}

	/* We own the caller's reference on dst_orig; drop it either way. */
	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
2544
/* Resolve an output route for @flp4 and, when a transport protocol is
 * set in the flow, pass the result through the xfrm (IPsec) layer so a
 * transformed route can be substituted.  Returns the route or ERR_PTR.
 */
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    const struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	/* Only flows with a known protocol are candidates for IPsec
	 * policy lookup.
	 */
	if (flp4->flowi4_proto)
		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
							flowi4_to_flowi(flp4),
							sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
2561
/* called with rcu_read_lock held */
/* Serialize the route attached to @skb into an RTM_NEWROUTE netlink
 * message appended to the same @skb.  @dst/@src are the queried
 * addresses, @fl4 the flow used for the lookup.  Returns 0 on success
 * or -EMSGSIZE if the message does not fit.
 */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
			u32 seq)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
	if (!nlh)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4->flowi4_tos;
	/* The legacy one-byte rtm_table field cannot hold ids >= 256;
	 * userspace must read RTA_TABLE in that case.
	 */
	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table_id))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
		r->rtm_flags |= RTCF_DOREDIRECT;

	if (nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_in_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	/* Report the selected preferred source for output routes when it
	 * differs from the queried source.
	 */
	if (!rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway &&
	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	/* Convert the absolute expiry (jiffies) into a remaining delta. */
	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	/* A live PMTU exception overrides the static MTU metric. */
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4->flowi4_mark &&
	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
		goto nla_put_failure;

	if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
	    nla_put_u32(skb, RTA_UID,
			from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
		goto nla_put_failure;

	error = rt->dst.error;

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		/* Forwardable multicast may be resolved through the
		 * multicast routing tables instead of reporting RTA_IIF.
		 */
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 fl4->saddr, fl4->daddr,
						 r, portid);

			if (err <= 0) {
				/* err == 0: ipmr filled the message itself. */
				if (err == 0)
					return 0;
				goto nla_put_failure;
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
				goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2675
/* RTM_GETROUTE handler: perform a one-off route lookup described by the
 * netlink request @nlh and unicast the result back to the requester.
 * An input lookup is done when RTA_IIF is given, otherwise an output
 * lookup.  Returns 0 on success or a negative errno.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct fib_result res = {};
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;
	u32 table_id = RT_TABLE_MAIN;
	kuid_t uid;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
			  extack);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	/* This skb doubles as a fake "received packet" for the lookup and
	 * as the reply message buffer.
	 */
	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	/* Bugfix: need to give ip_route_input enough of an IP header to
	 * not gag.
	 */
	ip_hdr(skb)->protocol = IPPROTO_UDP;
	ip_hdr(skb)->saddr = src;
	ip_hdr(skb)->daddr = dst;

	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;

	rcu_read_lock();

	if (iif) {
		/* Input route query: simulate reception on device @iif. */
		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
					 dev, &res);

		rt = skb_rtable(skb);
		/* A successful lookup may still attach an error route
		 * (e.g. unreachable); surface that as the result.
		 */
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
		else
			skb_dst_set(skb, &rt->dst);
	}

	if (err)
		goto errout_free;

	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = rt->rt_table_id;

	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		/* Caller asked for the matching FIB entry rather than the
		 * resolved (cloned) route.
		 */
		if (!res.fi) {
			err = fib_props[res.type].error;
			if (!err)
				err = -EHOSTUNREACH;
			goto errout_free;
		}
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
				    rt->rt_type, res.prefix, res.prefixlen,
				    fl4.flowi4_tos, res.fi, 0);
	} else {
		err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
	}
	if (err < 0)
		goto errout_free;

	rcu_read_unlock();

	/* rtnl_unicast consumes skb regardless of outcome. */
	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;

errout_free:
	rcu_read_unlock();
	kfree_skb(skb);
	goto errout;
}
2806
/* Flush the routing cache of @in_dev's namespace when its multicast
 * state changes.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}
2811
#ifdef CONFIG_SYSCTL
/* Defaults backing the gc_* sysctl entries in ipv4_route_table below. */
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_gc_elasticity __read_mostly	= 8;
2816
/* Handler for the write-only net.ipv4.route.flush sysctl: any write
 * flushes the namespace's route cache and invalidates all cached
 * nexthop exceptions (fnhe genid bump).  Reads are rejected.
 */
static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	/* The owning netns was stashed in extra1 by sysctl_route_net_init. */
	struct net *net = (struct net *)__ctl->extra1;

	if (write) {
		rt_cache_flush(net);
		fnhe_genid_bump(net);
		return 0;
	}

	return -EINVAL;
}
2831
/* Global (non-per-netns) net.ipv4.route.* sysctl tunables. */
static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Same variable as above, exposed in milliseconds. */
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
39a23e75 2942
39a23e75
DL
/*
 * Per-netns sysctl table exposing /proc/sys/net/ipv4/route/flush.
 * The entry is write-only (mode 0200) and has no .data: writes are
 * handled by ipv4_sysctl_rtcache_flush(), which finds the owning
 * netns through .extra1 (filled in by sysctl_route_net_init()).
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },	/* sentinel */
};
2952
2953static __net_init int sysctl_route_net_init(struct net *net)
2954{
2955 struct ctl_table *tbl;
2956
2957 tbl = ipv4_route_flush_table;
09ad9bc7 2958 if (!net_eq(net, &init_net)) {
39a23e75 2959 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
51456b29 2960 if (!tbl)
39a23e75 2961 goto err_dup;
464dc801
EB
2962
2963 /* Don't export sysctls to unprivileged users */
2964 if (net->user_ns != &init_user_ns)
2965 tbl[0].procname = NULL;
39a23e75
DL
2966 }
2967 tbl[0].extra1 = net;
2968
ec8f23ce 2969 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
51456b29 2970 if (!net->ipv4.route_hdr)
39a23e75
DL
2971 goto err_reg;
2972 return 0;
2973
2974err_reg:
2975 if (tbl != ipv4_route_flush_table)
2976 kfree(tbl);
2977err_dup:
2978 return -ENOMEM;
2979}
2980
2981static __net_exit void sysctl_route_net_exit(struct net *net)
2982{
2983 struct ctl_table *tbl;
2984
2985 tbl = net->ipv4.route_hdr->ctl_table_arg;
2986 unregister_net_sysctl_table(net->ipv4.route_hdr);
2987 BUG_ON(tbl == ipv4_route_flush_table);
2988 kfree(tbl);
2989}
2990
/* Pernet hooks: set up / tear down the route sysctls with each netns. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
1da177e4
LT
2995#endif
2996
/*
 * Per-netns constructor for the route generation counters.
 * rt_genid and fnhe_genid start at zero; dev_addr_genid is seeded with
 * a random value (presumably so its starting point is unpredictable
 * across boots/namespaces -- NOTE(review): confirm rationale).
 * Always returns 0.
 */
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
	return 0;
}
3004
3ee94372
NH
/* Pernet hook: initialize generation counters for each new netns (no exit needed). */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
3008
c3426b47
DM
3009static int __net_init ipv4_inetpeer_init(struct net *net)
3010{
3011 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3012
3013 if (!bp)
3014 return -ENOMEM;
3015 inet_peer_base_init(bp);
3016 net->ipv4.peers = bp;
3017 return 0;
3018}
3019
3020static void __net_exit ipv4_inetpeer_exit(struct net *net)
3021{
3022 struct inet_peer_base *bp = net->ipv4.peers;
3023
3024 net->ipv4.peers = NULL;
56a6b248 3025 inetpeer_invalidate_tree(bp);
c3426b47
DM
3026 kfree(bp);
3027}
3028
/* Pernet hooks: per-netns inet_peer base lifetime management. */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init = ipv4_inetpeer_init,
	.exit = ipv4_inetpeer_exit,
};
9f5e97e5 3033
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-cpu route-classid accounting array (256 entries per cpu,
 * allocated in ip_rt_init()).  Read-mostly after boot.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4 3037
1da177e4
LT
3038int __init ip_rt_init(void)
3039{
424c4b70 3040 int rc = 0;
5055c371 3041 int cpu;
1da177e4 3042
73f156a6
ED
3043 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3044 if (!ip_idents)
3045 panic("IP: failed to allocate ip_idents\n");
3046
3047 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3048
355b590c
ED
3049 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3050 if (!ip_tstamps)
3051 panic("IP: failed to allocate ip_tstamps\n");
3052
5055c371
ED
3053 for_each_possible_cpu(cpu) {
3054 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3055
3056 INIT_LIST_HEAD(&ul->head);
3057 spin_lock_init(&ul->lock);
3058 }
c7066f70 3059#ifdef CONFIG_IP_ROUTE_CLASSID
0dcec8c2 3060 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
1da177e4
LT
3061 if (!ip_rt_acct)
3062 panic("IP: failed to allocate ip_rt_acct\n");
1da177e4
LT
3063#endif
3064
e5d679f3
AD
3065 ipv4_dst_ops.kmem_cachep =
3066 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
20c2df83 3067 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1da177e4 3068
14e50e57
DM
3069 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3070
fc66f95c
ED
3071 if (dst_entries_init(&ipv4_dst_ops) < 0)
3072 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3073
3074 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3075 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3076
89aef892
DM
3077 ipv4_dst_ops.gc_thresh = ~0;
3078 ip_rt_max_size = INT_MAX;
1da177e4 3079
1da177e4
LT
3080 devinet_init();
3081 ip_fib_init();
3082
73b38711 3083 if (ip_rt_proc_init())
058bd4d2 3084 pr_err("Unable to create route proc files\n");
1da177e4
LT
3085#ifdef CONFIG_XFRM
3086 xfrm_init();
703fb94e 3087 xfrm4_init();
1da177e4 3088#endif
c7ac8679 3089 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
63f3444f 3090
39a23e75
DL
3091#ifdef CONFIG_SYSCTL
3092 register_pernet_subsys(&sysctl_route_ops);
3093#endif
3ee94372 3094 register_pernet_subsys(&rt_genid_ops);
c3426b47 3095 register_pernet_subsys(&ipv4_inetpeer_ops);
1da177e4
LT
3096 return rc;
3097}
3098
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	/* Register the static route sysctl table for the initial netns. */
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif