/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *	Marc Boucher		:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

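/* Defaults for the net.ipv4.route.* sysctl knobs (the sysctl table itself
 * is set up further down in this file), e.g.:
 *
 *	sysctl -w net.ipv4.route.min_pmtu=552
 *
 * They are __read_mostly: written rarely by the administrator, but read
 * on many packet paths.
 */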
static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;

static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb);
static void		 ipv4_dst_destroy(struct dst_entry *dst);

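/* IPv4 routes take their metrics from the FIB entry (see rt_set_nexthop()
 * below), so the generic dst layer should never ask us to copy-on-write
 * them; the WARN_ON() documents that this path is not expected to run.
 */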
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family			= AF_INET,
	.check			= ipv4_dst_check,
	.default_advmss		= ipv4_default_advmss,
	.mtu			= ipv4_mtu,
	.cow_metrics		= ipv4_cow_metrics,
	.destroy		= ipv4_dst_destroy,
	.negative_advice	= ipv4_negative_advice,
	.link_failure		= ipv4_link_failure,
	.update_pmtu		= ip_rt_update_pmtu,
	.redirect		= ip_do_redirect,
	.local_out		= __ip_local_out,
	.neigh_lookup		= ipv4_neigh_lookup,
	.confirm_neigh		= ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

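/* Map the 4-bit TOS field of the IPv4 header to a traffic-control priority
 * band.  Callers such as rt_tos2priority() index the table with
 * (tos & 0x1E) >> 1; the ECN_OR_COST() entries correspond to values whose
 * low bit (the old "minimise monetary cost" bit, overlapped by ECN) is set.
 */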
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,

		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
			  &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;

	rt = (const struct rtable *)dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *)&rt->rt_gateway;
	else if (!daddr ||
		 (rt->rt_flags &
		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
		return;

	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = ACCESS_ONCE(*p_tstamp);
	u32 now = (u32)jiffies;
	u32 new, delta = 0;

	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
	do {
		old = (u32)atomic_read(p_id);
		new = old + delta + segs;
	} while (atomic_cmpxchg(p_id, old, new) != old);

	return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

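/* Pick the IP ID for an outgoing datagram: hash daddr, saddr and protocol
 * (mixed with a per-boot secret and the netns hash) to select one of the
 * IP_IDENTS_SZ generators, then reserve 'segs' consecutive IDs so that a
 * GSO packet can stamp each segment it is split into.
 */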
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
	static u32 ip_idents_hashrnd __read_mostly;
	u32 hash, id;

	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

	hash = jhash_3words((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol ^ net_hash_mix(net),
			    ip_idents_hashrnd);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

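/* Build the flowi4 key used to re-look-up a route when ICMP feedback
 * (a redirect or a fragmentation-needed message) updates it.  When a
 * socket is supplied, its bound device, mark, TOS and protocol take
 * precedence over the values recovered from the packet headers.
 */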
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
	rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

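/* Nexthop exceptions (fib_nh_exception) record per-destination state
 * learned from ICMP, that is, a redirected gateway or a path MTU, in a
 * small hash table hanging off the fib_nh.  The helpers below drop the
 * routes cached in an exception, find the oldest entry of a chain for
 * recycling, and hash a destination address into the table.
 */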
static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	fnhe_flush_routes(oldest);
	return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
	static u32 fnhe_hashrnd __read_mostly;
	u32 hval;

	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
	return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_gateway = fnhe->fnhe_gw;
		rt->rt_uses_gateway = 1;
	}
}

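/* Record or refresh the exception for daddr on this nexthop: update the
 * gateway/PMTU of an existing entry in place (together with any routes
 * already bound to it), otherwise allocate a fresh entry, recycling the
 * oldest one once a chain grows past FNHE_RECLAIM_DEPTH.
 */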
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	unsigned int i;
	int depth;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference(nh->nh_exceptions);
	if (!hash) {
		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nh->nh_exceptions, hash);
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_expires = max(1UL, expires);
		}
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_expires = expires;

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nh->nh_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;
			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
	if (!n)
		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
	if (!IS_ERR(n)) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh *nh = &FIB_RES_NH(res);

				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						0, jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct net *net = dev_net(skb->dev);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything.
	 * Set peer->rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}

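/* Input error path: translate the route's error code into an ICMP
 * destination-unreachable message, rate-limited per source address by
 * the inet_peer token bucket (ip_rt_error_cost/ip_rt_error_burst).
 */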
static int ip_error(struct sk_buff *skb)
{
	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

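/* Learn a new path MTU from an ICMP fragmentation-needed message: clamp
 * it to ip_rt_min_pmtu, ignore it when the route's MTU is already at or
 * below the advertised value, and store it as a nexthop exception that
 * expires after ip_rt_mtu_expires.
 */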
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_result res;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (ipv4_mtu(dst) < mtu)
		return;

	if (mtu < ip_rt_min_pmtu)
		mtu = ip_rt_min_pmtu;

	if (rt->rt_pmtu == mtu &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	if (!mark)
		mark = IP4_REPLY_MARK(net, skb->mark);

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!fl4.flowi4_mark)
		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;
	struct net *net = sock_net(sk);

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct net *net = sock_net(sk);

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD by dst_free().
	 */
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
		return NULL;
	return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
	unsigned int advmss = max_t(unsigned int, dst->dev->mtu - header_size,
				    ip_rt_min_advmss);

	return min(advmss, IPV4_MAX_PMTU - header_size);
}

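/* Effective MTU of a route: a still-valid learned PMTU (rt_pmtu) or an
 * explicit RTAX_MTU metric if either is present; otherwise the device
 * MTU, clamped to 576 for locked-MTU routes via a gateway and reduced
 * by any lwtunnel encapsulation headroom.
 */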
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			return fnhe;
	}
	return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr, const bool do_cache)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		if (do_cache) {
			dst_hold(&rt->dst);
			rcu_assign_pointer(*porig, rt);
			if (orig) {
				dst_dev_put(&orig->dst);
				dst_release(&orig->dst);
			}
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}

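/* Cache a route in its nexthop: input routes go into nh_rth_input, output
 * routes into the per-cpu nh_pcpu_rth_output slot.  The cmpxchg() makes
 * the store atomic against concurrent cachers; on success the old route's
 * references are dropped, on failure our own hold is released instead.
 */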
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	/* hold dst before doing cmpxchg() to avoid race condition
	 * on this dst
	 */
	dst_hold(&rt->dst);
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig) {
			dst_dev_put(&orig->dst);
			dst_release(&orig->dst);
		}
	} else {
		dst_release(&rt->dst);
		ret = false;
	}

	return ret;
}

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

	rt->rt_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
	struct rtable *rt = (struct rtable *) dst;

	if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt))
		kfree(p);

	if (!list_empty(&rt->rt_uncached)) {
		struct uncached_list *ul = rt->rt_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&ul->lock);
	}
}

void rt_flush_dev(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct rtable *rt;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&ul->lock);
	}
}

static bool rt_cache_valid(const struct rtable *rt)
{
	return	rt &&
		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
		!rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag,
			   const bool do_cache)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
		if (fi->fib_metrics != &dst_default_metrics) {
			rt->dst._metrics |= DST_METRICS_REFCOUNTED;
			atomic_inc(&fi->fib_metrics->refcnt);
		}
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
		else if (do_cache)
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}

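/* Allocate an IPv4 dst/rtable.  Every route starts with ->obsolete set to
 * DST_OBSOLETE_FORCE_CHK so that ipv4_dst_check() is consulted on every
 * use; DST_HOST is set only for routes we do not intend to cache in the
 * FIB nexthop, and RTCF_LOCAL routes get ip_local_deliver as input hook.
 */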
struct rtable *rt_dst_alloc(struct net_device *dev,
			    unsigned int flags, u16 type,
			    bool nopolicy, bool noxfrm, bool will_cache)
{
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
		       (will_cache ? 0 : DST_HOST) |
		       (nopolicy ? DST_NOPOLICY : 0) |
		       (noxfrm ? DST_NOXFRM : 0));

	if (rt) {
		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		rt->rt_flags = flags;
		rt->rt_type = type;
		rt->rt_is_input = 0;
		rt->rt_iif = 0;
		rt->rt_pmtu = 0;
		rt->rt_gateway = 0;
		rt->rt_uses_gateway = 0;
		rt->rt_table_id = 0;
		INIT_LIST_HEAD(&rt->rt_uncached);

		rt->dst.output = ip_output;
		if (flags & RTCF_LOCAL)
			rt->dst.input = ip_local_deliver;
	}

	return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct rtable *rth;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	unsigned int flags = RTCF_MULTICAST;
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (!in_dev)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto e_err;
	}
	if (our)
		flags |= RTCF_LOCAL;

	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;
	rth->rt_is_input = 1;

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}

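/* Unlink and free the exception for daddr from the nexthop hash, dropping
 * the routes cached in it.  Runs under fnhe_lock, so lookups racing with
 * the removal still see a consistent chain.
 */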
static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nh->nh_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}

static void set_lwt_redirect(struct rtable *rth)
{
	if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
		rth->dst.lwtstate->orig_output = rth->dst.output;
		rth->dst.output = lwtunnel_output;
	}

	if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
		rth->dst.lwtstate->orig_input = rth->dst.input;
		rth->dst.input = lwtunnel_input;
	}
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (!out_dev) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		IPCB(skb)->flags |= IPSKB_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy ARP. DNAT routes are always valid.
		 *
		 * The proxy ARP feature has been extended to allow ARP
		 * replies back on the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
65324144
JDB
1697 if (out_dev == in_dev &&
1698 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1da177e4
LT
1699 err = -EINVAL;
1700 goto cleanup;
1701 }
1702 }
1703
2ffae99d 1704 fnhe = find_exception(&FIB_RES_NH(*res), daddr);
e81da0e1 1705 if (do_cache) {
deed49df 1706 if (fnhe) {
2ffae99d 1707 rth = rcu_dereference(fnhe->fnhe_rth_input);
deed49df
XL
1708 if (rth && rth->dst.expires &&
1709 time_after(jiffies, rth->dst.expires)) {
1710 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1711 fnhe = NULL;
1712 } else {
1713 goto rt_cache;
1714 }
1715 }
1716
1717 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2ffae99d 1718
deed49df 1719rt_cache:
e81da0e1
JA
1720 if (rt_cache_valid(rth)) {
1721 skb_dst_set_noref(skb, &rth->dst);
1722 goto out;
d2d68ba9
DM
1723 }
1724 }
f2bb4bed 1725
d08c4f35 1726 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
5c1e6aa3 1727 IN_DEV_CONF_GET(in_dev, NOPOLICY),
d2d68ba9 1728 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1da177e4
LT
1729 if (!rth) {
1730 err = -ENOBUFS;
1731 goto cleanup;
1732 }
1733
9917e1e8 1734 rth->rt_is_input = 1;
b7503e0c
DA
1735 if (res->table)
1736 rth->rt_table_id = res->table->tb_id;
a6254864 1737 RT_CACHE_STAT_INC(in_slow_tot);
1da177e4 1738
d8d1f30b 1739 rth->dst.input = ip_forward;
1da177e4 1740
a4c2fd7f
WW
1741 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1742 do_cache);
efd85700 1743 set_lwt_redirect(rth);
c6cffba4 1744 skb_dst_set(skb, &rth->dst);
d2d68ba9 1745out:
1da177e4
LT
1746 err = 0;
1747 cleanup:
1da177e4 1748 return err;
e905a9ed 1749}
1da177e4 1750
79a13159 1751#ifdef CONFIG_IP_ROUTE_MULTIPATH
79a13159 1752/* To make ICMP packets follow the right flow, the multipath hash is
bf4e0a3d 1753 * calculated from the inner IP addresses.
79a13159 1754 */
bf4e0a3d
NA
1755static void ip_multipath_l3_keys(const struct sk_buff *skb,
1756 struct flow_keys *hash_keys)
79a13159
PN
1757{
1758 const struct iphdr *outer_iph = ip_hdr(skb);
bf4e0a3d 1759 const struct iphdr *inner_iph;
79a13159
PN
1760 const struct icmphdr *icmph;
1761 struct iphdr _inner_iph;
bf4e0a3d
NA
1762 struct icmphdr _icmph;
1763
1764 hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1765 hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1766 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1767 return;
79a13159
PN
1768
1769 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
bf4e0a3d 1770 return;
79a13159
PN
1771
1772 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1773 &_icmph);
1774 if (!icmph)
bf4e0a3d 1775 return;
79a13159
PN
1776
1777 if (icmph->type != ICMP_DEST_UNREACH &&
1778 icmph->type != ICMP_REDIRECT &&
1779 icmph->type != ICMP_TIME_EXCEEDED &&
bf4e0a3d
NA
1780 icmph->type != ICMP_PARAMETERPROB)
1781 return;
79a13159
PN
1782
1783 inner_iph = skb_header_pointer(skb,
1784 outer_iph->ihl * 4 + sizeof(_icmph),
1785 sizeof(_inner_iph), &_inner_iph);
1786 if (!inner_iph)
bf4e0a3d
NA
1787 return;
1788 hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1789 hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1790}
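/* Editor's sketch (illustrative only): the offsets ip_multipath_l3_keys()
 * hands to skb_header_pointer().  The ICMP header sits right after the
 * outer IP header, and the embedded (inner) IP header of the error right
 * after that; outer_ihl is a made-up name for the outer header's ihl field.
 */
#include <linux/icmp.h>
#include <stddef.h>

static inline size_t toy_inner_iph_offset(unsigned int outer_ihl)
{
        return outer_ihl * 4 + sizeof(struct icmphdr);
}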
79a13159 1791
bf4e0a3d
NA
1792 /* If the skb is set it will be used and fl4 can be NULL. */
1793int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1794 const struct sk_buff *skb)
1795{
1796 struct net *net = fi->fib_net;
1797 struct flow_keys hash_keys;
1798 u32 mhash;
79a13159 1799
bf4e0a3d
NA
1800 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1801 case 0:
1802 memset(&hash_keys, 0, sizeof(hash_keys));
1803 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1804 if (skb) {
1805 ip_multipath_l3_keys(skb, &hash_keys);
1806 } else {
1807 hash_keys.addrs.v4addrs.src = fl4->saddr;
1808 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1809 }
1810 break;
1811 case 1:
1812 /* skb is currently provided only when forwarding */
1813 if (skb) {
1814 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1815 struct flow_keys keys;
1816
1817 /* short-circuit if an L4 hash is already present */
1818 if (skb->l4_hash)
1819 return skb_get_hash_raw(skb) >> 1;
1820 memset(&hash_keys, 0, sizeof(hash_keys));
1821 skb_flow_dissect_flow_keys(skb, &keys, flag);
1822 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1823 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1824 hash_keys.ports.src = keys.ports.src;
1825 hash_keys.ports.dst = keys.ports.dst;
1826 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1827 } else {
1828 memset(&hash_keys, 0, sizeof(hash_keys));
1829 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1830 hash_keys.addrs.v4addrs.src = fl4->saddr;
1831 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1832 hash_keys.ports.src = fl4->fl4_sport;
1833 hash_keys.ports.dst = fl4->fl4_dport;
1834 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1835 }
1836 break;
1837 }
1838 mhash = flow_hash_from_keys(&hash_keys);
79a13159 1839
bf4e0a3d
NA
1840 return mhash >> 1;
1841}
1842EXPORT_SYMBOL_GPL(fib_multipath_hash);
79a13159
PN
1843#endif /* CONFIG_IP_ROUTE_MULTIPATH */
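/* Editor's sketch (not kernel code): how a policy-0 (L3) multipath hash
 * maps a flow onto one of several equal-cost next hops.  toy_l3_hash()
 * stands in for flow_hash_from_keys(); any stable mix of the addresses
 * works.  A modulo spread is shown for simplicity; the real
 * fib_select_multipath() compares the hash against per-nexthop upper
 * bounds instead.
 */
#include <stdint.h>

static uint32_t toy_l3_hash(uint32_t saddr, uint32_t daddr)
{
        uint32_t h = saddr * 2654435761u ^ daddr * 2246822519u;

        h ^= h >> 16;
        return h >> 1;                  /* mirrors the kernel's "mhash >> 1" */
}

static unsigned int toy_select_path(uint32_t saddr, uint32_t daddr,
                                    unsigned int nh_count)
{
        /* The same flow always hashes onto the same next hop. */
        return toy_l3_hash(saddr, daddr) % nh_count;
}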
1844
5969f71d
SH
1845static int ip_mkroute_input(struct sk_buff *skb,
1846 struct fib_result *res,
5969f71d
SH
1847 struct in_device *in_dev,
1848 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1849{
1da177e4 1850#ifdef CONFIG_IP_ROUTE_MULTIPATH
0e884c78 1851 if (res->fi && res->fi->fib_nhs > 1) {
bf4e0a3d 1852 int h = fib_multipath_hash(res->fi, NULL, skb);
0e884c78 1853
0e884c78
PN
1854 fib_select_multipath(res, h);
1855 }
1da177e4
LT
1856#endif
1857
1858 /* create a routing cache entry */
c6cffba4 1859 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1da177e4
LT
1860}
1861
1da177e4
LT
1862/*
1863 * NOTE. We drop all packets that have a local source
1864 * address, because every properly looped-back packet
1865 * must already have the correct destination attached by the output routine.
1866 *
1867 * This approach solves two big problems:
1868 * 1. Non-simplex devices are handled properly.
1869 * 2. IP spoofing attempts are filtered with a 100% guarantee.
ebc0ffae 1870 * called with rcu_read_lock()
1da177e4
LT
1871 */
1872
9e12bb22 1873static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
5510cdf7
DA
1874 u8 tos, struct net_device *dev,
1875 struct fib_result *res)
1da177e4 1876{
96d36220 1877 struct in_device *in_dev = __in_dev_get_rcu(dev);
1b7179d3 1878 struct ip_tunnel_info *tun_info;
68a5e3dd 1879 struct flowi4 fl4;
95c96174 1880 unsigned int flags = 0;
1da177e4 1881 u32 itag = 0;
95c96174 1882 struct rtable *rth;
1da177e4 1883 int err = -EINVAL;
5e73ea1a 1884 struct net *net = dev_net(dev);
d2d68ba9 1885 bool do_cache;
1da177e4
LT
1886
1887 /* IP on this device is disabled. */
1888
1889 if (!in_dev)
1890 goto out;
1891
1892 /* Check for the weirdest martians, which cannot be detected
1893 by fib_lookup.
1894 */
1895
61adedf3 1896 tun_info = skb_tunnel_info(skb);
46fa062a 1897 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1b7179d3
TG
1898 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1899 else
1900 fl4.flowi4_tun_key.tun_id = 0;
f38a9eb1
TG
1901 skb_dst_drop(skb);
1902
d0daebc3 1903 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1da177e4
LT
1904 goto martian_source;
1905
5510cdf7
DA
1906 res->fi = NULL;
1907 res->table = NULL;
27a954bd 1908 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1da177e4
LT
1909 goto brd_input;
1910
1911 /* Accept zero addresses only for limited broadcast;
1912 * I do not even know whether to fix it or not. Waiting for complaints :-)
1913 */
f97c1e0c 1914 if (ipv4_is_zeronet(saddr))
1da177e4
LT
1915 goto martian_source;
1916
d0daebc3 1917 if (ipv4_is_zeronet(daddr))
1da177e4
LT
1918 goto martian_destination;
1919
9eb43e76
ED
1920 /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1921 * calling it at most once when daddr and/or saddr is a loopback address
1922 */
1923 if (ipv4_is_loopback(daddr)) {
1924 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3 1925 goto martian_destination;
9eb43e76
ED
1926 } else if (ipv4_is_loopback(saddr)) {
1927 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3
TG
1928 goto martian_source;
1929 }
1930
1da177e4
LT
1931 /*
1932 * Now we are ready to route packet.
1933 */
68a5e3dd 1934 fl4.flowi4_oif = 0;
e0d56fdd 1935 fl4.flowi4_iif = dev->ifindex;
68a5e3dd
DM
1936 fl4.flowi4_mark = skb->mark;
1937 fl4.flowi4_tos = tos;
1938 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
b84f7878 1939 fl4.flowi4_flags = 0;
68a5e3dd
DM
1940 fl4.daddr = daddr;
1941 fl4.saddr = saddr;
8bcfd092 1942 fl4.flowi4_uid = sock_net_uid(net, NULL);
5510cdf7 1943 err = fib_lookup(net, &fl4, res, 0);
cd0f0b95
DJ
1944 if (err != 0) {
1945 if (!IN_DEV_FORWARD(in_dev))
1946 err = -EHOSTUNREACH;
1da177e4 1947 goto no_route;
cd0f0b95 1948 }
1da177e4 1949
5510cdf7 1950 if (res->type == RTN_BROADCAST)
1da177e4
LT
1951 goto brd_input;
1952
5510cdf7 1953 if (res->type == RTN_LOCAL) {
5c04c819 1954 err = fib_validate_source(skb, saddr, daddr, tos,
0d5edc68 1955 0, dev, in_dev, &itag);
b5f7e755 1956 if (err < 0)
0d753960 1957 goto martian_source;
1da177e4
LT
1958 goto local_input;
1959 }
1960
cd0f0b95
DJ
1961 if (!IN_DEV_FORWARD(in_dev)) {
1962 err = -EHOSTUNREACH;
251da413 1963 goto no_route;
cd0f0b95 1964 }
5510cdf7 1965 if (res->type != RTN_UNICAST)
1da177e4
LT
1966 goto martian_destination;
1967
5510cdf7 1968 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1da177e4
LT
1969out: return err;
1970
1971brd_input:
1972 if (skb->protocol != htons(ETH_P_IP))
1973 goto e_inval;
1974
41347dcd 1975 if (!ipv4_is_zeronet(saddr)) {
9e56e380
DM
1976 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1977 in_dev, &itag);
1da177e4 1978 if (err < 0)
0d753960 1979 goto martian_source;
1da177e4
LT
1980 }
1981 flags |= RTCF_BROADCAST;
5510cdf7 1982 res->type = RTN_BROADCAST;
1da177e4
LT
1983 RT_CACHE_STAT_INC(in_brd);
1984
1985local_input:
d2d68ba9 1986 do_cache = false;
5510cdf7 1987 if (res->fi) {
fe3edf45 1988 if (!itag) {
5510cdf7 1989 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
d2d68ba9 1990 if (rt_cache_valid(rth)) {
c6cffba4
DM
1991 skb_dst_set_noref(skb, &rth->dst);
1992 err = 0;
1993 goto out;
d2d68ba9
DM
1994 }
1995 do_cache = true;
1996 }
1997 }
1998
f5a0aab8 1999 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
5510cdf7 2000 flags | RTCF_LOCAL, res->type,
d2d68ba9 2001 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1da177e4
LT
2002 if (!rth)
2003 goto e_nobufs;
2004
d8d1f30b 2005 rth->dst.output = ip_rt_bug;
cf911662
DM
2006#ifdef CONFIG_IP_ROUTE_CLASSID
2007 rth->dst.tclassid = itag;
2008#endif
9917e1e8 2009 rth->rt_is_input = 1;
5510cdf7
DA
2010 if (res->table)
2011 rth->rt_table_id = res->table->tb_id;
571e7226 2012
a6254864 2013 RT_CACHE_STAT_INC(in_slow_tot);
5510cdf7 2014 if (res->type == RTN_UNREACHABLE) {
d8d1f30b
CG
2015 rth->dst.input = ip_error;
2016 rth->dst.error = -err;
1da177e4
LT
2017 rth->rt_flags &= ~RTCF_LOCAL;
2018 }
efd85700 2019
dcdfdf56 2020 if (do_cache) {
5510cdf7 2021 struct fib_nh *nh = &FIB_RES_NH(*res);
efd85700
TG
2022
2023 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2024 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2025 WARN_ON(rth->dst.input == lwtunnel_input);
2026 rth->dst.lwtstate->orig_input = rth->dst.input;
2027 rth->dst.input = lwtunnel_input;
2028 }
2029
a4c2fd7f 2030 if (unlikely(!rt_cache_route(nh, rth)))
dcdfdf56 2031 rt_add_uncached_list(rth);
dcdfdf56 2032 }
89aef892 2033 skb_dst_set(skb, &rth->dst);
b23dd4fe 2034 err = 0;
ebc0ffae 2035 goto out;
1da177e4
LT
2036
2037no_route:
2038 RT_CACHE_STAT_INC(in_no_route);
5510cdf7
DA
2039 res->type = RTN_UNREACHABLE;
2040 res->fi = NULL;
2041 res->table = NULL;
1da177e4
LT
2042 goto local_input;
2043
2044 /*
2045 * Do not cache martian addresses: they should be logged (RFC1812)
2046 */
2047martian_destination:
2048 RT_CACHE_STAT_INC(in_martian_dst);
2049#ifdef CONFIG_IP_ROUTE_VERBOSE
e87cc472
JP
2050 if (IN_DEV_LOG_MARTIANS(in_dev))
2051 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2052 &daddr, &saddr, dev->name);
1da177e4 2053#endif
2c2910a4 2054
1da177e4
LT
2055e_inval:
2056 err = -EINVAL;
ebc0ffae 2057 goto out;
1da177e4
LT
2058
2059e_nobufs:
2060 err = -ENOBUFS;
ebc0ffae 2061 goto out;
1da177e4
LT
2062
2063martian_source:
2064 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
ebc0ffae 2065 goto out;
1da177e4
LT
2066}
2067
c6cffba4
DM
2068int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2069 u8 tos, struct net_device *dev)
1da177e4 2070{
5510cdf7
DA
2071 struct fib_result res;
2072 int err;
1da177e4 2073
6e28099d 2074 tos &= IPTOS_RT_MASK;
96d36220 2075 rcu_read_lock();
5510cdf7
DA
2076 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2077 rcu_read_unlock();
96d36220 2078
5510cdf7
DA
2079 return err;
2080}
2081EXPORT_SYMBOL(ip_route_input_noref);
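/* Editor's sketch (illustrative caller, cf. ip_rcv_finish()): how the
 * receive path typically invokes ip_route_input_noref().  skb, iph and
 * dev are assumed to come from the packet being processed;
 * toy_route_incoming() is a made-up name.
 */
static int toy_route_incoming(struct sk_buff *skb, const struct iphdr *iph,
                              struct net_device *dev)
{
        int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
                                       iph->tos, dev);

        if (err)
                kfree_skb(skb);         /* no route: drop the packet */
        return err;
}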
2082
2083/* called with rcu_read_lock held */
2084int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2085 u8 tos, struct net_device *dev, struct fib_result *res)
2086{
1da177e4
LT
2087 /* Multicast recognition logic is moved from the route cache to here.
2088 The problem was that too many Ethernet cards have broken/missing
2089 hardware multicast filters :-( As a result, a host on a multicast
2090 network acquires a lot of useless route cache entries, e.g. for
2091 SDR messages from all over the world. Now we try to get rid of them.
2092 Really, provided the software IP multicast filter is organized
2093 reasonably (at least, hashed), it does not result in a slowdown
2094 compared with route cache reject entries.
2095 Note that multicast routers are not affected, because
2096 a route cache entry is created eventually.
2097 */
f97c1e0c 2098 if (ipv4_is_multicast(daddr)) {
96d36220 2099 struct in_device *in_dev = __in_dev_get_rcu(dev);
e58e4159 2100 int our = 0;
5510cdf7 2101 int err = -EINVAL;
1da177e4 2102
e58e4159
DA
2103 if (in_dev)
2104 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2105 ip_hdr(skb)->protocol);
2106
2107 /* check l3 master if no match yet */
2108 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2109 struct in_device *l3_in_dev;
2110
2111 l3_in_dev = __in_dev_get_rcu(skb->dev);
2112 if (l3_in_dev)
2113 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2114 ip_hdr(skb)->protocol);
2115 }
2116
e58e4159 2117 if (our
1da177e4 2118#ifdef CONFIG_IP_MROUTE
e58e4159
DA
2119 ||
2120 (!ipv4_is_local_multicast(daddr) &&
2121 IN_DEV_MFORWARD(in_dev))
1da177e4 2122#endif
e58e4159 2123 ) {
5510cdf7 2124 err = ip_route_input_mc(skb, daddr, saddr,
e58e4159 2125 tos, dev, our);
1da177e4 2126 }
5510cdf7 2127 return err;
1da177e4 2128 }
5510cdf7
DA
2129
2130 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
1da177e4
LT
2131}
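/* Editor's note: the ipv4_is_*() helpers used above are simple prefix
 * tests from include/linux/in.h, e.g.
 *
 *   ipv4_is_multicast(a):       (a & htonl(0xf0000000)) == htonl(0xe0000000)
 *   ipv4_is_local_multicast(a): (a & htonl(0xffffff00)) == htonl(0xe0000000)
 *
 * i.e. 224.0.0.0/4 and 224.0.0.0/24 respectively.
 */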
2132
ebc0ffae 2133/* called with rcu_read_lock() */
982721f3 2134static struct rtable *__mkroute_output(const struct fib_result *res,
1a00fee4 2135 const struct flowi4 *fl4, int orig_oif,
f61759e6 2136 struct net_device *dev_out,
5ada5527 2137 unsigned int flags)
1da177e4 2138{
982721f3 2139 struct fib_info *fi = res->fi;
f2bb4bed 2140 struct fib_nh_exception *fnhe;
5ada5527 2141 struct in_device *in_dev;
982721f3 2142 u16 type = res->type;
5ada5527 2143 struct rtable *rth;
c92b9655 2144 bool do_cache;
1da177e4 2145
d0daebc3
TG
2146 in_dev = __in_dev_get_rcu(dev_out);
2147 if (!in_dev)
5ada5527 2148 return ERR_PTR(-EINVAL);
1da177e4 2149
d0daebc3 2150 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
5f02ce24
DA
2151 if (ipv4_is_loopback(fl4->saddr) &&
2152 !(dev_out->flags & IFF_LOOPBACK) &&
2153 !netif_is_l3_master(dev_out))
d0daebc3
TG
2154 return ERR_PTR(-EINVAL);
2155
68a5e3dd 2156 if (ipv4_is_lbcast(fl4->daddr))
982721f3 2157 type = RTN_BROADCAST;
68a5e3dd 2158 else if (ipv4_is_multicast(fl4->daddr))
982721f3 2159 type = RTN_MULTICAST;
68a5e3dd 2160 else if (ipv4_is_zeronet(fl4->daddr))
5ada5527 2161 return ERR_PTR(-EINVAL);
1da177e4
LT
2162
2163 if (dev_out->flags & IFF_LOOPBACK)
2164 flags |= RTCF_LOCAL;
2165
63617421 2166 do_cache = true;
982721f3 2167 if (type == RTN_BROADCAST) {
1da177e4 2168 flags |= RTCF_BROADCAST | RTCF_LOCAL;
982721f3
DM
2169 fi = NULL;
2170 } else if (type == RTN_MULTICAST) {
dd28d1a0 2171 flags |= RTCF_MULTICAST | RTCF_LOCAL;
813b3b5d
DM
2172 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2173 fl4->flowi4_proto))
1da177e4 2174 flags &= ~RTCF_LOCAL;
63617421
JA
2175 else
2176 do_cache = false;
1da177e4 2177 /* If a multicast route does not exist, use the
dd28d1a0
ED
2178 * default one, but do not gateway in this case.
2179 * Yes, it is a hack.
1da177e4 2180 */
982721f3
DM
2181 if (fi && res->prefixlen < 4)
2182 fi = NULL;
d6d5e999
CF
2183 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2184 (orig_oif != dev_out->ifindex)) {
2185 /* For local routes that require a particular output interface
2186 * we do not want to cache the result. Caching the result
2187 * causes incorrect behaviour when there are multiple source
2188 * addresses on the interface: if the intended recipient is
2189 * waiting on that interface for the packet, it never arrives,
2190 * because it is delivered on the loopback interface instead,
2191 * and the IP_PKTINFO ipi_ifindex is set to the loopback
2192 * interface as well.
2193 */
2194 fi = NULL;
1da177e4
LT
2195 }
2196
f2bb4bed 2197 fnhe = NULL;
63617421
JA
2198 do_cache &= fi != NULL;
2199 if (do_cache) {
c5038a83 2200 struct rtable __rcu **prth;
c92b9655 2201 struct fib_nh *nh = &FIB_RES_NH(*res);
d26b3a7c 2202
c92b9655 2203 fnhe = find_exception(nh, fl4->daddr);
deed49df 2204 if (fnhe) {
2ffae99d 2205 prth = &fnhe->fnhe_rth_output;
deed49df
XL
2206 rth = rcu_dereference(*prth);
2207 if (rth && rth->dst.expires &&
2208 time_after(jiffies, rth->dst.expires)) {
2209 ip_del_fnhe(nh, fl4->daddr);
2210 fnhe = NULL;
2211 } else {
2212 goto rt_cache;
c92b9655 2213 }
c92b9655 2214 }
deed49df
XL
2215
2216 if (unlikely(fl4->flowi4_flags &
2217 FLOWI_FLAG_KNOWN_NH &&
2218 !(nh->nh_gw &&
2219 nh->nh_scope == RT_SCOPE_LINK))) {
2220 do_cache = false;
2221 goto add;
2222 }
2223 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
c5038a83 2224 rth = rcu_dereference(*prth);
deed49df
XL
2225
2226rt_cache:
9df16efa 2227 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
c5038a83 2228 return rth;
f2bb4bed 2229 }
c92b9655
JA
2230
2231add:
d08c4f35 2232 rth = rt_dst_alloc(dev_out, flags, type,
5c1e6aa3 2233 IN_DEV_CONF_GET(in_dev, NOPOLICY),
f2bb4bed 2234 IN_DEV_CONF_GET(in_dev, NOXFRM),
c92b9655 2235 do_cache);
8391d07b 2236 if (!rth)
5ada5527 2237 return ERR_PTR(-ENOBUFS);
8391d07b 2238
13378cad 2239 rth->rt_iif = orig_oif ? : 0;
b7503e0c
DA
2240 if (res->table)
2241 rth->rt_table_id = res->table->tb_id;
2242
1da177e4
LT
2243 RT_CACHE_STAT_INC(out_slow_tot);
2244
1da177e4 2245 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
e905a9ed 2246 if (flags & RTCF_LOCAL &&
1da177e4 2247 !(dev_out->flags & IFF_LOOPBACK)) {
d8d1f30b 2248 rth->dst.output = ip_mc_output;
1da177e4
LT
2249 RT_CACHE_STAT_INC(out_slow_mc);
2250 }
2251#ifdef CONFIG_IP_MROUTE
982721f3 2252 if (type == RTN_MULTICAST) {
1da177e4 2253 if (IN_DEV_MFORWARD(in_dev) &&
813b3b5d 2254 !ipv4_is_local_multicast(fl4->daddr)) {
d8d1f30b
CG
2255 rth->dst.input = ip_mr_input;
2256 rth->dst.output = ip_mc_output;
1da177e4
LT
2257 }
2258 }
2259#endif
2260 }
2261
a4c2fd7f 2262 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
efd85700 2263 set_lwt_redirect(rth);
1da177e4 2264
5ada5527 2265 return rth;
1da177e4
LT
2266}
2267
1da177e4
LT
2268/*
2269 * Major route resolver routine.
2270 */
2271
3abd1ade
DA
2272struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2273 const struct sk_buff *skb)
1da177e4 2274{
f61759e6 2275 __u8 tos = RT_FL_TOS(fl4);
813b3b5d 2276 struct fib_result res;
5ada5527 2277 struct rtable *rth;
1da177e4 2278
85b91b03 2279 res.tclassid = 0;
1da177e4 2280 res.fi = NULL;
8b96d22d 2281 res.table = NULL;
1da177e4 2282
1fb9489b 2283 fl4->flowi4_iif = LOOPBACK_IFINDEX;
813b3b5d
DM
2284 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2285 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2286 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
44713b67 2287
010c2708 2288 rcu_read_lock();
3abd1ade
DA
2289 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2290 rcu_read_unlock();
2291
2292 return rth;
2293}
2294EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
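/* Editor's sketch (illustrative only): a minimal output-route lookup
 * through the resolver above.  The destination address and protocol are
 * arbitrary; toy_lookup_output() is a made-up name.
 */
static int toy_lookup_output(struct net *net)
{
        struct flowi4 fl4 = {
                .daddr        = htonl(0xc0a80001),      /* 192.168.0.1 */
                .flowi4_proto = IPPROTO_UDP,
        };
        struct rtable *rt = ip_route_output_key_hash(net, &fl4, NULL);

        if (IS_ERR(rt))
                return PTR_ERR(rt);
        /* ... transmit via rt->dst ... */
        ip_rt_put(rt);
        return 0;
}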
2295
2296struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2297 struct fib_result *res,
2298 const struct sk_buff *skb)
2299{
2300 struct net_device *dev_out = NULL;
2301 int orig_oif = fl4->flowi4_oif;
2302 unsigned int flags = 0;
2303 struct rtable *rth;
2304 int err = -ENETUNREACH;
2305
813b3b5d 2306 if (fl4->saddr) {
b23dd4fe 2307 rth = ERR_PTR(-EINVAL);
813b3b5d
DM
2308 if (ipv4_is_multicast(fl4->saddr) ||
2309 ipv4_is_lbcast(fl4->saddr) ||
2310 ipv4_is_zeronet(fl4->saddr))
1da177e4
LT
2311 goto out;
2312
1da177e4
LT
2313 /* I removed the check for oif == dev_out->oif here.
2314 It was wrong for two reasons:
1ab35276
DL
2315 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2316 is assigned to multiple interfaces.
1da177e4
LT
2317 2. Moreover, we are allowed to send packets with the saddr
2318 of another iface. --ANK
2319 */
2320
813b3b5d
DM
2321 if (fl4->flowi4_oif == 0 &&
2322 (ipv4_is_multicast(fl4->daddr) ||
2323 ipv4_is_lbcast(fl4->daddr))) {
a210d01a 2324 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2325 dev_out = __ip_dev_find(net, fl4->saddr, false);
51456b29 2326 if (!dev_out)
a210d01a
JA
2327 goto out;
2328
1da177e4
LT
2329 /* Special hack: the user can direct multicasts
2330 and limited broadcasts via the necessary interface
2331 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2332 This hack is not just for fun, it allows
2333 vic, vat and friends to work.
2334 They bind a socket to loopback, set the ttl to zero
2335 and expect that it will work.
2336 From the viewpoint of the routing cache they are broken,
2337 because we are not allowed to build a multicast path
2338 with a loopback source addr (look, the routing cache
2339 cannot know that the ttl is zero, so the packet
2340 will not leave this host and the route is valid).
2341 Luckily, this hack is a good workaround.
2342 */
2343
813b3b5d 2344 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2345 goto make_route;
2346 }
a210d01a 2347
813b3b5d 2348 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
a210d01a 2349 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2350 if (!__ip_dev_find(net, fl4->saddr, false))
a210d01a 2351 goto out;
a210d01a 2352 }
1da177e4
LT
2353 }
2354
2355
813b3b5d
DM
2356 if (fl4->flowi4_oif) {
2357 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
b23dd4fe 2358 rth = ERR_PTR(-ENODEV);
51456b29 2359 if (!dev_out)
1da177e4 2360 goto out;
e5ed6399
HX
2361
2362 /* RACE: Check return value of inet_select_addr instead. */
fc75fc83 2363 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
b23dd4fe 2364 rth = ERR_PTR(-ENETUNREACH);
fc75fc83
ED
2365 goto out;
2366 }
813b3b5d 2367 if (ipv4_is_local_multicast(fl4->daddr) ||
6a211654
AL
2368 ipv4_is_lbcast(fl4->daddr) ||
2369 fl4->flowi4_proto == IPPROTO_IGMP) {
813b3b5d
DM
2370 if (!fl4->saddr)
2371 fl4->saddr = inet_select_addr(dev_out, 0,
2372 RT_SCOPE_LINK);
1da177e4
LT
2373 goto make_route;
2374 }
0a7e2260 2375 if (!fl4->saddr) {
813b3b5d
DM
2376 if (ipv4_is_multicast(fl4->daddr))
2377 fl4->saddr = inet_select_addr(dev_out, 0,
2378 fl4->flowi4_scope);
2379 else if (!fl4->daddr)
2380 fl4->saddr = inet_select_addr(dev_out, 0,
2381 RT_SCOPE_HOST);
1da177e4
LT
2382 }
2383 }
2384
813b3b5d
DM
2385 if (!fl4->daddr) {
2386 fl4->daddr = fl4->saddr;
2387 if (!fl4->daddr)
2388 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
b40afd0e 2389 dev_out = net->loopback_dev;
1fb9489b 2390 fl4->flowi4_oif = LOOPBACK_IFINDEX;
3abd1ade 2391 res->type = RTN_LOCAL;
1da177e4
LT
2392 flags |= RTCF_LOCAL;
2393 goto make_route;
2394 }
2395
3abd1ade 2396 err = fib_lookup(net, fl4, res, 0);
0315e382 2397 if (err) {
3abd1ade
DA
2398 res->fi = NULL;
2399 res->table = NULL;
6104e112 2400 if (fl4->flowi4_oif &&
e58e4159
DA
2401 (ipv4_is_multicast(fl4->daddr) ||
2402 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
1da177e4
LT
2403 /* Apparently, the routing tables are wrong. Assume
2404 that the destination is on-link.
2405
2406 WHY? DW.
2407 Because we are allowed to send to an iface
2408 even if it has NO routes and NO assigned
2409 addresses. When an oif is specified, the routing
2410 tables are looked up with only one purpose:
2411 to catch whether the destination is gatewayed, rather than
2412 direct. Moreover, if MSG_DONTROUTE is set,
2413 we send the packet, ignoring both the routing tables
2414 and the ifaddr state. --ANK
2415
2416
2417 We could do this even if the oif is unknown,
2418 as IPv6 likely does, but we do not.
2419 */
2420
813b3b5d
DM
2421 if (fl4->saddr == 0)
2422 fl4->saddr = inet_select_addr(dev_out, 0,
2423 RT_SCOPE_LINK);
3abd1ade 2424 res->type = RTN_UNICAST;
1da177e4
LT
2425 goto make_route;
2426 }
0315e382 2427 rth = ERR_PTR(err);
1da177e4
LT
2428 goto out;
2429 }
1da177e4 2430
3abd1ade 2431 if (res->type == RTN_LOCAL) {
813b3b5d 2432 if (!fl4->saddr) {
3abd1ade
DA
2433 if (res->fi->fib_prefsrc)
2434 fl4->saddr = res->fi->fib_prefsrc;
9fc3bbb4 2435 else
813b3b5d 2436 fl4->saddr = fl4->daddr;
9fc3bbb4 2437 }
5f02ce24
DA
2438
2439 /* L3 master device is the loopback for that domain */
3abd1ade 2440 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
b7c8487c 2441 net->loopback_dev;
813b3b5d 2442 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2443 flags |= RTCF_LOCAL;
2444 goto make_route;
2445 }
2446
3abd1ade 2447 fib_select_path(net, res, fl4, skb);
1da177e4 2448
3abd1ade 2449 dev_out = FIB_RES_DEV(*res);
813b3b5d 2450 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2451
2452
2453make_route:
3abd1ade 2454 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
1da177e4 2455
010c2708 2456out:
b23dd4fe 2457 return rth;
1da177e4 2458}
d8c97a94 2459
ae2688d5
JW
2460static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2461{
2462 return NULL;
2463}
2464
ebb762f2 2465static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 2466{
618f9bc7
SK
2467 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2468
2469 return mtu ? : dst->dev->mtu;
ec831ea7
RD
2470}
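/* Editor's note: "mtu ? : dst->dev->mtu" above is the GNU C two-operand
 * conditional, equivalent to "mtu ? mtu : dst->dev->mtu" but evaluating
 * mtu only once: fall back to the device MTU when no RTAX_MTU metric is
 * set.
 */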
2471
6700c270
DM
2472static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2473 struct sk_buff *skb, u32 mtu)
14e50e57
DM
2474{
2475}
2476
6700c270
DM
2477static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2478 struct sk_buff *skb)
b587ee3b
DM
2479{
2480}
2481
0972ddb2
HB
2482static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2483 unsigned long old)
2484{
2485 return NULL;
2486}
2487
14e50e57
DM
2488static struct dst_ops ipv4_dst_blackhole_ops = {
2489 .family = AF_INET,
ae2688d5 2490 .check = ipv4_blackhole_dst_check,
ebb762f2 2491 .mtu = ipv4_blackhole_mtu,
214f45c9 2492 .default_advmss = ipv4_default_advmss,
14e50e57 2493 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
b587ee3b 2494 .redirect = ipv4_rt_blackhole_redirect,
0972ddb2 2495 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
d3aaeb38 2496 .neigh_lookup = ipv4_neigh_lookup,
14e50e57
DM
2497};
2498
2774c131 2499struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
14e50e57 2500{
2774c131 2501 struct rtable *ort = (struct rtable *) dst_orig;
f5b0a874 2502 struct rtable *rt;
14e50e57 2503
b2a9c0ed 2504 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
14e50e57 2505 if (rt) {
d8d1f30b 2506 struct dst_entry *new = &rt->dst;
14e50e57 2507
14e50e57 2508 new->__use = 1;
352e512c 2509 new->input = dst_discard;
ede2059d 2510 new->output = dst_discard_out;
14e50e57 2511
1dbe3252 2512 new->dev = net->loopback_dev;
14e50e57
DM
2513 if (new->dev)
2514 dev_hold(new->dev);
2515
9917e1e8 2516 rt->rt_is_input = ort->rt_is_input;
5e2b61f7 2517 rt->rt_iif = ort->rt_iif;
5943634f 2518 rt->rt_pmtu = ort->rt_pmtu;
14e50e57 2519
ca4c3fc2 2520 rt->rt_genid = rt_genid_ipv4(net);
14e50e57
DM
2521 rt->rt_flags = ort->rt_flags;
2522 rt->rt_type = ort->rt_type;
14e50e57 2523 rt->rt_gateway = ort->rt_gateway;
155e8336 2524 rt->rt_uses_gateway = ort->rt_uses_gateway;
14e50e57 2525
caacf05e 2526 INIT_LIST_HEAD(&rt->rt_uncached);
14e50e57
DM
2527 }
2528
2774c131
DM
2529 dst_release(dst_orig);
2530
2531 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
14e50e57
DM
2532}
2533
9d6ec938 2534struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
6f9c9615 2535 const struct sock *sk)
1da177e4 2536{
9d6ec938 2537 struct rtable *rt = __ip_route_output_key(net, flp4);
1da177e4 2538
b23dd4fe
DM
2539 if (IS_ERR(rt))
2540 return rt;
1da177e4 2541
56157872 2542 if (flp4->flowi4_proto)
f92ee619
SK
2543 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2544 flowi4_to_flowi(flp4),
2545 sk, 0);
1da177e4 2546
b23dd4fe 2547 return rt;
1da177e4 2548}
d8c97a94
ACM
2549EXPORT_SYMBOL_GPL(ip_route_output_flow);
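/* Editor's sketch (illustrative only): a socket-aware lookup through
 * ip_route_output_flow().  With flowi4_proto set, the returned route may
 * have been passed through xfrm_lookup_route(), so it is not necessarily
 * a plain rtable.  toy_connect_route() is a made-up name.
 */
static struct rtable *toy_connect_route(struct net *net, struct sock *sk,
                                        struct flowi4 *fl4)
{
        fl4->flowi4_proto = IPPROTO_TCP;        /* assumed protocol */
        return ip_route_output_flow(net, fl4, sk);
}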
2550
3765d35e 2551/* called with rcu_read_lock held */
c36ba660 2552static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
15e47304 2553 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
ba52d61e 2554 u32 seq)
1da177e4 2555{
ba52d61e 2556 struct rtable *rt = skb_rtable(skb);
1da177e4 2557 struct rtmsg *r;
be403ea1 2558 struct nlmsghdr *nlh;
2bc8ca40 2559 unsigned long expires = 0;
f185071d 2560 u32 error;
521f5490 2561 u32 metrics[RTAX_MAX];
be403ea1 2562
d3166e0c 2563 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
51456b29 2564 if (!nlh)
26932566 2565 return -EMSGSIZE;
be403ea1
TG
2566
2567 r = nlmsg_data(nlh);
1da177e4
LT
2568 r->rtm_family = AF_INET;
2569 r->rtm_dst_len = 32;
2570 r->rtm_src_len = 0;
d6c0a4f6 2571 r->rtm_tos = fl4->flowi4_tos;
8a430ed5 2572 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
c36ba660 2573 if (nla_put_u32(skb, RTA_TABLE, table_id))
f3756b79 2574 goto nla_put_failure;
1da177e4
LT
2575 r->rtm_type = rt->rt_type;
2576 r->rtm_scope = RT_SCOPE_UNIVERSE;
2577 r->rtm_protocol = RTPROT_UNSPEC;
2578 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2579 if (rt->rt_flags & RTCF_NOTIFY)
2580 r->rtm_flags |= RTM_F_NOTIFY;
df4d9254
HFS
2581 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2582 r->rtm_flags |= RTCF_DOREDIRECT;
be403ea1 2583
930345ea 2584 if (nla_put_in_addr(skb, RTA_DST, dst))
f3756b79 2585 goto nla_put_failure;
1a00fee4 2586 if (src) {
1da177e4 2587 r->rtm_src_len = 32;
930345ea 2588 if (nla_put_in_addr(skb, RTA_SRC, src))
f3756b79 2589 goto nla_put_failure;
1da177e4 2590 }
f3756b79
DM
2591 if (rt->dst.dev &&
2592 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2593 goto nla_put_failure;
c7066f70 2594#ifdef CONFIG_IP_ROUTE_CLASSID
f3756b79
DM
2595 if (rt->dst.tclassid &&
2596 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2597 goto nla_put_failure;
1da177e4 2598#endif
41347dcd 2599 if (!rt_is_input_route(rt) &&
d6c0a4f6 2600 fl4->saddr != src) {
930345ea 2601 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
f3756b79
DM
2602 goto nla_put_failure;
2603 }
155e8336 2604 if (rt->rt_uses_gateway &&
930345ea 2605 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
f3756b79 2606 goto nla_put_failure;
be403ea1 2607
ee9a8f7a
SK
2608 expires = rt->dst.expires;
2609 if (expires) {
2610 unsigned long now = jiffies;
2611
2612 if (time_before(now, expires))
2613 expires -= now;
2614 else
2615 expires = 0;
2616 }
2617
521f5490 2618 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
ee9a8f7a 2619 if (rt->rt_pmtu && expires)
521f5490
JA
2620 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2621 if (rtnetlink_put_metrics(skb, metrics) < 0)
be403ea1
TG
2622 goto nla_put_failure;
2623
b4869889 2624 if (fl4->flowi4_mark &&
68aaed54 2625 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
f3756b79 2626 goto nla_put_failure;
963bfeee 2627
622ec2c9
LC
2628 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2629 nla_put_u32(skb, RTA_UID,
2630 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2631 goto nla_put_failure;
2632
d8d1f30b 2633 error = rt->dst.error;
be403ea1 2634
c7537967 2635 if (rt_is_input_route(rt)) {
8caaf7b6
ND
2636#ifdef CONFIG_IP_MROUTE
2637 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2638 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2639 int err = ipmr_get_route(net, skb,
2640 fl4->saddr, fl4->daddr,
9f09eaea 2641 r, portid);
2cf75070 2642
8caaf7b6 2643 if (err <= 0) {
0c8d803f
DA
2644 if (err == 0)
2645 return 0;
2646 goto nla_put_failure;
8caaf7b6
ND
2647 }
2648 } else
2649#endif
91146153 2650 if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
8caaf7b6 2651 goto nla_put_failure;
1da177e4
LT
2652 }
2653
f185071d 2654 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
e3703b3d 2655 goto nla_put_failure;
be403ea1 2656
053c095a
JB
2657 nlmsg_end(skb, nlh);
2658 return 0;
1da177e4 2659
be403ea1 2660nla_put_failure:
26932566
PM
2661 nlmsg_cancel(skb, nlh);
2662 return -EMSGSIZE;
1da177e4
LT
2663}
2664
c21ef3e3
DA
2665static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2666 struct netlink_ext_ack *extack)
1da177e4 2667{
3b1e0a65 2668 struct net *net = sock_net(in_skb->sk);
d889ce3b
TG
2669 struct rtmsg *rtm;
2670 struct nlattr *tb[RTA_MAX+1];
3765d35e 2671 struct fib_result res = {};
1da177e4 2672 struct rtable *rt = NULL;
d6c0a4f6 2673 struct flowi4 fl4;
9e12bb22
AV
2674 __be32 dst = 0;
2675 __be32 src = 0;
2676 u32 iif;
d889ce3b 2677 int err;
963bfeee 2678 int mark;
1da177e4 2679 struct sk_buff *skb;
c36ba660 2680 u32 table_id = RT_TABLE_MAIN;
622ec2c9 2681 kuid_t uid;
1da177e4 2682
fceb6435 2683 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
c21ef3e3 2684 extack);
d889ce3b
TG
2685 if (err < 0)
2686 goto errout;
2687
2688 rtm = nlmsg_data(nlh);
2689
1da177e4 2690 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
51456b29 2691 if (!skb) {
d889ce3b
TG
2692 err = -ENOBUFS;
2693 goto errout;
2694 }
1da177e4
LT
2695
2696 /* Reserve room for dummy headers; this skb can pass
2697 through a good chunk of the routing engine.
2698 */
459a98ed 2699 skb_reset_mac_header(skb);
c1d2bbe1 2700 skb_reset_network_header(skb);
d2c962b8 2701
67b61f6c
JB
2702 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2703 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
d889ce3b 2704 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
963bfeee 2705 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
622ec2c9
LC
2706 if (tb[RTA_UID])
2707 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2708 else
2709 uid = (iif ? INVALID_UID : current_uid());
1da177e4 2710
bbadb9a2
FL
2711 /* Bugfix: need to give ip_route_input enough of an IP header to
2712 * not gag.
2713 */
2714 ip_hdr(skb)->protocol = IPPROTO_UDP;
2715 ip_hdr(skb)->saddr = src;
2716 ip_hdr(skb)->daddr = dst;
2717
2718 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2719
d6c0a4f6
DM
2720 memset(&fl4, 0, sizeof(fl4));
2721 fl4.daddr = dst;
2722 fl4.saddr = src;
2723 fl4.flowi4_tos = rtm->rtm_tos;
2724 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2725 fl4.flowi4_mark = mark;
622ec2c9 2726 fl4.flowi4_uid = uid;
d6c0a4f6 2727
3765d35e
DA
2728 rcu_read_lock();
2729
1da177e4 2730 if (iif) {
d889ce3b
TG
2731 struct net_device *dev;
2732
3765d35e 2733 dev = dev_get_by_index_rcu(net, iif);
51456b29 2734 if (!dev) {
d889ce3b
TG
2735 err = -ENODEV;
2736 goto errout_free;
2737 }
2738
1da177e4
LT
2739 skb->protocol = htons(ETH_P_IP);
2740 skb->dev = dev;
963bfeee 2741 skb->mark = mark;
3765d35e
DA
2742 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2743 dev, &res);
d889ce3b 2744
511c3f92 2745 rt = skb_rtable(skb);
d8d1f30b
CG
2746 if (err == 0 && rt->dst.error)
2747 err = -rt->dst.error;
1da177e4 2748 } else {
3765d35e 2749 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
b23dd4fe
DM
2750 err = 0;
2751 if (IS_ERR(rt))
2752 err = PTR_ERR(rt);
2c87d63a
FW
2753 else
2754 skb_dst_set(skb, &rt->dst);
1da177e4 2755 }
d889ce3b 2756
1da177e4 2757 if (err)
d889ce3b 2758 goto errout_free;
1da177e4 2759
1da177e4
LT
2760 if (rtm->rtm_flags & RTM_F_NOTIFY)
2761 rt->rt_flags |= RTCF_NOTIFY;
2762
c36ba660
DA
2763 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2764 table_id = rt->rt_table_id;
2765
bc3aae2b
RP
2766 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2767 if (!res.fi) {
2768 err = fib_props[res.type].error;
2769 if (!err)
2770 err = -EHOSTUNREACH;
2771 goto errout_free;
2772 }
b6179813
RP
2773 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2774 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2775 rt->rt_type, res.prefix, res.prefixlen,
2776 fl4.flowi4_tos, res.fi, 0);
bc3aae2b 2777 } else {
b6179813 2778 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
ba52d61e 2779 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
bc3aae2b 2780 }
7b46a644 2781 if (err < 0)
d889ce3b 2782 goto errout_free;
1da177e4 2783
3765d35e
DA
2784 rcu_read_unlock();
2785
15e47304 2786 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
d889ce3b 2787errout:
2942e900 2788 return err;
1da177e4 2789
d889ce3b 2790errout_free:
3765d35e 2791 rcu_read_unlock();
1da177e4 2792 kfree_skb(skb);
d889ce3b 2793 goto errout;
1da177e4
LT
2794}
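/* Editor's sketch (userspace, not kernel code): the kind of RTM_GETROUTE
 * request that lands in inet_rtm_getroute() above -- roughly what
 * "ip route get <dst>" sends.  Error handling is trimmed and
 * toy_route_get() is a made-up name.
 */
#include <arpa/inet.h>
#include <linux/rtnetlink.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int toy_route_get(const char *dst)
{
        struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
        struct {
                struct nlmsghdr nh;
                struct rtmsg    rtm;
                char            attrs[64];
        } req;
        struct rtattr *rta;
        int fd;

        fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
        if (fd < 0)
                return -1;

        memset(&req, 0, sizeof(req));
        req.nh.nlmsg_len    = NLMSG_LENGTH(sizeof(struct rtmsg));
        req.nh.nlmsg_type   = RTM_GETROUTE;
        req.nh.nlmsg_flags  = NLM_F_REQUEST;
        req.rtm.rtm_family  = AF_INET;
        req.rtm.rtm_dst_len = 32;

        /* Append an RTA_DST attribute holding the destination address. */
        rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nh.nlmsg_len));
        rta->rta_type = RTA_DST;
        rta->rta_len  = RTA_LENGTH(4);
        inet_pton(AF_INET, dst, RTA_DATA(rta));
        req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + rta->rta_len;

        sendto(fd, &req, req.nh.nlmsg_len, 0,
               (struct sockaddr *)&kernel, sizeof(kernel));
        /* recv() would now return the RTM_NEWROUTE reply serialized by
         * rt_fill_info() (or fib_dump_info() with RTM_F_FIB_MATCH). */
        close(fd);
        return 0;
}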
2795
1da177e4
LT
2796void ip_rt_multicast_event(struct in_device *in_dev)
2797{
4ccfe6d4 2798 rt_cache_flush(dev_net(in_dev->dev));
1da177e4
LT
2799}
2800
2801#ifdef CONFIG_SYSCTL
082c7ca4
G
2802static int ip_rt_gc_interval __read_mostly = 60 * HZ;
2803static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
2804static int ip_rt_gc_elasticity __read_mostly = 8;
2805
fe2c6338 2806static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
8d65af78 2807 void __user *buffer,
1da177e4
LT
2808 size_t *lenp, loff_t *ppos)
2809{
5aad1de5
TT
2810 struct net *net = (struct net *)__ctl->extra1;
2811
1da177e4 2812 if (write) {
5aad1de5
TT
2813 rt_cache_flush(net);
2814 fnhe_genid_bump(net);
1da177e4 2815 return 0;
e905a9ed 2816 }
1da177e4
LT
2817
2818 return -EINVAL;
2819}
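/* Editor's note: this write-only handler backs
 * /proc/sys/net/ipv4/route/flush; writing any value, e.g.
 *
 *   echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * flushes the cache and bumps the per-netns fnhe generation id.
 */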
2820
fe2c6338 2821static struct ctl_table ipv4_route_table[] = {
1da177e4 2822 {
1da177e4
LT
2823 .procname = "gc_thresh",
2824 .data = &ipv4_dst_ops.gc_thresh,
2825 .maxlen = sizeof(int),
2826 .mode = 0644,
6d9f239a 2827 .proc_handler = proc_dointvec,
1da177e4
LT
2828 },
2829 {
1da177e4
LT
2830 .procname = "max_size",
2831 .data = &ip_rt_max_size,
2832 .maxlen = sizeof(int),
2833 .mode = 0644,
6d9f239a 2834 .proc_handler = proc_dointvec,
1da177e4
LT
2835 },
2836 {
2837 /* Deprecated. Use gc_min_interval_ms */
e905a9ed 2838
1da177e4
LT
2839 .procname = "gc_min_interval",
2840 .data = &ip_rt_gc_min_interval,
2841 .maxlen = sizeof(int),
2842 .mode = 0644,
6d9f239a 2843 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
2844 },
2845 {
1da177e4
LT
2846 .procname = "gc_min_interval_ms",
2847 .data = &ip_rt_gc_min_interval,
2848 .maxlen = sizeof(int),
2849 .mode = 0644,
6d9f239a 2850 .proc_handler = proc_dointvec_ms_jiffies,
1da177e4
LT
2851 },
2852 {
1da177e4
LT
2853 .procname = "gc_timeout",
2854 .data = &ip_rt_gc_timeout,
2855 .maxlen = sizeof(int),
2856 .mode = 0644,
6d9f239a 2857 .proc_handler = proc_dointvec_jiffies,
1da177e4 2858 },
9f28a2fc
ED
2859 {
2860 .procname = "gc_interval",
2861 .data = &ip_rt_gc_interval,
2862 .maxlen = sizeof(int),
2863 .mode = 0644,
2864 .proc_handler = proc_dointvec_jiffies,
2865 },
1da177e4 2866 {
1da177e4
LT
2867 .procname = "redirect_load",
2868 .data = &ip_rt_redirect_load,
2869 .maxlen = sizeof(int),
2870 .mode = 0644,
6d9f239a 2871 .proc_handler = proc_dointvec,
1da177e4
LT
2872 },
2873 {
1da177e4
LT
2874 .procname = "redirect_number",
2875 .data = &ip_rt_redirect_number,
2876 .maxlen = sizeof(int),
2877 .mode = 0644,
6d9f239a 2878 .proc_handler = proc_dointvec,
1da177e4
LT
2879 },
2880 {
1da177e4
LT
2881 .procname = "redirect_silence",
2882 .data = &ip_rt_redirect_silence,
2883 .maxlen = sizeof(int),
2884 .mode = 0644,
6d9f239a 2885 .proc_handler = proc_dointvec,
1da177e4
LT
2886 },
2887 {
1da177e4
LT
2888 .procname = "error_cost",
2889 .data = &ip_rt_error_cost,
2890 .maxlen = sizeof(int),
2891 .mode = 0644,
6d9f239a 2892 .proc_handler = proc_dointvec,
1da177e4
LT
2893 },
2894 {
1da177e4
LT
2895 .procname = "error_burst",
2896 .data = &ip_rt_error_burst,
2897 .maxlen = sizeof(int),
2898 .mode = 0644,
6d9f239a 2899 .proc_handler = proc_dointvec,
1da177e4
LT
2900 },
2901 {
1da177e4
LT
2902 .procname = "gc_elasticity",
2903 .data = &ip_rt_gc_elasticity,
2904 .maxlen = sizeof(int),
2905 .mode = 0644,
6d9f239a 2906 .proc_handler = proc_dointvec,
1da177e4
LT
2907 },
2908 {
1da177e4
LT
2909 .procname = "mtu_expires",
2910 .data = &ip_rt_mtu_expires,
2911 .maxlen = sizeof(int),
2912 .mode = 0644,
6d9f239a 2913 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
2914 },
2915 {
1da177e4
LT
2916 .procname = "min_pmtu",
2917 .data = &ip_rt_min_pmtu,
2918 .maxlen = sizeof(int),
2919 .mode = 0644,
6d9f239a 2920 .proc_handler = proc_dointvec,
1da177e4
LT
2921 },
2922 {
1da177e4
LT
2923 .procname = "min_adv_mss",
2924 .data = &ip_rt_min_advmss,
2925 .maxlen = sizeof(int),
2926 .mode = 0644,
6d9f239a 2927 .proc_handler = proc_dointvec,
1da177e4 2928 },
f8572d8f 2929 { }
1da177e4 2930};
39a23e75 2931
39a23e75
DL
2932static struct ctl_table ipv4_route_flush_table[] = {
2933 {
39a23e75
DL
2934 .procname = "flush",
2935 .maxlen = sizeof(int),
2936 .mode = 0200,
6d9f239a 2937 .proc_handler = ipv4_sysctl_rtcache_flush,
39a23e75 2938 },
f8572d8f 2939 { },
39a23e75
DL
2940};
2941
2942static __net_init int sysctl_route_net_init(struct net *net)
2943{
2944 struct ctl_table *tbl;
2945
2946 tbl = ipv4_route_flush_table;
09ad9bc7 2947 if (!net_eq(net, &init_net)) {
39a23e75 2948 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
51456b29 2949 if (!tbl)
39a23e75 2950 goto err_dup;
464dc801
EB
2951
2952 /* Don't export sysctls to unprivileged users */
2953 if (net->user_ns != &init_user_ns)
2954 tbl[0].procname = NULL;
39a23e75
DL
2955 }
2956 tbl[0].extra1 = net;
2957
ec8f23ce 2958 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
51456b29 2959 if (!net->ipv4.route_hdr)
39a23e75
DL
2960 goto err_reg;
2961 return 0;
2962
2963err_reg:
2964 if (tbl != ipv4_route_flush_table)
2965 kfree(tbl);
2966err_dup:
2967 return -ENOMEM;
2968}
2969
2970static __net_exit void sysctl_route_net_exit(struct net *net)
2971{
2972 struct ctl_table *tbl;
2973
2974 tbl = net->ipv4.route_hdr->ctl_table_arg;
2975 unregister_net_sysctl_table(net->ipv4.route_hdr);
2976 BUG_ON(tbl == ipv4_route_flush_table);
2977 kfree(tbl);
2978}
2979
2980static __net_initdata struct pernet_operations sysctl_route_ops = {
2981 .init = sysctl_route_net_init,
2982 .exit = sysctl_route_net_exit,
2983};
1da177e4
LT
2984#endif
2985
3ee94372 2986static __net_init int rt_genid_init(struct net *net)
9f5e97e5 2987{
ca4c3fc2 2988 atomic_set(&net->ipv4.rt_genid, 0);
5aad1de5 2989 atomic_set(&net->fnhe_genid, 0);
7aed9f72 2990 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
9f5e97e5
DL
2991 return 0;
2992}
2993
3ee94372
NH
2994static __net_initdata struct pernet_operations rt_genid_ops = {
2995 .init = rt_genid_init,
9f5e97e5
DL
2996};
2997
c3426b47
DM
2998static int __net_init ipv4_inetpeer_init(struct net *net)
2999{
3000 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3001
3002 if (!bp)
3003 return -ENOMEM;
3004 inet_peer_base_init(bp);
3005 net->ipv4.peers = bp;
3006 return 0;
3007}
3008
3009static void __net_exit ipv4_inetpeer_exit(struct net *net)
3010{
3011 struct inet_peer_base *bp = net->ipv4.peers;
3012
3013 net->ipv4.peers = NULL;
56a6b248 3014 inetpeer_invalidate_tree(bp);
c3426b47
DM
3015 kfree(bp);
3016}
3017
3018static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3019 .init = ipv4_inetpeer_init,
3020 .exit = ipv4_inetpeer_exit,
3021};
9f5e97e5 3022
c7066f70 3023#ifdef CONFIG_IP_ROUTE_CLASSID
7d720c3e 3024struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
c7066f70 3025#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4 3026
1da177e4
LT
3027int __init ip_rt_init(void)
3028{
424c4b70 3029 int rc = 0;
5055c371 3030 int cpu;
1da177e4 3031
73f156a6
ED
3032 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3033 if (!ip_idents)
3034 panic("IP: failed to allocate ip_idents\n");
3035
3036 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3037
355b590c
ED
3038 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3039 if (!ip_tstamps)
3040 panic("IP: failed to allocate ip_tstamps\n");
3041
5055c371
ED
3042 for_each_possible_cpu(cpu) {
3043 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3044
3045 INIT_LIST_HEAD(&ul->head);
3046 spin_lock_init(&ul->lock);
3047 }
c7066f70 3048#ifdef CONFIG_IP_ROUTE_CLASSID
0dcec8c2 3049 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
1da177e4
LT
3050 if (!ip_rt_acct)
3051 panic("IP: failed to allocate ip_rt_acct\n");
1da177e4
LT
3052#endif
3053
e5d679f3
AD
3054 ipv4_dst_ops.kmem_cachep =
3055 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
20c2df83 3056 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1da177e4 3057
14e50e57
DM
3058 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3059
fc66f95c
ED
3060 if (dst_entries_init(&ipv4_dst_ops) < 0)
3061 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3062
3063 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3064 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3065
89aef892
DM
3066 ipv4_dst_ops.gc_thresh = ~0;
3067 ip_rt_max_size = INT_MAX;
1da177e4 3068
1da177e4
LT
3069 devinet_init();
3070 ip_fib_init();
3071
73b38711 3072 if (ip_rt_proc_init())
058bd4d2 3073 pr_err("Unable to create route proc files\n");
1da177e4
LT
3074#ifdef CONFIG_XFRM
3075 xfrm_init();
703fb94e 3076 xfrm4_init();
1da177e4 3077#endif
c7ac8679 3078 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
63f3444f 3079
39a23e75
DL
3080#ifdef CONFIG_SYSCTL
3081 register_pernet_subsys(&sysctl_route_ops);
3082#endif
3ee94372 3083 register_pernet_subsys(&rt_genid_ops);
c3426b47 3084 register_pernet_subsys(&ipv4_inetpeer_ops);
1da177e4
LT
3085 return rc;
3086}
3087
a1bc6eb4 3088#ifdef CONFIG_SYSCTL
eeb61f71
AV
3089/*
3090 * We really need to sanitize the damn ipv4 init order, then all
3091 * this nonsense will go away.
3092 */
3093void __init ip_static_sysctl_init(void)
3094{
4e5ca785 3095 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
eeb61f71 3096}
a1bc6eb4 3097#endif