/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly = 9;
static int ip_rt_redirect_load __read_mostly = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly = HZ;
static int ip_rt_error_burst __read_mostly = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly = 256;

static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
/*
 * Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu);
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
			   struct sk_buff *skb);
static void ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family = AF_INET,
	.check = ipv4_dst_check,
	.default_advmss = ipv4_default_advmss,
	.mtu = ipv4_mtu,
	.cow_metrics = ipv4_cow_metrics,
	.destroy = ipv4_dst_destroy,
	.negative_advice = ipv4_negative_advice,
	.link_failure = ipv4_link_failure,
	.update_pmtu = ip_rt_update_pmtu,
	.redirect = ip_do_redirect,
	.local_out = __ip_local_out,
	.neigh_lookup = ipv4_neigh_lookup,
	.confirm_neigh = ipv4_confirm_neigh,
};

#define ECN_OR_COST(class) TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start = rt_cache_seq_start,
	.next = rt_cache_seq_next,
	.stop = rt_cache_seq_stop,
	.show = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
	.owner = THIS_MODULE,
	.open = rt_cache_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,

		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start = rt_cpu_seq_start,
	.next = rt_cpu_seq_next,
	.stop = rt_cpu_seq_stop,
	.show = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner = THIS_MODULE,
	.open = rt_cpu_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner = THIS_MODULE,
	.open = rt_acct_proc_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
			  &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;

	rt = (const struct rtable *)dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *)&rt->rt_gateway;
	else if (!daddr ||
		 (rt->rt_flags &
		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
		return;

	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = ACCESS_ONCE(*p_tstamp);
	u32 now = (u32)jiffies;
	u32 new, delta = 0;

	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
	do {
		old = (u32)atomic_read(p_id);
		new = old + delta + segs;
	} while (atomic_cmpxchg(p_id, old, new) != old);

	return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
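
/* Worked example of the perturbation above (editor's illustration, not in
 * the original source): if a bucket was last touched 5 seconds ago with
 * HZ=100, now - old is ~500 jiffies, so the next reserved ID jumps forward
 * by a random delta in [0, 500) before the per-segment increments are
 * applied; an observer therefore cannot count how many IDs were consumed
 * in between.
 */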

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
	static u32 ip_idents_hashrnd __read_mostly;
	u32 hash, id;

	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

	hash = jhash_3words((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol ^ net_hash_mix(net),
			    ip_idents_hashrnd);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
	rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	fnhe_flush_routes(oldest);
	return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
	static u32 fnhe_hashrnd __read_mostly;
	u32 hval;

	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
	return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_gateway = fnhe->fnhe_gw;
		rt->rt_uses_gateway = 1;
	}
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	unsigned int i;
	int depth;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference(nh->nh_exceptions);
	if (!hash) {
		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nh->nh_exceptions, hash);
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_expires = max(1UL, expires);
		}
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_expires = expires;

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nh->nh_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;
			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}
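
/* Editor's cross-reference (not in the original source): the two callers
 * below feed this exception cache -- __ip_do_redirect() records a new
 * gateway learned from an ICMP redirect, and __ip_rt_update_pmtu() records
 * a reduced path MTU. Both store per-destination exceptions in the bucket
 * chosen by fnhe_hashfun(daddr).
 */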

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
	if (!n)
		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
	if (!IS_ERR(n)) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh *nh = &FIB_RES_NH(res);

				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						      0, jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     " Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct net *net = dev_net(skb->dev);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

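/* Worked example (editor's illustration, not in the original source): with
 * HZ=100 the defaults above give ip_rt_redirect_load = 2 jiffies (20 ms),
 * so redirects to a host that keeps ignoring them are spaced
 * 20 ms << rate_tokens apart: 20 ms, 40 ms, 80 ms, ... ~5.1 s. After
 * ip_rt_redirect_number (9) unheeded redirects we go silent; once no
 * offending packets arrive for ip_rt_redirect_silence = (HZ/50) << 10
 * (~20.5 s), the token count resets and redirects resume.
 */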
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything.
	 * Set rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_result res;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (ipv4_mtu(dst) < mtu)
		return;

	if (mtu < ip_rt_min_pmtu)
		mtu = ip_rt_min_pmtu;

	if (rt->rt_pmtu == mtu &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}
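
/* Worked example (editor's illustration, not in the original source): an
 * ICMP "fragmentation needed" reporting 1400 bytes against a route whose
 * current path MTU is 1500 installs a 1400-byte exception that expires
 * after ip_rt_mtu_expires (10 minutes); a bogus report of 100 bytes is
 * clamped up to ip_rt_min_pmtu (512 + 20 + 20 = 552).
 */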

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	if (!mark)
		mark = IP4_REPLY_MARK(net, skb->mark);

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!fl4.flowi4_mark)
		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;
	struct net *net = sock_net(sk);

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct net *net = sock_net(sk);

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD by dst_free().
	 */
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
		return NULL;
	return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" may be unaligned in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
	unsigned int advmss = max_t(unsigned int, dst->dev->mtu - header_size,
				    ip_rt_min_advmss);

	return min(advmss, IPV4_MAX_PMTU - header_size);
}
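
/* Worked example (editor's illustration, not in the original source):
 * header_size is 20 + 20 = 40 bytes, so a 1500-byte device MTU yields an
 * advertised MSS of 1460, floored at ip_rt_min_advmss (256) for tiny MTUs
 * and capped at IPV4_MAX_PMTU (65535) minus 40, i.e. 65495, for jumbo
 * devices.
 */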

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			return fnhe;
	}
	return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr, const bool do_cache)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		if (do_cache) {
			dst_hold(&rt->dst);
			rcu_assign_pointer(*porig, rt);
			if (orig) {
				dst_dev_put(&orig->dst);
				dst_release(&orig->dst);
			}
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	/* hold dst before doing cmpxchg() to avoid race condition
	 * on this dst
	 */
	dst_hold(&rt->dst);
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig) {
			dst_dev_put(&orig->dst);
			dst_release(&orig->dst);
		}
	} else {
		dst_release(&rt->dst);
		ret = false;
	}

	return ret;
}

struct uncached_list {
	spinlock_t lock;
	struct list_head head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

	rt->rt_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
	struct rtable *rt = (struct rtable *) dst;

	if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt))
		kfree(p);

	if (!list_empty(&rt->rt_uncached)) {
		struct uncached_list *ul = rt->rt_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&ul->lock);
	}
}

void rt_flush_dev(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct rtable *rt;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&ul->lock);
	}
}

static bool rt_cache_valid(const struct rtable *rt)
{
	return rt &&
	       rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	       !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag,
			   const bool do_cache)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
		if (fi->fib_metrics != &dst_default_metrics) {
			rt->dst._metrics |= DST_METRICS_REFCOUNTED;
			atomic_inc(&fi->fib_metrics->refcnt);
		}
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
		else if (do_cache)
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}

struct rtable *rt_dst_alloc(struct net_device *dev,
			    unsigned int flags, u16 type,
			    bool nopolicy, bool noxfrm, bool will_cache)
{
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
		       (will_cache ? 0 : DST_HOST) |
		       (nopolicy ? DST_NOPOLICY : 0) |
		       (noxfrm ? DST_NOXFRM : 0));

	if (rt) {
		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		rt->rt_flags = flags;
		rt->rt_type = type;
		rt->rt_is_input = 0;
		rt->rt_iif = 0;
		rt->rt_pmtu = 0;
		rt->rt_gateway = 0;
		rt->rt_uses_gateway = 0;
		rt->rt_table_id = 0;
		INIT_LIST_HEAD(&rt->rt_uncached);

		rt->dst.output = ip_output;
		if (flags & RTCF_LOCAL)
			rt->dst.input = ip_local_deliver;
	}

	return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);

/* called in rcu_read_lock() section */
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev,
			  struct in_device *in_dev, u32 *itag)
{
	int err;

	/* Primary sanity checks. */
	if (!in_dev)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		return -EINVAL;

	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
		return -EINVAL;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			return -EINVAL;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, itag);
		if (err < 0)
			return err;
	}
	return 0;
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	unsigned int flags = RTCF_MULTICAST;
	struct rtable *rth;
	u32 itag = 0;
	int err;

	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
	if (err)
		return err;

	if (our)
		flags |= RTCF_LOCAL;

	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		return -ENOBUFS;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;
	rth->rt_is_input = 1;

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * RFC1812 recommendation: if the source is martian,
		 * the only hint is the MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}

static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nh->nh_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}

static void set_lwt_redirect(struct rtable *rth)
{
	if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
		rth->dst.lwtstate->orig_output = rth->dst.output;
		rth->dst.output = lwtunnel_output;
	}

	if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
		rth->dst.lwtstate->orig_input = rth->dst.input;
		rth->dst.input = lwtunnel_input;
	}
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (!out_dev) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		IPCB(skb)->flags |= IPSKB_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy ARP feature has been extended to allow ARP
		 * replies back out the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
	if (do_cache) {
		if (fnhe) {
			rth = rcu_dereference(fnhe->fnhe_rth_input);
			if (rth && rth->dst.expires &&
			    time_after(jiffies, rth->dst.expires)) {
				ip_del_fnhe(&FIB_RES_NH(*res), daddr);
				fnhe = NULL;
			} else {
				goto rt_cache;
			}
		}

		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);

rt_cache:
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_is_input = 1;
	if (res->table)
		rth->rt_table_id = res->table->tb_id;
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
		       do_cache);
	set_lwt_redirect(rth);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
cleanup:
	return err;
}

79a13159 1757#ifdef CONFIG_IP_ROUTE_MULTIPATH
79a13159 1758/* To make ICMP packets follow the right flow, the multipath hash is
bf4e0a3d 1759 * calculated from the inner IP addresses.
79a13159 1760 */
bf4e0a3d
NA
1761static void ip_multipath_l3_keys(const struct sk_buff *skb,
1762 struct flow_keys *hash_keys)
79a13159
PN
1763{
1764 const struct iphdr *outer_iph = ip_hdr(skb);
bf4e0a3d 1765 const struct iphdr *inner_iph;
79a13159
PN
1766 const struct icmphdr *icmph;
1767 struct iphdr _inner_iph;
bf4e0a3d
NA
1768 struct icmphdr _icmph;
1769
1770 hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1771 hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1772 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1773 return;
79a13159
PN
1774
1775 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
bf4e0a3d 1776 return;
79a13159
PN
1777
1778 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1779 &_icmph);
1780 if (!icmph)
bf4e0a3d 1781 return;
79a13159
PN
1782
1783 if (icmph->type != ICMP_DEST_UNREACH &&
1784 icmph->type != ICMP_REDIRECT &&
1785 icmph->type != ICMP_TIME_EXCEEDED &&
bf4e0a3d
NA
1786 icmph->type != ICMP_PARAMETERPROB)
1787 return;
79a13159
PN
1788
1789 inner_iph = skb_header_pointer(skb,
1790 outer_iph->ihl * 4 + sizeof(_icmph),
1791 sizeof(_inner_iph), &_inner_iph);
1792 if (!inner_iph)
bf4e0a3d
NA
1793 return;
1794 hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1795 hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1796}
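/* Editorial note (hedged): an ICMP error quotes the IP header of the
 * datagram that triggered it, so hashing on the embedded (inner)
 * saddr/daddr above steers the error onto the same multipath leg that
 * packets of the original flow would take. E.g. a TIME_EXCEEDED answering
 * a traceroute probe from A to B hashes on (A, B), like the probe itself.
 */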
79a13159 1797
bf4e0a3d
NA
1798/* if skb is set it will be used and fl4 can be NULL */
1799int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1800 const struct sk_buff *skb)
1801{
1802 struct net *net = fi->fib_net;
1803 struct flow_keys hash_keys;
1804 u32 mhash;
79a13159 1805
bf4e0a3d
NA
1806 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1807 case 0:
1808 memset(&hash_keys, 0, sizeof(hash_keys));
1809 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1810 if (skb) {
1811 ip_multipath_l3_keys(skb, &hash_keys);
1812 } else {
1813 hash_keys.addrs.v4addrs.src = fl4->saddr;
1814 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1815 }
1816 break;
1817 case 1:
1818 /* skb is currently provided only when forwarding */
1819 if (skb) {
1820 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1821 struct flow_keys keys;
1822
1823 /* short-circuit if we already have an L4 hash present */
1824 if (skb->l4_hash)
1825 return skb_get_hash_raw(skb) >> 1;
1826 memset(&hash_keys, 0, sizeof(hash_keys));
1827 skb_flow_dissect_flow_keys(skb, &keys, flag);
1828 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1829 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1830 hash_keys.ports.src = keys.ports.src;
1831 hash_keys.ports.dst = keys.ports.dst;
1832 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1833 } else {
1834 memset(&hash_keys, 0, sizeof(hash_keys));
1835 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1836 hash_keys.addrs.v4addrs.src = fl4->saddr;
1837 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1838 hash_keys.ports.src = fl4->fl4_sport;
1839 hash_keys.ports.dst = fl4->fl4_dport;
1840 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1841 }
1842 break;
1843 }
1844 mhash = flow_hash_from_keys(&hash_keys);
79a13159 1845
bf4e0a3d
NA
1846 return mhash >> 1;
1847}
1848EXPORT_SYMBOL_GPL(fib_multipath_hash);
79a13159
PN
1849#endif /* CONFIG_IP_ROUTE_MULTIPATH */
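/* Editorial note (hedged): the policy switch above is driven by the
 * per-netns fib_multipath_hash_policy sysctl, so an admin would move
 * from L3 (addresses only) to L4 (5-tuple) hashing with e.g.
 *
 *	sysctl -w net.ipv4.fib_multipath_hash_policy=1
 *
 * With policy 1, forwarded skbs are hashed on the dissected 5-tuple,
 * while locally generated traffic falls back to the fl4 fields.
 */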
1850
5969f71d
SH
1851static int ip_mkroute_input(struct sk_buff *skb,
1852 struct fib_result *res,
5969f71d
SH
1853 struct in_device *in_dev,
1854 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1855{
1da177e4 1856#ifdef CONFIG_IP_ROUTE_MULTIPATH
0e884c78 1857 if (res->fi && res->fi->fib_nhs > 1) {
bf4e0a3d 1858 int h = fib_multipath_hash(res->fi, NULL, skb);
0e884c78 1859
0e884c78
PN
1860 fib_select_multipath(res, h);
1861 }
1da177e4
LT
1862#endif
1863
1864 /* create a routing cache entry */
c6cffba4 1865 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1da177e4
LT
1866}
1867
1da177e4
LT
1868/*
1869 * NOTE. We drop all packets that have a local source
1870 * address, because every properly looped back packet
1871 * must already have the correct destination attached by the output routine.
1872 *
1873 * This approach solves two big problems:
1874 * 1. Non-simplex devices are handled properly.
1875 * 2. IP spoofing attempts are filtered with a 100% guarantee.
ebc0ffae 1876 * called with rcu_read_lock()
1da177e4
LT
1877 */
1878
9e12bb22 1879static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
5510cdf7
DA
1880 u8 tos, struct net_device *dev,
1881 struct fib_result *res)
1da177e4 1882{
96d36220 1883 struct in_device *in_dev = __in_dev_get_rcu(dev);
1b7179d3 1884 struct ip_tunnel_info *tun_info;
68a5e3dd 1885 struct flowi4 fl4;
95c96174 1886 unsigned int flags = 0;
1da177e4 1887 u32 itag = 0;
95c96174 1888 struct rtable *rth;
1da177e4 1889 int err = -EINVAL;
5e73ea1a 1890 struct net *net = dev_net(dev);
d2d68ba9 1891 bool do_cache;
1da177e4
LT
1892
1893 /* IP on this device is disabled. */
1894
1895 if (!in_dev)
1896 goto out;
1897
1898 /* Check for the weirdest martians, which cannot be detected
1899 by fib_lookup.
1900 */
1901
61adedf3 1902 tun_info = skb_tunnel_info(skb);
46fa062a 1903 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1b7179d3
TG
1904 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1905 else
1906 fl4.flowi4_tun_key.tun_id = 0;
f38a9eb1
TG
1907 skb_dst_drop(skb);
1908
d0daebc3 1909 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1da177e4
LT
1910 goto martian_source;
1911
5510cdf7
DA
1912 res->fi = NULL;
1913 res->table = NULL;
27a954bd 1914 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1da177e4
LT
1915 goto brd_input;
1916
1917 /* Accept zero addresses only to limited broadcast;
1918 * I do not even know whether to fix this or not. Waiting for complaints :-)
1919 */
f97c1e0c 1920 if (ipv4_is_zeronet(saddr))
1da177e4
LT
1921 goto martian_source;
1922
d0daebc3 1923 if (ipv4_is_zeronet(daddr))
1da177e4
LT
1924 goto martian_destination;
1925
9eb43e76
ED
1926 /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1927 * calling it at most once when daddr and/or saddr is a loopback address
1928 */
1929 if (ipv4_is_loopback(daddr)) {
1930 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3 1931 goto martian_destination;
9eb43e76
ED
1932 } else if (ipv4_is_loopback(saddr)) {
1933 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3
TG
1934 goto martian_source;
1935 }
1936
1da177e4
LT
1937 /*
1938 * Now we are ready to route packet.
1939 */
68a5e3dd 1940 fl4.flowi4_oif = 0;
e0d56fdd 1941 fl4.flowi4_iif = dev->ifindex;
68a5e3dd
DM
1942 fl4.flowi4_mark = skb->mark;
1943 fl4.flowi4_tos = tos;
1944 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
b84f7878 1945 fl4.flowi4_flags = 0;
68a5e3dd
DM
1946 fl4.daddr = daddr;
1947 fl4.saddr = saddr;
8bcfd092 1948 fl4.flowi4_uid = sock_net_uid(net, NULL);
5510cdf7 1949 err = fib_lookup(net, &fl4, res, 0);
cd0f0b95
DJ
1950 if (err != 0) {
1951 if (!IN_DEV_FORWARD(in_dev))
1952 err = -EHOSTUNREACH;
1da177e4 1953 goto no_route;
cd0f0b95 1954 }
1da177e4 1955
5510cdf7 1956 if (res->type == RTN_BROADCAST)
1da177e4
LT
1957 goto brd_input;
1958
5510cdf7 1959 if (res->type == RTN_LOCAL) {
5c04c819 1960 err = fib_validate_source(skb, saddr, daddr, tos,
0d5edc68 1961 0, dev, in_dev, &itag);
b5f7e755 1962 if (err < 0)
0d753960 1963 goto martian_source;
1da177e4
LT
1964 goto local_input;
1965 }
1966
cd0f0b95
DJ
1967 if (!IN_DEV_FORWARD(in_dev)) {
1968 err = -EHOSTUNREACH;
251da413 1969 goto no_route;
cd0f0b95 1970 }
5510cdf7 1971 if (res->type != RTN_UNICAST)
1da177e4
LT
1972 goto martian_destination;
1973
5510cdf7 1974 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1da177e4
LT
1975out: return err;
1976
1977brd_input:
1978 if (skb->protocol != htons(ETH_P_IP))
1979 goto e_inval;
1980
41347dcd 1981 if (!ipv4_is_zeronet(saddr)) {
9e56e380
DM
1982 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1983 in_dev, &itag);
1da177e4 1984 if (err < 0)
0d753960 1985 goto martian_source;
1da177e4
LT
1986 }
1987 flags |= RTCF_BROADCAST;
5510cdf7 1988 res->type = RTN_BROADCAST;
1da177e4
LT
1989 RT_CACHE_STAT_INC(in_brd);
1990
1991local_input:
d2d68ba9 1992 do_cache = false;
5510cdf7 1993 if (res->fi) {
fe3edf45 1994 if (!itag) {
5510cdf7 1995 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
d2d68ba9 1996 if (rt_cache_valid(rth)) {
c6cffba4
DM
1997 skb_dst_set_noref(skb, &rth->dst);
1998 err = 0;
1999 goto out;
d2d68ba9
DM
2000 }
2001 do_cache = true;
2002 }
2003 }
2004
f5a0aab8 2005 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
5510cdf7 2006 flags | RTCF_LOCAL, res->type,
d2d68ba9 2007 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1da177e4
LT
2008 if (!rth)
2009 goto e_nobufs;
2010
d8d1f30b 2011 rth->dst.output = ip_rt_bug;
cf911662
DM
2012#ifdef CONFIG_IP_ROUTE_CLASSID
2013 rth->dst.tclassid = itag;
2014#endif
9917e1e8 2015 rth->rt_is_input = 1;
5510cdf7
DA
2016 if (res->table)
2017 rth->rt_table_id = res->table->tb_id;
571e7226 2018
a6254864 2019 RT_CACHE_STAT_INC(in_slow_tot);
5510cdf7 2020 if (res->type == RTN_UNREACHABLE) {
d8d1f30b
CG
2021 rth->dst.input = ip_error;
2022 rth->dst.error = -err;
1da177e4
LT
2023 rth->rt_flags &= ~RTCF_LOCAL;
2024 }
efd85700 2025
dcdfdf56 2026 if (do_cache) {
5510cdf7 2027 struct fib_nh *nh = &FIB_RES_NH(*res);
efd85700
TG
2028
2029 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2030 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2031 WARN_ON(rth->dst.input == lwtunnel_input);
2032 rth->dst.lwtstate->orig_input = rth->dst.input;
2033 rth->dst.input = lwtunnel_input;
2034 }
2035
a4c2fd7f 2036 if (unlikely(!rt_cache_route(nh, rth)))
dcdfdf56 2037 rt_add_uncached_list(rth);
dcdfdf56 2038 }
89aef892 2039 skb_dst_set(skb, &rth->dst);
b23dd4fe 2040 err = 0;
ebc0ffae 2041 goto out;
1da177e4
LT
2042
2043no_route:
2044 RT_CACHE_STAT_INC(in_no_route);
5510cdf7
DA
2045 res->type = RTN_UNREACHABLE;
2046 res->fi = NULL;
2047 res->table = NULL;
1da177e4
LT
2048 goto local_input;
2049
2050 /*
2051 * Do not cache martian addresses: they should be logged (RFC1812)
2052 */
2053martian_destination:
2054 RT_CACHE_STAT_INC(in_martian_dst);
2055#ifdef CONFIG_IP_ROUTE_VERBOSE
e87cc472
JP
2056 if (IN_DEV_LOG_MARTIANS(in_dev))
2057 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2058 &daddr, &saddr, dev->name);
1da177e4 2059#endif
2c2910a4 2060
1da177e4
LT
2061e_inval:
2062 err = -EINVAL;
ebc0ffae 2063 goto out;
1da177e4
LT
2064
2065e_nobufs:
2066 err = -ENOBUFS;
ebc0ffae 2067 goto out;
1da177e4
LT
2068
2069martian_source:
2070 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
ebc0ffae 2071 goto out;
1da177e4
LT
2072}
2073
c6cffba4
DM
2074int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2075 u8 tos, struct net_device *dev)
1da177e4 2076{
5510cdf7
DA
2077 struct fib_result res;
2078 int err;
1da177e4 2079
6e28099d 2080 tos &= IPTOS_RT_MASK;
96d36220 2081 rcu_read_lock();
5510cdf7
DA
2082 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2083 rcu_read_unlock();
96d36220 2084
5510cdf7
DA
2085 return err;
2086}
2087EXPORT_SYMBOL(ip_route_input_noref);
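/* Editorial usage sketch (hedged, mirrors ip_rcv_finish-style callers;
 * not part of the original file):
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				       iph->tos, skb->dev);
 *
 * On success skb_dst(skb) is set, and the dst's ->input hook
 * (ip_local_deliver, ip_forward, ip_mr_input, ...) handles the packet.
 */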
2088
2089/* called with rcu_read_lock held */
2090int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2091 u8 tos, struct net_device *dev, struct fib_result *res)
2092{
1da177e4
LT
2093 /* Multicast recognition logic was moved from the route cache to here.
2094 The problem was that too many Ethernet cards have broken/missing
2095 hardware multicast filters :-( As a result, a host on a multicast
2096 network acquires a lot of useless route cache entries, e.g. for
2097 SDR messages from all over the world. Now we try to get rid of them.
2098 Really, provided the software IP multicast filter is organized
2099 reasonably (at least, hashed), it does not cause a slowdown
2100 compared with route cache reject entries.
2101 Note that multicast routers are not affected, because a
2102 route cache entry is created eventually.
2103 */
f97c1e0c 2104 if (ipv4_is_multicast(daddr)) {
96d36220 2105 struct in_device *in_dev = __in_dev_get_rcu(dev);
e58e4159 2106 int our = 0;
5510cdf7 2107 int err = -EINVAL;
1da177e4 2108
e58e4159
DA
2109 if (in_dev)
2110 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2111 ip_hdr(skb)->protocol);
2112
2113 /* check l3 master if no match yet */
2114 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2115 struct in_device *l3_in_dev;
2116
2117 l3_in_dev = __in_dev_get_rcu(skb->dev);
2118 if (l3_in_dev)
2119 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2120 ip_hdr(skb)->protocol);
2121 }
2122
e58e4159 2123 if (our
1da177e4 2124#ifdef CONFIG_IP_MROUTE
e58e4159
DA
2125 ||
2126 (!ipv4_is_local_multicast(daddr) &&
2127 IN_DEV_MFORWARD(in_dev))
1da177e4 2128#endif
e58e4159 2129 ) {
5510cdf7 2130 err = ip_route_input_mc(skb, daddr, saddr,
e58e4159 2131 tos, dev, our);
1da177e4 2132 }
5510cdf7 2133 return err;
1da177e4 2134 }
5510cdf7
DA
2135
2136 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
1da177e4
LT
2137}
2138
ebc0ffae 2139/* called with rcu_read_lock() */
982721f3 2140static struct rtable *__mkroute_output(const struct fib_result *res,
1a00fee4 2141 const struct flowi4 *fl4, int orig_oif,
f61759e6 2142 struct net_device *dev_out,
5ada5527 2143 unsigned int flags)
1da177e4 2144{
982721f3 2145 struct fib_info *fi = res->fi;
f2bb4bed 2146 struct fib_nh_exception *fnhe;
5ada5527 2147 struct in_device *in_dev;
982721f3 2148 u16 type = res->type;
5ada5527 2149 struct rtable *rth;
c92b9655 2150 bool do_cache;
1da177e4 2151
d0daebc3
TG
2152 in_dev = __in_dev_get_rcu(dev_out);
2153 if (!in_dev)
5ada5527 2154 return ERR_PTR(-EINVAL);
1da177e4 2155
d0daebc3 2156 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
5f02ce24
DA
2157 if (ipv4_is_loopback(fl4->saddr) &&
2158 !(dev_out->flags & IFF_LOOPBACK) &&
2159 !netif_is_l3_master(dev_out))
d0daebc3
TG
2160 return ERR_PTR(-EINVAL);
2161
68a5e3dd 2162 if (ipv4_is_lbcast(fl4->daddr))
982721f3 2163 type = RTN_BROADCAST;
68a5e3dd 2164 else if (ipv4_is_multicast(fl4->daddr))
982721f3 2165 type = RTN_MULTICAST;
68a5e3dd 2166 else if (ipv4_is_zeronet(fl4->daddr))
5ada5527 2167 return ERR_PTR(-EINVAL);
1da177e4
LT
2168
2169 if (dev_out->flags & IFF_LOOPBACK)
2170 flags |= RTCF_LOCAL;
2171
63617421 2172 do_cache = true;
982721f3 2173 if (type == RTN_BROADCAST) {
1da177e4 2174 flags |= RTCF_BROADCAST | RTCF_LOCAL;
982721f3
DM
2175 fi = NULL;
2176 } else if (type == RTN_MULTICAST) {
dd28d1a0 2177 flags |= RTCF_MULTICAST | RTCF_LOCAL;
813b3b5d
DM
2178 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2179 fl4->flowi4_proto))
1da177e4 2180 flags &= ~RTCF_LOCAL;
63617421
JA
2181 else
2182 do_cache = false;
1da177e4 2183 /* If a multicast route does not exist, use
dd28d1a0
ED
2184 * the default one, but do not gateway in this case.
2185 * Yes, it is a hack.
1da177e4 2186 */
982721f3
DM
2187 if (fi && res->prefixlen < 4)
2188 fi = NULL;
d6d5e999
CF
2189 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2190 (orig_oif != dev_out->ifindex)) {
2191 /* For local routes that require a particular output interface
2192 * we do not want to cache the result. Caching the result
2193 * causes incorrect behaviour when there are multiple source
2194 * addresses on the interface; the end result is that if the
2195 * intended recipient is waiting on that interface for the
2196 * packet, it won't be received, because it will be delivered on
2197 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2198 * be set to the loopback interface as well.
2199 */
2200 fi = NULL;
1da177e4
LT
2201 }
2202
f2bb4bed 2203 fnhe = NULL;
63617421
JA
2204 do_cache &= fi != NULL;
2205 if (do_cache) {
c5038a83 2206 struct rtable __rcu **prth;
c92b9655 2207 struct fib_nh *nh = &FIB_RES_NH(*res);
d26b3a7c 2208
c92b9655 2209 fnhe = find_exception(nh, fl4->daddr);
deed49df 2210 if (fnhe) {
2ffae99d 2211 prth = &fnhe->fnhe_rth_output;
deed49df
XL
2212 rth = rcu_dereference(*prth);
2213 if (rth && rth->dst.expires &&
2214 time_after(jiffies, rth->dst.expires)) {
2215 ip_del_fnhe(nh, fl4->daddr);
2216 fnhe = NULL;
2217 } else {
2218 goto rt_cache;
c92b9655 2219 }
c92b9655 2220 }
deed49df
XL
2221
2222 if (unlikely(fl4->flowi4_flags &
2223 FLOWI_FLAG_KNOWN_NH &&
2224 !(nh->nh_gw &&
2225 nh->nh_scope == RT_SCOPE_LINK))) {
2226 do_cache = false;
2227 goto add;
2228 }
2229 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
c5038a83 2230 rth = rcu_dereference(*prth);
deed49df
XL
2231
2232rt_cache:
9df16efa 2233 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
c5038a83 2234 return rth;
f2bb4bed 2235 }
c92b9655
JA
2236
2237add:
d08c4f35 2238 rth = rt_dst_alloc(dev_out, flags, type,
5c1e6aa3 2239 IN_DEV_CONF_GET(in_dev, NOPOLICY),
f2bb4bed 2240 IN_DEV_CONF_GET(in_dev, NOXFRM),
c92b9655 2241 do_cache);
8391d07b 2242 if (!rth)
5ada5527 2243 return ERR_PTR(-ENOBUFS);
8391d07b 2244
13378cad 2245 rth->rt_iif = orig_oif ? : 0;
b7503e0c
DA
2246 if (res->table)
2247 rth->rt_table_id = res->table->tb_id;
2248
1da177e4
LT
2249 RT_CACHE_STAT_INC(out_slow_tot);
2250
1da177e4 2251 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
e905a9ed 2252 if (flags & RTCF_LOCAL &&
1da177e4 2253 !(dev_out->flags & IFF_LOOPBACK)) {
d8d1f30b 2254 rth->dst.output = ip_mc_output;
1da177e4
LT
2255 RT_CACHE_STAT_INC(out_slow_mc);
2256 }
2257#ifdef CONFIG_IP_MROUTE
982721f3 2258 if (type == RTN_MULTICAST) {
1da177e4 2259 if (IN_DEV_MFORWARD(in_dev) &&
813b3b5d 2260 !ipv4_is_local_multicast(fl4->daddr)) {
d8d1f30b
CG
2261 rth->dst.input = ip_mr_input;
2262 rth->dst.output = ip_mc_output;
1da177e4
LT
2263 }
2264 }
2265#endif
2266 }
2267
a4c2fd7f 2268 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
efd85700 2269 set_lwt_redirect(rth);
1da177e4 2270
5ada5527 2271 return rth;
1da177e4
LT
2272}
2273
1da177e4
LT
2274/*
2275 * Major route resolver routine.
2276 */
2277
3abd1ade
DA
2278struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2279 const struct sk_buff *skb)
1da177e4 2280{
f61759e6 2281 __u8 tos = RT_FL_TOS(fl4);
813b3b5d 2282 struct fib_result res;
5ada5527 2283 struct rtable *rth;
1da177e4 2284
85b91b03 2285 res.tclassid = 0;
1da177e4 2286 res.fi = NULL;
8b96d22d 2287 res.table = NULL;
1da177e4 2288
1fb9489b 2289 fl4->flowi4_iif = LOOPBACK_IFINDEX;
813b3b5d
DM
2290 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2291 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2292 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
44713b67 2293
010c2708 2294 rcu_read_lock();
3abd1ade
DA
2295 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2296 rcu_read_unlock();
2297
2298 return rth;
2299}
2300EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2301
2302struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2303 struct fib_result *res,
2304 const struct sk_buff *skb)
2305{
2306 struct net_device *dev_out = NULL;
2307 int orig_oif = fl4->flowi4_oif;
2308 unsigned int flags = 0;
2309 struct rtable *rth;
2310 int err = -ENETUNREACH;
2311
813b3b5d 2312 if (fl4->saddr) {
b23dd4fe 2313 rth = ERR_PTR(-EINVAL);
813b3b5d
DM
2314 if (ipv4_is_multicast(fl4->saddr) ||
2315 ipv4_is_lbcast(fl4->saddr) ||
2316 ipv4_is_zeronet(fl4->saddr))
1da177e4
LT
2317 goto out;
2318
1da177e4
LT
2319 /* I removed the check for oif == dev_out->oif here.
2320 It was wrong for two reasons:
1ab35276
DL
2321 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2322 is assigned to multiple interfaces.
1da177e4
LT
2323 2. Moreover, we are allowed to send packets with the saddr
2324 of another iface. --ANK
2325 */
2326
813b3b5d
DM
2327 if (fl4->flowi4_oif == 0 &&
2328 (ipv4_is_multicast(fl4->daddr) ||
2329 ipv4_is_lbcast(fl4->daddr))) {
a210d01a 2330 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2331 dev_out = __ip_dev_find(net, fl4->saddr, false);
51456b29 2332 if (!dev_out)
a210d01a
JA
2333 goto out;
2334
1da177e4
LT
2335 /* Special hack: the user can direct multicasts
2336 and limited broadcast via the necessary interface
2337 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2338 This hack is not just for fun, it allows
2339 vic, vat and friends to work.
2340 They bind a socket to loopback, set the ttl to zero
2341 and expect that it will work.
2342 From the viewpoint of the routing cache they are broken,
2343 because we are not allowed to build a multicast path
2344 with a loopback source addr (look, the routing cache
2345 cannot know that the ttl is zero, so the packet
2346 will not leave this host and the route is valid).
2347 Luckily, this hack is a good workaround.
2348 */
2349
813b3b5d 2350 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2351 goto make_route;
2352 }
a210d01a 2353
813b3b5d 2354 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
a210d01a 2355 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2356 if (!__ip_dev_find(net, fl4->saddr, false))
a210d01a 2357 goto out;
a210d01a 2358 }
1da177e4
LT
2359 }
2360
2361
813b3b5d
DM
2362 if (fl4->flowi4_oif) {
2363 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
b23dd4fe 2364 rth = ERR_PTR(-ENODEV);
51456b29 2365 if (!dev_out)
1da177e4 2366 goto out;
e5ed6399
HX
2367
2368 /* RACE: Check return value of inet_select_addr instead. */
fc75fc83 2369 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
b23dd4fe 2370 rth = ERR_PTR(-ENETUNREACH);
fc75fc83
ED
2371 goto out;
2372 }
813b3b5d 2373 if (ipv4_is_local_multicast(fl4->daddr) ||
6a211654
AL
2374 ipv4_is_lbcast(fl4->daddr) ||
2375 fl4->flowi4_proto == IPPROTO_IGMP) {
813b3b5d
DM
2376 if (!fl4->saddr)
2377 fl4->saddr = inet_select_addr(dev_out, 0,
2378 RT_SCOPE_LINK);
1da177e4
LT
2379 goto make_route;
2380 }
0a7e2260 2381 if (!fl4->saddr) {
813b3b5d
DM
2382 if (ipv4_is_multicast(fl4->daddr))
2383 fl4->saddr = inet_select_addr(dev_out, 0,
2384 fl4->flowi4_scope);
2385 else if (!fl4->daddr)
2386 fl4->saddr = inet_select_addr(dev_out, 0,
2387 RT_SCOPE_HOST);
1da177e4
LT
2388 }
2389 }
2390
813b3b5d
DM
2391 if (!fl4->daddr) {
2392 fl4->daddr = fl4->saddr;
2393 if (!fl4->daddr)
2394 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
b40afd0e 2395 dev_out = net->loopback_dev;
1fb9489b 2396 fl4->flowi4_oif = LOOPBACK_IFINDEX;
3abd1ade 2397 res->type = RTN_LOCAL;
1da177e4
LT
2398 flags |= RTCF_LOCAL;
2399 goto make_route;
2400 }
2401
3abd1ade 2402 err = fib_lookup(net, fl4, res, 0);
0315e382 2403 if (err) {
3abd1ade
DA
2404 res->fi = NULL;
2405 res->table = NULL;
6104e112 2406 if (fl4->flowi4_oif &&
e58e4159
DA
2407 (ipv4_is_multicast(fl4->daddr) ||
2408 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
1da177e4
LT
2409 /* Apparently, the routing tables are wrong. Assume
2410 that the destination is on-link.
2411
2412 WHY? DW.
2413 Because we are allowed to send to an iface
2414 even if it has NO routes and NO assigned
2415 addresses. When oif is specified, the routing
2416 tables are looked up with only one purpose:
2417 to catch whether the destination is gatewayed, rather than
2418 direct. Moreover, if MSG_DONTROUTE is set,
2419 we send the packet, ignoring both routing tables
2420 and ifaddr state. --ANK
2421
2422
2423 We could do this even when oif is unknown,
2424 as IPv6 likely does, but we do not.
2425 */
2426
813b3b5d
DM
2427 if (fl4->saddr == 0)
2428 fl4->saddr = inet_select_addr(dev_out, 0,
2429 RT_SCOPE_LINK);
3abd1ade 2430 res->type = RTN_UNICAST;
1da177e4
LT
2431 goto make_route;
2432 }
0315e382 2433 rth = ERR_PTR(err);
1da177e4
LT
2434 goto out;
2435 }
1da177e4 2436
3abd1ade 2437 if (res->type == RTN_LOCAL) {
813b3b5d 2438 if (!fl4->saddr) {
3abd1ade
DA
2439 if (res->fi->fib_prefsrc)
2440 fl4->saddr = res->fi->fib_prefsrc;
9fc3bbb4 2441 else
813b3b5d 2442 fl4->saddr = fl4->daddr;
9fc3bbb4 2443 }
5f02ce24
DA
2444
2445 /* L3 master device is the loopback for that domain */
3abd1ade 2446 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
b7c8487c 2447 net->loopback_dev;
813b3b5d 2448 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2449 flags |= RTCF_LOCAL;
2450 goto make_route;
2451 }
2452
3abd1ade 2453 fib_select_path(net, res, fl4, skb);
1da177e4 2454
3abd1ade 2455 dev_out = FIB_RES_DEV(*res);
813b3b5d 2456 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2457
2458
2459make_route:
3abd1ade 2460 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
1da177e4 2461
010c2708 2462out:
b23dd4fe 2463 return rth;
1da177e4 2464}
d8c97a94 2465
ae2688d5
JW
2466static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2467{
2468 return NULL;
2469}
2470
ebb762f2 2471static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 2472{
618f9bc7
SK
2473 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2474
2475 return mtu ? : dst->dev->mtu;
ec831ea7
RD
2476}
2477
6700c270
DM
2478static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2479 struct sk_buff *skb, u32 mtu)
14e50e57
DM
2480{
2481}
2482
6700c270
DM
2483static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2484 struct sk_buff *skb)
b587ee3b
DM
2485{
2486}
2487
0972ddb2
HB
2488static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2489 unsigned long old)
2490{
2491 return NULL;
2492}
2493
14e50e57
DM
2494static struct dst_ops ipv4_dst_blackhole_ops = {
2495 .family = AF_INET,
ae2688d5 2496 .check = ipv4_blackhole_dst_check,
ebb762f2 2497 .mtu = ipv4_blackhole_mtu,
214f45c9 2498 .default_advmss = ipv4_default_advmss,
14e50e57 2499 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
b587ee3b 2500 .redirect = ipv4_rt_blackhole_redirect,
0972ddb2 2501 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
d3aaeb38 2502 .neigh_lookup = ipv4_neigh_lookup,
14e50e57
DM
2503};
2504
2774c131 2505struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
14e50e57 2506{
2774c131 2507 struct rtable *ort = (struct rtable *) dst_orig;
f5b0a874 2508 struct rtable *rt;
14e50e57 2509
b2a9c0ed 2510 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
14e50e57 2511 if (rt) {
d8d1f30b 2512 struct dst_entry *new = &rt->dst;
14e50e57 2513
14e50e57 2514 new->__use = 1;
352e512c 2515 new->input = dst_discard;
ede2059d 2516 new->output = dst_discard_out;
14e50e57 2517
1dbe3252 2518 new->dev = net->loopback_dev;
14e50e57
DM
2519 if (new->dev)
2520 dev_hold(new->dev);
2521
9917e1e8 2522 rt->rt_is_input = ort->rt_is_input;
5e2b61f7 2523 rt->rt_iif = ort->rt_iif;
5943634f 2524 rt->rt_pmtu = ort->rt_pmtu;
14e50e57 2525
ca4c3fc2 2526 rt->rt_genid = rt_genid_ipv4(net);
14e50e57
DM
2527 rt->rt_flags = ort->rt_flags;
2528 rt->rt_type = ort->rt_type;
14e50e57 2529 rt->rt_gateway = ort->rt_gateway;
155e8336 2530 rt->rt_uses_gateway = ort->rt_uses_gateway;
14e50e57 2531
caacf05e 2532 INIT_LIST_HEAD(&rt->rt_uncached);
14e50e57
DM
2533 }
2534
2774c131
DM
2535 dst_release(dst_orig);
2536
2537 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
14e50e57
DM
2538}
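/* Editorial note (hedged): the blackhole copy above keeps the flow
 * identity of the original route but discards every packet via
 * dst_discard/dst_discard_out; xfrm can use it when a route object must
 * exist without ever transmitting.
 */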
2539
9d6ec938 2540struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
6f9c9615 2541 const struct sock *sk)
1da177e4 2542{
9d6ec938 2543 struct rtable *rt = __ip_route_output_key(net, flp4);
1da177e4 2544
b23dd4fe
DM
2545 if (IS_ERR(rt))
2546 return rt;
1da177e4 2547
56157872 2548 if (flp4->flowi4_proto)
f92ee619
SK
2549 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2550 flowi4_to_flowi(flp4),
2551 sk, 0);
1da177e4 2552
b23dd4fe 2553 return rt;
1da177e4 2554}
d8c97a94
ACM
2555EXPORT_SYMBOL_GPL(ip_route_output_flow);
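/* Editorial usage sketch (hedged, not part of the original file): a
 * typical in-kernel caller resolves an output route like
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.flowi4_proto	= IPPROTO_UDP,
 *	};
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 *
 * A zero fl4.saddr is filled in by the resolver along the way.
 */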
2556
3765d35e 2557/* called with rcu_read_lock held */
c36ba660 2558static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
15e47304 2559 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
ba52d61e 2560 u32 seq)
1da177e4 2561{
ba52d61e 2562 struct rtable *rt = skb_rtable(skb);
1da177e4 2563 struct rtmsg *r;
be403ea1 2564 struct nlmsghdr *nlh;
2bc8ca40 2565 unsigned long expires = 0;
f185071d 2566 u32 error;
521f5490 2567 u32 metrics[RTAX_MAX];
be403ea1 2568
d3166e0c 2569 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
51456b29 2570 if (!nlh)
26932566 2571 return -EMSGSIZE;
be403ea1
TG
2572
2573 r = nlmsg_data(nlh);
1da177e4
LT
2574 r->rtm_family = AF_INET;
2575 r->rtm_dst_len = 32;
2576 r->rtm_src_len = 0;
d6c0a4f6 2577 r->rtm_tos = fl4->flowi4_tos;
8a430ed5 2578 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
c36ba660 2579 if (nla_put_u32(skb, RTA_TABLE, table_id))
f3756b79 2580 goto nla_put_failure;
1da177e4
LT
2581 r->rtm_type = rt->rt_type;
2582 r->rtm_scope = RT_SCOPE_UNIVERSE;
2583 r->rtm_protocol = RTPROT_UNSPEC;
2584 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2585 if (rt->rt_flags & RTCF_NOTIFY)
2586 r->rtm_flags |= RTM_F_NOTIFY;
df4d9254
HFS
2587 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2588 r->rtm_flags |= RTCF_DOREDIRECT;
be403ea1 2589
930345ea 2590 if (nla_put_in_addr(skb, RTA_DST, dst))
f3756b79 2591 goto nla_put_failure;
1a00fee4 2592 if (src) {
1da177e4 2593 r->rtm_src_len = 32;
930345ea 2594 if (nla_put_in_addr(skb, RTA_SRC, src))
f3756b79 2595 goto nla_put_failure;
1da177e4 2596 }
f3756b79
DM
2597 if (rt->dst.dev &&
2598 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2599 goto nla_put_failure;
c7066f70 2600#ifdef CONFIG_IP_ROUTE_CLASSID
f3756b79
DM
2601 if (rt->dst.tclassid &&
2602 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2603 goto nla_put_failure;
1da177e4 2604#endif
41347dcd 2605 if (!rt_is_input_route(rt) &&
d6c0a4f6 2606 fl4->saddr != src) {
930345ea 2607 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
f3756b79
DM
2608 goto nla_put_failure;
2609 }
155e8336 2610 if (rt->rt_uses_gateway &&
930345ea 2611 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
f3756b79 2612 goto nla_put_failure;
be403ea1 2613
ee9a8f7a
SK
2614 expires = rt->dst.expires;
2615 if (expires) {
2616 unsigned long now = jiffies;
2617
2618 if (time_before(now, expires))
2619 expires -= now;
2620 else
2621 expires = 0;
2622 }
2623
521f5490 2624 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
ee9a8f7a 2625 if (rt->rt_pmtu && expires)
521f5490
JA
2626 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2627 if (rtnetlink_put_metrics(skb, metrics) < 0)
be403ea1
TG
2628 goto nla_put_failure;
2629
b4869889 2630 if (fl4->flowi4_mark &&
68aaed54 2631 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
f3756b79 2632 goto nla_put_failure;
963bfeee 2633
622ec2c9
LC
2634 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2635 nla_put_u32(skb, RTA_UID,
2636 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2637 goto nla_put_failure;
2638
d8d1f30b 2639 error = rt->dst.error;
be403ea1 2640
c7537967 2641 if (rt_is_input_route(rt)) {
8caaf7b6
ND
2642#ifdef CONFIG_IP_MROUTE
2643 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2644 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2645 int err = ipmr_get_route(net, skb,
2646 fl4->saddr, fl4->daddr,
9f09eaea 2647 r, portid);
2cf75070 2648
8caaf7b6 2649 if (err <= 0) {
0c8d803f
DA
2650 if (err == 0)
2651 return 0;
2652 goto nla_put_failure;
8caaf7b6
ND
2653 }
2654 } else
2655#endif
91146153 2656 if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
8caaf7b6 2657 goto nla_put_failure;
1da177e4
LT
2658 }
2659
f185071d 2660 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
e3703b3d 2661 goto nla_put_failure;
be403ea1 2662
053c095a
JB
2663 nlmsg_end(skb, nlh);
2664 return 0;
1da177e4 2665
be403ea1 2666nla_put_failure:
26932566
PM
2667 nlmsg_cancel(skb, nlh);
2668 return -EMSGSIZE;
1da177e4
LT
2669}
2670
c21ef3e3
DA
2671static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2672 struct netlink_ext_ack *extack)
1da177e4 2673{
3b1e0a65 2674 struct net *net = sock_net(in_skb->sk);
d889ce3b
TG
2675 struct rtmsg *rtm;
2676 struct nlattr *tb[RTA_MAX+1];
3765d35e 2677 struct fib_result res = {};
1da177e4 2678 struct rtable *rt = NULL;
d6c0a4f6 2679 struct flowi4 fl4;
9e12bb22
AV
2680 __be32 dst = 0;
2681 __be32 src = 0;
2682 u32 iif;
d889ce3b 2683 int err;
963bfeee 2684 int mark;
1da177e4 2685 struct sk_buff *skb;
c36ba660 2686 u32 table_id = RT_TABLE_MAIN;
622ec2c9 2687 kuid_t uid;
1da177e4 2688
fceb6435 2689 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
c21ef3e3 2690 extack);
d889ce3b
TG
2691 if (err < 0)
2692 goto errout;
2693
2694 rtm = nlmsg_data(nlh);
2695
1da177e4 2696 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
51456b29 2697 if (!skb) {
d889ce3b
TG
2698 err = -ENOBUFS;
2699 goto errout;
2700 }
1da177e4
LT
2701
2702 /* Reserve room for dummy headers; this skb can pass
2703 through a good chunk of the routing engine.
2704 */
459a98ed 2705 skb_reset_mac_header(skb);
c1d2bbe1 2706 skb_reset_network_header(skb);
d2c962b8 2707
67b61f6c
JB
2708 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2709 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
d889ce3b 2710 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
963bfeee 2711 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
622ec2c9
LC
2712 if (tb[RTA_UID])
2713 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2714 else
2715 uid = (iif ? INVALID_UID : current_uid());
1da177e4 2716
bbadb9a2
FL
2717 /* Bugfix: need to give ip_route_input enough of an IP header to
2718 * not gag.
2719 */
2720 ip_hdr(skb)->protocol = IPPROTO_UDP;
2721 ip_hdr(skb)->saddr = src;
2722 ip_hdr(skb)->daddr = dst;
2723
2724 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2725
d6c0a4f6
DM
2726 memset(&fl4, 0, sizeof(fl4));
2727 fl4.daddr = dst;
2728 fl4.saddr = src;
2729 fl4.flowi4_tos = rtm->rtm_tos;
2730 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2731 fl4.flowi4_mark = mark;
622ec2c9 2732 fl4.flowi4_uid = uid;
d6c0a4f6 2733
3765d35e
DA
2734 rcu_read_lock();
2735
1da177e4 2736 if (iif) {
d889ce3b
TG
2737 struct net_device *dev;
2738
3765d35e 2739 dev = dev_get_by_index_rcu(net, iif);
51456b29 2740 if (!dev) {
d889ce3b
TG
2741 err = -ENODEV;
2742 goto errout_free;
2743 }
2744
1da177e4
LT
2745 skb->protocol = htons(ETH_P_IP);
2746 skb->dev = dev;
963bfeee 2747 skb->mark = mark;
3765d35e
DA
2748 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2749 dev, &res);
d889ce3b 2750
511c3f92 2751 rt = skb_rtable(skb);
d8d1f30b
CG
2752 if (err == 0 && rt->dst.error)
2753 err = -rt->dst.error;
1da177e4 2754 } else {
3765d35e 2755 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
b23dd4fe
DM
2756 err = 0;
2757 if (IS_ERR(rt))
2758 err = PTR_ERR(rt);
2c87d63a
FW
2759 else
2760 skb_dst_set(skb, &rt->dst);
1da177e4 2761 }
d889ce3b 2762
1da177e4 2763 if (err)
d889ce3b 2764 goto errout_free;
1da177e4 2765
1da177e4
LT
2766 if (rtm->rtm_flags & RTM_F_NOTIFY)
2767 rt->rt_flags |= RTCF_NOTIFY;
2768
c36ba660
DA
2769 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2770 table_id = rt->rt_table_id;
2771
bc3aae2b
RP
2772 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2773 if (!res.fi) {
2774 err = fib_props[res.type].error;
2775 if (!err)
2776 err = -EHOSTUNREACH;
2777 goto errout_free;
2778 }
b6179813
RP
2779 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2780 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2781 rt->rt_type, res.prefix, res.prefixlen,
2782 fl4.flowi4_tos, res.fi, 0);
bc3aae2b 2783 } else {
b6179813 2784 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
ba52d61e 2785 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
bc3aae2b 2786 }
7b46a644 2787 if (err < 0)
d889ce3b 2788 goto errout_free;
1da177e4 2789
3765d35e
DA
2790 rcu_read_unlock();
2791
15e47304 2792 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
d889ce3b 2793errout:
2942e900 2794 return err;
1da177e4 2795
d889ce3b 2796errout_free:
3765d35e 2797 rcu_read_unlock();
1da177e4 2798 kfree_skb(skb);
d889ce3b 2799 goto errout;
1da177e4
LT
2800}
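/* Editorial note (hedged): this handler serves RTM_GETROUTE requests,
 * i.e. what iproute2 issues for "ip route get 8.8.8.8 [iif ethX]"; the
 * dummy UDP header built above is what lets the synthetic skb survive
 * the real input/output resolvers.
 */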
2801
1da177e4
LT
2802void ip_rt_multicast_event(struct in_device *in_dev)
2803{
4ccfe6d4 2804 rt_cache_flush(dev_net(in_dev->dev));
1da177e4
LT
2805}
2806
2807#ifdef CONFIG_SYSCTL
082c7ca4
G
2808static int ip_rt_gc_interval __read_mostly = 60 * HZ;
2809static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
2810static int ip_rt_gc_elasticity __read_mostly = 8;
2811
fe2c6338 2812static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
8d65af78 2813 void __user *buffer,
1da177e4
LT
2814 size_t *lenp, loff_t *ppos)
2815{
5aad1de5
TT
2816 struct net *net = (struct net *)__ctl->extra1;
2817
1da177e4 2818 if (write) {
5aad1de5
TT
2819 rt_cache_flush(net);
2820 fnhe_genid_bump(net);
1da177e4 2821 return 0;
e905a9ed 2822 }
1da177e4
LT
2823
2824 return -EINVAL;
2825}
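/* Editorial note (hedged): this write-only handler backs
 * /proc/sys/net/ipv4/route/flush (see ipv4_route_flush_table below), so
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * flushes the cache and bumps the fnhe genid for the writer's netns.
 */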
2826
fe2c6338 2827static struct ctl_table ipv4_route_table[] = {
1da177e4 2828 {
1da177e4
LT
2829 .procname = "gc_thresh",
2830 .data = &ipv4_dst_ops.gc_thresh,
2831 .maxlen = sizeof(int),
2832 .mode = 0644,
6d9f239a 2833 .proc_handler = proc_dointvec,
1da177e4
LT
2834 },
2835 {
1da177e4
LT
2836 .procname = "max_size",
2837 .data = &ip_rt_max_size,
2838 .maxlen = sizeof(int),
2839 .mode = 0644,
6d9f239a 2840 .proc_handler = proc_dointvec,
1da177e4
LT
2841 },
2842 {
2843 /* Deprecated. Use gc_min_interval_ms */
e905a9ed 2844
1da177e4
LT
2845 .procname = "gc_min_interval",
2846 .data = &ip_rt_gc_min_interval,
2847 .maxlen = sizeof(int),
2848 .mode = 0644,
6d9f239a 2849 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
2850 },
2851 {
1da177e4
LT
2852 .procname = "gc_min_interval_ms",
2853 .data = &ip_rt_gc_min_interval,
2854 .maxlen = sizeof(int),
2855 .mode = 0644,
6d9f239a 2856 .proc_handler = proc_dointvec_ms_jiffies,
1da177e4
LT
2857 },
2858 {
1da177e4
LT
2859 .procname = "gc_timeout",
2860 .data = &ip_rt_gc_timeout,
2861 .maxlen = sizeof(int),
2862 .mode = 0644,
6d9f239a 2863 .proc_handler = proc_dointvec_jiffies,
1da177e4 2864 },
9f28a2fc
ED
2865 {
2866 .procname = "gc_interval",
2867 .data = &ip_rt_gc_interval,
2868 .maxlen = sizeof(int),
2869 .mode = 0644,
2870 .proc_handler = proc_dointvec_jiffies,
2871 },
1da177e4 2872 {
1da177e4
LT
2873 .procname = "redirect_load",
2874 .data = &ip_rt_redirect_load,
2875 .maxlen = sizeof(int),
2876 .mode = 0644,
6d9f239a 2877 .proc_handler = proc_dointvec,
1da177e4
LT
2878 },
2879 {
1da177e4
LT
2880 .procname = "redirect_number",
2881 .data = &ip_rt_redirect_number,
2882 .maxlen = sizeof(int),
2883 .mode = 0644,
6d9f239a 2884 .proc_handler = proc_dointvec,
1da177e4
LT
2885 },
2886 {
1da177e4
LT
2887 .procname = "redirect_silence",
2888 .data = &ip_rt_redirect_silence,
2889 .maxlen = sizeof(int),
2890 .mode = 0644,
6d9f239a 2891 .proc_handler = proc_dointvec,
1da177e4
LT
2892 },
2893 {
1da177e4
LT
2894 .procname = "error_cost",
2895 .data = &ip_rt_error_cost,
2896 .maxlen = sizeof(int),
2897 .mode = 0644,
6d9f239a 2898 .proc_handler = proc_dointvec,
1da177e4
LT
2899 },
2900 {
1da177e4
LT
2901 .procname = "error_burst",
2902 .data = &ip_rt_error_burst,
2903 .maxlen = sizeof(int),
2904 .mode = 0644,
6d9f239a 2905 .proc_handler = proc_dointvec,
1da177e4
LT
2906 },
2907 {
1da177e4
LT
2908 .procname = "gc_elasticity",
2909 .data = &ip_rt_gc_elasticity,
2910 .maxlen = sizeof(int),
2911 .mode = 0644,
6d9f239a 2912 .proc_handler = proc_dointvec,
1da177e4
LT
2913 },
2914 {
1da177e4
LT
2915 .procname = "mtu_expires",
2916 .data = &ip_rt_mtu_expires,
2917 .maxlen = sizeof(int),
2918 .mode = 0644,
6d9f239a 2919 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
2920 },
2921 {
1da177e4
LT
2922 .procname = "min_pmtu",
2923 .data = &ip_rt_min_pmtu,
2924 .maxlen = sizeof(int),
2925 .mode = 0644,
6d9f239a 2926 .proc_handler = proc_dointvec,
1da177e4
LT
2927 },
2928 {
1da177e4
LT
2929 .procname = "min_adv_mss",
2930 .data = &ip_rt_min_advmss,
2931 .maxlen = sizeof(int),
2932 .mode = 0644,
6d9f239a 2933 .proc_handler = proc_dointvec,
1da177e4 2934 },
f8572d8f 2935 { }
1da177e4 2936};
39a23e75 2937
39a23e75
DL
2938static struct ctl_table ipv4_route_flush_table[] = {
2939 {
39a23e75
DL
2940 .procname = "flush",
2941 .maxlen = sizeof(int),
2942 .mode = 0200,
6d9f239a 2943 .proc_handler = ipv4_sysctl_rtcache_flush,
39a23e75 2944 },
f8572d8f 2945 { },
39a23e75
DL
2946};
2947
2948static __net_init int sysctl_route_net_init(struct net *net)
2949{
2950 struct ctl_table *tbl;
2951
2952 tbl = ipv4_route_flush_table;
09ad9bc7 2953 if (!net_eq(net, &init_net)) {
39a23e75 2954 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
51456b29 2955 if (!tbl)
39a23e75 2956 goto err_dup;
464dc801
EB
2957
2958 /* Don't export sysctls to unprivileged users */
2959 if (net->user_ns != &init_user_ns)
2960 tbl[0].procname = NULL;
39a23e75
DL
2961 }
2962 tbl[0].extra1 = net;
2963
ec8f23ce 2964 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
51456b29 2965 if (!net->ipv4.route_hdr)
39a23e75
DL
2966 goto err_reg;
2967 return 0;
2968
2969err_reg:
2970 if (tbl != ipv4_route_flush_table)
2971 kfree(tbl);
2972err_dup:
2973 return -ENOMEM;
2974}
2975
2976static __net_exit void sysctl_route_net_exit(struct net *net)
2977{
2978 struct ctl_table *tbl;
2979
2980 tbl = net->ipv4.route_hdr->ctl_table_arg;
2981 unregister_net_sysctl_table(net->ipv4.route_hdr);
2982 BUG_ON(tbl == ipv4_route_flush_table);
2983 kfree(tbl);
2984}
2985
2986static __net_initdata struct pernet_operations sysctl_route_ops = {
2987 .init = sysctl_route_net_init,
2988 .exit = sysctl_route_net_exit,
2989};
1da177e4
LT
2990#endif
2991
3ee94372 2992static __net_init int rt_genid_init(struct net *net)
9f5e97e5 2993{
ca4c3fc2 2994 atomic_set(&net->ipv4.rt_genid, 0);
5aad1de5 2995 atomic_set(&net->fnhe_genid, 0);
7aed9f72 2996 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
9f5e97e5
DL
2997 return 0;
2998}
2999
3ee94372
NH
3000static __net_initdata struct pernet_operations rt_genid_ops = {
3001 .init = rt_genid_init,
9f5e97e5
DL
3002};
3003
c3426b47
DM
3004static int __net_init ipv4_inetpeer_init(struct net *net)
3005{
3006 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3007
3008 if (!bp)
3009 return -ENOMEM;
3010 inet_peer_base_init(bp);
3011 net->ipv4.peers = bp;
3012 return 0;
3013}
3014
3015static void __net_exit ipv4_inetpeer_exit(struct net *net)
3016{
3017 struct inet_peer_base *bp = net->ipv4.peers;
3018
3019 net->ipv4.peers = NULL;
56a6b248 3020 inetpeer_invalidate_tree(bp);
c3426b47
DM
3021 kfree(bp);
3022}
3023
3024static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3025 .init = ipv4_inetpeer_init,
3026 .exit = ipv4_inetpeer_exit,
3027};
9f5e97e5 3028
c7066f70 3029#ifdef CONFIG_IP_ROUTE_CLASSID
7d720c3e 3030struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
c7066f70 3031#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4 3032
1da177e4
LT
3033int __init ip_rt_init(void)
3034{
424c4b70 3035 int rc = 0;
5055c371 3036 int cpu;
1da177e4 3037
73f156a6
ED
3038 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3039 if (!ip_idents)
3040 panic("IP: failed to allocate ip_idents\n");
3041
3042 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3043
355b590c
ED
3044 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3045 if (!ip_tstamps)
3046 panic("IP: failed to allocate ip_tstamps\n");
3047
5055c371
ED
3048 for_each_possible_cpu(cpu) {
3049 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3050
3051 INIT_LIST_HEAD(&ul->head);
3052 spin_lock_init(&ul->lock);
3053 }
c7066f70 3054#ifdef CONFIG_IP_ROUTE_CLASSID
0dcec8c2 3055 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
1da177e4
LT
3056 if (!ip_rt_acct)
3057 panic("IP: failed to allocate ip_rt_acct\n");
1da177e4
LT
3058#endif
3059
e5d679f3
AD
3060 ipv4_dst_ops.kmem_cachep =
3061 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
20c2df83 3062 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1da177e4 3063
14e50e57
DM
3064 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3065
fc66f95c
ED
3066 if (dst_entries_init(&ipv4_dst_ops) < 0)
3067 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3068
3069 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3070 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3071
89aef892
DM
3072 ipv4_dst_ops.gc_thresh = ~0;
3073 ip_rt_max_size = INT_MAX;
1da177e4 3074
1da177e4
LT
3075 devinet_init();
3076 ip_fib_init();
3077
73b38711 3078 if (ip_rt_proc_init())
058bd4d2 3079 pr_err("Unable to create route proc files\n");
1da177e4
LT
3080#ifdef CONFIG_XFRM
3081 xfrm_init();
703fb94e 3082 xfrm4_init();
1da177e4 3083#endif
c7ac8679 3084 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
63f3444f 3085
39a23e75
DL
3086#ifdef CONFIG_SYSCTL
3087 register_pernet_subsys(&sysctl_route_ops);
3088#endif
3ee94372 3089 register_pernet_subsys(&rt_genid_ops);
c3426b47 3090 register_pernet_subsys(&ipv4_inetpeer_ops);
1da177e4
LT
3091 return rc;
3092}
3093
a1bc6eb4 3094#ifdef CONFIG_SYSCTL
eeb61f71
AV
3095/*
3096 * We really need to sanitize the damn ipv4 init order, then all
3097 * this nonsense will go away.
3098 */
3099void __init ip_static_sysctl_init(void)
3100{
4e5ca785 3101 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
eeb61f71 3102}
a1bc6eb4 3103#endif