]> git.proxmox.com Git - mirror_ubuntu-eoan-kernel.git/blame - net/ipv4/route.c
xfrm: Move dst->path into struct xfrm_dst
[mirror_ubuntu-eoan-kernel.git] / net / ipv4 / route.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
e905a9ed 21 * Alan Cox : Super /proc >4K
1da177e4
LT
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
e905a9ed 39 *
1da177e4
LT
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
afd46503
JP
65#define pr_fmt(fmt) "IPv4: " fmt
66
1da177e4 67#include <linux/module.h>
7c0f6ba6 68#include <linux/uaccess.h>
1da177e4
LT
69#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
1da177e4
LT
72#include <linux/mm.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/skbuff.h>
1da177e4
LT
83#include <linux/inetdevice.h>
84#include <linux/igmp.h>
85#include <linux/pkt_sched.h>
86#include <linux/mroute.h>
87#include <linux/netfilter_ipv4.h>
88#include <linux/random.h>
1da177e4
LT
89#include <linux/rcupdate.h>
90#include <linux/times.h>
5a0e3ad6 91#include <linux/slab.h>
73f156a6 92#include <linux/jhash.h>
352e512c 93#include <net/dst.h>
1b7179d3 94#include <net/dst_metadata.h>
457c4cbc 95#include <net/net_namespace.h>
1da177e4
LT
96#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
571e7226 106#include <net/lwtunnel.h>
8d71740c 107#include <net/netevent.h>
63f3444f 108#include <net/rtnetlink.h>
1da177e4
LT
109#ifdef CONFIG_SYSCTL
110#include <linux/sysctl.h>
7426a564 111#include <linux/kmemleak.h>
1da177e4 112#endif
6e5714ea 113#include <net/secure_seq.h>
1b7179d3 114#include <net/ip_tunnels.h>
385add90 115#include <net/l3mdev.h>
1da177e4 116
b6179813
RP
117#include "fib_lookup.h"
118
68a5e3dd 119#define RT_FL_TOS(oldflp4) \
f61759e6 120 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
1da177e4 121
1da177e4
LT
122#define RT_GC_TIMEOUT (300*HZ)
123
1da177e4 124static int ip_rt_max_size;
817bc4db
SH
125static int ip_rt_redirect_number __read_mostly = 9;
126static int ip_rt_redirect_load __read_mostly = HZ / 50;
127static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128static int ip_rt_error_cost __read_mostly = HZ;
129static int ip_rt_error_burst __read_mostly = 5 * HZ;
817bc4db
SH
130static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
131static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
132static int ip_rt_min_advmss __read_mostly = 256;
9f28a2fc 133
deed49df 134static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
1da177e4
LT
135/*
136 * Interface to generic destination cache.
137 */
138
139static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
0dbaee3b 140static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
ebb762f2 141static unsigned int ipv4_mtu(const struct dst_entry *dst);
1da177e4
LT
142static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
143static void ipv4_link_failure(struct sk_buff *skb);
6700c270
DM
144static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
145 struct sk_buff *skb, u32 mtu);
146static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
147 struct sk_buff *skb);
caacf05e 148static void ipv4_dst_destroy(struct dst_entry *dst);
1da177e4 149
62fa8a84
DM
150static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
151{
31248731
DM
152 WARN_ON(1);
153 return NULL;
62fa8a84
DM
154}
155
f894cbf8
DM
156static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
157 struct sk_buff *skb,
158 const void *daddr);
63fca65d 159static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
d3aaeb38 160
1da177e4
LT
161static struct dst_ops ipv4_dst_ops = {
162 .family = AF_INET,
1da177e4 163 .check = ipv4_dst_check,
0dbaee3b 164 .default_advmss = ipv4_default_advmss,
ebb762f2 165 .mtu = ipv4_mtu,
62fa8a84 166 .cow_metrics = ipv4_cow_metrics,
caacf05e 167 .destroy = ipv4_dst_destroy,
1da177e4
LT
168 .negative_advice = ipv4_negative_advice,
169 .link_failure = ipv4_link_failure,
170 .update_pmtu = ip_rt_update_pmtu,
e47a185b 171 .redirect = ip_do_redirect,
b92dacd4 172 .local_out = __ip_local_out,
d3aaeb38 173 .neigh_lookup = ipv4_neigh_lookup,
63fca65d 174 .confirm_neigh = ipv4_confirm_neigh,
1da177e4
LT
175};
176
177#define ECN_OR_COST(class) TC_PRIO_##class
178
4839c52b 179const __u8 ip_tos2prio[16] = {
1da177e4 180 TC_PRIO_BESTEFFORT,
4a2b9c37 181 ECN_OR_COST(BESTEFFORT),
1da177e4
LT
182 TC_PRIO_BESTEFFORT,
183 ECN_OR_COST(BESTEFFORT),
184 TC_PRIO_BULK,
185 ECN_OR_COST(BULK),
186 TC_PRIO_BULK,
187 ECN_OR_COST(BULK),
188 TC_PRIO_INTERACTIVE,
189 ECN_OR_COST(INTERACTIVE),
190 TC_PRIO_INTERACTIVE,
191 ECN_OR_COST(INTERACTIVE),
192 TC_PRIO_INTERACTIVE_BULK,
193 ECN_OR_COST(INTERACTIVE_BULK),
194 TC_PRIO_INTERACTIVE_BULK,
195 ECN_OR_COST(INTERACTIVE_BULK)
196};
d4a96865 197EXPORT_SYMBOL(ip_tos2prio);
1da177e4 198
2f970d83 199static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
3ed66e91 200#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
1da177e4 201
1da177e4 202#ifdef CONFIG_PROC_FS
1da177e4
LT
203static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204{
29e75252 205 if (*pos)
89aef892 206 return NULL;
29e75252 207 return SEQ_START_TOKEN;
1da177e4
LT
208}
209
210static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
211{
1da177e4 212 ++*pos;
89aef892 213 return NULL;
1da177e4
LT
214}
215
/* Nothing to release; the walk takes no locks or references. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
219
220static int rt_cache_seq_show(struct seq_file *seq, void *v)
221{
222 if (v == SEQ_START_TOKEN)
223 seq_printf(seq, "%-127s\n",
224 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
225 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
226 "HHUptod\tSpecDst");
e905a9ed 227 return 0;
1da177e4
LT
228}
229
f690808e 230static const struct seq_operations rt_cache_seq_ops = {
1da177e4
LT
231 .start = rt_cache_seq_start,
232 .next = rt_cache_seq_next,
233 .stop = rt_cache_seq_stop,
234 .show = rt_cache_seq_show,
235};
236
237static int rt_cache_seq_open(struct inode *inode, struct file *file)
238{
89aef892 239 return seq_open(file, &rt_cache_seq_ops);
1da177e4
LT
240}
241
9a32144e 242static const struct file_operations rt_cache_seq_fops = {
1da177e4
LT
243 .owner = THIS_MODULE,
244 .open = rt_cache_seq_open,
245 .read = seq_read,
246 .llseek = seq_lseek,
89aef892 247 .release = seq_release,
1da177e4
LT
248};
249
250
251static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
252{
253 int cpu;
254
255 if (*pos == 0)
256 return SEQ_START_TOKEN;
257
0f23174a 258 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
259 if (!cpu_possible(cpu))
260 continue;
261 *pos = cpu+1;
2f970d83 262 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
263 }
264 return NULL;
265}
266
267static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
268{
269 int cpu;
270
0f23174a 271 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
272 if (!cpu_possible(cpu))
273 continue;
274 *pos = cpu+1;
2f970d83 275 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
276 }
277 return NULL;
e905a9ed 278
1da177e4
LT
279}
280
/* No teardown needed for the per-CPU statistics walk. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
285
286static int rt_cpu_seq_show(struct seq_file *seq, void *v)
287{
288 struct rt_cache_stat *st = v;
289
290 if (v == SEQ_START_TOKEN) {
5bec0039 291 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
1da177e4
LT
292 return 0;
293 }
e905a9ed 294
1da177e4
LT
295 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
296 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
fc66f95c 297 dst_entries_get_slow(&ipv4_dst_ops),
0baf2b35 298 0, /* st->in_hit */
1da177e4
LT
299 st->in_slow_tot,
300 st->in_slow_mc,
301 st->in_no_route,
302 st->in_brd,
303 st->in_martian_dst,
304 st->in_martian_src,
305
0baf2b35 306 0, /* st->out_hit */
1da177e4 307 st->out_slow_tot,
e905a9ed 308 st->out_slow_mc,
1da177e4 309
0baf2b35
ED
310 0, /* st->gc_total */
311 0, /* st->gc_ignored */
312 0, /* st->gc_goal_miss */
313 0, /* st->gc_dst_overflow */
314 0, /* st->in_hlist_search */
315 0 /* st->out_hlist_search */
1da177e4
LT
316 );
317 return 0;
318}
319
f690808e 320static const struct seq_operations rt_cpu_seq_ops = {
1da177e4
LT
321 .start = rt_cpu_seq_start,
322 .next = rt_cpu_seq_next,
323 .stop = rt_cpu_seq_stop,
324 .show = rt_cpu_seq_show,
325};
326
327
328static int rt_cpu_seq_open(struct inode *inode, struct file *file)
329{
330 return seq_open(file, &rt_cpu_seq_ops);
331}
332
9a32144e 333static const struct file_operations rt_cpu_seq_fops = {
1da177e4
LT
334 .owner = THIS_MODULE,
335 .open = rt_cpu_seq_open,
336 .read = seq_read,
337 .llseek = seq_lseek,
338 .release = seq_release,
339};
340
c7066f70 341#ifdef CONFIG_IP_ROUTE_CLASSID
a661c419 342static int rt_acct_proc_show(struct seq_file *m, void *v)
78c686e9 343{
a661c419
AD
344 struct ip_rt_acct *dst, *src;
345 unsigned int i, j;
346
347 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
348 if (!dst)
349 return -ENOMEM;
350
351 for_each_possible_cpu(i) {
352 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
353 for (j = 0; j < 256; j++) {
354 dst[j].o_bytes += src[j].o_bytes;
355 dst[j].o_packets += src[j].o_packets;
356 dst[j].i_bytes += src[j].i_bytes;
357 dst[j].i_packets += src[j].i_packets;
358 }
78c686e9
PE
359 }
360
a661c419
AD
361 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
362 kfree(dst);
363 return 0;
364}
78c686e9 365
a661c419
AD
366static int rt_acct_proc_open(struct inode *inode, struct file *file)
367{
368 return single_open(file, rt_acct_proc_show, NULL);
78c686e9 369}
a661c419
AD
370
371static const struct file_operations rt_acct_proc_fops = {
372 .owner = THIS_MODULE,
373 .open = rt_acct_proc_open,
374 .read = seq_read,
375 .llseek = seq_lseek,
376 .release = single_release,
377};
78c686e9 378#endif
107f1634 379
73b38711 380static int __net_init ip_rt_do_proc_init(struct net *net)
107f1634
PE
381{
382 struct proc_dir_entry *pde;
383
d4beaa66
G
384 pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
385 &rt_cache_seq_fops);
107f1634
PE
386 if (!pde)
387 goto err1;
388
77020720
WC
389 pde = proc_create("rt_cache", S_IRUGO,
390 net->proc_net_stat, &rt_cpu_seq_fops);
107f1634
PE
391 if (!pde)
392 goto err2;
393
c7066f70 394#ifdef CONFIG_IP_ROUTE_CLASSID
a661c419 395 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
107f1634
PE
396 if (!pde)
397 goto err3;
398#endif
399 return 0;
400
c7066f70 401#ifdef CONFIG_IP_ROUTE_CLASSID
107f1634
PE
402err3:
403 remove_proc_entry("rt_cache", net->proc_net_stat);
404#endif
405err2:
406 remove_proc_entry("rt_cache", net->proc_net);
407err1:
408 return -ENOMEM;
409}
73b38711
DL
410
411static void __net_exit ip_rt_do_proc_exit(struct net *net)
412{
413 remove_proc_entry("rt_cache", net->proc_net_stat);
414 remove_proc_entry("rt_cache", net->proc_net);
c7066f70 415#ifdef CONFIG_IP_ROUTE_CLASSID
73b38711 416 remove_proc_entry("rt_acct", net->proc_net);
0a931acf 417#endif
73b38711
DL
418}
419
420static struct pernet_operations ip_rt_proc_ops __net_initdata = {
421 .init = ip_rt_do_proc_init,
422 .exit = ip_rt_do_proc_exit,
423};
424
425static int __init ip_rt_proc_init(void)
426{
427 return register_pernet_subsys(&ip_rt_proc_ops);
428}
429
107f1634 430#else
/* CONFIG_PROC_FS disabled: nothing to register. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
1da177e4 435#endif /* CONFIG_PROC_FS */
e905a9ed 436
4331debc 437static inline bool rt_is_expired(const struct rtable *rth)
e84f84f2 438{
ca4c3fc2 439 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
e84f84f2
DL
440}
441
/* Invalidate all cached IPv4 routes in @net by bumping the generation
 * counter; stale entries are detected lazily via rt_is_expired().
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
446
f894cbf8
DM
447static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
448 struct sk_buff *skb,
449 const void *daddr)
3769cffb 450{
d3aaeb38
DM
451 struct net_device *dev = dst->dev;
452 const __be32 *pkey = daddr;
39232973 453 const struct rtable *rt;
3769cffb
DM
454 struct neighbour *n;
455
39232973 456 rt = (const struct rtable *) dst;
a263b309 457 if (rt->rt_gateway)
39232973 458 pkey = (const __be32 *) &rt->rt_gateway;
f894cbf8
DM
459 else if (skb)
460 pkey = &ip_hdr(skb)->daddr;
d3aaeb38 461
80703d26 462 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
d3aaeb38
DM
463 if (n)
464 return n;
32092ecf 465 return neigh_create(&arp_tbl, pkey, dev);
d3aaeb38
DM
466}
467
63fca65d
JA
468static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
469{
470 struct net_device *dev = dst->dev;
471 const __be32 *pkey = daddr;
472 const struct rtable *rt;
473
474 rt = (const struct rtable *)dst;
475 if (rt->rt_gateway)
476 pkey = (const __be32 *)&rt->rt_gateway;
477 else if (!daddr ||
478 (rt->rt_flags &
479 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
480 return;
481
482 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
483}
484
04ca6973 485#define IP_IDENTS_SZ 2048u
04ca6973 486
355b590c
ED
487static atomic_t *ip_idents __read_mostly;
488static u32 *ip_tstamps __read_mostly;
04ca6973
ED
489
490/* In order to protect privacy, we add a perturbation to identifiers
491 * if one generator is seldom used. This makes hard for an attacker
492 * to infer how many packets were sent between two points in time.
493 */
494u32 ip_idents_reserve(u32 hash, int segs)
495{
355b590c
ED
496 u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
497 atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
6aa7de05 498 u32 old = READ_ONCE(*p_tstamp);
04ca6973 499 u32 now = (u32)jiffies;
adb03115 500 u32 new, delta = 0;
04ca6973 501
355b590c 502 if (old != now && cmpxchg(p_tstamp, old, now) == old)
04ca6973
ED
503 delta = prandom_u32_max(now - old);
504
adb03115
ED
505 /* Do not use atomic_add_return() as it makes UBSAN unhappy */
506 do {
507 old = (u32)atomic_read(p_id);
508 new = old + delta + segs;
509 } while (atomic_cmpxchg(p_id, old, new) != old);
510
511 return new - segs;
04ca6973
ED
512}
513EXPORT_SYMBOL(ip_idents_reserve);
1da177e4 514
b6a7719a 515void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
1da177e4 516{
73f156a6
ED
517 static u32 ip_idents_hashrnd __read_mostly;
518 u32 hash, id;
1da177e4 519
73f156a6 520 net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
1da177e4 521
04ca6973
ED
522 hash = jhash_3words((__force u32)iph->daddr,
523 (__force u32)iph->saddr,
b6a7719a 524 iph->protocol ^ net_hash_mix(net),
04ca6973 525 ip_idents_hashrnd);
73f156a6
ED
526 id = ip_idents_reserve(hash, segs);
527 iph->id = htons(id);
1da177e4 528}
4bc2f18b 529EXPORT_SYMBOL(__ip_select_ident);
1da177e4 530
e2d118a1
LC
531static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
532 const struct sock *sk,
4895c771
DM
533 const struct iphdr *iph,
534 int oif, u8 tos,
535 u8 prot, u32 mark, int flow_flags)
536{
537 if (sk) {
538 const struct inet_sock *inet = inet_sk(sk);
539
540 oif = sk->sk_bound_dev_if;
541 mark = sk->sk_mark;
542 tos = RT_CONN_FLAGS(sk);
543 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
544 }
545 flowi4_init_output(fl4, oif, mark, tos,
546 RT_SCOPE_UNIVERSE, prot,
547 flow_flags,
e2d118a1
LC
548 iph->daddr, iph->saddr, 0, 0,
549 sock_net_uid(net, sk));
4895c771
DM
550}
551
5abf7f7e
ED
552static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
553 const struct sock *sk)
4895c771 554{
d109e61b 555 const struct net *net = dev_net(skb->dev);
4895c771
DM
556 const struct iphdr *iph = ip_hdr(skb);
557 int oif = skb->dev->ifindex;
558 u8 tos = RT_TOS(iph->tos);
559 u8 prot = iph->protocol;
560 u32 mark = skb->mark;
561
d109e61b 562 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
4895c771
DM
563}
564
5abf7f7e 565static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
4895c771
DM
566{
567 const struct inet_sock *inet = inet_sk(sk);
5abf7f7e 568 const struct ip_options_rcu *inet_opt;
4895c771
DM
569 __be32 daddr = inet->inet_daddr;
570
571 rcu_read_lock();
572 inet_opt = rcu_dereference(inet->inet_opt);
573 if (inet_opt && inet_opt->opt.srr)
574 daddr = inet_opt->opt.faddr;
575 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
576 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
577 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
578 inet_sk_flowi_flags(sk),
e2d118a1 579 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
4895c771
DM
580 rcu_read_unlock();
581}
582
5abf7f7e
ED
/* Build a flow key from the skb when one is available, otherwise from
 * the socket alone.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}
591
c5038a83 592static DEFINE_SPINLOCK(fnhe_lock);
4895c771 593
2ffae99d
TT
594static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
595{
596 struct rtable *rt;
597
598 rt = rcu_dereference(fnhe->fnhe_rth_input);
599 if (rt) {
600 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
95c47f9c 601 dst_dev_put(&rt->dst);
0830106c 602 dst_release(&rt->dst);
2ffae99d
TT
603 }
604 rt = rcu_dereference(fnhe->fnhe_rth_output);
605 if (rt) {
606 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
95c47f9c 607 dst_dev_put(&rt->dst);
0830106c 608 dst_release(&rt->dst);
2ffae99d
TT
609 }
610}
611
aee06da6 612static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
4895c771
DM
613{
614 struct fib_nh_exception *fnhe, *oldest;
615
616 oldest = rcu_dereference(hash->chain);
617 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
618 fnhe = rcu_dereference(fnhe->fnhe_next)) {
619 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
620 oldest = fnhe;
621 }
2ffae99d 622 fnhe_flush_routes(oldest);
4895c771
DM
623 return oldest;
624}
625
d3a25c98
DM
626static inline u32 fnhe_hashfun(__be32 daddr)
627{
d546c621 628 static u32 fnhe_hashrnd __read_mostly;
d3a25c98
DM
629 u32 hval;
630
d546c621
ED
631 net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
632 hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
633 return hash_32(hval, FNHE_HASH_SHIFT);
d3a25c98
DM
634}
635
387aa65a
TT
636static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
637{
638 rt->rt_pmtu = fnhe->fnhe_pmtu;
639 rt->dst.expires = fnhe->fnhe_expires;
640
641 if (fnhe->fnhe_gw) {
642 rt->rt_flags |= RTCF_REDIRECTED;
643 rt->rt_gateway = fnhe->fnhe_gw;
644 rt->rt_uses_gateway = 1;
645 }
646}
647
aee06da6
JA
648static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
649 u32 pmtu, unsigned long expires)
4895c771 650{
aee06da6 651 struct fnhe_hash_bucket *hash;
4895c771 652 struct fib_nh_exception *fnhe;
387aa65a 653 struct rtable *rt;
cebe84c6 654 u32 genid, hval;
387aa65a 655 unsigned int i;
4895c771 656 int depth;
cebe84c6
XL
657
658 genid = fnhe_genid(dev_net(nh->nh_dev));
659 hval = fnhe_hashfun(daddr);
aee06da6 660
c5038a83 661 spin_lock_bh(&fnhe_lock);
4895c771 662
caa41527 663 hash = rcu_dereference(nh->nh_exceptions);
4895c771 664 if (!hash) {
aee06da6 665 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
4895c771 666 if (!hash)
aee06da6 667 goto out_unlock;
caa41527 668 rcu_assign_pointer(nh->nh_exceptions, hash);
4895c771
DM
669 }
670
4895c771
DM
671 hash += hval;
672
673 depth = 0;
674 for (fnhe = rcu_dereference(hash->chain); fnhe;
675 fnhe = rcu_dereference(fnhe->fnhe_next)) {
676 if (fnhe->fnhe_daddr == daddr)
aee06da6 677 break;
4895c771
DM
678 depth++;
679 }
680
aee06da6 681 if (fnhe) {
cebe84c6
XL
682 if (fnhe->fnhe_genid != genid)
683 fnhe->fnhe_genid = genid;
aee06da6
JA
684 if (gw)
685 fnhe->fnhe_gw = gw;
e39d5246 686 if (pmtu)
aee06da6 687 fnhe->fnhe_pmtu = pmtu;
e39d5246 688 fnhe->fnhe_expires = max(1UL, expires);
387aa65a 689 /* Update all cached dsts too */
2ffae99d
TT
690 rt = rcu_dereference(fnhe->fnhe_rth_input);
691 if (rt)
692 fill_route_from_fnhe(rt, fnhe);
693 rt = rcu_dereference(fnhe->fnhe_rth_output);
387aa65a
TT
694 if (rt)
695 fill_route_from_fnhe(rt, fnhe);
aee06da6
JA
696 } else {
697 if (depth > FNHE_RECLAIM_DEPTH)
698 fnhe = fnhe_oldest(hash);
699 else {
700 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
701 if (!fnhe)
702 goto out_unlock;
703
704 fnhe->fnhe_next = hash->chain;
705 rcu_assign_pointer(hash->chain, fnhe);
706 }
cebe84c6 707 fnhe->fnhe_genid = genid;
aee06da6
JA
708 fnhe->fnhe_daddr = daddr;
709 fnhe->fnhe_gw = gw;
710 fnhe->fnhe_pmtu = pmtu;
711 fnhe->fnhe_expires = expires;
387aa65a
TT
712
713 /* Exception created; mark the cached routes for the nexthop
714 * stale, so anyone caching it rechecks if this exception
715 * applies to them.
716 */
2ffae99d
TT
717 rt = rcu_dereference(nh->nh_rth_input);
718 if (rt)
719 rt->dst.obsolete = DST_OBSOLETE_KILL;
720
387aa65a
TT
721 for_each_possible_cpu(i) {
722 struct rtable __rcu **prt;
723 prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
724 rt = rcu_dereference(*prt);
725 if (rt)
726 rt->dst.obsolete = DST_OBSOLETE_KILL;
727 }
4895c771 728 }
4895c771 729
4895c771 730 fnhe->fnhe_stamp = jiffies;
aee06da6
JA
731
732out_unlock:
c5038a83 733 spin_unlock_bh(&fnhe_lock);
4895c771
DM
734}
735
ceb33206
DM
736static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
737 bool kill_route)
1da177e4 738{
e47a185b 739 __be32 new_gw = icmp_hdr(skb)->un.gateway;
94206125 740 __be32 old_gw = ip_hdr(skb)->saddr;
e47a185b 741 struct net_device *dev = skb->dev;
e47a185b 742 struct in_device *in_dev;
4895c771 743 struct fib_result res;
e47a185b 744 struct neighbour *n;
317805b8 745 struct net *net;
1da177e4 746
94206125
DM
747 switch (icmp_hdr(skb)->code & 7) {
748 case ICMP_REDIR_NET:
749 case ICMP_REDIR_NETTOS:
750 case ICMP_REDIR_HOST:
751 case ICMP_REDIR_HOSTTOS:
752 break;
753
754 default:
755 return;
756 }
757
e47a185b
DM
758 if (rt->rt_gateway != old_gw)
759 return;
760
761 in_dev = __in_dev_get_rcu(dev);
762 if (!in_dev)
763 return;
764
c346dca1 765 net = dev_net(dev);
9d4fb27d
JP
766 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
767 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
768 ipv4_is_zeronet(new_gw))
1da177e4
LT
769 goto reject_redirect;
770
771 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
772 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
773 goto reject_redirect;
774 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
775 goto reject_redirect;
776 } else {
317805b8 777 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1da177e4
LT
778 goto reject_redirect;
779 }
780
969447f2
SSL
781 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
782 if (!n)
783 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
2c1a4311 784 if (!IS_ERR(n)) {
e47a185b
DM
785 if (!(n->nud_state & NUD_VALID)) {
786 neigh_event_send(n, NULL);
787 } else {
0eeb075f 788 if (fib_lookup(net, fl4, &res, 0) == 0) {
4895c771 789 struct fib_nh *nh = &FIB_RES_NH(res);
4895c771 790
aee06da6 791 update_or_create_fnhe(nh, fl4->daddr, new_gw,
deed49df 792 0, jiffies + ip_rt_gc_timeout);
4895c771 793 }
ceb33206
DM
794 if (kill_route)
795 rt->dst.obsolete = DST_OBSOLETE_KILL;
e47a185b
DM
796 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
797 }
798 neigh_release(n);
799 }
800 return;
801
802reject_redirect:
803#ifdef CONFIG_IP_ROUTE_VERBOSE
99ee038d
DM
804 if (IN_DEV_LOG_MARTIANS(in_dev)) {
805 const struct iphdr *iph = (const struct iphdr *) skb->data;
806 __be32 daddr = iph->daddr;
807 __be32 saddr = iph->saddr;
808
e47a185b
DM
809 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
810 " Advised path = %pI4 -> %pI4\n",
811 &old_gw, dev->name, &new_gw,
812 &saddr, &daddr);
99ee038d 813 }
e47a185b
DM
814#endif
815 ;
816}
817
4895c771
DM
818static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
819{
820 struct rtable *rt;
821 struct flowi4 fl4;
f96ef988 822 const struct iphdr *iph = (const struct iphdr *) skb->data;
7d995694 823 struct net *net = dev_net(skb->dev);
f96ef988
MK
824 int oif = skb->dev->ifindex;
825 u8 tos = RT_TOS(iph->tos);
826 u8 prot = iph->protocol;
827 u32 mark = skb->mark;
4895c771
DM
828
829 rt = (struct rtable *) dst;
830
7d995694 831 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
ceb33206 832 __ip_do_redirect(rt, skb, &fl4, true);
4895c771
DM
833}
834
1da177e4
LT
835static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
836{
ee6b9673 837 struct rtable *rt = (struct rtable *)dst;
1da177e4
LT
838 struct dst_entry *ret = dst;
839
840 if (rt) {
d11a4dc1 841 if (dst->obsolete > 0) {
1da177e4
LT
842 ip_rt_put(rt);
843 ret = NULL;
5943634f
DM
844 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
845 rt->dst.expires) {
89aef892 846 ip_rt_put(rt);
1da177e4
LT
847 ret = NULL;
848 }
849 }
850 return ret;
851}
852
853/*
854 * Algorithm:
855 * 1. The first ip_rt_redirect_number redirects are sent
856 * with exponential backoff, then we stop sending them at all,
857 * assuming that the host ignores our redirects.
858 * 2. If we did not see packets requiring redirects
859 * during ip_rt_redirect_silence, we assume that the host
860 * forgot redirected route and start to send redirects again.
861 *
862 * This algorithm is much cheaper and more intelligent than dumb load limiting
863 * in icmp.c.
864 *
865 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
866 * and "frag. need" (breaks PMTU discovery) in icmp.c.
867 */
868
869void ip_rt_send_redirect(struct sk_buff *skb)
870{
511c3f92 871 struct rtable *rt = skb_rtable(skb);
30038fc6 872 struct in_device *in_dev;
92d86829 873 struct inet_peer *peer;
1d861aa4 874 struct net *net;
30038fc6 875 int log_martians;
192132b9 876 int vif;
1da177e4 877
30038fc6 878 rcu_read_lock();
d8d1f30b 879 in_dev = __in_dev_get_rcu(rt->dst.dev);
30038fc6
ED
880 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
881 rcu_read_unlock();
1da177e4 882 return;
30038fc6
ED
883 }
884 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
385add90 885 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
30038fc6 886 rcu_read_unlock();
1da177e4 887
1d861aa4 888 net = dev_net(rt->dst.dev);
192132b9 889 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
92d86829 890 if (!peer) {
e81da0e1
JA
891 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
892 rt_nexthop(rt, ip_hdr(skb)->daddr));
92d86829
DM
893 return;
894 }
895
1da177e4
LT
896 /* No redirected packets during ip_rt_redirect_silence;
897 * reset the algorithm.
898 */
92d86829
DM
899 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
900 peer->rate_tokens = 0;
1da177e4
LT
901
902 /* Too many ignored redirects; do not send anything
d8d1f30b 903 * set dst.rate_last to the last seen redirected packet.
1da177e4 904 */
92d86829
DM
905 if (peer->rate_tokens >= ip_rt_redirect_number) {
906 peer->rate_last = jiffies;
1d861aa4 907 goto out_put_peer;
1da177e4
LT
908 }
909
910 /* Check for load limit; set rate_last to the latest sent
911 * redirect.
912 */
92d86829 913 if (peer->rate_tokens == 0 ||
14fb8a76 914 time_after(jiffies,
92d86829
DM
915 (peer->rate_last +
916 (ip_rt_redirect_load << peer->rate_tokens)))) {
e81da0e1
JA
917 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
918
919 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
92d86829
DM
920 peer->rate_last = jiffies;
921 ++peer->rate_tokens;
1da177e4 922#ifdef CONFIG_IP_ROUTE_VERBOSE
30038fc6 923 if (log_martians &&
e87cc472
JP
924 peer->rate_tokens == ip_rt_redirect_number)
925 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
92101b3b 926 &ip_hdr(skb)->saddr, inet_iif(skb),
e81da0e1 927 &ip_hdr(skb)->daddr, &gw);
1da177e4
LT
928#endif
929 }
1d861aa4
DM
930out_put_peer:
931 inet_putpeer(peer);
1da177e4
LT
932}
933
934static int ip_error(struct sk_buff *skb)
935{
251da413 936 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
511c3f92 937 struct rtable *rt = skb_rtable(skb);
92d86829 938 struct inet_peer *peer;
1da177e4 939 unsigned long now;
251da413 940 struct net *net;
92d86829 941 bool send;
1da177e4
LT
942 int code;
943
381c759d
EB
944 /* IP on this device is disabled. */
945 if (!in_dev)
946 goto out;
947
251da413
DM
948 net = dev_net(rt->dst.dev);
949 if (!IN_DEV_FORWARD(in_dev)) {
950 switch (rt->dst.error) {
951 case EHOSTUNREACH:
b45386ef 952 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
251da413
DM
953 break;
954
955 case ENETUNREACH:
b45386ef 956 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
251da413
DM
957 break;
958 }
959 goto out;
960 }
961
d8d1f30b 962 switch (rt->dst.error) {
4500ebf8
JP
963 case EINVAL:
964 default:
965 goto out;
966 case EHOSTUNREACH:
967 code = ICMP_HOST_UNREACH;
968 break;
969 case ENETUNREACH:
970 code = ICMP_NET_UNREACH;
b45386ef 971 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
4500ebf8
JP
972 break;
973 case EACCES:
974 code = ICMP_PKT_FILTERED;
975 break;
1da177e4
LT
976 }
977
192132b9 978 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
385add90 979 l3mdev_master_ifindex(skb->dev), 1);
92d86829
DM
980
981 send = true;
982 if (peer) {
983 now = jiffies;
984 peer->rate_tokens += now - peer->rate_last;
985 if (peer->rate_tokens > ip_rt_error_burst)
986 peer->rate_tokens = ip_rt_error_burst;
987 peer->rate_last = now;
988 if (peer->rate_tokens >= ip_rt_error_cost)
989 peer->rate_tokens -= ip_rt_error_cost;
990 else
991 send = false;
1d861aa4 992 inet_putpeer(peer);
1da177e4 993 }
92d86829
DM
994 if (send)
995 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1da177e4
LT
996
997out: kfree_skb(skb);
998 return 0;
e905a9ed 999}
1da177e4 1000
d851c12b 1001static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1da177e4 1002{
d851c12b 1003 struct dst_entry *dst = &rt->dst;
4895c771 1004 struct fib_result res;
2c8cec5c 1005
fa1e492a
SK
1006 if (dst_metric_locked(dst, RTAX_MTU))
1007 return;
1008
cb6ccf09 1009 if (ipv4_mtu(dst) < mtu)
3cdaa5be
LW
1010 return;
1011
5943634f
DM
1012 if (mtu < ip_rt_min_pmtu)
1013 mtu = ip_rt_min_pmtu;
2c8cec5c 1014
f016229e
TT
1015 if (rt->rt_pmtu == mtu &&
1016 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1017 return;
1018
c5ae7d41 1019 rcu_read_lock();
0eeb075f 1020 if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
4895c771 1021 struct fib_nh *nh = &FIB_RES_NH(res);
4895c771 1022
aee06da6
JA
1023 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
1024 jiffies + ip_rt_mtu_expires);
4895c771 1025 }
c5ae7d41 1026 rcu_read_unlock();
1da177e4
LT
1027}
1028
4895c771
DM
1029static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1030 struct sk_buff *skb, u32 mtu)
1031{
1032 struct rtable *rt = (struct rtable *) dst;
1033 struct flowi4 fl4;
1034
1035 ip_rt_build_flow_key(&fl4, sk, skb);
d851c12b 1036 __ip_rt_update_pmtu(rt, &fl4, mtu);
4895c771
DM
1037}
1038
36393395
DM
1039void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1040 int oif, u32 mark, u8 protocol, int flow_flags)
1041{
4895c771 1042 const struct iphdr *iph = (const struct iphdr *) skb->data;
36393395
DM
1043 struct flowi4 fl4;
1044 struct rtable *rt;
1045
1b3c61dc
LC
1046 if (!mark)
1047 mark = IP4_REPLY_MARK(net, skb->mark);
1048
e2d118a1 1049 __build_flow_key(net, &fl4, NULL, iph, oif,
4895c771 1050 RT_TOS(iph->tos), protocol, mark, flow_flags);
36393395
DM
1051 rt = __ip_route_output_key(net, &fl4);
1052 if (!IS_ERR(rt)) {
4895c771 1053 __ip_rt_update_pmtu(rt, &fl4, mtu);
36393395
DM
1054 ip_rt_put(rt);
1055 }
1056}
1057EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1058
9cb3a50c 1059static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
36393395 1060{
4895c771
DM
1061 const struct iphdr *iph = (const struct iphdr *) skb->data;
1062 struct flowi4 fl4;
1063 struct rtable *rt;
36393395 1064
e2d118a1 1065 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1b3c61dc
LC
1066
1067 if (!fl4.flowi4_mark)
1068 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1069
4895c771
DM
1070 rt = __ip_route_output_key(sock_net(sk), &fl4);
1071 if (!IS_ERR(rt)) {
1072 __ip_rt_update_pmtu(rt, &fl4, mtu);
1073 ip_rt_put(rt);
1074 }
36393395 1075}
9cb3a50c
SK
1076
1077void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1078{
1079 const struct iphdr *iph = (const struct iphdr *) skb->data;
1080 struct flowi4 fl4;
1081 struct rtable *rt;
7f502361 1082 struct dst_entry *odst = NULL;
b44108db 1083 bool new = false;
e2d118a1 1084 struct net *net = sock_net(sk);
9cb3a50c
SK
1085
1086 bh_lock_sock(sk);
482fc609
HFS
1087
1088 if (!ip_sk_accept_pmtu(sk))
1089 goto out;
1090
7f502361 1091 odst = sk_dst_get(sk);
9cb3a50c 1092
7f502361 1093 if (sock_owned_by_user(sk) || !odst) {
9cb3a50c
SK
1094 __ipv4_sk_update_pmtu(skb, sk, mtu);
1095 goto out;
1096 }
1097
e2d118a1 1098 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
9cb3a50c 1099
7f502361 1100 rt = (struct rtable *)odst;
51456b29 1101 if (odst->obsolete && !odst->ops->check(odst, 0)) {
9cb3a50c
SK
1102 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1103 if (IS_ERR(rt))
1104 goto out;
b44108db
SK
1105
1106 new = true;
9cb3a50c
SK
1107 }
1108
0f6c480f 1109 __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
9cb3a50c 1110
7f502361 1111 if (!dst_check(&rt->dst, 0)) {
b44108db
SK
1112 if (new)
1113 dst_release(&rt->dst);
1114
9cb3a50c
SK
1115 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1116 if (IS_ERR(rt))
1117 goto out;
1118
b44108db 1119 new = true;
9cb3a50c
SK
1120 }
1121
b44108db 1122 if (new)
7f502361 1123 sk_dst_set(sk, &rt->dst);
9cb3a50c
SK
1124
1125out:
1126 bh_unlock_sock(sk);
7f502361 1127 dst_release(odst);
9cb3a50c 1128}
36393395 1129EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
f39925db 1130
b42597e2
DM
1131void ipv4_redirect(struct sk_buff *skb, struct net *net,
1132 int oif, u32 mark, u8 protocol, int flow_flags)
1133{
4895c771 1134 const struct iphdr *iph = (const struct iphdr *) skb->data;
b42597e2
DM
1135 struct flowi4 fl4;
1136 struct rtable *rt;
1137
e2d118a1 1138 __build_flow_key(net, &fl4, NULL, iph, oif,
4895c771 1139 RT_TOS(iph->tos), protocol, mark, flow_flags);
b42597e2
DM
1140 rt = __ip_route_output_key(net, &fl4);
1141 if (!IS_ERR(rt)) {
ceb33206 1142 __ip_do_redirect(rt, skb, &fl4, false);
b42597e2
DM
1143 ip_rt_put(rt);
1144 }
1145}
1146EXPORT_SYMBOL_GPL(ipv4_redirect);
1147
1148void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1149{
4895c771
DM
1150 const struct iphdr *iph = (const struct iphdr *) skb->data;
1151 struct flowi4 fl4;
1152 struct rtable *rt;
e2d118a1 1153 struct net *net = sock_net(sk);
b42597e2 1154
e2d118a1
LC
1155 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1156 rt = __ip_route_output_key(net, &fl4);
4895c771 1157 if (!IS_ERR(rt)) {
ceb33206 1158 __ip_do_redirect(rt, skb, &fl4, false);
4895c771
DM
1159 ip_rt_put(rt);
1160 }
b42597e2
DM
1161}
1162EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1163
efbc368d
DM
1164static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1165{
1166 struct rtable *rt = (struct rtable *) dst;
1167
ceb33206
DM
1168 /* All IPV4 dsts are created with ->obsolete set to the value
1169 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1170 * into this function always.
1171 *
387aa65a
TT
1172 * When a PMTU/redirect information update invalidates a route,
1173 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1174 * DST_OBSOLETE_DEAD by dst_free().
ceb33206 1175 */
387aa65a 1176 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
efbc368d 1177 return NULL;
d11a4dc1 1178 return dst;
1da177e4
LT
1179}
1180
1da177e4
LT
1181static void ipv4_link_failure(struct sk_buff *skb)
1182{
1183 struct rtable *rt;
1184
1185 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1186
511c3f92 1187 rt = skb_rtable(skb);
5943634f
DM
1188 if (rt)
1189 dst_set_expires(&rt->dst, 0);
1da177e4
LT
1190}
1191
ede2059d 1192static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1da177e4 1193{
91df42be
JP
1194 pr_debug("%s: %pI4 -> %pI4, %s\n",
1195 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1196 skb->dev ? skb->dev->name : "?");
1da177e4 1197 kfree_skb(skb);
c378a9c0 1198 WARN_ON(1);
1da177e4
LT
1199 return 0;
1200}
1201
1202/*
1203 We do not cache source address of outgoing interface,
1204 because it is used only by IP RR, TS and SRR options,
1205 so that it out of fast path.
1206
1207 BTW remember: "addr" is allowed to be not aligned
1208 in IP options!
1209 */
1210
8e36360a 1211void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1da177e4 1212{
a61ced5d 1213 __be32 src;
1da177e4 1214
c7537967 1215 if (rt_is_output_route(rt))
c5be24ff 1216 src = ip_hdr(skb)->saddr;
ebc0ffae 1217 else {
8e36360a
DM
1218 struct fib_result res;
1219 struct flowi4 fl4;
1220 struct iphdr *iph;
1221
1222 iph = ip_hdr(skb);
1223
1224 memset(&fl4, 0, sizeof(fl4));
1225 fl4.daddr = iph->daddr;
1226 fl4.saddr = iph->saddr;
b0fe4a31 1227 fl4.flowi4_tos = RT_TOS(iph->tos);
8e36360a
DM
1228 fl4.flowi4_oif = rt->dst.dev->ifindex;
1229 fl4.flowi4_iif = skb->dev->ifindex;
1230 fl4.flowi4_mark = skb->mark;
5e2b61f7 1231
ebc0ffae 1232 rcu_read_lock();
0eeb075f 1233 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
436c3b66 1234 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
ebc0ffae 1235 else
f8126f1d
DM
1236 src = inet_select_addr(rt->dst.dev,
1237 rt_nexthop(rt, iph->daddr),
1238 RT_SCOPE_UNIVERSE);
ebc0ffae
ED
1239 rcu_read_unlock();
1240 }
1da177e4
LT
1241 memcpy(addr, &src, 4);
1242}
1243
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Merge a classid tag into the route: the low and high 16-bit halves
 * are each set only if not already populated.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1253
0dbaee3b
DM
1254static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1255{
7ed14d97 1256 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
164a5e7a 1257 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
7ed14d97 1258 ip_rt_min_advmss);
0dbaee3b 1259
7ed14d97 1260 return min(advmss, IPV4_MAX_PMTU - header_size);
0dbaee3b
DM
1261}
1262
ebb762f2 1263static unsigned int ipv4_mtu(const struct dst_entry *dst)
d33e4553 1264{
261663b0 1265 const struct rtable *rt = (const struct rtable *) dst;
5943634f
DM
1266 unsigned int mtu = rt->rt_pmtu;
1267
98d75c37 1268 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
5943634f 1269 mtu = dst_metric_raw(dst, RTAX_MTU);
618f9bc7 1270
38d523e2 1271 if (mtu)
618f9bc7
SK
1272 return mtu;
1273
c780a049 1274 mtu = READ_ONCE(dst->dev->mtu);
d33e4553
DM
1275
1276 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
155e8336 1277 if (rt->rt_uses_gateway && mtu > 576)
d33e4553
DM
1278 mtu = 576;
1279 }
1280
14972cbd
RP
1281 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1282
1283 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
d33e4553
DM
1284}
1285
f2bb4bed 1286static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
4895c771 1287{
caa41527 1288 struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
4895c771
DM
1289 struct fib_nh_exception *fnhe;
1290 u32 hval;
1291
f2bb4bed
DM
1292 if (!hash)
1293 return NULL;
1294
d3a25c98 1295 hval = fnhe_hashfun(daddr);
4895c771
DM
1296
1297 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1298 fnhe = rcu_dereference(fnhe->fnhe_next)) {
f2bb4bed
DM
1299 if (fnhe->fnhe_daddr == daddr)
1300 return fnhe;
1301 }
1302 return NULL;
1303}
aee06da6 1304
caacf05e 1305static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
a4c2fd7f 1306 __be32 daddr, const bool do_cache)
f2bb4bed 1307{
caacf05e
DM
1308 bool ret = false;
1309
c5038a83 1310 spin_lock_bh(&fnhe_lock);
f2bb4bed 1311
c5038a83 1312 if (daddr == fnhe->fnhe_daddr) {
2ffae99d
TT
1313 struct rtable __rcu **porig;
1314 struct rtable *orig;
5aad1de5 1315 int genid = fnhe_genid(dev_net(rt->dst.dev));
2ffae99d
TT
1316
1317 if (rt_is_input_route(rt))
1318 porig = &fnhe->fnhe_rth_input;
1319 else
1320 porig = &fnhe->fnhe_rth_output;
1321 orig = rcu_dereference(*porig);
5aad1de5
TT
1322
1323 if (fnhe->fnhe_genid != genid) {
1324 fnhe->fnhe_genid = genid;
13d82bf5
SK
1325 fnhe->fnhe_gw = 0;
1326 fnhe->fnhe_pmtu = 0;
1327 fnhe->fnhe_expires = 0;
2ffae99d
TT
1328 fnhe_flush_routes(fnhe);
1329 orig = NULL;
13d82bf5 1330 }
387aa65a
TT
1331 fill_route_from_fnhe(rt, fnhe);
1332 if (!rt->rt_gateway)
155e8336 1333 rt->rt_gateway = daddr;
f2bb4bed 1334
a4c2fd7f 1335 if (do_cache) {
0830106c 1336 dst_hold(&rt->dst);
2ffae99d 1337 rcu_assign_pointer(*porig, rt);
0830106c 1338 if (orig) {
95c47f9c 1339 dst_dev_put(&orig->dst);
0830106c 1340 dst_release(&orig->dst);
0830106c 1341 }
2ffae99d
TT
1342 ret = true;
1343 }
c5038a83
DM
1344
1345 fnhe->fnhe_stamp = jiffies;
c5038a83
DM
1346 }
1347 spin_unlock_bh(&fnhe_lock);
caacf05e
DM
1348
1349 return ret;
54764bb6
ED
1350}
1351
caacf05e 1352static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
f2bb4bed 1353{
d26b3a7c 1354 struct rtable *orig, *prev, **p;
caacf05e 1355 bool ret = true;
f2bb4bed 1356
d26b3a7c 1357 if (rt_is_input_route(rt)) {
54764bb6 1358 p = (struct rtable **)&nh->nh_rth_input;
d26b3a7c 1359 } else {
903ceff7 1360 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
d26b3a7c 1361 }
f2bb4bed
DM
1362 orig = *p;
1363
0830106c
WW
1364 /* hold dst before doing cmpxchg() to avoid race condition
1365 * on this dst
1366 */
1367 dst_hold(&rt->dst);
f2bb4bed
DM
1368 prev = cmpxchg(p, orig, rt);
1369 if (prev == orig) {
0830106c 1370 if (orig) {
95c47f9c 1371 dst_dev_put(&orig->dst);
0830106c 1372 dst_release(&orig->dst);
0830106c
WW
1373 }
1374 } else {
1375 dst_release(&rt->dst);
caacf05e 1376 ret = false;
0830106c 1377 }
caacf05e
DM
1378
1379 return ret;
1380}
1381
5055c371
ED
1382struct uncached_list {
1383 spinlock_t lock;
1384 struct list_head head;
1385};
1386
1387static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
caacf05e
DM
1388
1389static void rt_add_uncached_list(struct rtable *rt)
1390{
5055c371
ED
1391 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1392
1393 rt->rt_uncached_list = ul;
1394
1395 spin_lock_bh(&ul->lock);
1396 list_add_tail(&rt->rt_uncached, &ul->head);
1397 spin_unlock_bh(&ul->lock);
caacf05e
DM
1398}
1399
1400static void ipv4_dst_destroy(struct dst_entry *dst)
1401{
3fb07daf 1402 struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
caacf05e
DM
1403 struct rtable *rt = (struct rtable *) dst;
1404
9620fef2 1405 if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
3fb07daf
ED
1406 kfree(p);
1407
78df76a0 1408 if (!list_empty(&rt->rt_uncached)) {
5055c371
ED
1409 struct uncached_list *ul = rt->rt_uncached_list;
1410
1411 spin_lock_bh(&ul->lock);
caacf05e 1412 list_del(&rt->rt_uncached);
5055c371 1413 spin_unlock_bh(&ul->lock);
caacf05e
DM
1414 }
1415}
1416
1417void rt_flush_dev(struct net_device *dev)
1418{
5055c371
ED
1419 struct net *net = dev_net(dev);
1420 struct rtable *rt;
1421 int cpu;
1422
1423 for_each_possible_cpu(cpu) {
1424 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
caacf05e 1425
5055c371
ED
1426 spin_lock_bh(&ul->lock);
1427 list_for_each_entry(rt, &ul->head, rt_uncached) {
caacf05e
DM
1428 if (rt->dst.dev != dev)
1429 continue;
1430 rt->dst.dev = net->loopback_dev;
1431 dev_hold(rt->dst.dev);
1432 dev_put(dev);
1433 }
5055c371 1434 spin_unlock_bh(&ul->lock);
4895c771
DM
1435 }
1436}
1437
4331debc 1438static bool rt_cache_valid(const struct rtable *rt)
d2d68ba9 1439{
4331debc
ED
1440 return rt &&
1441 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1442 !rt_is_expired(rt);
d2d68ba9
DM
1443}
1444
f2bb4bed 1445static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
5e2b61f7 1446 const struct fib_result *res,
f2bb4bed 1447 struct fib_nh_exception *fnhe,
a4c2fd7f
WW
1448 struct fib_info *fi, u16 type, u32 itag,
1449 const bool do_cache)
1da177e4 1450{
caacf05e
DM
1451 bool cached = false;
1452
1da177e4 1453 if (fi) {
4895c771
DM
1454 struct fib_nh *nh = &FIB_RES_NH(*res);
1455
155e8336 1456 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
4895c771 1457 rt->rt_gateway = nh->nh_gw;
155e8336
JA
1458 rt->rt_uses_gateway = 1;
1459 }
3fb07daf
ED
1460 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1461 if (fi->fib_metrics != &dst_default_metrics) {
1462 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
9620fef2 1463 refcount_inc(&fi->fib_metrics->refcnt);
3fb07daf 1464 }
c7066f70 1465#ifdef CONFIG_IP_ROUTE_CLASSID
f2bb4bed 1466 rt->dst.tclassid = nh->nh_tclassid;
1da177e4 1467#endif
61adedf3 1468 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
c5038a83 1469 if (unlikely(fnhe))
a4c2fd7f
WW
1470 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1471 else if (do_cache)
caacf05e 1472 cached = rt_cache_route(nh, rt);
155e8336
JA
1473 if (unlikely(!cached)) {
1474 /* Routes we intend to cache in nexthop exception or
1475 * FIB nexthop have the DST_NOCACHE bit clear.
1476 * However, if we are unsuccessful at storing this
1477 * route into the cache we really need to set it.
1478 */
155e8336
JA
1479 if (!rt->rt_gateway)
1480 rt->rt_gateway = daddr;
1481 rt_add_uncached_list(rt);
1482 }
1483 } else
caacf05e 1484 rt_add_uncached_list(rt);
defb3519 1485
c7066f70 1486#ifdef CONFIG_IP_ROUTE_CLASSID
1da177e4 1487#ifdef CONFIG_IP_MULTIPLE_TABLES
85b91b03 1488 set_class_tag(rt, res->tclassid);
1da177e4
LT
1489#endif
1490 set_class_tag(rt, itag);
1491#endif
1da177e4
LT
1492}
1493
9ab179d8
DA
1494struct rtable *rt_dst_alloc(struct net_device *dev,
1495 unsigned int flags, u16 type,
1496 bool nopolicy, bool noxfrm, bool will_cache)
0c4dcd58 1497{
d08c4f35
DA
1498 struct rtable *rt;
1499
1500 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
a4c2fd7f 1501 (will_cache ? 0 : DST_HOST) |
d08c4f35 1502 (nopolicy ? DST_NOPOLICY : 0) |
b2a9c0ed 1503 (noxfrm ? DST_NOXFRM : 0));
d08c4f35
DA
1504
1505 if (rt) {
1506 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1507 rt->rt_flags = flags;
1508 rt->rt_type = type;
1509 rt->rt_is_input = 0;
1510 rt->rt_iif = 0;
1511 rt->rt_pmtu = 0;
1512 rt->rt_gateway = 0;
1513 rt->rt_uses_gateway = 0;
b7503e0c 1514 rt->rt_table_id = 0;
d08c4f35
DA
1515 INIT_LIST_HEAD(&rt->rt_uncached);
1516
1517 rt->dst.output = ip_output;
1518 if (flags & RTCF_LOCAL)
1519 rt->dst.input = ip_local_deliver;
1520 }
1521
1522 return rt;
0c4dcd58 1523}
9ab179d8 1524EXPORT_SYMBOL(rt_dst_alloc);
0c4dcd58 1525
96d36220 1526/* called in rcu_read_lock() section */
bc044e8d
PA
1527int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1528 u8 tos, struct net_device *dev,
1529 struct in_device *in_dev, u32 *itag)
1da177e4 1530{
b5f7e755 1531 int err;
1da177e4
LT
1532
1533 /* Primary sanity checks. */
51456b29 1534 if (!in_dev)
1da177e4
LT
1535 return -EINVAL;
1536
1e637c74 1537 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
d0daebc3 1538 skb->protocol != htons(ETH_P_IP))
bc044e8d 1539 return -EINVAL;
1da177e4 1540
75fea73d 1541 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
bc044e8d 1542 return -EINVAL;
d0daebc3 1543
f97c1e0c
JP
1544 if (ipv4_is_zeronet(saddr)) {
1545 if (!ipv4_is_local_multicast(daddr))
bc044e8d 1546 return -EINVAL;
b5f7e755 1547 } else {
9e56e380 1548 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
bc044e8d 1549 in_dev, itag);
b5f7e755 1550 if (err < 0)
bc044e8d 1551 return err;
b5f7e755 1552 }
bc044e8d
PA
1553 return 0;
1554}
1555
1556/* called in rcu_read_lock() section */
1557static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1558 u8 tos, struct net_device *dev, int our)
1559{
1560 struct in_device *in_dev = __in_dev_get_rcu(dev);
1561 unsigned int flags = RTCF_MULTICAST;
1562 struct rtable *rth;
1563 u32 itag = 0;
1564 int err;
1565
1566 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1567 if (err)
1568 return err;
1569
d08c4f35
DA
1570 if (our)
1571 flags |= RTCF_LOCAL;
1572
1573 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
f2bb4bed 1574 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1da177e4 1575 if (!rth)
bc044e8d 1576 return -ENOBUFS;
1da177e4 1577
cf911662
DM
1578#ifdef CONFIG_IP_ROUTE_CLASSID
1579 rth->dst.tclassid = itag;
1580#endif
d8d1f30b 1581 rth->dst.output = ip_rt_bug;
9917e1e8 1582 rth->rt_is_input= 1;
1da177e4
LT
1583
1584#ifdef CONFIG_IP_MROUTE
f97c1e0c 1585 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
d8d1f30b 1586 rth->dst.input = ip_mr_input;
1da177e4
LT
1587#endif
1588 RT_CACHE_STAT_INC(in_slow_mc);
1589
89aef892
DM
1590 skb_dst_set(skb, &rth->dst);
1591 return 0;
1da177e4
LT
1592}
1593
1594
1595static void ip_handle_martian_source(struct net_device *dev,
1596 struct in_device *in_dev,
1597 struct sk_buff *skb,
9e12bb22
AV
1598 __be32 daddr,
1599 __be32 saddr)
1da177e4
LT
1600{
1601 RT_CACHE_STAT_INC(in_martian_src);
1602#ifdef CONFIG_IP_ROUTE_VERBOSE
1603 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1604 /*
1605 * RFC1812 recommendation, if source is martian,
1606 * the only hint is MAC header.
1607 */
058bd4d2 1608 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
673d57e7 1609 &daddr, &saddr, dev->name);
98e399f8 1610 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
058bd4d2
JP
1611 print_hex_dump(KERN_WARNING, "ll header: ",
1612 DUMP_PREFIX_OFFSET, 16, 1,
1613 skb_mac_header(skb),
1614 dev->hard_header_len, true);
1da177e4
LT
1615 }
1616 }
1617#endif
1618}
1619
deed49df
XL
1620static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1621{
1622 struct fnhe_hash_bucket *hash;
1623 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1624 u32 hval = fnhe_hashfun(daddr);
1625
1626 spin_lock_bh(&fnhe_lock);
1627
1628 hash = rcu_dereference_protected(nh->nh_exceptions,
1629 lockdep_is_held(&fnhe_lock));
1630 hash += hval;
1631
1632 fnhe_p = &hash->chain;
1633 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1634 while (fnhe) {
1635 if (fnhe->fnhe_daddr == daddr) {
1636 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1637 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1638 fnhe_flush_routes(fnhe);
1639 kfree_rcu(fnhe, rcu);
1640 break;
1641 }
1642 fnhe_p = &fnhe->fnhe_next;
1643 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1644 lockdep_is_held(&fnhe_lock));
1645 }
1646
1647 spin_unlock_bh(&fnhe_lock);
1648}
1649
efd85700
TG
1650static void set_lwt_redirect(struct rtable *rth)
1651{
1652 if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1653 rth->dst.lwtstate->orig_output = rth->dst.output;
1654 rth->dst.output = lwtunnel_output;
1655 }
1656
1657 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1658 rth->dst.lwtstate->orig_input = rth->dst.input;
1659 rth->dst.input = lwtunnel_input;
1660 }
1661}
1662
47360228 1663/* called in rcu_read_lock() section */
5969f71d 1664static int __mkroute_input(struct sk_buff *skb,
982721f3 1665 const struct fib_result *res,
5969f71d 1666 struct in_device *in_dev,
c6cffba4 1667 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1668{
2ffae99d 1669 struct fib_nh_exception *fnhe;
1da177e4
LT
1670 struct rtable *rth;
1671 int err;
1672 struct in_device *out_dev;
d2d68ba9 1673 bool do_cache;
fbdc0ad0 1674 u32 itag = 0;
1da177e4
LT
1675
1676 /* get a working reference to the output device */
47360228 1677 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
51456b29 1678 if (!out_dev) {
e87cc472 1679 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1da177e4
LT
1680 return -EINVAL;
1681 }
1682
5c04c819 1683 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
9e56e380 1684 in_dev->dev, in_dev, &itag);
1da177e4 1685 if (err < 0) {
e905a9ed 1686 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1da177e4 1687 saddr);
e905a9ed 1688
1da177e4
LT
1689 goto cleanup;
1690 }
1691
e81da0e1
JA
1692 do_cache = res->fi && !itag;
1693 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
df4d9254 1694 skb->protocol == htons(ETH_P_IP) &&
1da177e4 1695 (IN_DEV_SHARED_MEDIA(out_dev) ||
df4d9254
HFS
1696 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1697 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1da177e4
LT
1698
1699 if (skb->protocol != htons(ETH_P_IP)) {
1700 /* Not IP (i.e. ARP). Do not create route, if it is
1701 * invalid for proxy arp. DNAT routes are always valid.
65324144
JDB
1702 *
1703 * Proxy arp feature have been extended to allow, ARP
1704 * replies back to the same interface, to support
1705 * Private VLAN switch technologies. See arp.c.
1da177e4 1706 */
65324144
JDB
1707 if (out_dev == in_dev &&
1708 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1da177e4
LT
1709 err = -EINVAL;
1710 goto cleanup;
1711 }
1712 }
1713
2ffae99d 1714 fnhe = find_exception(&FIB_RES_NH(*res), daddr);
e81da0e1 1715 if (do_cache) {
deed49df 1716 if (fnhe) {
2ffae99d 1717 rth = rcu_dereference(fnhe->fnhe_rth_input);
deed49df
XL
1718 if (rth && rth->dst.expires &&
1719 time_after(jiffies, rth->dst.expires)) {
1720 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1721 fnhe = NULL;
1722 } else {
1723 goto rt_cache;
1724 }
1725 }
1726
1727 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2ffae99d 1728
deed49df 1729rt_cache:
e81da0e1
JA
1730 if (rt_cache_valid(rth)) {
1731 skb_dst_set_noref(skb, &rth->dst);
1732 goto out;
d2d68ba9
DM
1733 }
1734 }
f2bb4bed 1735
d08c4f35 1736 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
5c1e6aa3 1737 IN_DEV_CONF_GET(in_dev, NOPOLICY),
d2d68ba9 1738 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1da177e4
LT
1739 if (!rth) {
1740 err = -ENOBUFS;
1741 goto cleanup;
1742 }
1743
9917e1e8 1744 rth->rt_is_input = 1;
b7503e0c
DA
1745 if (res->table)
1746 rth->rt_table_id = res->table->tb_id;
a6254864 1747 RT_CACHE_STAT_INC(in_slow_tot);
1da177e4 1748
d8d1f30b 1749 rth->dst.input = ip_forward;
1da177e4 1750
a4c2fd7f
WW
1751 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1752 do_cache);
efd85700 1753 set_lwt_redirect(rth);
c6cffba4 1754 skb_dst_set(skb, &rth->dst);
d2d68ba9 1755out:
1da177e4
LT
1756 err = 0;
1757 cleanup:
1da177e4 1758 return err;
e905a9ed 1759}
1da177e4 1760
79a13159 1761#ifdef CONFIG_IP_ROUTE_MULTIPATH
79a13159 1762/* To make ICMP packets follow the right flow, the multipath hash is
bf4e0a3d 1763 * calculated from the inner IP addresses.
79a13159 1764 */
bf4e0a3d
NA
1765static void ip_multipath_l3_keys(const struct sk_buff *skb,
1766 struct flow_keys *hash_keys)
79a13159
PN
1767{
1768 const struct iphdr *outer_iph = ip_hdr(skb);
bf4e0a3d 1769 const struct iphdr *inner_iph;
79a13159
PN
1770 const struct icmphdr *icmph;
1771 struct iphdr _inner_iph;
bf4e0a3d
NA
1772 struct icmphdr _icmph;
1773
1774 hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1775 hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1776 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1777 return;
79a13159
PN
1778
1779 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
bf4e0a3d 1780 return;
79a13159
PN
1781
1782 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1783 &_icmph);
1784 if (!icmph)
bf4e0a3d 1785 return;
79a13159
PN
1786
1787 if (icmph->type != ICMP_DEST_UNREACH &&
1788 icmph->type != ICMP_REDIRECT &&
1789 icmph->type != ICMP_TIME_EXCEEDED &&
bf4e0a3d
NA
1790 icmph->type != ICMP_PARAMETERPROB)
1791 return;
79a13159
PN
1792
1793 inner_iph = skb_header_pointer(skb,
1794 outer_iph->ihl * 4 + sizeof(_icmph),
1795 sizeof(_inner_iph), &_inner_iph);
1796 if (!inner_iph)
bf4e0a3d
NA
1797 return;
1798 hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1799 hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1800}
79a13159 1801
bf4e0a3d
NA
1802/* if skb is set it will be used and fl4 can be NULL */
1803int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1804 const struct sk_buff *skb)
1805{
1806 struct net *net = fi->fib_net;
1807 struct flow_keys hash_keys;
1808 u32 mhash;
79a13159 1809
bf4e0a3d
NA
1810 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1811 case 0:
1812 memset(&hash_keys, 0, sizeof(hash_keys));
1813 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1814 if (skb) {
1815 ip_multipath_l3_keys(skb, &hash_keys);
1816 } else {
1817 hash_keys.addrs.v4addrs.src = fl4->saddr;
1818 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1819 }
1820 break;
1821 case 1:
1822 /* skb is currently provided only when forwarding */
1823 if (skb) {
1824 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1825 struct flow_keys keys;
1826
1827 /* short-circuit if we already have L4 hash present */
1828 if (skb->l4_hash)
1829 return skb_get_hash_raw(skb) >> 1;
1830 memset(&hash_keys, 0, sizeof(hash_keys));
1831 skb_flow_dissect_flow_keys(skb, &keys, flag);
1832 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1833 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1834 hash_keys.ports.src = keys.ports.src;
1835 hash_keys.ports.dst = keys.ports.dst;
1836 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1837 } else {
1838 memset(&hash_keys, 0, sizeof(hash_keys));
1839 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1840 hash_keys.addrs.v4addrs.src = fl4->saddr;
1841 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1842 hash_keys.ports.src = fl4->fl4_sport;
1843 hash_keys.ports.dst = fl4->fl4_dport;
1844 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1845 }
1846 break;
1847 }
1848 mhash = flow_hash_from_keys(&hash_keys);
79a13159 1849
bf4e0a3d
NA
1850 return mhash >> 1;
1851}
1852EXPORT_SYMBOL_GPL(fib_multipath_hash);
79a13159
PN
1853#endif /* CONFIG_IP_ROUTE_MULTIPATH */
1854
5969f71d
SH
1855static int ip_mkroute_input(struct sk_buff *skb,
1856 struct fib_result *res,
5969f71d
SH
1857 struct in_device *in_dev,
1858 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1859{
1da177e4 1860#ifdef CONFIG_IP_ROUTE_MULTIPATH
0e884c78 1861 if (res->fi && res->fi->fib_nhs > 1) {
bf4e0a3d 1862 int h = fib_multipath_hash(res->fi, NULL, skb);
0e884c78 1863
0e884c78
PN
1864 fib_select_multipath(res, h);
1865 }
1da177e4
LT
1866#endif
1867
1868 /* create a routing cache entry */
c6cffba4 1869 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1da177e4
LT
1870}
1871
1da177e4
LT
1872/*
1873 * NOTE. We drop all the packets that has local source
1874 * addresses, because every properly looped back packet
1875 * must have correct destination already attached by output routine.
1876 *
1877 * Such approach solves two big problems:
1878 * 1. Not simplex devices are handled properly.
1879 * 2. IP spoofing attempts are filtered with 100% of guarantee.
ebc0ffae 1880 * called with rcu_read_lock()
1da177e4
LT
1881 */
1882
9e12bb22 1883static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
5510cdf7
DA
1884 u8 tos, struct net_device *dev,
1885 struct fib_result *res)
1da177e4 1886{
96d36220 1887 struct in_device *in_dev = __in_dev_get_rcu(dev);
1b7179d3 1888 struct ip_tunnel_info *tun_info;
68a5e3dd 1889 struct flowi4 fl4;
95c96174 1890 unsigned int flags = 0;
1da177e4 1891 u32 itag = 0;
95c96174 1892 struct rtable *rth;
1da177e4 1893 int err = -EINVAL;
5e73ea1a 1894 struct net *net = dev_net(dev);
d2d68ba9 1895 bool do_cache;
1da177e4
LT
1896
1897 /* IP on this device is disabled. */
1898
1899 if (!in_dev)
1900 goto out;
1901
1902 /* Check for the most weird martians, which can be not detected
1903 by fib_lookup.
1904 */
1905
61adedf3 1906 tun_info = skb_tunnel_info(skb);
46fa062a 1907 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1b7179d3
TG
1908 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1909 else
1910 fl4.flowi4_tun_key.tun_id = 0;
f38a9eb1
TG
1911 skb_dst_drop(skb);
1912
d0daebc3 1913 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1da177e4
LT
1914 goto martian_source;
1915
5510cdf7
DA
1916 res->fi = NULL;
1917 res->table = NULL;
27a954bd 1918 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1da177e4
LT
1919 goto brd_input;
1920
1921 /* Accept zero addresses only to limited broadcast;
1922 * I even do not know to fix it or not. Waiting for complains :-)
1923 */
f97c1e0c 1924 if (ipv4_is_zeronet(saddr))
1da177e4
LT
1925 goto martian_source;
1926
d0daebc3 1927 if (ipv4_is_zeronet(daddr))
1da177e4
LT
1928 goto martian_destination;
1929
9eb43e76
ED
1930 /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1931 * and call it once if daddr or/and saddr are loopback addresses
1932 */
1933 if (ipv4_is_loopback(daddr)) {
1934 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3 1935 goto martian_destination;
9eb43e76
ED
1936 } else if (ipv4_is_loopback(saddr)) {
1937 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3
TG
1938 goto martian_source;
1939 }
1940
1da177e4
LT
1941 /*
1942 * Now we are ready to route packet.
1943 */
68a5e3dd 1944 fl4.flowi4_oif = 0;
e0d56fdd 1945 fl4.flowi4_iif = dev->ifindex;
68a5e3dd
DM
1946 fl4.flowi4_mark = skb->mark;
1947 fl4.flowi4_tos = tos;
1948 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
b84f7878 1949 fl4.flowi4_flags = 0;
68a5e3dd
DM
1950 fl4.daddr = daddr;
1951 fl4.saddr = saddr;
8bcfd092 1952 fl4.flowi4_uid = sock_net_uid(net, NULL);
5510cdf7 1953 err = fib_lookup(net, &fl4, res, 0);
cd0f0b95
DJ
1954 if (err != 0) {
1955 if (!IN_DEV_FORWARD(in_dev))
1956 err = -EHOSTUNREACH;
1da177e4 1957 goto no_route;
cd0f0b95 1958 }
1da177e4 1959
5510cdf7 1960 if (res->type == RTN_BROADCAST)
1da177e4
LT
1961 goto brd_input;
1962
5510cdf7 1963 if (res->type == RTN_LOCAL) {
5c04c819 1964 err = fib_validate_source(skb, saddr, daddr, tos,
0d5edc68 1965 0, dev, in_dev, &itag);
b5f7e755 1966 if (err < 0)
0d753960 1967 goto martian_source;
1da177e4
LT
1968 goto local_input;
1969 }
1970
cd0f0b95
DJ
1971 if (!IN_DEV_FORWARD(in_dev)) {
1972 err = -EHOSTUNREACH;
251da413 1973 goto no_route;
cd0f0b95 1974 }
5510cdf7 1975 if (res->type != RTN_UNICAST)
1da177e4
LT
1976 goto martian_destination;
1977
5510cdf7 1978 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1da177e4
LT
1979out: return err;
1980
1981brd_input:
1982 if (skb->protocol != htons(ETH_P_IP))
1983 goto e_inval;
1984
41347dcd 1985 if (!ipv4_is_zeronet(saddr)) {
9e56e380
DM
1986 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1987 in_dev, &itag);
1da177e4 1988 if (err < 0)
0d753960 1989 goto martian_source;
1da177e4
LT
1990 }
1991 flags |= RTCF_BROADCAST;
5510cdf7 1992 res->type = RTN_BROADCAST;
1da177e4
LT
1993 RT_CACHE_STAT_INC(in_brd);
1994
1995local_input:
d2d68ba9 1996 do_cache = false;
5510cdf7 1997 if (res->fi) {
fe3edf45 1998 if (!itag) {
5510cdf7 1999 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
d2d68ba9 2000 if (rt_cache_valid(rth)) {
c6cffba4
DM
2001 skb_dst_set_noref(skb, &rth->dst);
2002 err = 0;
2003 goto out;
d2d68ba9
DM
2004 }
2005 do_cache = true;
2006 }
2007 }
2008
f5a0aab8 2009 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
5510cdf7 2010 flags | RTCF_LOCAL, res->type,
d2d68ba9 2011 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1da177e4
LT
2012 if (!rth)
2013 goto e_nobufs;
2014
d8d1f30b 2015 rth->dst.output= ip_rt_bug;
cf911662
DM
2016#ifdef CONFIG_IP_ROUTE_CLASSID
2017 rth->dst.tclassid = itag;
2018#endif
9917e1e8 2019 rth->rt_is_input = 1;
5510cdf7
DA
2020 if (res->table)
2021 rth->rt_table_id = res->table->tb_id;
571e7226 2022
a6254864 2023 RT_CACHE_STAT_INC(in_slow_tot);
5510cdf7 2024 if (res->type == RTN_UNREACHABLE) {
d8d1f30b
CG
2025 rth->dst.input= ip_error;
2026 rth->dst.error= -err;
1da177e4
LT
2027 rth->rt_flags &= ~RTCF_LOCAL;
2028 }
efd85700 2029
dcdfdf56 2030 if (do_cache) {
5510cdf7 2031 struct fib_nh *nh = &FIB_RES_NH(*res);
efd85700
TG
2032
2033 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2034 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2035 WARN_ON(rth->dst.input == lwtunnel_input);
2036 rth->dst.lwtstate->orig_input = rth->dst.input;
2037 rth->dst.input = lwtunnel_input;
2038 }
2039
a4c2fd7f 2040 if (unlikely(!rt_cache_route(nh, rth)))
dcdfdf56 2041 rt_add_uncached_list(rth);
dcdfdf56 2042 }
89aef892 2043 skb_dst_set(skb, &rth->dst);
b23dd4fe 2044 err = 0;
ebc0ffae 2045 goto out;
1da177e4
LT
2046
2047no_route:
2048 RT_CACHE_STAT_INC(in_no_route);
5510cdf7
DA
2049 res->type = RTN_UNREACHABLE;
2050 res->fi = NULL;
2051 res->table = NULL;
1da177e4
LT
2052 goto local_input;
2053
2054 /*
2055 * Do not cache martian addresses: they should be logged (RFC1812)
2056 */
2057martian_destination:
2058 RT_CACHE_STAT_INC(in_martian_dst);
2059#ifdef CONFIG_IP_ROUTE_VERBOSE
e87cc472
JP
2060 if (IN_DEV_LOG_MARTIANS(in_dev))
2061 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2062 &daddr, &saddr, dev->name);
1da177e4 2063#endif
2c2910a4 2064
1da177e4
LT
2065e_inval:
2066 err = -EINVAL;
ebc0ffae 2067 goto out;
1da177e4
LT
2068
2069e_nobufs:
2070 err = -ENOBUFS;
ebc0ffae 2071 goto out;
1da177e4
LT
2072
2073martian_source:
2074 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
ebc0ffae 2075 goto out;
1da177e4
LT
2076}
2077
c6cffba4
DM
/* Resolve the input route for @skb and attach it as the skb's dst.
 *
 * Public entry point for callers that do not already hold the RCU read
 * lock: it masks @tos down to the routing-relevant bits, then wraps
 * ip_route_input_rcu() in an RCU read-side critical section.
 *
 * Returns 0 on success or a negative errno from the slow path.
 */
int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			 u8 tos, struct net_device *dev)
{
	struct fib_result res;
	int err;

	/* Only the bits of TOS that influence routing are kept. */
	tos &= IPTOS_RT_MASK;
	rcu_read_lock();
	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
	rcu_read_unlock();

	return err;
}
EXPORT_SYMBOL(ip_route_input_noref);
2092
/* called with rcu_read_lock held */
/* Dispatch an incoming packet to the multicast or unicast input path.
 *
 * For multicast destinations this decides, before any FIB work, whether
 * the packet is "ours" (device or l3mdev master subscribed to the group)
 * and only then builds a multicast input route; everything else goes to
 * ip_route_input_slow().  Returns 0 or a negative errno.
 */
int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		       u8 tos, struct net_device *dev, struct fib_result *res)
{
	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As result the host on multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   comparing with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);
		int our = 0;
		int err = -EINVAL;

		if (in_dev)
			our = ip_check_mc_rcu(in_dev, daddr, saddr,
					      ip_hdr(skb)->protocol);

		/* check l3 master if no match yet */
		if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
			struct in_device *l3_in_dev;

			/* skb->dev is the l3mdev master at this point. */
			l3_in_dev = __in_dev_get_rcu(skb->dev);
			if (l3_in_dev)
				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
						      ip_hdr(skb)->protocol);
		}

		/* Non-subscribed packets are still routed when multicast
		 * forwarding is enabled and the group is not link-local.
		 */
		if (our
#ifdef CONFIG_IP_MROUTE
			||
		    (!ipv4_is_local_multicast(daddr) &&
		     IN_DEV_MFORWARD(in_dev))
#endif
		   ) {
			err = ip_route_input_mc(skb, daddr, saddr,
						tos, dev, our);
		}
		return err;
	}

	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
}
2142
/* called with rcu_read_lock() */
/* Build (or fetch from cache) the rtable for an output route.
 *
 * @res holds the FIB lookup result; @fl4 the flow key; @orig_oif the
 * oif the caller originally asked for; @dev_out the egress device.
 *
 * Tries the per-nexthop exception cache first, then the per-cpu cached
 * route, and only allocates a fresh rtable when neither is valid.
 * Returns the route or an ERR_PTR() on failure.
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	/* Loopback source out a non-loopback device is only legal when
	 * route_localnet is enabled (or the device is an l3 master).
	 */
	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) &&
		    !(dev_out->flags & IFF_LOOPBACK) &&
		    !netif_is_l3_master(dev_out))
			return ERR_PTR(-EINVAL);

	/* Address class of the destination can override the FIB type. */
	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If multicast route do not exist use
		 * default one, but do not gateway in this case.
		 * Yes, it is hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
		   (orig_oif != dev_out->ifindex)) {
		/* For local routes that require a particular output interface
		 * we do not want to cache the result.  Caching the result
		 * causes incorrect behaviour when there are multiple source
		 * addresses on the interface, the end result being that if the
		 * intended recipient is waiting on that interface for the
		 * packet he won't receive it because it will be delivered on
		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
		 * be set to the loopback interface as well.
		 */
		fi = NULL;
	}

	fnhe = NULL;
	do_cache &= fi != NULL;
	if (do_cache) {
		struct rtable __rcu **prth;
		struct fib_nh *nh = &FIB_RES_NH(*res);

		/* Prefer a per-destination exception route (PMTU/redirect). */
		fnhe = find_exception(nh, fl4->daddr);
		if (fnhe) {
			prth = &fnhe->fnhe_rth_output;
			rth = rcu_dereference(*prth);
			if (rth && rth->dst.expires &&
			    time_after(jiffies, rth->dst.expires)) {
				/* Exception expired: drop it and fall back. */
				ip_del_fnhe(nh, fl4->daddr);
				fnhe = NULL;
			} else {
				goto rt_cache;
			}
		}

		/* FLOWI_FLAG_KNOWN_NH without an on-link gateway cannot use
		 * the shared per-cpu cache entry.
		 */
		if (unlikely(fl4->flowi4_flags &
			     FLOWI_FLAG_KNOWN_NH &&
			     !(nh->nh_gw &&
			       nh->nh_scope == RT_SCOPE_LINK))) {
			do_cache = false;
			goto add;
		}
		prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
		rth = rcu_dereference(*prth);

rt_cache:
		/* Reuse the cached route only if still valid AND we can take
		 * a reference on it.
		 */
		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
			return rth;
	}

add:
	rth = rt_dst_alloc(dev_out, flags, type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   do_cache);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->rt_iif = orig_oif;
	if (res->table)
		rth->rt_table_id = res->table->tb_id;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
	set_lwt_redirect(rth);

	return rth;
}
2277
/*
 * Major route resolver routine.
 */

/* Normalize the flow key (iif, tos, scope) and run the output-route
 * resolver under the RCU read lock.  Returns the route or ERR_PTR().
 */
struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
					const struct sk_buff *skb)
{
	__u8 tos = RT_FL_TOS(fl4);
	struct fib_result res;
	struct rtable *rth;

	res.tclassid = 0;
	res.fi = NULL;
	res.table = NULL;

	/* Locally generated traffic: iif is always loopback. */
	fl4->flowi4_iif = LOOPBACK_IFINDEX;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	/* RTO_ONLINK in the raw tos requests a link-scope lookup. */
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
	rcu_read_unlock();

	return rth;
}
EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2305
/* Core output-route resolver; caller holds rcu_read_lock().
 *
 * Validates the source address, selects the egress device (explicit oif,
 * device owning saddr, or loopback for local/empty destinations), falls
 * back to an on-link assumption when the FIB lookup fails for an explicit
 * oif, and finally builds the route via __mkroute_output().
 * On success @fl4 has been completed (saddr/daddr/oif filled in) and
 * @res holds the FIB result.  Returns the route or ERR_PTR().
 */
struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
					    struct fib_result *res,
					    const struct sk_buff *skb)
{
	struct net_device *dev_out = NULL;
	int orig_oif = fl4->flowi4_oif;
	unsigned int flags = 0;
	struct rtable *rth;
	int err = -ENETUNREACH;

	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		/* A multicast, broadcast or zero source is never valid. */
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (!dev_out)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic,vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (!dev_out)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		/* Link-local multicast, broadcast and IGMP bypass the FIB:
		 * they always go straight out the requested device.
		 */
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr) ||
		    fl4->flowi4_proto == IPPROTO_IGMP) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		/* No destination at all: route to ourselves via loopback. */
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = LOOPBACK_IFINDEX;
		res->type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	err = fib_lookup(net, fl4, res, 0);
	if (err) {
		res->fi = NULL;
		res->table = NULL;
		if (fl4->flowi4_oif &&
		    (ipv4_is_multicast(fl4->daddr) ||
		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
			/* Apparently, routing tables are wrong. Assume,
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res->type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(err);
		goto out;
	}

	if (res->type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res->fi->fib_prefsrc)
				fl4->saddr = res->fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}

		/* L3 master device is the loopback for that domain */
		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
			net->loopback_dev;

		/* make sure orig_oif points to fib result device even
		 * though packet rx/tx happens over loopback or l3mdev
		 */
		orig_oif = FIB_RES_OIF(*res);

		fl4->flowi4_oif = dev_out->ifindex;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	/* Multipath/next-hop selection for ordinary unicast routes. */
	fib_select_path(net, res, fl4, skb);

	dev_out = FIB_RES_DEV(*res);
	fl4->flowi4_oif = dev_out->ifindex;


make_route:
	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);

out:
	return rth;
}
d8c97a94 2475
ae2688d5
JW
/* Blackhole dst .check: always report the entry as invalid (NULL). */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2480
ebb762f2 2481static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 2482{
618f9bc7
SK
2483 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2484
2485 return mtu ? : dst->dev->mtu;
ec831ea7
RD
2486}
2487
6700c270
DM
/* Blackhole dst .update_pmtu: deliberately a no-op. */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu)
{
}
2492
6700c270
DM
/* Blackhole dst .redirect: deliberately a no-op. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}
2497
0972ddb2
HB
/* Blackhole dst .cow_metrics: never allocate writable metrics. */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2503
14e50e57
DM
/* dst_ops for blackhole routes (see ipv4_blackhole_route() below):
 * all mutating callbacks are no-ops or report the entry invalid.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			= AF_INET,
	.check			= ipv4_blackhole_dst_check,
	.mtu			= ipv4_blackhole_mtu,
	.default_advmss		= ipv4_default_advmss,
	.update_pmtu		= ipv4_rt_blackhole_update_pmtu,
	.redirect		= ipv4_rt_blackhole_redirect,
	.cow_metrics		= ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		= ipv4_neigh_lookup,
};
2514
/* Clone @dst_orig into a "blackhole" route that discards all traffic
 * (input and output are dst_discard*) while keeping the original's
 * routing attributes visible.  Consumes a reference on @dst_orig.
 * Returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *ort = (struct rtable *) dst_orig;
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		/* Both directions silently drop packets. */
		new->input = dst_discard;
		new->output = dst_discard_out;

		new->dev = net->loopback_dev;
		if (new->dev)
			dev_hold(new->dev);

		/* Mirror the attributes of the original route. */
		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;

		rt->rt_genid = rt_genid_ipv4(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_uses_gateway = ort->rt_uses_gateway;

		INIT_LIST_HEAD(&rt->rt_uncached);
	}

	/* Reference to the original is dropped even on allocation failure. */
	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
2549
/* Resolve an output route and, when a protocol is set in the flow,
 * pass the result through the xfrm (IPsec) layer so policy can wrap
 * or replace it.  Returns the route or an ERR_PTR().
 */
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    const struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	/* xfrm_lookup_route() may substitute its own dst for @rt. */
	if (flp4->flowi4_proto)
		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
							flowi4_to_flowi(flp4),
							sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
2566
/* called with rcu_read_lock held */
/* Fill a RTM_NEWROUTE netlink message describing the route attached to
 * @skb (as placed there by the lookup) into @skb itself.
 *
 * On attribute overflow every path unwinds through nla_put_failure and
 * cancels the partially built message, returning -EMSGSIZE.
 * Returns 0 on success.
 */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
			u32 seq)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
	if (!nlh)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4->flowi4_tos;
	/* Table ids >= 256 don't fit in the legacy u8 field. */
	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table_id))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
		r->rtm_flags |= RTCF_DOREDIRECT;

	if (nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_in_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	/* Preferred source only makes sense for output routes. */
	if (!rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway &&
	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	/* Report remaining lifetime, clamped to zero if already expired. */
	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	/* An unexpired PMTU learned for this route overrides the metric. */
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4->flowi4_mark &&
	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
		goto nla_put_failure;

	if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
	    nla_put_u32(skb, RTA_UID,
			from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
		goto nla_put_failure;

	error = rt->dst.error;

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 fl4->saddr, fl4->daddr,
						 r, portid);

			if (err <= 0) {
				if (err == 0)
					return 0;
				goto nla_put_failure;
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
				goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2680
c21ef3e3
DA
/* RTM_GETROUTE handler: perform a one-off route lookup on behalf of
 * userspace and answer with a RTM_NEWROUTE message.
 *
 * A dummy skb with a minimal IP header is built so the lookup can run
 * through the normal input path (when RTA_IIF is given) or the output
 * path otherwise.  The whole lookup and message fill happens under one
 * RCU read-side section; error paths unwind through errout_free which
 * also drops the RCU lock and frees the reply skb.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct fib_result res = {};
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;
	u32 table_id = RT_TABLE_MAIN;
	kuid_t uid;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
			  extack);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	/* Bugfix: need to give ip_route_input enough of an IP header to
	 * not gag.
	 */
	ip_hdr(skb)->protocol = IPPROTO_UDP;
	ip_hdr(skb)->saddr = src;
	ip_hdr(skb)->daddr = dst;

	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;

	rcu_read_lock();

	if (iif) {
		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_free;
		}

		/* Simulate reception on @dev and use the input path. */
		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
					 dev, &res);

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
		else
			skb_dst_set(skb, &rt->dst);
	}

	if (err)
		goto errout_free;

	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = rt->rt_table_id;

	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		/* Report the matched FIB entry instead of the cached route. */
		if (!res.fi) {
			err = fib_props[res.type].error;
			if (!err)
				err = -EHOSTUNREACH;
			goto errout_free;
		}
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
				    rt->rt_type, res.prefix, res.prefixlen,
				    fl4.flowi4_tos, res.fi, 0);
	} else {
		err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
	}
	if (err < 0)
		goto errout_free;

	rcu_read_unlock();

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;

errout_free:
	rcu_read_unlock();
	kfree_skb(skb);
	goto errout;
}
2811
1da177e4
LT
2812void ip_rt_multicast_event(struct in_device *in_dev)
2813{
4ccfe6d4 2814 rt_cache_flush(dev_net(in_dev->dev));
1da177e4
LT
2815}
2816
#ifdef CONFIG_SYSCTL
/* Default values for the GC tunables exposed via the sysctl table below. */
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_gc_elasticity __read_mostly	= 8;
fe2c6338 2822static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
8d65af78 2823 void __user *buffer,
1da177e4
LT
2824 size_t *lenp, loff_t *ppos)
2825{
5aad1de5
TT
2826 struct net *net = (struct net *)__ctl->extra1;
2827
1da177e4 2828 if (write) {
5aad1de5
TT
2829 rt_cache_flush(net);
2830 fnhe_genid_bump(net);
1da177e4 2831 return 0;
e905a9ed 2832 }
1da177e4
LT
2833
2834 return -EINVAL;
2835}
2836
/* Global net.ipv4.route.* sysctl table (shared across namespaces;
 * the per-netns "flush" entry lives in ipv4_route_flush_table below).
 */
static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
39a23e75 2947
39a23e75
DL
2948static struct ctl_table ipv4_route_flush_table[] = {
2949 {
39a23e75
DL
2950 .procname = "flush",
2951 .maxlen = sizeof(int),
2952 .mode = 0200,
6d9f239a 2953 .proc_handler = ipv4_sysctl_rtcache_flush,
39a23e75 2954 },
f8572d8f 2955 { },
39a23e75
DL
2956};
2957
2958static __net_init int sysctl_route_net_init(struct net *net)
2959{
2960 struct ctl_table *tbl;
2961
2962 tbl = ipv4_route_flush_table;
09ad9bc7 2963 if (!net_eq(net, &init_net)) {
39a23e75 2964 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
51456b29 2965 if (!tbl)
39a23e75 2966 goto err_dup;
464dc801
EB
2967
2968 /* Don't export sysctls to unprivileged users */
2969 if (net->user_ns != &init_user_ns)
2970 tbl[0].procname = NULL;
39a23e75
DL
2971 }
2972 tbl[0].extra1 = net;
2973
ec8f23ce 2974 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
51456b29 2975 if (!net->ipv4.route_hdr)
39a23e75
DL
2976 goto err_reg;
2977 return 0;
2978
2979err_reg:
2980 if (tbl != ipv4_route_flush_table)
2981 kfree(tbl);
2982err_dup:
2983 return -ENOMEM;
2984}
2985
2986static __net_exit void sysctl_route_net_exit(struct net *net)
2987{
2988 struct ctl_table *tbl;
2989
2990 tbl = net->ipv4.route_hdr->ctl_table_arg;
2991 unregister_net_sysctl_table(net->ipv4.route_hdr);
2992 BUG_ON(tbl == ipv4_route_flush_table);
2993 kfree(tbl);
2994}
2995
2996static __net_initdata struct pernet_operations sysctl_route_ops = {
2997 .init = sysctl_route_net_init,
2998 .exit = sysctl_route_net_exit,
2999};
1da177e4
LT
3000#endif
3001
3ee94372 3002static __net_init int rt_genid_init(struct net *net)
9f5e97e5 3003{
ca4c3fc2 3004 atomic_set(&net->ipv4.rt_genid, 0);
5aad1de5 3005 atomic_set(&net->fnhe_genid, 0);
7aed9f72 3006 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
9f5e97e5
DL
3007 return 0;
3008}
3009
3ee94372
NH
3010static __net_initdata struct pernet_operations rt_genid_ops = {
3011 .init = rt_genid_init,
9f5e97e5
DL
3012};
3013
c3426b47
DM
3014static int __net_init ipv4_inetpeer_init(struct net *net)
3015{
3016 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3017
3018 if (!bp)
3019 return -ENOMEM;
3020 inet_peer_base_init(bp);
3021 net->ipv4.peers = bp;
3022 return 0;
3023}
3024
3025static void __net_exit ipv4_inetpeer_exit(struct net *net)
3026{
3027 struct inet_peer_base *bp = net->ipv4.peers;
3028
3029 net->ipv4.peers = NULL;
56a6b248 3030 inetpeer_invalidate_tree(bp);
c3426b47
DM
3031 kfree(bp);
3032}
3033
3034static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3035 .init = ipv4_inetpeer_init,
3036 .exit = ipv4_inetpeer_exit,
3037};
9f5e97e5 3038
c7066f70 3039#ifdef CONFIG_IP_ROUTE_CLASSID
7d720c3e 3040struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
c7066f70 3041#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4 3042
1da177e4
LT
3043int __init ip_rt_init(void)
3044{
5055c371 3045 int cpu;
1da177e4 3046
73f156a6
ED
3047 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3048 if (!ip_idents)
3049 panic("IP: failed to allocate ip_idents\n");
3050
3051 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3052
355b590c
ED
3053 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3054 if (!ip_tstamps)
3055 panic("IP: failed to allocate ip_tstamps\n");
3056
5055c371
ED
3057 for_each_possible_cpu(cpu) {
3058 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3059
3060 INIT_LIST_HEAD(&ul->head);
3061 spin_lock_init(&ul->lock);
3062 }
c7066f70 3063#ifdef CONFIG_IP_ROUTE_CLASSID
0dcec8c2 3064 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
1da177e4
LT
3065 if (!ip_rt_acct)
3066 panic("IP: failed to allocate ip_rt_acct\n");
1da177e4
LT
3067#endif
3068
e5d679f3
AD
3069 ipv4_dst_ops.kmem_cachep =
3070 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
20c2df83 3071 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1da177e4 3072
14e50e57
DM
3073 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3074
fc66f95c
ED
3075 if (dst_entries_init(&ipv4_dst_ops) < 0)
3076 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3077
3078 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3079 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3080
89aef892
DM
3081 ipv4_dst_ops.gc_thresh = ~0;
3082 ip_rt_max_size = INT_MAX;
1da177e4 3083
1da177e4
LT
3084 devinet_init();
3085 ip_fib_init();
3086
73b38711 3087 if (ip_rt_proc_init())
058bd4d2 3088 pr_err("Unable to create route proc files\n");
1da177e4
LT
3089#ifdef CONFIG_XFRM
3090 xfrm_init();
703fb94e 3091 xfrm4_init();
1da177e4 3092#endif
394f51ab
FW
3093 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3094 RTNL_FLAG_DOIT_UNLOCKED);
63f3444f 3095
39a23e75
DL
3096#ifdef CONFIG_SYSCTL
3097 register_pernet_subsys(&sysctl_route_ops);
3098#endif
3ee94372 3099 register_pernet_subsys(&rt_genid_ops);
c3426b47 3100 register_pernet_subsys(&ipv4_inetpeer_ops);
1bcdca3f 3101 return 0;
1da177e4
LT
3102}
3103
a1bc6eb4 3104#ifdef CONFIG_SYSCTL
eeb61f71
AV
3105/*
3106 * We really need to sanitize the damn ipv4 init order, then all
3107 * this nonsense will go away.
3108 */
3109void __init ip_static_sysctl_init(void)
3110{
4e5ca785 3111 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
eeb61f71 3112}
a1bc6eb4 3113#endif