]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blame - net/ipv4/route.c
net: ipv4: use a dedicated counter for icmp_v4 redirect packets
[mirror_ubuntu-bionic-kernel.git] / net / ipv4 / route.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
e905a9ed 21 * Alan Cox : Super /proc >4K
1da177e4
LT
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
e905a9ed 39 *
1da177e4
LT
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
afd46503
JP
65#define pr_fmt(fmt) "IPv4: " fmt
66
1da177e4 67#include <linux/module.h>
7c0f6ba6 68#include <linux/uaccess.h>
1da177e4
LT
69#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
1da177e4
LT
72#include <linux/mm.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/skbuff.h>
1da177e4
LT
83#include <linux/inetdevice.h>
84#include <linux/igmp.h>
85#include <linux/pkt_sched.h>
86#include <linux/mroute.h>
87#include <linux/netfilter_ipv4.h>
88#include <linux/random.h>
1da177e4
LT
89#include <linux/rcupdate.h>
90#include <linux/times.h>
5a0e3ad6 91#include <linux/slab.h>
73f156a6 92#include <linux/jhash.h>
352e512c 93#include <net/dst.h>
1b7179d3 94#include <net/dst_metadata.h>
457c4cbc 95#include <net/net_namespace.h>
1da177e4
LT
96#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
571e7226 106#include <net/lwtunnel.h>
8d71740c 107#include <net/netevent.h>
63f3444f 108#include <net/rtnetlink.h>
1da177e4
LT
109#ifdef CONFIG_SYSCTL
110#include <linux/sysctl.h>
7426a564 111#include <linux/kmemleak.h>
1da177e4 112#endif
6e5714ea 113#include <net/secure_seq.h>
1b7179d3 114#include <net/ip_tunnels.h>
385add90 115#include <net/l3mdev.h>
1da177e4 116
b6179813
RP
117#include "fib_lookup.h"
118
68a5e3dd 119#define RT_FL_TOS(oldflp4) \
f61759e6 120 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
1da177e4 121
1da177e4
LT
122#define RT_GC_TIMEOUT (300*HZ)
123
1da177e4 124static int ip_rt_max_size;
817bc4db
SH
125static int ip_rt_redirect_number __read_mostly = 9;
126static int ip_rt_redirect_load __read_mostly = HZ / 50;
127static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128static int ip_rt_error_cost __read_mostly = HZ;
129static int ip_rt_error_burst __read_mostly = 5 * HZ;
817bc4db 130static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
2eda86c3 131static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
817bc4db 132static int ip_rt_min_advmss __read_mostly = 256;
9f28a2fc 133
deed49df 134static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
2eda86c3
SD
135
136static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
137
1da177e4
LT
138/*
139 * Interface to generic destination cache.
140 */
141
142static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
0dbaee3b 143static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
ebb762f2 144static unsigned int ipv4_mtu(const struct dst_entry *dst);
1da177e4
LT
145static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146static void ipv4_link_failure(struct sk_buff *skb);
6700c270
DM
147static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
148 struct sk_buff *skb, u32 mtu);
149static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
150 struct sk_buff *skb);
caacf05e 151static void ipv4_dst_destroy(struct dst_entry *dst);
1da177e4 152
62fa8a84
DM
153static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154{
31248731
DM
155 WARN_ON(1);
156 return NULL;
62fa8a84
DM
157}
158
f894cbf8
DM
159static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
160 struct sk_buff *skb,
161 const void *daddr);
63fca65d 162static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
d3aaeb38 163
1da177e4
LT
164static struct dst_ops ipv4_dst_ops = {
165 .family = AF_INET,
1da177e4 166 .check = ipv4_dst_check,
0dbaee3b 167 .default_advmss = ipv4_default_advmss,
ebb762f2 168 .mtu = ipv4_mtu,
62fa8a84 169 .cow_metrics = ipv4_cow_metrics,
caacf05e 170 .destroy = ipv4_dst_destroy,
1da177e4
LT
171 .negative_advice = ipv4_negative_advice,
172 .link_failure = ipv4_link_failure,
173 .update_pmtu = ip_rt_update_pmtu,
e47a185b 174 .redirect = ip_do_redirect,
b92dacd4 175 .local_out = __ip_local_out,
d3aaeb38 176 .neigh_lookup = ipv4_neigh_lookup,
63fca65d 177 .confirm_neigh = ipv4_confirm_neigh,
1da177e4
LT
178};
179
180#define ECN_OR_COST(class) TC_PRIO_##class
181
4839c52b 182const __u8 ip_tos2prio[16] = {
1da177e4 183 TC_PRIO_BESTEFFORT,
4a2b9c37 184 ECN_OR_COST(BESTEFFORT),
1da177e4
LT
185 TC_PRIO_BESTEFFORT,
186 ECN_OR_COST(BESTEFFORT),
187 TC_PRIO_BULK,
188 ECN_OR_COST(BULK),
189 TC_PRIO_BULK,
190 ECN_OR_COST(BULK),
191 TC_PRIO_INTERACTIVE,
192 ECN_OR_COST(INTERACTIVE),
193 TC_PRIO_INTERACTIVE,
194 ECN_OR_COST(INTERACTIVE),
195 TC_PRIO_INTERACTIVE_BULK,
196 ECN_OR_COST(INTERACTIVE_BULK),
197 TC_PRIO_INTERACTIVE_BULK,
198 ECN_OR_COST(INTERACTIVE_BULK)
199};
d4a96865 200EXPORT_SYMBOL(ip_tos2prio);
1da177e4 201
2f970d83 202static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
3ed66e91 203#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
1da177e4 204
1da177e4 205#ifdef CONFIG_PROC_FS
1da177e4
LT
206static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
207{
29e75252 208 if (*pos)
89aef892 209 return NULL;
29e75252 210 return SEQ_START_TOKEN;
1da177e4
LT
211}
212
213static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
214{
1da177e4 215 ++*pos;
89aef892 216 return NULL;
1da177e4
LT
217}
218
/* seq_file .stop for the "rt_cache" proc file: nothing to release. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	return;
}
222
223static int rt_cache_seq_show(struct seq_file *seq, void *v)
224{
225 if (v == SEQ_START_TOKEN)
226 seq_printf(seq, "%-127s\n",
227 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
228 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
229 "HHUptod\tSpecDst");
e905a9ed 230 return 0;
1da177e4
LT
231}
232
f690808e 233static const struct seq_operations rt_cache_seq_ops = {
1da177e4
LT
234 .start = rt_cache_seq_start,
235 .next = rt_cache_seq_next,
236 .stop = rt_cache_seq_stop,
237 .show = rt_cache_seq_show,
238};
239
240static int rt_cache_seq_open(struct inode *inode, struct file *file)
241{
89aef892 242 return seq_open(file, &rt_cache_seq_ops);
1da177e4
LT
243}
244
9a32144e 245static const struct file_operations rt_cache_seq_fops = {
1da177e4
LT
246 .owner = THIS_MODULE,
247 .open = rt_cache_seq_open,
248 .read = seq_read,
249 .llseek = seq_lseek,
89aef892 250 .release = seq_release,
1da177e4
LT
251};
252
253
254static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
255{
256 int cpu;
257
258 if (*pos == 0)
259 return SEQ_START_TOKEN;
260
0f23174a 261 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
262 if (!cpu_possible(cpu))
263 continue;
264 *pos = cpu+1;
2f970d83 265 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
266 }
267 return NULL;
268}
269
270static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
271{
272 int cpu;
273
0f23174a 274 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
275 if (!cpu_possible(cpu))
276 continue;
277 *pos = cpu+1;
2f970d83 278 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
279 }
280 return NULL;
e905a9ed 281
1da177e4
LT
282}
283
/* seq_file .stop for the per-CPU statistics file: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
	return;
}
288
289static int rt_cpu_seq_show(struct seq_file *seq, void *v)
290{
291 struct rt_cache_stat *st = v;
292
293 if (v == SEQ_START_TOKEN) {
5bec0039 294 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
1da177e4
LT
295 return 0;
296 }
e905a9ed 297
1da177e4
LT
298 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
299 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
fc66f95c 300 dst_entries_get_slow(&ipv4_dst_ops),
0baf2b35 301 0, /* st->in_hit */
1da177e4
LT
302 st->in_slow_tot,
303 st->in_slow_mc,
304 st->in_no_route,
305 st->in_brd,
306 st->in_martian_dst,
307 st->in_martian_src,
308
0baf2b35 309 0, /* st->out_hit */
1da177e4 310 st->out_slow_tot,
e905a9ed 311 st->out_slow_mc,
1da177e4 312
0baf2b35
ED
313 0, /* st->gc_total */
314 0, /* st->gc_ignored */
315 0, /* st->gc_goal_miss */
316 0, /* st->gc_dst_overflow */
317 0, /* st->in_hlist_search */
318 0 /* st->out_hlist_search */
1da177e4
LT
319 );
320 return 0;
321}
322
f690808e 323static const struct seq_operations rt_cpu_seq_ops = {
1da177e4
LT
324 .start = rt_cpu_seq_start,
325 .next = rt_cpu_seq_next,
326 .stop = rt_cpu_seq_stop,
327 .show = rt_cpu_seq_show,
328};
329
330
331static int rt_cpu_seq_open(struct inode *inode, struct file *file)
332{
333 return seq_open(file, &rt_cpu_seq_ops);
334}
335
9a32144e 336static const struct file_operations rt_cpu_seq_fops = {
1da177e4
LT
337 .owner = THIS_MODULE,
338 .open = rt_cpu_seq_open,
339 .read = seq_read,
340 .llseek = seq_lseek,
341 .release = seq_release,
342};
343
c7066f70 344#ifdef CONFIG_IP_ROUTE_CLASSID
a661c419 345static int rt_acct_proc_show(struct seq_file *m, void *v)
78c686e9 346{
a661c419
AD
347 struct ip_rt_acct *dst, *src;
348 unsigned int i, j;
349
350 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
351 if (!dst)
352 return -ENOMEM;
353
354 for_each_possible_cpu(i) {
355 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
356 for (j = 0; j < 256; j++) {
357 dst[j].o_bytes += src[j].o_bytes;
358 dst[j].o_packets += src[j].o_packets;
359 dst[j].i_bytes += src[j].i_bytes;
360 dst[j].i_packets += src[j].i_packets;
361 }
78c686e9
PE
362 }
363
a661c419
AD
364 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
365 kfree(dst);
366 return 0;
367}
78c686e9 368
a661c419
AD
369static int rt_acct_proc_open(struct inode *inode, struct file *file)
370{
371 return single_open(file, rt_acct_proc_show, NULL);
78c686e9 372}
a661c419
AD
373
374static const struct file_operations rt_acct_proc_fops = {
375 .owner = THIS_MODULE,
376 .open = rt_acct_proc_open,
377 .read = seq_read,
378 .llseek = seq_lseek,
379 .release = single_release,
380};
78c686e9 381#endif
107f1634 382
73b38711 383static int __net_init ip_rt_do_proc_init(struct net *net)
107f1634
PE
384{
385 struct proc_dir_entry *pde;
386
d4beaa66
G
387 pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
388 &rt_cache_seq_fops);
107f1634
PE
389 if (!pde)
390 goto err1;
391
77020720
WC
392 pde = proc_create("rt_cache", S_IRUGO,
393 net->proc_net_stat, &rt_cpu_seq_fops);
107f1634
PE
394 if (!pde)
395 goto err2;
396
c7066f70 397#ifdef CONFIG_IP_ROUTE_CLASSID
a661c419 398 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
107f1634
PE
399 if (!pde)
400 goto err3;
401#endif
402 return 0;
403
c7066f70 404#ifdef CONFIG_IP_ROUTE_CLASSID
107f1634
PE
405err3:
406 remove_proc_entry("rt_cache", net->proc_net_stat);
407#endif
408err2:
409 remove_proc_entry("rt_cache", net->proc_net);
410err1:
411 return -ENOMEM;
412}
73b38711
DL
413
414static void __net_exit ip_rt_do_proc_exit(struct net *net)
415{
416 remove_proc_entry("rt_cache", net->proc_net_stat);
417 remove_proc_entry("rt_cache", net->proc_net);
c7066f70 418#ifdef CONFIG_IP_ROUTE_CLASSID
73b38711 419 remove_proc_entry("rt_acct", net->proc_net);
0a931acf 420#endif
73b38711
DL
421}
422
423static struct pernet_operations ip_rt_proc_ops __net_initdata = {
424 .init = ip_rt_do_proc_init,
425 .exit = ip_rt_do_proc_exit,
426};
427
428static int __init ip_rt_proc_init(void)
429{
430 return register_pernet_subsys(&ip_rt_proc_ops);
431}
432
107f1634 433#else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(void)
{
	const int ok = 0;

	return ok;
}
1da177e4 438#endif /* CONFIG_PROC_FS */
e905a9ed 439
4331debc 440static inline bool rt_is_expired(const struct rtable *rth)
e84f84f2 441{
ca4c3fc2 442 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
e84f84f2
DL
443}
444
/*
 * Bump the namespace's IPv4 route generation id.  Routes carrying the
 * previous id are then reported as expired by rt_is_expired().
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
449
f894cbf8
DM
450static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
451 struct sk_buff *skb,
452 const void *daddr)
3769cffb 453{
d3aaeb38
DM
454 struct net_device *dev = dst->dev;
455 const __be32 *pkey = daddr;
39232973 456 const struct rtable *rt;
3769cffb
DM
457 struct neighbour *n;
458
39232973 459 rt = (const struct rtable *) dst;
a263b309 460 if (rt->rt_gateway)
39232973 461 pkey = (const __be32 *) &rt->rt_gateway;
f894cbf8
DM
462 else if (skb)
463 pkey = &ip_hdr(skb)->daddr;
d3aaeb38 464
80703d26 465 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
d3aaeb38
DM
466 if (n)
467 return n;
32092ecf 468 return neigh_create(&arp_tbl, pkey, dev);
d3aaeb38
DM
469}
470
63fca65d
JA
471static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
472{
473 struct net_device *dev = dst->dev;
474 const __be32 *pkey = daddr;
475 const struct rtable *rt;
476
477 rt = (const struct rtable *)dst;
478 if (rt->rt_gateway)
479 pkey = (const __be32 *)&rt->rt_gateway;
480 else if (!daddr ||
481 (rt->rt_flags &
482 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
483 return;
484
485 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
486}
487
04ca6973 488#define IP_IDENTS_SZ 2048u
04ca6973 489
355b590c
ED
490static atomic_t *ip_idents __read_mostly;
491static u32 *ip_tstamps __read_mostly;
04ca6973
ED
492
493/* In order to protect privacy, we add a perturbation to identifiers
494 * if one generator is seldom used. This makes hard for an attacker
495 * to infer how many packets were sent between two points in time.
496 */
497u32 ip_idents_reserve(u32 hash, int segs)
498{
355b590c
ED
499 u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
500 atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
6aa7de05 501 u32 old = READ_ONCE(*p_tstamp);
04ca6973 502 u32 now = (u32)jiffies;
adb03115 503 u32 new, delta = 0;
04ca6973 504
355b590c 505 if (old != now && cmpxchg(p_tstamp, old, now) == old)
04ca6973
ED
506 delta = prandom_u32_max(now - old);
507
adb03115
ED
508 /* Do not use atomic_add_return() as it makes UBSAN unhappy */
509 do {
510 old = (u32)atomic_read(p_id);
511 new = old + delta + segs;
512 } while (atomic_cmpxchg(p_id, old, new) != old);
513
514 return new - segs;
04ca6973
ED
515}
516EXPORT_SYMBOL(ip_idents_reserve);
1da177e4 517
b6a7719a 518void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
1da177e4 519{
73f156a6
ED
520 static u32 ip_idents_hashrnd __read_mostly;
521 u32 hash, id;
1da177e4 522
73f156a6 523 net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
1da177e4 524
04ca6973
ED
525 hash = jhash_3words((__force u32)iph->daddr,
526 (__force u32)iph->saddr,
b6a7719a 527 iph->protocol ^ net_hash_mix(net),
04ca6973 528 ip_idents_hashrnd);
73f156a6
ED
529 id = ip_idents_reserve(hash, segs);
530 iph->id = htons(id);
1da177e4 531}
4bc2f18b 532EXPORT_SYMBOL(__ip_select_ident);
1da177e4 533
e2d118a1
LC
534static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
535 const struct sock *sk,
4895c771
DM
536 const struct iphdr *iph,
537 int oif, u8 tos,
538 u8 prot, u32 mark, int flow_flags)
539{
540 if (sk) {
541 const struct inet_sock *inet = inet_sk(sk);
542
543 oif = sk->sk_bound_dev_if;
544 mark = sk->sk_mark;
545 tos = RT_CONN_FLAGS(sk);
546 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
547 }
548 flowi4_init_output(fl4, oif, mark, tos,
549 RT_SCOPE_UNIVERSE, prot,
550 flow_flags,
e2d118a1
LC
551 iph->daddr, iph->saddr, 0, 0,
552 sock_net_uid(net, sk));
4895c771
DM
553}
554
5abf7f7e
ED
555static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
556 const struct sock *sk)
4895c771 557{
d109e61b 558 const struct net *net = dev_net(skb->dev);
4895c771
DM
559 const struct iphdr *iph = ip_hdr(skb);
560 int oif = skb->dev->ifindex;
561 u8 tos = RT_TOS(iph->tos);
562 u8 prot = iph->protocol;
563 u32 mark = skb->mark;
564
d109e61b 565 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
4895c771
DM
566}
567
5abf7f7e 568static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
4895c771
DM
569{
570 const struct inet_sock *inet = inet_sk(sk);
5abf7f7e 571 const struct ip_options_rcu *inet_opt;
4895c771
DM
572 __be32 daddr = inet->inet_daddr;
573
574 rcu_read_lock();
575 inet_opt = rcu_dereference(inet->inet_opt);
576 if (inet_opt && inet_opt->opt.srr)
577 daddr = inet_opt->opt.faddr;
578 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
579 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
580 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
581 inet_sk_flowi_flags(sk),
e2d118a1 582 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
4895c771
DM
583 rcu_read_unlock();
584}
585
5abf7f7e
ED
/* Build a flow key from @skb when one is given, otherwise from @sk alone. */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}
	build_skb_flow_key(fl4, skb, sk);
}
594
c5038a83 595static DEFINE_SPINLOCK(fnhe_lock);
4895c771 596
2ffae99d
TT
597static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
598{
599 struct rtable *rt;
600
601 rt = rcu_dereference(fnhe->fnhe_rth_input);
602 if (rt) {
603 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
95c47f9c 604 dst_dev_put(&rt->dst);
0830106c 605 dst_release(&rt->dst);
2ffae99d
TT
606 }
607 rt = rcu_dereference(fnhe->fnhe_rth_output);
608 if (rt) {
609 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
95c47f9c 610 dst_dev_put(&rt->dst);
0830106c 611 dst_release(&rt->dst);
2ffae99d
TT
612 }
613}
614
aee06da6 615static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
4895c771
DM
616{
617 struct fib_nh_exception *fnhe, *oldest;
618
619 oldest = rcu_dereference(hash->chain);
620 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
621 fnhe = rcu_dereference(fnhe->fnhe_next)) {
622 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
623 oldest = fnhe;
624 }
2ffae99d 625 fnhe_flush_routes(oldest);
4895c771
DM
626 return oldest;
627}
628
d3a25c98
DM
629static inline u32 fnhe_hashfun(__be32 daddr)
630{
d546c621 631 static u32 fnhe_hashrnd __read_mostly;
d3a25c98
DM
632 u32 hval;
633
d546c621
ED
634 net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
635 hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
636 return hash_32(hval, FNHE_HASH_SHIFT);
d3a25c98
DM
637}
638
387aa65a
TT
639static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
640{
641 rt->rt_pmtu = fnhe->fnhe_pmtu;
a9f829f7 642 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
387aa65a
TT
643 rt->dst.expires = fnhe->fnhe_expires;
644
645 if (fnhe->fnhe_gw) {
646 rt->rt_flags |= RTCF_REDIRECTED;
647 rt->rt_gateway = fnhe->fnhe_gw;
648 rt->rt_uses_gateway = 1;
649 }
650}
651
aee06da6 652static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
a9f829f7 653 u32 pmtu, bool lock, unsigned long expires)
4895c771 654{
aee06da6 655 struct fnhe_hash_bucket *hash;
4895c771 656 struct fib_nh_exception *fnhe;
387aa65a 657 struct rtable *rt;
cebe84c6 658 u32 genid, hval;
387aa65a 659 unsigned int i;
4895c771 660 int depth;
cebe84c6
XL
661
662 genid = fnhe_genid(dev_net(nh->nh_dev));
663 hval = fnhe_hashfun(daddr);
aee06da6 664
c5038a83 665 spin_lock_bh(&fnhe_lock);
4895c771 666
caa41527 667 hash = rcu_dereference(nh->nh_exceptions);
4895c771 668 if (!hash) {
aee06da6 669 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
4895c771 670 if (!hash)
aee06da6 671 goto out_unlock;
caa41527 672 rcu_assign_pointer(nh->nh_exceptions, hash);
4895c771
DM
673 }
674
4895c771
DM
675 hash += hval;
676
677 depth = 0;
678 for (fnhe = rcu_dereference(hash->chain); fnhe;
679 fnhe = rcu_dereference(fnhe->fnhe_next)) {
680 if (fnhe->fnhe_daddr == daddr)
aee06da6 681 break;
4895c771
DM
682 depth++;
683 }
684
aee06da6 685 if (fnhe) {
cebe84c6
XL
686 if (fnhe->fnhe_genid != genid)
687 fnhe->fnhe_genid = genid;
aee06da6
JA
688 if (gw)
689 fnhe->fnhe_gw = gw;
a9f829f7 690 if (pmtu) {
aee06da6 691 fnhe->fnhe_pmtu = pmtu;
a9f829f7
SD
692 fnhe->fnhe_mtu_locked = lock;
693 }
e39d5246 694 fnhe->fnhe_expires = max(1UL, expires);
387aa65a 695 /* Update all cached dsts too */
2ffae99d
TT
696 rt = rcu_dereference(fnhe->fnhe_rth_input);
697 if (rt)
698 fill_route_from_fnhe(rt, fnhe);
699 rt = rcu_dereference(fnhe->fnhe_rth_output);
387aa65a
TT
700 if (rt)
701 fill_route_from_fnhe(rt, fnhe);
aee06da6
JA
702 } else {
703 if (depth > FNHE_RECLAIM_DEPTH)
704 fnhe = fnhe_oldest(hash);
705 else {
706 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
707 if (!fnhe)
708 goto out_unlock;
709
710 fnhe->fnhe_next = hash->chain;
711 rcu_assign_pointer(hash->chain, fnhe);
712 }
cebe84c6 713 fnhe->fnhe_genid = genid;
aee06da6
JA
714 fnhe->fnhe_daddr = daddr;
715 fnhe->fnhe_gw = gw;
716 fnhe->fnhe_pmtu = pmtu;
a9f829f7 717 fnhe->fnhe_mtu_locked = lock;
054d7cb5 718 fnhe->fnhe_expires = max(1UL, expires);
387aa65a
TT
719
720 /* Exception created; mark the cached routes for the nexthop
721 * stale, so anyone caching it rechecks if this exception
722 * applies to them.
723 */
2ffae99d
TT
724 rt = rcu_dereference(nh->nh_rth_input);
725 if (rt)
726 rt->dst.obsolete = DST_OBSOLETE_KILL;
727
387aa65a
TT
728 for_each_possible_cpu(i) {
729 struct rtable __rcu **prt;
730 prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
731 rt = rcu_dereference(*prt);
732 if (rt)
733 rt->dst.obsolete = DST_OBSOLETE_KILL;
734 }
4895c771 735 }
4895c771 736
4895c771 737 fnhe->fnhe_stamp = jiffies;
aee06da6
JA
738
739out_unlock:
c5038a83 740 spin_unlock_bh(&fnhe_lock);
4895c771
DM
741}
742
ceb33206
DM
743static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
744 bool kill_route)
1da177e4 745{
e47a185b 746 __be32 new_gw = icmp_hdr(skb)->un.gateway;
94206125 747 __be32 old_gw = ip_hdr(skb)->saddr;
e47a185b 748 struct net_device *dev = skb->dev;
e47a185b 749 struct in_device *in_dev;
4895c771 750 struct fib_result res;
e47a185b 751 struct neighbour *n;
317805b8 752 struct net *net;
1da177e4 753
94206125
DM
754 switch (icmp_hdr(skb)->code & 7) {
755 case ICMP_REDIR_NET:
756 case ICMP_REDIR_NETTOS:
757 case ICMP_REDIR_HOST:
758 case ICMP_REDIR_HOSTTOS:
759 break;
760
761 default:
762 return;
763 }
764
e47a185b
DM
765 if (rt->rt_gateway != old_gw)
766 return;
767
768 in_dev = __in_dev_get_rcu(dev);
769 if (!in_dev)
770 return;
771
c346dca1 772 net = dev_net(dev);
9d4fb27d
JP
773 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
774 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
775 ipv4_is_zeronet(new_gw))
1da177e4
LT
776 goto reject_redirect;
777
778 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
779 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
780 goto reject_redirect;
781 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
782 goto reject_redirect;
783 } else {
317805b8 784 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1da177e4
LT
785 goto reject_redirect;
786 }
787
969447f2
SSL
788 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
789 if (!n)
790 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
2c1a4311 791 if (!IS_ERR(n)) {
e47a185b
DM
792 if (!(n->nud_state & NUD_VALID)) {
793 neigh_event_send(n, NULL);
794 } else {
0eeb075f 795 if (fib_lookup(net, fl4, &res, 0) == 0) {
4895c771 796 struct fib_nh *nh = &FIB_RES_NH(res);
4895c771 797
aee06da6 798 update_or_create_fnhe(nh, fl4->daddr, new_gw,
a9f829f7
SD
799 0, false,
800 jiffies + ip_rt_gc_timeout);
4895c771 801 }
ceb33206
DM
802 if (kill_route)
803 rt->dst.obsolete = DST_OBSOLETE_KILL;
e47a185b
DM
804 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
805 }
806 neigh_release(n);
807 }
808 return;
809
810reject_redirect:
811#ifdef CONFIG_IP_ROUTE_VERBOSE
99ee038d
DM
812 if (IN_DEV_LOG_MARTIANS(in_dev)) {
813 const struct iphdr *iph = (const struct iphdr *) skb->data;
814 __be32 daddr = iph->daddr;
815 __be32 saddr = iph->saddr;
816
e47a185b
DM
817 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
818 " Advised path = %pI4 -> %pI4\n",
819 &old_gw, dev->name, &new_gw,
820 &saddr, &daddr);
99ee038d 821 }
e47a185b
DM
822#endif
823 ;
824}
825
4895c771
DM
826static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
827{
828 struct rtable *rt;
829 struct flowi4 fl4;
f96ef988 830 const struct iphdr *iph = (const struct iphdr *) skb->data;
7d995694 831 struct net *net = dev_net(skb->dev);
f96ef988
MK
832 int oif = skb->dev->ifindex;
833 u8 tos = RT_TOS(iph->tos);
834 u8 prot = iph->protocol;
835 u32 mark = skb->mark;
4895c771
DM
836
837 rt = (struct rtable *) dst;
838
7d995694 839 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
ceb33206 840 __ip_do_redirect(rt, skb, &fl4, true);
4895c771
DM
841}
842
1da177e4
LT
843static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
844{
ee6b9673 845 struct rtable *rt = (struct rtable *)dst;
1da177e4
LT
846 struct dst_entry *ret = dst;
847
848 if (rt) {
d11a4dc1 849 if (dst->obsolete > 0) {
1da177e4
LT
850 ip_rt_put(rt);
851 ret = NULL;
5943634f
DM
852 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
853 rt->dst.expires) {
89aef892 854 ip_rt_put(rt);
1da177e4
LT
855 ret = NULL;
856 }
857 }
858 return ret;
859}
860
861/*
862 * Algorithm:
863 * 1. The first ip_rt_redirect_number redirects are sent
864 * with exponential backoff, then we stop sending them at all,
865 * assuming that the host ignores our redirects.
866 * 2. If we did not see packets requiring redirects
867 * during ip_rt_redirect_silence, we assume that the host
868 * forgot redirected route and start to send redirects again.
869 *
870 * This algorithm is much cheaper and more intelligent than dumb load limiting
871 * in icmp.c.
872 *
873 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
874 * and "frag. need" (breaks PMTU discovery) in icmp.c.
875 */
876
877void ip_rt_send_redirect(struct sk_buff *skb)
878{
511c3f92 879 struct rtable *rt = skb_rtable(skb);
30038fc6 880 struct in_device *in_dev;
92d86829 881 struct inet_peer *peer;
1d861aa4 882 struct net *net;
30038fc6 883 int log_martians;
192132b9 884 int vif;
1da177e4 885
30038fc6 886 rcu_read_lock();
d8d1f30b 887 in_dev = __in_dev_get_rcu(rt->dst.dev);
30038fc6
ED
888 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
889 rcu_read_unlock();
1da177e4 890 return;
30038fc6
ED
891 }
892 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
385add90 893 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
30038fc6 894 rcu_read_unlock();
1da177e4 895
1d861aa4 896 net = dev_net(rt->dst.dev);
192132b9 897 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
92d86829 898 if (!peer) {
e81da0e1
JA
899 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
900 rt_nexthop(rt, ip_hdr(skb)->daddr));
92d86829
DM
901 return;
902 }
903
1da177e4
LT
904 /* No redirected packets during ip_rt_redirect_silence;
905 * reset the algorithm.
906 */
9ce92d57 907 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
92d86829 908 peer->rate_tokens = 0;
9ce92d57
LB
909 peer->n_redirects = 0;
910 }
1da177e4
LT
911
912 /* Too many ignored redirects; do not send anything
d8d1f30b 913 * set dst.rate_last to the last seen redirected packet.
1da177e4 914 */
9ce92d57 915 if (peer->n_redirects >= ip_rt_redirect_number) {
92d86829 916 peer->rate_last = jiffies;
1d861aa4 917 goto out_put_peer;
1da177e4
LT
918 }
919
920 /* Check for load limit; set rate_last to the latest sent
921 * redirect.
922 */
92d86829 923 if (peer->rate_tokens == 0 ||
14fb8a76 924 time_after(jiffies,
92d86829
DM
925 (peer->rate_last +
926 (ip_rt_redirect_load << peer->rate_tokens)))) {
e81da0e1
JA
927 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
928
929 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
92d86829
DM
930 peer->rate_last = jiffies;
931 ++peer->rate_tokens;
9ce92d57 932 ++peer->n_redirects;
1da177e4 933#ifdef CONFIG_IP_ROUTE_VERBOSE
30038fc6 934 if (log_martians &&
e87cc472
JP
935 peer->rate_tokens == ip_rt_redirect_number)
936 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
92101b3b 937 &ip_hdr(skb)->saddr, inet_iif(skb),
e81da0e1 938 &ip_hdr(skb)->daddr, &gw);
1da177e4
LT
939#endif
940 }
1d861aa4
DM
941out_put_peer:
942 inet_putpeer(peer);
1da177e4
LT
943}
944
945static int ip_error(struct sk_buff *skb)
946{
251da413 947 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
511c3f92 948 struct rtable *rt = skb_rtable(skb);
92d86829 949 struct inet_peer *peer;
1da177e4 950 unsigned long now;
251da413 951 struct net *net;
92d86829 952 bool send;
1da177e4
LT
953 int code;
954
381c759d
EB
955 /* IP on this device is disabled. */
956 if (!in_dev)
957 goto out;
958
251da413
DM
959 net = dev_net(rt->dst.dev);
960 if (!IN_DEV_FORWARD(in_dev)) {
961 switch (rt->dst.error) {
962 case EHOSTUNREACH:
b45386ef 963 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
251da413
DM
964 break;
965
966 case ENETUNREACH:
b45386ef 967 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
251da413
DM
968 break;
969 }
970 goto out;
971 }
972
d8d1f30b 973 switch (rt->dst.error) {
4500ebf8
JP
974 case EINVAL:
975 default:
976 goto out;
977 case EHOSTUNREACH:
978 code = ICMP_HOST_UNREACH;
979 break;
980 case ENETUNREACH:
981 code = ICMP_NET_UNREACH;
b45386ef 982 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
4500ebf8
JP
983 break;
984 case EACCES:
985 code = ICMP_PKT_FILTERED;
986 break;
1da177e4
LT
987 }
988
192132b9 989 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
385add90 990 l3mdev_master_ifindex(skb->dev), 1);
92d86829
DM
991
992 send = true;
993 if (peer) {
994 now = jiffies;
995 peer->rate_tokens += now - peer->rate_last;
996 if (peer->rate_tokens > ip_rt_error_burst)
997 peer->rate_tokens = ip_rt_error_burst;
998 peer->rate_last = now;
999 if (peer->rate_tokens >= ip_rt_error_cost)
1000 peer->rate_tokens -= ip_rt_error_cost;
1001 else
1002 send = false;
1d861aa4 1003 inet_putpeer(peer);
1da177e4 1004 }
92d86829
DM
1005 if (send)
1006 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1da177e4
LT
1007
1008out: kfree_skb(skb);
1009 return 0;
e905a9ed 1010}
1da177e4 1011
d851c12b 1012static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1da177e4 1013{
d851c12b 1014 struct dst_entry *dst = &rt->dst;
969a7f07 1015 u32 old_mtu = ipv4_mtu(dst);
4895c771 1016 struct fib_result res;
a9f829f7 1017 bool lock = false;
2c8cec5c 1018
a9f829f7 1019 if (ip_mtu_locked(dst))
fa1e492a
SK
1020 return;
1021
969a7f07 1022 if (old_mtu < mtu)
3cdaa5be
LW
1023 return;
1024
a9f829f7
SD
1025 if (mtu < ip_rt_min_pmtu) {
1026 lock = true;
969a7f07 1027 mtu = min(old_mtu, ip_rt_min_pmtu);
a9f829f7 1028 }
2c8cec5c 1029
969a7f07 1030 if (rt->rt_pmtu == mtu && !lock &&
f016229e
TT
1031 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1032 return;
1033
c5ae7d41 1034 rcu_read_lock();
0eeb075f 1035 if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
4895c771 1036 struct fib_nh *nh = &FIB_RES_NH(res);
4895c771 1037
a9f829f7 1038 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
aee06da6 1039 jiffies + ip_rt_mtu_expires);
4895c771 1040 }
c5ae7d41 1041 rcu_read_unlock();
1da177e4
LT
1042}
1043
4895c771
DM
1044static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1045 struct sk_buff *skb, u32 mtu)
1046{
1047 struct rtable *rt = (struct rtable *) dst;
1048 struct flowi4 fl4;
1049
1050 ip_rt_build_flow_key(&fl4, sk, skb);
d851c12b 1051 __ip_rt_update_pmtu(rt, &fl4, mtu);
4895c771
DM
1052}
1053
36393395
DM
1054void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1055 int oif, u32 mark, u8 protocol, int flow_flags)
1056{
4895c771 1057 const struct iphdr *iph = (const struct iphdr *) skb->data;
36393395
DM
1058 struct flowi4 fl4;
1059 struct rtable *rt;
1060
1b3c61dc
LC
1061 if (!mark)
1062 mark = IP4_REPLY_MARK(net, skb->mark);
1063
e2d118a1 1064 __build_flow_key(net, &fl4, NULL, iph, oif,
4895c771 1065 RT_TOS(iph->tos), protocol, mark, flow_flags);
36393395
DM
1066 rt = __ip_route_output_key(net, &fl4);
1067 if (!IS_ERR(rt)) {
4895c771 1068 __ip_rt_update_pmtu(rt, &fl4, mtu);
36393395
DM
1069 ip_rt_put(rt);
1070 }
1071}
1072EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1073
9cb3a50c 1074static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
36393395 1075{
4895c771
DM
1076 const struct iphdr *iph = (const struct iphdr *) skb->data;
1077 struct flowi4 fl4;
1078 struct rtable *rt;
36393395 1079
e2d118a1 1080 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1b3c61dc
LC
1081
1082 if (!fl4.flowi4_mark)
1083 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1084
4895c771
DM
1085 rt = __ip_route_output_key(sock_net(sk), &fl4);
1086 if (!IS_ERR(rt)) {
1087 __ip_rt_update_pmtu(rt, &fl4, mtu);
1088 ip_rt_put(rt);
1089 }
36393395 1090}
9cb3a50c
SK
1091
1092void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1093{
1094 const struct iphdr *iph = (const struct iphdr *) skb->data;
1095 struct flowi4 fl4;
1096 struct rtable *rt;
7f502361 1097 struct dst_entry *odst = NULL;
b44108db 1098 bool new = false;
e2d118a1 1099 struct net *net = sock_net(sk);
9cb3a50c
SK
1100
1101 bh_lock_sock(sk);
482fc609
HFS
1102
1103 if (!ip_sk_accept_pmtu(sk))
1104 goto out;
1105
7f502361 1106 odst = sk_dst_get(sk);
9cb3a50c 1107
7f502361 1108 if (sock_owned_by_user(sk) || !odst) {
9cb3a50c
SK
1109 __ipv4_sk_update_pmtu(skb, sk, mtu);
1110 goto out;
1111 }
1112
e2d118a1 1113 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
9cb3a50c 1114
7f502361 1115 rt = (struct rtable *)odst;
51456b29 1116 if (odst->obsolete && !odst->ops->check(odst, 0)) {
9cb3a50c
SK
1117 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1118 if (IS_ERR(rt))
1119 goto out;
b44108db
SK
1120
1121 new = true;
9cb3a50c
SK
1122 }
1123
1124 __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1125
7f502361 1126 if (!dst_check(&rt->dst, 0)) {
b44108db
SK
1127 if (new)
1128 dst_release(&rt->dst);
1129
9cb3a50c
SK
1130 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1131 if (IS_ERR(rt))
1132 goto out;
1133
b44108db 1134 new = true;
9cb3a50c
SK
1135 }
1136
b44108db 1137 if (new)
7f502361 1138 sk_dst_set(sk, &rt->dst);
9cb3a50c
SK
1139
1140out:
1141 bh_unlock_sock(sk);
7f502361 1142 dst_release(odst);
9cb3a50c 1143}
36393395 1144EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
f39925db 1145
b42597e2
DM
1146void ipv4_redirect(struct sk_buff *skb, struct net *net,
1147 int oif, u32 mark, u8 protocol, int flow_flags)
1148{
4895c771 1149 const struct iphdr *iph = (const struct iphdr *) skb->data;
b42597e2
DM
1150 struct flowi4 fl4;
1151 struct rtable *rt;
1152
e2d118a1 1153 __build_flow_key(net, &fl4, NULL, iph, oif,
4895c771 1154 RT_TOS(iph->tos), protocol, mark, flow_flags);
b42597e2
DM
1155 rt = __ip_route_output_key(net, &fl4);
1156 if (!IS_ERR(rt)) {
ceb33206 1157 __ip_do_redirect(rt, skb, &fl4, false);
b42597e2
DM
1158 ip_rt_put(rt);
1159 }
1160}
1161EXPORT_SYMBOL_GPL(ipv4_redirect);
1162
1163void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1164{
4895c771
DM
1165 const struct iphdr *iph = (const struct iphdr *) skb->data;
1166 struct flowi4 fl4;
1167 struct rtable *rt;
e2d118a1 1168 struct net *net = sock_net(sk);
b42597e2 1169
e2d118a1
LC
1170 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1171 rt = __ip_route_output_key(net, &fl4);
4895c771 1172 if (!IS_ERR(rt)) {
ceb33206 1173 __ip_do_redirect(rt, skb, &fl4, false);
4895c771
DM
1174 ip_rt_put(rt);
1175 }
b42597e2
DM
1176}
1177EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1178
efbc368d
DM
1179static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1180{
1181 struct rtable *rt = (struct rtable *) dst;
1182
ceb33206
DM
1183 /* All IPV4 dsts are created with ->obsolete set to the value
1184 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1185 * into this function always.
1186 *
387aa65a
TT
1187 * When a PMTU/redirect information update invalidates a route,
1188 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1189 * DST_OBSOLETE_DEAD by dst_free().
ceb33206 1190 */
387aa65a 1191 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
efbc368d 1192 return NULL;
d11a4dc1 1193 return dst;
1da177e4
LT
1194}
1195
1da177e4
LT
1196static void ipv4_link_failure(struct sk_buff *skb)
1197{
1198 struct rtable *rt;
1199
1200 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1201
511c3f92 1202 rt = skb_rtable(skb);
5943634f
DM
1203 if (rt)
1204 dst_set_expires(&rt->dst, 0);
1da177e4
LT
1205}
1206
ede2059d 1207static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1da177e4 1208{
91df42be
JP
1209 pr_debug("%s: %pI4 -> %pI4, %s\n",
1210 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1211 skb->dev ? skb->dev->name : "?");
1da177e4 1212 kfree_skb(skb);
c378a9c0 1213 WARN_ON(1);
1da177e4
LT
1214 return 0;
1215}
1216
1217/*
1218 We do not cache source address of outgoing interface,
1219 because it is used only by IP RR, TS and SRR options,
1220 so that it out of fast path.
1221
1222 BTW remember: "addr" is allowed to be not aligned
1223 in IP options!
1224 */
1225
8e36360a 1226void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1da177e4 1227{
a61ced5d 1228 __be32 src;
1da177e4 1229
c7537967 1230 if (rt_is_output_route(rt))
c5be24ff 1231 src = ip_hdr(skb)->saddr;
ebc0ffae 1232 else {
8e36360a
DM
1233 struct fib_result res;
1234 struct flowi4 fl4;
1235 struct iphdr *iph;
1236
1237 iph = ip_hdr(skb);
1238
1239 memset(&fl4, 0, sizeof(fl4));
1240 fl4.daddr = iph->daddr;
1241 fl4.saddr = iph->saddr;
b0fe4a31 1242 fl4.flowi4_tos = RT_TOS(iph->tos);
8e36360a
DM
1243 fl4.flowi4_oif = rt->dst.dev->ifindex;
1244 fl4.flowi4_iif = skb->dev->ifindex;
1245 fl4.flowi4_mark = skb->mark;
5e2b61f7 1246
ebc0ffae 1247 rcu_read_lock();
0eeb075f 1248 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
436c3b66 1249 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
ebc0ffae 1250 else
f8126f1d
DM
1251 src = inet_select_addr(rt->dst.dev,
1252 rt_nexthop(rt, iph->daddr),
1253 RT_SCOPE_UNIVERSE);
ebc0ffae
ED
1254 rcu_read_unlock();
1255 }
1da177e4
LT
1256 memcpy(addr, &src, 4);
1257}
1258
c7066f70 1259#ifdef CONFIG_IP_ROUTE_CLASSID
1da177e4
LT
1260static void set_class_tag(struct rtable *rt, u32 tag)
1261{
d8d1f30b
CG
1262 if (!(rt->dst.tclassid & 0xFFFF))
1263 rt->dst.tclassid |= tag & 0xFFFF;
1264 if (!(rt->dst.tclassid & 0xFFFF0000))
1265 rt->dst.tclassid |= tag & 0xFFFF0000;
1da177e4
LT
1266}
1267#endif
1268
0dbaee3b
DM
1269static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1270{
7ed14d97 1271 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
164a5e7a 1272 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
7ed14d97 1273 ip_rt_min_advmss);
0dbaee3b 1274
7ed14d97 1275 return min(advmss, IPV4_MAX_PMTU - header_size);
0dbaee3b
DM
1276}
1277
ebb762f2 1278static unsigned int ipv4_mtu(const struct dst_entry *dst)
d33e4553 1279{
261663b0 1280 const struct rtable *rt = (const struct rtable *) dst;
5943634f
DM
1281 unsigned int mtu = rt->rt_pmtu;
1282
98d75c37 1283 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
5943634f 1284 mtu = dst_metric_raw(dst, RTAX_MTU);
618f9bc7 1285
38d523e2 1286 if (mtu)
618f9bc7
SK
1287 return mtu;
1288
c780a049 1289 mtu = READ_ONCE(dst->dev->mtu);
d33e4553 1290
a9f829f7 1291 if (unlikely(ip_mtu_locked(dst))) {
155e8336 1292 if (rt->rt_uses_gateway && mtu > 576)
d33e4553
DM
1293 mtu = 576;
1294 }
1295
14972cbd
RP
1296 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1297
1298 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
d33e4553
DM
1299}
1300
054d7cb5
JA
1301static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1302{
1303 struct fnhe_hash_bucket *hash;
1304 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1305 u32 hval = fnhe_hashfun(daddr);
1306
1307 spin_lock_bh(&fnhe_lock);
1308
1309 hash = rcu_dereference_protected(nh->nh_exceptions,
1310 lockdep_is_held(&fnhe_lock));
1311 hash += hval;
1312
1313 fnhe_p = &hash->chain;
1314 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1315 while (fnhe) {
1316 if (fnhe->fnhe_daddr == daddr) {
1317 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1318 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1319 fnhe_flush_routes(fnhe);
1320 kfree_rcu(fnhe, rcu);
1321 break;
1322 }
1323 fnhe_p = &fnhe->fnhe_next;
1324 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1325 lockdep_is_held(&fnhe_lock));
1326 }
1327
1328 spin_unlock_bh(&fnhe_lock);
1329}
1330
f2bb4bed 1331static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
4895c771 1332{
caa41527 1333 struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
4895c771
DM
1334 struct fib_nh_exception *fnhe;
1335 u32 hval;
1336
f2bb4bed
DM
1337 if (!hash)
1338 return NULL;
1339
d3a25c98 1340 hval = fnhe_hashfun(daddr);
4895c771
DM
1341
1342 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1343 fnhe = rcu_dereference(fnhe->fnhe_next)) {
054d7cb5
JA
1344 if (fnhe->fnhe_daddr == daddr) {
1345 if (fnhe->fnhe_expires &&
1346 time_after(jiffies, fnhe->fnhe_expires)) {
1347 ip_del_fnhe(nh, daddr);
1348 break;
1349 }
f2bb4bed 1350 return fnhe;
054d7cb5 1351 }
f2bb4bed
DM
1352 }
1353 return NULL;
1354}
aee06da6 1355
caacf05e 1356static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
a4c2fd7f 1357 __be32 daddr, const bool do_cache)
f2bb4bed 1358{
caacf05e
DM
1359 bool ret = false;
1360
c5038a83 1361 spin_lock_bh(&fnhe_lock);
f2bb4bed 1362
c5038a83 1363 if (daddr == fnhe->fnhe_daddr) {
2ffae99d
TT
1364 struct rtable __rcu **porig;
1365 struct rtable *orig;
5aad1de5 1366 int genid = fnhe_genid(dev_net(rt->dst.dev));
2ffae99d
TT
1367
1368 if (rt_is_input_route(rt))
1369 porig = &fnhe->fnhe_rth_input;
1370 else
1371 porig = &fnhe->fnhe_rth_output;
1372 orig = rcu_dereference(*porig);
5aad1de5
TT
1373
1374 if (fnhe->fnhe_genid != genid) {
1375 fnhe->fnhe_genid = genid;
13d82bf5
SK
1376 fnhe->fnhe_gw = 0;
1377 fnhe->fnhe_pmtu = 0;
1378 fnhe->fnhe_expires = 0;
2ffae99d
TT
1379 fnhe_flush_routes(fnhe);
1380 orig = NULL;
13d82bf5 1381 }
387aa65a
TT
1382 fill_route_from_fnhe(rt, fnhe);
1383 if (!rt->rt_gateway)
155e8336 1384 rt->rt_gateway = daddr;
f2bb4bed 1385
a4c2fd7f 1386 if (do_cache) {
0830106c 1387 dst_hold(&rt->dst);
2ffae99d 1388 rcu_assign_pointer(*porig, rt);
0830106c 1389 if (orig) {
95c47f9c 1390 dst_dev_put(&orig->dst);
0830106c 1391 dst_release(&orig->dst);
0830106c 1392 }
2ffae99d
TT
1393 ret = true;
1394 }
c5038a83
DM
1395
1396 fnhe->fnhe_stamp = jiffies;
c5038a83
DM
1397 }
1398 spin_unlock_bh(&fnhe_lock);
caacf05e
DM
1399
1400 return ret;
54764bb6
ED
1401}
1402
caacf05e 1403static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
f2bb4bed 1404{
d26b3a7c 1405 struct rtable *orig, *prev, **p;
caacf05e 1406 bool ret = true;
f2bb4bed 1407
d26b3a7c 1408 if (rt_is_input_route(rt)) {
54764bb6 1409 p = (struct rtable **)&nh->nh_rth_input;
d26b3a7c 1410 } else {
903ceff7 1411 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
d26b3a7c 1412 }
f2bb4bed
DM
1413 orig = *p;
1414
0830106c
WW
1415 /* hold dst before doing cmpxchg() to avoid race condition
1416 * on this dst
1417 */
1418 dst_hold(&rt->dst);
f2bb4bed
DM
1419 prev = cmpxchg(p, orig, rt);
1420 if (prev == orig) {
0830106c 1421 if (orig) {
95c47f9c 1422 dst_dev_put(&orig->dst);
0830106c 1423 dst_release(&orig->dst);
0830106c
WW
1424 }
1425 } else {
1426 dst_release(&rt->dst);
caacf05e 1427 ret = false;
0830106c 1428 }
caacf05e
DM
1429
1430 return ret;
1431}
1432
5055c371
ED
1433struct uncached_list {
1434 spinlock_t lock;
1435 struct list_head head;
1436};
1437
1438static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
caacf05e 1439
31a5b09d 1440void rt_add_uncached_list(struct rtable *rt)
caacf05e 1441{
5055c371
ED
1442 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1443
1444 rt->rt_uncached_list = ul;
1445
1446 spin_lock_bh(&ul->lock);
1447 list_add_tail(&rt->rt_uncached, &ul->head);
1448 spin_unlock_bh(&ul->lock);
caacf05e
DM
1449}
1450
31a5b09d 1451void rt_del_uncached_list(struct rtable *rt)
caacf05e 1452{
78df76a0 1453 if (!list_empty(&rt->rt_uncached)) {
5055c371
ED
1454 struct uncached_list *ul = rt->rt_uncached_list;
1455
1456 spin_lock_bh(&ul->lock);
caacf05e 1457 list_del(&rt->rt_uncached);
5055c371 1458 spin_unlock_bh(&ul->lock);
caacf05e
DM
1459 }
1460}
1461
31a5b09d
XL
1462static void ipv4_dst_destroy(struct dst_entry *dst)
1463{
1464 struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1465 struct rtable *rt = (struct rtable *)dst;
1466
1467 if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1468 kfree(p);
1469
1470 rt_del_uncached_list(rt);
1471}
1472
caacf05e
DM
1473void rt_flush_dev(struct net_device *dev)
1474{
5055c371
ED
1475 struct net *net = dev_net(dev);
1476 struct rtable *rt;
1477 int cpu;
1478
1479 for_each_possible_cpu(cpu) {
1480 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
caacf05e 1481
5055c371
ED
1482 spin_lock_bh(&ul->lock);
1483 list_for_each_entry(rt, &ul->head, rt_uncached) {
caacf05e
DM
1484 if (rt->dst.dev != dev)
1485 continue;
1486 rt->dst.dev = net->loopback_dev;
1487 dev_hold(rt->dst.dev);
1488 dev_put(dev);
1489 }
5055c371 1490 spin_unlock_bh(&ul->lock);
4895c771
DM
1491 }
1492}
1493
4331debc 1494static bool rt_cache_valid(const struct rtable *rt)
d2d68ba9 1495{
4331debc
ED
1496 return rt &&
1497 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1498 !rt_is_expired(rt);
d2d68ba9
DM
1499}
1500
f2bb4bed 1501static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
5e2b61f7 1502 const struct fib_result *res,
f2bb4bed 1503 struct fib_nh_exception *fnhe,
a4c2fd7f
WW
1504 struct fib_info *fi, u16 type, u32 itag,
1505 const bool do_cache)
1da177e4 1506{
caacf05e
DM
1507 bool cached = false;
1508
1da177e4 1509 if (fi) {
4895c771
DM
1510 struct fib_nh *nh = &FIB_RES_NH(*res);
1511
155e8336 1512 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
4895c771 1513 rt->rt_gateway = nh->nh_gw;
155e8336
JA
1514 rt->rt_uses_gateway = 1;
1515 }
3fb07daf
ED
1516 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1517 if (fi->fib_metrics != &dst_default_metrics) {
1518 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
9620fef2 1519 refcount_inc(&fi->fib_metrics->refcnt);
3fb07daf 1520 }
c7066f70 1521#ifdef CONFIG_IP_ROUTE_CLASSID
f2bb4bed 1522 rt->dst.tclassid = nh->nh_tclassid;
1da177e4 1523#endif
61adedf3 1524 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
c5038a83 1525 if (unlikely(fnhe))
a4c2fd7f
WW
1526 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1527 else if (do_cache)
caacf05e 1528 cached = rt_cache_route(nh, rt);
155e8336
JA
1529 if (unlikely(!cached)) {
1530 /* Routes we intend to cache in nexthop exception or
1531 * FIB nexthop have the DST_NOCACHE bit clear.
1532 * However, if we are unsuccessful at storing this
1533 * route into the cache we really need to set it.
1534 */
155e8336
JA
1535 if (!rt->rt_gateway)
1536 rt->rt_gateway = daddr;
1537 rt_add_uncached_list(rt);
1538 }
1539 } else
caacf05e 1540 rt_add_uncached_list(rt);
defb3519 1541
c7066f70 1542#ifdef CONFIG_IP_ROUTE_CLASSID
1da177e4 1543#ifdef CONFIG_IP_MULTIPLE_TABLES
85b91b03 1544 set_class_tag(rt, res->tclassid);
1da177e4
LT
1545#endif
1546 set_class_tag(rt, itag);
1547#endif
1da177e4
LT
1548}
1549
9ab179d8
DA
1550struct rtable *rt_dst_alloc(struct net_device *dev,
1551 unsigned int flags, u16 type,
1552 bool nopolicy, bool noxfrm, bool will_cache)
0c4dcd58 1553{
d08c4f35
DA
1554 struct rtable *rt;
1555
1556 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
a4c2fd7f 1557 (will_cache ? 0 : DST_HOST) |
d08c4f35 1558 (nopolicy ? DST_NOPOLICY : 0) |
b2a9c0ed 1559 (noxfrm ? DST_NOXFRM : 0));
d08c4f35
DA
1560
1561 if (rt) {
1562 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1563 rt->rt_flags = flags;
1564 rt->rt_type = type;
1565 rt->rt_is_input = 0;
1566 rt->rt_iif = 0;
1567 rt->rt_pmtu = 0;
a9f829f7 1568 rt->rt_mtu_locked = 0;
d08c4f35
DA
1569 rt->rt_gateway = 0;
1570 rt->rt_uses_gateway = 0;
b7503e0c 1571 rt->rt_table_id = 0;
d08c4f35
DA
1572 INIT_LIST_HEAD(&rt->rt_uncached);
1573
1574 rt->dst.output = ip_output;
1575 if (flags & RTCF_LOCAL)
1576 rt->dst.input = ip_local_deliver;
1577 }
1578
1579 return rt;
0c4dcd58 1580}
9ab179d8 1581EXPORT_SYMBOL(rt_dst_alloc);
0c4dcd58 1582
96d36220 1583/* called in rcu_read_lock() section */
bc044e8d
PA
1584int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1585 u8 tos, struct net_device *dev,
1586 struct in_device *in_dev, u32 *itag)
1da177e4 1587{
b5f7e755 1588 int err;
1da177e4
LT
1589
1590 /* Primary sanity checks. */
51456b29 1591 if (!in_dev)
1da177e4
LT
1592 return -EINVAL;
1593
1e637c74 1594 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
d0daebc3 1595 skb->protocol != htons(ETH_P_IP))
bc044e8d 1596 return -EINVAL;
1da177e4 1597
75fea73d 1598 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
bc044e8d 1599 return -EINVAL;
d0daebc3 1600
f97c1e0c
JP
1601 if (ipv4_is_zeronet(saddr)) {
1602 if (!ipv4_is_local_multicast(daddr))
bc044e8d 1603 return -EINVAL;
b5f7e755 1604 } else {
9e56e380 1605 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
bc044e8d 1606 in_dev, itag);
b5f7e755 1607 if (err < 0)
bc044e8d 1608 return err;
b5f7e755 1609 }
bc044e8d
PA
1610 return 0;
1611}
1612
1613/* called in rcu_read_lock() section */
1614static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1615 u8 tos, struct net_device *dev, int our)
1616{
1617 struct in_device *in_dev = __in_dev_get_rcu(dev);
1618 unsigned int flags = RTCF_MULTICAST;
1619 struct rtable *rth;
1620 u32 itag = 0;
1621 int err;
1622
1623 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1624 if (err)
1625 return err;
1626
d08c4f35
DA
1627 if (our)
1628 flags |= RTCF_LOCAL;
1629
1630 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
f2bb4bed 1631 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1da177e4 1632 if (!rth)
bc044e8d 1633 return -ENOBUFS;
1da177e4 1634
cf911662
DM
1635#ifdef CONFIG_IP_ROUTE_CLASSID
1636 rth->dst.tclassid = itag;
1637#endif
d8d1f30b 1638 rth->dst.output = ip_rt_bug;
9917e1e8 1639 rth->rt_is_input= 1;
1da177e4
LT
1640
1641#ifdef CONFIG_IP_MROUTE
f97c1e0c 1642 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
d8d1f30b 1643 rth->dst.input = ip_mr_input;
1da177e4
LT
1644#endif
1645 RT_CACHE_STAT_INC(in_slow_mc);
1646
89aef892
DM
1647 skb_dst_set(skb, &rth->dst);
1648 return 0;
1da177e4
LT
1649}
1650
1651
1652static void ip_handle_martian_source(struct net_device *dev,
1653 struct in_device *in_dev,
1654 struct sk_buff *skb,
9e12bb22
AV
1655 __be32 daddr,
1656 __be32 saddr)
1da177e4
LT
1657{
1658 RT_CACHE_STAT_INC(in_martian_src);
1659#ifdef CONFIG_IP_ROUTE_VERBOSE
1660 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1661 /*
1662 * RFC1812 recommendation, if source is martian,
1663 * the only hint is MAC header.
1664 */
058bd4d2 1665 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
673d57e7 1666 &daddr, &saddr, dev->name);
98e399f8 1667 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
058bd4d2
JP
1668 print_hex_dump(KERN_WARNING, "ll header: ",
1669 DUMP_PREFIX_OFFSET, 16, 1,
1670 skb_mac_header(skb),
1671 dev->hard_header_len, true);
1da177e4
LT
1672 }
1673 }
1674#endif
1675}
1676
efd85700
TG
1677static void set_lwt_redirect(struct rtable *rth)
1678{
1679 if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1680 rth->dst.lwtstate->orig_output = rth->dst.output;
1681 rth->dst.output = lwtunnel_output;
1682 }
1683
1684 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1685 rth->dst.lwtstate->orig_input = rth->dst.input;
1686 rth->dst.input = lwtunnel_input;
1687 }
1688}
1689
47360228 1690/* called in rcu_read_lock() section */
5969f71d 1691static int __mkroute_input(struct sk_buff *skb,
982721f3 1692 const struct fib_result *res,
5969f71d 1693 struct in_device *in_dev,
c6cffba4 1694 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1695{
2ffae99d 1696 struct fib_nh_exception *fnhe;
1da177e4
LT
1697 struct rtable *rth;
1698 int err;
1699 struct in_device *out_dev;
d2d68ba9 1700 bool do_cache;
fbdc0ad0 1701 u32 itag = 0;
1da177e4
LT
1702
1703 /* get a working reference to the output device */
47360228 1704 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
51456b29 1705 if (!out_dev) {
e87cc472 1706 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1da177e4
LT
1707 return -EINVAL;
1708 }
1709
5c04c819 1710 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
9e56e380 1711 in_dev->dev, in_dev, &itag);
1da177e4 1712 if (err < 0) {
e905a9ed 1713 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1da177e4 1714 saddr);
e905a9ed 1715
1da177e4
LT
1716 goto cleanup;
1717 }
1718
e81da0e1
JA
1719 do_cache = res->fi && !itag;
1720 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
df4d9254 1721 skb->protocol == htons(ETH_P_IP) &&
1da177e4 1722 (IN_DEV_SHARED_MEDIA(out_dev) ||
df4d9254
HFS
1723 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1724 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1da177e4
LT
1725
1726 if (skb->protocol != htons(ETH_P_IP)) {
1727 /* Not IP (i.e. ARP). Do not create route, if it is
1728 * invalid for proxy arp. DNAT routes are always valid.
65324144
JDB
1729 *
1730 * Proxy arp feature have been extended to allow, ARP
1731 * replies back to the same interface, to support
1732 * Private VLAN switch technologies. See arp.c.
1da177e4 1733 */
65324144
JDB
1734 if (out_dev == in_dev &&
1735 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1da177e4
LT
1736 err = -EINVAL;
1737 goto cleanup;
1738 }
1739 }
1740
2ffae99d 1741 fnhe = find_exception(&FIB_RES_NH(*res), daddr);
e81da0e1 1742 if (do_cache) {
054d7cb5 1743 if (fnhe)
2ffae99d 1744 rth = rcu_dereference(fnhe->fnhe_rth_input);
054d7cb5
JA
1745 else
1746 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
e81da0e1
JA
1747 if (rt_cache_valid(rth)) {
1748 skb_dst_set_noref(skb, &rth->dst);
1749 goto out;
d2d68ba9
DM
1750 }
1751 }
f2bb4bed 1752
d08c4f35 1753 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
5c1e6aa3 1754 IN_DEV_CONF_GET(in_dev, NOPOLICY),
d2d68ba9 1755 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1da177e4
LT
1756 if (!rth) {
1757 err = -ENOBUFS;
1758 goto cleanup;
1759 }
1760
9917e1e8 1761 rth->rt_is_input = 1;
b7503e0c
DA
1762 if (res->table)
1763 rth->rt_table_id = res->table->tb_id;
a6254864 1764 RT_CACHE_STAT_INC(in_slow_tot);
1da177e4 1765
d8d1f30b 1766 rth->dst.input = ip_forward;
1da177e4 1767
a4c2fd7f
WW
1768 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1769 do_cache);
efd85700 1770 set_lwt_redirect(rth);
c6cffba4 1771 skb_dst_set(skb, &rth->dst);
d2d68ba9 1772out:
1da177e4
LT
1773 err = 0;
1774 cleanup:
1da177e4 1775 return err;
e905a9ed 1776}
1da177e4 1777
79a13159 1778#ifdef CONFIG_IP_ROUTE_MULTIPATH
79a13159 1779/* To make ICMP packets follow the right flow, the multipath hash is
bf4e0a3d 1780 * calculated from the inner IP addresses.
79a13159 1781 */
bf4e0a3d
NA
1782static void ip_multipath_l3_keys(const struct sk_buff *skb,
1783 struct flow_keys *hash_keys)
79a13159
PN
1784{
1785 const struct iphdr *outer_iph = ip_hdr(skb);
bf4e0a3d 1786 const struct iphdr *inner_iph;
79a13159
PN
1787 const struct icmphdr *icmph;
1788 struct iphdr _inner_iph;
bf4e0a3d
NA
1789 struct icmphdr _icmph;
1790
1791 hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1792 hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1793 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1794 return;
79a13159
PN
1795
1796 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
bf4e0a3d 1797 return;
79a13159
PN
1798
1799 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1800 &_icmph);
1801 if (!icmph)
bf4e0a3d 1802 return;
79a13159
PN
1803
1804 if (icmph->type != ICMP_DEST_UNREACH &&
1805 icmph->type != ICMP_REDIRECT &&
1806 icmph->type != ICMP_TIME_EXCEEDED &&
bf4e0a3d
NA
1807 icmph->type != ICMP_PARAMETERPROB)
1808 return;
79a13159
PN
1809
1810 inner_iph = skb_header_pointer(skb,
1811 outer_iph->ihl * 4 + sizeof(_icmph),
1812 sizeof(_inner_iph), &_inner_iph);
1813 if (!inner_iph)
bf4e0a3d
NA
1814 return;
1815 hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1816 hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1817}
79a13159 1818
bf4e0a3d
NA
1819/* if skb is set it will be used and fl4 can be NULL */
1820int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1821 const struct sk_buff *skb)
1822{
1823 struct net *net = fi->fib_net;
1824 struct flow_keys hash_keys;
1825 u32 mhash;
79a13159 1826
bf4e0a3d
NA
1827 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1828 case 0:
1829 memset(&hash_keys, 0, sizeof(hash_keys));
1830 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1831 if (skb) {
1832 ip_multipath_l3_keys(skb, &hash_keys);
1833 } else {
1834 hash_keys.addrs.v4addrs.src = fl4->saddr;
1835 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1836 }
1837 break;
1838 case 1:
1839 /* skb is currently provided only when forwarding */
1840 if (skb) {
1841 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1842 struct flow_keys keys;
1843
1844 /* short-circuit if we already have L4 hash present */
1845 if (skb->l4_hash)
1846 return skb_get_hash_raw(skb) >> 1;
1847 memset(&hash_keys, 0, sizeof(hash_keys));
1848 skb_flow_dissect_flow_keys(skb, &keys, flag);
d41928b3
DA
1849
1850 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
bf4e0a3d
NA
1851 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1852 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1853 hash_keys.ports.src = keys.ports.src;
1854 hash_keys.ports.dst = keys.ports.dst;
1855 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1856 } else {
1857 memset(&hash_keys, 0, sizeof(hash_keys));
1858 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1859 hash_keys.addrs.v4addrs.src = fl4->saddr;
1860 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1861 hash_keys.ports.src = fl4->fl4_sport;
1862 hash_keys.ports.dst = fl4->fl4_dport;
1863 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1864 }
1865 break;
1866 }
1867 mhash = flow_hash_from_keys(&hash_keys);
79a13159 1868
bf4e0a3d
NA
1869 return mhash >> 1;
1870}
1871EXPORT_SYMBOL_GPL(fib_multipath_hash);
79a13159
PN
1872#endif /* CONFIG_IP_ROUTE_MULTIPATH */
1873
5969f71d
SH
1874static int ip_mkroute_input(struct sk_buff *skb,
1875 struct fib_result *res,
5969f71d
SH
1876 struct in_device *in_dev,
1877 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1878{
1da177e4 1879#ifdef CONFIG_IP_ROUTE_MULTIPATH
0e884c78 1880 if (res->fi && res->fi->fib_nhs > 1) {
bf4e0a3d 1881 int h = fib_multipath_hash(res->fi, NULL, skb);
0e884c78 1882
0e884c78
PN
1883 fib_select_multipath(res, h);
1884 }
1da177e4
LT
1885#endif
1886
1887 /* create a routing cache entry */
c6cffba4 1888 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1da177e4
LT
1889}
1890
1da177e4
LT
1891/*
1892 * NOTE. We drop all the packets that has local source
1893 * addresses, because every properly looped back packet
1894 * must have correct destination already attached by output routine.
1895 *
1896 * Such approach solves two big problems:
1897 * 1. Not simplex devices are handled properly.
1898 * 2. IP spoofing attempts are filtered with 100% of guarantee.
ebc0ffae 1899 * called with rcu_read_lock()
1da177e4
LT
1900 */
1901
9e12bb22 1902static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
5510cdf7
DA
1903 u8 tos, struct net_device *dev,
1904 struct fib_result *res)
1da177e4 1905{
96d36220 1906 struct in_device *in_dev = __in_dev_get_rcu(dev);
1b7179d3 1907 struct ip_tunnel_info *tun_info;
68a5e3dd 1908 struct flowi4 fl4;
95c96174 1909 unsigned int flags = 0;
1da177e4 1910 u32 itag = 0;
95c96174 1911 struct rtable *rth;
1da177e4 1912 int err = -EINVAL;
5e73ea1a 1913 struct net *net = dev_net(dev);
d2d68ba9 1914 bool do_cache;
1da177e4
LT
1915
1916 /* IP on this device is disabled. */
1917
1918 if (!in_dev)
1919 goto out;
1920
1921 /* Check for the most weird martians, which can be not detected
1922 by fib_lookup.
1923 */
1924
61adedf3 1925 tun_info = skb_tunnel_info(skb);
46fa062a 1926 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1b7179d3
TG
1927 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1928 else
1929 fl4.flowi4_tun_key.tun_id = 0;
f38a9eb1
TG
1930 skb_dst_drop(skb);
1931
d0daebc3 1932 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1da177e4
LT
1933 goto martian_source;
1934
5510cdf7
DA
1935 res->fi = NULL;
1936 res->table = NULL;
27a954bd 1937 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1da177e4
LT
1938 goto brd_input;
1939
1940 /* Accept zero addresses only to limited broadcast;
1941 * I even do not know to fix it or not. Waiting for complains :-)
1942 */
f97c1e0c 1943 if (ipv4_is_zeronet(saddr))
1da177e4
LT
1944 goto martian_source;
1945
d0daebc3 1946 if (ipv4_is_zeronet(daddr))
1da177e4
LT
1947 goto martian_destination;
1948
9eb43e76
ED
1949 /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1950 * and call it once if daddr or/and saddr are loopback addresses
1951 */
1952 if (ipv4_is_loopback(daddr)) {
1953 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3 1954 goto martian_destination;
9eb43e76
ED
1955 } else if (ipv4_is_loopback(saddr)) {
1956 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3
TG
1957 goto martian_source;
1958 }
1959
1da177e4
LT
1960 /*
1961 * Now we are ready to route packet.
1962 */
68a5e3dd 1963 fl4.flowi4_oif = 0;
e0d56fdd 1964 fl4.flowi4_iif = dev->ifindex;
68a5e3dd
DM
1965 fl4.flowi4_mark = skb->mark;
1966 fl4.flowi4_tos = tos;
1967 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
b84f7878 1968 fl4.flowi4_flags = 0;
68a5e3dd
DM
1969 fl4.daddr = daddr;
1970 fl4.saddr = saddr;
8bcfd092 1971 fl4.flowi4_uid = sock_net_uid(net, NULL);
5510cdf7 1972 err = fib_lookup(net, &fl4, res, 0);
cd0f0b95
DJ
1973 if (err != 0) {
1974 if (!IN_DEV_FORWARD(in_dev))
1975 err = -EHOSTUNREACH;
1da177e4 1976 goto no_route;
cd0f0b95 1977 }
1da177e4 1978
5510cdf7 1979 if (res->type == RTN_BROADCAST)
1da177e4
LT
1980 goto brd_input;
1981
5510cdf7 1982 if (res->type == RTN_LOCAL) {
5c04c819 1983 err = fib_validate_source(skb, saddr, daddr, tos,
0d5edc68 1984 0, dev, in_dev, &itag);
b5f7e755 1985 if (err < 0)
0d753960 1986 goto martian_source;
1da177e4
LT
1987 goto local_input;
1988 }
1989
cd0f0b95
DJ
1990 if (!IN_DEV_FORWARD(in_dev)) {
1991 err = -EHOSTUNREACH;
251da413 1992 goto no_route;
cd0f0b95 1993 }
5510cdf7 1994 if (res->type != RTN_UNICAST)
1da177e4
LT
1995 goto martian_destination;
1996
5510cdf7 1997 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1da177e4
LT
1998out: return err;
1999
2000brd_input:
2001 if (skb->protocol != htons(ETH_P_IP))
2002 goto e_inval;
2003
41347dcd 2004 if (!ipv4_is_zeronet(saddr)) {
9e56e380
DM
2005 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2006 in_dev, &itag);
1da177e4 2007 if (err < 0)
0d753960 2008 goto martian_source;
1da177e4
LT
2009 }
2010 flags |= RTCF_BROADCAST;
5510cdf7 2011 res->type = RTN_BROADCAST;
1da177e4
LT
2012 RT_CACHE_STAT_INC(in_brd);
2013
2014local_input:
d2d68ba9 2015 do_cache = false;
5510cdf7 2016 if (res->fi) {
fe3edf45 2017 if (!itag) {
5510cdf7 2018 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
d2d68ba9 2019 if (rt_cache_valid(rth)) {
c6cffba4
DM
2020 skb_dst_set_noref(skb, &rth->dst);
2021 err = 0;
2022 goto out;
d2d68ba9
DM
2023 }
2024 do_cache = true;
2025 }
2026 }
2027
f5a0aab8 2028 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
5510cdf7 2029 flags | RTCF_LOCAL, res->type,
d2d68ba9 2030 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1da177e4
LT
2031 if (!rth)
2032 goto e_nobufs;
2033
d8d1f30b 2034 rth->dst.output= ip_rt_bug;
cf911662
DM
2035#ifdef CONFIG_IP_ROUTE_CLASSID
2036 rth->dst.tclassid = itag;
2037#endif
9917e1e8 2038 rth->rt_is_input = 1;
5510cdf7
DA
2039 if (res->table)
2040 rth->rt_table_id = res->table->tb_id;
571e7226 2041
a6254864 2042 RT_CACHE_STAT_INC(in_slow_tot);
5510cdf7 2043 if (res->type == RTN_UNREACHABLE) {
d8d1f30b
CG
2044 rth->dst.input= ip_error;
2045 rth->dst.error= -err;
1da177e4
LT
2046 rth->rt_flags &= ~RTCF_LOCAL;
2047 }
efd85700 2048
dcdfdf56 2049 if (do_cache) {
5510cdf7 2050 struct fib_nh *nh = &FIB_RES_NH(*res);
efd85700
TG
2051
2052 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2053 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2054 WARN_ON(rth->dst.input == lwtunnel_input);
2055 rth->dst.lwtstate->orig_input = rth->dst.input;
2056 rth->dst.input = lwtunnel_input;
2057 }
2058
a4c2fd7f 2059 if (unlikely(!rt_cache_route(nh, rth)))
dcdfdf56 2060 rt_add_uncached_list(rth);
dcdfdf56 2061 }
89aef892 2062 skb_dst_set(skb, &rth->dst);
b23dd4fe 2063 err = 0;
ebc0ffae 2064 goto out;
1da177e4
LT
2065
2066no_route:
2067 RT_CACHE_STAT_INC(in_no_route);
5510cdf7
DA
2068 res->type = RTN_UNREACHABLE;
2069 res->fi = NULL;
2070 res->table = NULL;
1da177e4
LT
2071 goto local_input;
2072
2073 /*
2074 * Do not cache martian addresses: they should be logged (RFC1812)
2075 */
2076martian_destination:
2077 RT_CACHE_STAT_INC(in_martian_dst);
2078#ifdef CONFIG_IP_ROUTE_VERBOSE
e87cc472
JP
2079 if (IN_DEV_LOG_MARTIANS(in_dev))
2080 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2081 &daddr, &saddr, dev->name);
1da177e4 2082#endif
2c2910a4 2083
1da177e4
LT
2084e_inval:
2085 err = -EINVAL;
ebc0ffae 2086 goto out;
1da177e4
LT
2087
2088e_nobufs:
2089 err = -ENOBUFS;
ebc0ffae 2090 goto out;
1da177e4
LT
2091
2092martian_source:
2093 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
ebc0ffae 2094 goto out;
1da177e4
LT
2095}
2096
c6cffba4
DM
2097int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2098 u8 tos, struct net_device *dev)
1da177e4 2099{
5510cdf7
DA
2100 struct fib_result res;
2101 int err;
1da177e4 2102
6e28099d 2103 tos &= IPTOS_RT_MASK;
96d36220 2104 rcu_read_lock();
5510cdf7
DA
2105 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2106 rcu_read_unlock();
96d36220 2107
5510cdf7
DA
2108 return err;
2109}
2110EXPORT_SYMBOL(ip_route_input_noref);
2111
2112/* called with rcu_read_lock held */
2113int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2114 u8 tos, struct net_device *dev, struct fib_result *res)
2115{
1da177e4
LT
2116 /* Multicast recognition logic is moved from route cache to here.
2117 The problem was that too many Ethernet cards have broken/missing
2118 hardware multicast filters :-( As result the host on multicasting
2119 network acquires a lot of useless route cache entries, sort of
2120 SDR messages from all the world. Now we try to get rid of them.
2121 Really, provided software IP multicast filter is organized
2122 reasonably (at least, hashed), it does not result in a slowdown
2123 comparing with route cache reject entries.
2124 Note, that multicast routers are not affected, because
2125 route cache entry is created eventually.
2126 */
f97c1e0c 2127 if (ipv4_is_multicast(daddr)) {
96d36220 2128 struct in_device *in_dev = __in_dev_get_rcu(dev);
e58e4159 2129 int our = 0;
5510cdf7 2130 int err = -EINVAL;
1da177e4 2131
e58e4159
DA
2132 if (in_dev)
2133 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2134 ip_hdr(skb)->protocol);
2135
2136 /* check l3 master if no match yet */
2137 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2138 struct in_device *l3_in_dev;
2139
2140 l3_in_dev = __in_dev_get_rcu(skb->dev);
2141 if (l3_in_dev)
2142 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2143 ip_hdr(skb)->protocol);
2144 }
2145
e58e4159 2146 if (our
1da177e4 2147#ifdef CONFIG_IP_MROUTE
e58e4159
DA
2148 ||
2149 (!ipv4_is_local_multicast(daddr) &&
2150 IN_DEV_MFORWARD(in_dev))
1da177e4 2151#endif
e58e4159 2152 ) {
5510cdf7 2153 err = ip_route_input_mc(skb, daddr, saddr,
e58e4159 2154 tos, dev, our);
1da177e4 2155 }
5510cdf7 2156 return err;
1da177e4 2157 }
5510cdf7
DA
2158
2159 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
1da177e4
LT
2160}
2161
ebc0ffae 2162/* called with rcu_read_lock() */
982721f3 2163static struct rtable *__mkroute_output(const struct fib_result *res,
1a00fee4 2164 const struct flowi4 *fl4, int orig_oif,
f61759e6 2165 struct net_device *dev_out,
5ada5527 2166 unsigned int flags)
1da177e4 2167{
982721f3 2168 struct fib_info *fi = res->fi;
f2bb4bed 2169 struct fib_nh_exception *fnhe;
5ada5527 2170 struct in_device *in_dev;
982721f3 2171 u16 type = res->type;
5ada5527 2172 struct rtable *rth;
c92b9655 2173 bool do_cache;
1da177e4 2174
d0daebc3
TG
2175 in_dev = __in_dev_get_rcu(dev_out);
2176 if (!in_dev)
5ada5527 2177 return ERR_PTR(-EINVAL);
1da177e4 2178
d0daebc3 2179 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
5f02ce24
DA
2180 if (ipv4_is_loopback(fl4->saddr) &&
2181 !(dev_out->flags & IFF_LOOPBACK) &&
2182 !netif_is_l3_master(dev_out))
d0daebc3
TG
2183 return ERR_PTR(-EINVAL);
2184
68a5e3dd 2185 if (ipv4_is_lbcast(fl4->daddr))
982721f3 2186 type = RTN_BROADCAST;
68a5e3dd 2187 else if (ipv4_is_multicast(fl4->daddr))
982721f3 2188 type = RTN_MULTICAST;
68a5e3dd 2189 else if (ipv4_is_zeronet(fl4->daddr))
5ada5527 2190 return ERR_PTR(-EINVAL);
1da177e4
LT
2191
2192 if (dev_out->flags & IFF_LOOPBACK)
2193 flags |= RTCF_LOCAL;
2194
63617421 2195 do_cache = true;
982721f3 2196 if (type == RTN_BROADCAST) {
1da177e4 2197 flags |= RTCF_BROADCAST | RTCF_LOCAL;
982721f3
DM
2198 fi = NULL;
2199 } else if (type == RTN_MULTICAST) {
dd28d1a0 2200 flags |= RTCF_MULTICAST | RTCF_LOCAL;
813b3b5d
DM
2201 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2202 fl4->flowi4_proto))
1da177e4 2203 flags &= ~RTCF_LOCAL;
63617421
JA
2204 else
2205 do_cache = false;
1da177e4 2206 /* If multicast route do not exist use
dd28d1a0
ED
2207 * default one, but do not gateway in this case.
2208 * Yes, it is hack.
1da177e4 2209 */
982721f3
DM
2210 if (fi && res->prefixlen < 4)
2211 fi = NULL;
d6d5e999
CF
2212 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2213 (orig_oif != dev_out->ifindex)) {
2214 /* For local routes that require a particular output interface
2215 * we do not want to cache the result. Caching the result
2216 * causes incorrect behaviour when there are multiple source
2217 * addresses on the interface, the end result being that if the
2218 * intended recipient is waiting on that interface for the
2219 * packet he won't receive it because it will be delivered on
2220 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2221 * be set to the loopback interface as well.
2222 */
054d7cb5 2223 do_cache = false;
1da177e4
LT
2224 }
2225
f2bb4bed 2226 fnhe = NULL;
63617421 2227 do_cache &= fi != NULL;
054d7cb5 2228 if (fi) {
c5038a83 2229 struct rtable __rcu **prth;
c92b9655 2230 struct fib_nh *nh = &FIB_RES_NH(*res);
d26b3a7c 2231
c92b9655 2232 fnhe = find_exception(nh, fl4->daddr);
054d7cb5
JA
2233 if (!do_cache)
2234 goto add;
deed49df 2235 if (fnhe) {
2ffae99d 2236 prth = &fnhe->fnhe_rth_output;
054d7cb5
JA
2237 } else {
2238 if (unlikely(fl4->flowi4_flags &
2239 FLOWI_FLAG_KNOWN_NH &&
2240 !(nh->nh_gw &&
2241 nh->nh_scope == RT_SCOPE_LINK))) {
2242 do_cache = false;
2243 goto add;
c92b9655 2244 }
054d7cb5 2245 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
c92b9655 2246 }
c5038a83 2247 rth = rcu_dereference(*prth);
9df16efa 2248 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
c5038a83 2249 return rth;
f2bb4bed 2250 }
c92b9655
JA
2251
2252add:
d08c4f35 2253 rth = rt_dst_alloc(dev_out, flags, type,
5c1e6aa3 2254 IN_DEV_CONF_GET(in_dev, NOPOLICY),
f2bb4bed 2255 IN_DEV_CONF_GET(in_dev, NOXFRM),
c92b9655 2256 do_cache);
8391d07b 2257 if (!rth)
5ada5527 2258 return ERR_PTR(-ENOBUFS);
8391d07b 2259
9438c871 2260 rth->rt_iif = orig_oif;
b7503e0c
DA
2261 if (res->table)
2262 rth->rt_table_id = res->table->tb_id;
2263
1da177e4
LT
2264 RT_CACHE_STAT_INC(out_slow_tot);
2265
1da177e4 2266 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
e905a9ed 2267 if (flags & RTCF_LOCAL &&
1da177e4 2268 !(dev_out->flags & IFF_LOOPBACK)) {
d8d1f30b 2269 rth->dst.output = ip_mc_output;
1da177e4
LT
2270 RT_CACHE_STAT_INC(out_slow_mc);
2271 }
2272#ifdef CONFIG_IP_MROUTE
982721f3 2273 if (type == RTN_MULTICAST) {
1da177e4 2274 if (IN_DEV_MFORWARD(in_dev) &&
813b3b5d 2275 !ipv4_is_local_multicast(fl4->daddr)) {
d8d1f30b
CG
2276 rth->dst.input = ip_mr_input;
2277 rth->dst.output = ip_mc_output;
1da177e4
LT
2278 }
2279 }
2280#endif
2281 }
2282
a4c2fd7f 2283 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
efd85700 2284 set_lwt_redirect(rth);
1da177e4 2285
5ada5527 2286 return rth;
1da177e4
LT
2287}
2288
1da177e4
LT
2289/*
2290 * Major route resolver routine.
2291 */
2292
3abd1ade
DA
2293struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2294 const struct sk_buff *skb)
1da177e4 2295{
f61759e6 2296 __u8 tos = RT_FL_TOS(fl4);
26883b0d
ED
2297 struct fib_result res = {
2298 .type = RTN_UNSPEC,
2299 .fi = NULL,
2300 .table = NULL,
2301 .tclassid = 0,
2302 };
5ada5527 2303 struct rtable *rth;
1da177e4 2304
1fb9489b 2305 fl4->flowi4_iif = LOOPBACK_IFINDEX;
813b3b5d
DM
2306 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2307 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2308 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
44713b67 2309
010c2708 2310 rcu_read_lock();
3abd1ade
DA
2311 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2312 rcu_read_unlock();
2313
2314 return rth;
2315}
2316EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2317
2318struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2319 struct fib_result *res,
2320 const struct sk_buff *skb)
2321{
2322 struct net_device *dev_out = NULL;
2323 int orig_oif = fl4->flowi4_oif;
2324 unsigned int flags = 0;
2325 struct rtable *rth;
2326 int err = -ENETUNREACH;
2327
813b3b5d 2328 if (fl4->saddr) {
b23dd4fe 2329 rth = ERR_PTR(-EINVAL);
813b3b5d
DM
2330 if (ipv4_is_multicast(fl4->saddr) ||
2331 ipv4_is_lbcast(fl4->saddr) ||
2332 ipv4_is_zeronet(fl4->saddr))
1da177e4
LT
2333 goto out;
2334
1da177e4
LT
2335 /* I removed check for oif == dev_out->oif here.
2336 It was wrong for two reasons:
1ab35276
DL
2337 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2338 is assigned to multiple interfaces.
1da177e4
LT
2339 2. Moreover, we are allowed to send packets with saddr
2340 of another iface. --ANK
2341 */
2342
813b3b5d
DM
2343 if (fl4->flowi4_oif == 0 &&
2344 (ipv4_is_multicast(fl4->daddr) ||
2345 ipv4_is_lbcast(fl4->daddr))) {
a210d01a 2346 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2347 dev_out = __ip_dev_find(net, fl4->saddr, false);
51456b29 2348 if (!dev_out)
a210d01a
JA
2349 goto out;
2350
1da177e4
LT
2351 /* Special hack: user can direct multicasts
2352 and limited broadcast via necessary interface
2353 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2354 This hack is not just for fun, it allows
2355 vic,vat and friends to work.
2356 They bind socket to loopback, set ttl to zero
2357 and expect that it will work.
2358 From the viewpoint of routing cache they are broken,
2359 because we are not allowed to build multicast path
2360 with loopback source addr (look, routing cache
2361 cannot know, that ttl is zero, so that packet
2362 will not leave this host and route is valid).
2363 Luckily, this hack is good workaround.
2364 */
2365
813b3b5d 2366 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2367 goto make_route;
2368 }
a210d01a 2369
813b3b5d 2370 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
a210d01a 2371 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2372 if (!__ip_dev_find(net, fl4->saddr, false))
a210d01a 2373 goto out;
a210d01a 2374 }
1da177e4
LT
2375 }
2376
2377
813b3b5d
DM
2378 if (fl4->flowi4_oif) {
2379 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
b23dd4fe 2380 rth = ERR_PTR(-ENODEV);
51456b29 2381 if (!dev_out)
1da177e4 2382 goto out;
e5ed6399
HX
2383
2384 /* RACE: Check return value of inet_select_addr instead. */
fc75fc83 2385 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
b23dd4fe 2386 rth = ERR_PTR(-ENETUNREACH);
fc75fc83
ED
2387 goto out;
2388 }
813b3b5d 2389 if (ipv4_is_local_multicast(fl4->daddr) ||
6a211654
AL
2390 ipv4_is_lbcast(fl4->daddr) ||
2391 fl4->flowi4_proto == IPPROTO_IGMP) {
813b3b5d
DM
2392 if (!fl4->saddr)
2393 fl4->saddr = inet_select_addr(dev_out, 0,
2394 RT_SCOPE_LINK);
1da177e4
LT
2395 goto make_route;
2396 }
0a7e2260 2397 if (!fl4->saddr) {
813b3b5d
DM
2398 if (ipv4_is_multicast(fl4->daddr))
2399 fl4->saddr = inet_select_addr(dev_out, 0,
2400 fl4->flowi4_scope);
2401 else if (!fl4->daddr)
2402 fl4->saddr = inet_select_addr(dev_out, 0,
2403 RT_SCOPE_HOST);
1da177e4
LT
2404 }
2405 }
2406
813b3b5d
DM
2407 if (!fl4->daddr) {
2408 fl4->daddr = fl4->saddr;
2409 if (!fl4->daddr)
2410 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
b40afd0e 2411 dev_out = net->loopback_dev;
1fb9489b 2412 fl4->flowi4_oif = LOOPBACK_IFINDEX;
3abd1ade 2413 res->type = RTN_LOCAL;
1da177e4
LT
2414 flags |= RTCF_LOCAL;
2415 goto make_route;
2416 }
2417
3abd1ade 2418 err = fib_lookup(net, fl4, res, 0);
0315e382 2419 if (err) {
3abd1ade
DA
2420 res->fi = NULL;
2421 res->table = NULL;
6104e112 2422 if (fl4->flowi4_oif &&
e58e4159
DA
2423 (ipv4_is_multicast(fl4->daddr) ||
2424 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
1da177e4
LT
2425 /* Apparently, routing tables are wrong. Assume,
2426 that the destination is on link.
2427
2428 WHY? DW.
2429 Because we are allowed to send to iface
2430 even if it has NO routes and NO assigned
2431 addresses. When oif is specified, routing
2432 tables are looked up with only one purpose:
2433 to catch if destination is gatewayed, rather than
2434 direct. Moreover, if MSG_DONTROUTE is set,
2435 we send packet, ignoring both routing tables
2436 and ifaddr state. --ANK
2437
2438
2439 We could make it even if oif is unknown,
2440 likely IPv6, but we do not.
2441 */
2442
813b3b5d
DM
2443 if (fl4->saddr == 0)
2444 fl4->saddr = inet_select_addr(dev_out, 0,
2445 RT_SCOPE_LINK);
3abd1ade 2446 res->type = RTN_UNICAST;
1da177e4
LT
2447 goto make_route;
2448 }
0315e382 2449 rth = ERR_PTR(err);
1da177e4
LT
2450 goto out;
2451 }
1da177e4 2452
3abd1ade 2453 if (res->type == RTN_LOCAL) {
813b3b5d 2454 if (!fl4->saddr) {
3abd1ade
DA
2455 if (res->fi->fib_prefsrc)
2456 fl4->saddr = res->fi->fib_prefsrc;
9fc3bbb4 2457 else
813b3b5d 2458 fl4->saddr = fl4->daddr;
9fc3bbb4 2459 }
5f02ce24
DA
2460
2461 /* L3 master device is the loopback for that domain */
3abd1ade 2462 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
b7c8487c 2463 net->loopback_dev;
839da4d9
DA
2464
2465 /* make sure orig_oif points to fib result device even
2466 * though packet rx/tx happens over loopback or l3mdev
2467 */
2468 orig_oif = FIB_RES_OIF(*res);
2469
813b3b5d 2470 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2471 flags |= RTCF_LOCAL;
2472 goto make_route;
2473 }
2474
3abd1ade 2475 fib_select_path(net, res, fl4, skb);
1da177e4 2476
3abd1ade 2477 dev_out = FIB_RES_DEV(*res);
813b3b5d 2478 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2479
2480
2481make_route:
3abd1ade 2482 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
1da177e4 2483
010c2708 2484out:
b23dd4fe 2485 return rth;
1da177e4 2486}
d8c97a94 2487
ae2688d5
JW
2488static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2489{
2490 return NULL;
2491}
2492
ebb762f2 2493static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 2494{
618f9bc7
SK
2495 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2496
2497 return mtu ? : dst->dev->mtu;
ec831ea7
RD
2498}
2499
6700c270
DM
2500static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2501 struct sk_buff *skb, u32 mtu)
14e50e57
DM
2502{
2503}
2504
/* dst_ops.redirect for blackhole routes: intentionally a no-op. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
				       struct sock *sk,
				       struct sk_buff *skb)
{
}
2509
0972ddb2
HB
2510static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2511 unsigned long old)
2512{
2513 return NULL;
2514}
2515
14e50e57
DM
2516static struct dst_ops ipv4_dst_blackhole_ops = {
2517 .family = AF_INET,
ae2688d5 2518 .check = ipv4_blackhole_dst_check,
ebb762f2 2519 .mtu = ipv4_blackhole_mtu,
214f45c9 2520 .default_advmss = ipv4_default_advmss,
14e50e57 2521 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
b587ee3b 2522 .redirect = ipv4_rt_blackhole_redirect,
0972ddb2 2523 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
d3aaeb38 2524 .neigh_lookup = ipv4_neigh_lookup,
14e50e57
DM
2525};
2526
2774c131 2527struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
14e50e57 2528{
2774c131 2529 struct rtable *ort = (struct rtable *) dst_orig;
f5b0a874 2530 struct rtable *rt;
14e50e57 2531
6c0e7284 2532 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
14e50e57 2533 if (rt) {
d8d1f30b 2534 struct dst_entry *new = &rt->dst;
14e50e57 2535
14e50e57 2536 new->__use = 1;
352e512c 2537 new->input = dst_discard;
ede2059d 2538 new->output = dst_discard_out;
14e50e57 2539
1dbe3252 2540 new->dev = net->loopback_dev;
14e50e57
DM
2541 if (new->dev)
2542 dev_hold(new->dev);
2543
9917e1e8 2544 rt->rt_is_input = ort->rt_is_input;
5e2b61f7 2545 rt->rt_iif = ort->rt_iif;
5943634f 2546 rt->rt_pmtu = ort->rt_pmtu;
a9f829f7 2547 rt->rt_mtu_locked = ort->rt_mtu_locked;
14e50e57 2548
ca4c3fc2 2549 rt->rt_genid = rt_genid_ipv4(net);
14e50e57
DM
2550 rt->rt_flags = ort->rt_flags;
2551 rt->rt_type = ort->rt_type;
14e50e57 2552 rt->rt_gateway = ort->rt_gateway;
155e8336 2553 rt->rt_uses_gateway = ort->rt_uses_gateway;
14e50e57 2554
caacf05e 2555 INIT_LIST_HEAD(&rt->rt_uncached);
14e50e57
DM
2556 }
2557
2774c131
DM
2558 dst_release(dst_orig);
2559
2560 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
14e50e57
DM
2561}
2562
9d6ec938 2563struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
6f9c9615 2564 const struct sock *sk)
1da177e4 2565{
9d6ec938 2566 struct rtable *rt = __ip_route_output_key(net, flp4);
1da177e4 2567
b23dd4fe
DM
2568 if (IS_ERR(rt))
2569 return rt;
1da177e4 2570
56157872 2571 if (flp4->flowi4_proto)
f92ee619
SK
2572 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2573 flowi4_to_flowi(flp4),
2574 sk, 0);
1da177e4 2575
b23dd4fe 2576 return rt;
1da177e4 2577}
d8c97a94
ACM
2578EXPORT_SYMBOL_GPL(ip_route_output_flow);
2579
3765d35e 2580/* called with rcu_read_lock held */
c36ba660 2581static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
15e47304 2582 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
ba52d61e 2583 u32 seq)
1da177e4 2584{
ba52d61e 2585 struct rtable *rt = skb_rtable(skb);
1da177e4 2586 struct rtmsg *r;
be403ea1 2587 struct nlmsghdr *nlh;
2bc8ca40 2588 unsigned long expires = 0;
f185071d 2589 u32 error;
521f5490 2590 u32 metrics[RTAX_MAX];
be403ea1 2591
d3166e0c 2592 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
51456b29 2593 if (!nlh)
26932566 2594 return -EMSGSIZE;
be403ea1
TG
2595
2596 r = nlmsg_data(nlh);
1da177e4
LT
2597 r->rtm_family = AF_INET;
2598 r->rtm_dst_len = 32;
2599 r->rtm_src_len = 0;
d6c0a4f6 2600 r->rtm_tos = fl4->flowi4_tos;
8a430ed5 2601 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
c36ba660 2602 if (nla_put_u32(skb, RTA_TABLE, table_id))
f3756b79 2603 goto nla_put_failure;
1da177e4
LT
2604 r->rtm_type = rt->rt_type;
2605 r->rtm_scope = RT_SCOPE_UNIVERSE;
2606 r->rtm_protocol = RTPROT_UNSPEC;
2607 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2608 if (rt->rt_flags & RTCF_NOTIFY)
2609 r->rtm_flags |= RTM_F_NOTIFY;
df4d9254
HFS
2610 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2611 r->rtm_flags |= RTCF_DOREDIRECT;
be403ea1 2612
930345ea 2613 if (nla_put_in_addr(skb, RTA_DST, dst))
f3756b79 2614 goto nla_put_failure;
1a00fee4 2615 if (src) {
1da177e4 2616 r->rtm_src_len = 32;
930345ea 2617 if (nla_put_in_addr(skb, RTA_SRC, src))
f3756b79 2618 goto nla_put_failure;
1da177e4 2619 }
f3756b79
DM
2620 if (rt->dst.dev &&
2621 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2622 goto nla_put_failure;
c7066f70 2623#ifdef CONFIG_IP_ROUTE_CLASSID
f3756b79
DM
2624 if (rt->dst.tclassid &&
2625 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2626 goto nla_put_failure;
1da177e4 2627#endif
41347dcd 2628 if (!rt_is_input_route(rt) &&
d6c0a4f6 2629 fl4->saddr != src) {
930345ea 2630 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
f3756b79
DM
2631 goto nla_put_failure;
2632 }
155e8336 2633 if (rt->rt_uses_gateway &&
930345ea 2634 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
f3756b79 2635 goto nla_put_failure;
be403ea1 2636
ee9a8f7a
SK
2637 expires = rt->dst.expires;
2638 if (expires) {
2639 unsigned long now = jiffies;
2640
2641 if (time_before(now, expires))
2642 expires -= now;
2643 else
2644 expires = 0;
2645 }
2646
521f5490 2647 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
ee9a8f7a 2648 if (rt->rt_pmtu && expires)
521f5490 2649 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
a9f829f7
SD
2650 if (rt->rt_mtu_locked && expires)
2651 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
521f5490 2652 if (rtnetlink_put_metrics(skb, metrics) < 0)
be403ea1
TG
2653 goto nla_put_failure;
2654
b4869889 2655 if (fl4->flowi4_mark &&
68aaed54 2656 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
f3756b79 2657 goto nla_put_failure;
963bfeee 2658
622ec2c9
LC
2659 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2660 nla_put_u32(skb, RTA_UID,
2661 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2662 goto nla_put_failure;
2663
d8d1f30b 2664 error = rt->dst.error;
be403ea1 2665
c7537967 2666 if (rt_is_input_route(rt)) {
8caaf7b6
ND
2667#ifdef CONFIG_IP_MROUTE
2668 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2669 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2670 int err = ipmr_get_route(net, skb,
2671 fl4->saddr, fl4->daddr,
9f09eaea 2672 r, portid);
2cf75070 2673
8caaf7b6 2674 if (err <= 0) {
0c8d803f
DA
2675 if (err == 0)
2676 return 0;
2677 goto nla_put_failure;
8caaf7b6
ND
2678 }
2679 } else
2680#endif
91146153 2681 if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
8caaf7b6 2682 goto nla_put_failure;
1da177e4
LT
2683 }
2684
f185071d 2685 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
e3703b3d 2686 goto nla_put_failure;
be403ea1 2687
053c095a
JB
2688 nlmsg_end(skb, nlh);
2689 return 0;
1da177e4 2690
be403ea1 2691nla_put_failure:
26932566
PM
2692 nlmsg_cancel(skb, nlh);
2693 return -EMSGSIZE;
1da177e4
LT
2694}
2695
c21ef3e3
DA
2696static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2697 struct netlink_ext_ack *extack)
1da177e4 2698{
3b1e0a65 2699 struct net *net = sock_net(in_skb->sk);
d889ce3b
TG
2700 struct rtmsg *rtm;
2701 struct nlattr *tb[RTA_MAX+1];
3765d35e 2702 struct fib_result res = {};
1da177e4 2703 struct rtable *rt = NULL;
d6c0a4f6 2704 struct flowi4 fl4;
9e12bb22
AV
2705 __be32 dst = 0;
2706 __be32 src = 0;
2707 u32 iif;
d889ce3b 2708 int err;
963bfeee 2709 int mark;
1da177e4 2710 struct sk_buff *skb;
c36ba660 2711 u32 table_id = RT_TABLE_MAIN;
622ec2c9 2712 kuid_t uid;
1da177e4 2713
fceb6435 2714 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
c21ef3e3 2715 extack);
d889ce3b
TG
2716 if (err < 0)
2717 goto errout;
2718
2719 rtm = nlmsg_data(nlh);
2720
1da177e4 2721 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
51456b29 2722 if (!skb) {
d889ce3b
TG
2723 err = -ENOBUFS;
2724 goto errout;
2725 }
1da177e4
LT
2726
2727 /* Reserve room for dummy headers, this skb can pass
2728 through good chunk of routing engine.
2729 */
459a98ed 2730 skb_reset_mac_header(skb);
c1d2bbe1 2731 skb_reset_network_header(skb);
d2c962b8 2732
67b61f6c
JB
2733 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2734 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
d889ce3b 2735 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
963bfeee 2736 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
622ec2c9
LC
2737 if (tb[RTA_UID])
2738 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2739 else
2740 uid = (iif ? INVALID_UID : current_uid());
1da177e4 2741
bbadb9a2
FL
2742 /* Bugfix: need to give ip_route_input enough of an IP header to
2743 * not gag.
2744 */
2745 ip_hdr(skb)->protocol = IPPROTO_UDP;
2746 ip_hdr(skb)->saddr = src;
2747 ip_hdr(skb)->daddr = dst;
2748
2749 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2750
d6c0a4f6
DM
2751 memset(&fl4, 0, sizeof(fl4));
2752 fl4.daddr = dst;
2753 fl4.saddr = src;
2754 fl4.flowi4_tos = rtm->rtm_tos;
2755 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2756 fl4.flowi4_mark = mark;
622ec2c9 2757 fl4.flowi4_uid = uid;
d6c0a4f6 2758
3765d35e
DA
2759 rcu_read_lock();
2760
1da177e4 2761 if (iif) {
d889ce3b
TG
2762 struct net_device *dev;
2763
3765d35e 2764 dev = dev_get_by_index_rcu(net, iif);
51456b29 2765 if (!dev) {
d889ce3b
TG
2766 err = -ENODEV;
2767 goto errout_free;
2768 }
2769
1da177e4
LT
2770 skb->protocol = htons(ETH_P_IP);
2771 skb->dev = dev;
963bfeee 2772 skb->mark = mark;
3765d35e
DA
2773 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2774 dev, &res);
d889ce3b 2775
511c3f92 2776 rt = skb_rtable(skb);
d8d1f30b
CG
2777 if (err == 0 && rt->dst.error)
2778 err = -rt->dst.error;
1da177e4 2779 } else {
6503a304 2780 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3765d35e 2781 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
b23dd4fe
DM
2782 err = 0;
2783 if (IS_ERR(rt))
2784 err = PTR_ERR(rt);
2c87d63a
FW
2785 else
2786 skb_dst_set(skb, &rt->dst);
1da177e4 2787 }
d889ce3b 2788
1da177e4 2789 if (err)
d889ce3b 2790 goto errout_free;
1da177e4 2791
1da177e4
LT
2792 if (rtm->rtm_flags & RTM_F_NOTIFY)
2793 rt->rt_flags |= RTCF_NOTIFY;
2794
c36ba660
DA
2795 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2796 table_id = rt->rt_table_id;
2797
bc3aae2b
RP
2798 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2799 if (!res.fi) {
2800 err = fib_props[res.type].error;
2801 if (!err)
2802 err = -EHOSTUNREACH;
2803 goto errout_free;
2804 }
b6179813
RP
2805 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2806 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2807 rt->rt_type, res.prefix, res.prefixlen,
2808 fl4.flowi4_tos, res.fi, 0);
bc3aae2b 2809 } else {
b6179813 2810 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
ba52d61e 2811 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
bc3aae2b 2812 }
7b46a644 2813 if (err < 0)
d889ce3b 2814 goto errout_free;
1da177e4 2815
3765d35e
DA
2816 rcu_read_unlock();
2817
15e47304 2818 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
d889ce3b 2819errout:
2942e900 2820 return err;
1da177e4 2821
d889ce3b 2822errout_free:
3765d35e 2823 rcu_read_unlock();
1da177e4 2824 kfree_skb(skb);
d889ce3b 2825 goto errout;
1da177e4
LT
2826}
2827
1da177e4
LT
2828void ip_rt_multicast_event(struct in_device *in_dev)
2829{
4ccfe6d4 2830 rt_cache_flush(dev_net(in_dev->dev));
1da177e4
LT
2831}
2832
2833#ifdef CONFIG_SYSCTL
082c7ca4
G
2834static int ip_rt_gc_interval __read_mostly = 60 * HZ;
2835static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
2836static int ip_rt_gc_elasticity __read_mostly = 8;
2837
fe2c6338 2838static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
8d65af78 2839 void __user *buffer,
1da177e4
LT
2840 size_t *lenp, loff_t *ppos)
2841{
5aad1de5
TT
2842 struct net *net = (struct net *)__ctl->extra1;
2843
1da177e4 2844 if (write) {
5aad1de5
TT
2845 rt_cache_flush(net);
2846 fnhe_genid_bump(net);
1da177e4 2847 return 0;
e905a9ed 2848 }
1da177e4
LT
2849
2850 return -EINVAL;
2851}
2852
fe2c6338 2853static struct ctl_table ipv4_route_table[] = {
1da177e4 2854 {
1da177e4
LT
2855 .procname = "gc_thresh",
2856 .data = &ipv4_dst_ops.gc_thresh,
2857 .maxlen = sizeof(int),
2858 .mode = 0644,
6d9f239a 2859 .proc_handler = proc_dointvec,
1da177e4
LT
2860 },
2861 {
1da177e4
LT
2862 .procname = "max_size",
2863 .data = &ip_rt_max_size,
2864 .maxlen = sizeof(int),
2865 .mode = 0644,
6d9f239a 2866 .proc_handler = proc_dointvec,
1da177e4
LT
2867 },
2868 {
2869 /* Deprecated. Use gc_min_interval_ms */
e905a9ed 2870
1da177e4
LT
2871 .procname = "gc_min_interval",
2872 .data = &ip_rt_gc_min_interval,
2873 .maxlen = sizeof(int),
2874 .mode = 0644,
6d9f239a 2875 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
2876 },
2877 {
1da177e4
LT
2878 .procname = "gc_min_interval_ms",
2879 .data = &ip_rt_gc_min_interval,
2880 .maxlen = sizeof(int),
2881 .mode = 0644,
6d9f239a 2882 .proc_handler = proc_dointvec_ms_jiffies,
1da177e4
LT
2883 },
2884 {
1da177e4
LT
2885 .procname = "gc_timeout",
2886 .data = &ip_rt_gc_timeout,
2887 .maxlen = sizeof(int),
2888 .mode = 0644,
6d9f239a 2889 .proc_handler = proc_dointvec_jiffies,
1da177e4 2890 },
9f28a2fc
ED
2891 {
2892 .procname = "gc_interval",
2893 .data = &ip_rt_gc_interval,
2894 .maxlen = sizeof(int),
2895 .mode = 0644,
2896 .proc_handler = proc_dointvec_jiffies,
2897 },
1da177e4 2898 {
1da177e4
LT
2899 .procname = "redirect_load",
2900 .data = &ip_rt_redirect_load,
2901 .maxlen = sizeof(int),
2902 .mode = 0644,
6d9f239a 2903 .proc_handler = proc_dointvec,
1da177e4
LT
2904 },
2905 {
1da177e4
LT
2906 .procname = "redirect_number",
2907 .data = &ip_rt_redirect_number,
2908 .maxlen = sizeof(int),
2909 .mode = 0644,
6d9f239a 2910 .proc_handler = proc_dointvec,
1da177e4
LT
2911 },
2912 {
1da177e4
LT
2913 .procname = "redirect_silence",
2914 .data = &ip_rt_redirect_silence,
2915 .maxlen = sizeof(int),
2916 .mode = 0644,
6d9f239a 2917 .proc_handler = proc_dointvec,
1da177e4
LT
2918 },
2919 {
1da177e4
LT
2920 .procname = "error_cost",
2921 .data = &ip_rt_error_cost,
2922 .maxlen = sizeof(int),
2923 .mode = 0644,
6d9f239a 2924 .proc_handler = proc_dointvec,
1da177e4
LT
2925 },
2926 {
1da177e4
LT
2927 .procname = "error_burst",
2928 .data = &ip_rt_error_burst,
2929 .maxlen = sizeof(int),
2930 .mode = 0644,
6d9f239a 2931 .proc_handler = proc_dointvec,
1da177e4
LT
2932 },
2933 {
1da177e4
LT
2934 .procname = "gc_elasticity",
2935 .data = &ip_rt_gc_elasticity,
2936 .maxlen = sizeof(int),
2937 .mode = 0644,
6d9f239a 2938 .proc_handler = proc_dointvec,
1da177e4
LT
2939 },
2940 {
1da177e4
LT
2941 .procname = "mtu_expires",
2942 .data = &ip_rt_mtu_expires,
2943 .maxlen = sizeof(int),
2944 .mode = 0644,
6d9f239a 2945 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
2946 },
2947 {
1da177e4
LT
2948 .procname = "min_pmtu",
2949 .data = &ip_rt_min_pmtu,
2950 .maxlen = sizeof(int),
2951 .mode = 0644,
2eda86c3
SD
2952 .proc_handler = proc_dointvec_minmax,
2953 .extra1 = &ip_min_valid_pmtu,
1da177e4
LT
2954 },
2955 {
1da177e4
LT
2956 .procname = "min_adv_mss",
2957 .data = &ip_rt_min_advmss,
2958 .maxlen = sizeof(int),
2959 .mode = 0644,
6d9f239a 2960 .proc_handler = proc_dointvec,
1da177e4 2961 },
f8572d8f 2962 { }
1da177e4 2963};
39a23e75 2964
39a23e75
DL
2965static struct ctl_table ipv4_route_flush_table[] = {
2966 {
39a23e75
DL
2967 .procname = "flush",
2968 .maxlen = sizeof(int),
2969 .mode = 0200,
6d9f239a 2970 .proc_handler = ipv4_sysctl_rtcache_flush,
39a23e75 2971 },
f8572d8f 2972 { },
39a23e75
DL
2973};
2974
2975static __net_init int sysctl_route_net_init(struct net *net)
2976{
2977 struct ctl_table *tbl;
2978
2979 tbl = ipv4_route_flush_table;
09ad9bc7 2980 if (!net_eq(net, &init_net)) {
39a23e75 2981 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
51456b29 2982 if (!tbl)
39a23e75 2983 goto err_dup;
464dc801
EB
2984
2985 /* Don't export sysctls to unprivileged users */
2986 if (net->user_ns != &init_user_ns)
2987 tbl[0].procname = NULL;
39a23e75
DL
2988 }
2989 tbl[0].extra1 = net;
2990
ec8f23ce 2991 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
51456b29 2992 if (!net->ipv4.route_hdr)
39a23e75
DL
2993 goto err_reg;
2994 return 0;
2995
2996err_reg:
2997 if (tbl != ipv4_route_flush_table)
2998 kfree(tbl);
2999err_dup:
3000 return -ENOMEM;
3001}
3002
3003static __net_exit void sysctl_route_net_exit(struct net *net)
3004{
3005 struct ctl_table *tbl;
3006
3007 tbl = net->ipv4.route_hdr->ctl_table_arg;
3008 unregister_net_sysctl_table(net->ipv4.route_hdr);
3009 BUG_ON(tbl == ipv4_route_flush_table);
3010 kfree(tbl);
3011}
3012
3013static __net_initdata struct pernet_operations sysctl_route_ops = {
3014 .init = sysctl_route_net_init,
3015 .exit = sysctl_route_net_exit,
3016};
1da177e4
LT
3017#endif
3018
3ee94372 3019static __net_init int rt_genid_init(struct net *net)
9f5e97e5 3020{
ca4c3fc2 3021 atomic_set(&net->ipv4.rt_genid, 0);
5aad1de5 3022 atomic_set(&net->fnhe_genid, 0);
7aed9f72 3023 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
9f5e97e5
DL
3024 return 0;
3025}
3026
3ee94372
NH
3027static __net_initdata struct pernet_operations rt_genid_ops = {
3028 .init = rt_genid_init,
9f5e97e5
DL
3029};
3030
c3426b47
DM
3031static int __net_init ipv4_inetpeer_init(struct net *net)
3032{
3033 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3034
3035 if (!bp)
3036 return -ENOMEM;
3037 inet_peer_base_init(bp);
3038 net->ipv4.peers = bp;
3039 return 0;
3040}
3041
3042static void __net_exit ipv4_inetpeer_exit(struct net *net)
3043{
3044 struct inet_peer_base *bp = net->ipv4.peers;
3045
3046 net->ipv4.peers = NULL;
56a6b248 3047 inetpeer_invalidate_tree(bp);
c3426b47
DM
3048 kfree(bp);
3049}
3050
3051static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3052 .init = ipv4_inetpeer_init,
3053 .exit = ipv4_inetpeer_exit,
3054};
9f5e97e5 3055
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU accounting area; ip_rt_init() allocates 256 entries per CPU. */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4 3059
1da177e4
LT
3060int __init ip_rt_init(void)
3061{
5055c371 3062 int cpu;
1da177e4 3063
73f156a6
ED
3064 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3065 if (!ip_idents)
3066 panic("IP: failed to allocate ip_idents\n");
3067
3068 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3069
355b590c
ED
3070 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3071 if (!ip_tstamps)
3072 panic("IP: failed to allocate ip_tstamps\n");
3073
5055c371
ED
3074 for_each_possible_cpu(cpu) {
3075 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3076
3077 INIT_LIST_HEAD(&ul->head);
3078 spin_lock_init(&ul->lock);
3079 }
c7066f70 3080#ifdef CONFIG_IP_ROUTE_CLASSID
0dcec8c2 3081 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
1da177e4
LT
3082 if (!ip_rt_acct)
3083 panic("IP: failed to allocate ip_rt_acct\n");
1da177e4
LT
3084#endif
3085
e5d679f3
AD
3086 ipv4_dst_ops.kmem_cachep =
3087 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
20c2df83 3088 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1da177e4 3089
14e50e57
DM
3090 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3091
fc66f95c
ED
3092 if (dst_entries_init(&ipv4_dst_ops) < 0)
3093 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3094
3095 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3096 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3097
89aef892
DM
3098 ipv4_dst_ops.gc_thresh = ~0;
3099 ip_rt_max_size = INT_MAX;
1da177e4 3100
1da177e4
LT
3101 devinet_init();
3102 ip_fib_init();
3103
73b38711 3104 if (ip_rt_proc_init())
058bd4d2 3105 pr_err("Unable to create route proc files\n");
1da177e4
LT
3106#ifdef CONFIG_XFRM
3107 xfrm_init();
703fb94e 3108 xfrm4_init();
1da177e4 3109#endif
394f51ab
FW
3110 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3111 RTNL_FLAG_DOIT_UNLOCKED);
63f3444f 3112
39a23e75
DL
3113#ifdef CONFIG_SYSCTL
3114 register_pernet_subsys(&sysctl_route_ops);
3115#endif
3ee94372 3116 register_pernet_subsys(&rt_genid_ops);
c3426b47 3117 register_pernet_subsys(&ipv4_inetpeer_ops);
1bcdca3f 3118 return 0;
1da177e4
LT
3119}
3120
a1bc6eb4 3121#ifdef CONFIG_SYSCTL
eeb61f71
AV
3122/*
3123 * We really need to sanitize the damn ipv4 init order, then all
3124 * this nonsense will go away.
3125 */
3126void __init ip_static_sysctl_init(void)
3127{
4e5ca785 3128 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
eeb61f71 3129}
a1bc6eb4 3130#endif