]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blame - net/ipv4/route.c
cpufreq: CPPC: Don't set transition_latency
[mirror_ubuntu-bionic-kernel.git] / net / ipv4 / route.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
e905a9ed 21 * Alan Cox : Super /proc >4K
1da177e4
LT
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
e905a9ed 39 *
1da177e4
LT
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
afd46503
JP
65#define pr_fmt(fmt) "IPv4: " fmt
66
1da177e4 67#include <linux/module.h>
7c0f6ba6 68#include <linux/uaccess.h>
1da177e4
LT
69#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
1da177e4
LT
72#include <linux/mm.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/skbuff.h>
1da177e4
LT
83#include <linux/inetdevice.h>
84#include <linux/igmp.h>
85#include <linux/pkt_sched.h>
86#include <linux/mroute.h>
87#include <linux/netfilter_ipv4.h>
88#include <linux/random.h>
1da177e4
LT
89#include <linux/rcupdate.h>
90#include <linux/times.h>
5a0e3ad6 91#include <linux/slab.h>
73f156a6 92#include <linux/jhash.h>
352e512c 93#include <net/dst.h>
1b7179d3 94#include <net/dst_metadata.h>
457c4cbc 95#include <net/net_namespace.h>
1da177e4
LT
96#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
571e7226 106#include <net/lwtunnel.h>
8d71740c 107#include <net/netevent.h>
63f3444f 108#include <net/rtnetlink.h>
1da177e4
LT
109#ifdef CONFIG_SYSCTL
110#include <linux/sysctl.h>
7426a564 111#include <linux/kmemleak.h>
1da177e4 112#endif
6e5714ea 113#include <net/secure_seq.h>
1b7179d3 114#include <net/ip_tunnels.h>
385add90 115#include <net/l3mdev.h>
1da177e4 116
b6179813
RP
117#include "fib_lookup.h"
118
68a5e3dd 119#define RT_FL_TOS(oldflp4) \
f61759e6 120 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
1da177e4 121
1da177e4
LT
122#define RT_GC_TIMEOUT (300*HZ)
123
1da177e4 124static int ip_rt_max_size;
817bc4db
SH
125static int ip_rt_redirect_number __read_mostly = 9;
126static int ip_rt_redirect_load __read_mostly = HZ / 50;
127static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128static int ip_rt_error_cost __read_mostly = HZ;
129static int ip_rt_error_burst __read_mostly = 5 * HZ;
817bc4db 130static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
2eda86c3 131static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
817bc4db 132static int ip_rt_min_advmss __read_mostly = 256;
9f28a2fc 133
deed49df 134static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
2eda86c3
SD
135
136static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
137
1da177e4
LT
138/*
139 * Interface to generic destination cache.
140 */
141
142static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
0dbaee3b 143static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
ebb762f2 144static unsigned int ipv4_mtu(const struct dst_entry *dst);
1da177e4
LT
145static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146static void ipv4_link_failure(struct sk_buff *skb);
6700c270
DM
147static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
148 struct sk_buff *skb, u32 mtu);
149static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
150 struct sk_buff *skb);
caacf05e 151static void ipv4_dst_destroy(struct dst_entry *dst);
1da177e4 152
62fa8a84
DM
153static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154{
31248731
DM
155 WARN_ON(1);
156 return NULL;
62fa8a84
DM
157}
158
f894cbf8
DM
159static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
160 struct sk_buff *skb,
161 const void *daddr);
63fca65d 162static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
d3aaeb38 163
1da177e4
LT
164static struct dst_ops ipv4_dst_ops = {
165 .family = AF_INET,
1da177e4 166 .check = ipv4_dst_check,
0dbaee3b 167 .default_advmss = ipv4_default_advmss,
ebb762f2 168 .mtu = ipv4_mtu,
62fa8a84 169 .cow_metrics = ipv4_cow_metrics,
caacf05e 170 .destroy = ipv4_dst_destroy,
1da177e4
LT
171 .negative_advice = ipv4_negative_advice,
172 .link_failure = ipv4_link_failure,
173 .update_pmtu = ip_rt_update_pmtu,
e47a185b 174 .redirect = ip_do_redirect,
b92dacd4 175 .local_out = __ip_local_out,
d3aaeb38 176 .neigh_lookup = ipv4_neigh_lookup,
63fca65d 177 .confirm_neigh = ipv4_confirm_neigh,
1da177e4
LT
178};
179
180#define ECN_OR_COST(class) TC_PRIO_##class
181
4839c52b 182const __u8 ip_tos2prio[16] = {
1da177e4 183 TC_PRIO_BESTEFFORT,
4a2b9c37 184 ECN_OR_COST(BESTEFFORT),
1da177e4
LT
185 TC_PRIO_BESTEFFORT,
186 ECN_OR_COST(BESTEFFORT),
187 TC_PRIO_BULK,
188 ECN_OR_COST(BULK),
189 TC_PRIO_BULK,
190 ECN_OR_COST(BULK),
191 TC_PRIO_INTERACTIVE,
192 ECN_OR_COST(INTERACTIVE),
193 TC_PRIO_INTERACTIVE,
194 ECN_OR_COST(INTERACTIVE),
195 TC_PRIO_INTERACTIVE_BULK,
196 ECN_OR_COST(INTERACTIVE_BULK),
197 TC_PRIO_INTERACTIVE_BULK,
198 ECN_OR_COST(INTERACTIVE_BULK)
199};
d4a96865 200EXPORT_SYMBOL(ip_tos2prio);
1da177e4 201
2f970d83 202static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
3ed66e91 203#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
1da177e4 204
1da177e4 205#ifdef CONFIG_PROC_FS
1da177e4
LT
206static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
207{
29e75252 208 if (*pos)
89aef892 209 return NULL;
29e75252 210 return SEQ_START_TOKEN;
1da177e4
LT
211}
212
213static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
214{
1da177e4 215 ++*pos;
89aef892 216 return NULL;
1da177e4
LT
217}
218
/* seq_file ->stop for the rt_cache file: nothing to release. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
222
223static int rt_cache_seq_show(struct seq_file *seq, void *v)
224{
225 if (v == SEQ_START_TOKEN)
226 seq_printf(seq, "%-127s\n",
227 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
228 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
229 "HHUptod\tSpecDst");
e905a9ed 230 return 0;
1da177e4
LT
231}
232
f690808e 233static const struct seq_operations rt_cache_seq_ops = {
1da177e4
LT
234 .start = rt_cache_seq_start,
235 .next = rt_cache_seq_next,
236 .stop = rt_cache_seq_stop,
237 .show = rt_cache_seq_show,
238};
239
240static int rt_cache_seq_open(struct inode *inode, struct file *file)
241{
89aef892 242 return seq_open(file, &rt_cache_seq_ops);
1da177e4
LT
243}
244
9a32144e 245static const struct file_operations rt_cache_seq_fops = {
1da177e4
LT
246 .owner = THIS_MODULE,
247 .open = rt_cache_seq_open,
248 .read = seq_read,
249 .llseek = seq_lseek,
89aef892 250 .release = seq_release,
1da177e4
LT
251};
252
253
254static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
255{
256 int cpu;
257
258 if (*pos == 0)
259 return SEQ_START_TOKEN;
260
0f23174a 261 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
262 if (!cpu_possible(cpu))
263 continue;
264 *pos = cpu+1;
2f970d83 265 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
266 }
267 return NULL;
268}
269
270static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
271{
272 int cpu;
273
0f23174a 274 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
275 if (!cpu_possible(cpu))
276 continue;
277 *pos = cpu+1;
2f970d83 278 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
279 }
280 return NULL;
e905a9ed 281
1da177e4
LT
282}
283
/* seq_file ->stop for the per-CPU statistics file: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
288
289static int rt_cpu_seq_show(struct seq_file *seq, void *v)
290{
291 struct rt_cache_stat *st = v;
292
293 if (v == SEQ_START_TOKEN) {
5bec0039 294 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
1da177e4
LT
295 return 0;
296 }
e905a9ed 297
1da177e4
LT
298 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
299 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
fc66f95c 300 dst_entries_get_slow(&ipv4_dst_ops),
0baf2b35 301 0, /* st->in_hit */
1da177e4
LT
302 st->in_slow_tot,
303 st->in_slow_mc,
304 st->in_no_route,
305 st->in_brd,
306 st->in_martian_dst,
307 st->in_martian_src,
308
0baf2b35 309 0, /* st->out_hit */
1da177e4 310 st->out_slow_tot,
e905a9ed 311 st->out_slow_mc,
1da177e4 312
0baf2b35
ED
313 0, /* st->gc_total */
314 0, /* st->gc_ignored */
315 0, /* st->gc_goal_miss */
316 0, /* st->gc_dst_overflow */
317 0, /* st->in_hlist_search */
318 0 /* st->out_hlist_search */
1da177e4
LT
319 );
320 return 0;
321}
322
f690808e 323static const struct seq_operations rt_cpu_seq_ops = {
1da177e4
LT
324 .start = rt_cpu_seq_start,
325 .next = rt_cpu_seq_next,
326 .stop = rt_cpu_seq_stop,
327 .show = rt_cpu_seq_show,
328};
329
330
331static int rt_cpu_seq_open(struct inode *inode, struct file *file)
332{
333 return seq_open(file, &rt_cpu_seq_ops);
334}
335
9a32144e 336static const struct file_operations rt_cpu_seq_fops = {
1da177e4
LT
337 .owner = THIS_MODULE,
338 .open = rt_cpu_seq_open,
339 .read = seq_read,
340 .llseek = seq_lseek,
341 .release = seq_release,
342};
343
c7066f70 344#ifdef CONFIG_IP_ROUTE_CLASSID
a661c419 345static int rt_acct_proc_show(struct seq_file *m, void *v)
78c686e9 346{
a661c419
AD
347 struct ip_rt_acct *dst, *src;
348 unsigned int i, j;
349
350 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
351 if (!dst)
352 return -ENOMEM;
353
354 for_each_possible_cpu(i) {
355 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
356 for (j = 0; j < 256; j++) {
357 dst[j].o_bytes += src[j].o_bytes;
358 dst[j].o_packets += src[j].o_packets;
359 dst[j].i_bytes += src[j].i_bytes;
360 dst[j].i_packets += src[j].i_packets;
361 }
78c686e9
PE
362 }
363
a661c419
AD
364 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
365 kfree(dst);
366 return 0;
367}
78c686e9 368
a661c419
AD
369static int rt_acct_proc_open(struct inode *inode, struct file *file)
370{
371 return single_open(file, rt_acct_proc_show, NULL);
78c686e9 372}
a661c419
AD
373
374static const struct file_operations rt_acct_proc_fops = {
375 .owner = THIS_MODULE,
376 .open = rt_acct_proc_open,
377 .read = seq_read,
378 .llseek = seq_lseek,
379 .release = single_release,
380};
78c686e9 381#endif
107f1634 382
73b38711 383static int __net_init ip_rt_do_proc_init(struct net *net)
107f1634
PE
384{
385 struct proc_dir_entry *pde;
386
d4beaa66
G
387 pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
388 &rt_cache_seq_fops);
107f1634
PE
389 if (!pde)
390 goto err1;
391
77020720
WC
392 pde = proc_create("rt_cache", S_IRUGO,
393 net->proc_net_stat, &rt_cpu_seq_fops);
107f1634
PE
394 if (!pde)
395 goto err2;
396
c7066f70 397#ifdef CONFIG_IP_ROUTE_CLASSID
a661c419 398 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
107f1634
PE
399 if (!pde)
400 goto err3;
401#endif
402 return 0;
403
c7066f70 404#ifdef CONFIG_IP_ROUTE_CLASSID
107f1634
PE
405err3:
406 remove_proc_entry("rt_cache", net->proc_net_stat);
407#endif
408err2:
409 remove_proc_entry("rt_cache", net->proc_net);
410err1:
411 return -ENOMEM;
412}
73b38711
DL
413
414static void __net_exit ip_rt_do_proc_exit(struct net *net)
415{
416 remove_proc_entry("rt_cache", net->proc_net_stat);
417 remove_proc_entry("rt_cache", net->proc_net);
c7066f70 418#ifdef CONFIG_IP_ROUTE_CLASSID
73b38711 419 remove_proc_entry("rt_acct", net->proc_net);
0a931acf 420#endif
73b38711
DL
421}
422
423static struct pernet_operations ip_rt_proc_ops __net_initdata = {
424 .init = ip_rt_do_proc_init,
425 .exit = ip_rt_do_proc_exit,
426};
427
428static int __init ip_rt_proc_init(void)
429{
430 return register_pernet_subsys(&ip_rt_proc_ops);
431}
432
107f1634 433#else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
1da177e4 438#endif /* CONFIG_PROC_FS */
e905a9ed 439
4331debc 440static inline bool rt_is_expired(const struct rtable *rth)
e84f84f2 441{
ca4c3fc2 442 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
e84f84f2
DL
443}
444
/*
 * Bump the IPv4 route generation id of @net.  Routes carrying the
 * previous id are then reported as expired by rt_is_expired().
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
449
f894cbf8
DM
450static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
451 struct sk_buff *skb,
452 const void *daddr)
3769cffb 453{
d3aaeb38
DM
454 struct net_device *dev = dst->dev;
455 const __be32 *pkey = daddr;
39232973 456 const struct rtable *rt;
3769cffb
DM
457 struct neighbour *n;
458
39232973 459 rt = (const struct rtable *) dst;
a263b309 460 if (rt->rt_gateway)
39232973 461 pkey = (const __be32 *) &rt->rt_gateway;
f894cbf8
DM
462 else if (skb)
463 pkey = &ip_hdr(skb)->daddr;
d3aaeb38 464
80703d26 465 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
d3aaeb38
DM
466 if (n)
467 return n;
32092ecf 468 return neigh_create(&arp_tbl, pkey, dev);
d3aaeb38
DM
469}
470
63fca65d
JA
471static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
472{
473 struct net_device *dev = dst->dev;
474 const __be32 *pkey = daddr;
475 const struct rtable *rt;
476
477 rt = (const struct rtable *)dst;
478 if (rt->rt_gateway)
479 pkey = (const __be32 *)&rt->rt_gateway;
480 else if (!daddr ||
481 (rt->rt_flags &
482 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
483 return;
484
485 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
486}
487
04ca6973 488#define IP_IDENTS_SZ 2048u
04ca6973 489
355b590c
ED
490static atomic_t *ip_idents __read_mostly;
491static u32 *ip_tstamps __read_mostly;
04ca6973
ED
492
493/* In order to protect privacy, we add a perturbation to identifiers
494 * if one generator is seldom used. This makes it hard for an attacker
495 * to infer how many packets were sent between two points in time.
496 */
497u32 ip_idents_reserve(u32 hash, int segs)
498{
355b590c
ED
499 u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
500 atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
6aa7de05 501 u32 old = READ_ONCE(*p_tstamp);
04ca6973 502 u32 now = (u32)jiffies;
adb03115 503 u32 new, delta = 0;
04ca6973 504
355b590c 505 if (old != now && cmpxchg(p_tstamp, old, now) == old)
04ca6973
ED
506 delta = prandom_u32_max(now - old);
507
adb03115
ED
508 /* Do not use atomic_add_return() as it makes UBSAN unhappy */
509 do {
510 old = (u32)atomic_read(p_id);
511 new = old + delta + segs;
512 } while (atomic_cmpxchg(p_id, old, new) != old);
513
514 return new - segs;
04ca6973
ED
515}
516EXPORT_SYMBOL(ip_idents_reserve);
1da177e4 517
b6a7719a 518void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
1da177e4 519{
73f156a6
ED
520 static u32 ip_idents_hashrnd __read_mostly;
521 u32 hash, id;
1da177e4 522
73f156a6 523 net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
1da177e4 524
04ca6973
ED
525 hash = jhash_3words((__force u32)iph->daddr,
526 (__force u32)iph->saddr,
b6a7719a 527 iph->protocol ^ net_hash_mix(net),
04ca6973 528 ip_idents_hashrnd);
73f156a6
ED
529 id = ip_idents_reserve(hash, segs);
530 iph->id = htons(id);
1da177e4 531}
4bc2f18b 532EXPORT_SYMBOL(__ip_select_ident);
1da177e4 533
e2d118a1
LC
534static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
535 const struct sock *sk,
4895c771
DM
536 const struct iphdr *iph,
537 int oif, u8 tos,
538 u8 prot, u32 mark, int flow_flags)
539{
540 if (sk) {
541 const struct inet_sock *inet = inet_sk(sk);
542
543 oif = sk->sk_bound_dev_if;
544 mark = sk->sk_mark;
545 tos = RT_CONN_FLAGS(sk);
546 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
547 }
548 flowi4_init_output(fl4, oif, mark, tos,
549 RT_SCOPE_UNIVERSE, prot,
550 flow_flags,
e2d118a1
LC
551 iph->daddr, iph->saddr, 0, 0,
552 sock_net_uid(net, sk));
4895c771
DM
553}
554
5abf7f7e
ED
555static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
556 const struct sock *sk)
4895c771 557{
d109e61b 558 const struct net *net = dev_net(skb->dev);
4895c771
DM
559 const struct iphdr *iph = ip_hdr(skb);
560 int oif = skb->dev->ifindex;
561 u8 tos = RT_TOS(iph->tos);
562 u8 prot = iph->protocol;
563 u32 mark = skb->mark;
564
d109e61b 565 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
4895c771
DM
566}
567
5abf7f7e 568static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
4895c771
DM
569{
570 const struct inet_sock *inet = inet_sk(sk);
5abf7f7e 571 const struct ip_options_rcu *inet_opt;
4895c771
DM
572 __be32 daddr = inet->inet_daddr;
573
574 rcu_read_lock();
575 inet_opt = rcu_dereference(inet->inet_opt);
576 if (inet_opt && inet_opt->opt.srr)
577 daddr = inet_opt->opt.faddr;
578 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
579 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
580 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
581 inet_sk_flowi_flags(sk),
e2d118a1 582 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
4895c771
DM
583 rcu_read_unlock();
584}
585
5abf7f7e
ED
/*
 * Build a flow key from the packet when one is available, otherwise
 * from the socket alone.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}

	build_skb_flow_key(fl4, skb, sk);
}
594
c5038a83 595static DEFINE_SPINLOCK(fnhe_lock);
4895c771 596
2ffae99d
TT
597static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
598{
599 struct rtable *rt;
600
601 rt = rcu_dereference(fnhe->fnhe_rth_input);
602 if (rt) {
603 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
95c47f9c 604 dst_dev_put(&rt->dst);
0830106c 605 dst_release(&rt->dst);
2ffae99d
TT
606 }
607 rt = rcu_dereference(fnhe->fnhe_rth_output);
608 if (rt) {
609 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
95c47f9c 610 dst_dev_put(&rt->dst);
0830106c 611 dst_release(&rt->dst);
2ffae99d
TT
612 }
613}
614
aee06da6 615static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
4895c771
DM
616{
617 struct fib_nh_exception *fnhe, *oldest;
618
619 oldest = rcu_dereference(hash->chain);
620 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
621 fnhe = rcu_dereference(fnhe->fnhe_next)) {
622 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
623 oldest = fnhe;
624 }
2ffae99d 625 fnhe_flush_routes(oldest);
4895c771
DM
626 return oldest;
627}
628
d3a25c98
DM
629static inline u32 fnhe_hashfun(__be32 daddr)
630{
d546c621 631 static u32 fnhe_hashrnd __read_mostly;
d3a25c98
DM
632 u32 hval;
633
d546c621
ED
634 net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
635 hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
636 return hash_32(hval, FNHE_HASH_SHIFT);
d3a25c98
DM
637}
638
387aa65a
TT
639static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
640{
641 rt->rt_pmtu = fnhe->fnhe_pmtu;
a9f829f7 642 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
387aa65a
TT
643 rt->dst.expires = fnhe->fnhe_expires;
644
645 if (fnhe->fnhe_gw) {
646 rt->rt_flags |= RTCF_REDIRECTED;
647 rt->rt_gateway = fnhe->fnhe_gw;
648 rt->rt_uses_gateway = 1;
649 }
650}
651
aee06da6 652static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
a9f829f7 653 u32 pmtu, bool lock, unsigned long expires)
4895c771 654{
aee06da6 655 struct fnhe_hash_bucket *hash;
4895c771 656 struct fib_nh_exception *fnhe;
387aa65a 657 struct rtable *rt;
cebe84c6 658 u32 genid, hval;
387aa65a 659 unsigned int i;
4895c771 660 int depth;
cebe84c6
XL
661
662 genid = fnhe_genid(dev_net(nh->nh_dev));
663 hval = fnhe_hashfun(daddr);
aee06da6 664
c5038a83 665 spin_lock_bh(&fnhe_lock);
4895c771 666
caa41527 667 hash = rcu_dereference(nh->nh_exceptions);
4895c771 668 if (!hash) {
aee06da6 669 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
4895c771 670 if (!hash)
aee06da6 671 goto out_unlock;
caa41527 672 rcu_assign_pointer(nh->nh_exceptions, hash);
4895c771
DM
673 }
674
4895c771
DM
675 hash += hval;
676
677 depth = 0;
678 for (fnhe = rcu_dereference(hash->chain); fnhe;
679 fnhe = rcu_dereference(fnhe->fnhe_next)) {
680 if (fnhe->fnhe_daddr == daddr)
aee06da6 681 break;
4895c771
DM
682 depth++;
683 }
684
aee06da6 685 if (fnhe) {
cebe84c6
XL
686 if (fnhe->fnhe_genid != genid)
687 fnhe->fnhe_genid = genid;
aee06da6
JA
688 if (gw)
689 fnhe->fnhe_gw = gw;
a9f829f7 690 if (pmtu) {
aee06da6 691 fnhe->fnhe_pmtu = pmtu;
a9f829f7
SD
692 fnhe->fnhe_mtu_locked = lock;
693 }
e39d5246 694 fnhe->fnhe_expires = max(1UL, expires);
387aa65a 695 /* Update all cached dsts too */
2ffae99d
TT
696 rt = rcu_dereference(fnhe->fnhe_rth_input);
697 if (rt)
698 fill_route_from_fnhe(rt, fnhe);
699 rt = rcu_dereference(fnhe->fnhe_rth_output);
387aa65a
TT
700 if (rt)
701 fill_route_from_fnhe(rt, fnhe);
aee06da6
JA
702 } else {
703 if (depth > FNHE_RECLAIM_DEPTH)
704 fnhe = fnhe_oldest(hash);
705 else {
706 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
707 if (!fnhe)
708 goto out_unlock;
709
710 fnhe->fnhe_next = hash->chain;
711 rcu_assign_pointer(hash->chain, fnhe);
712 }
cebe84c6 713 fnhe->fnhe_genid = genid;
aee06da6
JA
714 fnhe->fnhe_daddr = daddr;
715 fnhe->fnhe_gw = gw;
716 fnhe->fnhe_pmtu = pmtu;
a9f829f7 717 fnhe->fnhe_mtu_locked = lock;
054d7cb5 718 fnhe->fnhe_expires = max(1UL, expires);
387aa65a
TT
719
720 /* Exception created; mark the cached routes for the nexthop
721 * stale, so anyone caching it rechecks if this exception
722 * applies to them.
723 */
2ffae99d
TT
724 rt = rcu_dereference(nh->nh_rth_input);
725 if (rt)
726 rt->dst.obsolete = DST_OBSOLETE_KILL;
727
387aa65a
TT
728 for_each_possible_cpu(i) {
729 struct rtable __rcu **prt;
730 prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
731 rt = rcu_dereference(*prt);
732 if (rt)
733 rt->dst.obsolete = DST_OBSOLETE_KILL;
734 }
4895c771 735 }
4895c771 736
4895c771 737 fnhe->fnhe_stamp = jiffies;
aee06da6
JA
738
739out_unlock:
c5038a83 740 spin_unlock_bh(&fnhe_lock);
4895c771
DM
741}
742
ceb33206
DM
743static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
744 bool kill_route)
1da177e4 745{
e47a185b 746 __be32 new_gw = icmp_hdr(skb)->un.gateway;
94206125 747 __be32 old_gw = ip_hdr(skb)->saddr;
e47a185b 748 struct net_device *dev = skb->dev;
e47a185b 749 struct in_device *in_dev;
4895c771 750 struct fib_result res;
e47a185b 751 struct neighbour *n;
317805b8 752 struct net *net;
1da177e4 753
94206125
DM
754 switch (icmp_hdr(skb)->code & 7) {
755 case ICMP_REDIR_NET:
756 case ICMP_REDIR_NETTOS:
757 case ICMP_REDIR_HOST:
758 case ICMP_REDIR_HOSTTOS:
759 break;
760
761 default:
762 return;
763 }
764
e47a185b
DM
765 if (rt->rt_gateway != old_gw)
766 return;
767
768 in_dev = __in_dev_get_rcu(dev);
769 if (!in_dev)
770 return;
771
c346dca1 772 net = dev_net(dev);
9d4fb27d
JP
773 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
774 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
775 ipv4_is_zeronet(new_gw))
1da177e4
LT
776 goto reject_redirect;
777
778 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
779 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
780 goto reject_redirect;
781 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
782 goto reject_redirect;
783 } else {
317805b8 784 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1da177e4
LT
785 goto reject_redirect;
786 }
787
969447f2
SSL
788 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
789 if (!n)
790 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
2c1a4311 791 if (!IS_ERR(n)) {
e47a185b
DM
792 if (!(n->nud_state & NUD_VALID)) {
793 neigh_event_send(n, NULL);
794 } else {
0eeb075f 795 if (fib_lookup(net, fl4, &res, 0) == 0) {
4895c771 796 struct fib_nh *nh = &FIB_RES_NH(res);
4895c771 797
aee06da6 798 update_or_create_fnhe(nh, fl4->daddr, new_gw,
a9f829f7
SD
799 0, false,
800 jiffies + ip_rt_gc_timeout);
4895c771 801 }
ceb33206
DM
802 if (kill_route)
803 rt->dst.obsolete = DST_OBSOLETE_KILL;
e47a185b
DM
804 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
805 }
806 neigh_release(n);
807 }
808 return;
809
810reject_redirect:
811#ifdef CONFIG_IP_ROUTE_VERBOSE
99ee038d
DM
812 if (IN_DEV_LOG_MARTIANS(in_dev)) {
813 const struct iphdr *iph = (const struct iphdr *) skb->data;
814 __be32 daddr = iph->daddr;
815 __be32 saddr = iph->saddr;
816
e47a185b
DM
817 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
818 " Advised path = %pI4 -> %pI4\n",
819 &old_gw, dev->name, &new_gw,
820 &saddr, &daddr);
99ee038d 821 }
e47a185b
DM
822#endif
823 ;
824}
825
4895c771
DM
826static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
827{
828 struct rtable *rt;
829 struct flowi4 fl4;
f96ef988 830 const struct iphdr *iph = (const struct iphdr *) skb->data;
7d995694 831 struct net *net = dev_net(skb->dev);
f96ef988
MK
832 int oif = skb->dev->ifindex;
833 u8 tos = RT_TOS(iph->tos);
834 u8 prot = iph->protocol;
835 u32 mark = skb->mark;
4895c771
DM
836
837 rt = (struct rtable *) dst;
838
7d995694 839 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
ceb33206 840 __ip_do_redirect(rt, skb, &fl4, true);
4895c771
DM
841}
842
1da177e4
LT
843static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
844{
ee6b9673 845 struct rtable *rt = (struct rtable *)dst;
1da177e4
LT
846 struct dst_entry *ret = dst;
847
848 if (rt) {
d11a4dc1 849 if (dst->obsolete > 0) {
1da177e4
LT
850 ip_rt_put(rt);
851 ret = NULL;
5943634f
DM
852 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
853 rt->dst.expires) {
89aef892 854 ip_rt_put(rt);
1da177e4
LT
855 ret = NULL;
856 }
857 }
858 return ret;
859}
860
861/*
862 * Algorithm:
863 * 1. The first ip_rt_redirect_number redirects are sent
864 * with exponential backoff, then we stop sending them at all,
865 * assuming that the host ignores our redirects.
866 * 2. If we did not see packets requiring redirects
867 * during ip_rt_redirect_silence, we assume that the host
868 * forgot redirected route and start to send redirects again.
869 *
870 * This algorithm is much cheaper and more intelligent than dumb load limiting
871 * in icmp.c.
872 *
873 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
874 * and "frag. need" (breaks PMTU discovery) in icmp.c.
875 */
876
877void ip_rt_send_redirect(struct sk_buff *skb)
878{
511c3f92 879 struct rtable *rt = skb_rtable(skb);
30038fc6 880 struct in_device *in_dev;
92d86829 881 struct inet_peer *peer;
1d861aa4 882 struct net *net;
30038fc6 883 int log_martians;
192132b9 884 int vif;
1da177e4 885
30038fc6 886 rcu_read_lock();
d8d1f30b 887 in_dev = __in_dev_get_rcu(rt->dst.dev);
30038fc6
ED
888 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
889 rcu_read_unlock();
1da177e4 890 return;
30038fc6
ED
891 }
892 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
385add90 893 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
30038fc6 894 rcu_read_unlock();
1da177e4 895
1d861aa4 896 net = dev_net(rt->dst.dev);
192132b9 897 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
92d86829 898 if (!peer) {
e81da0e1
JA
899 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
900 rt_nexthop(rt, ip_hdr(skb)->daddr));
92d86829
DM
901 return;
902 }
903
1da177e4
LT
904 /* No redirected packets during ip_rt_redirect_silence;
905 * reset the algorithm.
906 */
92d86829
DM
907 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
908 peer->rate_tokens = 0;
1da177e4
LT
909
910 /* Too many ignored redirects; do not send anything
d8d1f30b 911 * set dst.rate_last to the last seen redirected packet.
1da177e4 912 */
92d86829
DM
913 if (peer->rate_tokens >= ip_rt_redirect_number) {
914 peer->rate_last = jiffies;
1d861aa4 915 goto out_put_peer;
1da177e4
LT
916 }
917
918 /* Check for load limit; set rate_last to the latest sent
919 * redirect.
920 */
92d86829 921 if (peer->rate_tokens == 0 ||
14fb8a76 922 time_after(jiffies,
92d86829
DM
923 (peer->rate_last +
924 (ip_rt_redirect_load << peer->rate_tokens)))) {
e81da0e1
JA
925 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
926
927 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
92d86829
DM
928 peer->rate_last = jiffies;
929 ++peer->rate_tokens;
1da177e4 930#ifdef CONFIG_IP_ROUTE_VERBOSE
30038fc6 931 if (log_martians &&
e87cc472
JP
932 peer->rate_tokens == ip_rt_redirect_number)
933 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
92101b3b 934 &ip_hdr(skb)->saddr, inet_iif(skb),
e81da0e1 935 &ip_hdr(skb)->daddr, &gw);
1da177e4
LT
936#endif
937 }
1d861aa4
DM
938out_put_peer:
939 inet_putpeer(peer);
1da177e4
LT
940}
941
942static int ip_error(struct sk_buff *skb)
943{
251da413 944 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
511c3f92 945 struct rtable *rt = skb_rtable(skb);
92d86829 946 struct inet_peer *peer;
1da177e4 947 unsigned long now;
251da413 948 struct net *net;
92d86829 949 bool send;
1da177e4
LT
950 int code;
951
381c759d
EB
952 /* IP on this device is disabled. */
953 if (!in_dev)
954 goto out;
955
251da413
DM
956 net = dev_net(rt->dst.dev);
957 if (!IN_DEV_FORWARD(in_dev)) {
958 switch (rt->dst.error) {
959 case EHOSTUNREACH:
b45386ef 960 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
251da413
DM
961 break;
962
963 case ENETUNREACH:
b45386ef 964 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
251da413
DM
965 break;
966 }
967 goto out;
968 }
969
d8d1f30b 970 switch (rt->dst.error) {
4500ebf8
JP
971 case EINVAL:
972 default:
973 goto out;
974 case EHOSTUNREACH:
975 code = ICMP_HOST_UNREACH;
976 break;
977 case ENETUNREACH:
978 code = ICMP_NET_UNREACH;
b45386ef 979 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
4500ebf8
JP
980 break;
981 case EACCES:
982 code = ICMP_PKT_FILTERED;
983 break;
1da177e4
LT
984 }
985
192132b9 986 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
385add90 987 l3mdev_master_ifindex(skb->dev), 1);
92d86829
DM
988
989 send = true;
990 if (peer) {
991 now = jiffies;
992 peer->rate_tokens += now - peer->rate_last;
993 if (peer->rate_tokens > ip_rt_error_burst)
994 peer->rate_tokens = ip_rt_error_burst;
995 peer->rate_last = now;
996 if (peer->rate_tokens >= ip_rt_error_cost)
997 peer->rate_tokens -= ip_rt_error_cost;
998 else
999 send = false;
1d861aa4 1000 inet_putpeer(peer);
1da177e4 1001 }
92d86829
DM
1002 if (send)
1003 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1da177e4
LT
1004
1005out: kfree_skb(skb);
1006 return 0;
e905a9ed 1007}
1da177e4 1008
d851c12b 1009static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1da177e4 1010{
d851c12b 1011 struct dst_entry *dst = &rt->dst;
4895c771 1012 struct fib_result res;
a9f829f7 1013 bool lock = false;
2c8cec5c 1014
a9f829f7 1015 if (ip_mtu_locked(dst))
fa1e492a
SK
1016 return;
1017
cb6ccf09 1018 if (ipv4_mtu(dst) < mtu)
3cdaa5be
LW
1019 return;
1020
a9f829f7
SD
1021 if (mtu < ip_rt_min_pmtu) {
1022 lock = true;
5943634f 1023 mtu = ip_rt_min_pmtu;
a9f829f7 1024 }
2c8cec5c 1025
f016229e
TT
1026 if (rt->rt_pmtu == mtu &&
1027 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1028 return;
1029
c5ae7d41 1030 rcu_read_lock();
0eeb075f 1031 if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
4895c771 1032 struct fib_nh *nh = &FIB_RES_NH(res);
4895c771 1033
a9f829f7 1034 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
aee06da6 1035 jiffies + ip_rt_mtu_expires);
4895c771 1036 }
c5ae7d41 1037 rcu_read_unlock();
1da177e4
LT
1038}
1039
4895c771
DM
1040static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1041 struct sk_buff *skb, u32 mtu)
1042{
1043 struct rtable *rt = (struct rtable *) dst;
1044 struct flowi4 fl4;
1045
1046 ip_rt_build_flow_key(&fl4, sk, skb);
d851c12b 1047 __ip_rt_update_pmtu(rt, &fl4, mtu);
4895c771
DM
1048}
1049
36393395
DM
1050void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1051 int oif, u32 mark, u8 protocol, int flow_flags)
1052{
4895c771 1053 const struct iphdr *iph = (const struct iphdr *) skb->data;
36393395
DM
1054 struct flowi4 fl4;
1055 struct rtable *rt;
1056
1b3c61dc
LC
1057 if (!mark)
1058 mark = IP4_REPLY_MARK(net, skb->mark);
1059
e2d118a1 1060 __build_flow_key(net, &fl4, NULL, iph, oif,
4895c771 1061 RT_TOS(iph->tos), protocol, mark, flow_flags);
36393395
DM
1062 rt = __ip_route_output_key(net, &fl4);
1063 if (!IS_ERR(rt)) {
4895c771 1064 __ip_rt_update_pmtu(rt, &fl4, mtu);
36393395
DM
1065 ip_rt_put(rt);
1066 }
1067}
1068EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1069
9cb3a50c 1070static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
36393395 1071{
4895c771
DM
1072 const struct iphdr *iph = (const struct iphdr *) skb->data;
1073 struct flowi4 fl4;
1074 struct rtable *rt;
36393395 1075
e2d118a1 1076 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1b3c61dc
LC
1077
1078 if (!fl4.flowi4_mark)
1079 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1080
4895c771
DM
1081 rt = __ip_route_output_key(sock_net(sk), &fl4);
1082 if (!IS_ERR(rt)) {
1083 __ip_rt_update_pmtu(rt, &fl4, mtu);
1084 ip_rt_put(rt);
1085 }
36393395 1086}
9cb3a50c
SK
1087
1088void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1089{
1090 const struct iphdr *iph = (const struct iphdr *) skb->data;
1091 struct flowi4 fl4;
1092 struct rtable *rt;
7f502361 1093 struct dst_entry *odst = NULL;
b44108db 1094 bool new = false;
e2d118a1 1095 struct net *net = sock_net(sk);
9cb3a50c
SK
1096
1097 bh_lock_sock(sk);
482fc609
HFS
1098
1099 if (!ip_sk_accept_pmtu(sk))
1100 goto out;
1101
7f502361 1102 odst = sk_dst_get(sk);
9cb3a50c 1103
7f502361 1104 if (sock_owned_by_user(sk) || !odst) {
9cb3a50c
SK
1105 __ipv4_sk_update_pmtu(skb, sk, mtu);
1106 goto out;
1107 }
1108
e2d118a1 1109 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
9cb3a50c 1110
7f502361 1111 rt = (struct rtable *)odst;
51456b29 1112 if (odst->obsolete && !odst->ops->check(odst, 0)) {
9cb3a50c
SK
1113 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1114 if (IS_ERR(rt))
1115 goto out;
b44108db
SK
1116
1117 new = true;
9cb3a50c
SK
1118 }
1119
1120 __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1121
7f502361 1122 if (!dst_check(&rt->dst, 0)) {
b44108db
SK
1123 if (new)
1124 dst_release(&rt->dst);
1125
9cb3a50c
SK
1126 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1127 if (IS_ERR(rt))
1128 goto out;
1129
b44108db 1130 new = true;
9cb3a50c
SK
1131 }
1132
b44108db 1133 if (new)
7f502361 1134 sk_dst_set(sk, &rt->dst);
9cb3a50c
SK
1135
1136out:
1137 bh_unlock_sock(sk);
7f502361 1138 dst_release(odst);
9cb3a50c 1139}
36393395 1140EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
f39925db 1141
b42597e2
DM
1142void ipv4_redirect(struct sk_buff *skb, struct net *net,
1143 int oif, u32 mark, u8 protocol, int flow_flags)
1144{
4895c771 1145 const struct iphdr *iph = (const struct iphdr *) skb->data;
b42597e2
DM
1146 struct flowi4 fl4;
1147 struct rtable *rt;
1148
e2d118a1 1149 __build_flow_key(net, &fl4, NULL, iph, oif,
4895c771 1150 RT_TOS(iph->tos), protocol, mark, flow_flags);
b42597e2
DM
1151 rt = __ip_route_output_key(net, &fl4);
1152 if (!IS_ERR(rt)) {
ceb33206 1153 __ip_do_redirect(rt, skb, &fl4, false);
b42597e2
DM
1154 ip_rt_put(rt);
1155 }
1156}
1157EXPORT_SYMBOL_GPL(ipv4_redirect);
1158
1159void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1160{
4895c771
DM
1161 const struct iphdr *iph = (const struct iphdr *) skb->data;
1162 struct flowi4 fl4;
1163 struct rtable *rt;
e2d118a1 1164 struct net *net = sock_net(sk);
b42597e2 1165
e2d118a1
LC
1166 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1167 rt = __ip_route_output_key(net, &fl4);
4895c771 1168 if (!IS_ERR(rt)) {
ceb33206 1169 __ip_do_redirect(rt, skb, &fl4, false);
4895c771
DM
1170 ip_rt_put(rt);
1171 }
b42597e2
DM
1172}
1173EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1174
efbc368d
DM
1175static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1176{
1177 struct rtable *rt = (struct rtable *) dst;
1178
ceb33206
DM
1179 /* All IPV4 dsts are created with ->obsolete set to the value
1180 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1181 * into this function always.
1182 *
387aa65a
TT
1183 * When a PMTU/redirect information update invalidates a route,
1184 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1185 * DST_OBSOLETE_DEAD by dst_free().
ceb33206 1186 */
387aa65a 1187 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
efbc368d 1188 return NULL;
d11a4dc1 1189 return dst;
1da177e4
LT
1190}
1191
1da177e4
LT
1192static void ipv4_link_failure(struct sk_buff *skb)
1193{
1194 struct rtable *rt;
1195
1196 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1197
511c3f92 1198 rt = skb_rtable(skb);
5943634f
DM
1199 if (rt)
1200 dst_set_expires(&rt->dst, 0);
1da177e4
LT
1201}
1202
ede2059d 1203static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1da177e4 1204{
91df42be
JP
1205 pr_debug("%s: %pI4 -> %pI4, %s\n",
1206 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1207 skb->dev ? skb->dev->name : "?");
1da177e4 1208 kfree_skb(skb);
c378a9c0 1209 WARN_ON(1);
1da177e4
LT
1210 return 0;
1211}
1212
1213/*
1214 We do not cache source address of outgoing interface,
1215 because it is used only by IP RR, TS and SRR options,
1216 so that it out of fast path.
1217
1218 BTW remember: "addr" is allowed to be not aligned
1219 in IP options!
1220 */
1221
8e36360a 1222void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1da177e4 1223{
a61ced5d 1224 __be32 src;
1da177e4 1225
c7537967 1226 if (rt_is_output_route(rt))
c5be24ff 1227 src = ip_hdr(skb)->saddr;
ebc0ffae 1228 else {
8e36360a
DM
1229 struct fib_result res;
1230 struct flowi4 fl4;
1231 struct iphdr *iph;
1232
1233 iph = ip_hdr(skb);
1234
1235 memset(&fl4, 0, sizeof(fl4));
1236 fl4.daddr = iph->daddr;
1237 fl4.saddr = iph->saddr;
b0fe4a31 1238 fl4.flowi4_tos = RT_TOS(iph->tos);
8e36360a
DM
1239 fl4.flowi4_oif = rt->dst.dev->ifindex;
1240 fl4.flowi4_iif = skb->dev->ifindex;
1241 fl4.flowi4_mark = skb->mark;
5e2b61f7 1242
ebc0ffae 1243 rcu_read_lock();
0eeb075f 1244 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
436c3b66 1245 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
ebc0ffae 1246 else
f8126f1d
DM
1247 src = inet_select_addr(rt->dst.dev,
1248 rt_nexthop(rt, iph->daddr),
1249 RT_SCOPE_UNIVERSE);
ebc0ffae
ED
1250 rcu_read_unlock();
1251 }
1da177e4
LT
1252 memcpy(addr, &src, 4);
1253}
1254
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid.  The low and the high 16 bits are
 * handled independently, and each half is only filled in from @tag when
 * it is currently zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->dst.tclassid & low_half) == 0)
		rt->dst.tclassid |= tag & low_half;

	if ((rt->dst.tclassid & high_half) == 0)
		rt->dst.tclassid |= tag & high_half;
}
#endif
1264
0dbaee3b
DM
1265static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1266{
7ed14d97 1267 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
164a5e7a 1268 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
7ed14d97 1269 ip_rt_min_advmss);
0dbaee3b 1270
7ed14d97 1271 return min(advmss, IPV4_MAX_PMTU - header_size);
0dbaee3b
DM
1272}
1273
ebb762f2 1274static unsigned int ipv4_mtu(const struct dst_entry *dst)
d33e4553 1275{
261663b0 1276 const struct rtable *rt = (const struct rtable *) dst;
5943634f
DM
1277 unsigned int mtu = rt->rt_pmtu;
1278
98d75c37 1279 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
5943634f 1280 mtu = dst_metric_raw(dst, RTAX_MTU);
618f9bc7 1281
38d523e2 1282 if (mtu)
618f9bc7
SK
1283 return mtu;
1284
c780a049 1285 mtu = READ_ONCE(dst->dev->mtu);
d33e4553 1286
a9f829f7 1287 if (unlikely(ip_mtu_locked(dst))) {
155e8336 1288 if (rt->rt_uses_gateway && mtu > 576)
d33e4553
DM
1289 mtu = 576;
1290 }
1291
14972cbd
RP
1292 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1293
1294 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
d33e4553
DM
1295}
1296
054d7cb5
JA
1297static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1298{
1299 struct fnhe_hash_bucket *hash;
1300 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1301 u32 hval = fnhe_hashfun(daddr);
1302
1303 spin_lock_bh(&fnhe_lock);
1304
1305 hash = rcu_dereference_protected(nh->nh_exceptions,
1306 lockdep_is_held(&fnhe_lock));
1307 hash += hval;
1308
1309 fnhe_p = &hash->chain;
1310 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1311 while (fnhe) {
1312 if (fnhe->fnhe_daddr == daddr) {
1313 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1314 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1315 fnhe_flush_routes(fnhe);
1316 kfree_rcu(fnhe, rcu);
1317 break;
1318 }
1319 fnhe_p = &fnhe->fnhe_next;
1320 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1321 lockdep_is_held(&fnhe_lock));
1322 }
1323
1324 spin_unlock_bh(&fnhe_lock);
1325}
1326
f2bb4bed 1327static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
4895c771 1328{
caa41527 1329 struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
4895c771
DM
1330 struct fib_nh_exception *fnhe;
1331 u32 hval;
1332
f2bb4bed
DM
1333 if (!hash)
1334 return NULL;
1335
d3a25c98 1336 hval = fnhe_hashfun(daddr);
4895c771
DM
1337
1338 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1339 fnhe = rcu_dereference(fnhe->fnhe_next)) {
054d7cb5
JA
1340 if (fnhe->fnhe_daddr == daddr) {
1341 if (fnhe->fnhe_expires &&
1342 time_after(jiffies, fnhe->fnhe_expires)) {
1343 ip_del_fnhe(nh, daddr);
1344 break;
1345 }
f2bb4bed 1346 return fnhe;
054d7cb5 1347 }
f2bb4bed
DM
1348 }
1349 return NULL;
1350}
aee06da6 1351
caacf05e 1352static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
a4c2fd7f 1353 __be32 daddr, const bool do_cache)
f2bb4bed 1354{
caacf05e
DM
1355 bool ret = false;
1356
c5038a83 1357 spin_lock_bh(&fnhe_lock);
f2bb4bed 1358
c5038a83 1359 if (daddr == fnhe->fnhe_daddr) {
2ffae99d
TT
1360 struct rtable __rcu **porig;
1361 struct rtable *orig;
5aad1de5 1362 int genid = fnhe_genid(dev_net(rt->dst.dev));
2ffae99d
TT
1363
1364 if (rt_is_input_route(rt))
1365 porig = &fnhe->fnhe_rth_input;
1366 else
1367 porig = &fnhe->fnhe_rth_output;
1368 orig = rcu_dereference(*porig);
5aad1de5
TT
1369
1370 if (fnhe->fnhe_genid != genid) {
1371 fnhe->fnhe_genid = genid;
13d82bf5
SK
1372 fnhe->fnhe_gw = 0;
1373 fnhe->fnhe_pmtu = 0;
1374 fnhe->fnhe_expires = 0;
2ffae99d
TT
1375 fnhe_flush_routes(fnhe);
1376 orig = NULL;
13d82bf5 1377 }
387aa65a
TT
1378 fill_route_from_fnhe(rt, fnhe);
1379 if (!rt->rt_gateway)
155e8336 1380 rt->rt_gateway = daddr;
f2bb4bed 1381
a4c2fd7f 1382 if (do_cache) {
0830106c 1383 dst_hold(&rt->dst);
2ffae99d 1384 rcu_assign_pointer(*porig, rt);
0830106c 1385 if (orig) {
95c47f9c 1386 dst_dev_put(&orig->dst);
0830106c 1387 dst_release(&orig->dst);
0830106c 1388 }
2ffae99d
TT
1389 ret = true;
1390 }
c5038a83
DM
1391
1392 fnhe->fnhe_stamp = jiffies;
c5038a83
DM
1393 }
1394 spin_unlock_bh(&fnhe_lock);
caacf05e
DM
1395
1396 return ret;
54764bb6
ED
1397}
1398
caacf05e 1399static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
f2bb4bed 1400{
d26b3a7c 1401 struct rtable *orig, *prev, **p;
caacf05e 1402 bool ret = true;
f2bb4bed 1403
d26b3a7c 1404 if (rt_is_input_route(rt)) {
54764bb6 1405 p = (struct rtable **)&nh->nh_rth_input;
d26b3a7c 1406 } else {
903ceff7 1407 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
d26b3a7c 1408 }
f2bb4bed
DM
1409 orig = *p;
1410
0830106c
WW
1411 /* hold dst before doing cmpxchg() to avoid race condition
1412 * on this dst
1413 */
1414 dst_hold(&rt->dst);
f2bb4bed
DM
1415 prev = cmpxchg(p, orig, rt);
1416 if (prev == orig) {
0830106c 1417 if (orig) {
95c47f9c 1418 dst_dev_put(&orig->dst);
0830106c 1419 dst_release(&orig->dst);
0830106c
WW
1420 }
1421 } else {
1422 dst_release(&rt->dst);
caacf05e 1423 ret = false;
0830106c 1424 }
caacf05e
DM
1425
1426 return ret;
1427}
1428
5055c371
ED
1429struct uncached_list {
1430 spinlock_t lock;
1431 struct list_head head;
1432};
1433
1434static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
caacf05e 1435
31a5b09d 1436void rt_add_uncached_list(struct rtable *rt)
caacf05e 1437{
5055c371
ED
1438 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1439
1440 rt->rt_uncached_list = ul;
1441
1442 spin_lock_bh(&ul->lock);
1443 list_add_tail(&rt->rt_uncached, &ul->head);
1444 spin_unlock_bh(&ul->lock);
caacf05e
DM
1445}
1446
31a5b09d 1447void rt_del_uncached_list(struct rtable *rt)
caacf05e 1448{
78df76a0 1449 if (!list_empty(&rt->rt_uncached)) {
5055c371
ED
1450 struct uncached_list *ul = rt->rt_uncached_list;
1451
1452 spin_lock_bh(&ul->lock);
caacf05e 1453 list_del(&rt->rt_uncached);
5055c371 1454 spin_unlock_bh(&ul->lock);
caacf05e
DM
1455 }
1456}
1457
31a5b09d
XL
1458static void ipv4_dst_destroy(struct dst_entry *dst)
1459{
1460 struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1461 struct rtable *rt = (struct rtable *)dst;
1462
1463 if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1464 kfree(p);
1465
1466 rt_del_uncached_list(rt);
1467}
1468
caacf05e
DM
1469void rt_flush_dev(struct net_device *dev)
1470{
5055c371
ED
1471 struct net *net = dev_net(dev);
1472 struct rtable *rt;
1473 int cpu;
1474
1475 for_each_possible_cpu(cpu) {
1476 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
caacf05e 1477
5055c371
ED
1478 spin_lock_bh(&ul->lock);
1479 list_for_each_entry(rt, &ul->head, rt_uncached) {
caacf05e
DM
1480 if (rt->dst.dev != dev)
1481 continue;
1482 rt->dst.dev = net->loopback_dev;
1483 dev_hold(rt->dst.dev);
1484 dev_put(dev);
1485 }
5055c371 1486 spin_unlock_bh(&ul->lock);
4895c771
DM
1487 }
1488}
1489
4331debc 1490static bool rt_cache_valid(const struct rtable *rt)
d2d68ba9 1491{
4331debc
ED
1492 return rt &&
1493 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1494 !rt_is_expired(rt);
d2d68ba9
DM
1495}
1496
f2bb4bed 1497static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
5e2b61f7 1498 const struct fib_result *res,
f2bb4bed 1499 struct fib_nh_exception *fnhe,
a4c2fd7f
WW
1500 struct fib_info *fi, u16 type, u32 itag,
1501 const bool do_cache)
1da177e4 1502{
caacf05e
DM
1503 bool cached = false;
1504
1da177e4 1505 if (fi) {
4895c771
DM
1506 struct fib_nh *nh = &FIB_RES_NH(*res);
1507
155e8336 1508 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
4895c771 1509 rt->rt_gateway = nh->nh_gw;
155e8336
JA
1510 rt->rt_uses_gateway = 1;
1511 }
3fb07daf
ED
1512 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1513 if (fi->fib_metrics != &dst_default_metrics) {
1514 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
9620fef2 1515 refcount_inc(&fi->fib_metrics->refcnt);
3fb07daf 1516 }
c7066f70 1517#ifdef CONFIG_IP_ROUTE_CLASSID
f2bb4bed 1518 rt->dst.tclassid = nh->nh_tclassid;
1da177e4 1519#endif
61adedf3 1520 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
c5038a83 1521 if (unlikely(fnhe))
a4c2fd7f
WW
1522 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1523 else if (do_cache)
caacf05e 1524 cached = rt_cache_route(nh, rt);
155e8336
JA
1525 if (unlikely(!cached)) {
1526 /* Routes we intend to cache in nexthop exception or
1527 * FIB nexthop have the DST_NOCACHE bit clear.
1528 * However, if we are unsuccessful at storing this
1529 * route into the cache we really need to set it.
1530 */
155e8336
JA
1531 if (!rt->rt_gateway)
1532 rt->rt_gateway = daddr;
1533 rt_add_uncached_list(rt);
1534 }
1535 } else
caacf05e 1536 rt_add_uncached_list(rt);
defb3519 1537
c7066f70 1538#ifdef CONFIG_IP_ROUTE_CLASSID
1da177e4 1539#ifdef CONFIG_IP_MULTIPLE_TABLES
85b91b03 1540 set_class_tag(rt, res->tclassid);
1da177e4
LT
1541#endif
1542 set_class_tag(rt, itag);
1543#endif
1da177e4
LT
1544}
1545
9ab179d8
DA
1546struct rtable *rt_dst_alloc(struct net_device *dev,
1547 unsigned int flags, u16 type,
1548 bool nopolicy, bool noxfrm, bool will_cache)
0c4dcd58 1549{
d08c4f35
DA
1550 struct rtable *rt;
1551
1552 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
a4c2fd7f 1553 (will_cache ? 0 : DST_HOST) |
d08c4f35 1554 (nopolicy ? DST_NOPOLICY : 0) |
b2a9c0ed 1555 (noxfrm ? DST_NOXFRM : 0));
d08c4f35
DA
1556
1557 if (rt) {
1558 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1559 rt->rt_flags = flags;
1560 rt->rt_type = type;
1561 rt->rt_is_input = 0;
1562 rt->rt_iif = 0;
1563 rt->rt_pmtu = 0;
a9f829f7 1564 rt->rt_mtu_locked = 0;
d08c4f35
DA
1565 rt->rt_gateway = 0;
1566 rt->rt_uses_gateway = 0;
b7503e0c 1567 rt->rt_table_id = 0;
d08c4f35
DA
1568 INIT_LIST_HEAD(&rt->rt_uncached);
1569
1570 rt->dst.output = ip_output;
1571 if (flags & RTCF_LOCAL)
1572 rt->dst.input = ip_local_deliver;
1573 }
1574
1575 return rt;
0c4dcd58 1576}
9ab179d8 1577EXPORT_SYMBOL(rt_dst_alloc);
0c4dcd58 1578
96d36220 1579/* called in rcu_read_lock() section */
bc044e8d
PA
1580int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1581 u8 tos, struct net_device *dev,
1582 struct in_device *in_dev, u32 *itag)
1da177e4 1583{
b5f7e755 1584 int err;
1da177e4
LT
1585
1586 /* Primary sanity checks. */
51456b29 1587 if (!in_dev)
1da177e4
LT
1588 return -EINVAL;
1589
1e637c74 1590 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
d0daebc3 1591 skb->protocol != htons(ETH_P_IP))
bc044e8d 1592 return -EINVAL;
1da177e4 1593
75fea73d 1594 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
bc044e8d 1595 return -EINVAL;
d0daebc3 1596
f97c1e0c
JP
1597 if (ipv4_is_zeronet(saddr)) {
1598 if (!ipv4_is_local_multicast(daddr))
bc044e8d 1599 return -EINVAL;
b5f7e755 1600 } else {
9e56e380 1601 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
bc044e8d 1602 in_dev, itag);
b5f7e755 1603 if (err < 0)
bc044e8d 1604 return err;
b5f7e755 1605 }
bc044e8d
PA
1606 return 0;
1607}
1608
1609/* called in rcu_read_lock() section */
1610static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1611 u8 tos, struct net_device *dev, int our)
1612{
1613 struct in_device *in_dev = __in_dev_get_rcu(dev);
1614 unsigned int flags = RTCF_MULTICAST;
1615 struct rtable *rth;
1616 u32 itag = 0;
1617 int err;
1618
1619 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1620 if (err)
1621 return err;
1622
d08c4f35
DA
1623 if (our)
1624 flags |= RTCF_LOCAL;
1625
1626 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
f2bb4bed 1627 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1da177e4 1628 if (!rth)
bc044e8d 1629 return -ENOBUFS;
1da177e4 1630
cf911662
DM
1631#ifdef CONFIG_IP_ROUTE_CLASSID
1632 rth->dst.tclassid = itag;
1633#endif
d8d1f30b 1634 rth->dst.output = ip_rt_bug;
9917e1e8 1635 rth->rt_is_input= 1;
1da177e4
LT
1636
1637#ifdef CONFIG_IP_MROUTE
f97c1e0c 1638 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
d8d1f30b 1639 rth->dst.input = ip_mr_input;
1da177e4
LT
1640#endif
1641 RT_CACHE_STAT_INC(in_slow_mc);
1642
89aef892
DM
1643 skb_dst_set(skb, &rth->dst);
1644 return 0;
1da177e4
LT
1645}
1646
1647
1648static void ip_handle_martian_source(struct net_device *dev,
1649 struct in_device *in_dev,
1650 struct sk_buff *skb,
9e12bb22
AV
1651 __be32 daddr,
1652 __be32 saddr)
1da177e4
LT
1653{
1654 RT_CACHE_STAT_INC(in_martian_src);
1655#ifdef CONFIG_IP_ROUTE_VERBOSE
1656 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1657 /*
1658 * RFC1812 recommendation, if source is martian,
1659 * the only hint is MAC header.
1660 */
058bd4d2 1661 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
673d57e7 1662 &daddr, &saddr, dev->name);
98e399f8 1663 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
058bd4d2
JP
1664 print_hex_dump(KERN_WARNING, "ll header: ",
1665 DUMP_PREFIX_OFFSET, 16, 1,
1666 skb_mac_header(skb),
1667 dev->hard_header_len, true);
1da177e4
LT
1668 }
1669 }
1670#endif
1671}
1672
efd85700
TG
1673static void set_lwt_redirect(struct rtable *rth)
1674{
1675 if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1676 rth->dst.lwtstate->orig_output = rth->dst.output;
1677 rth->dst.output = lwtunnel_output;
1678 }
1679
1680 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1681 rth->dst.lwtstate->orig_input = rth->dst.input;
1682 rth->dst.input = lwtunnel_input;
1683 }
1684}
1685
47360228 1686/* called in rcu_read_lock() section */
5969f71d 1687static int __mkroute_input(struct sk_buff *skb,
982721f3 1688 const struct fib_result *res,
5969f71d 1689 struct in_device *in_dev,
c6cffba4 1690 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1691{
2ffae99d 1692 struct fib_nh_exception *fnhe;
1da177e4
LT
1693 struct rtable *rth;
1694 int err;
1695 struct in_device *out_dev;
d2d68ba9 1696 bool do_cache;
fbdc0ad0 1697 u32 itag = 0;
1da177e4
LT
1698
1699 /* get a working reference to the output device */
47360228 1700 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
51456b29 1701 if (!out_dev) {
e87cc472 1702 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1da177e4
LT
1703 return -EINVAL;
1704 }
1705
5c04c819 1706 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
9e56e380 1707 in_dev->dev, in_dev, &itag);
1da177e4 1708 if (err < 0) {
e905a9ed 1709 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1da177e4 1710 saddr);
e905a9ed 1711
1da177e4
LT
1712 goto cleanup;
1713 }
1714
e81da0e1
JA
1715 do_cache = res->fi && !itag;
1716 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
df4d9254 1717 skb->protocol == htons(ETH_P_IP) &&
1da177e4 1718 (IN_DEV_SHARED_MEDIA(out_dev) ||
df4d9254
HFS
1719 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1720 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1da177e4
LT
1721
1722 if (skb->protocol != htons(ETH_P_IP)) {
1723 /* Not IP (i.e. ARP). Do not create route, if it is
1724 * invalid for proxy arp. DNAT routes are always valid.
65324144
JDB
1725 *
1726 * Proxy arp feature have been extended to allow, ARP
1727 * replies back to the same interface, to support
1728 * Private VLAN switch technologies. See arp.c.
1da177e4 1729 */
65324144
JDB
1730 if (out_dev == in_dev &&
1731 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1da177e4
LT
1732 err = -EINVAL;
1733 goto cleanup;
1734 }
1735 }
1736
2ffae99d 1737 fnhe = find_exception(&FIB_RES_NH(*res), daddr);
e81da0e1 1738 if (do_cache) {
054d7cb5 1739 if (fnhe)
2ffae99d 1740 rth = rcu_dereference(fnhe->fnhe_rth_input);
054d7cb5
JA
1741 else
1742 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
e81da0e1
JA
1743 if (rt_cache_valid(rth)) {
1744 skb_dst_set_noref(skb, &rth->dst);
1745 goto out;
d2d68ba9
DM
1746 }
1747 }
f2bb4bed 1748
d08c4f35 1749 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
5c1e6aa3 1750 IN_DEV_CONF_GET(in_dev, NOPOLICY),
d2d68ba9 1751 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1da177e4
LT
1752 if (!rth) {
1753 err = -ENOBUFS;
1754 goto cleanup;
1755 }
1756
9917e1e8 1757 rth->rt_is_input = 1;
b7503e0c
DA
1758 if (res->table)
1759 rth->rt_table_id = res->table->tb_id;
a6254864 1760 RT_CACHE_STAT_INC(in_slow_tot);
1da177e4 1761
d8d1f30b 1762 rth->dst.input = ip_forward;
1da177e4 1763
a4c2fd7f
WW
1764 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1765 do_cache);
efd85700 1766 set_lwt_redirect(rth);
c6cffba4 1767 skb_dst_set(skb, &rth->dst);
d2d68ba9 1768out:
1da177e4
LT
1769 err = 0;
1770 cleanup:
1da177e4 1771 return err;
e905a9ed 1772}
1da177e4 1773
79a13159 1774#ifdef CONFIG_IP_ROUTE_MULTIPATH
79a13159 1775/* To make ICMP packets follow the right flow, the multipath hash is
bf4e0a3d 1776 * calculated from the inner IP addresses.
79a13159 1777 */
bf4e0a3d
NA
1778static void ip_multipath_l3_keys(const struct sk_buff *skb,
1779 struct flow_keys *hash_keys)
79a13159
PN
1780{
1781 const struct iphdr *outer_iph = ip_hdr(skb);
bf4e0a3d 1782 const struct iphdr *inner_iph;
79a13159
PN
1783 const struct icmphdr *icmph;
1784 struct iphdr _inner_iph;
bf4e0a3d
NA
1785 struct icmphdr _icmph;
1786
1787 hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1788 hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1789 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1790 return;
79a13159
PN
1791
1792 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
bf4e0a3d 1793 return;
79a13159
PN
1794
1795 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1796 &_icmph);
1797 if (!icmph)
bf4e0a3d 1798 return;
79a13159
PN
1799
1800 if (icmph->type != ICMP_DEST_UNREACH &&
1801 icmph->type != ICMP_REDIRECT &&
1802 icmph->type != ICMP_TIME_EXCEEDED &&
bf4e0a3d
NA
1803 icmph->type != ICMP_PARAMETERPROB)
1804 return;
79a13159
PN
1805
1806 inner_iph = skb_header_pointer(skb,
1807 outer_iph->ihl * 4 + sizeof(_icmph),
1808 sizeof(_inner_iph), &_inner_iph);
1809 if (!inner_iph)
bf4e0a3d
NA
1810 return;
1811 hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1812 hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1813}
79a13159 1814
bf4e0a3d
NA
1815/* if skb is set it will be used and fl4 can be NULL */
1816int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1817 const struct sk_buff *skb)
1818{
1819 struct net *net = fi->fib_net;
1820 struct flow_keys hash_keys;
1821 u32 mhash;
79a13159 1822
bf4e0a3d
NA
1823 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1824 case 0:
1825 memset(&hash_keys, 0, sizeof(hash_keys));
1826 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1827 if (skb) {
1828 ip_multipath_l3_keys(skb, &hash_keys);
1829 } else {
1830 hash_keys.addrs.v4addrs.src = fl4->saddr;
1831 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1832 }
1833 break;
1834 case 1:
1835 /* skb is currently provided only when forwarding */
1836 if (skb) {
1837 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1838 struct flow_keys keys;
1839
1840 /* short-circuit if we already have L4 hash present */
1841 if (skb->l4_hash)
1842 return skb_get_hash_raw(skb) >> 1;
1843 memset(&hash_keys, 0, sizeof(hash_keys));
1844 skb_flow_dissect_flow_keys(skb, &keys, flag);
d41928b3
DA
1845
1846 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
bf4e0a3d
NA
1847 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1848 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1849 hash_keys.ports.src = keys.ports.src;
1850 hash_keys.ports.dst = keys.ports.dst;
1851 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1852 } else {
1853 memset(&hash_keys, 0, sizeof(hash_keys));
1854 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1855 hash_keys.addrs.v4addrs.src = fl4->saddr;
1856 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1857 hash_keys.ports.src = fl4->fl4_sport;
1858 hash_keys.ports.dst = fl4->fl4_dport;
1859 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1860 }
1861 break;
1862 }
1863 mhash = flow_hash_from_keys(&hash_keys);
79a13159 1864
bf4e0a3d
NA
1865 return mhash >> 1;
1866}
1867EXPORT_SYMBOL_GPL(fib_multipath_hash);
79a13159
PN
1868#endif /* CONFIG_IP_ROUTE_MULTIPATH */
1869
5969f71d
SH
1870static int ip_mkroute_input(struct sk_buff *skb,
1871 struct fib_result *res,
5969f71d
SH
1872 struct in_device *in_dev,
1873 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1874{
1da177e4 1875#ifdef CONFIG_IP_ROUTE_MULTIPATH
0e884c78 1876 if (res->fi && res->fi->fib_nhs > 1) {
bf4e0a3d 1877 int h = fib_multipath_hash(res->fi, NULL, skb);
0e884c78 1878
0e884c78
PN
1879 fib_select_multipath(res, h);
1880 }
1da177e4
LT
1881#endif
1882
1883 /* create a routing cache entry */
c6cffba4 1884 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1da177e4
LT
1885}
1886
1da177e4
LT
1887/*
1888 * NOTE. We drop all the packets that has local source
1889 * addresses, because every properly looped back packet
1890 * must have correct destination already attached by output routine.
1891 *
1892 * Such approach solves two big problems:
1893 * 1. Not simplex devices are handled properly.
1894 * 2. IP spoofing attempts are filtered with 100% of guarantee.
ebc0ffae 1895 * called with rcu_read_lock()
1da177e4
LT
1896 */
1897
9e12bb22 1898static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
5510cdf7
DA
1899 u8 tos, struct net_device *dev,
1900 struct fib_result *res)
1da177e4 1901{
96d36220 1902 struct in_device *in_dev = __in_dev_get_rcu(dev);
1b7179d3 1903 struct ip_tunnel_info *tun_info;
68a5e3dd 1904 struct flowi4 fl4;
95c96174 1905 unsigned int flags = 0;
1da177e4 1906 u32 itag = 0;
95c96174 1907 struct rtable *rth;
1da177e4 1908 int err = -EINVAL;
5e73ea1a 1909 struct net *net = dev_net(dev);
d2d68ba9 1910 bool do_cache;
1da177e4
LT
1911
1912 /* IP on this device is disabled. */
1913
1914 if (!in_dev)
1915 goto out;
1916
1917 /* Check for the most weird martians, which can be not detected
1918 by fib_lookup.
1919 */
1920
61adedf3 1921 tun_info = skb_tunnel_info(skb);
46fa062a 1922 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1b7179d3
TG
1923 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1924 else
1925 fl4.flowi4_tun_key.tun_id = 0;
f38a9eb1
TG
1926 skb_dst_drop(skb);
1927
d0daebc3 1928 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1da177e4
LT
1929 goto martian_source;
1930
5510cdf7
DA
1931 res->fi = NULL;
1932 res->table = NULL;
27a954bd 1933 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1da177e4
LT
1934 goto brd_input;
1935
1936 /* Accept zero addresses only to limited broadcast;
1937 * I even do not know to fix it or not. Waiting for complains :-)
1938 */
f97c1e0c 1939 if (ipv4_is_zeronet(saddr))
1da177e4
LT
1940 goto martian_source;
1941
d0daebc3 1942 if (ipv4_is_zeronet(daddr))
1da177e4
LT
1943 goto martian_destination;
1944
9eb43e76
ED
1945 /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1946 * and call it once if daddr or/and saddr are loopback addresses
1947 */
1948 if (ipv4_is_loopback(daddr)) {
1949 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3 1950 goto martian_destination;
9eb43e76
ED
1951 } else if (ipv4_is_loopback(saddr)) {
1952 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3
TG
1953 goto martian_source;
1954 }
1955
1da177e4
LT
1956 /*
1957 * Now we are ready to route packet.
1958 */
68a5e3dd 1959 fl4.flowi4_oif = 0;
e0d56fdd 1960 fl4.flowi4_iif = dev->ifindex;
68a5e3dd
DM
1961 fl4.flowi4_mark = skb->mark;
1962 fl4.flowi4_tos = tos;
1963 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
b84f7878 1964 fl4.flowi4_flags = 0;
68a5e3dd
DM
1965 fl4.daddr = daddr;
1966 fl4.saddr = saddr;
8bcfd092 1967 fl4.flowi4_uid = sock_net_uid(net, NULL);
5510cdf7 1968 err = fib_lookup(net, &fl4, res, 0);
cd0f0b95
DJ
1969 if (err != 0) {
1970 if (!IN_DEV_FORWARD(in_dev))
1971 err = -EHOSTUNREACH;
1da177e4 1972 goto no_route;
cd0f0b95 1973 }
1da177e4 1974
5510cdf7 1975 if (res->type == RTN_BROADCAST)
1da177e4
LT
1976 goto brd_input;
1977
5510cdf7 1978 if (res->type == RTN_LOCAL) {
5c04c819 1979 err = fib_validate_source(skb, saddr, daddr, tos,
0d5edc68 1980 0, dev, in_dev, &itag);
b5f7e755 1981 if (err < 0)
0d753960 1982 goto martian_source;
1da177e4
LT
1983 goto local_input;
1984 }
1985
cd0f0b95
DJ
1986 if (!IN_DEV_FORWARD(in_dev)) {
1987 err = -EHOSTUNREACH;
251da413 1988 goto no_route;
cd0f0b95 1989 }
5510cdf7 1990 if (res->type != RTN_UNICAST)
1da177e4
LT
1991 goto martian_destination;
1992
5510cdf7 1993 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1da177e4
LT
1994out: return err;
1995
1996brd_input:
1997 if (skb->protocol != htons(ETH_P_IP))
1998 goto e_inval;
1999
41347dcd 2000 if (!ipv4_is_zeronet(saddr)) {
9e56e380
DM
2001 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2002 in_dev, &itag);
1da177e4 2003 if (err < 0)
0d753960 2004 goto martian_source;
1da177e4
LT
2005 }
2006 flags |= RTCF_BROADCAST;
5510cdf7 2007 res->type = RTN_BROADCAST;
1da177e4
LT
2008 RT_CACHE_STAT_INC(in_brd);
2009
2010local_input:
d2d68ba9 2011 do_cache = false;
5510cdf7 2012 if (res->fi) {
fe3edf45 2013 if (!itag) {
5510cdf7 2014 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
d2d68ba9 2015 if (rt_cache_valid(rth)) {
c6cffba4
DM
2016 skb_dst_set_noref(skb, &rth->dst);
2017 err = 0;
2018 goto out;
d2d68ba9
DM
2019 }
2020 do_cache = true;
2021 }
2022 }
2023
f5a0aab8 2024 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
5510cdf7 2025 flags | RTCF_LOCAL, res->type,
d2d68ba9 2026 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1da177e4
LT
2027 if (!rth)
2028 goto e_nobufs;
2029
d8d1f30b 2030 rth->dst.output= ip_rt_bug;
cf911662
DM
2031#ifdef CONFIG_IP_ROUTE_CLASSID
2032 rth->dst.tclassid = itag;
2033#endif
9917e1e8 2034 rth->rt_is_input = 1;
5510cdf7
DA
2035 if (res->table)
2036 rth->rt_table_id = res->table->tb_id;
571e7226 2037
a6254864 2038 RT_CACHE_STAT_INC(in_slow_tot);
5510cdf7 2039 if (res->type == RTN_UNREACHABLE) {
d8d1f30b
CG
2040 rth->dst.input= ip_error;
2041 rth->dst.error= -err;
1da177e4
LT
2042 rth->rt_flags &= ~RTCF_LOCAL;
2043 }
efd85700 2044
dcdfdf56 2045 if (do_cache) {
5510cdf7 2046 struct fib_nh *nh = &FIB_RES_NH(*res);
efd85700
TG
2047
2048 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2049 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2050 WARN_ON(rth->dst.input == lwtunnel_input);
2051 rth->dst.lwtstate->orig_input = rth->dst.input;
2052 rth->dst.input = lwtunnel_input;
2053 }
2054
a4c2fd7f 2055 if (unlikely(!rt_cache_route(nh, rth)))
dcdfdf56 2056 rt_add_uncached_list(rth);
dcdfdf56 2057 }
89aef892 2058 skb_dst_set(skb, &rth->dst);
b23dd4fe 2059 err = 0;
ebc0ffae 2060 goto out;
1da177e4
LT
2061
2062no_route:
2063 RT_CACHE_STAT_INC(in_no_route);
5510cdf7
DA
2064 res->type = RTN_UNREACHABLE;
2065 res->fi = NULL;
2066 res->table = NULL;
1da177e4
LT
2067 goto local_input;
2068
2069 /*
2070 * Do not cache martian addresses: they should be logged (RFC1812)
2071 */
2072martian_destination:
2073 RT_CACHE_STAT_INC(in_martian_dst);
2074#ifdef CONFIG_IP_ROUTE_VERBOSE
e87cc472
JP
2075 if (IN_DEV_LOG_MARTIANS(in_dev))
2076 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2077 &daddr, &saddr, dev->name);
1da177e4 2078#endif
2c2910a4 2079
1da177e4
LT
2080e_inval:
2081 err = -EINVAL;
ebc0ffae 2082 goto out;
1da177e4
LT
2083
2084e_nobufs:
2085 err = -ENOBUFS;
ebc0ffae 2086 goto out;
1da177e4
LT
2087
2088martian_source:
2089 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
ebc0ffae 2090 goto out;
1da177e4
LT
2091}
2092
c6cffba4
DM
2093int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2094 u8 tos, struct net_device *dev)
1da177e4 2095{
5510cdf7
DA
2096 struct fib_result res;
2097 int err;
1da177e4 2098
6e28099d 2099 tos &= IPTOS_RT_MASK;
96d36220 2100 rcu_read_lock();
5510cdf7
DA
2101 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2102 rcu_read_unlock();
96d36220 2103
5510cdf7
DA
2104 return err;
2105}
2106EXPORT_SYMBOL(ip_route_input_noref);
2107
2108/* called with rcu_read_lock held */
2109int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2110 u8 tos, struct net_device *dev, struct fib_result *res)
2111{
1da177e4
LT
2112 /* Multicast recognition logic is moved from route cache to here.
2113 The problem was that too many Ethernet cards have broken/missing
2114 hardware multicast filters :-( As result the host on multicasting
2115 network acquires a lot of useless route cache entries, sort of
2116 SDR messages from all the world. Now we try to get rid of them.
2117 Really, provided software IP multicast filter is organized
2118 reasonably (at least, hashed), it does not result in a slowdown
2119 comparing with route cache reject entries.
2120 Note, that multicast routers are not affected, because
2121 route cache entry is created eventually.
2122 */
f97c1e0c 2123 if (ipv4_is_multicast(daddr)) {
96d36220 2124 struct in_device *in_dev = __in_dev_get_rcu(dev);
e58e4159 2125 int our = 0;
5510cdf7 2126 int err = -EINVAL;
1da177e4 2127
e58e4159
DA
2128 if (in_dev)
2129 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2130 ip_hdr(skb)->protocol);
2131
2132 /* check l3 master if no match yet */
2133 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2134 struct in_device *l3_in_dev;
2135
2136 l3_in_dev = __in_dev_get_rcu(skb->dev);
2137 if (l3_in_dev)
2138 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2139 ip_hdr(skb)->protocol);
2140 }
2141
e58e4159 2142 if (our
1da177e4 2143#ifdef CONFIG_IP_MROUTE
e58e4159
DA
2144 ||
2145 (!ipv4_is_local_multicast(daddr) &&
2146 IN_DEV_MFORWARD(in_dev))
1da177e4 2147#endif
e58e4159 2148 ) {
5510cdf7 2149 err = ip_route_input_mc(skb, daddr, saddr,
e58e4159 2150 tos, dev, our);
1da177e4 2151 }
5510cdf7 2152 return err;
1da177e4 2153 }
5510cdf7
DA
2154
2155 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
1da177e4
LT
2156}
2157
ebc0ffae 2158/* called with rcu_read_lock() */
982721f3 2159static struct rtable *__mkroute_output(const struct fib_result *res,
1a00fee4 2160 const struct flowi4 *fl4, int orig_oif,
f61759e6 2161 struct net_device *dev_out,
5ada5527 2162 unsigned int flags)
1da177e4 2163{
982721f3 2164 struct fib_info *fi = res->fi;
f2bb4bed 2165 struct fib_nh_exception *fnhe;
5ada5527 2166 struct in_device *in_dev;
982721f3 2167 u16 type = res->type;
5ada5527 2168 struct rtable *rth;
c92b9655 2169 bool do_cache;
1da177e4 2170
d0daebc3
TG
2171 in_dev = __in_dev_get_rcu(dev_out);
2172 if (!in_dev)
5ada5527 2173 return ERR_PTR(-EINVAL);
1da177e4 2174
d0daebc3 2175 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
5f02ce24
DA
2176 if (ipv4_is_loopback(fl4->saddr) &&
2177 !(dev_out->flags & IFF_LOOPBACK) &&
2178 !netif_is_l3_master(dev_out))
d0daebc3
TG
2179 return ERR_PTR(-EINVAL);
2180
68a5e3dd 2181 if (ipv4_is_lbcast(fl4->daddr))
982721f3 2182 type = RTN_BROADCAST;
68a5e3dd 2183 else if (ipv4_is_multicast(fl4->daddr))
982721f3 2184 type = RTN_MULTICAST;
68a5e3dd 2185 else if (ipv4_is_zeronet(fl4->daddr))
5ada5527 2186 return ERR_PTR(-EINVAL);
1da177e4
LT
2187
2188 if (dev_out->flags & IFF_LOOPBACK)
2189 flags |= RTCF_LOCAL;
2190
63617421 2191 do_cache = true;
982721f3 2192 if (type == RTN_BROADCAST) {
1da177e4 2193 flags |= RTCF_BROADCAST | RTCF_LOCAL;
982721f3
DM
2194 fi = NULL;
2195 } else if (type == RTN_MULTICAST) {
dd28d1a0 2196 flags |= RTCF_MULTICAST | RTCF_LOCAL;
813b3b5d
DM
2197 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2198 fl4->flowi4_proto))
1da177e4 2199 flags &= ~RTCF_LOCAL;
63617421
JA
2200 else
2201 do_cache = false;
1da177e4 2202 /* If multicast route do not exist use
dd28d1a0
ED
2203 * default one, but do not gateway in this case.
2204 * Yes, it is hack.
1da177e4 2205 */
982721f3
DM
2206 if (fi && res->prefixlen < 4)
2207 fi = NULL;
d6d5e999
CF
2208 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2209 (orig_oif != dev_out->ifindex)) {
2210 /* For local routes that require a particular output interface
2211 * we do not want to cache the result. Caching the result
2212 * causes incorrect behaviour when there are multiple source
2213 * addresses on the interface, the end result being that if the
2214 * intended recipient is waiting on that interface for the
2215 * packet he won't receive it because it will be delivered on
2216 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2217 * be set to the loopback interface as well.
2218 */
054d7cb5 2219 do_cache = false;
1da177e4
LT
2220 }
2221
f2bb4bed 2222 fnhe = NULL;
63617421 2223 do_cache &= fi != NULL;
054d7cb5 2224 if (fi) {
c5038a83 2225 struct rtable __rcu **prth;
c92b9655 2226 struct fib_nh *nh = &FIB_RES_NH(*res);
d26b3a7c 2227
c92b9655 2228 fnhe = find_exception(nh, fl4->daddr);
054d7cb5
JA
2229 if (!do_cache)
2230 goto add;
deed49df 2231 if (fnhe) {
2ffae99d 2232 prth = &fnhe->fnhe_rth_output;
054d7cb5
JA
2233 } else {
2234 if (unlikely(fl4->flowi4_flags &
2235 FLOWI_FLAG_KNOWN_NH &&
2236 !(nh->nh_gw &&
2237 nh->nh_scope == RT_SCOPE_LINK))) {
2238 do_cache = false;
2239 goto add;
c92b9655 2240 }
054d7cb5 2241 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
c92b9655 2242 }
c5038a83 2243 rth = rcu_dereference(*prth);
9df16efa 2244 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
c5038a83 2245 return rth;
f2bb4bed 2246 }
c92b9655
JA
2247
2248add:
d08c4f35 2249 rth = rt_dst_alloc(dev_out, flags, type,
5c1e6aa3 2250 IN_DEV_CONF_GET(in_dev, NOPOLICY),
f2bb4bed 2251 IN_DEV_CONF_GET(in_dev, NOXFRM),
c92b9655 2252 do_cache);
8391d07b 2253 if (!rth)
5ada5527 2254 return ERR_PTR(-ENOBUFS);
8391d07b 2255
9438c871 2256 rth->rt_iif = orig_oif;
b7503e0c
DA
2257 if (res->table)
2258 rth->rt_table_id = res->table->tb_id;
2259
1da177e4
LT
2260 RT_CACHE_STAT_INC(out_slow_tot);
2261
1da177e4 2262 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
e905a9ed 2263 if (flags & RTCF_LOCAL &&
1da177e4 2264 !(dev_out->flags & IFF_LOOPBACK)) {
d8d1f30b 2265 rth->dst.output = ip_mc_output;
1da177e4
LT
2266 RT_CACHE_STAT_INC(out_slow_mc);
2267 }
2268#ifdef CONFIG_IP_MROUTE
982721f3 2269 if (type == RTN_MULTICAST) {
1da177e4 2270 if (IN_DEV_MFORWARD(in_dev) &&
813b3b5d 2271 !ipv4_is_local_multicast(fl4->daddr)) {
d8d1f30b
CG
2272 rth->dst.input = ip_mr_input;
2273 rth->dst.output = ip_mc_output;
1da177e4
LT
2274 }
2275 }
2276#endif
2277 }
2278
a4c2fd7f 2279 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
efd85700 2280 set_lwt_redirect(rth);
1da177e4 2281
5ada5527 2282 return rth;
1da177e4
LT
2283}
2284
1da177e4
LT
2285/*
2286 * Major route resolver routine.
2287 */
2288
3abd1ade
DA
2289struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2290 const struct sk_buff *skb)
1da177e4 2291{
f61759e6 2292 __u8 tos = RT_FL_TOS(fl4);
26883b0d
ED
2293 struct fib_result res = {
2294 .type = RTN_UNSPEC,
2295 .fi = NULL,
2296 .table = NULL,
2297 .tclassid = 0,
2298 };
5ada5527 2299 struct rtable *rth;
1da177e4 2300
1fb9489b 2301 fl4->flowi4_iif = LOOPBACK_IFINDEX;
813b3b5d
DM
2302 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2303 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2304 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
44713b67 2305
010c2708 2306 rcu_read_lock();
3abd1ade
DA
2307 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2308 rcu_read_unlock();
2309
2310 return rth;
2311}
2312EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2313
2314struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2315 struct fib_result *res,
2316 const struct sk_buff *skb)
2317{
2318 struct net_device *dev_out = NULL;
2319 int orig_oif = fl4->flowi4_oif;
2320 unsigned int flags = 0;
2321 struct rtable *rth;
2322 int err = -ENETUNREACH;
2323
813b3b5d 2324 if (fl4->saddr) {
b23dd4fe 2325 rth = ERR_PTR(-EINVAL);
813b3b5d
DM
2326 if (ipv4_is_multicast(fl4->saddr) ||
2327 ipv4_is_lbcast(fl4->saddr) ||
2328 ipv4_is_zeronet(fl4->saddr))
1da177e4
LT
2329 goto out;
2330
1da177e4
LT
2331 /* I removed check for oif == dev_out->oif here.
2332 It was wrong for two reasons:
1ab35276
DL
2333 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2334 is assigned to multiple interfaces.
1da177e4
LT
2335 2. Moreover, we are allowed to send packets with saddr
2336 of another iface. --ANK
2337 */
2338
813b3b5d
DM
2339 if (fl4->flowi4_oif == 0 &&
2340 (ipv4_is_multicast(fl4->daddr) ||
2341 ipv4_is_lbcast(fl4->daddr))) {
a210d01a 2342 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2343 dev_out = __ip_dev_find(net, fl4->saddr, false);
51456b29 2344 if (!dev_out)
a210d01a
JA
2345 goto out;
2346
1da177e4
LT
2347 /* Special hack: user can direct multicasts
2348 and limited broadcast via necessary interface
2349 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2350 This hack is not just for fun, it allows
2351 vic,vat and friends to work.
2352 They bind socket to loopback, set ttl to zero
2353 and expect that it will work.
2354 From the viewpoint of routing cache they are broken,
2355 because we are not allowed to build multicast path
2356 with loopback source addr (look, routing cache
2357 cannot know, that ttl is zero, so that packet
2358 will not leave this host and route is valid).
2359 Luckily, this hack is good workaround.
2360 */
2361
813b3b5d 2362 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2363 goto make_route;
2364 }
a210d01a 2365
813b3b5d 2366 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
a210d01a 2367 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2368 if (!__ip_dev_find(net, fl4->saddr, false))
a210d01a 2369 goto out;
a210d01a 2370 }
1da177e4
LT
2371 }
2372
2373
813b3b5d
DM
2374 if (fl4->flowi4_oif) {
2375 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
b23dd4fe 2376 rth = ERR_PTR(-ENODEV);
51456b29 2377 if (!dev_out)
1da177e4 2378 goto out;
e5ed6399
HX
2379
2380 /* RACE: Check return value of inet_select_addr instead. */
fc75fc83 2381 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
b23dd4fe 2382 rth = ERR_PTR(-ENETUNREACH);
fc75fc83
ED
2383 goto out;
2384 }
813b3b5d 2385 if (ipv4_is_local_multicast(fl4->daddr) ||
6a211654
AL
2386 ipv4_is_lbcast(fl4->daddr) ||
2387 fl4->flowi4_proto == IPPROTO_IGMP) {
813b3b5d
DM
2388 if (!fl4->saddr)
2389 fl4->saddr = inet_select_addr(dev_out, 0,
2390 RT_SCOPE_LINK);
1da177e4
LT
2391 goto make_route;
2392 }
0a7e2260 2393 if (!fl4->saddr) {
813b3b5d
DM
2394 if (ipv4_is_multicast(fl4->daddr))
2395 fl4->saddr = inet_select_addr(dev_out, 0,
2396 fl4->flowi4_scope);
2397 else if (!fl4->daddr)
2398 fl4->saddr = inet_select_addr(dev_out, 0,
2399 RT_SCOPE_HOST);
1da177e4
LT
2400 }
2401 }
2402
813b3b5d
DM
2403 if (!fl4->daddr) {
2404 fl4->daddr = fl4->saddr;
2405 if (!fl4->daddr)
2406 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
b40afd0e 2407 dev_out = net->loopback_dev;
1fb9489b 2408 fl4->flowi4_oif = LOOPBACK_IFINDEX;
3abd1ade 2409 res->type = RTN_LOCAL;
1da177e4
LT
2410 flags |= RTCF_LOCAL;
2411 goto make_route;
2412 }
2413
3abd1ade 2414 err = fib_lookup(net, fl4, res, 0);
0315e382 2415 if (err) {
3abd1ade
DA
2416 res->fi = NULL;
2417 res->table = NULL;
6104e112 2418 if (fl4->flowi4_oif &&
e58e4159
DA
2419 (ipv4_is_multicast(fl4->daddr) ||
2420 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
1da177e4
LT
2421 /* Apparently, routing tables are wrong. Assume,
2422 that the destination is on link.
2423
2424 WHY? DW.
2425 Because we are allowed to send to iface
2426 even if it has NO routes and NO assigned
2427 addresses. When oif is specified, routing
2428 tables are looked up with only one purpose:
2429 to catch if destination is gatewayed, rather than
2430 direct. Moreover, if MSG_DONTROUTE is set,
2431 we send packet, ignoring both routing tables
2432 and ifaddr state. --ANK
2433
2434
2435 We could make it even if oif is unknown,
2436 likely IPv6, but we do not.
2437 */
2438
813b3b5d
DM
2439 if (fl4->saddr == 0)
2440 fl4->saddr = inet_select_addr(dev_out, 0,
2441 RT_SCOPE_LINK);
3abd1ade 2442 res->type = RTN_UNICAST;
1da177e4
LT
2443 goto make_route;
2444 }
0315e382 2445 rth = ERR_PTR(err);
1da177e4
LT
2446 goto out;
2447 }
1da177e4 2448
3abd1ade 2449 if (res->type == RTN_LOCAL) {
813b3b5d 2450 if (!fl4->saddr) {
3abd1ade
DA
2451 if (res->fi->fib_prefsrc)
2452 fl4->saddr = res->fi->fib_prefsrc;
9fc3bbb4 2453 else
813b3b5d 2454 fl4->saddr = fl4->daddr;
9fc3bbb4 2455 }
5f02ce24
DA
2456
2457 /* L3 master device is the loopback for that domain */
3abd1ade 2458 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
b7c8487c 2459 net->loopback_dev;
839da4d9
DA
2460
2461 /* make sure orig_oif points to fib result device even
2462 * though packet rx/tx happens over loopback or l3mdev
2463 */
2464 orig_oif = FIB_RES_OIF(*res);
2465
813b3b5d 2466 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2467 flags |= RTCF_LOCAL;
2468 goto make_route;
2469 }
2470
3abd1ade 2471 fib_select_path(net, res, fl4, skb);
1da177e4 2472
3abd1ade 2473 dev_out = FIB_RES_DEV(*res);
813b3b5d 2474 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2475
2476
2477make_route:
3abd1ade 2478 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
1da177e4 2479
010c2708 2480out:
b23dd4fe 2481 return rth;
1da177e4 2482}
d8c97a94 2483
ae2688d5
JW
2484static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2485{
2486 return NULL;
2487}
2488
ebb762f2 2489static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 2490{
618f9bc7
SK
2491 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2492
2493 return mtu ? : dst->dev->mtu;
ec831ea7
RD
2494}
2495
6700c270
DM
2496static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2497 struct sk_buff *skb, u32 mtu)
14e50e57
DM
2498{
2499}
2500
6700c270
DM
2501static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2502 struct sk_buff *skb)
b587ee3b
DM
2503{
2504}
2505
0972ddb2
HB
2506static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2507 unsigned long old)
2508{
2509 return NULL;
2510}
2511
14e50e57
DM
2512static struct dst_ops ipv4_dst_blackhole_ops = {
2513 .family = AF_INET,
ae2688d5 2514 .check = ipv4_blackhole_dst_check,
ebb762f2 2515 .mtu = ipv4_blackhole_mtu,
214f45c9 2516 .default_advmss = ipv4_default_advmss,
14e50e57 2517 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
b587ee3b 2518 .redirect = ipv4_rt_blackhole_redirect,
0972ddb2 2519 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
d3aaeb38 2520 .neigh_lookup = ipv4_neigh_lookup,
14e50e57
DM
2521};
2522
2774c131 2523struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
14e50e57 2524{
2774c131 2525 struct rtable *ort = (struct rtable *) dst_orig;
f5b0a874 2526 struct rtable *rt;
14e50e57 2527
6c0e7284 2528 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
14e50e57 2529 if (rt) {
d8d1f30b 2530 struct dst_entry *new = &rt->dst;
14e50e57 2531
14e50e57 2532 new->__use = 1;
352e512c 2533 new->input = dst_discard;
ede2059d 2534 new->output = dst_discard_out;
14e50e57 2535
1dbe3252 2536 new->dev = net->loopback_dev;
14e50e57
DM
2537 if (new->dev)
2538 dev_hold(new->dev);
2539
9917e1e8 2540 rt->rt_is_input = ort->rt_is_input;
5e2b61f7 2541 rt->rt_iif = ort->rt_iif;
5943634f 2542 rt->rt_pmtu = ort->rt_pmtu;
a9f829f7 2543 rt->rt_mtu_locked = ort->rt_mtu_locked;
14e50e57 2544
ca4c3fc2 2545 rt->rt_genid = rt_genid_ipv4(net);
14e50e57
DM
2546 rt->rt_flags = ort->rt_flags;
2547 rt->rt_type = ort->rt_type;
14e50e57 2548 rt->rt_gateway = ort->rt_gateway;
155e8336 2549 rt->rt_uses_gateway = ort->rt_uses_gateway;
14e50e57 2550
caacf05e 2551 INIT_LIST_HEAD(&rt->rt_uncached);
14e50e57
DM
2552 }
2553
2774c131
DM
2554 dst_release(dst_orig);
2555
2556 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
14e50e57
DM
2557}
2558
9d6ec938 2559struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
6f9c9615 2560 const struct sock *sk)
1da177e4 2561{
9d6ec938 2562 struct rtable *rt = __ip_route_output_key(net, flp4);
1da177e4 2563
b23dd4fe
DM
2564 if (IS_ERR(rt))
2565 return rt;
1da177e4 2566
56157872 2567 if (flp4->flowi4_proto)
f92ee619
SK
2568 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2569 flowi4_to_flowi(flp4),
2570 sk, 0);
1da177e4 2571
b23dd4fe 2572 return rt;
1da177e4 2573}
d8c97a94
ACM
2574EXPORT_SYMBOL_GPL(ip_route_output_flow);
2575
3765d35e 2576/* called with rcu_read_lock held */
c36ba660 2577static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
15e47304 2578 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
ba52d61e 2579 u32 seq)
1da177e4 2580{
ba52d61e 2581 struct rtable *rt = skb_rtable(skb);
1da177e4 2582 struct rtmsg *r;
be403ea1 2583 struct nlmsghdr *nlh;
2bc8ca40 2584 unsigned long expires = 0;
f185071d 2585 u32 error;
521f5490 2586 u32 metrics[RTAX_MAX];
be403ea1 2587
d3166e0c 2588 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
51456b29 2589 if (!nlh)
26932566 2590 return -EMSGSIZE;
be403ea1
TG
2591
2592 r = nlmsg_data(nlh);
1da177e4
LT
2593 r->rtm_family = AF_INET;
2594 r->rtm_dst_len = 32;
2595 r->rtm_src_len = 0;
d6c0a4f6 2596 r->rtm_tos = fl4->flowi4_tos;
8a430ed5 2597 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
c36ba660 2598 if (nla_put_u32(skb, RTA_TABLE, table_id))
f3756b79 2599 goto nla_put_failure;
1da177e4
LT
2600 r->rtm_type = rt->rt_type;
2601 r->rtm_scope = RT_SCOPE_UNIVERSE;
2602 r->rtm_protocol = RTPROT_UNSPEC;
2603 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2604 if (rt->rt_flags & RTCF_NOTIFY)
2605 r->rtm_flags |= RTM_F_NOTIFY;
df4d9254
HFS
2606 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2607 r->rtm_flags |= RTCF_DOREDIRECT;
be403ea1 2608
930345ea 2609 if (nla_put_in_addr(skb, RTA_DST, dst))
f3756b79 2610 goto nla_put_failure;
1a00fee4 2611 if (src) {
1da177e4 2612 r->rtm_src_len = 32;
930345ea 2613 if (nla_put_in_addr(skb, RTA_SRC, src))
f3756b79 2614 goto nla_put_failure;
1da177e4 2615 }
f3756b79
DM
2616 if (rt->dst.dev &&
2617 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2618 goto nla_put_failure;
c7066f70 2619#ifdef CONFIG_IP_ROUTE_CLASSID
f3756b79
DM
2620 if (rt->dst.tclassid &&
2621 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2622 goto nla_put_failure;
1da177e4 2623#endif
41347dcd 2624 if (!rt_is_input_route(rt) &&
d6c0a4f6 2625 fl4->saddr != src) {
930345ea 2626 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
f3756b79
DM
2627 goto nla_put_failure;
2628 }
155e8336 2629 if (rt->rt_uses_gateway &&
930345ea 2630 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
f3756b79 2631 goto nla_put_failure;
be403ea1 2632
ee9a8f7a
SK
2633 expires = rt->dst.expires;
2634 if (expires) {
2635 unsigned long now = jiffies;
2636
2637 if (time_before(now, expires))
2638 expires -= now;
2639 else
2640 expires = 0;
2641 }
2642
521f5490 2643 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
ee9a8f7a 2644 if (rt->rt_pmtu && expires)
521f5490 2645 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
a9f829f7
SD
2646 if (rt->rt_mtu_locked && expires)
2647 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
521f5490 2648 if (rtnetlink_put_metrics(skb, metrics) < 0)
be403ea1
TG
2649 goto nla_put_failure;
2650
b4869889 2651 if (fl4->flowi4_mark &&
68aaed54 2652 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
f3756b79 2653 goto nla_put_failure;
963bfeee 2654
622ec2c9
LC
2655 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2656 nla_put_u32(skb, RTA_UID,
2657 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2658 goto nla_put_failure;
2659
d8d1f30b 2660 error = rt->dst.error;
be403ea1 2661
c7537967 2662 if (rt_is_input_route(rt)) {
8caaf7b6
ND
2663#ifdef CONFIG_IP_MROUTE
2664 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2665 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2666 int err = ipmr_get_route(net, skb,
2667 fl4->saddr, fl4->daddr,
9f09eaea 2668 r, portid);
2cf75070 2669
8caaf7b6 2670 if (err <= 0) {
0c8d803f
DA
2671 if (err == 0)
2672 return 0;
2673 goto nla_put_failure;
8caaf7b6
ND
2674 }
2675 } else
2676#endif
91146153 2677 if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
8caaf7b6 2678 goto nla_put_failure;
1da177e4
LT
2679 }
2680
f185071d 2681 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
e3703b3d 2682 goto nla_put_failure;
be403ea1 2683
053c095a
JB
2684 nlmsg_end(skb, nlh);
2685 return 0;
1da177e4 2686
be403ea1 2687nla_put_failure:
26932566
PM
2688 nlmsg_cancel(skb, nlh);
2689 return -EMSGSIZE;
1da177e4
LT
2690}
2691
c21ef3e3
DA
2692static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2693 struct netlink_ext_ack *extack)
1da177e4 2694{
3b1e0a65 2695 struct net *net = sock_net(in_skb->sk);
d889ce3b
TG
2696 struct rtmsg *rtm;
2697 struct nlattr *tb[RTA_MAX+1];
3765d35e 2698 struct fib_result res = {};
1da177e4 2699 struct rtable *rt = NULL;
d6c0a4f6 2700 struct flowi4 fl4;
9e12bb22
AV
2701 __be32 dst = 0;
2702 __be32 src = 0;
2703 u32 iif;
d889ce3b 2704 int err;
963bfeee 2705 int mark;
1da177e4 2706 struct sk_buff *skb;
c36ba660 2707 u32 table_id = RT_TABLE_MAIN;
622ec2c9 2708 kuid_t uid;
1da177e4 2709
fceb6435 2710 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
c21ef3e3 2711 extack);
d889ce3b
TG
2712 if (err < 0)
2713 goto errout;
2714
2715 rtm = nlmsg_data(nlh);
2716
1da177e4 2717 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
51456b29 2718 if (!skb) {
d889ce3b
TG
2719 err = -ENOBUFS;
2720 goto errout;
2721 }
1da177e4
LT
2722
2723 /* Reserve room for dummy headers, this skb can pass
2724 through good chunk of routing engine.
2725 */
459a98ed 2726 skb_reset_mac_header(skb);
c1d2bbe1 2727 skb_reset_network_header(skb);
d2c962b8 2728
67b61f6c
JB
2729 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2730 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
d889ce3b 2731 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
963bfeee 2732 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
622ec2c9
LC
2733 if (tb[RTA_UID])
2734 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2735 else
2736 uid = (iif ? INVALID_UID : current_uid());
1da177e4 2737
bbadb9a2
FL
2738 /* Bugfix: need to give ip_route_input enough of an IP header to
2739 * not gag.
2740 */
2741 ip_hdr(skb)->protocol = IPPROTO_UDP;
2742 ip_hdr(skb)->saddr = src;
2743 ip_hdr(skb)->daddr = dst;
2744
2745 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2746
d6c0a4f6
DM
2747 memset(&fl4, 0, sizeof(fl4));
2748 fl4.daddr = dst;
2749 fl4.saddr = src;
2750 fl4.flowi4_tos = rtm->rtm_tos;
2751 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2752 fl4.flowi4_mark = mark;
622ec2c9 2753 fl4.flowi4_uid = uid;
d6c0a4f6 2754
3765d35e
DA
2755 rcu_read_lock();
2756
1da177e4 2757 if (iif) {
d889ce3b
TG
2758 struct net_device *dev;
2759
3765d35e 2760 dev = dev_get_by_index_rcu(net, iif);
51456b29 2761 if (!dev) {
d889ce3b
TG
2762 err = -ENODEV;
2763 goto errout_free;
2764 }
2765
1da177e4
LT
2766 skb->protocol = htons(ETH_P_IP);
2767 skb->dev = dev;
963bfeee 2768 skb->mark = mark;
3765d35e
DA
2769 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2770 dev, &res);
d889ce3b 2771
511c3f92 2772 rt = skb_rtable(skb);
d8d1f30b
CG
2773 if (err == 0 && rt->dst.error)
2774 err = -rt->dst.error;
1da177e4 2775 } else {
6503a304 2776 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3765d35e 2777 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
b23dd4fe
DM
2778 err = 0;
2779 if (IS_ERR(rt))
2780 err = PTR_ERR(rt);
2c87d63a
FW
2781 else
2782 skb_dst_set(skb, &rt->dst);
1da177e4 2783 }
d889ce3b 2784
1da177e4 2785 if (err)
d889ce3b 2786 goto errout_free;
1da177e4 2787
1da177e4
LT
2788 if (rtm->rtm_flags & RTM_F_NOTIFY)
2789 rt->rt_flags |= RTCF_NOTIFY;
2790
c36ba660
DA
2791 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2792 table_id = rt->rt_table_id;
2793
bc3aae2b
RP
2794 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2795 if (!res.fi) {
2796 err = fib_props[res.type].error;
2797 if (!err)
2798 err = -EHOSTUNREACH;
2799 goto errout_free;
2800 }
b6179813
RP
2801 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2802 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2803 rt->rt_type, res.prefix, res.prefixlen,
2804 fl4.flowi4_tos, res.fi, 0);
bc3aae2b 2805 } else {
b6179813 2806 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
ba52d61e 2807 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
bc3aae2b 2808 }
7b46a644 2809 if (err < 0)
d889ce3b 2810 goto errout_free;
1da177e4 2811
3765d35e
DA
2812 rcu_read_unlock();
2813
15e47304 2814 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
d889ce3b 2815errout:
2942e900 2816 return err;
1da177e4 2817
d889ce3b 2818errout_free:
3765d35e 2819 rcu_read_unlock();
1da177e4 2820 kfree_skb(skb);
d889ce3b 2821 goto errout;
1da177e4
LT
2822}
2823
1da177e4
LT
2824void ip_rt_multicast_event(struct in_device *in_dev)
2825{
4ccfe6d4 2826 rt_cache_flush(dev_net(in_dev->dev));
1da177e4
LT
2827}
2828
2829#ifdef CONFIG_SYSCTL
082c7ca4
G
2830static int ip_rt_gc_interval __read_mostly = 60 * HZ;
2831static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
2832static int ip_rt_gc_elasticity __read_mostly = 8;
2833
fe2c6338 2834static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
8d65af78 2835 void __user *buffer,
1da177e4
LT
2836 size_t *lenp, loff_t *ppos)
2837{
5aad1de5
TT
2838 struct net *net = (struct net *)__ctl->extra1;
2839
1da177e4 2840 if (write) {
5aad1de5
TT
2841 rt_cache_flush(net);
2842 fnhe_genid_bump(net);
1da177e4 2843 return 0;
e905a9ed 2844 }
1da177e4
LT
2845
2846 return -EINVAL;
2847}
2848
fe2c6338 2849static struct ctl_table ipv4_route_table[] = {
1da177e4 2850 {
1da177e4
LT
2851 .procname = "gc_thresh",
2852 .data = &ipv4_dst_ops.gc_thresh,
2853 .maxlen = sizeof(int),
2854 .mode = 0644,
6d9f239a 2855 .proc_handler = proc_dointvec,
1da177e4
LT
2856 },
2857 {
1da177e4
LT
2858 .procname = "max_size",
2859 .data = &ip_rt_max_size,
2860 .maxlen = sizeof(int),
2861 .mode = 0644,
6d9f239a 2862 .proc_handler = proc_dointvec,
1da177e4
LT
2863 },
2864 {
2865 /* Deprecated. Use gc_min_interval_ms */
e905a9ed 2866
1da177e4
LT
2867 .procname = "gc_min_interval",
2868 .data = &ip_rt_gc_min_interval,
2869 .maxlen = sizeof(int),
2870 .mode = 0644,
6d9f239a 2871 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
2872 },
2873 {
1da177e4
LT
2874 .procname = "gc_min_interval_ms",
2875 .data = &ip_rt_gc_min_interval,
2876 .maxlen = sizeof(int),
2877 .mode = 0644,
6d9f239a 2878 .proc_handler = proc_dointvec_ms_jiffies,
1da177e4
LT
2879 },
2880 {
1da177e4
LT
2881 .procname = "gc_timeout",
2882 .data = &ip_rt_gc_timeout,
2883 .maxlen = sizeof(int),
2884 .mode = 0644,
6d9f239a 2885 .proc_handler = proc_dointvec_jiffies,
1da177e4 2886 },
9f28a2fc
ED
2887 {
2888 .procname = "gc_interval",
2889 .data = &ip_rt_gc_interval,
2890 .maxlen = sizeof(int),
2891 .mode = 0644,
2892 .proc_handler = proc_dointvec_jiffies,
2893 },
1da177e4 2894 {
1da177e4
LT
2895 .procname = "redirect_load",
2896 .data = &ip_rt_redirect_load,
2897 .maxlen = sizeof(int),
2898 .mode = 0644,
6d9f239a 2899 .proc_handler = proc_dointvec,
1da177e4
LT
2900 },
2901 {
1da177e4
LT
2902 .procname = "redirect_number",
2903 .data = &ip_rt_redirect_number,
2904 .maxlen = sizeof(int),
2905 .mode = 0644,
6d9f239a 2906 .proc_handler = proc_dointvec,
1da177e4
LT
2907 },
2908 {
1da177e4
LT
2909 .procname = "redirect_silence",
2910 .data = &ip_rt_redirect_silence,
2911 .maxlen = sizeof(int),
2912 .mode = 0644,
6d9f239a 2913 .proc_handler = proc_dointvec,
1da177e4
LT
2914 },
2915 {
1da177e4
LT
2916 .procname = "error_cost",
2917 .data = &ip_rt_error_cost,
2918 .maxlen = sizeof(int),
2919 .mode = 0644,
6d9f239a 2920 .proc_handler = proc_dointvec,
1da177e4
LT
2921 },
2922 {
1da177e4
LT
2923 .procname = "error_burst",
2924 .data = &ip_rt_error_burst,
2925 .maxlen = sizeof(int),
2926 .mode = 0644,
6d9f239a 2927 .proc_handler = proc_dointvec,
1da177e4
LT
2928 },
2929 {
1da177e4
LT
2930 .procname = "gc_elasticity",
2931 .data = &ip_rt_gc_elasticity,
2932 .maxlen = sizeof(int),
2933 .mode = 0644,
6d9f239a 2934 .proc_handler = proc_dointvec,
1da177e4
LT
2935 },
2936 {
1da177e4
LT
2937 .procname = "mtu_expires",
2938 .data = &ip_rt_mtu_expires,
2939 .maxlen = sizeof(int),
2940 .mode = 0644,
6d9f239a 2941 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
2942 },
2943 {
1da177e4
LT
2944 .procname = "min_pmtu",
2945 .data = &ip_rt_min_pmtu,
2946 .maxlen = sizeof(int),
2947 .mode = 0644,
2eda86c3
SD
2948 .proc_handler = proc_dointvec_minmax,
2949 .extra1 = &ip_min_valid_pmtu,
1da177e4
LT
2950 },
2951 {
1da177e4
LT
2952 .procname = "min_adv_mss",
2953 .data = &ip_rt_min_advmss,
2954 .maxlen = sizeof(int),
2955 .mode = 0644,
6d9f239a 2956 .proc_handler = proc_dointvec,
1da177e4 2957 },
f8572d8f 2958 { }
1da177e4 2959};
39a23e75 2960
39a23e75
DL
2961static struct ctl_table ipv4_route_flush_table[] = {
2962 {
39a23e75
DL
2963 .procname = "flush",
2964 .maxlen = sizeof(int),
2965 .mode = 0200,
6d9f239a 2966 .proc_handler = ipv4_sysctl_rtcache_flush,
39a23e75 2967 },
f8572d8f 2968 { },
39a23e75
DL
2969};
2970
2971static __net_init int sysctl_route_net_init(struct net *net)
2972{
2973 struct ctl_table *tbl;
2974
2975 tbl = ipv4_route_flush_table;
09ad9bc7 2976 if (!net_eq(net, &init_net)) {
39a23e75 2977 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
51456b29 2978 if (!tbl)
39a23e75 2979 goto err_dup;
464dc801
EB
2980
2981 /* Don't export sysctls to unprivileged users */
2982 if (net->user_ns != &init_user_ns)
2983 tbl[0].procname = NULL;
39a23e75
DL
2984 }
2985 tbl[0].extra1 = net;
2986
ec8f23ce 2987 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
51456b29 2988 if (!net->ipv4.route_hdr)
39a23e75
DL
2989 goto err_reg;
2990 return 0;
2991
2992err_reg:
2993 if (tbl != ipv4_route_flush_table)
2994 kfree(tbl);
2995err_dup:
2996 return -ENOMEM;
2997}
2998
2999static __net_exit void sysctl_route_net_exit(struct net *net)
3000{
3001 struct ctl_table *tbl;
3002
3003 tbl = net->ipv4.route_hdr->ctl_table_arg;
3004 unregister_net_sysctl_table(net->ipv4.route_hdr);
3005 BUG_ON(tbl == ipv4_route_flush_table);
3006 kfree(tbl);
3007}
3008
3009static __net_initdata struct pernet_operations sysctl_route_ops = {
3010 .init = sysctl_route_net_init,
3011 .exit = sysctl_route_net_exit,
3012};
1da177e4
LT
3013#endif
3014
3ee94372 3015static __net_init int rt_genid_init(struct net *net)
9f5e97e5 3016{
ca4c3fc2 3017 atomic_set(&net->ipv4.rt_genid, 0);
5aad1de5 3018 atomic_set(&net->fnhe_genid, 0);
7aed9f72 3019 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
9f5e97e5
DL
3020 return 0;
3021}
3022
3ee94372
NH
3023static __net_initdata struct pernet_operations rt_genid_ops = {
3024 .init = rt_genid_init,
9f5e97e5
DL
3025};
3026
c3426b47
DM
3027static int __net_init ipv4_inetpeer_init(struct net *net)
3028{
3029 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3030
3031 if (!bp)
3032 return -ENOMEM;
3033 inet_peer_base_init(bp);
3034 net->ipv4.peers = bp;
3035 return 0;
3036}
3037
3038static void __net_exit ipv4_inetpeer_exit(struct net *net)
3039{
3040 struct inet_peer_base *bp = net->ipv4.peers;
3041
3042 net->ipv4.peers = NULL;
56a6b248 3043 inetpeer_invalidate_tree(bp);
c3426b47
DM
3044 kfree(bp);
3045}
3046
3047static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3048 .init = ipv4_inetpeer_init,
3049 .exit = ipv4_inetpeer_exit,
3050};
9f5e97e5 3051
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-CPU route accounting area; ip_rt_init() allocates it with room
 * for 256 struct ip_rt_acct entries.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4 3055
1da177e4
LT
3056int __init ip_rt_init(void)
3057{
5055c371 3058 int cpu;
1da177e4 3059
73f156a6
ED
3060 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3061 if (!ip_idents)
3062 panic("IP: failed to allocate ip_idents\n");
3063
3064 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3065
355b590c
ED
3066 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3067 if (!ip_tstamps)
3068 panic("IP: failed to allocate ip_tstamps\n");
3069
5055c371
ED
3070 for_each_possible_cpu(cpu) {
3071 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3072
3073 INIT_LIST_HEAD(&ul->head);
3074 spin_lock_init(&ul->lock);
3075 }
c7066f70 3076#ifdef CONFIG_IP_ROUTE_CLASSID
0dcec8c2 3077 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
1da177e4
LT
3078 if (!ip_rt_acct)
3079 panic("IP: failed to allocate ip_rt_acct\n");
1da177e4
LT
3080#endif
3081
e5d679f3
AD
3082 ipv4_dst_ops.kmem_cachep =
3083 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
20c2df83 3084 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1da177e4 3085
14e50e57
DM
3086 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3087
fc66f95c
ED
3088 if (dst_entries_init(&ipv4_dst_ops) < 0)
3089 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3090
3091 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3092 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3093
89aef892
DM
3094 ipv4_dst_ops.gc_thresh = ~0;
3095 ip_rt_max_size = INT_MAX;
1da177e4 3096
1da177e4
LT
3097 devinet_init();
3098 ip_fib_init();
3099
73b38711 3100 if (ip_rt_proc_init())
058bd4d2 3101 pr_err("Unable to create route proc files\n");
1da177e4
LT
3102#ifdef CONFIG_XFRM
3103 xfrm_init();
703fb94e 3104 xfrm4_init();
1da177e4 3105#endif
394f51ab
FW
3106 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3107 RTNL_FLAG_DOIT_UNLOCKED);
63f3444f 3108
39a23e75
DL
3109#ifdef CONFIG_SYSCTL
3110 register_pernet_subsys(&sysctl_route_ops);
3111#endif
3ee94372 3112 register_pernet_subsys(&rt_genid_ops);
c3426b47 3113 register_pernet_subsys(&ipv4_inetpeer_ops);
1bcdca3f 3114 return 0;
1da177e4
LT
3115}
3116
a1bc6eb4 3117#ifdef CONFIG_SYSCTL
eeb61f71
AV
3118/*
3119 * We really need to sanitize the damn ipv4 init order, then all
3120 * this nonsense will go away.
3121 */
3122void __init ip_static_sysctl_init(void)
3123{
4e5ca785 3124 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
eeb61f71 3125}
a1bc6eb4 3126#endif