net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/skbuff.h>
  83 #include <linux/inetdevice.h>
  84 #include <linux/igmp.h>
  85 #include <linux/pkt_sched.h>
  86 #include <linux/mroute.h>
  87 #include <linux/netfilter_ipv4.h>
  88 #include <linux/random.h>
  89 #include <linux/rcupdate.h>
  90 #include <linux/times.h>
  91 #include <linux/slab.h>
  92 #include <linux/jhash.h>
  93 #include <net/dst.h>
  94 #include <net/dst_metadata.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/lwtunnel.h>
 107 #include <net/netevent.h>
 108 #include <net/rtnetlink.h>
 109 #ifdef CONFIG_SYSCTL
 110 #include <linux/sysctl.h>
 111 #include <linux/kmemleak.h>
 112 #endif
 113 #include <net/secure_seq.h>
 114 #include <net/ip_tunnels.h>
 115 #include <net/l3mdev.h>
 116
 117 #include "fib_lookup.h"
 118
 119 #define RT_FL_TOS(oldflp4) \
 120         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 121
 122 #define RT_GC_TIMEOUT (300*HZ)
 123
 124 static int ip_rt_max_size;
 125 static int ip_rt_redirect_number __read_mostly  = 9;
 126 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 128 static int ip_rt_error_cost __read_mostly       = HZ;
 129 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 130 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 131 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 132 static int ip_rt_min_advmss __read_mostly       = 256;
 133
 134 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 135 /*
 136  *      Interface to generic destination cache.
 137  */
 138
 139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 140 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 141 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 142 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 143 static void              ipv4_link_failure(struct sk_buff *skb);
 144 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 145                                            struct sk_buff *skb, u32 mtu);
 146 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 147                                         struct sk_buff *skb);
 148 static void             ipv4_dst_destroy(struct dst_entry *dst);
 149
 150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 151 {
 152         WARN_ON(1);
 153         return NULL;
 154 }
 155
 156 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 157                                            struct sk_buff *skb,
 158                                            const void *daddr);
 159 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 160
 161 static struct dst_ops ipv4_dst_ops = {
 162         .family =               AF_INET,
 163         .check =                ipv4_dst_check,
 164         .default_advmss =       ipv4_default_advmss,
 165         .mtu =                  ipv4_mtu,
 166         .cow_metrics =          ipv4_cow_metrics,
 167         .destroy =              ipv4_dst_destroy,
 168         .negative_advice =      ipv4_negative_advice,
 169         .link_failure =         ipv4_link_failure,
 170         .update_pmtu =          ip_rt_update_pmtu,
 171         .redirect =             ip_do_redirect,
 172         .local_out =            __ip_local_out,
 173         .neigh_lookup =         ipv4_neigh_lookup,
 174         .confirm_neigh =        ipv4_confirm_neigh,
 175 };
 176
 177 #define ECN_OR_COST(class)      TC_PRIO_##class
 178
 179 const __u8 ip_tos2prio[16] = {
 180         TC_PRIO_BESTEFFORT,
 181         ECN_OR_COST(BESTEFFORT),
 182         TC_PRIO_BESTEFFORT,
 183         ECN_OR_COST(BESTEFFORT),
 184         TC_PRIO_BULK,
 185         ECN_OR_COST(BULK),
 186         TC_PRIO_BULK,
 187         ECN_OR_COST(BULK),
 188         TC_PRIO_INTERACTIVE,
 189         ECN_OR_COST(INTERACTIVE),
 190         TC_PRIO_INTERACTIVE,
 191         ECN_OR_COST(INTERACTIVE),
 192         TC_PRIO_INTERACTIVE_BULK,
 193         ECN_OR_COST(INTERACTIVE_BULK),
 194         TC_PRIO_INTERACTIVE_BULK,
 195         ECN_OR_COST(INTERACTIVE_BULK)
 196 };
 197 EXPORT_SYMBOL(ip_tos2prio);
 198
 199 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 200 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 201
 202 #ifdef CONFIG_PROC_FS
 203 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 204 {
 205         if (*pos)
 206                 return NULL;
 207         return SEQ_START_TOKEN;
 208 }
 209
 210 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 211 {
 212         ++*pos;
 213         return NULL;
 214 }
 215
 216 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 217 {
 218 }
 219
 220 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 221 {
 222         if (v == SEQ_START_TOKEN)
 223                 seq_printf(seq, "%-127s\n",
 224                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 225                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 226                            "HHUptod\tSpecDst");
 227         return 0;
 228 }
 229
 230 static const struct seq_operations rt_cache_seq_ops = {
 231         .start  = rt_cache_seq_start,
 232         .next   = rt_cache_seq_next,
 233         .stop   = rt_cache_seq_stop,
 234         .show   = rt_cache_seq_show,
 235 };
 236
 237 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 238 {
 239         return seq_open(file, &rt_cache_seq_ops);
 240 }
 241
 242 static const struct file_operations rt_cache_seq_fops = {
 243         .owner   = THIS_MODULE,
 244         .open    = rt_cache_seq_open,
 245         .read    = seq_read,
 246         .llseek  = seq_lseek,
 247         .release = seq_release,
 248 };
 249
 250
 251 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 252 {
 253         int cpu;
 254
 255         if (*pos == 0)
 256                 return SEQ_START_TOKEN;
 257
 258         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 259                 if (!cpu_possible(cpu))
 260                         continue;
 261                 *pos = cpu+1;
 262                 return &per_cpu(rt_cache_stat, cpu);
 263         }
 264         return NULL;
 265 }
 266
 267 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 268 {
 269         int cpu;
 270
 271         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 272                 if (!cpu_possible(cpu))
 273                         continue;
 274                 *pos = cpu+1;
 275                 return &per_cpu(rt_cache_stat, cpu);
 276         }
 277         return NULL;
 278
 279 }
 280
 281 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 282 {
 283
 284 }
 285
 286 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 287 {
 288         struct rt_cache_stat *st = v;
 289
 290         if (v == SEQ_START_TOKEN) {
 291                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 292                 return 0;
 293         }
 294
 295         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 296                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 297                    dst_entries_get_slow(&ipv4_dst_ops),
 298                    0, /* st->in_hit */
 299                    st->in_slow_tot,
 300                    st->in_slow_mc,
 301                    st->in_no_route,
 302                    st->in_brd,
 303                    st->in_martian_dst,
 304                    st->in_martian_src,
 305
 306                    0, /* st->out_hit */
 307                    st->out_slow_tot,
 308                    st->out_slow_mc,
 309
 310                    0, /* st->gc_total */
 311                    0, /* st->gc_ignored */
 312                    0, /* st->gc_goal_miss */
 313                    0, /* st->gc_dst_overflow */
 314                    0, /* st->in_hlist_search */
 315                    0  /* st->out_hlist_search */
 316                 );
 317         return 0;
 318 }
 319
 320 static const struct seq_operations rt_cpu_seq_ops = {
 321         .start  = rt_cpu_seq_start,
 322         .next   = rt_cpu_seq_next,
 323         .stop   = rt_cpu_seq_stop,
 324         .show   = rt_cpu_seq_show,
 325 };
 326
 327
 328 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 329 {
 330         return seq_open(file, &rt_cpu_seq_ops);
 331 }
 332
 333 static const struct file_operations rt_cpu_seq_fops = {
 334         .owner   = THIS_MODULE,
 335         .open    = rt_cpu_seq_open,
 336         .read    = seq_read,
 337         .llseek  = seq_lseek,
 338         .release = seq_release,
 339 };
 340
 341 #ifdef CONFIG_IP_ROUTE_CLASSID
 342 static int rt_acct_proc_show(struct seq_file *m, void *v)
 343 {
 344         struct ip_rt_acct *dst, *src;
 345         unsigned int i, j;
 346
 347         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 348         if (!dst)
 349                 return -ENOMEM;
 350
 351         for_each_possible_cpu(i) {
 352                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 353                 for (j = 0; j < 256; j++) {
 354                         dst[j].o_bytes   += src[j].o_bytes;
 355                         dst[j].o_packets += src[j].o_packets;
 356                         dst[j].i_bytes   += src[j].i_bytes;
 357                         dst[j].i_packets += src[j].i_packets;
 358                 }
 359         }
 360
 361         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 362         kfree(dst);
 363         return 0;
 364 }
 365
 366 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 367 {
 368         return single_open(file, rt_acct_proc_show, NULL);
 369 }
 370
 371 static const struct file_operations rt_acct_proc_fops = {
 372         .owner          = THIS_MODULE,
 373         .open           = rt_acct_proc_open,
 374         .read           = seq_read,
 375         .llseek         = seq_lseek,
 376         .release        = single_release,
 377 };
 378 #endif
 379
 380 static int __net_init ip_rt_do_proc_init(struct net *net)
 381 {
 382         struct proc_dir_entry *pde;
 383
 384         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 385                           &rt_cache_seq_fops);
 386         if (!pde)
 387                 goto err1;
 388
 389         pde = proc_create("rt_cache", S_IRUGO,
 390                           net->proc_net_stat, &rt_cpu_seq_fops);
 391         if (!pde)
 392                 goto err2;
 393
 394 #ifdef CONFIG_IP_ROUTE_CLASSID
 395         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 396         if (!pde)
 397                 goto err3;
 398 #endif
 399         return 0;
 400
 401 #ifdef CONFIG_IP_ROUTE_CLASSID
 402 err3:
 403         remove_proc_entry("rt_cache", net->proc_net_stat);
 404 #endif
 405 err2:
 406         remove_proc_entry("rt_cache", net->proc_net);
 407 err1:
 408         return -ENOMEM;
 409 }
 410
 411 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 412 {
 413         remove_proc_entry("rt_cache", net->proc_net_stat);
 414         remove_proc_entry("rt_cache", net->proc_net);
 415 #ifdef CONFIG_IP_ROUTE_CLASSID
 416         remove_proc_entry("rt_acct", net->proc_net);
 417 #endif
 418 }
 419
 420 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 421         .init = ip_rt_do_proc_init,
 422         .exit = ip_rt_do_proc_exit,
 423 };
 424
 425 static int __init ip_rt_proc_init(void)
 426 {
 427         return register_pernet_subsys(&ip_rt_proc_ops);
 428 }
 429
 430 #else
 431 static inline int ip_rt_proc_init(void)
 432 {
 433         return 0;
 434 }
 435 #endif /* CONFIG_PROC_FS */
 436
 437 static inline bool rt_is_expired(const struct rtable *rth)
 438 {
 439         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 440 }
 441
 442 void rt_cache_flush(struct net *net)
 443 {
 444         rt_genid_bump_ipv4(net);
 445 }
 446
 447 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 448                                            struct sk_buff *skb,
 449                                            const void *daddr)
 450 {
 451         struct net_device *dev = dst->dev;
 452         const __be32 *pkey = daddr;
 453         const struct rtable *rt;
 454         struct neighbour *n;
 455
 456         rt = (const struct rtable *) dst;
 457         if (rt->rt_gateway)
 458                 pkey = (const __be32 *) &rt->rt_gateway;
 459         else if (skb)
 460                 pkey = &ip_hdr(skb)->daddr;
 461
 462         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 463         if (n)
 464                 return n;
 465         return neigh_create(&arp_tbl, pkey, dev);
 466 }
 467
 468 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 469 {
 470         struct net_device *dev = dst->dev;
 471         const __be32 *pkey = daddr;
 472         const struct rtable *rt;
 473
 474         rt = (const struct rtable *)dst;
 475         if (rt->rt_gateway)
 476                 pkey = (const __be32 *)&rt->rt_gateway;
 477         else if (!daddr ||
 478                  (rt->rt_flags &
 479                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
 480                 return;
 481
 482         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 483 }
 484
 485 #define IP_IDENTS_SZ 2048u
 486
 487 static atomic_t *ip_idents __read_mostly;
 488 static u32 *ip_tstamps __read_mostly;
 489
 490 /* In order to protect privacy, we add a perturbation to identifiers
 491  * if one generator is seldom used. This makes hard for an attacker
 492  * to infer how many packets were sent between two points in time.
 493  */
 494 u32 ip_idents_reserve(u32 hash, int segs)
 495 {
 496         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 497         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 498         u32 old = ACCESS_ONCE(*p_tstamp);
 499         u32 now = (u32)jiffies;
 500         u32 new, delta = 0;
 501
 502         if (old != now && cmpxchg(p_tstamp, old, now) == old)
 503                 delta = prandom_u32_max(now - old);
 504
 505         /* Do not use atomic_add_return() as it makes UBSAN unhappy */
 506         do {
 507                 old = (u32)atomic_read(p_id);
 508                 new = old + delta + segs;
 509         } while (atomic_cmpxchg(p_id, old, new) != old);
 510
 511         return new - segs;
 512 }
 513 EXPORT_SYMBOL(ip_idents_reserve);
 514
 515 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 516 {
 517         static u32 ip_idents_hashrnd __read_mostly;
 518         u32 hash, id;
 519
 520         net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
 521
 522         hash = jhash_3words((__force u32)iph->daddr,
 523                             (__force u32)iph->saddr,
 524                             iph->protocol ^ net_hash_mix(net),
 525                             ip_idents_hashrnd);
 526         id = ip_idents_reserve(hash, segs);
 527         iph->id = htons(id);
 528 }
 529 EXPORT_SYMBOL(__ip_select_ident);
 530
 531 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 532                              const struct sock *sk,
 533                              const struct iphdr *iph,
 534                              int oif, u8 tos,
 535                              u8 prot, u32 mark, int flow_flags)
 536 {
 537         if (sk) {
 538                 const struct inet_sock *inet = inet_sk(sk);
 539
 540                 oif = sk->sk_bound_dev_if;
 541                 mark = sk->sk_mark;
 542                 tos = RT_CONN_FLAGS(sk);
 543                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 544         }
 545         flowi4_init_output(fl4, oif, mark, tos,
 546                            RT_SCOPE_UNIVERSE, prot,
 547                            flow_flags,
 548                            iph->daddr, iph->saddr, 0, 0,
 549                            sock_net_uid(net, sk));
 550 }
 551
 552 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 553                                const struct sock *sk)
 554 {
 555         const struct net *net = dev_net(skb->dev);
 556         const struct iphdr *iph = ip_hdr(skb);
 557         int oif = skb->dev->ifindex;
 558         u8 tos = RT_TOS(iph->tos);
 559         u8 prot = iph->protocol;
 560         u32 mark = skb->mark;
 561
 562         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 563 }
 564
 565 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 566 {
 567         const struct inet_sock *inet = inet_sk(sk);
 568         const struct ip_options_rcu *inet_opt;
 569         __be32 daddr = inet->inet_daddr;
 570
 571         rcu_read_lock();
 572         inet_opt = rcu_dereference(inet->inet_opt);
 573         if (inet_opt && inet_opt->opt.srr)
 574                 daddr = inet_opt->opt.faddr;
 575         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 576                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 577                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 578                            inet_sk_flowi_flags(sk),
 579                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 580         rcu_read_unlock();
 581 }
 582
 583 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 584                                  const struct sk_buff *skb)
 585 {
 586         if (skb)
 587                 build_skb_flow_key(fl4, skb, sk);
 588         else
 589                 build_sk_flow_key(fl4, sk);
 590 }
 591
 592 static DEFINE_SPINLOCK(fnhe_lock);
 593
 594 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 595 {
 596         struct rtable *rt;
 597
 598         rt = rcu_dereference(fnhe->fnhe_rth_input);
 599         if (rt) {
 600                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 601                 dst_dev_put(&rt->dst);
 602                 dst_release(&rt->dst);
 603         }
 604         rt = rcu_dereference(fnhe->fnhe_rth_output);
 605         if (rt) {
 606                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 607                 dst_dev_put(&rt->dst);
 608                 dst_release(&rt->dst);
 609         }
 610 }
 611
 612 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 613 {
 614         struct fib_nh_exception *fnhe, *oldest;
 615
 616         oldest = rcu_dereference(hash->chain);
 617         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 618              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 619                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 620                         oldest = fnhe;
 621         }
 622         fnhe_flush_routes(oldest);
 623         return oldest;
 624 }
 625
 626 static inline u32 fnhe_hashfun(__be32 daddr)
 627 {
 628         static u32 fnhe_hashrnd __read_mostly;
 629         u32 hval;
 630
 631         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 632         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 633         return hash_32(hval, FNHE_HASH_SHIFT);
 634 }
 635
 636 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 637 {
 638         rt->rt_pmtu = fnhe->fnhe_pmtu;
 639         rt->dst.expires = fnhe->fnhe_expires;
 640
 641         if (fnhe->fnhe_gw) {
 642                 rt->rt_flags |= RTCF_REDIRECTED;
 643                 rt->rt_gateway = fnhe->fnhe_gw;
 644                 rt->rt_uses_gateway = 1;
 645         }
 646 }
 647
 648 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 649                                   u32 pmtu, unsigned long expires)
 650 {
 651         struct fnhe_hash_bucket *hash;
 652         struct fib_nh_exception *fnhe;
 653         struct rtable *rt;
 654         unsigned int i;
 655         int depth;
 656         u32 hval = fnhe_hashfun(daddr);
 657
 658         spin_lock_bh(&fnhe_lock);
 659
 660         hash = rcu_dereference(nh->nh_exceptions);
 661         if (!hash) {
 662                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 663                 if (!hash)
 664                         goto out_unlock;
 665                 rcu_assign_pointer(nh->nh_exceptions, hash);
 666         }
 667
 668         hash += hval;
 669
 670         depth = 0;
 671         for (fnhe = rcu_dereference(hash->chain); fnhe;
 672              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 673                 if (fnhe->fnhe_daddr == daddr)
 674                         break;
 675                 depth++;
 676         }
 677
 678         if (fnhe) {
 679                 if (gw)
 680                         fnhe->fnhe_gw = gw;
 681                 if (pmtu) {
 682                         fnhe->fnhe_pmtu = pmtu;
 683                         fnhe->fnhe_expires = max(1UL, expires);
 684                 }
 685                 /* Update all cached dsts too */
 686                 rt = rcu_dereference(fnhe->fnhe_rth_input);
 687                 if (rt)
 688                         fill_route_from_fnhe(rt, fnhe);
 689                 rt = rcu_dereference(fnhe->fnhe_rth_output);
 690                 if (rt)
 691                         fill_route_from_fnhe(rt, fnhe);
 692         } else {
 693                 if (depth > FNHE_RECLAIM_DEPTH)
 694                         fnhe = fnhe_oldest(hash);
 695                 else {
 696                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 697                         if (!fnhe)
 698                                 goto out_unlock;
 699
 700                         fnhe->fnhe_next = hash->chain;
 701                         rcu_assign_pointer(hash->chain, fnhe);
 702                 }
 703                 fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
 704                 fnhe->fnhe_daddr = daddr;
 705                 fnhe->fnhe_gw = gw;
 706                 fnhe->fnhe_pmtu = pmtu;
 707                 fnhe->fnhe_expires = expires;
 708
 709                 /* Exception created; mark the cached routes for the nexthop
 710                  * stale, so anyone caching it rechecks if this exception
 711                  * applies to them.
 712                  */
 713                 rt = rcu_dereference(nh->nh_rth_input);
 714                 if (rt)
 715                         rt->dst.obsolete = DST_OBSOLETE_KILL;
 716
 717                 for_each_possible_cpu(i) {
 718                         struct rtable __rcu **prt;
 719                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 720                         rt = rcu_dereference(*prt);
 721                         if (rt)
 722                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 723                 }
 724         }
 725
 726         fnhe->fnhe_stamp = jiffies;
 727
 728 out_unlock:
 729         spin_unlock_bh(&fnhe_lock);
 730 }
 731
 732 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 733                              bool kill_route)
 734 {
 735         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 736         __be32 old_gw = ip_hdr(skb)->saddr;
 737         struct net_device *dev = skb->dev;
 738         struct in_device *in_dev;
 739         struct fib_result res;
 740         struct neighbour *n;
 741         struct net *net;
 742
 743         switch (icmp_hdr(skb)->code & 7) {
 744         case ICMP_REDIR_NET:
 745         case ICMP_REDIR_NETTOS:
 746         case ICMP_REDIR_HOST:
 747         case ICMP_REDIR_HOSTTOS:
 748                 break;
 749
 750         default:
 751                 return;
 752         }
 753
 754         if (rt->rt_gateway != old_gw)
 755                 return;
 756
 757         in_dev = __in_dev_get_rcu(dev);
 758         if (!in_dev)
 759                 return;
 760
 761         net = dev_net(dev);
 762         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 763             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 764             ipv4_is_zeronet(new_gw))
 765                 goto reject_redirect;
 766
 767         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 768                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 769                         goto reject_redirect;
 770                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 771                         goto reject_redirect;
 772         } else {
 773                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 774                         goto reject_redirect;
 775         }
 776
 777         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 778         if (!n)
 779                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 780         if (!IS_ERR(n)) {
 781                 if (!(n->nud_state & NUD_VALID)) {
 782                         neigh_event_send(n, NULL);
 783                 } else {
 784                         if (fib_lookup(net, fl4, &res, 0) == 0) {
 785                                 struct fib_nh *nh = &FIB_RES_NH(res);
 786
 787                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 788                                                 0, jiffies + ip_rt_gc_timeout);
 789                         }
 790                         if (kill_route)
 791                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 792                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 793                 }
 794                 neigh_release(n);
 795         }
 796         return;
 797
 798 reject_redirect:
 799 #ifdef CONFIG_IP_ROUTE_VERBOSE
 800         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 801                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 802                 __be32 daddr = iph->daddr;
 803                 __be32 saddr = iph->saddr;
 804
 805                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 806                                      "  Advised path = %pI4 -> %pI4\n",
 807                                      &old_gw, dev->name, &new_gw,
 808                                      &saddr, &daddr);
 809         }
 810 #endif
 811         ;
 812 }
 813
 814 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 815 {
 816         struct rtable *rt;
 817         struct flowi4 fl4;
 818         const struct iphdr *iph = (const struct iphdr *) skb->data;
 819         struct net *net = dev_net(skb->dev);
 820         int oif = skb->dev->ifindex;
 821         u8 tos = RT_TOS(iph->tos);
 822         u8 prot = iph->protocol;
 823         u32 mark = skb->mark;
 824
 825         rt = (struct rtable *) dst;
 826
 827         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 828         __ip_do_redirect(rt, skb, &fl4, true);
 829 }
 830
 831 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 832 {
 833         struct rtable *rt = (struct rtable *)dst;
 834         struct dst_entry *ret = dst;
 835
 836         if (rt) {
 837                 if (dst->obsolete > 0) {
 838                         ip_rt_put(rt);
 839                         ret = NULL;
 840                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 841                            rt->dst.expires) {
 842                         ip_rt_put(rt);
 843                         ret = NULL;
 844                 }
 845         }
 846         return ret;
 847 }
 848
 849 /*
 850  * Algorithm:
 851  *      1. The first ip_rt_redirect_number redirects are sent
 852  *         with exponential backoff, then we stop sending them at all,
 853  *         assuming that the host ignores our redirects.
 854  *      2. If we did not see packets requiring redirects
 855  *         during ip_rt_redirect_silence, we assume that the host
 856  *         forgot redirected route and start to send redirects again.
 857  *
 858  * This algorithm is much cheaper and more intelligent than dumb load limiting
 859  * in icmp.c.
 860  *
 861  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 862  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 863  */
 864
 865 void ip_rt_send_redirect(struct sk_buff *skb)
 866 {
 867         struct rtable *rt = skb_rtable(skb);
 868         struct in_device *in_dev;
 869         struct inet_peer *peer;
 870         struct net *net;
 871         int log_martians;
 872         int vif;
 873
 874         rcu_read_lock();
 875         in_dev = __in_dev_get_rcu(rt->dst.dev);
 876         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 877                 rcu_read_unlock();
 878                 return;
 879         }
 880         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 881         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 882         rcu_read_unlock();
 883
 884         net = dev_net(rt->dst.dev);
 885         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 886         if (!peer) {
 887                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 888                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 889                 return;
 890         }
 891
 892         /* No redirected packets during ip_rt_redirect_silence;
 893          * reset the algorithm.
 894          */
 895         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 896                 peer->rate_tokens = 0;
 897
 898         /* Too many ignored redirects; do not send anything
 899          * set dst.rate_last to the last seen redirected packet.
 900          */
 901         if (peer->rate_tokens >= ip_rt_redirect_number) {
 902                 peer->rate_last = jiffies;
 903                 goto out_put_peer;
 904         }
 905
 906         /* Check for load limit; set rate_last to the latest sent
 907          * redirect.
 908          */
 909         if (peer->rate_tokens == 0 ||
 910             time_after(jiffies,
 911                        (peer->rate_last +
 912                         (ip_rt_redirect_load << peer->rate_tokens)))) {
 913                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 914
 915                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 916                 peer->rate_last = jiffies;
 917                 ++peer->rate_tokens;
 918 #ifdef CONFIG_IP_ROUTE_VERBOSE
 919                 if (log_martians &&
 920                     peer->rate_tokens == ip_rt_redirect_number)
 921                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 922                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 923                                              &ip_hdr(skb)->daddr, &gw);
 924 #endif
 925         }
 926 out_put_peer:
 927         inet_putpeer(peer);
 928 }
 929
 930 static int ip_error(struct sk_buff *skb)
 931 {
 932         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 933         struct rtable *rt = skb_rtable(skb);
 934         struct inet_peer *peer;
 935         unsigned long now;
 936         struct net *net;
 937         bool send;
 938         int code;
 939
 940         /* IP on this device is disabled. */
 941         if (!in_dev)
 942                 goto out;
 943
 944         net = dev_net(rt->dst.dev);
 945         if (!IN_DEV_FORWARD(in_dev)) {
 946                 switch (rt->dst.error) {
 947                 case EHOSTUNREACH:
 948                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 949                         break;
 950
 951                 case ENETUNREACH:
 952                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 953                         break;
 954                 }
 955                 goto out;
 956         }
 957
 958         switch (rt->dst.error) {
 959         case EINVAL:
 960         default:
 961                 goto out;
 962         case EHOSTUNREACH:
 963                 code = ICMP_HOST_UNREACH;
 964                 break;
 965         case ENETUNREACH:
 966                 code = ICMP_NET_UNREACH;
 967                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 968                 break;
 969         case EACCES:
 970                 code = ICMP_PKT_FILTERED;
 971                 break;
 972         }
 973
 974         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 975                                l3mdev_master_ifindex(skb->dev), 1);
 976
 977         send = true;
 978         if (peer) {
 979                 now = jiffies;
 980                 peer->rate_tokens += now - peer->rate_last;
 981                 if (peer->rate_tokens > ip_rt_error_burst)
 982                         peer->rate_tokens = ip_rt_error_burst;
 983                 peer->rate_last = now;
 984                 if (peer->rate_tokens >= ip_rt_error_cost)
 985                         peer->rate_tokens -= ip_rt_error_cost;
 986                 else
 987                         send = false;
 988                 inet_putpeer(peer);
 989         }
 990         if (send)
 991                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 992
 993 out:    kfree_skb(skb);
 994         return 0;
 995 }
 996
 997 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 998 {
 999         struct dst_entry *dst = &rt->dst;
1000         struct fib_result res;
1001
1002         if (dst_metric_locked(dst, RTAX_MTU))
1003                 return;
1004
1005         if (ipv4_mtu(dst) < mtu)
1006                 return;
1007
1008         if (mtu < ip_rt_min_pmtu)
1009                 mtu = ip_rt_min_pmtu;
1010
1011         if (rt->rt_pmtu == mtu &&
1012             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1013                 return;
1014
1015         rcu_read_lock();
1016         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1017                 struct fib_nh *nh = &FIB_RES_NH(res);
1018
1019                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
1020                                       jiffies + ip_rt_mtu_expires);
1021         }
1022         rcu_read_unlock();
1023 }
1024
1025 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1026                               struct sk_buff *skb, u32 mtu)
1027 {
1028         struct rtable *rt = (struct rtable *) dst;
1029         struct flowi4 fl4;
1030
1031         ip_rt_build_flow_key(&fl4, sk, skb);
1032         __ip_rt_update_pmtu(rt, &fl4, mtu);
1033 }
1034
1035 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1036                       int oif, u32 mark, u8 protocol, int flow_flags)
1037 {
1038         const struct iphdr *iph = (const struct iphdr *) skb->data;
1039         struct flowi4 fl4;
1040         struct rtable *rt;
1041
1042         if (!mark)
1043                 mark = IP4_REPLY_MARK(net, skb->mark);
1044
1045         __build_flow_key(net, &fl4, NULL, iph, oif,
1046                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1047         rt = __ip_route_output_key(net, &fl4);
1048         if (!IS_ERR(rt)) {
1049                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1050                 ip_rt_put(rt);
1051         }
1052 }
1053 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1054
1055 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1056 {
1057         const struct iphdr *iph = (const struct iphdr *) skb->data;
1058         struct flowi4 fl4;
1059         struct rtable *rt;
1060
1061         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1062
1063         if (!fl4.flowi4_mark)
1064                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1065
1066         rt = __ip_route_output_key(sock_net(sk), &fl4);
1067         if (!IS_ERR(rt)) {
1068                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1069                 ip_rt_put(rt);
1070         }
1071 }
1072
1073 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1074 {
1075         const struct iphdr *iph = (const struct iphdr *) skb->data;
1076         struct flowi4 fl4;
1077         struct rtable *rt;
1078         struct dst_entry *odst = NULL;
1079         bool new = false;
1080         struct net *net = sock_net(sk);
1081
1082         bh_lock_sock(sk);
1083
1084         if (!ip_sk_accept_pmtu(sk))
1085                 goto out;
1086
1087         odst = sk_dst_get(sk);
1088
1089         if (sock_owned_by_user(sk) || !odst) {
1090                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1091                 goto out;
1092         }
1093
1094         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1095
1096         rt = (struct rtable *)odst;
1097         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1098                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1099                 if (IS_ERR(rt))
1100                         goto out;
1101
1102                 new = true;
1103         }
1104
1105         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1106
1107         if (!dst_check(&rt->dst, 0)) {
1108                 if (new)
1109                         dst_release(&rt->dst);
1110
1111                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1112                 if (IS_ERR(rt))
1113                         goto out;
1114
1115                 new = true;
1116         }
1117
1118         if (new)
1119                 sk_dst_set(sk, &rt->dst);
1120
1121 out:
1122         bh_unlock_sock(sk);
1123         dst_release(odst);
1124 }
1125 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1126
1127 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1128                    int oif, u32 mark, u8 protocol, int flow_flags)
1129 {
1130         const struct iphdr *iph = (const struct iphdr *) skb->data;
1131         struct flowi4 fl4;
1132         struct rtable *rt;
1133
1134         __build_flow_key(net, &fl4, NULL, iph, oif,
1135                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1136         rt = __ip_route_output_key(net, &fl4);
1137         if (!IS_ERR(rt)) {
1138                 __ip_do_redirect(rt, skb, &fl4, false);
1139                 ip_rt_put(rt);
1140         }
1141 }
1142 EXPORT_SYMBOL_GPL(ipv4_redirect);
1143
1144 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1145 {
1146         const struct iphdr *iph = (const struct iphdr *) skb->data;
1147         struct flowi4 fl4;
1148         struct rtable *rt;
1149         struct net *net = sock_net(sk);
1150
1151         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1152         rt = __ip_route_output_key(net, &fl4);
1153         if (!IS_ERR(rt)) {
1154                 __ip_do_redirect(rt, skb, &fl4, false);
1155                 ip_rt_put(rt);
1156         }
1157 }
1158 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1159
1160 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1161 {
1162         struct rtable *rt = (struct rtable *) dst;
1163
1164         /* All IPV4 dsts are created with ->obsolete set to the value
1165          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1166          * into this function always.
1167          *
1168          * When a PMTU/redirect information update invalidates a route,
1169          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1170          * DST_OBSOLETE_DEAD by dst_free().
1171          */
1172         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1173                 return NULL;
1174         return dst;
1175 }
1176
1177 static void ipv4_link_failure(struct sk_buff *skb)
1178 {
1179         struct rtable *rt;
1180
1181         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1182
1183         rt = skb_rtable(skb);
1184         if (rt)
1185                 dst_set_expires(&rt->dst, 0);
1186 }
1187
1188 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1189 {
1190         pr_debug("%s: %pI4 -> %pI4, %s\n",
1191                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1192                  skb->dev ? skb->dev->name : "?");
1193         kfree_skb(skb);
1194         WARN_ON(1);
1195         return 0;
1196 }
1197
 1198 /*
 1199    We do not cache the source address of the outgoing interface,
 1200    because it is used only by the IP RR, TS and SRR options,
 1201    so it is kept out of the fast path.
 1202
 1203    BTW remember: "addr" is allowed to be unaligned
 1204    in IP options!
 1205  */
1206
1207 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1208 {
1209         __be32 src;
1210
1211         if (rt_is_output_route(rt))
1212                 src = ip_hdr(skb)->saddr;
1213         else {
1214                 struct fib_result res;
1215                 struct flowi4 fl4;
1216                 struct iphdr *iph;
1217
1218                 iph = ip_hdr(skb);
1219
1220                 memset(&fl4, 0, sizeof(fl4));
1221                 fl4.daddr = iph->daddr;
1222                 fl4.saddr = iph->saddr;
1223                 fl4.flowi4_tos = RT_TOS(iph->tos);
1224                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1225                 fl4.flowi4_iif = skb->dev->ifindex;
1226                 fl4.flowi4_mark = skb->mark;
1227
1228                 rcu_read_lock();
1229                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1230                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1231                 else
1232                         src = inet_select_addr(rt->dst.dev,
1233                                                rt_nexthop(rt, iph->daddr),
1234                                                RT_SCOPE_UNIVERSE);
1235                 rcu_read_unlock();
1236         }
1237         memcpy(addr, &src, 4);
1238 }
1239
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid.  The low and the high 16-bit halves
 * are handled independently: a half is taken from @tag only while that half
 * of the route's tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->dst.tclassid & low_half) == 0)
		rt->dst.tclassid |= tag & low_half;
	if ((rt->dst.tclassid & high_half) == 0)
		rt->dst.tclassid |= tag & high_half;
}
#endif
1249
1250 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1251 {
1252         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1253         unsigned int advmss = max_t(unsigned int, dst->dev->mtu - header_size,
1254                                     ip_rt_min_advmss);
1255
1256         return min(advmss, IPV4_MAX_PMTU - header_size);
1257 }
1258
1259 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1260 {
1261         const struct rtable *rt = (const struct rtable *) dst;
1262         unsigned int mtu = rt->rt_pmtu;
1263
1264         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1265                 mtu = dst_metric_raw(dst, RTAX_MTU);
1266
1267         if (mtu)
1268                 return mtu;
1269
1270         mtu = READ_ONCE(dst->dev->mtu);
1271
1272         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1273                 if (rt->rt_uses_gateway && mtu > 576)
1274                         mtu = 576;
1275         }
1276
1277         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1278
1279         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1280 }
1281
1282 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1283 {
1284         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1285         struct fib_nh_exception *fnhe;
1286         u32 hval;
1287
1288         if (!hash)
1289                 return NULL;
1290
1291         hval = fnhe_hashfun(daddr);
1292
1293         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1294              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1295                 if (fnhe->fnhe_daddr == daddr)
1296                         return fnhe;
1297         }
1298         return NULL;
1299 }
1300
1301 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1302                               __be32 daddr, const bool do_cache)
1303 {
1304         bool ret = false;
1305
1306         spin_lock_bh(&fnhe_lock);
1307
1308         if (daddr == fnhe->fnhe_daddr) {
1309                 struct rtable __rcu **porig;
1310                 struct rtable *orig;
1311                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1312
1313                 if (rt_is_input_route(rt))
1314                         porig = &fnhe->fnhe_rth_input;
1315                 else
1316                         porig = &fnhe->fnhe_rth_output;
1317                 orig = rcu_dereference(*porig);
1318
1319                 if (fnhe->fnhe_genid != genid) {
1320                         fnhe->fnhe_genid = genid;
1321                         fnhe->fnhe_gw = 0;
1322                         fnhe->fnhe_pmtu = 0;
1323                         fnhe->fnhe_expires = 0;
1324                         fnhe_flush_routes(fnhe);
1325                         orig = NULL;
1326                 }
1327                 fill_route_from_fnhe(rt, fnhe);
1328                 if (!rt->rt_gateway)
1329                         rt->rt_gateway = daddr;
1330
1331                 if (do_cache) {
1332                         dst_hold(&rt->dst);
1333                         rcu_assign_pointer(*porig, rt);
1334                         if (orig) {
1335                                 dst_dev_put(&orig->dst);
1336                                 dst_release(&orig->dst);
1337                         }
1338                         ret = true;
1339                 }
1340
1341                 fnhe->fnhe_stamp = jiffies;
1342         }
1343         spin_unlock_bh(&fnhe_lock);
1344
1345         return ret;
1346 }
1347
1348 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1349 {
1350         struct rtable *orig, *prev, **p;
1351         bool ret = true;
1352
1353         if (rt_is_input_route(rt)) {
1354                 p = (struct rtable **)&nh->nh_rth_input;
1355         } else {
1356                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1357         }
1358         orig = *p;
1359
1360         /* hold dst before doing cmpxchg() to avoid race condition
1361          * on this dst
1362          */
1363         dst_hold(&rt->dst);
1364         prev = cmpxchg(p, orig, rt);
1365         if (prev == orig) {
1366                 if (orig) {
1367                         dst_dev_put(&orig->dst);
1368                         dst_release(&orig->dst);
1369                 }
1370         } else {
1371                 dst_release(&rt->dst);
1372                 ret = false;
1373         }
1374
1375         return ret;
1376 }
1377
 1378 struct uncached_list {	/* list of routes linked through rt->rt_uncached */
 1379         spinlock_t              lock;	/* taken (BH-safe) around every access to head */
 1380         struct list_head        head;	/* routes are appended with list_add_tail() */
 1381 };
 1382
 1383 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);	/* one list per CPU */
1384
1385 void rt_add_uncached_list(struct rtable *rt)
1386 {
1387         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1388
1389         rt->rt_uncached_list = ul;
1390
1391         spin_lock_bh(&ul->lock);
1392         list_add_tail(&rt->rt_uncached, &ul->head);
1393         spin_unlock_bh(&ul->lock);
1394 }
1395
1396 void rt_del_uncached_list(struct rtable *rt)
1397 {
1398         if (!list_empty(&rt->rt_uncached)) {
1399                 struct uncached_list *ul = rt->rt_uncached_list;
1400
1401                 spin_lock_bh(&ul->lock);
1402                 list_del(&rt->rt_uncached);
1403                 spin_unlock_bh(&ul->lock);
1404         }
1405 }
1406
1407 static void ipv4_dst_destroy(struct dst_entry *dst)
1408 {
1409         struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1410         struct rtable *rt = (struct rtable *)dst;
1411
1412         if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1413                 kfree(p);
1414
1415         rt_del_uncached_list(rt);
1416 }
1417
1418 void rt_flush_dev(struct net_device *dev)
1419 {
1420         struct net *net = dev_net(dev);
1421         struct rtable *rt;
1422         int cpu;
1423
1424         for_each_possible_cpu(cpu) {
1425                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1426
1427                 spin_lock_bh(&ul->lock);
1428                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1429                         if (rt->dst.dev != dev)
1430                                 continue;
1431                         rt->dst.dev = net->loopback_dev;
1432                         dev_hold(rt->dst.dev);
1433                         dev_put(dev);
1434                 }
1435                 spin_unlock_bh(&ul->lock);
1436         }
1437 }
1438
1439 static bool rt_cache_valid(const struct rtable *rt)
1440 {
1441         return  rt &&
1442                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1443                 !rt_is_expired(rt);
1444 }
1445
1446 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1447                            const struct fib_result *res,
1448                            struct fib_nh_exception *fnhe,
1449                            struct fib_info *fi, u16 type, u32 itag,
1450                            const bool do_cache)
1451 {
1452         bool cached = false;
1453
1454         if (fi) {
1455                 struct fib_nh *nh = &FIB_RES_NH(*res);
1456
1457                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1458                         rt->rt_gateway = nh->nh_gw;
1459                         rt->rt_uses_gateway = 1;
1460                 }
1461                 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1462                 if (fi->fib_metrics != &dst_default_metrics) {
1463                         rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1464                         refcount_inc(&fi->fib_metrics->refcnt);
1465                 }
1466 #ifdef CONFIG_IP_ROUTE_CLASSID
1467                 rt->dst.tclassid = nh->nh_tclassid;
1468 #endif
1469                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1470                 if (unlikely(fnhe))
1471                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1472                 else if (do_cache)
1473                         cached = rt_cache_route(nh, rt);
1474                 if (unlikely(!cached)) {
1475                         /* Routes we intend to cache in nexthop exception or
1476                          * FIB nexthop have the DST_NOCACHE bit clear.
1477                          * However, if we are unsuccessful at storing this
1478                          * route into the cache we really need to set it.
1479                          */
1480                         if (!rt->rt_gateway)
1481                                 rt->rt_gateway = daddr;
1482                         rt_add_uncached_list(rt);
1483                 }
1484         } else
1485                 rt_add_uncached_list(rt);
1486
1487 #ifdef CONFIG_IP_ROUTE_CLASSID
1488 #ifdef CONFIG_IP_MULTIPLE_TABLES
1489         set_class_tag(rt, res->tclassid);
1490 #endif
1491         set_class_tag(rt, itag);
1492 #endif
1493 }
1494
1495 struct rtable *rt_dst_alloc(struct net_device *dev,
1496                             unsigned int flags, u16 type,
1497                             bool nopolicy, bool noxfrm, bool will_cache)
1498 {
1499         struct rtable *rt;
1500
1501         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1502                        (will_cache ? 0 : DST_HOST) |
1503                        (nopolicy ? DST_NOPOLICY : 0) |
1504                        (noxfrm ? DST_NOXFRM : 0));
1505
1506         if (rt) {
1507                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1508                 rt->rt_flags = flags;
1509                 rt->rt_type = type;
1510                 rt->rt_is_input = 0;
1511                 rt->rt_iif = 0;
1512                 rt->rt_pmtu = 0;
1513                 rt->rt_gateway = 0;
1514                 rt->rt_uses_gateway = 0;
1515                 rt->rt_table_id = 0;
1516                 INIT_LIST_HEAD(&rt->rt_uncached);
1517
1518                 rt->dst.output = ip_output;
1519                 if (flags & RTCF_LOCAL)
1520                         rt->dst.input = ip_local_deliver;
1521         }
1522
1523         return rt;
1524 }
1525 EXPORT_SYMBOL(rt_dst_alloc);
1526
1527 /* called in rcu_read_lock() section */
1528 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1529                           u8 tos, struct net_device *dev,
1530                           struct in_device *in_dev, u32 *itag)
1531 {
1532         int err;
1533
1534         /* Primary sanity checks. */
1535         if (!in_dev)
1536                 return -EINVAL;
1537
1538         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1539             skb->protocol != htons(ETH_P_IP))
1540                 return -EINVAL;
1541
1542         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1543                 return -EINVAL;
1544
1545         if (ipv4_is_zeronet(saddr)) {
1546                 if (!ipv4_is_local_multicast(daddr))
1547                         return -EINVAL;
1548         } else {
1549                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1550                                           in_dev, itag);
1551                 if (err < 0)
1552                         return err;
1553         }
1554         return 0;
1555 }
1556
1557 /* called in rcu_read_lock() section */
1558 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1559                              u8 tos, struct net_device *dev, int our)
1560 {
1561         struct in_device *in_dev = __in_dev_get_rcu(dev);
1562         unsigned int flags = RTCF_MULTICAST;
1563         struct rtable *rth;
1564         u32 itag = 0;
1565         int err;
1566
1567         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1568         if (err)
1569                 return err;
1570
1571         if (our)
1572                 flags |= RTCF_LOCAL;
1573
1574         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1575                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1576         if (!rth)
1577                 return -ENOBUFS;
1578
1579 #ifdef CONFIG_IP_ROUTE_CLASSID
1580         rth->dst.tclassid = itag;
1581 #endif
1582         rth->dst.output = ip_rt_bug;
1583         rth->rt_is_input= 1;
1584
1585 #ifdef CONFIG_IP_MROUTE
1586         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1587                 rth->dst.input = ip_mr_input;
1588 #endif
1589         RT_CACHE_STAT_INC(in_slow_mc);
1590
1591         skb_dst_set(skb, &rth->dst);
1592         return 0;
1593 }
1594
1595
1596 static void ip_handle_martian_source(struct net_device *dev,
1597                                      struct in_device *in_dev,
1598                                      struct sk_buff *skb,
1599                                      __be32 daddr,
1600                                      __be32 saddr)
1601 {
1602         RT_CACHE_STAT_INC(in_martian_src);
1603 #ifdef CONFIG_IP_ROUTE_VERBOSE
1604         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1605                 /*
1606                  *      RFC1812 recommendation, if source is martian,
1607                  *      the only hint is MAC header.
1608                  */
1609                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1610                         &daddr, &saddr, dev->name);
1611                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1612                         print_hex_dump(KERN_WARNING, "ll header: ",
1613                                        DUMP_PREFIX_OFFSET, 16, 1,
1614                                        skb_mac_header(skb),
1615                                        dev->hard_header_len, true);
1616                 }
1617         }
1618 #endif
1619 }
1620
1621 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1622 {
1623         struct fnhe_hash_bucket *hash;
1624         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1625         u32 hval = fnhe_hashfun(daddr);
1626
1627         spin_lock_bh(&fnhe_lock);
1628
1629         hash = rcu_dereference_protected(nh->nh_exceptions,
1630                                          lockdep_is_held(&fnhe_lock));
1631         hash += hval;
1632
1633         fnhe_p = &hash->chain;
1634         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1635         while (fnhe) {
1636                 if (fnhe->fnhe_daddr == daddr) {
1637                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1638                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1639                         fnhe_flush_routes(fnhe);
1640                         kfree_rcu(fnhe, rcu);
1641                         break;
1642                 }
1643                 fnhe_p = &fnhe->fnhe_next;
1644                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1645                                                  lockdep_is_held(&fnhe_lock));
1646         }
1647
1648         spin_unlock_bh(&fnhe_lock);
1649 }
1650
1651 static void set_lwt_redirect(struct rtable *rth)
1652 {
1653         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1654                 rth->dst.lwtstate->orig_output = rth->dst.output;
1655                 rth->dst.output = lwtunnel_output;
1656         }
1657
1658         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1659                 rth->dst.lwtstate->orig_input = rth->dst.input;
1660                 rth->dst.input = lwtunnel_input;
1661         }
1662 }
1663
1664 /* called in rcu_read_lock() section */
1665 static int __mkroute_input(struct sk_buff *skb,
1666                            const struct fib_result *res,
1667                            struct in_device *in_dev,
1668                            __be32 daddr, __be32 saddr, u32 tos)
1669 {
1670         struct fib_nh_exception *fnhe;
1671         struct rtable *rth;
1672         int err;
1673         struct in_device *out_dev;
1674         bool do_cache;
1675         u32 itag = 0;
1676
1677         /* get a working reference to the output device */
1678         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1679         if (!out_dev) {
1680                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1681                 return -EINVAL;
1682         }
1683
1684         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1685                                   in_dev->dev, in_dev, &itag);
1686         if (err < 0) {
1687                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1688                                          saddr);
1689
1690                 goto cleanup;
1691         }
1692
1693         do_cache = res->fi && !itag;
1694         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1695             skb->protocol == htons(ETH_P_IP) &&
1696             (IN_DEV_SHARED_MEDIA(out_dev) ||
1697              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1698                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1699
1700         if (skb->protocol != htons(ETH_P_IP)) {
1701                 /* Not IP (i.e. ARP). Do not create route, if it is
1702                  * invalid for proxy arp. DNAT routes are always valid.
1703                  *
1704                  * Proxy arp feature have been extended to allow, ARP
1705                  * replies back to the same interface, to support
1706                  * Private VLAN switch technologies. See arp.c.
1707                  */
1708                 if (out_dev == in_dev &&
1709                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1710                         err = -EINVAL;
1711                         goto cleanup;
1712                 }
1713         }
1714
1715         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1716         if (do_cache) {
1717                 if (fnhe) {
1718                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1719                         if (rth && rth->dst.expires &&
1720                             time_after(jiffies, rth->dst.expires)) {
1721                                 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1722                                 fnhe = NULL;
1723                         } else {
1724                                 goto rt_cache;
1725                         }
1726                 }
1727
1728                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1729
1730 rt_cache:
1731                 if (rt_cache_valid(rth)) {
1732                         skb_dst_set_noref(skb, &rth->dst);
1733                         goto out;
1734                 }
1735         }
1736
1737         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1738                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1739                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1740         if (!rth) {
1741                 err = -ENOBUFS;
1742                 goto cleanup;
1743         }
1744
1745         rth->rt_is_input = 1;
1746         if (res->table)
1747                 rth->rt_table_id = res->table->tb_id;
1748         RT_CACHE_STAT_INC(in_slow_tot);
1749
1750         rth->dst.input = ip_forward;
1751
1752         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1753                        do_cache);
1754         set_lwt_redirect(rth);
1755         skb_dst_set(skb, &rth->dst);
1756 out:
1757         err = 0;
1758  cleanup:
1759         return err;
1760 }
1761
1762 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1763 /* To make ICMP packets follow the right flow, the multipath hash is
1764  * calculated from the inner IP addresses.
1765  */
1766 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1767                                  struct flow_keys *hash_keys)
1768 {
1769         const struct iphdr *outer_iph = ip_hdr(skb);
1770         const struct iphdr *inner_iph;
1771         const struct icmphdr *icmph;
1772         struct iphdr _inner_iph;
1773         struct icmphdr _icmph;
1774
1775         hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1776         hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1777         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1778                 return;
1779
1780         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1781                 return;
1782
1783         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1784                                    &_icmph);
1785         if (!icmph)
1786                 return;
1787
1788         if (icmph->type != ICMP_DEST_UNREACH &&
1789             icmph->type != ICMP_REDIRECT &&
1790             icmph->type != ICMP_TIME_EXCEEDED &&
1791             icmph->type != ICMP_PARAMETERPROB)
1792                 return;
1793
1794         inner_iph = skb_header_pointer(skb,
1795                                        outer_iph->ihl * 4 + sizeof(_icmph),
1796                                        sizeof(_inner_iph), &_inner_iph);
1797         if (!inner_iph)
1798                 return;
1799         hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1800         hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1801 }
1802
1803 /* if skb is set it will be used and fl4 can be NULL */
1804 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1805                        const struct sk_buff *skb)
1806 {
1807         struct net *net = fi->fib_net;
1808         struct flow_keys hash_keys;
1809         u32 mhash;
1810
1811         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1812         case 0:
1813                 memset(&hash_keys, 0, sizeof(hash_keys));
1814                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1815                 if (skb) {
1816                         ip_multipath_l3_keys(skb, &hash_keys);
1817                 } else {
1818                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1819                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1820                 }
1821                 break;
1822         case 1:
1823                 /* skb is currently provided only when forwarding */
1824                 if (skb) {
1825                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1826                         struct flow_keys keys;
1827
1828                         /* short-circuit if we already have L4 hash present */
1829                         if (skb->l4_hash)
1830                                 return skb_get_hash_raw(skb) >> 1;
1831                         memset(&hash_keys, 0, sizeof(hash_keys));
1832                         skb_flow_dissect_flow_keys(skb, &keys, flag);
1833                         hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1834                         hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1835                         hash_keys.ports.src = keys.ports.src;
1836                         hash_keys.ports.dst = keys.ports.dst;
1837                         hash_keys.basic.ip_proto = keys.basic.ip_proto;
1838                 } else {
1839                         memset(&hash_keys, 0, sizeof(hash_keys));
1840                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1841                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1842                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1843                         hash_keys.ports.src = fl4->fl4_sport;
1844                         hash_keys.ports.dst = fl4->fl4_dport;
1845                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1846                 }
1847                 break;
1848         }
1849         mhash = flow_hash_from_keys(&hash_keys);
1850
1851         return mhash >> 1;
1852 }
1853 EXPORT_SYMBOL_GPL(fib_multipath_hash);
1854 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1855
1856 static int ip_mkroute_input(struct sk_buff *skb,
1857                             struct fib_result *res,
1858                             struct in_device *in_dev,
1859                             __be32 daddr, __be32 saddr, u32 tos)
1860 {
1861 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1862         if (res->fi && res->fi->fib_nhs > 1) {
1863                 int h = fib_multipath_hash(res->fi, NULL, skb);
1864
1865                 fib_select_multipath(res, h);
1866         }
1867 #endif
1868
1869         /* create a routing cache entry */
1870         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1871 }
1872
1873 /*
1874  *      NOTE. We drop all the packets that has local source
1875  *      addresses, because every properly looped back packet
1876  *      must have correct destination already attached by output routine.
1877  *
1878  *      Such approach solves two big problems:
1879  *      1. Not simplex devices are handled properly.
1880  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1881  *      called with rcu_read_lock()
1882  */
1883
1884 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1885                                u8 tos, struct net_device *dev,
1886                                struct fib_result *res)
1887 {
1888         struct in_device *in_dev = __in_dev_get_rcu(dev);
1889         struct ip_tunnel_info *tun_info;
1890         struct flowi4   fl4;
1891         unsigned int    flags = 0;
1892         u32             itag = 0;
1893         struct rtable   *rth;
1894         int             err = -EINVAL;
1895         struct net    *net = dev_net(dev);
1896         bool do_cache;
1897
1898         /* IP on this device is disabled. */
1899
1900         if (!in_dev)
1901                 goto out;
1902
1903         /* Check for the most weird martians, which can be not detected
1904            by fib_lookup.
1905          */
1906
1907         tun_info = skb_tunnel_info(skb);
1908         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1909                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1910         else
1911                 fl4.flowi4_tun_key.tun_id = 0;
1912         skb_dst_drop(skb);
1913
1914         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1915                 goto martian_source;
1916
1917         res->fi = NULL;
1918         res->table = NULL;
1919         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1920                 goto brd_input;
1921
1922         /* Accept zero addresses only to limited broadcast;
1923          * I even do not know to fix it or not. Waiting for complains :-)
1924          */
1925         if (ipv4_is_zeronet(saddr))
1926                 goto martian_source;
1927
1928         if (ipv4_is_zeronet(daddr))
1929                 goto martian_destination;
1930
1931         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1932          * and call it once if daddr or/and saddr are loopback addresses
1933          */
1934         if (ipv4_is_loopback(daddr)) {
1935                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1936                         goto martian_destination;
1937         } else if (ipv4_is_loopback(saddr)) {
1938                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1939                         goto martian_source;
1940         }
1941
1942         /*
1943          *      Now we are ready to route packet.
1944          */
1945         fl4.flowi4_oif = 0;
1946         fl4.flowi4_iif = dev->ifindex;
1947         fl4.flowi4_mark = skb->mark;
1948         fl4.flowi4_tos = tos;
1949         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1950         fl4.flowi4_flags = 0;
1951         fl4.daddr = daddr;
1952         fl4.saddr = saddr;
1953         fl4.flowi4_uid = sock_net_uid(net, NULL);
1954         err = fib_lookup(net, &fl4, res, 0);
1955         if (err != 0) {
1956                 if (!IN_DEV_FORWARD(in_dev))
1957                         err = -EHOSTUNREACH;
1958                 goto no_route;
1959         }
1960
1961         if (res->type == RTN_BROADCAST)
1962                 goto brd_input;
1963
1964         if (res->type == RTN_LOCAL) {
1965                 err = fib_validate_source(skb, saddr, daddr, tos,
1966                                           0, dev, in_dev, &itag);
1967                 if (err < 0)
1968                         goto martian_source;
1969                 goto local_input;
1970         }
1971
1972         if (!IN_DEV_FORWARD(in_dev)) {
1973                 err = -EHOSTUNREACH;
1974                 goto no_route;
1975         }
1976         if (res->type != RTN_UNICAST)
1977                 goto martian_destination;
1978
1979         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1980 out:    return err;
1981
1982 brd_input:
1983         if (skb->protocol != htons(ETH_P_IP))
1984                 goto e_inval;
1985
1986         if (!ipv4_is_zeronet(saddr)) {
1987                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1988                                           in_dev, &itag);
1989                 if (err < 0)
1990                         goto martian_source;
1991         }
1992         flags |= RTCF_BROADCAST;
1993         res->type = RTN_BROADCAST;
1994         RT_CACHE_STAT_INC(in_brd);
1995
1996 local_input:
1997         do_cache = false;
1998         if (res->fi) {
1999                 if (!itag) {
2000                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2001                         if (rt_cache_valid(rth)) {
2002                                 skb_dst_set_noref(skb, &rth->dst);
2003                                 err = 0;
2004                                 goto out;
2005                         }
2006                         do_cache = true;
2007                 }
2008         }
2009
2010         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2011                            flags | RTCF_LOCAL, res->type,
2012                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2013         if (!rth)
2014                 goto e_nobufs;
2015
2016         rth->dst.output= ip_rt_bug;
2017 #ifdef CONFIG_IP_ROUTE_CLASSID
2018         rth->dst.tclassid = itag;
2019 #endif
2020         rth->rt_is_input = 1;
2021         if (res->table)
2022                 rth->rt_table_id = res->table->tb_id;
2023
2024         RT_CACHE_STAT_INC(in_slow_tot);
2025         if (res->type == RTN_UNREACHABLE) {
2026                 rth->dst.input= ip_error;
2027                 rth->dst.error= -err;
2028                 rth->rt_flags   &= ~RTCF_LOCAL;
2029         }
2030
2031         if (do_cache) {
2032                 struct fib_nh *nh = &FIB_RES_NH(*res);
2033
2034                 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2035                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2036                         WARN_ON(rth->dst.input == lwtunnel_input);
2037                         rth->dst.lwtstate->orig_input = rth->dst.input;
2038                         rth->dst.input = lwtunnel_input;
2039                 }
2040
2041                 if (unlikely(!rt_cache_route(nh, rth)))
2042                         rt_add_uncached_list(rth);
2043         }
2044         skb_dst_set(skb, &rth->dst);
2045         err = 0;
2046         goto out;
2047
2048 no_route:
2049         RT_CACHE_STAT_INC(in_no_route);
2050         res->type = RTN_UNREACHABLE;
2051         res->fi = NULL;
2052         res->table = NULL;
2053         goto local_input;
2054
2055         /*
2056          *      Do not cache martian addresses: they should be logged (RFC1812)
2057          */
2058 martian_destination:
2059         RT_CACHE_STAT_INC(in_martian_dst);
2060 #ifdef CONFIG_IP_ROUTE_VERBOSE
2061         if (IN_DEV_LOG_MARTIANS(in_dev))
2062                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2063                                      &daddr, &saddr, dev->name);
2064 #endif
2065
2066 e_inval:
2067         err = -EINVAL;
2068         goto out;
2069
2070 e_nobufs:
2071         err = -ENOBUFS;
2072         goto out;
2073
2074 martian_source:
2075         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2076         goto out;
2077 }
2078
2079 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2080                          u8 tos, struct net_device *dev)
2081 {
2082         struct fib_result res;
2083         int err;
2084
2085         tos &= IPTOS_RT_MASK;
2086         rcu_read_lock();
2087         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2088         rcu_read_unlock();
2089
2090         return err;
2091 }
2092 EXPORT_SYMBOL(ip_route_input_noref);
2093
2094 /* called with rcu_read_lock held */
2095 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2096                        u8 tos, struct net_device *dev, struct fib_result *res)
2097 {
2098         /* Multicast recognition logic is moved from route cache to here.
2099            The problem was that too many Ethernet cards have broken/missing
2100            hardware multicast filters :-( As result the host on multicasting
2101            network acquires a lot of useless route cache entries, sort of
2102            SDR messages from all the world. Now we try to get rid of them.
2103            Really, provided software IP multicast filter is organized
2104            reasonably (at least, hashed), it does not result in a slowdown
2105            comparing with route cache reject entries.
2106            Note, that multicast routers are not affected, because
2107            route cache entry is created eventually.
2108          */
2109         if (ipv4_is_multicast(daddr)) {
2110                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2111                 int our = 0;
2112                 int err = -EINVAL;
2113
2114                 if (in_dev)
2115                         our = ip_check_mc_rcu(in_dev, daddr, saddr,
2116                                               ip_hdr(skb)->protocol);
2117
2118                 /* check l3 master if no match yet */
2119                 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2120                         struct in_device *l3_in_dev;
2121
2122                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2123                         if (l3_in_dev)
2124                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2125                                                       ip_hdr(skb)->protocol);
2126                 }
2127
2128                 if (our
2129 #ifdef CONFIG_IP_MROUTE
2130                         ||
2131                     (!ipv4_is_local_multicast(daddr) &&
2132                      IN_DEV_MFORWARD(in_dev))
2133 #endif
2134                    ) {
2135                         err = ip_route_input_mc(skb, daddr, saddr,
2136                                                 tos, dev, our);
2137                 }
2138                 return err;
2139         }
2140
2141         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2142 }
2143
2144 /* called with rcu_read_lock() */
2145 static struct rtable *__mkroute_output(const struct fib_result *res,
2146                                        const struct flowi4 *fl4, int orig_oif,
2147                                        struct net_device *dev_out,
2148                                        unsigned int flags)
2149 {
2150         struct fib_info *fi = res->fi;
2151         struct fib_nh_exception *fnhe;
2152         struct in_device *in_dev;
2153         u16 type = res->type;
2154         struct rtable *rth;
2155         bool do_cache;
2156
2157         in_dev = __in_dev_get_rcu(dev_out);
2158         if (!in_dev)
2159                 return ERR_PTR(-EINVAL);
2160
2161         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2162                 if (ipv4_is_loopback(fl4->saddr) &&
2163                     !(dev_out->flags & IFF_LOOPBACK) &&
2164                     !netif_is_l3_master(dev_out))
2165                         return ERR_PTR(-EINVAL);
2166
2167         if (ipv4_is_lbcast(fl4->daddr))
2168                 type = RTN_BROADCAST;
2169         else if (ipv4_is_multicast(fl4->daddr))
2170                 type = RTN_MULTICAST;
2171         else if (ipv4_is_zeronet(fl4->daddr))
2172                 return ERR_PTR(-EINVAL);
2173
2174         if (dev_out->flags & IFF_LOOPBACK)
2175                 flags |= RTCF_LOCAL;
2176
2177         do_cache = true;
2178         if (type == RTN_BROADCAST) {
2179                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2180                 fi = NULL;
2181         } else if (type == RTN_MULTICAST) {
2182                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2183                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2184                                      fl4->flowi4_proto))
2185                         flags &= ~RTCF_LOCAL;
2186                 else
2187                         do_cache = false;
2188                 /* If multicast route do not exist use
2189                  * default one, but do not gateway in this case.
2190                  * Yes, it is hack.
2191                  */
2192                 if (fi && res->prefixlen < 4)
2193                         fi = NULL;
2194         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2195                    (orig_oif != dev_out->ifindex)) {
2196                 /* For local routes that require a particular output interface
2197                  * we do not want to cache the result.  Caching the result
2198                  * causes incorrect behaviour when there are multiple source
2199                  * addresses on the interface, the end result being that if the
2200                  * intended recipient is waiting on that interface for the
2201                  * packet he won't receive it because it will be delivered on
2202                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2203                  * be set to the loopback interface as well.
2204                  */
2205                 fi = NULL;
2206         }
2207
2208         fnhe = NULL;
2209         do_cache &= fi != NULL;
2210         if (do_cache) {
2211                 struct rtable __rcu **prth;
2212                 struct fib_nh *nh = &FIB_RES_NH(*res);
2213
2214                 fnhe = find_exception(nh, fl4->daddr);
2215                 if (fnhe) {
2216                         prth = &fnhe->fnhe_rth_output;
2217                         rth = rcu_dereference(*prth);
2218                         if (rth && rth->dst.expires &&
2219                             time_after(jiffies, rth->dst.expires)) {
2220                                 ip_del_fnhe(nh, fl4->daddr);
2221                                 fnhe = NULL;
2222                         } else {
2223                                 goto rt_cache;
2224                         }
2225                 }
2226
2227                 if (unlikely(fl4->flowi4_flags &
2228                              FLOWI_FLAG_KNOWN_NH &&
2229                              !(nh->nh_gw &&
2230                                nh->nh_scope == RT_SCOPE_LINK))) {
2231                         do_cache = false;
2232                         goto add;
2233                 }
2234                 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2235                 rth = rcu_dereference(*prth);
2236
2237 rt_cache:
2238                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2239                         return rth;
2240         }
2241
2242 add:
2243         rth = rt_dst_alloc(dev_out, flags, type,
2244                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2245                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2246                            do_cache);
2247         if (!rth)
2248                 return ERR_PTR(-ENOBUFS);
2249
2250         rth->rt_iif     = orig_oif ? : 0;
2251         if (res->table)
2252                 rth->rt_table_id = res->table->tb_id;
2253
2254         RT_CACHE_STAT_INC(out_slow_tot);
2255
2256         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2257                 if (flags & RTCF_LOCAL &&
2258                     !(dev_out->flags & IFF_LOOPBACK)) {
2259                         rth->dst.output = ip_mc_output;
2260                         RT_CACHE_STAT_INC(out_slow_mc);
2261                 }
2262 #ifdef CONFIG_IP_MROUTE
2263                 if (type == RTN_MULTICAST) {
2264                         if (IN_DEV_MFORWARD(in_dev) &&
2265                             !ipv4_is_local_multicast(fl4->daddr)) {
2266                                 rth->dst.input = ip_mr_input;
2267                                 rth->dst.output = ip_mc_output;
2268                         }
2269                 }
2270 #endif
2271         }
2272
2273         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2274         set_lwt_redirect(rth);
2275
2276         return rth;
2277 }
2278
2279 /*
2280  * Major route resolver routine.
2281  */
2282
2283 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2284                                         const struct sk_buff *skb)
2285 {
2286         __u8 tos = RT_FL_TOS(fl4);
2287         struct fib_result res;
2288         struct rtable *rth;
2289
2290         res.tclassid    = 0;
2291         res.fi          = NULL;
2292         res.table       = NULL;
2293
2294         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2295         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2296         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2297                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2298
2299         rcu_read_lock();
2300         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2301         rcu_read_unlock();
2302
2303         return rth;
2304 }
2305 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2306
2307 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2308                                             struct fib_result *res,
2309                                             const struct sk_buff *skb)
2310 {
2311         struct net_device *dev_out = NULL;
2312         int orig_oif = fl4->flowi4_oif;
2313         unsigned int flags = 0;
2314         struct rtable *rth;
2315         int err = -ENETUNREACH;
2316
2317         if (fl4->saddr) {
2318                 rth = ERR_PTR(-EINVAL);
2319                 if (ipv4_is_multicast(fl4->saddr) ||
2320                     ipv4_is_lbcast(fl4->saddr) ||
2321                     ipv4_is_zeronet(fl4->saddr))
2322                         goto out;
2323
2324                 /* I removed check for oif == dev_out->oif here.
2325                    It was wrong for two reasons:
2326                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2327                       is assigned to multiple interfaces.
2328                    2. Moreover, we are allowed to send packets with saddr
2329                       of another iface. --ANK
2330                  */
2331
2332                 if (fl4->flowi4_oif == 0 &&
2333                     (ipv4_is_multicast(fl4->daddr) ||
2334                      ipv4_is_lbcast(fl4->daddr))) {
2335                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2336                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2337                         if (!dev_out)
2338                                 goto out;
2339
2340                         /* Special hack: user can direct multicasts
2341                            and limited broadcast via necessary interface
2342                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2343                            This hack is not just for fun, it allows
2344                            vic,vat and friends to work.
2345                            They bind socket to loopback, set ttl to zero
2346                            and expect that it will work.
2347                            From the viewpoint of routing cache they are broken,
2348                            because we are not allowed to build multicast path
2349                            with loopback source addr (look, routing cache
2350                            cannot know, that ttl is zero, so that packet
2351                            will not leave this host and route is valid).
2352                            Luckily, this hack is good workaround.
2353                          */
2354
2355                         fl4->flowi4_oif = dev_out->ifindex;
2356                         goto make_route;
2357                 }
2358
2359                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2360                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2361                         if (!__ip_dev_find(net, fl4->saddr, false))
2362                                 goto out;
2363                 }
2364         }
2365
2366
2367         if (fl4->flowi4_oif) {
2368                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2369                 rth = ERR_PTR(-ENODEV);
2370                 if (!dev_out)
2371                         goto out;
2372
2373                 /* RACE: Check return value of inet_select_addr instead. */
2374                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2375                         rth = ERR_PTR(-ENETUNREACH);
2376                         goto out;
2377                 }
2378                 if (ipv4_is_local_multicast(fl4->daddr) ||
2379                     ipv4_is_lbcast(fl4->daddr) ||
2380                     fl4->flowi4_proto == IPPROTO_IGMP) {
2381                         if (!fl4->saddr)
2382                                 fl4->saddr = inet_select_addr(dev_out, 0,
2383                                                               RT_SCOPE_LINK);
2384                         goto make_route;
2385                 }
2386                 if (!fl4->saddr) {
2387                         if (ipv4_is_multicast(fl4->daddr))
2388                                 fl4->saddr = inet_select_addr(dev_out, 0,
2389                                                               fl4->flowi4_scope);
2390                         else if (!fl4->daddr)
2391                                 fl4->saddr = inet_select_addr(dev_out, 0,
2392                                                               RT_SCOPE_HOST);
2393                 }
2394         }
2395
2396         if (!fl4->daddr) {
2397                 fl4->daddr = fl4->saddr;
2398                 if (!fl4->daddr)
2399                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2400                 dev_out = net->loopback_dev;
2401                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2402                 res->type = RTN_LOCAL;
2403                 flags |= RTCF_LOCAL;
2404                 goto make_route;
2405         }
2406
2407         err = fib_lookup(net, fl4, res, 0);
2408         if (err) {
2409                 res->fi = NULL;
2410                 res->table = NULL;
2411                 if (fl4->flowi4_oif &&
2412                     (ipv4_is_multicast(fl4->daddr) ||
2413                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2414                         /* Apparently, routing tables are wrong. Assume,
2415                            that the destination is on link.
2416
2417                            WHY? DW.
2418                            Because we are allowed to send to iface
2419                            even if it has NO routes and NO assigned
2420                            addresses. When oif is specified, routing
2421                            tables are looked up with only one purpose:
2422                            to catch if destination is gatewayed, rather than
2423                            direct. Moreover, if MSG_DONTROUTE is set,
2424                            we send packet, ignoring both routing tables
2425                            and ifaddr state. --ANK
2426
2427
2428                            We could make it even if oif is unknown,
2429                            likely IPv6, but we do not.
2430                          */
2431
2432                         if (fl4->saddr == 0)
2433                                 fl4->saddr = inet_select_addr(dev_out, 0,
2434                                                               RT_SCOPE_LINK);
2435                         res->type = RTN_UNICAST;
2436                         goto make_route;
2437                 }
2438                 rth = ERR_PTR(err);
2439                 goto out;
2440         }
2441
2442         if (res->type == RTN_LOCAL) {
2443                 if (!fl4->saddr) {
2444                         if (res->fi->fib_prefsrc)
2445                                 fl4->saddr = res->fi->fib_prefsrc;
2446                         else
2447                                 fl4->saddr = fl4->daddr;
2448                 }
2449
2450                 /* L3 master device is the loopback for that domain */
2451                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2452                         net->loopback_dev;
2453                 fl4->flowi4_oif = dev_out->ifindex;
2454                 flags |= RTCF_LOCAL;
2455                 goto make_route;
2456         }
2457
2458         fib_select_path(net, res, fl4, skb);
2459
2460         dev_out = FIB_RES_DEV(*res);
2461         fl4->flowi4_oif = dev_out->ifindex;
2462
2463
2464 make_route:
2465         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2466
2467 out:
2468         return rth;
2469 }
2470
2471 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2472 {
2473         return NULL;
2474 }
2475
2476 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2477 {
2478         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2479
2480         return mtu ? : dst->dev->mtu;
2481 }
2482
2483 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2484                                           struct sk_buff *skb, u32 mtu)
2485 {
2486 }
2487
/*
 * .redirect callback of ipv4_dst_blackhole_ops.
 *
 * Intentionally empty: redirects are not acted upon for blackhole routes.
 */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
                                       struct sk_buff *skb)
{
}
2492
2493 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2494                                           unsigned long old)
2495 {
2496         return NULL;
2497 }
2498
2499 static struct dst_ops ipv4_dst_blackhole_ops = {
2500         .family                 =       AF_INET,
2501         .check                  =       ipv4_blackhole_dst_check,
2502         .mtu                    =       ipv4_blackhole_mtu,
2503         .default_advmss         =       ipv4_default_advmss,
2504         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2505         .redirect               =       ipv4_rt_blackhole_redirect,
2506         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2507         .neigh_lookup           =       ipv4_neigh_lookup,
2508 };
2509
2510 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2511 {
2512         struct rtable *ort = (struct rtable *) dst_orig;
2513         struct rtable *rt;
2514
2515         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2516         if (rt) {
2517                 struct dst_entry *new = &rt->dst;
2518
2519                 new->__use = 1;
2520                 new->input = dst_discard;
2521                 new->output = dst_discard_out;
2522
2523                 new->dev = net->loopback_dev;
2524                 if (new->dev)
2525                         dev_hold(new->dev);
2526
2527                 rt->rt_is_input = ort->rt_is_input;
2528                 rt->rt_iif = ort->rt_iif;
2529                 rt->rt_pmtu = ort->rt_pmtu;
2530
2531                 rt->rt_genid = rt_genid_ipv4(net);
2532                 rt->rt_flags = ort->rt_flags;
2533                 rt->rt_type = ort->rt_type;
2534                 rt->rt_gateway = ort->rt_gateway;
2535                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2536
2537                 INIT_LIST_HEAD(&rt->rt_uncached);
2538         }
2539
2540         dst_release(dst_orig);
2541
2542         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2543 }
2544
2545 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2546                                     const struct sock *sk)
2547 {
2548         struct rtable *rt = __ip_route_output_key(net, flp4);
2549
2550         if (IS_ERR(rt))
2551                 return rt;
2552
2553         if (flp4->flowi4_proto)
2554                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2555                                                         flowi4_to_flowi(flp4),
2556                                                         sk, 0);
2557
2558         return rt;
2559 }
2560 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2561
2562 /* called with rcu_read_lock held */
2563 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2564                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2565                         u32 seq)
2566 {
2567         struct rtable *rt = skb_rtable(skb);
2568         struct rtmsg *r;
2569         struct nlmsghdr *nlh;
2570         unsigned long expires = 0;
2571         u32 error;
2572         u32 metrics[RTAX_MAX];
2573
2574         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2575         if (!nlh)
2576                 return -EMSGSIZE;
2577
2578         r = nlmsg_data(nlh);
2579         r->rtm_family    = AF_INET;
2580         r->rtm_dst_len  = 32;
2581         r->rtm_src_len  = 0;
2582         r->rtm_tos      = fl4->flowi4_tos;
2583         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2584         if (nla_put_u32(skb, RTA_TABLE, table_id))
2585                 goto nla_put_failure;
2586         r->rtm_type     = rt->rt_type;
2587         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2588         r->rtm_protocol = RTPROT_UNSPEC;
2589         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2590         if (rt->rt_flags & RTCF_NOTIFY)
2591                 r->rtm_flags |= RTM_F_NOTIFY;
2592         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2593                 r->rtm_flags |= RTCF_DOREDIRECT;
2594
2595         if (nla_put_in_addr(skb, RTA_DST, dst))
2596                 goto nla_put_failure;
2597         if (src) {
2598                 r->rtm_src_len = 32;
2599                 if (nla_put_in_addr(skb, RTA_SRC, src))
2600                         goto nla_put_failure;
2601         }
2602         if (rt->dst.dev &&
2603             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2604                 goto nla_put_failure;
2605 #ifdef CONFIG_IP_ROUTE_CLASSID
2606         if (rt->dst.tclassid &&
2607             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2608                 goto nla_put_failure;
2609 #endif
2610         if (!rt_is_input_route(rt) &&
2611             fl4->saddr != src) {
2612                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2613                         goto nla_put_failure;
2614         }
2615         if (rt->rt_uses_gateway &&
2616             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2617                 goto nla_put_failure;
2618
2619         expires = rt->dst.expires;
2620         if (expires) {
2621                 unsigned long now = jiffies;
2622
2623                 if (time_before(now, expires))
2624                         expires -= now;
2625                 else
2626                         expires = 0;
2627         }
2628
2629         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2630         if (rt->rt_pmtu && expires)
2631                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2632         if (rtnetlink_put_metrics(skb, metrics) < 0)
2633                 goto nla_put_failure;
2634
2635         if (fl4->flowi4_mark &&
2636             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2637                 goto nla_put_failure;
2638
2639         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2640             nla_put_u32(skb, RTA_UID,
2641                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2642                 goto nla_put_failure;
2643
2644         error = rt->dst.error;
2645
2646         if (rt_is_input_route(rt)) {
2647 #ifdef CONFIG_IP_MROUTE
2648                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2649                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2650                         int err = ipmr_get_route(net, skb,
2651                                                  fl4->saddr, fl4->daddr,
2652                                                  r, portid);
2653
2654                         if (err <= 0) {
2655                                 if (err == 0)
2656                                         return 0;
2657                                 goto nla_put_failure;
2658                         }
2659                 } else
2660 #endif
2661                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2662                                 goto nla_put_failure;
2663         }
2664
2665         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2666                 goto nla_put_failure;
2667
2668         nlmsg_end(skb, nlh);
2669         return 0;
2670
2671 nla_put_failure:
2672         nlmsg_cancel(skb, nlh);
2673         return -EMSGSIZE;
2674 }
2675
2676 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2677                              struct netlink_ext_ack *extack)
2678 {
2679         struct net *net = sock_net(in_skb->sk);
2680         struct rtmsg *rtm;
2681         struct nlattr *tb[RTA_MAX+1];
2682         struct fib_result res = {};
2683         struct rtable *rt = NULL;
2684         struct flowi4 fl4;
2685         __be32 dst = 0;
2686         __be32 src = 0;
2687         u32 iif;
2688         int err;
2689         int mark;
2690         struct sk_buff *skb;
2691         u32 table_id = RT_TABLE_MAIN;
2692         kuid_t uid;
2693
2694         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2695                           extack);
2696         if (err < 0)
2697                 goto errout;
2698
2699         rtm = nlmsg_data(nlh);
2700
2701         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2702         if (!skb) {
2703                 err = -ENOBUFS;
2704                 goto errout;
2705         }
2706
2707         /* Reserve room for dummy headers, this skb can pass
2708            through good chunk of routing engine.
2709          */
2710         skb_reset_mac_header(skb);
2711         skb_reset_network_header(skb);
2712
2713         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2714         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2715         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2716         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2717         if (tb[RTA_UID])
2718                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2719         else
2720                 uid = (iif ? INVALID_UID : current_uid());
2721
2722         /* Bugfix: need to give ip_route_input enough of an IP header to
2723          * not gag.
2724          */
2725         ip_hdr(skb)->protocol = IPPROTO_UDP;
2726         ip_hdr(skb)->saddr = src;
2727         ip_hdr(skb)->daddr = dst;
2728
2729         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2730
2731         memset(&fl4, 0, sizeof(fl4));
2732         fl4.daddr = dst;
2733         fl4.saddr = src;
2734         fl4.flowi4_tos = rtm->rtm_tos;
2735         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2736         fl4.flowi4_mark = mark;
2737         fl4.flowi4_uid = uid;
2738
2739         rcu_read_lock();
2740
2741         if (iif) {
2742                 struct net_device *dev;
2743
2744                 dev = dev_get_by_index_rcu(net, iif);
2745                 if (!dev) {
2746                         err = -ENODEV;
2747                         goto errout_free;
2748                 }
2749
2750                 skb->protocol   = htons(ETH_P_IP);
2751                 skb->dev        = dev;
2752                 skb->mark       = mark;
2753                 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2754                                          dev, &res);
2755
2756                 rt = skb_rtable(skb);
2757                 if (err == 0 && rt->dst.error)
2758                         err = -rt->dst.error;
2759         } else {
2760                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2761                 err = 0;
2762                 if (IS_ERR(rt))
2763                         err = PTR_ERR(rt);
2764                 else
2765                         skb_dst_set(skb, &rt->dst);
2766         }
2767
2768         if (err)
2769                 goto errout_free;
2770
2771         if (rtm->rtm_flags & RTM_F_NOTIFY)
2772                 rt->rt_flags |= RTCF_NOTIFY;
2773
2774         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2775                 table_id = rt->rt_table_id;
2776
2777         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2778                 if (!res.fi) {
2779                         err = fib_props[res.type].error;
2780                         if (!err)
2781                                 err = -EHOSTUNREACH;
2782                         goto errout_free;
2783                 }
2784                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2785                                     nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2786                                     rt->rt_type, res.prefix, res.prefixlen,
2787                                     fl4.flowi4_tos, res.fi, 0);
2788         } else {
2789                 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2790                                    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2791         }
2792         if (err < 0)
2793                 goto errout_free;
2794
2795         rcu_read_unlock();
2796
2797         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2798 errout:
2799         return err;
2800
2801 errout_free:
2802         rcu_read_unlock();
2803         kfree_skb(skb);
2804         goto errout;
2805 }
2806
2807 void ip_rt_multicast_event(struct in_device *in_dev)
2808 {
2809         rt_cache_flush(dev_net(in_dev->dev));
2810 }
2811
2812 #ifdef CONFIG_SYSCTL
2813 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2814 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2815 static int ip_rt_gc_elasticity __read_mostly    = 8;
2816
2817 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2818                                         void __user *buffer,
2819                                         size_t *lenp, loff_t *ppos)
2820 {
2821         struct net *net = (struct net *)__ctl->extra1;
2822
2823         if (write) {
2824                 rt_cache_flush(net);
2825                 fnhe_genid_bump(net);
2826                 return 0;
2827         }
2828
2829         return -EINVAL;
2830 }
2831
2832 static struct ctl_table ipv4_route_table[] = {
2833         {
2834                 .procname       = "gc_thresh",
2835                 .data           = &ipv4_dst_ops.gc_thresh,
2836                 .maxlen         = sizeof(int),
2837                 .mode           = 0644,
2838                 .proc_handler   = proc_dointvec,
2839         },
2840         {
2841                 .procname       = "max_size",
2842                 .data           = &ip_rt_max_size,
2843                 .maxlen         = sizeof(int),
2844                 .mode           = 0644,
2845                 .proc_handler   = proc_dointvec,
2846         },
2847         {
2848                 /*  Deprecated. Use gc_min_interval_ms */
2849
2850                 .procname       = "gc_min_interval",
2851                 .data           = &ip_rt_gc_min_interval,
2852                 .maxlen         = sizeof(int),
2853                 .mode           = 0644,
2854                 .proc_handler   = proc_dointvec_jiffies,
2855         },
2856         {
2857                 .procname       = "gc_min_interval_ms",
2858                 .data           = &ip_rt_gc_min_interval,
2859                 .maxlen         = sizeof(int),
2860                 .mode           = 0644,
2861                 .proc_handler   = proc_dointvec_ms_jiffies,
2862         },
2863         {
2864                 .procname       = "gc_timeout",
2865                 .data           = &ip_rt_gc_timeout,
2866                 .maxlen         = sizeof(int),
2867                 .mode           = 0644,
2868                 .proc_handler   = proc_dointvec_jiffies,
2869         },
2870         {
2871                 .procname       = "gc_interval",
2872                 .data           = &ip_rt_gc_interval,
2873                 .maxlen         = sizeof(int),
2874                 .mode           = 0644,
2875                 .proc_handler   = proc_dointvec_jiffies,
2876         },
2877         {
2878                 .procname       = "redirect_load",
2879                 .data           = &ip_rt_redirect_load,
2880                 .maxlen         = sizeof(int),
2881                 .mode           = 0644,
2882                 .proc_handler   = proc_dointvec,
2883         },
2884         {
2885                 .procname       = "redirect_number",
2886                 .data           = &ip_rt_redirect_number,
2887                 .maxlen         = sizeof(int),
2888                 .mode           = 0644,
2889                 .proc_handler   = proc_dointvec,
2890         },
2891         {
2892                 .procname       = "redirect_silence",
2893                 .data           = &ip_rt_redirect_silence,
2894                 .maxlen         = sizeof(int),
2895                 .mode           = 0644,
2896                 .proc_handler   = proc_dointvec,
2897         },
2898         {
2899                 .procname       = "error_cost",
2900                 .data           = &ip_rt_error_cost,
2901                 .maxlen         = sizeof(int),
2902                 .mode           = 0644,
2903                 .proc_handler   = proc_dointvec,
2904         },
2905         {
2906                 .procname       = "error_burst",
2907                 .data           = &ip_rt_error_burst,
2908                 .maxlen         = sizeof(int),
2909                 .mode           = 0644,
2910                 .proc_handler   = proc_dointvec,
2911         },
2912         {
2913                 .procname       = "gc_elasticity",
2914                 .data           = &ip_rt_gc_elasticity,
2915                 .maxlen         = sizeof(int),
2916                 .mode           = 0644,
2917                 .proc_handler   = proc_dointvec,
2918         },
2919         {
2920                 .procname       = "mtu_expires",
2921                 .data           = &ip_rt_mtu_expires,
2922                 .maxlen         = sizeof(int),
2923                 .mode           = 0644,
2924                 .proc_handler   = proc_dointvec_jiffies,
2925         },
2926         {
2927                 .procname       = "min_pmtu",
2928                 .data           = &ip_rt_min_pmtu,
2929                 .maxlen         = sizeof(int),
2930                 .mode           = 0644,
2931                 .proc_handler   = proc_dointvec,
2932         },
2933         {
2934                 .procname       = "min_adv_mss",
2935                 .data           = &ip_rt_min_advmss,
2936                 .maxlen         = sizeof(int),
2937                 .mode           = 0644,
2938                 .proc_handler   = proc_dointvec,
2939         },
2940         { }
2941 };
2942
2943 static struct ctl_table ipv4_route_flush_table[] = {
2944         {
2945                 .procname       = "flush",
2946                 .maxlen         = sizeof(int),
2947                 .mode           = 0200,
2948                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2949         },
2950         { },
2951 };
2952
2953 static __net_init int sysctl_route_net_init(struct net *net)
2954 {
2955         struct ctl_table *tbl;
2956
2957         tbl = ipv4_route_flush_table;
2958         if (!net_eq(net, &init_net)) {
2959                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2960                 if (!tbl)
2961                         goto err_dup;
2962
2963                 /* Don't export sysctls to unprivileged users */
2964                 if (net->user_ns != &init_user_ns)
2965                         tbl[0].procname = NULL;
2966         }
2967         tbl[0].extra1 = net;
2968
2969         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2970         if (!net->ipv4.route_hdr)
2971                 goto err_reg;
2972         return 0;
2973
2974 err_reg:
2975         if (tbl != ipv4_route_flush_table)
2976                 kfree(tbl);
2977 err_dup:
2978         return -ENOMEM;
2979 }
2980
2981 static __net_exit void sysctl_route_net_exit(struct net *net)
2982 {
2983         struct ctl_table *tbl;
2984
2985         tbl = net->ipv4.route_hdr->ctl_table_arg;
2986         unregister_net_sysctl_table(net->ipv4.route_hdr);
2987         BUG_ON(tbl == ipv4_route_flush_table);
2988         kfree(tbl);
2989 }
2990
2991 static __net_initdata struct pernet_operations sysctl_route_ops = {
2992         .init = sysctl_route_net_init,
2993         .exit = sysctl_route_net_exit,
2994 };
2995 #endif
2996
2997 static __net_init int rt_genid_init(struct net *net)
2998 {
2999         atomic_set(&net->ipv4.rt_genid, 0);
3000         atomic_set(&net->fnhe_genid, 0);
3001         atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3002         return 0;
3003 }
3004
3005 static __net_initdata struct pernet_operations rt_genid_ops = {
3006         .init = rt_genid_init,
3007 };
3008
3009 static int __net_init ipv4_inetpeer_init(struct net *net)
3010 {
3011         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3012
3013         if (!bp)
3014                 return -ENOMEM;
3015         inet_peer_base_init(bp);
3016         net->ipv4.peers = bp;
3017         return 0;
3018 }
3019
3020 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3021 {
3022         struct inet_peer_base *bp = net->ipv4.peers;
3023
3024         net->ipv4.peers = NULL;
3025         inetpeer_invalidate_tree(bp);
3026         kfree(bp);
3027 }
3028
3029 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3030         .init   =       ipv4_inetpeer_init,
3031         .exit   =       ipv4_inetpeer_exit,
3032 };
3033
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-CPU route accounting area; ip_rt_init() allocates room for 256
 * struct ip_rt_acct entries per CPU.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3037
3038 int __init ip_rt_init(void)
3039 {
3040         int rc = 0;
3041         int cpu;
3042
3043         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3044         if (!ip_idents)
3045                 panic("IP: failed to allocate ip_idents\n");
3046
3047         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3048
3049         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3050         if (!ip_tstamps)
3051                 panic("IP: failed to allocate ip_tstamps\n");
3052
3053         for_each_possible_cpu(cpu) {
3054                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3055
3056                 INIT_LIST_HEAD(&ul->head);
3057                 spin_lock_init(&ul->lock);
3058         }
3059 #ifdef CONFIG_IP_ROUTE_CLASSID
3060         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3061         if (!ip_rt_acct)
3062                 panic("IP: failed to allocate ip_rt_acct\n");
3063 #endif
3064
3065         ipv4_dst_ops.kmem_cachep =
3066                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3067                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3068
3069         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3070
3071         if (dst_entries_init(&ipv4_dst_ops) < 0)
3072                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3073
3074         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3075                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3076
3077         ipv4_dst_ops.gc_thresh = ~0;
3078         ip_rt_max_size = INT_MAX;
3079
3080         devinet_init();
3081         ip_fib_init();
3082
3083         if (ip_rt_proc_init())
3084                 pr_err("Unable to create route proc files\n");
3085 #ifdef CONFIG_XFRM
3086         xfrm_init();
3087         xfrm4_init();
3088 #endif
3089         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3090
3091 #ifdef CONFIG_SYSCTL
3092         register_pernet_subsys(&sysctl_route_ops);
3093 #endif
3094         register_pernet_subsys(&rt_genid_ops);
3095         register_pernet_subsys(&ipv4_inetpeer_ops);
3096         return rc;
3097 }
3098
3099 #ifdef CONFIG_SYSCTL
3100 /*
3101  * We really need to sanitize the damn ipv4 init order, then all
3102  * this nonsense will go away.
3103  */
3104 void __init ip_static_sysctl_init(void)
3105 {
3106         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3107 }
3108 #endif