net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/skbuff.h>
  83 #include <linux/inetdevice.h>
  84 #include <linux/igmp.h>
  85 #include <linux/pkt_sched.h>
  86 #include <linux/mroute.h>
  87 #include <linux/netfilter_ipv4.h>
  88 #include <linux/random.h>
  89 #include <linux/rcupdate.h>
  90 #include <linux/times.h>
  91 #include <linux/slab.h>
  92 #include <linux/jhash.h>
  93 #include <net/dst.h>
  94 #include <net/dst_metadata.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/lwtunnel.h>
 107 #include <net/netevent.h>
 108 #include <net/rtnetlink.h>
 109 #ifdef CONFIG_SYSCTL
 110 #include <linux/sysctl.h>
 111 #include <linux/kmemleak.h>
 112 #endif
 113 #include <net/secure_seq.h>
 114 #include <net/ip_tunnels.h>
 115 #include <net/l3mdev.h>
 116
 117 #include "fib_lookup.h"
 118
 119 #define RT_FL_TOS(oldflp4) \
 120         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 121
 122 #define RT_GC_TIMEOUT (300*HZ)
 123
 124 static int ip_rt_max_size;
 125 static int ip_rt_redirect_number __read_mostly  = 9;
 126 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 128 static int ip_rt_error_cost __read_mostly       = HZ;
 129 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 130 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 131 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 132 static int ip_rt_min_advmss __read_mostly       = 256;
 133
 134 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 135 /*
 136  *      Interface to generic destination cache.
 137  */
 138
 139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 140 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 141 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 142 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 143 static void              ipv4_link_failure(struct sk_buff *skb);
 144 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 145                                            struct sk_buff *skb, u32 mtu);
 146 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 147                                         struct sk_buff *skb);
 148 static void             ipv4_dst_destroy(struct dst_entry *dst);
 149
 150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 151 {
 152         WARN_ON(1);
 153         return NULL;
 154 }
 155
 156 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 157                                            struct sk_buff *skb,
 158                                            const void *daddr);
 159 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 160
 161 static struct dst_ops ipv4_dst_ops = {
 162         .family =               AF_INET,
 163         .check =                ipv4_dst_check,
 164         .default_advmss =       ipv4_default_advmss,
 165         .mtu =                  ipv4_mtu,
 166         .cow_metrics =          ipv4_cow_metrics,
 167         .destroy =              ipv4_dst_destroy,
 168         .negative_advice =      ipv4_negative_advice,
 169         .link_failure =         ipv4_link_failure,
 170         .update_pmtu =          ip_rt_update_pmtu,
 171         .redirect =             ip_do_redirect,
 172         .local_out =            __ip_local_out,
 173         .neigh_lookup =         ipv4_neigh_lookup,
 174         .confirm_neigh =        ipv4_confirm_neigh,
 175 };
 176
 177 #define ECN_OR_COST(class)      TC_PRIO_##class
 178
 179 const __u8 ip_tos2prio[16] = {
 180         TC_PRIO_BESTEFFORT,
 181         ECN_OR_COST(BESTEFFORT),
 182         TC_PRIO_BESTEFFORT,
 183         ECN_OR_COST(BESTEFFORT),
 184         TC_PRIO_BULK,
 185         ECN_OR_COST(BULK),
 186         TC_PRIO_BULK,
 187         ECN_OR_COST(BULK),
 188         TC_PRIO_INTERACTIVE,
 189         ECN_OR_COST(INTERACTIVE),
 190         TC_PRIO_INTERACTIVE,
 191         ECN_OR_COST(INTERACTIVE),
 192         TC_PRIO_INTERACTIVE_BULK,
 193         ECN_OR_COST(INTERACTIVE_BULK),
 194         TC_PRIO_INTERACTIVE_BULK,
 195         ECN_OR_COST(INTERACTIVE_BULK)
 196 };
 197 EXPORT_SYMBOL(ip_tos2prio);
 198
 199 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 200 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 201
 202 #ifdef CONFIG_PROC_FS
 203 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 204 {
 205         if (*pos)
 206                 return NULL;
 207         return SEQ_START_TOKEN;
 208 }
 209
 210 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 211 {
 212         ++*pos;
 213         return NULL;
 214 }
 215
 216 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 217 {
 218 }
 219
 220 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 221 {
 222         if (v == SEQ_START_TOKEN)
 223                 seq_printf(seq, "%-127s\n",
 224                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 225                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 226                            "HHUptod\tSpecDst");
 227         return 0;
 228 }
 229
 230 static const struct seq_operations rt_cache_seq_ops = {
 231         .start  = rt_cache_seq_start,
 232         .next   = rt_cache_seq_next,
 233         .stop   = rt_cache_seq_stop,
 234         .show   = rt_cache_seq_show,
 235 };
 236
 237 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 238 {
 239         return seq_open(file, &rt_cache_seq_ops);
 240 }
 241
 242 static const struct file_operations rt_cache_seq_fops = {
 243         .owner   = THIS_MODULE,
 244         .open    = rt_cache_seq_open,
 245         .read    = seq_read,
 246         .llseek  = seq_lseek,
 247         .release = seq_release,
 248 };
 249
 250
 251 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 252 {
 253         int cpu;
 254
 255         if (*pos == 0)
 256                 return SEQ_START_TOKEN;
 257
 258         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 259                 if (!cpu_possible(cpu))
 260                         continue;
 261                 *pos = cpu+1;
 262                 return &per_cpu(rt_cache_stat, cpu);
 263         }
 264         return NULL;
 265 }
 266
 267 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 268 {
 269         int cpu;
 270
 271         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 272                 if (!cpu_possible(cpu))
 273                         continue;
 274                 *pos = cpu+1;
 275                 return &per_cpu(rt_cache_stat, cpu);
 276         }
 277         return NULL;
 278
 279 }
 280
 281 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 282 {
 283
 284 }
 285
 286 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 287 {
 288         struct rt_cache_stat *st = v;
 289
 290         if (v == SEQ_START_TOKEN) {
 291                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 292                 return 0;
 293         }
 294
 295         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 296                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 297                    dst_entries_get_slow(&ipv4_dst_ops),
 298                    0, /* st->in_hit */
 299                    st->in_slow_tot,
 300                    st->in_slow_mc,
 301                    st->in_no_route,
 302                    st->in_brd,
 303                    st->in_martian_dst,
 304                    st->in_martian_src,
 305
 306                    0, /* st->out_hit */
 307                    st->out_slow_tot,
 308                    st->out_slow_mc,
 309
 310                    0, /* st->gc_total */
 311                    0, /* st->gc_ignored */
 312                    0, /* st->gc_goal_miss */
 313                    0, /* st->gc_dst_overflow */
 314                    0, /* st->in_hlist_search */
 315                    0  /* st->out_hlist_search */
 316                 );
 317         return 0;
 318 }
 319
 320 static const struct seq_operations rt_cpu_seq_ops = {
 321         .start  = rt_cpu_seq_start,
 322         .next   = rt_cpu_seq_next,
 323         .stop   = rt_cpu_seq_stop,
 324         .show   = rt_cpu_seq_show,
 325 };
 326
 327
 328 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 329 {
 330         return seq_open(file, &rt_cpu_seq_ops);
 331 }
 332
 333 static const struct file_operations rt_cpu_seq_fops = {
 334         .owner   = THIS_MODULE,
 335         .open    = rt_cpu_seq_open,
 336         .read    = seq_read,
 337         .llseek  = seq_lseek,
 338         .release = seq_release,
 339 };
 340
 341 #ifdef CONFIG_IP_ROUTE_CLASSID
 342 static int rt_acct_proc_show(struct seq_file *m, void *v)
 343 {
 344         struct ip_rt_acct *dst, *src;
 345         unsigned int i, j;
 346
 347         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 348         if (!dst)
 349                 return -ENOMEM;
 350
 351         for_each_possible_cpu(i) {
 352                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 353                 for (j = 0; j < 256; j++) {
 354                         dst[j].o_bytes   += src[j].o_bytes;
 355                         dst[j].o_packets += src[j].o_packets;
 356                         dst[j].i_bytes   += src[j].i_bytes;
 357                         dst[j].i_packets += src[j].i_packets;
 358                 }
 359         }
 360
 361         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 362         kfree(dst);
 363         return 0;
 364 }
 365
 366 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 367 {
 368         return single_open(file, rt_acct_proc_show, NULL);
 369 }
 370
 371 static const struct file_operations rt_acct_proc_fops = {
 372         .owner          = THIS_MODULE,
 373         .open           = rt_acct_proc_open,
 374         .read           = seq_read,
 375         .llseek         = seq_lseek,
 376         .release        = single_release,
 377 };
 378 #endif
 379
 380 static int __net_init ip_rt_do_proc_init(struct net *net)
 381 {
 382         struct proc_dir_entry *pde;
 383
 384         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 385                           &rt_cache_seq_fops);
 386         if (!pde)
 387                 goto err1;
 388
 389         pde = proc_create("rt_cache", S_IRUGO,
 390                           net->proc_net_stat, &rt_cpu_seq_fops);
 391         if (!pde)
 392                 goto err2;
 393
 394 #ifdef CONFIG_IP_ROUTE_CLASSID
 395         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 396         if (!pde)
 397                 goto err3;
 398 #endif
 399         return 0;
 400
 401 #ifdef CONFIG_IP_ROUTE_CLASSID
 402 err3:
 403         remove_proc_entry("rt_cache", net->proc_net_stat);
 404 #endif
 405 err2:
 406         remove_proc_entry("rt_cache", net->proc_net);
 407 err1:
 408         return -ENOMEM;
 409 }
 410
 411 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 412 {
 413         remove_proc_entry("rt_cache", net->proc_net_stat);
 414         remove_proc_entry("rt_cache", net->proc_net);
 415 #ifdef CONFIG_IP_ROUTE_CLASSID
 416         remove_proc_entry("rt_acct", net->proc_net);
 417 #endif
 418 }
 419
 420 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 421         .init = ip_rt_do_proc_init,
 422         .exit = ip_rt_do_proc_exit,
 423 };
 424
 425 static int __init ip_rt_proc_init(void)
 426 {
 427         return register_pernet_subsys(&ip_rt_proc_ops);
 428 }
 429
 430 #else
 431 static inline int ip_rt_proc_init(void)
 432 {
 433         return 0;
 434 }
 435 #endif /* CONFIG_PROC_FS */
 436
 437 static inline bool rt_is_expired(const struct rtable *rth)
 438 {
 439         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 440 }
 441
 442 void rt_cache_flush(struct net *net)
 443 {
 444         rt_genid_bump_ipv4(net);
 445 }
 446
 447 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 448                                            struct sk_buff *skb,
 449                                            const void *daddr)
 450 {
 451         struct net_device *dev = dst->dev;
 452         const __be32 *pkey = daddr;
 453         const struct rtable *rt;
 454         struct neighbour *n;
 455
 456         rt = (const struct rtable *) dst;
 457         if (rt->rt_gateway)
 458                 pkey = (const __be32 *) &rt->rt_gateway;
 459         else if (skb)
 460                 pkey = &ip_hdr(skb)->daddr;
 461
 462         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 463         if (n)
 464                 return n;
 465         return neigh_create(&arp_tbl, pkey, dev);
 466 }
 467
 468 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 469 {
 470         struct net_device *dev = dst->dev;
 471         const __be32 *pkey = daddr;
 472         const struct rtable *rt;
 473
 474         rt = (const struct rtable *)dst;
 475         if (rt->rt_gateway)
 476                 pkey = (const __be32 *)&rt->rt_gateway;
 477         else if (!daddr ||
 478                  (rt->rt_flags &
 479                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
 480                 return;
 481
 482         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 483 }
 484
 485 #define IP_IDENTS_SZ 2048u
 486
 487 static atomic_t *ip_idents __read_mostly;
 488 static u32 *ip_tstamps __read_mostly;
 489
 490 /* In order to protect privacy, we add a perturbation to identifiers
 491  * if one generator is seldom used. This makes hard for an attacker
 492  * to infer how many packets were sent between two points in time.
 493  */
 494 u32 ip_idents_reserve(u32 hash, int segs)
 495 {
 496         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 497         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 498         u32 old = ACCESS_ONCE(*p_tstamp);
 499         u32 now = (u32)jiffies;
 500         u32 new, delta = 0;
 501
 502         if (old != now && cmpxchg(p_tstamp, old, now) == old)
 503                 delta = prandom_u32_max(now - old);
 504
 505         /* Do not use atomic_add_return() as it makes UBSAN unhappy */
 506         do {
 507                 old = (u32)atomic_read(p_id);
 508                 new = old + delta + segs;
 509         } while (atomic_cmpxchg(p_id, old, new) != old);
 510
 511         return new - segs;
 512 }
 513 EXPORT_SYMBOL(ip_idents_reserve);
 514
 515 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 516 {
 517         static u32 ip_idents_hashrnd __read_mostly;
 518         u32 hash, id;
 519
 520         net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
 521
 522         hash = jhash_3words((__force u32)iph->daddr,
 523                             (__force u32)iph->saddr,
 524                             iph->protocol ^ net_hash_mix(net),
 525                             ip_idents_hashrnd);
 526         id = ip_idents_reserve(hash, segs);
 527         iph->id = htons(id);
 528 }
 529 EXPORT_SYMBOL(__ip_select_ident);
 530
 531 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 532                              const struct sock *sk,
 533                              const struct iphdr *iph,
 534                              int oif, u8 tos,
 535                              u8 prot, u32 mark, int flow_flags)
 536 {
 537         if (sk) {
 538                 const struct inet_sock *inet = inet_sk(sk);
 539
 540                 oif = sk->sk_bound_dev_if;
 541                 mark = sk->sk_mark;
 542                 tos = RT_CONN_FLAGS(sk);
 543                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 544         }
 545         flowi4_init_output(fl4, oif, mark, tos,
 546                            RT_SCOPE_UNIVERSE, prot,
 547                            flow_flags,
 548                            iph->daddr, iph->saddr, 0, 0,
 549                            sock_net_uid(net, sk));
 550 }
 551
 552 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 553                                const struct sock *sk)
 554 {
 555         const struct net *net = dev_net(skb->dev);
 556         const struct iphdr *iph = ip_hdr(skb);
 557         int oif = skb->dev->ifindex;
 558         u8 tos = RT_TOS(iph->tos);
 559         u8 prot = iph->protocol;
 560         u32 mark = skb->mark;
 561
 562         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 563 }
 564
 565 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 566 {
 567         const struct inet_sock *inet = inet_sk(sk);
 568         const struct ip_options_rcu *inet_opt;
 569         __be32 daddr = inet->inet_daddr;
 570
 571         rcu_read_lock();
 572         inet_opt = rcu_dereference(inet->inet_opt);
 573         if (inet_opt && inet_opt->opt.srr)
 574                 daddr = inet_opt->opt.faddr;
 575         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 576                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 577                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 578                            inet_sk_flowi_flags(sk),
 579                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 580         rcu_read_unlock();
 581 }
 582
 583 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 584                                  const struct sk_buff *skb)
 585 {
 586         if (skb)
 587                 build_skb_flow_key(fl4, skb, sk);
 588         else
 589                 build_sk_flow_key(fl4, sk);
 590 }
 591
 592 static DEFINE_SPINLOCK(fnhe_lock);
 593
 594 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 595 {
 596         struct rtable *rt;
 597
 598         rt = rcu_dereference(fnhe->fnhe_rth_input);
 599         if (rt) {
 600                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 601                 dst_dev_put(&rt->dst);
 602                 dst_release(&rt->dst);
 603         }
 604         rt = rcu_dereference(fnhe->fnhe_rth_output);
 605         if (rt) {
 606                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 607                 dst_dev_put(&rt->dst);
 608                 dst_release(&rt->dst);
 609         }
 610 }
 611
 612 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 613 {
 614         struct fib_nh_exception *fnhe, *oldest;
 615
 616         oldest = rcu_dereference(hash->chain);
 617         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 618              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 619                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 620                         oldest = fnhe;
 621         }
 622         fnhe_flush_routes(oldest);
 623         return oldest;
 624 }
 625
 626 static inline u32 fnhe_hashfun(__be32 daddr)
 627 {
 628         static u32 fnhe_hashrnd __read_mostly;
 629         u32 hval;
 630
 631         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 632         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 633         return hash_32(hval, FNHE_HASH_SHIFT);
 634 }
 635
 636 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 637 {
 638         rt->rt_pmtu = fnhe->fnhe_pmtu;
 639         rt->dst.expires = fnhe->fnhe_expires;
 640
 641         if (fnhe->fnhe_gw) {
 642                 rt->rt_flags |= RTCF_REDIRECTED;
 643                 rt->rt_gateway = fnhe->fnhe_gw;
 644                 rt->rt_uses_gateway = 1;
 645         }
 646 }
 647
 648 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 649                                   u32 pmtu, unsigned long expires)
 650 {
 651         struct fnhe_hash_bucket *hash;
 652         struct fib_nh_exception *fnhe;
 653         struct rtable *rt;
 654         unsigned int i;
 655         int depth;
 656         u32 hval = fnhe_hashfun(daddr);
 657
 658         spin_lock_bh(&fnhe_lock);
 659
 660         hash = rcu_dereference(nh->nh_exceptions);
 661         if (!hash) {
 662                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 663                 if (!hash)
 664                         goto out_unlock;
 665                 rcu_assign_pointer(nh->nh_exceptions, hash);
 666         }
 667
 668         hash += hval;
 669
 670         depth = 0;
 671         for (fnhe = rcu_dereference(hash->chain); fnhe;
 672              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 673                 if (fnhe->fnhe_daddr == daddr)
 674                         break;
 675                 depth++;
 676         }
 677
 678         if (fnhe) {
 679                 if (gw)
 680                         fnhe->fnhe_gw = gw;
 681                 if (pmtu) {
 682                         fnhe->fnhe_pmtu = pmtu;
 683                         fnhe->fnhe_expires = max(1UL, expires);
 684                 }
 685                 /* Update all cached dsts too */
 686                 rt = rcu_dereference(fnhe->fnhe_rth_input);
 687                 if (rt)
 688                         fill_route_from_fnhe(rt, fnhe);
 689                 rt = rcu_dereference(fnhe->fnhe_rth_output);
 690                 if (rt)
 691                         fill_route_from_fnhe(rt, fnhe);
 692         } else {
 693                 if (depth > FNHE_RECLAIM_DEPTH)
 694                         fnhe = fnhe_oldest(hash);
 695                 else {
 696                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 697                         if (!fnhe)
 698                                 goto out_unlock;
 699
 700                         fnhe->fnhe_next = hash->chain;
 701                         rcu_assign_pointer(hash->chain, fnhe);
 702                 }
 703                 fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
 704                 fnhe->fnhe_daddr = daddr;
 705                 fnhe->fnhe_gw = gw;
 706                 fnhe->fnhe_pmtu = pmtu;
 707                 fnhe->fnhe_expires = expires;
 708
 709                 /* Exception created; mark the cached routes for the nexthop
 710                  * stale, so anyone caching it rechecks if this exception
 711                  * applies to them.
 712                  */
 713                 rt = rcu_dereference(nh->nh_rth_input);
 714                 if (rt)
 715                         rt->dst.obsolete = DST_OBSOLETE_KILL;
 716
 717                 for_each_possible_cpu(i) {
 718                         struct rtable __rcu **prt;
 719                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 720                         rt = rcu_dereference(*prt);
 721                         if (rt)
 722                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 723                 }
 724         }
 725
 726         fnhe->fnhe_stamp = jiffies;
 727
 728 out_unlock:
 729         spin_unlock_bh(&fnhe_lock);
 730 }
 731
 732 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 733                              bool kill_route)
 734 {
 735         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 736         __be32 old_gw = ip_hdr(skb)->saddr;
 737         struct net_device *dev = skb->dev;
 738         struct in_device *in_dev;
 739         struct fib_result res;
 740         struct neighbour *n;
 741         struct net *net;
 742
 743         switch (icmp_hdr(skb)->code & 7) {
 744         case ICMP_REDIR_NET:
 745         case ICMP_REDIR_NETTOS:
 746         case ICMP_REDIR_HOST:
 747         case ICMP_REDIR_HOSTTOS:
 748                 break;
 749
 750         default:
 751                 return;
 752         }
 753
 754         if (rt->rt_gateway != old_gw)
 755                 return;
 756
 757         in_dev = __in_dev_get_rcu(dev);
 758         if (!in_dev)
 759                 return;
 760
 761         net = dev_net(dev);
 762         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 763             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 764             ipv4_is_zeronet(new_gw))
 765                 goto reject_redirect;
 766
 767         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 768                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 769                         goto reject_redirect;
 770                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 771                         goto reject_redirect;
 772         } else {
 773                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 774                         goto reject_redirect;
 775         }
 776
 777         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 778         if (!n)
 779                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 780         if (!IS_ERR(n)) {
 781                 if (!(n->nud_state & NUD_VALID)) {
 782                         neigh_event_send(n, NULL);
 783                 } else {
 784                         if (fib_lookup(net, fl4, &res, 0) == 0) {
 785                                 struct fib_nh *nh = &FIB_RES_NH(res);
 786
 787                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 788                                                 0, jiffies + ip_rt_gc_timeout);
 789                         }
 790                         if (kill_route)
 791                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 792                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 793                 }
 794                 neigh_release(n);
 795         }
 796         return;
 797
 798 reject_redirect:
 799 #ifdef CONFIG_IP_ROUTE_VERBOSE
 800         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 801                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 802                 __be32 daddr = iph->daddr;
 803                 __be32 saddr = iph->saddr;
 804
 805                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 806                                      "  Advised path = %pI4 -> %pI4\n",
 807                                      &old_gw, dev->name, &new_gw,
 808                                      &saddr, &daddr);
 809         }
 810 #endif
 811         ;
 812 }
 813
 814 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 815 {
 816         struct rtable *rt;
 817         struct flowi4 fl4;
 818         const struct iphdr *iph = (const struct iphdr *) skb->data;
 819         struct net *net = dev_net(skb->dev);
 820         int oif = skb->dev->ifindex;
 821         u8 tos = RT_TOS(iph->tos);
 822         u8 prot = iph->protocol;
 823         u32 mark = skb->mark;
 824
 825         rt = (struct rtable *) dst;
 826
 827         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 828         __ip_do_redirect(rt, skb, &fl4, true);
 829 }
 830
 831 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 832 {
 833         struct rtable *rt = (struct rtable *)dst;
 834         struct dst_entry *ret = dst;
 835
 836         if (rt) {
 837                 if (dst->obsolete > 0) {
 838                         ip_rt_put(rt);
 839                         ret = NULL;
 840                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 841                            rt->dst.expires) {
 842                         ip_rt_put(rt);
 843                         ret = NULL;
 844                 }
 845         }
 846         return ret;
 847 }
 848
 849 /*
 850  * Algorithm:
 851  *      1. The first ip_rt_redirect_number redirects are sent
 852  *         with exponential backoff, then we stop sending them at all,
 853  *         assuming that the host ignores our redirects.
 854  *      2. If we did not see packets requiring redirects
 855  *         during ip_rt_redirect_silence, we assume that the host
 856  *         forgot redirected route and start to send redirects again.
 857  *
 858  * This algorithm is much cheaper and more intelligent than dumb load limiting
 859  * in icmp.c.
 860  *
 861  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 862  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 863  */
 864
 865 void ip_rt_send_redirect(struct sk_buff *skb)
 866 {
 867         struct rtable *rt = skb_rtable(skb);
 868         struct in_device *in_dev;
 869         struct inet_peer *peer;
 870         struct net *net;
 871         int log_martians;
 872         int vif;
 873
 874         rcu_read_lock();
 875         in_dev = __in_dev_get_rcu(rt->dst.dev);
 876         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 877                 rcu_read_unlock();
 878                 return;
 879         }
 880         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 881         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 882         rcu_read_unlock();
 883
 884         net = dev_net(rt->dst.dev);
 885         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 886         if (!peer) {
 887                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 888                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 889                 return;
 890         }
 891
 892         /* No redirected packets during ip_rt_redirect_silence;
 893          * reset the algorithm.
 894          */
 895         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 896                 peer->rate_tokens = 0;
 897
 898         /* Too many ignored redirects; do not send anything
 899          * set dst.rate_last to the last seen redirected packet.
 900          */
 901         if (peer->rate_tokens >= ip_rt_redirect_number) {
 902                 peer->rate_last = jiffies;
 903                 goto out_put_peer;
 904         }
 905
 906         /* Check for load limit; set rate_last to the latest sent
 907          * redirect.
 908          */
 909         if (peer->rate_tokens == 0 ||
 910             time_after(jiffies,
 911                        (peer->rate_last +
 912                         (ip_rt_redirect_load << peer->rate_tokens)))) {
 913                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 914
 915                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 916                 peer->rate_last = jiffies;
 917                 ++peer->rate_tokens;
 918 #ifdef CONFIG_IP_ROUTE_VERBOSE
 919                 if (log_martians &&
 920                     peer->rate_tokens == ip_rt_redirect_number)
 921                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 922                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 923                                              &ip_hdr(skb)->daddr, &gw);
 924 #endif
 925         }
 926 out_put_peer:
 927         inet_putpeer(peer);
 928 }
 929
 930 static int ip_error(struct sk_buff *skb)
 931 {
 932         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 933         struct rtable *rt = skb_rtable(skb);
 934         struct inet_peer *peer;
 935         unsigned long now;
 936         struct net *net;
 937         bool send;
 938         int code;
 939
 940         /* IP on this device is disabled. */
 941         if (!in_dev)
 942                 goto out;
 943
 944         net = dev_net(rt->dst.dev);
 945         if (!IN_DEV_FORWARD(in_dev)) {
 946                 switch (rt->dst.error) {
 947                 case EHOSTUNREACH:
 948                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 949                         break;
 950
 951                 case ENETUNREACH:
 952                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 953                         break;
 954                 }
 955                 goto out;
 956         }
 957
 958         switch (rt->dst.error) {
 959         case EINVAL:
 960         default:
 961                 goto out;
 962         case EHOSTUNREACH:
 963                 code = ICMP_HOST_UNREACH;
 964                 break;
 965         case ENETUNREACH:
 966                 code = ICMP_NET_UNREACH;
 967                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 968                 break;
 969         case EACCES:
 970                 code = ICMP_PKT_FILTERED;
 971                 break;
 972         }
 973
 974         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 975                                l3mdev_master_ifindex(skb->dev), 1);
 976
 977         send = true;
 978         if (peer) {
 979                 now = jiffies;
 980                 peer->rate_tokens += now - peer->rate_last;
 981                 if (peer->rate_tokens > ip_rt_error_burst)
 982                         peer->rate_tokens = ip_rt_error_burst;
 983                 peer->rate_last = now;
 984                 if (peer->rate_tokens >= ip_rt_error_cost)
 985                         peer->rate_tokens -= ip_rt_error_cost;
 986                 else
 987                         send = false;
 988                 inet_putpeer(peer);
 989         }
 990         if (send)
 991                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 992
 993 out:    kfree_skb(skb);
 994         return 0;
 995 }
 996
 997 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 998 {
 999         struct dst_entry *dst = &rt->dst;
1000         struct fib_result res;
1001
1002         if (dst_metric_locked(dst, RTAX_MTU))
1003                 return;
1004
1005         if (ipv4_mtu(dst) < mtu)
1006                 return;
1007
1008         if (mtu < ip_rt_min_pmtu)
1009                 mtu = ip_rt_min_pmtu;
1010
1011         if (rt->rt_pmtu == mtu &&
1012             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1013                 return;
1014
1015         rcu_read_lock();
1016         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1017                 struct fib_nh *nh = &FIB_RES_NH(res);
1018
1019                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
1020                                       jiffies + ip_rt_mtu_expires);
1021         }
1022         rcu_read_unlock();
1023 }
1024
1025 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1026                               struct sk_buff *skb, u32 mtu)
1027 {
1028         struct rtable *rt = (struct rtable *) dst;
1029         struct flowi4 fl4;
1030
1031         ip_rt_build_flow_key(&fl4, sk, skb);
1032         __ip_rt_update_pmtu(rt, &fl4, mtu);
1033 }
1034
1035 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1036                       int oif, u32 mark, u8 protocol, int flow_flags)
1037 {
1038         const struct iphdr *iph = (const struct iphdr *) skb->data;
1039         struct flowi4 fl4;
1040         struct rtable *rt;
1041
1042         if (!mark)
1043                 mark = IP4_REPLY_MARK(net, skb->mark);
1044
1045         __build_flow_key(net, &fl4, NULL, iph, oif,
1046                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1047         rt = __ip_route_output_key(net, &fl4);
1048         if (!IS_ERR(rt)) {
1049                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1050                 ip_rt_put(rt);
1051         }
1052 }
1053 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1054
1055 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1056 {
1057         const struct iphdr *iph = (const struct iphdr *) skb->data;
1058         struct flowi4 fl4;
1059         struct rtable *rt;
1060
1061         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1062
1063         if (!fl4.flowi4_mark)
1064                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1065
1066         rt = __ip_route_output_key(sock_net(sk), &fl4);
1067         if (!IS_ERR(rt)) {
1068                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1069                 ip_rt_put(rt);
1070         }
1071 }
1072
1073 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1074 {
1075         const struct iphdr *iph = (const struct iphdr *) skb->data;
1076         struct flowi4 fl4;
1077         struct rtable *rt;
1078         struct dst_entry *odst = NULL;
1079         bool new = false;
1080         struct net *net = sock_net(sk);
1081
1082         bh_lock_sock(sk);
1083
1084         if (!ip_sk_accept_pmtu(sk))
1085                 goto out;
1086
1087         odst = sk_dst_get(sk);
1088
1089         if (sock_owned_by_user(sk) || !odst) {
1090                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1091                 goto out;
1092         }
1093
1094         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1095
1096         rt = (struct rtable *)odst;
1097         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1098                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1099                 if (IS_ERR(rt))
1100                         goto out;
1101
1102                 new = true;
1103         }
1104
1105         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1106
1107         if (!dst_check(&rt->dst, 0)) {
1108                 if (new)
1109                         dst_release(&rt->dst);
1110
1111                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1112                 if (IS_ERR(rt))
1113                         goto out;
1114
1115                 new = true;
1116         }
1117
1118         if (new)
1119                 sk_dst_set(sk, &rt->dst);
1120
1121 out:
1122         bh_unlock_sock(sk);
1123         dst_release(odst);
1124 }
1125 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1126
1127 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1128                    int oif, u32 mark, u8 protocol, int flow_flags)
1129 {
1130         const struct iphdr *iph = (const struct iphdr *) skb->data;
1131         struct flowi4 fl4;
1132         struct rtable *rt;
1133
1134         __build_flow_key(net, &fl4, NULL, iph, oif,
1135                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1136         rt = __ip_route_output_key(net, &fl4);
1137         if (!IS_ERR(rt)) {
1138                 __ip_do_redirect(rt, skb, &fl4, false);
1139                 ip_rt_put(rt);
1140         }
1141 }
1142 EXPORT_SYMBOL_GPL(ipv4_redirect);
1143
1144 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1145 {
1146         const struct iphdr *iph = (const struct iphdr *) skb->data;
1147         struct flowi4 fl4;
1148         struct rtable *rt;
1149         struct net *net = sock_net(sk);
1150
1151         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1152         rt = __ip_route_output_key(net, &fl4);
1153         if (!IS_ERR(rt)) {
1154                 __ip_do_redirect(rt, skb, &fl4, false);
1155                 ip_rt_put(rt);
1156         }
1157 }
1158 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1159
1160 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1161 {
1162         struct rtable *rt = (struct rtable *) dst;
1163
1164         /* All IPV4 dsts are created with ->obsolete set to the value
1165          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1166          * into this function always.
1167          *
1168          * When a PMTU/redirect information update invalidates a route,
1169          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1170          * DST_OBSOLETE_DEAD by dst_free().
1171          */
1172         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1173                 return NULL;
1174         return dst;
1175 }
1176
1177 static void ipv4_link_failure(struct sk_buff *skb)
1178 {
1179         struct rtable *rt;
1180
1181         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1182
1183         rt = skb_rtable(skb);
1184         if (rt)
1185                 dst_set_expires(&rt->dst, 0);
1186 }
1187
1188 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1189 {
1190         pr_debug("%s: %pI4 -> %pI4, %s\n",
1191                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1192                  skb->dev ? skb->dev->name : "?");
1193         kfree_skb(skb);
1194         WARN_ON(1);
1195         return 0;
1196 }
1197
1198 /*
1199    We do not cache source address of outgoing interface,
1200    because it is used only by IP RR, TS and SRR options,
1201    so that it out of fast path.
1202
1203    BTW remember: "addr" is allowed to be not aligned
1204    in IP options!
1205  */
1206
1207 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1208 {
1209         __be32 src;
1210
1211         if (rt_is_output_route(rt))
1212                 src = ip_hdr(skb)->saddr;
1213         else {
1214                 struct fib_result res;
1215                 struct flowi4 fl4;
1216                 struct iphdr *iph;
1217
1218                 iph = ip_hdr(skb);
1219
1220                 memset(&fl4, 0, sizeof(fl4));
1221                 fl4.daddr = iph->daddr;
1222                 fl4.saddr = iph->saddr;
1223                 fl4.flowi4_tos = RT_TOS(iph->tos);
1224                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1225                 fl4.flowi4_iif = skb->dev->ifindex;
1226                 fl4.flowi4_mark = skb->mark;
1227
1228                 rcu_read_lock();
1229                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1230                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1231                 else
1232                         src = inet_select_addr(rt->dst.dev,
1233                                                rt_nexthop(rt, iph->daddr),
1234                                                RT_SCOPE_UNIVERSE);
1235                 rcu_read_unlock();
1236         }
1237         memcpy(addr, &src, 4);
1238 }
1239
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid.  The low and high 16-bit halves
 * are handled independently: a half is taken from @tag only when it is
 * still zero in the route.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        static const u32 halves[] = { 0xFFFF, 0xFFFF0000 };
        unsigned int i;

        for (i = 0; i < ARRAY_SIZE(halves); i++) {
                if (!(rt->dst.tclassid & halves[i]))
                        rt->dst.tclassid |= tag & halves[i];
        }
}
#endif
1249
1250 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1251 {
1252         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1253         unsigned int advmss = max_t(unsigned int, dst->dev->mtu - header_size,
1254                                     ip_rt_min_advmss);
1255
1256         return min(advmss, IPV4_MAX_PMTU - header_size);
1257 }
1258
1259 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1260 {
1261         const struct rtable *rt = (const struct rtable *) dst;
1262         unsigned int mtu = rt->rt_pmtu;
1263
1264         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1265                 mtu = dst_metric_raw(dst, RTAX_MTU);
1266
1267         if (mtu)
1268                 return mtu;
1269
1270         mtu = READ_ONCE(dst->dev->mtu);
1271
1272         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1273                 if (rt->rt_uses_gateway && mtu > 576)
1274                         mtu = 576;
1275         }
1276
1277         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1278
1279         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1280 }
1281
1282 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1283 {
1284         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1285         struct fib_nh_exception *fnhe;
1286         u32 hval;
1287
1288         if (!hash)
1289                 return NULL;
1290
1291         hval = fnhe_hashfun(daddr);
1292
1293         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1294              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1295                 if (fnhe->fnhe_daddr == daddr)
1296                         return fnhe;
1297         }
1298         return NULL;
1299 }
1300
1301 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1302                               __be32 daddr, const bool do_cache)
1303 {
1304         bool ret = false;
1305
1306         spin_lock_bh(&fnhe_lock);
1307
1308         if (daddr == fnhe->fnhe_daddr) {
1309                 struct rtable __rcu **porig;
1310                 struct rtable *orig;
1311                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1312
1313                 if (rt_is_input_route(rt))
1314                         porig = &fnhe->fnhe_rth_input;
1315                 else
1316                         porig = &fnhe->fnhe_rth_output;
1317                 orig = rcu_dereference(*porig);
1318
1319                 if (fnhe->fnhe_genid != genid) {
1320                         fnhe->fnhe_genid = genid;
1321                         fnhe->fnhe_gw = 0;
1322                         fnhe->fnhe_pmtu = 0;
1323                         fnhe->fnhe_expires = 0;
1324                         fnhe_flush_routes(fnhe);
1325                         orig = NULL;
1326                 }
1327                 fill_route_from_fnhe(rt, fnhe);
1328                 if (!rt->rt_gateway)
1329                         rt->rt_gateway = daddr;
1330
1331                 if (do_cache) {
1332                         dst_hold(&rt->dst);
1333                         rcu_assign_pointer(*porig, rt);
1334                         if (orig) {
1335                                 dst_dev_put(&orig->dst);
1336                                 dst_release(&orig->dst);
1337                         }
1338                         ret = true;
1339                 }
1340
1341                 fnhe->fnhe_stamp = jiffies;
1342         }
1343         spin_unlock_bh(&fnhe_lock);
1344
1345         return ret;
1346 }
1347
1348 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1349 {
1350         struct rtable *orig, *prev, **p;
1351         bool ret = true;
1352
1353         if (rt_is_input_route(rt)) {
1354                 p = (struct rtable **)&nh->nh_rth_input;
1355         } else {
1356                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1357         }
1358         orig = *p;
1359
1360         /* hold dst before doing cmpxchg() to avoid race condition
1361          * on this dst
1362          */
1363         dst_hold(&rt->dst);
1364         prev = cmpxchg(p, orig, rt);
1365         if (prev == orig) {
1366                 if (orig) {
1367                         dst_dev_put(&orig->dst);
1368                         dst_release(&orig->dst);
1369                 }
1370         } else {
1371                 dst_release(&rt->dst);
1372                 ret = false;
1373         }
1374
1375         return ret;
1376 }
1377
1378 struct uncached_list {
1379         spinlock_t              lock;
1380         struct list_head        head;
1381 };
1382
1383 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1384
1385 static void rt_add_uncached_list(struct rtable *rt)
1386 {
1387         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1388
1389         rt->rt_uncached_list = ul;
1390
1391         spin_lock_bh(&ul->lock);
1392         list_add_tail(&rt->rt_uncached, &ul->head);
1393         spin_unlock_bh(&ul->lock);
1394 }
1395
1396 static void ipv4_dst_destroy(struct dst_entry *dst)
1397 {
1398         struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1399         struct rtable *rt = (struct rtable *) dst;
1400
1401         if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt))
1402                 kfree(p);
1403
1404         if (!list_empty(&rt->rt_uncached)) {
1405                 struct uncached_list *ul = rt->rt_uncached_list;
1406
1407                 spin_lock_bh(&ul->lock);
1408                 list_del(&rt->rt_uncached);
1409                 spin_unlock_bh(&ul->lock);
1410         }
1411 }
1412
1413 void rt_flush_dev(struct net_device *dev)
1414 {
1415         struct net *net = dev_net(dev);
1416         struct rtable *rt;
1417         int cpu;
1418
1419         for_each_possible_cpu(cpu) {
1420                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1421
1422                 spin_lock_bh(&ul->lock);
1423                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1424                         if (rt->dst.dev != dev)
1425                                 continue;
1426                         rt->dst.dev = net->loopback_dev;
1427                         dev_hold(rt->dst.dev);
1428                         dev_put(dev);
1429                 }
1430                 spin_unlock_bh(&ul->lock);
1431         }
1432 }
1433
1434 static bool rt_cache_valid(const struct rtable *rt)
1435 {
1436         return  rt &&
1437                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1438                 !rt_is_expired(rt);
1439 }
1440
1441 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1442                            const struct fib_result *res,
1443                            struct fib_nh_exception *fnhe,
1444                            struct fib_info *fi, u16 type, u32 itag,
1445                            const bool do_cache)
1446 {
1447         bool cached = false;
1448
1449         if (fi) {
1450                 struct fib_nh *nh = &FIB_RES_NH(*res);
1451
1452                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1453                         rt->rt_gateway = nh->nh_gw;
1454                         rt->rt_uses_gateway = 1;
1455                 }
1456                 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1457                 if (fi->fib_metrics != &dst_default_metrics) {
1458                         rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1459                         atomic_inc(&fi->fib_metrics->refcnt);
1460                 }
1461 #ifdef CONFIG_IP_ROUTE_CLASSID
1462                 rt->dst.tclassid = nh->nh_tclassid;
1463 #endif
1464                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1465                 if (unlikely(fnhe))
1466                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1467                 else if (do_cache)
1468                         cached = rt_cache_route(nh, rt);
1469                 if (unlikely(!cached)) {
1470                         /* Routes we intend to cache in nexthop exception or
1471                          * FIB nexthop have the DST_NOCACHE bit clear.
1472                          * However, if we are unsuccessful at storing this
1473                          * route into the cache we really need to set it.
1474                          */
1475                         if (!rt->rt_gateway)
1476                                 rt->rt_gateway = daddr;
1477                         rt_add_uncached_list(rt);
1478                 }
1479         } else
1480                 rt_add_uncached_list(rt);
1481
1482 #ifdef CONFIG_IP_ROUTE_CLASSID
1483 #ifdef CONFIG_IP_MULTIPLE_TABLES
1484         set_class_tag(rt, res->tclassid);
1485 #endif
1486         set_class_tag(rt, itag);
1487 #endif
1488 }
1489
1490 struct rtable *rt_dst_alloc(struct net_device *dev,
1491                             unsigned int flags, u16 type,
1492                             bool nopolicy, bool noxfrm, bool will_cache)
1493 {
1494         struct rtable *rt;
1495
1496         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1497                        (will_cache ? 0 : DST_HOST) |
1498                        (nopolicy ? DST_NOPOLICY : 0) |
1499                        (noxfrm ? DST_NOXFRM : 0));
1500
1501         if (rt) {
1502                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1503                 rt->rt_flags = flags;
1504                 rt->rt_type = type;
1505                 rt->rt_is_input = 0;
1506                 rt->rt_iif = 0;
1507                 rt->rt_pmtu = 0;
1508                 rt->rt_gateway = 0;
1509                 rt->rt_uses_gateway = 0;
1510                 rt->rt_table_id = 0;
1511                 INIT_LIST_HEAD(&rt->rt_uncached);
1512
1513                 rt->dst.output = ip_output;
1514                 if (flags & RTCF_LOCAL)
1515                         rt->dst.input = ip_local_deliver;
1516         }
1517
1518         return rt;
1519 }
1520 EXPORT_SYMBOL(rt_dst_alloc);
1521
1522 /* called in rcu_read_lock() section */
1523 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1524                           u8 tos, struct net_device *dev,
1525                           struct in_device *in_dev, u32 *itag)
1526 {
1527         int err;
1528
1529         /* Primary sanity checks. */
1530         if (!in_dev)
1531                 return -EINVAL;
1532
1533         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1534             skb->protocol != htons(ETH_P_IP))
1535                 return -EINVAL;
1536
1537         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1538                 return -EINVAL;
1539
1540         if (ipv4_is_zeronet(saddr)) {
1541                 if (!ipv4_is_local_multicast(daddr))
1542                         return -EINVAL;
1543         } else {
1544                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1545                                           in_dev, itag);
1546                 if (err < 0)
1547                         return err;
1548         }
1549         return 0;
1550 }
1551
1552 /* called in rcu_read_lock() section */
1553 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1554                              u8 tos, struct net_device *dev, int our)
1555 {
1556         struct in_device *in_dev = __in_dev_get_rcu(dev);
1557         unsigned int flags = RTCF_MULTICAST;
1558         struct rtable *rth;
1559         u32 itag = 0;
1560         int err;
1561
1562         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1563         if (err)
1564                 return err;
1565
1566         if (our)
1567                 flags |= RTCF_LOCAL;
1568
1569         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1570                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1571         if (!rth)
1572                 return -ENOBUFS;
1573
1574 #ifdef CONFIG_IP_ROUTE_CLASSID
1575         rth->dst.tclassid = itag;
1576 #endif
1577         rth->dst.output = ip_rt_bug;
1578         rth->rt_is_input= 1;
1579
1580 #ifdef CONFIG_IP_MROUTE
1581         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1582                 rth->dst.input = ip_mr_input;
1583 #endif
1584         RT_CACHE_STAT_INC(in_slow_mc);
1585
1586         skb_dst_set(skb, &rth->dst);
1587         return 0;
1588 }
1589
1590
1591 static void ip_handle_martian_source(struct net_device *dev,
1592                                      struct in_device *in_dev,
1593                                      struct sk_buff *skb,
1594                                      __be32 daddr,
1595                                      __be32 saddr)
1596 {
1597         RT_CACHE_STAT_INC(in_martian_src);
1598 #ifdef CONFIG_IP_ROUTE_VERBOSE
1599         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1600                 /*
1601                  *      RFC1812 recommendation, if source is martian,
1602                  *      the only hint is MAC header.
1603                  */
1604                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1605                         &daddr, &saddr, dev->name);
1606                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1607                         print_hex_dump(KERN_WARNING, "ll header: ",
1608                                        DUMP_PREFIX_OFFSET, 16, 1,
1609                                        skb_mac_header(skb),
1610                                        dev->hard_header_len, true);
1611                 }
1612         }
1613 #endif
1614 }
1615
1616 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1617 {
1618         struct fnhe_hash_bucket *hash;
1619         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1620         u32 hval = fnhe_hashfun(daddr);
1621
1622         spin_lock_bh(&fnhe_lock);
1623
1624         hash = rcu_dereference_protected(nh->nh_exceptions,
1625                                          lockdep_is_held(&fnhe_lock));
1626         hash += hval;
1627
1628         fnhe_p = &hash->chain;
1629         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1630         while (fnhe) {
1631                 if (fnhe->fnhe_daddr == daddr) {
1632                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1633                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1634                         fnhe_flush_routes(fnhe);
1635                         kfree_rcu(fnhe, rcu);
1636                         break;
1637                 }
1638                 fnhe_p = &fnhe->fnhe_next;
1639                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1640                                                  lockdep_is_held(&fnhe_lock));
1641         }
1642
1643         spin_unlock_bh(&fnhe_lock);
1644 }
1645
1646 static void set_lwt_redirect(struct rtable *rth)
1647 {
1648         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1649                 rth->dst.lwtstate->orig_output = rth->dst.output;
1650                 rth->dst.output = lwtunnel_output;
1651         }
1652
1653         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1654                 rth->dst.lwtstate->orig_input = rth->dst.input;
1655                 rth->dst.input = lwtunnel_input;
1656         }
1657 }
1658
1659 /* called in rcu_read_lock() section
      *
      * Attach a forwarding route to @skb for the lookup result @res.  A cached
      * input route from the nexthop exception or from the nexthop itself is
      * reused when rt_cache_valid() accepts it; otherwise a new rtable is
      * allocated with ip_forward() as its input handler.
      *
      * Returns 0 on success, the negative value from fib_validate_source(),
      * -EINVAL (no in_device on the output device, or a non-IP packet that
      * would go back out the receiving interface while
      * IN_DEV_PROXY_ARP_PVLAN() is 0), or -ENOBUFS when rt_dst_alloc() fails.
      */
1660 static int __mkroute_input(struct sk_buff *skb,
1661                            const struct fib_result *res,
1662                            struct in_device *in_dev,
1663                            __be32 daddr, __be32 saddr, u32 tos)
1664 {
1665         struct fib_nh_exception *fnhe;
1666         struct rtable *rth;
1667         int err;
1668         struct in_device *out_dev;
1669         bool do_cache;
1670         u32 itag = 0;
1671
1672         /* get a working reference to the output device */
1673         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1674         if (!out_dev) {
1675                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1676                 return -EINVAL;
1677         }
1678
             /* A negative result is a martian source; a positive one is used
              * below as one of the conditions for flagging a redirect.
              */
1679         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1680                                   in_dev->dev, in_dev, &itag);
1681         if (err < 0) {
1682                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1683                                          saddr);
1684
1685                 goto cleanup;
1686         }
1687
             /* Caching needs a fib_info and is skipped when
              * fib_validate_source() left a non-zero itag.
              */
1688         do_cache = res->fi && !itag;
             /* Packet would leave on the device it arrived on: mark the skb
              * with IPSKB_DOREDIRECT when redirects are enabled and either the
              * medium is shared or the source is on-link for the gateway.
              */
1689         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1690             skb->protocol == htons(ETH_P_IP) &&
1691             (IN_DEV_SHARED_MEDIA(out_dev) ||
1692              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1693                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1694
1695         if (skb->protocol != htons(ETH_P_IP)) {
1696                 /* Not IP (i.e. ARP). Do not create route, if it is
1697                  * invalid for proxy arp. DNAT routes are always valid.
1698                  *
1699                  * Proxy arp feature have been extended to allow, ARP
1700                  * replies back to the same interface, to support
1701                  * Private VLAN switch technologies. See arp.c.
1702                  */
1703                 if (out_dev == in_dev &&
1704                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1705                         err = -EINVAL;
1706                         goto cleanup;
1707                 }
1708         }
1709
1710         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1711         if (do_cache) {
1712                 if (fnhe) {
1713                         rth = rcu_dereference(fnhe->fnhe_rth_input);
                             /* Expired exception route: delete the exception
                              * and fall back to the nexthop's own cached route.
                              */
1714                         if (rth && rth->dst.expires &&
1715                             time_after(jiffies, rth->dst.expires)) {
1716                                 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1717                                 fnhe = NULL;
1718                         } else {
1719                                 goto rt_cache;
1720                         }
1721                 }
1722
1723                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1724
                     /* Also entered by goto from the exception branch above,
                      * with rth taken from the exception entry.
                      */
1725 rt_cache:
1726                 if (rt_cache_valid(rth)) {
1727                         skb_dst_set_noref(skb, &rth->dst);
1728                         goto out;
1729                 }
1730         }
1731
             /* No usable cached route: build a new one. */
1732         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1733                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1734                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1735         if (!rth) {
1736                 err = -ENOBUFS;
1737                 goto cleanup;
1738         }
1739
1740         rth->rt_is_input = 1;
1741         if (res->table)
1742                 rth->rt_table_id = res->table->tb_id;
1743         RT_CACHE_STAT_INC(in_slow_tot);
1744
1745         rth->dst.input = ip_forward;
1746
1747         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1748                        do_cache);
1749         set_lwt_redirect(rth);
1750         skb_dst_set(skb, &rth->dst);
1751 out:
1752         err = 0;
1753  cleanup:
1754         return err;
1755 }
1756
1757 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1758 /* To make ICMP packets follow the right flow, the multipath hash is
1759  * calculated from the inner IP addresses.
      *
      * @hash_keys always receives the outer source/destination addresses
      * first.  They are replaced by the addresses of the IP header that
      * follows the ICMP header only when the packet is ICMP with a zero
      * fragment offset, has one of the ICMP types checked below, and both
      * headers can be read from @skb.
1760  */
1761 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1762                                  struct flow_keys *hash_keys)
1763 {
1764         const struct iphdr *outer_iph = ip_hdr(skb);
1765         const struct iphdr *inner_iph;
1766         const struct icmphdr *icmph;
1767         struct iphdr _inner_iph;
1768         struct icmphdr _icmph;
1769
1770         hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1771         hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1772         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1773                 return;
1774
             /* Keep the outer addresses for fragments with a non-zero offset. */
1775         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1776                 return;
1777
             /* _icmph is the local copy buffer handed to skb_header_pointer(). */
1778         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1779                                    &_icmph);
1780         if (!icmph)
1781                 return;
1782
             /* Only these ICMP types are looked into for an inner IP header. */
1783         if (icmph->type != ICMP_DEST_UNREACH &&
1784             icmph->type != ICMP_REDIRECT &&
1785             icmph->type != ICMP_TIME_EXCEEDED &&
1786             icmph->type != ICMP_PARAMETERPROB)
1787                 return;
1788
             /* The inner IP header is read from right after the ICMP header. */
1789         inner_iph = skb_header_pointer(skb,
1790                                        outer_iph->ihl * 4 + sizeof(_icmph),
1791                                        sizeof(_inner_iph), &_inner_iph);
1792         if (!inner_iph)
1793                 return;
1794         hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1795         hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1796 }
1797
1798 /* if skb is set it will be used and fl4 can be NULL
      *
      * Compute the multipath hash for a flow according to
      * net->ipv4.sysctl_fib_multipath_hash_policy:
      *   0 - addresses only (taken from @skb via ip_multipath_l3_keys(), or
      *       from @fl4 when @skb is NULL);
      *   1 - addresses, ports and protocol (dissected from @skb, or taken
      *       from @fl4 when @skb is NULL).  If the skb already carries an L4
      *       hash, that hash is reused.
      *
      * Returns the 32-bit hash shifted right by one bit.
      */
1799 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1800                        const struct sk_buff *skb)
1801 {
1802         struct net *net = fi->fib_net;
1803         struct flow_keys hash_keys;
1804         u32 mhash;
1805
1806         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1807         case 0:
1808                 memset(&hash_keys, 0, sizeof(hash_keys));
1809                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1810                 if (skb) {
1811                         ip_multipath_l3_keys(skb, &hash_keys);
1812                 } else {
1813                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1814                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1815                 }
1816                 break;
1817         case 1:
1818                 /* skb is currently provided only when forwarding */
1819                 if (skb) {
1820                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1821                         struct flow_keys keys;
1822
1823                         /* short-circuit if we already have L4 hash present */
1824                         if (skb->l4_hash)
1825                                 return skb_get_hash_raw(skb) >> 1;
1826                         memset(&hash_keys, 0, sizeof(hash_keys));
1827                         skb_flow_dissect_flow_keys(skb, &keys, flag);
     
                             /* Tag the copied addresses as IPv4, exactly as the
                              * other branches do; hash_keys was just zeroed, so
                              * without this the address type stays unset.
                              */
                             hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1828                         hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1829                         hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1830                         hash_keys.ports.src = keys.ports.src;
1831                         hash_keys.ports.dst = keys.ports.dst;
1832                         hash_keys.basic.ip_proto = keys.basic.ip_proto;
1833                 } else {
1834                         memset(&hash_keys, 0, sizeof(hash_keys));
1835                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1836                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1837                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1838                         hash_keys.ports.src = fl4->fl4_sport;
1839                         hash_keys.ports.dst = fl4->fl4_dport;
1840                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1841                 }
1842                 break;
1843         }
1844         mhash = flow_hash_from_keys(&hash_keys);
1845
1846         return mhash >> 1;
1847 }
1848 EXPORT_SYMBOL_GPL(fib_multipath_hash);
1849 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1850
1851 static int ip_mkroute_input(struct sk_buff *skb,
1852                             struct fib_result *res,
1853                             struct in_device *in_dev,
1854                             __be32 daddr, __be32 saddr, u32 tos)
1855 {
             /* Thin wrapper around __mkroute_input().  With
              * CONFIG_IP_ROUTE_MULTIPATH and more than one nexthop in the
              * fib_info, the packet is hashed first and the hash is handed to
              * fib_select_multipath() together with @res (presumably to pick
              * the nexthop recorded in @res - confirm against its definition).
              * Returns whatever __mkroute_input() returns.
              */
1856 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1857         if (res->fi && res->fi->fib_nhs > 1) {
1858                 int h = fib_multipath_hash(res->fi, NULL, skb);
1859
1860                 fib_select_multipath(res, h);
1861         }
1862 #endif
1863
1864         /* create a routing cache entry */
1865         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1866 }
1867
1868 /*
1869  *      NOTE. We drop all packets that have local source
1870  *      addresses, because every properly looped back packet
1871  *      must already have the correct destination attached by the output routine.
1872  *
1873  *      This approach solves two big problems:
1874  *      1. Non-simplex devices are handled properly.
1875  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1876  *      Called with rcu_read_lock().
      *
      *      Returns 0 with a route attached to @skb, -EINVAL for a device
      *      without in_device, martian destinations and non-IP broadcasts,
      *      -ENOBUFS when the rtable cannot be allocated, the negative
      *      result of fib_validate_source() for martian sources, or the
      *      result of ip_mkroute_input() for forwarded packets.
      *      A failed lookup does not fail the call: an RTN_UNREACHABLE
      *      route carrying the error in dst.error is attached instead.
1877  */
1878
1879 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1880                                u8 tos, struct net_device *dev,
1881                                struct fib_result *res)
1882 {
1883         struct in_device *in_dev = __in_dev_get_rcu(dev);
1884         struct ip_tunnel_info *tun_info;
1885         struct flowi4   fl4;
1886         unsigned int    flags = 0;
1887         u32             itag = 0;
1888         struct rtable   *rth;
1889         int             err = -EINVAL;
1890         struct net    *net = dev_net(dev);
1891         bool do_cache;
1892
1893         /* IP on this device is disabled. */
1894
1895         if (!in_dev)
1896                 goto out;
1897
1898         /* Check for the weirdest martians, which may not be detected
1899            by fib_lookup.
1900          */
1901
             /* The tunnel id is read before skb_dst_drop() below releases the
              * skb's current dst.
              */
1902         tun_info = skb_tunnel_info(skb);
1903         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1904                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1905         else
1906                 fl4.flowi4_tun_key.tun_id = 0;
1907         skb_dst_drop(skb);
1908
1909         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1910                 goto martian_source;
1911
1912         res->fi = NULL;
1913         res->table = NULL;
1914         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1915                 goto brd_input;
1916
1917         /* Accept zero addresses only to limited broadcast;
1918          * I do not even know whether to fix it or not. Waiting for complaints :-)
1919          */
1920         if (ipv4_is_zeronet(saddr))
1921                 goto martian_source;
1922
1923         if (ipv4_is_zeronet(daddr))
1924                 goto martian_destination;
1925
1926         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1927          * and calls it once if daddr and/or saddr are loopback addresses
1928          */
1929         if (ipv4_is_loopback(daddr)) {
1930                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1931                         goto martian_destination;
1932         } else if (ipv4_is_loopback(saddr)) {
1933                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1934                         goto martian_source;
1935         }
1936
1937         /*
1938          *      Now we are ready to route packet.
1939          */
             /* NOTE(review): fl4 is a stack variable and only the fields
              * assigned here and the tunnel id above are written in this
              * function.
              */
1940         fl4.flowi4_oif = 0;
1941         fl4.flowi4_iif = dev->ifindex;
1942         fl4.flowi4_mark = skb->mark;
1943         fl4.flowi4_tos = tos;
1944         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1945         fl4.flowi4_flags = 0;
1946         fl4.daddr = daddr;
1947         fl4.saddr = saddr;
1948         fl4.flowi4_uid = sock_net_uid(net, NULL);
1949         err = fib_lookup(net, &fl4, res, 0);
1950         if (err != 0) {
                     /* With forwarding disabled the lookup error is replaced
                      * by -EHOSTUNREACH.
                      */
1951                 if (!IN_DEV_FORWARD(in_dev))
1952                         err = -EHOSTUNREACH;
1953                 goto no_route;
1954         }
1955
1956         if (res->type == RTN_BROADCAST)
1957                 goto brd_input;
1958
1959         if (res->type == RTN_LOCAL) {
1960                 err = fib_validate_source(skb, saddr, daddr, tos,
1961                                           0, dev, in_dev, &itag);
1962                 if (err < 0)
1963                         goto martian_source;
1964                 goto local_input;
1965         }
1966
             /* Everything below is forwarding. */
1967         if (!IN_DEV_FORWARD(in_dev)) {
1968                 err = -EHOSTUNREACH;
1969                 goto no_route;
1970         }
1971         if (res->type != RTN_UNICAST)
1972                 goto martian_destination;
1973
1974         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1975 out:    return err;
1976
             /* Broadcast destination: only IP packets are accepted. */
1977 brd_input:
1978         if (skb->protocol != htons(ETH_P_IP))
1979                 goto e_inval;
1980
1981         if (!ipv4_is_zeronet(saddr)) {
1982                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1983                                           in_dev, &itag);
1984                 if (err < 0)
1985                         goto martian_source;
1986         }
1987         flags |= RTCF_BROADCAST;
1988         res->type = RTN_BROADCAST;
1989         RT_CACHE_STAT_INC(in_brd);
1990
             /* Shared tail for local, broadcast and unreachable routes. */
1991 local_input:
1992         do_cache = false;
1993         if (res->fi) {
1994                 if (!itag) {
1995                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1996                         if (rt_cache_valid(rth)) {
1997                                 skb_dst_set_noref(skb, &rth->dst);
1998                                 err = 0;
1999                                 goto out;
2000                         }
2001                         do_cache = true;
2002                 }
2003         }
2004
2005         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2006                            flags | RTCF_LOCAL, res->type,
2007                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2008         if (!rth)
2009                 goto e_nobufs;
2010
2011         rth->dst.output= ip_rt_bug;
2012 #ifdef CONFIG_IP_ROUTE_CLASSID
2013         rth->dst.tclassid = itag;
2014 #endif
2015         rth->rt_is_input = 1;
2016         if (res->table)
2017                 rth->rt_table_id = res->table->tb_id;
2018
2019         RT_CACHE_STAT_INC(in_slow_tot);
             /* Reached via no_route: err still holds the negative lookup
              * error, stored here as a positive dst.error.
              */
2020         if (res->type == RTN_UNREACHABLE) {
2021                 rth->dst.input= ip_error;
2022                 rth->dst.error= -err;
2023                 rth->rt_flags   &= ~RTCF_LOCAL;
2024         }
2025
2026         if (do_cache) {
2027                 struct fib_nh *nh = &FIB_RES_NH(*res);
2028
2029                 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2030                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2031                         WARN_ON(rth->dst.input == lwtunnel_input);
2032                         rth->dst.lwtstate->orig_input = rth->dst.input;
2033                         rth->dst.input = lwtunnel_input;
2034                 }
2035
                     /* If the route could not be stored in the nexthop cache,
                      * it is put on the uncached list instead.
                      */
2036                 if (unlikely(!rt_cache_route(nh, rth)))
2037                         rt_add_uncached_list(rth);
2038         }
2039         skb_dst_set(skb, &rth->dst);
2040         err = 0;
2041         goto out;
2042
             /* err is deliberately left untouched here; local_input uses it. */
2043 no_route:
2044         RT_CACHE_STAT_INC(in_no_route);
2045         res->type = RTN_UNREACHABLE;
2046         res->fi = NULL;
2047         res->table = NULL;
2048         goto local_input;
2049
2050         /*
2051          *      Do not cache martian addresses: they should be logged (RFC1812)
2052          */
2053 martian_destination:
2054         RT_CACHE_STAT_INC(in_martian_dst);
2055 #ifdef CONFIG_IP_ROUTE_VERBOSE
2056         if (IN_DEV_LOG_MARTIANS(in_dev))
2057                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2058                                      &daddr, &saddr, dev->name);
2059 #endif
2060
2061 e_inval:
2062         err = -EINVAL;
2063         goto out;
2064
2065 e_nobufs:
2066         err = -ENOBUFS;
2067         goto out;
2068
             /* err keeps the value it had at the goto (initial -EINVAL or the
              * negative fib_validate_source() result).
              */
2069 martian_source:
2070         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2071         goto out;
2072 }
2073
2074 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2075                          u8 tos, struct net_device *dev)
2076 {
             /* Entry point that masks @tos with IPTOS_RT_MASK and runs
              * ip_route_input_rcu() inside its own rcu_read_lock() section.
              * The fib_result is a local scratch variable and is not handed
              * back to the caller.  Returns ip_route_input_rcu()'s result.
              */
2077         struct fib_result res;
2078         int err;
2079
2080         tos &= IPTOS_RT_MASK;
2081         rcu_read_lock();
2082         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2083         rcu_read_unlock();
2084
2085         return err;
2086 }
2087 EXPORT_SYMBOL(ip_route_input_noref);
2088
2089 /* called with rcu_read_lock held
      *
      * Route an incoming packet.  Multicast destinations are handled here;
      * everything else is passed to ip_route_input_slow().
      *
      * For multicast, ip_route_input_mc() is called when the group is ours
      * (checked on @dev and, for an l3 slave, on skb->dev) or, with
      * CONFIG_IP_MROUTE, when the address is not a local multicast address
      * and multicast forwarding is enabled on @dev.  Otherwise -EINVAL is
      * returned.
      */
2090 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2091                        u8 tos, struct net_device *dev, struct fib_result *res)
2092 {
2093         /* Multicast recognition logic is moved from route cache to here.
2094            The problem was that too many Ethernet cards have broken/missing
2095            hardware multicast filters :-( As result the host on multicasting
2096            network acquires a lot of useless route cache entries, sort of
2097            SDR messages from all the world. Now we try to get rid of them.
2098            Really, provided software IP multicast filter is organized
2099            reasonably (at least, hashed), it does not result in a slowdown
2100            comparing with route cache reject entries.
2101            Note, that multicast routers are not affected, because
2102            route cache entry is created eventually.
2103          */
2104         if (ipv4_is_multicast(daddr)) {
2105                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2106                 int our = 0;
2107                 int err = -EINVAL;
2108
2109                 if (in_dev)
2110                         our = ip_check_mc_rcu(in_dev, daddr, saddr,
2111                                               ip_hdr(skb)->protocol);
2112
2113                 /* check l3 master if no match yet */
2114                 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2115                         struct in_device *l3_in_dev;
2116
2117                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2118                         if (l3_in_dev)
2119                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2120                                                       ip_hdr(skb)->protocol);
2121                 }
2122
                     /* in_dev may be NULL at this point (see the checks
                      * above), so it is tested before IN_DEV_MFORWARD()
                      * looks at it.
                      */
2123                 if (our
2124 #ifdef CONFIG_IP_MROUTE
2125                         ||
2126                     (!ipv4_is_local_multicast(daddr) &&
2127                      in_dev && IN_DEV_MFORWARD(in_dev))
2128 #endif
2129                    ) {
2130                         err = ip_route_input_mc(skb, daddr, saddr,
2131                                                 tos, dev, our);
2132                 }
2133                 return err;
2134         }
2135
2136         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2137 }
2138
2139 /* called with rcu_read_lock()
      *
      * Produce the output route for @fl4 through @dev_out.  A cached rtable
      * (from the nexthop exception or the per-cpu nexthop cache) is returned
      * when rt_cache_valid() accepts it and dst_hold_safe() succeeds;
      * otherwise a new rtable is allocated and set up.
      *
      * Returns the rtable, ERR_PTR(-EINVAL) when @dev_out has no in_device,
      * the source is a disallowed loopback address or the destination is in
      * the zero network, or ERR_PTR(-ENOBUFS) when allocation fails.
      */
2140 static struct rtable *__mkroute_output(const struct fib_result *res,
2141                                        const struct flowi4 *fl4, int orig_oif,
2142                                        struct net_device *dev_out,
2143                                        unsigned int flags)
2144 {
2145         struct fib_info *fi = res->fi;
2146         struct fib_nh_exception *fnhe;
2147         struct in_device *in_dev;
2148         u16 type = res->type;
2149         struct rtable *rth;
2150         bool do_cache;
2151
2152         in_dev = __in_dev_get_rcu(dev_out);
2153         if (!in_dev)
2154                 return ERR_PTR(-EINVAL);
2155
             /* A loopback source is only accepted on a loopback or l3 master
              * device unless route_localnet is enabled.
              */
2156         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2157                 if (ipv4_is_loopback(fl4->saddr) &&
2158                     !(dev_out->flags & IFF_LOOPBACK) &&
2159                     !netif_is_l3_master(dev_out))
2160                         return ERR_PTR(-EINVAL);
2161
             /* The destination address overrides the type from the lookup. */
2162         if (ipv4_is_lbcast(fl4->daddr))
2163                 type = RTN_BROADCAST;
2164         else if (ipv4_is_multicast(fl4->daddr))
2165                 type = RTN_MULTICAST;
2166         else if (ipv4_is_zeronet(fl4->daddr))
2167                 return ERR_PTR(-EINVAL);
2168
2169         if (dev_out->flags & IFF_LOOPBACK)
2170                 flags |= RTCF_LOCAL;
2171
2172         do_cache = true;
2173         if (type == RTN_BROADCAST) {
2174                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2175                 fi = NULL;
2176         } else if (type == RTN_MULTICAST) {
2177                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
                     /* RTCF_LOCAL is kept only when ip_check_mc_rcu() reports
                      * a match, and in that case the route is not cached.
                      */
2178                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2179                                      fl4->flowi4_proto))
2180                         flags &= ~RTCF_LOCAL;
2181                 else
2182                         do_cache = false;
2183                 /* If multicast route do not exist use
2184                  * default one, but do not gateway in this case.
2185                  * Yes, it is hack.
2186                  */
2187                 if (fi && res->prefixlen < 4)
2188                         fi = NULL;
2189         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2190                    (orig_oif != dev_out->ifindex)) {
2191                 /* For local routes that require a particular output interface
2192                  * we do not want to cache the result.  Caching the result
2193                  * causes incorrect behaviour when there are multiple source
2194                  * addresses on the interface, the end result being that if the
2195                  * intended recipient is waiting on that interface for the
2196                  * packet he won't receive it because it will be delivered on
2197                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2198                  * be set to the loopback interface as well.
2199                  */
2200                 fi = NULL;
2201         }
2202
2203         fnhe = NULL;
             /* Without a fib_info there is no nexthop to cache on. */
2204         do_cache &= fi != NULL;
2205         if (do_cache) {
2206                 struct rtable __rcu **prth;
2207                 struct fib_nh *nh = &FIB_RES_NH(*res);
2208
2209                 fnhe = find_exception(nh, fl4->daddr);
2210                 if (fnhe) {
2211                         prth = &fnhe->fnhe_rth_output;
2212                         rth = rcu_dereference(*prth);
                             /* Expired exception route: delete the exception
                              * and continue with the nexthop's own cache.
                              */
2213                         if (rth && rth->dst.expires &&
2214                             time_after(jiffies, rth->dst.expires)) {
2215                                 ip_del_fnhe(nh, fl4->daddr);
2216                                 fnhe = NULL;
2217                         } else {
2218                                 goto rt_cache;
2219                         }
2220                 }
2221
                     /* FLOWI_FLAG_KNOWN_NH is set but the nexthop is not a
                      * link-scope gateway: build an uncached route.
                      */
2222                 if (unlikely(fl4->flowi4_flags &
2223                              FLOWI_FLAG_KNOWN_NH &&
2224                              !(nh->nh_gw &&
2225                                nh->nh_scope == RT_SCOPE_LINK))) {
2226                         do_cache = false;
2227                         goto add;
2228                 }
2229                 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2230                 rth = rcu_dereference(*prth);
2231
2232 rt_cache:
                     /* A cached route is only returned when a reference could
                      * be taken on it.
                      */
2233                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2234                         return rth;
2235         }
2236
2237 add:
2238         rth = rt_dst_alloc(dev_out, flags, type,
2239                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2240                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2241                            do_cache);
2242         if (!rth)
2243                 return ERR_PTR(-ENOBUFS);
2244
2245         rth->rt_iif     = orig_oif ? : 0;
2246         if (res->table)
2247                 rth->rt_table_id = res->table->tb_id;
2248
2249         RT_CACHE_STAT_INC(out_slow_tot);
2250
2251         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2252                 if (flags & RTCF_LOCAL &&
2253                     !(dev_out->flags & IFF_LOOPBACK)) {
2254                         rth->dst.output = ip_mc_output;
2255                         RT_CACHE_STAT_INC(out_slow_mc);
2256                 }
2257 #ifdef CONFIG_IP_MROUTE
2258                 if (type == RTN_MULTICAST) {
2259                         if (IN_DEV_MFORWARD(in_dev) &&
2260                             !ipv4_is_local_multicast(fl4->daddr)) {
2261                                 rth->dst.input = ip_mr_input;
2262                                 rth->dst.output = ip_mc_output;
2263                         }
2264                 }
2265 #endif
2266         }
2267
2268         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2269         set_lwt_redirect(rth);
2270
2271         return rth;
2272 }
2273
2274 /*
2275  * Major route resolver routine.
      *
      * Normalises the lookup key in @fl4 (input interface forced to
      * LOOPBACK_IFINDEX, TOS masked with IPTOS_RT_MASK, scope derived from
      * the RTO_ONLINK bit) and resolves the route under rcu_read_lock() via
      * ip_route_output_key_hash_rcu().  Returns that function's result,
      * which is either an rtable or an ERR_PTR() value.
2276  */
2277
2278 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2279                                         const struct sk_buff *skb)
2280 {
2281         __u8 tos = RT_FL_TOS(fl4);
2282         struct fib_result res;
2283         struct rtable *rth;
2284
             /* Give res.type a defined value too: the resolver has paths that
              * reach __mkroute_output(), which reads res->type, without ever
              * assigning it.
              */
             res.type        = RTN_UNSPEC;
2285         res.tclassid    = 0;
2286         res.fi          = NULL;
2287         res.table       = NULL;
2288
2289         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2290         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2291         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2292                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2293
2294         rcu_read_lock();
2295         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2296         rcu_read_unlock();
2297
2298         return rth;
2299 }
2300 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2301
2302 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2303                                             struct fib_result *res,
2304                                             const struct sk_buff *skb)
2305 {
             /* Resolve the output route for @fl4.  Along the way @fl4 may get
              * its saddr, daddr and flowi4_oif filled in, and @res receives
              * the FIB lookup result.  ip_route_output_key_hash() in this
              * file calls this inside an rcu_read_lock() section.
              *
              * Returns the rtable from __mkroute_output(), or an ERR_PTR():
              * -EINVAL for an unusable source address, -ENODEV when the
              * requested output interface does not exist, -ENETUNREACH when
              * it is down or has no in_device, or the fib_lookup() error.
              */
2306         struct net_device *dev_out = NULL;
2307         int orig_oif = fl4->flowi4_oif;
2308         unsigned int flags = 0;
2309         struct rtable *rth;
2310         int err = -ENETUNREACH;
2311
2312         if (fl4->saddr) {
2313                 rth = ERR_PTR(-EINVAL);
2314                 if (ipv4_is_multicast(fl4->saddr) ||
2315                     ipv4_is_lbcast(fl4->saddr) ||
2316                     ipv4_is_zeronet(fl4->saddr))
2317                         goto out;
2318
2319                 /* I removed check for oif == dev_out->oif here.
2320                    It was wrong for two reasons:
2321                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2322                       is assigned to multiple interfaces.
2323                    2. Moreover, we are allowed to send packets with saddr
2324                       of another iface. --ANK
2325                  */
2326
2327                 if (fl4->flowi4_oif == 0 &&
2328                     (ipv4_is_multicast(fl4->daddr) ||
2329                      ipv4_is_lbcast(fl4->daddr))) {
2330                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2331                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2332                         if (!dev_out)
2333                                 goto out;
2334
2335                         /* Special hack: user can direct multicasts
2336                            and limited broadcast via necessary interface
2337                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2338                            This hack is not just for fun, it allows
2339                            vic,vat and friends to work.
2340                            They bind socket to loopback, set ttl to zero
2341                            and expect that it will work.
2342                            From the viewpoint of routing cache they are broken,
2343                            because we are not allowed to build multicast path
2344                            with loopback source addr (look, routing cache
2345                            cannot know, that ttl is zero, so that packet
2346                            will not leave this host and route is valid).
2347                            Luckily, this hack is good workaround.
2348                          */
2349
2350                         fl4->flowi4_oif = dev_out->ifindex;
2351                         goto make_route;
2352                 }
2353
2354                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2355                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2356                         if (!__ip_dev_find(net, fl4->saddr, false))
2357                                 goto out;
2358                 }
2359         }
2360
2361
             /* An explicit output interface was requested. */
2362         if (fl4->flowi4_oif) {
2363                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2364                 rth = ERR_PTR(-ENODEV);
2365                 if (!dev_out)
2366                         goto out;
2367
2368                 /* RACE: Check return value of inet_select_addr instead. */
2369                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2370                         rth = ERR_PTR(-ENETUNREACH);
2371                         goto out;
2372                 }
                     /* NOTE(review): res->type is not written on this path,
                      * yet __mkroute_output() reads it, so the value seen
                      * there is whatever the caller put into @res.
                      */
2373                 if (ipv4_is_local_multicast(fl4->daddr) ||
2374                     ipv4_is_lbcast(fl4->daddr) ||
2375                     fl4->flowi4_proto == IPPROTO_IGMP) {
2376                         if (!fl4->saddr)
2377                                 fl4->saddr = inet_select_addr(dev_out, 0,
2378                                                               RT_SCOPE_LINK);
2379                         goto make_route;
2380                 }
2381                 if (!fl4->saddr) {
2382                         if (ipv4_is_multicast(fl4->daddr))
2383                                 fl4->saddr = inet_select_addr(dev_out, 0,
2384                                                               fl4->flowi4_scope);
2385                         else if (!fl4->daddr)
2386                                 fl4->saddr = inet_select_addr(dev_out, 0,
2387                                                               RT_SCOPE_HOST);
2388                 }
2389         }
2390
             /* No destination given: route to the source address, or to
              * INADDR_LOOPBACK when that is unset too, via the loopback device.
              */
2391         if (!fl4->daddr) {
2392                 fl4->daddr = fl4->saddr;
2393                 if (!fl4->daddr)
2394                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2395                 dev_out = net->loopback_dev;
2396                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2397                 res->type = RTN_LOCAL;
2398                 flags |= RTCF_LOCAL;
2399                 goto make_route;
2400         }
2401
2402         err = fib_lookup(net, fl4, res, 0);
2403         if (err) {
2404                 res->fi = NULL;
2405                 res->table = NULL;
2406                 if (fl4->flowi4_oif &&
2407                     (ipv4_is_multicast(fl4->daddr) ||
2408                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2409                         /* Apparently, routing tables are wrong. Assume,
2410                            that the destination is on link.
2411
2412                            WHY? DW.
2413                            Because we are allowed to send to iface
2414                            even if it has NO routes and NO assigned
2415                            addresses. When oif is specified, routing
2416                            tables are looked up with only one purpose:
2417                            to catch if destination is gatewayed, rather than
2418                            direct. Moreover, if MSG_DONTROUTE is set,
2419                            we send packet, ignoring both routing tables
2420                            and ifaddr state. --ANK
2421
2422
2423                            We could make it even if oif is unknown,
2424                            likely IPv6, but we do not.
2425                          */
2426
2427                         if (fl4->saddr == 0)
2428                                 fl4->saddr = inet_select_addr(dev_out, 0,
2429                                                               RT_SCOPE_LINK);
2430                         res->type = RTN_UNICAST;
2431                         goto make_route;
2432                 }
2433                 rth = ERR_PTR(err);
2434                 goto out;
2435         }
2436
2437         if (res->type == RTN_LOCAL) {
2438                 if (!fl4->saddr) {
2439                         if (res->fi->fib_prefsrc)
2440                                 fl4->saddr = res->fi->fib_prefsrc;
2441                         else
2442                                 fl4->saddr = fl4->daddr;
2443                 }
2444
2445                 /* L3 master device is the loopback for that domain */
2446                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2447                         net->loopback_dev;
2448                 fl4->flowi4_oif = dev_out->ifindex;
2449                 flags |= RTCF_LOCAL;
2450                 goto make_route;
2451         }
2452
2453         fib_select_path(net, res, fl4, skb);
2454
2455         dev_out = FIB_RES_DEV(*res);
2456         fl4->flowi4_oif = dev_out->ifindex;
2457
2458
             /* orig_oif is the caller's flowi4_oif as it was on entry. */
2459 make_route:
2460         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2461
2462 out:
2463         return rth;
2464 }
2465
2466 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2467 {
2468         return NULL;
2469 }
2470
2471 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2472 {
2473         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2474
2475         return mtu ? : dst->dev->mtu;
2476 }
2477
2478 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2479                                           struct sk_buff *skb, u32 mtu)
2480 {
2481 }
2482
/*
 * ->redirect() callback of ipv4_dst_blackhole_ops: intentionally a no-op,
 * none of the arguments is looked at.
 */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
				       struct sock *sk,
				       struct sk_buff *skb)
{
	/* nothing to do */
}
2487
2488 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2489                                           unsigned long old)
2490 {
2491         return NULL;
2492 }
2493
2494 static struct dst_ops ipv4_dst_blackhole_ops = {
2495         .family                 =       AF_INET,
2496         .check                  =       ipv4_blackhole_dst_check,
2497         .mtu                    =       ipv4_blackhole_mtu,
2498         .default_advmss         =       ipv4_default_advmss,
2499         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2500         .redirect               =       ipv4_rt_blackhole_redirect,
2501         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2502         .neigh_lookup           =       ipv4_neigh_lookup,
2503 };
2504
2505 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2506 {
2507         struct rtable *ort = (struct rtable *) dst_orig;
2508         struct rtable *rt;
2509
2510         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2511         if (rt) {
2512                 struct dst_entry *new = &rt->dst;
2513
2514                 new->__use = 1;
2515                 new->input = dst_discard;
2516                 new->output = dst_discard_out;
2517
2518                 new->dev = net->loopback_dev;
2519                 if (new->dev)
2520                         dev_hold(new->dev);
2521
2522                 rt->rt_is_input = ort->rt_is_input;
2523                 rt->rt_iif = ort->rt_iif;
2524                 rt->rt_pmtu = ort->rt_pmtu;
2525
2526                 rt->rt_genid = rt_genid_ipv4(net);
2527                 rt->rt_flags = ort->rt_flags;
2528                 rt->rt_type = ort->rt_type;
2529                 rt->rt_gateway = ort->rt_gateway;
2530                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2531
2532                 INIT_LIST_HEAD(&rt->rt_uncached);
2533         }
2534
2535         dst_release(dst_orig);
2536
2537         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2538 }
2539
2540 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2541                                     const struct sock *sk)
2542 {
2543         struct rtable *rt = __ip_route_output_key(net, flp4);
2544
2545         if (IS_ERR(rt))
2546                 return rt;
2547
2548         if (flp4->flowi4_proto)
2549                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2550                                                         flowi4_to_flowi(flp4),
2551                                                         sk, 0);
2552
2553         return rt;
2554 }
2555 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2556
2557 /* called with rcu_read_lock held */
2558 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2559                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2560                         u32 seq)
2561 {
2562         struct rtable *rt = skb_rtable(skb);
2563         struct rtmsg *r;
2564         struct nlmsghdr *nlh;
2565         unsigned long expires = 0;
2566         u32 error;
2567         u32 metrics[RTAX_MAX];
2568
2569         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2570         if (!nlh)
2571                 return -EMSGSIZE;
2572
2573         r = nlmsg_data(nlh);
2574         r->rtm_family    = AF_INET;
2575         r->rtm_dst_len  = 32;
2576         r->rtm_src_len  = 0;
2577         r->rtm_tos      = fl4->flowi4_tos;
2578         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2579         if (nla_put_u32(skb, RTA_TABLE, table_id))
2580                 goto nla_put_failure;
2581         r->rtm_type     = rt->rt_type;
2582         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2583         r->rtm_protocol = RTPROT_UNSPEC;
2584         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2585         if (rt->rt_flags & RTCF_NOTIFY)
2586                 r->rtm_flags |= RTM_F_NOTIFY;
2587         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2588                 r->rtm_flags |= RTCF_DOREDIRECT;
2589
2590         if (nla_put_in_addr(skb, RTA_DST, dst))
2591                 goto nla_put_failure;
2592         if (src) {
2593                 r->rtm_src_len = 32;
2594                 if (nla_put_in_addr(skb, RTA_SRC, src))
2595                         goto nla_put_failure;
2596         }
2597         if (rt->dst.dev &&
2598             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2599                 goto nla_put_failure;
2600 #ifdef CONFIG_IP_ROUTE_CLASSID
2601         if (rt->dst.tclassid &&
2602             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2603                 goto nla_put_failure;
2604 #endif
2605         if (!rt_is_input_route(rt) &&
2606             fl4->saddr != src) {
2607                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2608                         goto nla_put_failure;
2609         }
2610         if (rt->rt_uses_gateway &&
2611             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2612                 goto nla_put_failure;
2613
2614         expires = rt->dst.expires;
2615         if (expires) {
2616                 unsigned long now = jiffies;
2617
2618                 if (time_before(now, expires))
2619                         expires -= now;
2620                 else
2621                         expires = 0;
2622         }
2623
2624         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2625         if (rt->rt_pmtu && expires)
2626                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2627         if (rtnetlink_put_metrics(skb, metrics) < 0)
2628                 goto nla_put_failure;
2629
2630         if (fl4->flowi4_mark &&
2631             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2632                 goto nla_put_failure;
2633
2634         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2635             nla_put_u32(skb, RTA_UID,
2636                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2637                 goto nla_put_failure;
2638
2639         error = rt->dst.error;
2640
2641         if (rt_is_input_route(rt)) {
2642 #ifdef CONFIG_IP_MROUTE
2643                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2644                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2645                         int err = ipmr_get_route(net, skb,
2646                                                  fl4->saddr, fl4->daddr,
2647                                                  r, portid);
2648
2649                         if (err <= 0) {
2650                                 if (err == 0)
2651                                         return 0;
2652                                 goto nla_put_failure;
2653                         }
2654                 } else
2655 #endif
2656                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2657                                 goto nla_put_failure;
2658         }
2659
2660         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2661                 goto nla_put_failure;
2662
2663         nlmsg_end(skb, nlh);
2664         return 0;
2665
2666 nla_put_failure:
2667         nlmsg_cancel(skb, nlh);
2668         return -EMSGSIZE;
2669 }
2670
2671 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2672                              struct netlink_ext_ack *extack)
2673 {
2674         struct net *net = sock_net(in_skb->sk);
2675         struct rtmsg *rtm;
2676         struct nlattr *tb[RTA_MAX+1];
2677         struct fib_result res = {};
2678         struct rtable *rt = NULL;
2679         struct flowi4 fl4;
2680         __be32 dst = 0;
2681         __be32 src = 0;
2682         u32 iif;
2683         int err;
2684         int mark;
2685         struct sk_buff *skb;
2686         u32 table_id = RT_TABLE_MAIN;
2687         kuid_t uid;
2688
2689         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2690                           extack);
2691         if (err < 0)
2692                 goto errout;
2693
2694         rtm = nlmsg_data(nlh);
2695
2696         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2697         if (!skb) {
2698                 err = -ENOBUFS;
2699                 goto errout;
2700         }
2701
2702         /* Reserve room for dummy headers, this skb can pass
2703            through good chunk of routing engine.
2704          */
2705         skb_reset_mac_header(skb);
2706         skb_reset_network_header(skb);
2707
2708         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2709         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2710         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2711         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2712         if (tb[RTA_UID])
2713                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2714         else
2715                 uid = (iif ? INVALID_UID : current_uid());
2716
2717         /* Bugfix: need to give ip_route_input enough of an IP header to
2718          * not gag.
2719          */
2720         ip_hdr(skb)->protocol = IPPROTO_UDP;
2721         ip_hdr(skb)->saddr = src;
2722         ip_hdr(skb)->daddr = dst;
2723
2724         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2725
2726         memset(&fl4, 0, sizeof(fl4));
2727         fl4.daddr = dst;
2728         fl4.saddr = src;
2729         fl4.flowi4_tos = rtm->rtm_tos;
2730         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2731         fl4.flowi4_mark = mark;
2732         fl4.flowi4_uid = uid;
2733
2734         rcu_read_lock();
2735
2736         if (iif) {
2737                 struct net_device *dev;
2738
2739                 dev = dev_get_by_index_rcu(net, iif);
2740                 if (!dev) {
2741                         err = -ENODEV;
2742                         goto errout_free;
2743                 }
2744
2745                 skb->protocol   = htons(ETH_P_IP);
2746                 skb->dev        = dev;
2747                 skb->mark       = mark;
2748                 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2749                                          dev, &res);
2750
2751                 rt = skb_rtable(skb);
2752                 if (err == 0 && rt->dst.error)
2753                         err = -rt->dst.error;
2754         } else {
2755                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2756                 err = 0;
2757                 if (IS_ERR(rt))
2758                         err = PTR_ERR(rt);
2759                 else
2760                         skb_dst_set(skb, &rt->dst);
2761         }
2762
2763         if (err)
2764                 goto errout_free;
2765
2766         if (rtm->rtm_flags & RTM_F_NOTIFY)
2767                 rt->rt_flags |= RTCF_NOTIFY;
2768
2769         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2770                 table_id = rt->rt_table_id;
2771
2772         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2773                 if (!res.fi) {
2774                         err = fib_props[res.type].error;
2775                         if (!err)
2776                                 err = -EHOSTUNREACH;
2777                         goto errout_free;
2778                 }
2779                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2780                                     nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2781                                     rt->rt_type, res.prefix, res.prefixlen,
2782                                     fl4.flowi4_tos, res.fi, 0);
2783         } else {
2784                 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2785                                    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2786         }
2787         if (err < 0)
2788                 goto errout_free;
2789
2790         rcu_read_unlock();
2791
2792         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2793 errout:
2794         return err;
2795
2796 errout_free:
2797         rcu_read_unlock();
2798         kfree_skb(skb);
2799         goto errout;
2800 }
2801
2802 void ip_rt_multicast_event(struct in_device *in_dev)
2803 {
2804         rt_cache_flush(dev_net(in_dev->dev));
2805 }
2806
2807 #ifdef CONFIG_SYSCTL
2808 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2809 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2810 static int ip_rt_gc_elasticity __read_mostly    = 8;
2811
2812 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2813                                         void __user *buffer,
2814                                         size_t *lenp, loff_t *ppos)
2815 {
2816         struct net *net = (struct net *)__ctl->extra1;
2817
2818         if (write) {
2819                 rt_cache_flush(net);
2820                 fnhe_genid_bump(net);
2821                 return 0;
2822         }
2823
2824         return -EINVAL;
2825 }
2826
2827 static struct ctl_table ipv4_route_table[] = {
2828         {
2829                 .procname       = "gc_thresh",
2830                 .data           = &ipv4_dst_ops.gc_thresh,
2831                 .maxlen         = sizeof(int),
2832                 .mode           = 0644,
2833                 .proc_handler   = proc_dointvec,
2834         },
2835         {
2836                 .procname       = "max_size",
2837                 .data           = &ip_rt_max_size,
2838                 .maxlen         = sizeof(int),
2839                 .mode           = 0644,
2840                 .proc_handler   = proc_dointvec,
2841         },
2842         {
2843                 /*  Deprecated. Use gc_min_interval_ms */
2844
2845                 .procname       = "gc_min_interval",
2846                 .data           = &ip_rt_gc_min_interval,
2847                 .maxlen         = sizeof(int),
2848                 .mode           = 0644,
2849                 .proc_handler   = proc_dointvec_jiffies,
2850         },
2851         {
2852                 .procname       = "gc_min_interval_ms",
2853                 .data           = &ip_rt_gc_min_interval,
2854                 .maxlen         = sizeof(int),
2855                 .mode           = 0644,
2856                 .proc_handler   = proc_dointvec_ms_jiffies,
2857         },
2858         {
2859                 .procname       = "gc_timeout",
2860                 .data           = &ip_rt_gc_timeout,
2861                 .maxlen         = sizeof(int),
2862                 .mode           = 0644,
2863                 .proc_handler   = proc_dointvec_jiffies,
2864         },
2865         {
2866                 .procname       = "gc_interval",
2867                 .data           = &ip_rt_gc_interval,
2868                 .maxlen         = sizeof(int),
2869                 .mode           = 0644,
2870                 .proc_handler   = proc_dointvec_jiffies,
2871         },
2872         {
2873                 .procname       = "redirect_load",
2874                 .data           = &ip_rt_redirect_load,
2875                 .maxlen         = sizeof(int),
2876                 .mode           = 0644,
2877                 .proc_handler   = proc_dointvec,
2878         },
2879         {
2880                 .procname       = "redirect_number",
2881                 .data           = &ip_rt_redirect_number,
2882                 .maxlen         = sizeof(int),
2883                 .mode           = 0644,
2884                 .proc_handler   = proc_dointvec,
2885         },
2886         {
2887                 .procname       = "redirect_silence",
2888                 .data           = &ip_rt_redirect_silence,
2889                 .maxlen         = sizeof(int),
2890                 .mode           = 0644,
2891                 .proc_handler   = proc_dointvec,
2892         },
2893         {
2894                 .procname       = "error_cost",
2895                 .data           = &ip_rt_error_cost,
2896                 .maxlen         = sizeof(int),
2897                 .mode           = 0644,
2898                 .proc_handler   = proc_dointvec,
2899         },
2900         {
2901                 .procname       = "error_burst",
2902                 .data           = &ip_rt_error_burst,
2903                 .maxlen         = sizeof(int),
2904                 .mode           = 0644,
2905                 .proc_handler   = proc_dointvec,
2906         },
2907         {
2908                 .procname       = "gc_elasticity",
2909                 .data           = &ip_rt_gc_elasticity,
2910                 .maxlen         = sizeof(int),
2911                 .mode           = 0644,
2912                 .proc_handler   = proc_dointvec,
2913         },
2914         {
2915                 .procname       = "mtu_expires",
2916                 .data           = &ip_rt_mtu_expires,
2917                 .maxlen         = sizeof(int),
2918                 .mode           = 0644,
2919                 .proc_handler   = proc_dointvec_jiffies,
2920         },
2921         {
2922                 .procname       = "min_pmtu",
2923                 .data           = &ip_rt_min_pmtu,
2924                 .maxlen         = sizeof(int),
2925                 .mode           = 0644,
2926                 .proc_handler   = proc_dointvec,
2927         },
2928         {
2929                 .procname       = "min_adv_mss",
2930                 .data           = &ip_rt_min_advmss,
2931                 .maxlen         = sizeof(int),
2932                 .mode           = 0644,
2933                 .proc_handler   = proc_dointvec,
2934         },
2935         { }
2936 };
2937
2938 static struct ctl_table ipv4_route_flush_table[] = {
2939         {
2940                 .procname       = "flush",
2941                 .maxlen         = sizeof(int),
2942                 .mode           = 0200,
2943                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2944         },
2945         { },
2946 };
2947
2948 static __net_init int sysctl_route_net_init(struct net *net)
2949 {
2950         struct ctl_table *tbl;
2951
2952         tbl = ipv4_route_flush_table;
2953         if (!net_eq(net, &init_net)) {
2954                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2955                 if (!tbl)
2956                         goto err_dup;
2957
2958                 /* Don't export sysctls to unprivileged users */
2959                 if (net->user_ns != &init_user_ns)
2960                         tbl[0].procname = NULL;
2961         }
2962         tbl[0].extra1 = net;
2963
2964         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2965         if (!net->ipv4.route_hdr)
2966                 goto err_reg;
2967         return 0;
2968
2969 err_reg:
2970         if (tbl != ipv4_route_flush_table)
2971                 kfree(tbl);
2972 err_dup:
2973         return -ENOMEM;
2974 }
2975
2976 static __net_exit void sysctl_route_net_exit(struct net *net)
2977 {
2978         struct ctl_table *tbl;
2979
2980         tbl = net->ipv4.route_hdr->ctl_table_arg;
2981         unregister_net_sysctl_table(net->ipv4.route_hdr);
2982         BUG_ON(tbl == ipv4_route_flush_table);
2983         kfree(tbl);
2984 }
2985
2986 static __net_initdata struct pernet_operations sysctl_route_ops = {
2987         .init = sysctl_route_net_init,
2988         .exit = sysctl_route_net_exit,
2989 };
2990 #endif
2991
2992 static __net_init int rt_genid_init(struct net *net)
2993 {
2994         atomic_set(&net->ipv4.rt_genid, 0);
2995         atomic_set(&net->fnhe_genid, 0);
2996         atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
2997         return 0;
2998 }
2999
3000 static __net_initdata struct pernet_operations rt_genid_ops = {
3001         .init = rt_genid_init,
3002 };
3003
3004 static int __net_init ipv4_inetpeer_init(struct net *net)
3005 {
3006         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3007
3008         if (!bp)
3009                 return -ENOMEM;
3010         inet_peer_base_init(bp);
3011         net->ipv4.peers = bp;
3012         return 0;
3013 }
3014
3015 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3016 {
3017         struct inet_peer_base *bp = net->ipv4.peers;
3018
3019         net->ipv4.peers = NULL;
3020         inetpeer_invalidate_tree(bp);
3021         kfree(bp);
3022 }
3023
3024 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3025         .init   =       ipv4_inetpeer_init,
3026         .exit   =       ipv4_inetpeer_exit,
3027 };
3028
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-cpu accounting area, only built with CONFIG_IP_ROUTE_CLASSID.
 * ip_rt_init() allocates room for 256 struct ip_rt_acct entries per cpu.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3032
3033 int __init ip_rt_init(void)
3034 {
3035         int rc = 0;
3036         int cpu;
3037
3038         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3039         if (!ip_idents)
3040                 panic("IP: failed to allocate ip_idents\n");
3041
3042         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3043
3044         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3045         if (!ip_tstamps)
3046                 panic("IP: failed to allocate ip_tstamps\n");
3047
3048         for_each_possible_cpu(cpu) {
3049                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3050
3051                 INIT_LIST_HEAD(&ul->head);
3052                 spin_lock_init(&ul->lock);
3053         }
3054 #ifdef CONFIG_IP_ROUTE_CLASSID
3055         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3056         if (!ip_rt_acct)
3057                 panic("IP: failed to allocate ip_rt_acct\n");
3058 #endif
3059
3060         ipv4_dst_ops.kmem_cachep =
3061                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3062                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3063
3064         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3065
3066         if (dst_entries_init(&ipv4_dst_ops) < 0)
3067                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3068
3069         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3070                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3071
3072         ipv4_dst_ops.gc_thresh = ~0;
3073         ip_rt_max_size = INT_MAX;
3074
3075         devinet_init();
3076         ip_fib_init();
3077
3078         if (ip_rt_proc_init())
3079                 pr_err("Unable to create route proc files\n");
3080 #ifdef CONFIG_XFRM
3081         xfrm_init();
3082         xfrm4_init();
3083 #endif
3084         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3085
3086 #ifdef CONFIG_SYSCTL
3087         register_pernet_subsys(&sysctl_route_ops);
3088 #endif
3089         register_pernet_subsys(&rt_genid_ops);
3090         register_pernet_subsys(&ipv4_inetpeer_ops);
3091         return rc;
3092 }
3093
3094 #ifdef CONFIG_SYSCTL
3095 /*
3096  * We really need to sanitize the damn ipv4 init order, then all
3097  * this nonsense will go away.
3098  */
3099 void __init ip_static_sysctl_init(void)
3100 {
3101         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3102 }
3103 #endif