net/ipv4/route.c (mirror_ubuntu-bionic-kernel.git, at "net: ipv4: don't allow setting net.ipv4.route.min_pmtu below 68")
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
39 *
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <linux/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #include <linux/kmemleak.h>
112 #endif
113 #include <net/secure_seq.h>
114 #include <net/ip_tunnels.h>
115 #include <net/l3mdev.h>
116
117 #include "fib_lookup.h"
118
119 #define RT_FL_TOS(oldflp4) \
120 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
121
122 #define RT_GC_TIMEOUT (300*HZ)
123
124 static int ip_rt_max_size;
125 static int ip_rt_redirect_number __read_mostly = 9;
126 static int ip_rt_redirect_load __read_mostly = HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly = HZ;
129 static int ip_rt_error_burst __read_mostly = 5 * HZ;
130 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
131 static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly = 256;
133
134 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
135
136 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
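/* A quick sanity check on the defaults above: ip_rt_min_pmtu is
 * 512 + 20 (IPv4 header) + 20 (TCP header) = 552 bytes, and
 * ip_min_valid_pmtu is IPV4_MIN_MTU, the 68-byte minimum every IPv4
 * link must support (RFC 791). Learned PMTU values below
 * ip_rt_min_pmtu are clamped up to it in __ip_rt_update_pmtu() below.
 */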
137
138 /*
139 * Interface to generic destination cache.
140 */
141
142 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
143 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
144 static unsigned int ipv4_mtu(const struct dst_entry *dst);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void ipv4_link_failure(struct sk_buff *skb);
147 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
148 struct sk_buff *skb, u32 mtu);
149 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
150 struct sk_buff *skb);
151 static void ipv4_dst_destroy(struct dst_entry *dst);
152
153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154 {
155 WARN_ON(1);
156 return NULL;
157 }
158
159 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
160 struct sk_buff *skb,
161 const void *daddr);
162 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
163
164 static struct dst_ops ipv4_dst_ops = {
165 .family = AF_INET,
166 .check = ipv4_dst_check,
167 .default_advmss = ipv4_default_advmss,
168 .mtu = ipv4_mtu,
169 .cow_metrics = ipv4_cow_metrics,
170 .destroy = ipv4_dst_destroy,
171 .negative_advice = ipv4_negative_advice,
172 .link_failure = ipv4_link_failure,
173 .update_pmtu = ip_rt_update_pmtu,
174 .redirect = ip_do_redirect,
175 .local_out = __ip_local_out,
176 .neigh_lookup = ipv4_neigh_lookup,
177 .confirm_neigh = ipv4_confirm_neigh,
178 };
179
180 #define ECN_OR_COST(class) TC_PRIO_##class
181
182 const __u8 ip_tos2prio[16] = {
183 TC_PRIO_BESTEFFORT,
184 ECN_OR_COST(BESTEFFORT),
185 TC_PRIO_BESTEFFORT,
186 ECN_OR_COST(BESTEFFORT),
187 TC_PRIO_BULK,
188 ECN_OR_COST(BULK),
189 TC_PRIO_BULK,
190 ECN_OR_COST(BULK),
191 TC_PRIO_INTERACTIVE,
192 ECN_OR_COST(INTERACTIVE),
193 TC_PRIO_INTERACTIVE,
194 ECN_OR_COST(INTERACTIVE),
195 TC_PRIO_INTERACTIVE_BULK,
196 ECN_OR_COST(INTERACTIVE_BULK),
197 TC_PRIO_INTERACTIVE_BULK,
198 ECN_OR_COST(INTERACTIVE_BULK)
199 };
200 EXPORT_SYMBOL(ip_tos2prio);
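/* For orientation, the table above is indexed by the four TOS bits
 * shifted right by one; rt_tos2priority() in <net/route.h> does, in
 * effect:
 *
 *	skb->priority = ip_tos2prio[IPTOS_TOS(iph->tos) >> 1];
 *
 * so, as a rough sketch of the mapping, IPTOS_LOWDELAY (0x10) lands in
 * TC_PRIO_INTERACTIVE and IPTOS_THROUGHPUT (0x08) in TC_PRIO_BULK.
 */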
201
202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
203 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
204
205 #ifdef CONFIG_PROC_FS
206 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
207 {
208 if (*pos)
209 return NULL;
210 return SEQ_START_TOKEN;
211 }
212
213 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
214 {
215 ++*pos;
216 return NULL;
217 }
218
219 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
220 {
221 }
222
223 static int rt_cache_seq_show(struct seq_file *seq, void *v)
224 {
225 if (v == SEQ_START_TOKEN)
226 seq_printf(seq, "%-127s\n",
227 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
228 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
229 "HHUptod\tSpecDst");
230 return 0;
231 }
232
233 static const struct seq_operations rt_cache_seq_ops = {
234 .start = rt_cache_seq_start,
235 .next = rt_cache_seq_next,
236 .stop = rt_cache_seq_stop,
237 .show = rt_cache_seq_show,
238 };
239
240 static int rt_cache_seq_open(struct inode *inode, struct file *file)
241 {
242 return seq_open(file, &rt_cache_seq_ops);
243 }
244
245 static const struct file_operations rt_cache_seq_fops = {
246 .owner = THIS_MODULE,
247 .open = rt_cache_seq_open,
248 .read = seq_read,
249 .llseek = seq_lseek,
250 .release = seq_release,
251 };
252
253
254 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
255 {
256 int cpu;
257
258 if (*pos == 0)
259 return SEQ_START_TOKEN;
260
261 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
262 if (!cpu_possible(cpu))
263 continue;
264 *pos = cpu+1;
265 return &per_cpu(rt_cache_stat, cpu);
266 }
267 return NULL;
268 }
269
270 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
271 {
272 int cpu;
273
274 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
275 if (!cpu_possible(cpu))
276 continue;
277 *pos = cpu+1;
278 return &per_cpu(rt_cache_stat, cpu);
279 }
280 return NULL;
281
282 }
283
284 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
285 {
286
287 }
288
289 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
290 {
291 struct rt_cache_stat *st = v;
292
293 if (v == SEQ_START_TOKEN) {
294 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
295 return 0;
296 }
297
298 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
299 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
300 dst_entries_get_slow(&ipv4_dst_ops),
301 0, /* st->in_hit */
302 st->in_slow_tot,
303 st->in_slow_mc,
304 st->in_no_route,
305 st->in_brd,
306 st->in_martian_dst,
307 st->in_martian_src,
308
309 0, /* st->out_hit */
310 st->out_slow_tot,
311 st->out_slow_mc,
312
313 0, /* st->gc_total */
314 0, /* st->gc_ignored */
315 0, /* st->gc_goal_miss */
316 0, /* st->gc_dst_overflow */
317 0, /* st->in_hlist_search */
318 0 /* st->out_hlist_search */
319 );
320 return 0;
321 }
322
323 static const struct seq_operations rt_cpu_seq_ops = {
324 .start = rt_cpu_seq_start,
325 .next = rt_cpu_seq_next,
326 .stop = rt_cpu_seq_stop,
327 .show = rt_cpu_seq_show,
328 };
329
330
331 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
332 {
333 return seq_open(file, &rt_cpu_seq_ops);
334 }
335
336 static const struct file_operations rt_cpu_seq_fops = {
337 .owner = THIS_MODULE,
338 .open = rt_cpu_seq_open,
339 .read = seq_read,
340 .llseek = seq_lseek,
341 .release = seq_release,
342 };
343
344 #ifdef CONFIG_IP_ROUTE_CLASSID
345 static int rt_acct_proc_show(struct seq_file *m, void *v)
346 {
347 struct ip_rt_acct *dst, *src;
348 unsigned int i, j;
349
350 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
351 if (!dst)
352 return -ENOMEM;
353
354 for_each_possible_cpu(i) {
355 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
356 for (j = 0; j < 256; j++) {
357 dst[j].o_bytes += src[j].o_bytes;
358 dst[j].o_packets += src[j].o_packets;
359 dst[j].i_bytes += src[j].i_bytes;
360 dst[j].i_packets += src[j].i_packets;
361 }
362 }
363
364 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
365 kfree(dst);
366 return 0;
367 }
368
369 static int rt_acct_proc_open(struct inode *inode, struct file *file)
370 {
371 return single_open(file, rt_acct_proc_show, NULL);
372 }
373
374 static const struct file_operations rt_acct_proc_fops = {
375 .owner = THIS_MODULE,
376 .open = rt_acct_proc_open,
377 .read = seq_read,
378 .llseek = seq_lseek,
379 .release = single_release,
380 };
381 #endif
382
383 static int __net_init ip_rt_do_proc_init(struct net *net)
384 {
385 struct proc_dir_entry *pde;
386
387 pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
388 &rt_cache_seq_fops);
389 if (!pde)
390 goto err1;
391
392 pde = proc_create("rt_cache", S_IRUGO,
393 net->proc_net_stat, &rt_cpu_seq_fops);
394 if (!pde)
395 goto err2;
396
397 #ifdef CONFIG_IP_ROUTE_CLASSID
398 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
399 if (!pde)
400 goto err3;
401 #endif
402 return 0;
403
404 #ifdef CONFIG_IP_ROUTE_CLASSID
405 err3:
406 remove_proc_entry("rt_cache", net->proc_net_stat);
407 #endif
408 err2:
409 remove_proc_entry("rt_cache", net->proc_net);
410 err1:
411 return -ENOMEM;
412 }
413
414 static void __net_exit ip_rt_do_proc_exit(struct net *net)
415 {
416 remove_proc_entry("rt_cache", net->proc_net_stat);
417 remove_proc_entry("rt_cache", net->proc_net);
418 #ifdef CONFIG_IP_ROUTE_CLASSID
419 remove_proc_entry("rt_acct", net->proc_net);
420 #endif
421 }
422
423 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
424 .init = ip_rt_do_proc_init,
425 .exit = ip_rt_do_proc_exit,
426 };
427
428 static int __init ip_rt_proc_init(void)
429 {
430 return register_pernet_subsys(&ip_rt_proc_ops);
431 }
432
433 #else
434 static inline int ip_rt_proc_init(void)
435 {
436 return 0;
437 }
438 #endif /* CONFIG_PROC_FS */
439
440 static inline bool rt_is_expired(const struct rtable *rth)
441 {
442 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
443 }
444
445 void rt_cache_flush(struct net *net)
446 {
447 rt_genid_bump_ipv4(net);
448 }
449
450 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
451 struct sk_buff *skb,
452 const void *daddr)
453 {
454 struct net_device *dev = dst->dev;
455 const __be32 *pkey = daddr;
456 const struct rtable *rt;
457 struct neighbour *n;
458
459 rt = (const struct rtable *) dst;
460 if (rt->rt_gateway)
461 pkey = (const __be32 *) &rt->rt_gateway;
462 else if (skb)
463 pkey = &ip_hdr(skb)->daddr;
464
465 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
466 if (n)
467 return n;
468 return neigh_create(&arp_tbl, pkey, dev);
469 }
470
471 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
472 {
473 struct net_device *dev = dst->dev;
474 const __be32 *pkey = daddr;
475 const struct rtable *rt;
476
477 rt = (const struct rtable *)dst;
478 if (rt->rt_gateway)
479 pkey = (const __be32 *)&rt->rt_gateway;
480 else if (!daddr ||
481 (rt->rt_flags &
482 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
483 return;
484
485 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
486 }
487
488 #define IP_IDENTS_SZ 2048u
489
490 static atomic_t *ip_idents __read_mostly;
491 static u32 *ip_tstamps __read_mostly;
492
493 /* In order to protect privacy, we add a perturbation to identifiers
494 * if one generator is seldom used. This makes it hard for an attacker
495 * to infer how many packets were sent between two points in time.
496 */
497 u32 ip_idents_reserve(u32 hash, int segs)
498 {
499 u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
500 atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
501 u32 old = READ_ONCE(*p_tstamp);
502 u32 now = (u32)jiffies;
503 u32 new, delta = 0;
504
505 if (old != now && cmpxchg(p_tstamp, old, now) == old)
506 delta = prandom_u32_max(now - old);
507
508 /* Do not use atomic_add_return() as it makes UBSAN unhappy */
509 do {
510 old = (u32)atomic_read(p_id);
511 new = old + delta + segs;
512 } while (atomic_cmpxchg(p_id, old, new) != old);
513
514 return new - segs;
515 }
516 EXPORT_SYMBOL(ip_idents_reserve);
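/* As a rough illustration of the scheme above: if a given bucket was
 * last used 100 jiffies ago, the next reserved ID jumps ahead by a
 * random offset in [0, 100) in addition to the segment count, so an
 * observer cannot count the packets sent in between simply by diffing
 * consecutive IP IDs.
 */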
517
518 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
519 {
520 static u32 ip_idents_hashrnd __read_mostly;
521 u32 hash, id;
522
523 net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
524
525 hash = jhash_3words((__force u32)iph->daddr,
526 (__force u32)iph->saddr,
527 iph->protocol ^ net_hash_mix(net),
528 ip_idents_hashrnd);
529 id = ip_idents_reserve(hash, segs);
530 iph->id = htons(id);
531 }
532 EXPORT_SYMBOL(__ip_select_ident);
533
534 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
535 const struct sock *sk,
536 const struct iphdr *iph,
537 int oif, u8 tos,
538 u8 prot, u32 mark, int flow_flags)
539 {
540 if (sk) {
541 const struct inet_sock *inet = inet_sk(sk);
542
543 oif = sk->sk_bound_dev_if;
544 mark = sk->sk_mark;
545 tos = RT_CONN_FLAGS(sk);
546 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
547 }
548 flowi4_init_output(fl4, oif, mark, tos,
549 RT_SCOPE_UNIVERSE, prot,
550 flow_flags,
551 iph->daddr, iph->saddr, 0, 0,
552 sock_net_uid(net, sk));
553 }
554
555 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
556 const struct sock *sk)
557 {
558 const struct net *net = dev_net(skb->dev);
559 const struct iphdr *iph = ip_hdr(skb);
560 int oif = skb->dev->ifindex;
561 u8 tos = RT_TOS(iph->tos);
562 u8 prot = iph->protocol;
563 u32 mark = skb->mark;
564
565 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
566 }
567
568 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
569 {
570 const struct inet_sock *inet = inet_sk(sk);
571 const struct ip_options_rcu *inet_opt;
572 __be32 daddr = inet->inet_daddr;
573
574 rcu_read_lock();
575 inet_opt = rcu_dereference(inet->inet_opt);
576 if (inet_opt && inet_opt->opt.srr)
577 daddr = inet_opt->opt.faddr;
578 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
579 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
580 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
581 inet_sk_flowi_flags(sk),
582 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
583 rcu_read_unlock();
584 }
585
586 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
587 const struct sk_buff *skb)
588 {
589 if (skb)
590 build_skb_flow_key(fl4, skb, sk);
591 else
592 build_sk_flow_key(fl4, sk);
593 }
594
595 static DEFINE_SPINLOCK(fnhe_lock);
596
597 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
598 {
599 struct rtable *rt;
600
601 rt = rcu_dereference(fnhe->fnhe_rth_input);
602 if (rt) {
603 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
604 dst_dev_put(&rt->dst);
605 dst_release(&rt->dst);
606 }
607 rt = rcu_dereference(fnhe->fnhe_rth_output);
608 if (rt) {
609 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
610 dst_dev_put(&rt->dst);
611 dst_release(&rt->dst);
612 }
613 }
614
615 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
616 {
617 struct fib_nh_exception *fnhe, *oldest;
618
619 oldest = rcu_dereference(hash->chain);
620 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
621 fnhe = rcu_dereference(fnhe->fnhe_next)) {
622 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
623 oldest = fnhe;
624 }
625 fnhe_flush_routes(oldest);
626 return oldest;
627 }
628
629 static inline u32 fnhe_hashfun(__be32 daddr)
630 {
631 static u32 fnhe_hashrnd __read_mostly;
632 u32 hval;
633
634 net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
635 hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
636 return hash_32(hval, FNHE_HASH_SHIFT);
637 }
638
639 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
640 {
641 rt->rt_pmtu = fnhe->fnhe_pmtu;
642 rt->dst.expires = fnhe->fnhe_expires;
643
644 if (fnhe->fnhe_gw) {
645 rt->rt_flags |= RTCF_REDIRECTED;
646 rt->rt_gateway = fnhe->fnhe_gw;
647 rt->rt_uses_gateway = 1;
648 }
649 }
650
651 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
652 u32 pmtu, unsigned long expires)
653 {
654 struct fnhe_hash_bucket *hash;
655 struct fib_nh_exception *fnhe;
656 struct rtable *rt;
657 u32 genid, hval;
658 unsigned int i;
659 int depth;
660
661 genid = fnhe_genid(dev_net(nh->nh_dev));
662 hval = fnhe_hashfun(daddr);
663
664 spin_lock_bh(&fnhe_lock);
665
666 hash = rcu_dereference(nh->nh_exceptions);
667 if (!hash) {
668 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
669 if (!hash)
670 goto out_unlock;
671 rcu_assign_pointer(nh->nh_exceptions, hash);
672 }
673
674 hash += hval;
675
676 depth = 0;
677 for (fnhe = rcu_dereference(hash->chain); fnhe;
678 fnhe = rcu_dereference(fnhe->fnhe_next)) {
679 if (fnhe->fnhe_daddr == daddr)
680 break;
681 depth++;
682 }
683
684 if (fnhe) {
685 if (fnhe->fnhe_genid != genid)
686 fnhe->fnhe_genid = genid;
687 if (gw)
688 fnhe->fnhe_gw = gw;
689 if (pmtu)
690 fnhe->fnhe_pmtu = pmtu;
691 fnhe->fnhe_expires = max(1UL, expires);
692 /* Update all cached dsts too */
693 rt = rcu_dereference(fnhe->fnhe_rth_input);
694 if (rt)
695 fill_route_from_fnhe(rt, fnhe);
696 rt = rcu_dereference(fnhe->fnhe_rth_output);
697 if (rt)
698 fill_route_from_fnhe(rt, fnhe);
699 } else {
700 if (depth > FNHE_RECLAIM_DEPTH)
701 fnhe = fnhe_oldest(hash);
702 else {
703 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
704 if (!fnhe)
705 goto out_unlock;
706
707 fnhe->fnhe_next = hash->chain;
708 rcu_assign_pointer(hash->chain, fnhe);
709 }
710 fnhe->fnhe_genid = genid;
711 fnhe->fnhe_daddr = daddr;
712 fnhe->fnhe_gw = gw;
713 fnhe->fnhe_pmtu = pmtu;
714 fnhe->fnhe_expires = expires;
715
716 /* Exception created; mark the cached routes for the nexthop
717 * stale, so anyone caching it rechecks if this exception
718 * applies to them.
719 */
720 rt = rcu_dereference(nh->nh_rth_input);
721 if (rt)
722 rt->dst.obsolete = DST_OBSOLETE_KILL;
723
724 for_each_possible_cpu(i) {
725 struct rtable __rcu **prt;
726 prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
727 rt = rcu_dereference(*prt);
728 if (rt)
729 rt->dst.obsolete = DST_OBSOLETE_KILL;
730 }
731 }
732
733 fnhe->fnhe_stamp = jiffies;
734
735 out_unlock:
736 spin_unlock_bh(&fnhe_lock);
737 }
738
739 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
740 bool kill_route)
741 {
742 __be32 new_gw = icmp_hdr(skb)->un.gateway;
743 __be32 old_gw = ip_hdr(skb)->saddr;
744 struct net_device *dev = skb->dev;
745 struct in_device *in_dev;
746 struct fib_result res;
747 struct neighbour *n;
748 struct net *net;
749
750 switch (icmp_hdr(skb)->code & 7) {
751 case ICMP_REDIR_NET:
752 case ICMP_REDIR_NETTOS:
753 case ICMP_REDIR_HOST:
754 case ICMP_REDIR_HOSTTOS:
755 break;
756
757 default:
758 return;
759 }
760
761 if (rt->rt_gateway != old_gw)
762 return;
763
764 in_dev = __in_dev_get_rcu(dev);
765 if (!in_dev)
766 return;
767
768 net = dev_net(dev);
769 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
770 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
771 ipv4_is_zeronet(new_gw))
772 goto reject_redirect;
773
774 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
775 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
776 goto reject_redirect;
777 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
778 goto reject_redirect;
779 } else {
780 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
781 goto reject_redirect;
782 }
783
784 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
785 if (!n)
786 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
787 if (!IS_ERR(n)) {
788 if (!(n->nud_state & NUD_VALID)) {
789 neigh_event_send(n, NULL);
790 } else {
791 if (fib_lookup(net, fl4, &res, 0) == 0) {
792 struct fib_nh *nh = &FIB_RES_NH(res);
793
794 update_or_create_fnhe(nh, fl4->daddr, new_gw,
795 0, jiffies + ip_rt_gc_timeout);
796 }
797 if (kill_route)
798 rt->dst.obsolete = DST_OBSOLETE_KILL;
799 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
800 }
801 neigh_release(n);
802 }
803 return;
804
805 reject_redirect:
806 #ifdef CONFIG_IP_ROUTE_VERBOSE
807 if (IN_DEV_LOG_MARTIANS(in_dev)) {
808 const struct iphdr *iph = (const struct iphdr *) skb->data;
809 __be32 daddr = iph->daddr;
810 __be32 saddr = iph->saddr;
811
812 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
813 " Advised path = %pI4 -> %pI4\n",
814 &old_gw, dev->name, &new_gw,
815 &saddr, &daddr);
816 }
817 #endif
818 ;
819 }
820
821 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
822 {
823 struct rtable *rt;
824 struct flowi4 fl4;
825 const struct iphdr *iph = (const struct iphdr *) skb->data;
826 struct net *net = dev_net(skb->dev);
827 int oif = skb->dev->ifindex;
828 u8 tos = RT_TOS(iph->tos);
829 u8 prot = iph->protocol;
830 u32 mark = skb->mark;
831
832 rt = (struct rtable *) dst;
833
834 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
835 __ip_do_redirect(rt, skb, &fl4, true);
836 }
837
838 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
839 {
840 struct rtable *rt = (struct rtable *)dst;
841 struct dst_entry *ret = dst;
842
843 if (rt) {
844 if (dst->obsolete > 0) {
845 ip_rt_put(rt);
846 ret = NULL;
847 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
848 rt->dst.expires) {
849 ip_rt_put(rt);
850 ret = NULL;
851 }
852 }
853 return ret;
854 }
855
856 /*
857 * Algorithm:
858 * 1. The first ip_rt_redirect_number redirects are sent
859 * with exponential backoff, then we stop sending them at all,
860 * assuming that the host ignores our redirects.
861 * 2. If we did not see packets requiring redirects
862 * during ip_rt_redirect_silence, we assume that the host
863 * forgot the redirected route and we start sending redirects again.
864 *
865 * This algorithm is much cheaper and more intelligent than dumb load limiting
866 * in icmp.c.
867 *
868 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
869 * and "frag. need" (breaks PMTU discovery) in icmp.c.
870 */
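/* A rough worked example with the defaults above (HZ-dependent):
 * ip_rt_redirect_load is HZ/50 (20 ms), so the first redirect goes out
 * immediately, and redirect n (2 <= n <= 9) is sent only once
 * 20 ms << (n - 1) has elapsed since the previous one: 40 ms, 80 ms,
 * ... about 5.1 s before the 9th. After ip_rt_redirect_number (9)
 * redirects we go silent, and only start over once
 * ip_rt_redirect_silence = 20 ms << 10, roughly 20.5 s, has passed
 * without a redirect-triggering packet.
 */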
871
872 void ip_rt_send_redirect(struct sk_buff *skb)
873 {
874 struct rtable *rt = skb_rtable(skb);
875 struct in_device *in_dev;
876 struct inet_peer *peer;
877 struct net *net;
878 int log_martians;
879 int vif;
880
881 rcu_read_lock();
882 in_dev = __in_dev_get_rcu(rt->dst.dev);
883 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
884 rcu_read_unlock();
885 return;
886 }
887 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
888 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
889 rcu_read_unlock();
890
891 net = dev_net(rt->dst.dev);
892 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
893 if (!peer) {
894 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
895 rt_nexthop(rt, ip_hdr(skb)->daddr));
896 return;
897 }
898
899 /* No redirected packets during ip_rt_redirect_silence;
900 * reset the algorithm.
901 */
902 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
903 peer->rate_tokens = 0;
904
905 /* Too many ignored redirects; do not send anything.
906 * Just update peer->rate_last to record the last redirect-worthy packet seen.
907 */
908 if (peer->rate_tokens >= ip_rt_redirect_number) {
909 peer->rate_last = jiffies;
910 goto out_put_peer;
911 }
912
913 /* Check for load limit; set rate_last to the latest sent
914 * redirect.
915 */
916 if (peer->rate_tokens == 0 ||
917 time_after(jiffies,
918 (peer->rate_last +
919 (ip_rt_redirect_load << peer->rate_tokens)))) {
920 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
921
922 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
923 peer->rate_last = jiffies;
924 ++peer->rate_tokens;
925 #ifdef CONFIG_IP_ROUTE_VERBOSE
926 if (log_martians &&
927 peer->rate_tokens == ip_rt_redirect_number)
928 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
929 &ip_hdr(skb)->saddr, inet_iif(skb),
930 &ip_hdr(skb)->daddr, &gw);
931 #endif
932 }
933 out_put_peer:
934 inet_putpeer(peer);
935 }
936
937 static int ip_error(struct sk_buff *skb)
938 {
939 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
940 struct rtable *rt = skb_rtable(skb);
941 struct inet_peer *peer;
942 unsigned long now;
943 struct net *net;
944 bool send;
945 int code;
946
947 /* IP on this device is disabled. */
948 if (!in_dev)
949 goto out;
950
951 net = dev_net(rt->dst.dev);
952 if (!IN_DEV_FORWARD(in_dev)) {
953 switch (rt->dst.error) {
954 case EHOSTUNREACH:
955 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
956 break;
957
958 case ENETUNREACH:
959 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
960 break;
961 }
962 goto out;
963 }
964
965 switch (rt->dst.error) {
966 case EINVAL:
967 default:
968 goto out;
969 case EHOSTUNREACH:
970 code = ICMP_HOST_UNREACH;
971 break;
972 case ENETUNREACH:
973 code = ICMP_NET_UNREACH;
974 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
975 break;
976 case EACCES:
977 code = ICMP_PKT_FILTERED;
978 break;
979 }
980
981 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
982 l3mdev_master_ifindex(skb->dev), 1);
983
984 send = true;
985 if (peer) {
986 now = jiffies;
987 peer->rate_tokens += now - peer->rate_last;
988 if (peer->rate_tokens > ip_rt_error_burst)
989 peer->rate_tokens = ip_rt_error_burst;
990 peer->rate_last = now;
991 if (peer->rate_tokens >= ip_rt_error_cost)
992 peer->rate_tokens -= ip_rt_error_cost;
993 else
994 send = false;
995 inet_putpeer(peer);
996 }
997 if (send)
998 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
999
1000 out: kfree_skb(skb);
1001 return 0;
1002 }
1003
1004 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1005 {
1006 struct dst_entry *dst = &rt->dst;
1007 struct fib_result res;
1008
1009 if (dst_metric_locked(dst, RTAX_MTU))
1010 return;
1011
1012 if (ipv4_mtu(dst) < mtu)
1013 return;
1014
1015 if (mtu < ip_rt_min_pmtu)
1016 mtu = ip_rt_min_pmtu;
1017
1018 if (rt->rt_pmtu == mtu &&
1019 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1020 return;
1021
1022 rcu_read_lock();
1023 if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1024 struct fib_nh *nh = &FIB_RES_NH(res);
1025
1026 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
1027 jiffies + ip_rt_mtu_expires);
1028 }
1029 rcu_read_unlock();
1030 }
1031
1032 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1033 struct sk_buff *skb, u32 mtu)
1034 {
1035 struct rtable *rt = (struct rtable *) dst;
1036 struct flowi4 fl4;
1037
1038 ip_rt_build_flow_key(&fl4, sk, skb);
1039 __ip_rt_update_pmtu(rt, &fl4, mtu);
1040 }
1041
1042 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1043 int oif, u32 mark, u8 protocol, int flow_flags)
1044 {
1045 const struct iphdr *iph = (const struct iphdr *) skb->data;
1046 struct flowi4 fl4;
1047 struct rtable *rt;
1048
1049 if (!mark)
1050 mark = IP4_REPLY_MARK(net, skb->mark);
1051
1052 __build_flow_key(net, &fl4, NULL, iph, oif,
1053 RT_TOS(iph->tos), protocol, mark, flow_flags);
1054 rt = __ip_route_output_key(net, &fl4);
1055 if (!IS_ERR(rt)) {
1056 __ip_rt_update_pmtu(rt, &fl4, mtu);
1057 ip_rt_put(rt);
1058 }
1059 }
1060 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1061
1062 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1063 {
1064 const struct iphdr *iph = (const struct iphdr *) skb->data;
1065 struct flowi4 fl4;
1066 struct rtable *rt;
1067
1068 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1069
1070 if (!fl4.flowi4_mark)
1071 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1072
1073 rt = __ip_route_output_key(sock_net(sk), &fl4);
1074 if (!IS_ERR(rt)) {
1075 __ip_rt_update_pmtu(rt, &fl4, mtu);
1076 ip_rt_put(rt);
1077 }
1078 }
1079
1080 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1081 {
1082 const struct iphdr *iph = (const struct iphdr *) skb->data;
1083 struct flowi4 fl4;
1084 struct rtable *rt;
1085 struct dst_entry *odst = NULL;
1086 bool new = false;
1087 struct net *net = sock_net(sk);
1088
1089 bh_lock_sock(sk);
1090
1091 if (!ip_sk_accept_pmtu(sk))
1092 goto out;
1093
1094 odst = sk_dst_get(sk);
1095
1096 if (sock_owned_by_user(sk) || !odst) {
1097 __ipv4_sk_update_pmtu(skb, sk, mtu);
1098 goto out;
1099 }
1100
1101 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1102
1103 rt = (struct rtable *)odst;
1104 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1105 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1106 if (IS_ERR(rt))
1107 goto out;
1108
1109 new = true;
1110 }
1111
1112 __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1113
1114 if (!dst_check(&rt->dst, 0)) {
1115 if (new)
1116 dst_release(&rt->dst);
1117
1118 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1119 if (IS_ERR(rt))
1120 goto out;
1121
1122 new = true;
1123 }
1124
1125 if (new)
1126 sk_dst_set(sk, &rt->dst);
1127
1128 out:
1129 bh_unlock_sock(sk);
1130 dst_release(odst);
1131 }
1132 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1133
1134 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1135 int oif, u32 mark, u8 protocol, int flow_flags)
1136 {
1137 const struct iphdr *iph = (const struct iphdr *) skb->data;
1138 struct flowi4 fl4;
1139 struct rtable *rt;
1140
1141 __build_flow_key(net, &fl4, NULL, iph, oif,
1142 RT_TOS(iph->tos), protocol, mark, flow_flags);
1143 rt = __ip_route_output_key(net, &fl4);
1144 if (!IS_ERR(rt)) {
1145 __ip_do_redirect(rt, skb, &fl4, false);
1146 ip_rt_put(rt);
1147 }
1148 }
1149 EXPORT_SYMBOL_GPL(ipv4_redirect);
1150
1151 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1152 {
1153 const struct iphdr *iph = (const struct iphdr *) skb->data;
1154 struct flowi4 fl4;
1155 struct rtable *rt;
1156 struct net *net = sock_net(sk);
1157
1158 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1159 rt = __ip_route_output_key(net, &fl4);
1160 if (!IS_ERR(rt)) {
1161 __ip_do_redirect(rt, skb, &fl4, false);
1162 ip_rt_put(rt);
1163 }
1164 }
1165 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1166
1167 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1168 {
1169 struct rtable *rt = (struct rtable *) dst;
1170
1171 /* All IPV4 dsts are created with ->obsolete set to the value
1172 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1173 * into this function always.
1174 *
1175 * When a PMTU/redirect information update invalidates a route,
1176 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1177 * DST_OBSOLETE_DEAD by dst_free().
1178 */
1179 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1180 return NULL;
1181 return dst;
1182 }
1183
1184 static void ipv4_link_failure(struct sk_buff *skb)
1185 {
1186 struct rtable *rt;
1187
1188 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1189
1190 rt = skb_rtable(skb);
1191 if (rt)
1192 dst_set_expires(&rt->dst, 0);
1193 }
1194
1195 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1196 {
1197 pr_debug("%s: %pI4 -> %pI4, %s\n",
1198 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1199 skb->dev ? skb->dev->name : "?");
1200 kfree_skb(skb);
1201 WARN_ON(1);
1202 return 0;
1203 }
1204
1205 /*
1206 We do not cache the source address of the outgoing interface,
1207 because it is used only by the IP RR, TS and SRR options,
1208 so it is out of the fast path.
1209
1210 BTW remember: "addr" is allowed to be unaligned
1211 in IP options!
1212 */
1213
1214 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1215 {
1216 __be32 src;
1217
1218 if (rt_is_output_route(rt))
1219 src = ip_hdr(skb)->saddr;
1220 else {
1221 struct fib_result res;
1222 struct flowi4 fl4;
1223 struct iphdr *iph;
1224
1225 iph = ip_hdr(skb);
1226
1227 memset(&fl4, 0, sizeof(fl4));
1228 fl4.daddr = iph->daddr;
1229 fl4.saddr = iph->saddr;
1230 fl4.flowi4_tos = RT_TOS(iph->tos);
1231 fl4.flowi4_oif = rt->dst.dev->ifindex;
1232 fl4.flowi4_iif = skb->dev->ifindex;
1233 fl4.flowi4_mark = skb->mark;
1234
1235 rcu_read_lock();
1236 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1237 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1238 else
1239 src = inet_select_addr(rt->dst.dev,
1240 rt_nexthop(rt, iph->daddr),
1241 RT_SCOPE_UNIVERSE);
1242 rcu_read_unlock();
1243 }
1244 memcpy(addr, &src, 4);
1245 }
1246
1247 #ifdef CONFIG_IP_ROUTE_CLASSID
1248 static void set_class_tag(struct rtable *rt, u32 tag)
1249 {
1250 if (!(rt->dst.tclassid & 0xFFFF))
1251 rt->dst.tclassid |= tag & 0xFFFF;
1252 if (!(rt->dst.tclassid & 0xFFFF0000))
1253 rt->dst.tclassid |= tag & 0xFFFF0000;
1254 }
1255 #endif
1256
1257 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1258 {
1259 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1260 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1261 ip_rt_min_advmss);
1262
1263 return min(advmss, IPV4_MAX_PMTU - header_size);
1264 }
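/* A quick worked example of the above: for a 1500-byte path MTU the
 * advertised MSS is max(1500 - 40, 256) = 1460 bytes; the result is
 * never allowed to exceed IPV4_MAX_PMTU - 40 = 65495 bytes nor to drop
 * below ip_rt_min_advmss (256 by default).
 */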
1265
1266 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1267 {
1268 const struct rtable *rt = (const struct rtable *) dst;
1269 unsigned int mtu = rt->rt_pmtu;
1270
1271 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1272 mtu = dst_metric_raw(dst, RTAX_MTU);
1273
1274 if (mtu)
1275 return mtu;
1276
1277 mtu = READ_ONCE(dst->dev->mtu);
1278
1279 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1280 if (rt->rt_uses_gateway && mtu > 576)
1281 mtu = 576;
1282 }
1283
1284 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1285
1286 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1287 }
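/* Summary of the branches above: a still-valid learned PMTU is
 * returned as-is; failing that, an explicit RTAX_MTU metric; only when
 * neither exists do we fall back to the device MTU, capped at 576 for
 * locked-metric routes via a gateway, clamped to IP_MAX_MTU, and
 * reduced by any lwtunnel encapsulation headroom.
 */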
1288
1289 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1290 {
1291 struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1292 struct fib_nh_exception *fnhe;
1293 u32 hval;
1294
1295 if (!hash)
1296 return NULL;
1297
1298 hval = fnhe_hashfun(daddr);
1299
1300 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1301 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1302 if (fnhe->fnhe_daddr == daddr)
1303 return fnhe;
1304 }
1305 return NULL;
1306 }
1307
1308 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1309 __be32 daddr, const bool do_cache)
1310 {
1311 bool ret = false;
1312
1313 spin_lock_bh(&fnhe_lock);
1314
1315 if (daddr == fnhe->fnhe_daddr) {
1316 struct rtable __rcu **porig;
1317 struct rtable *orig;
1318 int genid = fnhe_genid(dev_net(rt->dst.dev));
1319
1320 if (rt_is_input_route(rt))
1321 porig = &fnhe->fnhe_rth_input;
1322 else
1323 porig = &fnhe->fnhe_rth_output;
1324 orig = rcu_dereference(*porig);
1325
1326 if (fnhe->fnhe_genid != genid) {
1327 fnhe->fnhe_genid = genid;
1328 fnhe->fnhe_gw = 0;
1329 fnhe->fnhe_pmtu = 0;
1330 fnhe->fnhe_expires = 0;
1331 fnhe_flush_routes(fnhe);
1332 orig = NULL;
1333 }
1334 fill_route_from_fnhe(rt, fnhe);
1335 if (!rt->rt_gateway)
1336 rt->rt_gateway = daddr;
1337
1338 if (do_cache) {
1339 dst_hold(&rt->dst);
1340 rcu_assign_pointer(*porig, rt);
1341 if (orig) {
1342 dst_dev_put(&orig->dst);
1343 dst_release(&orig->dst);
1344 }
1345 ret = true;
1346 }
1347
1348 fnhe->fnhe_stamp = jiffies;
1349 }
1350 spin_unlock_bh(&fnhe_lock);
1351
1352 return ret;
1353 }
1354
1355 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1356 {
1357 struct rtable *orig, *prev, **p;
1358 bool ret = true;
1359
1360 if (rt_is_input_route(rt)) {
1361 p = (struct rtable **)&nh->nh_rth_input;
1362 } else {
1363 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1364 }
1365 orig = *p;
1366
1367 /* hold dst before doing cmpxchg() to avoid race condition
1368 * on this dst
1369 */
1370 dst_hold(&rt->dst);
1371 prev = cmpxchg(p, orig, rt);
1372 if (prev == orig) {
1373 if (orig) {
1374 dst_dev_put(&orig->dst);
1375 dst_release(&orig->dst);
1376 }
1377 } else {
1378 dst_release(&rt->dst);
1379 ret = false;
1380 }
1381
1382 return ret;
1383 }
1384
1385 struct uncached_list {
1386 spinlock_t lock;
1387 struct list_head head;
1388 };
1389
1390 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1391
1392 static void rt_add_uncached_list(struct rtable *rt)
1393 {
1394 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1395
1396 rt->rt_uncached_list = ul;
1397
1398 spin_lock_bh(&ul->lock);
1399 list_add_tail(&rt->rt_uncached, &ul->head);
1400 spin_unlock_bh(&ul->lock);
1401 }
1402
1403 static void ipv4_dst_destroy(struct dst_entry *dst)
1404 {
1405 struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1406 struct rtable *rt = (struct rtable *) dst;
1407
1408 if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1409 kfree(p);
1410
1411 if (!list_empty(&rt->rt_uncached)) {
1412 struct uncached_list *ul = rt->rt_uncached_list;
1413
1414 spin_lock_bh(&ul->lock);
1415 list_del(&rt->rt_uncached);
1416 spin_unlock_bh(&ul->lock);
1417 }
1418 }
1419
1420 void rt_flush_dev(struct net_device *dev)
1421 {
1422 struct net *net = dev_net(dev);
1423 struct rtable *rt;
1424 int cpu;
1425
1426 for_each_possible_cpu(cpu) {
1427 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1428
1429 spin_lock_bh(&ul->lock);
1430 list_for_each_entry(rt, &ul->head, rt_uncached) {
1431 if (rt->dst.dev != dev)
1432 continue;
1433 rt->dst.dev = net->loopback_dev;
1434 dev_hold(rt->dst.dev);
1435 dev_put(dev);
1436 }
1437 spin_unlock_bh(&ul->lock);
1438 }
1439 }
1440
1441 static bool rt_cache_valid(const struct rtable *rt)
1442 {
1443 return rt &&
1444 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1445 !rt_is_expired(rt);
1446 }
1447
1448 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1449 const struct fib_result *res,
1450 struct fib_nh_exception *fnhe,
1451 struct fib_info *fi, u16 type, u32 itag,
1452 const bool do_cache)
1453 {
1454 bool cached = false;
1455
1456 if (fi) {
1457 struct fib_nh *nh = &FIB_RES_NH(*res);
1458
1459 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1460 rt->rt_gateway = nh->nh_gw;
1461 rt->rt_uses_gateway = 1;
1462 }
1463 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1464 if (fi->fib_metrics != &dst_default_metrics) {
1465 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1466 refcount_inc(&fi->fib_metrics->refcnt);
1467 }
1468 #ifdef CONFIG_IP_ROUTE_CLASSID
1469 rt->dst.tclassid = nh->nh_tclassid;
1470 #endif
1471 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1472 if (unlikely(fnhe))
1473 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1474 else if (do_cache)
1475 cached = rt_cache_route(nh, rt);
1476 if (unlikely(!cached)) {
1477 /* Routes we intend to cache in nexthop exception or
1478 * FIB nexthop have the DST_NOCACHE bit clear.
1479 * However, if we are unsuccessful at storing this
1480 * route into the cache we really need to set it.
1481 */
1482 if (!rt->rt_gateway)
1483 rt->rt_gateway = daddr;
1484 rt_add_uncached_list(rt);
1485 }
1486 } else
1487 rt_add_uncached_list(rt);
1488
1489 #ifdef CONFIG_IP_ROUTE_CLASSID
1490 #ifdef CONFIG_IP_MULTIPLE_TABLES
1491 set_class_tag(rt, res->tclassid);
1492 #endif
1493 set_class_tag(rt, itag);
1494 #endif
1495 }
1496
1497 struct rtable *rt_dst_alloc(struct net_device *dev,
1498 unsigned int flags, u16 type,
1499 bool nopolicy, bool noxfrm, bool will_cache)
1500 {
1501 struct rtable *rt;
1502
1503 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1504 (will_cache ? 0 : DST_HOST) |
1505 (nopolicy ? DST_NOPOLICY : 0) |
1506 (noxfrm ? DST_NOXFRM : 0));
1507
1508 if (rt) {
1509 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1510 rt->rt_flags = flags;
1511 rt->rt_type = type;
1512 rt->rt_is_input = 0;
1513 rt->rt_iif = 0;
1514 rt->rt_pmtu = 0;
1515 rt->rt_gateway = 0;
1516 rt->rt_uses_gateway = 0;
1517 rt->rt_table_id = 0;
1518 INIT_LIST_HEAD(&rt->rt_uncached);
1519
1520 rt->dst.output = ip_output;
1521 if (flags & RTCF_LOCAL)
1522 rt->dst.input = ip_local_deliver;
1523 }
1524
1525 return rt;
1526 }
1527 EXPORT_SYMBOL(rt_dst_alloc);
1528
1529 /* called in rcu_read_lock() section */
1530 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1531 u8 tos, struct net_device *dev,
1532 struct in_device *in_dev, u32 *itag)
1533 {
1534 int err;
1535
1536 /* Primary sanity checks. */
1537 if (!in_dev)
1538 return -EINVAL;
1539
1540 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1541 skb->protocol != htons(ETH_P_IP))
1542 return -EINVAL;
1543
1544 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1545 return -EINVAL;
1546
1547 if (ipv4_is_zeronet(saddr)) {
1548 if (!ipv4_is_local_multicast(daddr))
1549 return -EINVAL;
1550 } else {
1551 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1552 in_dev, itag);
1553 if (err < 0)
1554 return err;
1555 }
1556 return 0;
1557 }
1558
1559 /* called in rcu_read_lock() section */
1560 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1561 u8 tos, struct net_device *dev, int our)
1562 {
1563 struct in_device *in_dev = __in_dev_get_rcu(dev);
1564 unsigned int flags = RTCF_MULTICAST;
1565 struct rtable *rth;
1566 u32 itag = 0;
1567 int err;
1568
1569 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1570 if (err)
1571 return err;
1572
1573 if (our)
1574 flags |= RTCF_LOCAL;
1575
1576 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1577 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1578 if (!rth)
1579 return -ENOBUFS;
1580
1581 #ifdef CONFIG_IP_ROUTE_CLASSID
1582 rth->dst.tclassid = itag;
1583 #endif
1584 rth->dst.output = ip_rt_bug;
1585 rth->rt_is_input= 1;
1586
1587 #ifdef CONFIG_IP_MROUTE
1588 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1589 rth->dst.input = ip_mr_input;
1590 #endif
1591 RT_CACHE_STAT_INC(in_slow_mc);
1592
1593 skb_dst_set(skb, &rth->dst);
1594 return 0;
1595 }
1596
1597
1598 static void ip_handle_martian_source(struct net_device *dev,
1599 struct in_device *in_dev,
1600 struct sk_buff *skb,
1601 __be32 daddr,
1602 __be32 saddr)
1603 {
1604 RT_CACHE_STAT_INC(in_martian_src);
1605 #ifdef CONFIG_IP_ROUTE_VERBOSE
1606 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1607 /*
1608 * RFC1812 recommendation, if source is martian,
1609 * the only hint is MAC header.
1610 */
1611 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1612 &daddr, &saddr, dev->name);
1613 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1614 print_hex_dump(KERN_WARNING, "ll header: ",
1615 DUMP_PREFIX_OFFSET, 16, 1,
1616 skb_mac_header(skb),
1617 dev->hard_header_len, true);
1618 }
1619 }
1620 #endif
1621 }
1622
1623 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1624 {
1625 struct fnhe_hash_bucket *hash;
1626 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1627 u32 hval = fnhe_hashfun(daddr);
1628
1629 spin_lock_bh(&fnhe_lock);
1630
1631 hash = rcu_dereference_protected(nh->nh_exceptions,
1632 lockdep_is_held(&fnhe_lock));
1633 hash += hval;
1634
1635 fnhe_p = &hash->chain;
1636 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1637 while (fnhe) {
1638 if (fnhe->fnhe_daddr == daddr) {
1639 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1640 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1641 fnhe_flush_routes(fnhe);
1642 kfree_rcu(fnhe, rcu);
1643 break;
1644 }
1645 fnhe_p = &fnhe->fnhe_next;
1646 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1647 lockdep_is_held(&fnhe_lock));
1648 }
1649
1650 spin_unlock_bh(&fnhe_lock);
1651 }
1652
1653 static void set_lwt_redirect(struct rtable *rth)
1654 {
1655 if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1656 rth->dst.lwtstate->orig_output = rth->dst.output;
1657 rth->dst.output = lwtunnel_output;
1658 }
1659
1660 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1661 rth->dst.lwtstate->orig_input = rth->dst.input;
1662 rth->dst.input = lwtunnel_input;
1663 }
1664 }
1665
1666 /* called in rcu_read_lock() section */
1667 static int __mkroute_input(struct sk_buff *skb,
1668 const struct fib_result *res,
1669 struct in_device *in_dev,
1670 __be32 daddr, __be32 saddr, u32 tos)
1671 {
1672 struct fib_nh_exception *fnhe;
1673 struct rtable *rth;
1674 int err;
1675 struct in_device *out_dev;
1676 bool do_cache;
1677 u32 itag = 0;
1678
1679 /* get a working reference to the output device */
1680 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1681 if (!out_dev) {
1682 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1683 return -EINVAL;
1684 }
1685
1686 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1687 in_dev->dev, in_dev, &itag);
1688 if (err < 0) {
1689 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1690 saddr);
1691
1692 goto cleanup;
1693 }
1694
1695 do_cache = res->fi && !itag;
1696 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1697 skb->protocol == htons(ETH_P_IP) &&
1698 (IN_DEV_SHARED_MEDIA(out_dev) ||
1699 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1700 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1701
1702 if (skb->protocol != htons(ETH_P_IP)) {
1703 /* Not IP (i.e. ARP). Do not create a route if it is
1704 * invalid for proxy arp. DNAT routes are always valid.
1705 *
1706 * The proxy arp feature has been extended to allow ARP
1707 * replies back out the same interface, to support
1708 * Private VLAN switch technologies. See arp.c.
1709 */
1710 if (out_dev == in_dev &&
1711 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1712 err = -EINVAL;
1713 goto cleanup;
1714 }
1715 }
1716
1717 fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1718 if (do_cache) {
1719 if (fnhe) {
1720 rth = rcu_dereference(fnhe->fnhe_rth_input);
1721 if (rth && rth->dst.expires &&
1722 time_after(jiffies, rth->dst.expires)) {
1723 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1724 fnhe = NULL;
1725 } else {
1726 goto rt_cache;
1727 }
1728 }
1729
1730 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1731
1732 rt_cache:
1733 if (rt_cache_valid(rth)) {
1734 skb_dst_set_noref(skb, &rth->dst);
1735 goto out;
1736 }
1737 }
1738
1739 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1740 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1741 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1742 if (!rth) {
1743 err = -ENOBUFS;
1744 goto cleanup;
1745 }
1746
1747 rth->rt_is_input = 1;
1748 if (res->table)
1749 rth->rt_table_id = res->table->tb_id;
1750 RT_CACHE_STAT_INC(in_slow_tot);
1751
1752 rth->dst.input = ip_forward;
1753
1754 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1755 do_cache);
1756 set_lwt_redirect(rth);
1757 skb_dst_set(skb, &rth->dst);
1758 out:
1759 err = 0;
1760 cleanup:
1761 return err;
1762 }
1763
1764 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1765 /* To make ICMP packets follow the right flow, the multipath hash is
1766 * calculated from the inner IP addresses.
1767 */
1768 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1769 struct flow_keys *hash_keys)
1770 {
1771 const struct iphdr *outer_iph = ip_hdr(skb);
1772 const struct iphdr *inner_iph;
1773 const struct icmphdr *icmph;
1774 struct iphdr _inner_iph;
1775 struct icmphdr _icmph;
1776
1777 hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1778 hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1779 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1780 return;
1781
1782 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1783 return;
1784
1785 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1786 &_icmph);
1787 if (!icmph)
1788 return;
1789
1790 if (icmph->type != ICMP_DEST_UNREACH &&
1791 icmph->type != ICMP_REDIRECT &&
1792 icmph->type != ICMP_TIME_EXCEEDED &&
1793 icmph->type != ICMP_PARAMETERPROB)
1794 return;
1795
1796 inner_iph = skb_header_pointer(skb,
1797 outer_iph->ihl * 4 + sizeof(_icmph),
1798 sizeof(_inner_iph), &_inner_iph);
1799 if (!inner_iph)
1800 return;
1801 hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1802 hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1803 }
1804
1805 /* if skb is set it will be used and fl4 can be NULL */
1806 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1807 const struct sk_buff *skb)
1808 {
1809 struct net *net = fi->fib_net;
1810 struct flow_keys hash_keys;
1811 u32 mhash;
1812
1813 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1814 case 0:
1815 memset(&hash_keys, 0, sizeof(hash_keys));
1816 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1817 if (skb) {
1818 ip_multipath_l3_keys(skb, &hash_keys);
1819 } else {
1820 hash_keys.addrs.v4addrs.src = fl4->saddr;
1821 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1822 }
1823 break;
1824 case 1:
1825 /* skb is currently provided only when forwarding */
1826 if (skb) {
1827 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1828 struct flow_keys keys;
1829
1830 /* short-circuit if we already have L4 hash present */
1831 if (skb->l4_hash)
1832 return skb_get_hash_raw(skb) >> 1;
1833 memset(&hash_keys, 0, sizeof(hash_keys));
1834 skb_flow_dissect_flow_keys(skb, &keys, flag);
1835 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1836 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1837 hash_keys.ports.src = keys.ports.src;
1838 hash_keys.ports.dst = keys.ports.dst;
1839 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1840 } else {
1841 memset(&hash_keys, 0, sizeof(hash_keys));
1842 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1843 hash_keys.addrs.v4addrs.src = fl4->saddr;
1844 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1845 hash_keys.ports.src = fl4->fl4_sport;
1846 hash_keys.ports.dst = fl4->fl4_dport;
1847 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1848 }
1849 break;
1850 }
1851 mhash = flow_hash_from_keys(&hash_keys);
1852
1853 return mhash >> 1;
1854 }
1855 EXPORT_SYMBOL_GPL(fib_multipath_hash);
1856 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
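/* The final >> 1 above keeps the hash within the 31-bit range that
 * fib_select_multipath() compares against each nexthop's upper bound
 * when picking a path (see the caller in ip_mkroute_input() below);
 * this is a sketch of how the value is consumed, not a complete
 * description of the output path.
 */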
1857
1858 static int ip_mkroute_input(struct sk_buff *skb,
1859 struct fib_result *res,
1860 struct in_device *in_dev,
1861 __be32 daddr, __be32 saddr, u32 tos)
1862 {
1863 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1864 if (res->fi && res->fi->fib_nhs > 1) {
1865 int h = fib_multipath_hash(res->fi, NULL, skb);
1866
1867 fib_select_multipath(res, h);
1868 }
1869 #endif
1870
1871 /* create a routing cache entry */
1872 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1873 }
1874
1875 /*
1876 * NOTE. We drop all packets that have a local source
1877 * address, because every properly looped-back packet
1878 * must already have the correct destination attached by the output routine.
1879 *
1880 * This approach solves two big problems:
1881 * 1. Non-simplex devices are handled properly.
1882 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1883 * called with rcu_read_lock()
1884 */
1885
1886 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1887 u8 tos, struct net_device *dev,
1888 struct fib_result *res)
1889 {
1890 struct in_device *in_dev = __in_dev_get_rcu(dev);
1891 struct ip_tunnel_info *tun_info;
1892 struct flowi4 fl4;
1893 unsigned int flags = 0;
1894 u32 itag = 0;
1895 struct rtable *rth;
1896 int err = -EINVAL;
1897 struct net *net = dev_net(dev);
1898 bool do_cache;
1899
1900 /* IP on this device is disabled. */
1901
1902 if (!in_dev)
1903 goto out;
1904
1905 /* Check for the most weird martians, which may not be detected
1906 by fib_lookup.
1907 */
1908
1909 tun_info = skb_tunnel_info(skb);
1910 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1911 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1912 else
1913 fl4.flowi4_tun_key.tun_id = 0;
1914 skb_dst_drop(skb);
1915
1916 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1917 goto martian_source;
1918
1919 res->fi = NULL;
1920 res->table = NULL;
1921 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1922 goto brd_input;
1923
1924 /* Accept zero addresses only to limited broadcast;
1925 * I am not even sure whether to fix this or not. Waiting for complaints :-)
1926 */
1927 if (ipv4_is_zeronet(saddr))
1928 goto martian_source;
1929
1930 if (ipv4_is_zeronet(daddr))
1931 goto martian_destination;
1932
1933 /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1934 * and calls it at most once when daddr and/or saddr is a loopback address.
1935 */
1936 if (ipv4_is_loopback(daddr)) {
1937 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1938 goto martian_destination;
1939 } else if (ipv4_is_loopback(saddr)) {
1940 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1941 goto martian_source;
1942 }
1943
1944 /*
1945 * Now we are ready to route packet.
1946 */
1947 fl4.flowi4_oif = 0;
1948 fl4.flowi4_iif = dev->ifindex;
1949 fl4.flowi4_mark = skb->mark;
1950 fl4.flowi4_tos = tos;
1951 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1952 fl4.flowi4_flags = 0;
1953 fl4.daddr = daddr;
1954 fl4.saddr = saddr;
1955 fl4.flowi4_uid = sock_net_uid(net, NULL);
1956 err = fib_lookup(net, &fl4, res, 0);
1957 if (err != 0) {
1958 if (!IN_DEV_FORWARD(in_dev))
1959 err = -EHOSTUNREACH;
1960 goto no_route;
1961 }
1962
1963 if (res->type == RTN_BROADCAST)
1964 goto brd_input;
1965
1966 if (res->type == RTN_LOCAL) {
1967 err = fib_validate_source(skb, saddr, daddr, tos,
1968 0, dev, in_dev, &itag);
1969 if (err < 0)
1970 goto martian_source;
1971 goto local_input;
1972 }
1973
1974 if (!IN_DEV_FORWARD(in_dev)) {
1975 err = -EHOSTUNREACH;
1976 goto no_route;
1977 }
1978 if (res->type != RTN_UNICAST)
1979 goto martian_destination;
1980
1981 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1982 out: return err;
1983
1984 brd_input:
1985 if (skb->protocol != htons(ETH_P_IP))
1986 goto e_inval;
1987
1988 if (!ipv4_is_zeronet(saddr)) {
1989 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1990 in_dev, &itag);
1991 if (err < 0)
1992 goto martian_source;
1993 }
1994 flags |= RTCF_BROADCAST;
1995 res->type = RTN_BROADCAST;
1996 RT_CACHE_STAT_INC(in_brd);
1997
1998 local_input:
1999 do_cache = false;
2000 if (res->fi) {
2001 if (!itag) {
2002 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2003 if (rt_cache_valid(rth)) {
2004 skb_dst_set_noref(skb, &rth->dst);
2005 err = 0;
2006 goto out;
2007 }
2008 do_cache = true;
2009 }
2010 }
2011
2012 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2013 flags | RTCF_LOCAL, res->type,
2014 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2015 if (!rth)
2016 goto e_nobufs;
2017
2018 rth->dst.output = ip_rt_bug;
2019 #ifdef CONFIG_IP_ROUTE_CLASSID
2020 rth->dst.tclassid = itag;
2021 #endif
2022 rth->rt_is_input = 1;
2023 if (res->table)
2024 rth->rt_table_id = res->table->tb_id;
2025
2026 RT_CACHE_STAT_INC(in_slow_tot);
2027 if (res->type == RTN_UNREACHABLE) {
2028 rth->dst.input = ip_error;
2029 rth->dst.error = -err;
2030 rth->rt_flags &= ~RTCF_LOCAL;
2031 }
2032
2033 if (do_cache) {
2034 struct fib_nh *nh = &FIB_RES_NH(*res);
2035
2036 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2037 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2038 WARN_ON(rth->dst.input == lwtunnel_input);
2039 rth->dst.lwtstate->orig_input = rth->dst.input;
2040 rth->dst.input = lwtunnel_input;
2041 }
2042
2043 if (unlikely(!rt_cache_route(nh, rth)))
2044 rt_add_uncached_list(rth);
2045 }
2046 skb_dst_set(skb, &rth->dst);
2047 err = 0;
2048 goto out;
2049
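/* no_route: no FIB entry matched; synthesise an RTN_UNREACHABLE local route
 * so the error is reported through ip_error() on the input path.
 */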
2050 no_route:
2051 RT_CACHE_STAT_INC(in_no_route);
2052 res->type = RTN_UNREACHABLE;
2053 res->fi = NULL;
2054 res->table = NULL;
2055 goto local_input;
2056
2057 /*
2058 * Do not cache martian addresses: they should be logged (RFC1812)
2059 */
2060 martian_destination:
2061 RT_CACHE_STAT_INC(in_martian_dst);
2062 #ifdef CONFIG_IP_ROUTE_VERBOSE
2063 if (IN_DEV_LOG_MARTIANS(in_dev))
2064 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2065 &daddr, &saddr, dev->name);
2066 #endif
2067
2068 e_inval:
2069 err = -EINVAL;
2070 goto out;
2071
2072 e_nobufs:
2073 err = -ENOBUFS;
2074 goto out;
2075
2076 martian_source:
2077 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2078 goto out;
2079 }
2080
2081 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2082 u8 tos, struct net_device *dev)
2083 {
2084 struct fib_result res;
2085 int err;
2086
2087 tos &= IPTOS_RT_MASK;
2088 rcu_read_lock();
2089 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2090 rcu_read_unlock();
2091
2092 return err;
2093 }
2094 EXPORT_SYMBOL(ip_route_input_noref);
2095
2096 /* called with rcu_read_lock held */
2097 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2098 u8 tos, struct net_device *dev, struct fib_result *res)
2099 {
2100 /* Multicast recognition logic was moved from the route cache to here.
2101 The problem was that too many Ethernet cards have broken/missing
2102 hardware multicast filters :-( As a result, a host on a multicast
2103 network acquires a lot of useless route cache entries, e.g. from
2104 SDR messages from all over the world. Now we try to get rid of them.
2105 Really, provided the software IP multicast filter is organized
2106 reasonably (at least, hashed), it does not result in a slowdown
2107 compared with route cache reject entries.
2108 Note that multicast routers are not affected, because a
2109 route cache entry is created eventually.
2110 */
2111 if (ipv4_is_multicast(daddr)) {
2112 struct in_device *in_dev = __in_dev_get_rcu(dev);
2113 int our = 0;
2114 int err = -EINVAL;
2115
2116 if (in_dev)
2117 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2118 ip_hdr(skb)->protocol);
2119
2120 /* check l3 master if no match yet */
2121 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2122 struct in_device *l3_in_dev;
2123
2124 l3_in_dev = __in_dev_get_rcu(skb->dev);
2125 if (l3_in_dev)
2126 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2127 ip_hdr(skb)->protocol);
2128 }
2129
2130 if (our
2131 #ifdef CONFIG_IP_MROUTE
2132 ||
2133 (!ipv4_is_local_multicast(daddr) &&
2134 IN_DEV_MFORWARD(in_dev))
2135 #endif
2136 ) {
2137 err = ip_route_input_mc(skb, daddr, saddr,
2138 tos, dev, our);
2139 }
2140 return err;
2141 }
2142
2143 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2144 }
2145
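/* Build the dst for an output route once the FIB lookup has resolved (or
 * skipped) the nexthop: broadcast/multicast/local cases are flagged here and
 * the per-nexthop and fnhe caches are consulted before a new rtable is
 * allocated.
 */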
2146 /* called with rcu_read_lock() */
2147 static struct rtable *__mkroute_output(const struct fib_result *res,
2148 const struct flowi4 *fl4, int orig_oif,
2149 struct net_device *dev_out,
2150 unsigned int flags)
2151 {
2152 struct fib_info *fi = res->fi;
2153 struct fib_nh_exception *fnhe;
2154 struct in_device *in_dev;
2155 u16 type = res->type;
2156 struct rtable *rth;
2157 bool do_cache;
2158
2159 in_dev = __in_dev_get_rcu(dev_out);
2160 if (!in_dev)
2161 return ERR_PTR(-EINVAL);
2162
2163 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2164 if (ipv4_is_loopback(fl4->saddr) &&
2165 !(dev_out->flags & IFF_LOOPBACK) &&
2166 !netif_is_l3_master(dev_out))
2167 return ERR_PTR(-EINVAL);
2168
2169 if (ipv4_is_lbcast(fl4->daddr))
2170 type = RTN_BROADCAST;
2171 else if (ipv4_is_multicast(fl4->daddr))
2172 type = RTN_MULTICAST;
2173 else if (ipv4_is_zeronet(fl4->daddr))
2174 return ERR_PTR(-EINVAL);
2175
2176 if (dev_out->flags & IFF_LOOPBACK)
2177 flags |= RTCF_LOCAL;
2178
2179 do_cache = true;
2180 if (type == RTN_BROADCAST) {
2181 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2182 fi = NULL;
2183 } else if (type == RTN_MULTICAST) {
2184 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2185 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2186 fl4->flowi4_proto))
2187 flags &= ~RTCF_LOCAL;
2188 else
2189 do_cache = false;
2190 /* If a multicast route does not exist, use the
2191 * default one, but do not route via a gateway in this case.
2192 * Yes, it is a hack.
2193 */
2194 if (fi && res->prefixlen < 4)
2195 fi = NULL;
2196 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2197 (orig_oif != dev_out->ifindex)) {
2198 /* For local routes that require a particular output interface
2199 * we do not want to cache the result. Caching the result
2200 * causes incorrect behaviour when there are multiple source
2201 * addresses on the interface, the end result being that if the
2202 * intended recipient is waiting on that interface for the
2203 * packet, it won't receive it because it will be delivered on
2204 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2205 * be set to the loopback interface as well.
2206 */
2207 fi = NULL;
2208 }
2209
2210 fnhe = NULL;
2211 do_cache &= fi != NULL;
2212 if (do_cache) {
2213 struct rtable __rcu **prth;
2214 struct fib_nh *nh = &FIB_RES_NH(*res);
2215
2216 fnhe = find_exception(nh, fl4->daddr);
2217 if (fnhe) {
2218 prth = &fnhe->fnhe_rth_output;
2219 rth = rcu_dereference(*prth);
2220 if (rth && rth->dst.expires &&
2221 time_after(jiffies, rth->dst.expires)) {
2222 ip_del_fnhe(nh, fl4->daddr);
2223 fnhe = NULL;
2224 } else {
2225 goto rt_cache;
2226 }
2227 }
2228
2229 if (unlikely(fl4->flowi4_flags &
2230 FLOWI_FLAG_KNOWN_NH &&
2231 !(nh->nh_gw &&
2232 nh->nh_scope == RT_SCOPE_LINK))) {
2233 do_cache = false;
2234 goto add;
2235 }
2236 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2237 rth = rcu_dereference(*prth);
2238
2239 rt_cache:
2240 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2241 return rth;
2242 }
2243
2244 add:
2245 rth = rt_dst_alloc(dev_out, flags, type,
2246 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2247 IN_DEV_CONF_GET(in_dev, NOXFRM),
2248 do_cache);
2249 if (!rth)
2250 return ERR_PTR(-ENOBUFS);
2251
2252 rth->rt_iif = orig_oif;
2253 if (res->table)
2254 rth->rt_table_id = res->table->tb_id;
2255
2256 RT_CACHE_STAT_INC(out_slow_tot);
2257
2258 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2259 if (flags & RTCF_LOCAL &&
2260 !(dev_out->flags & IFF_LOOPBACK)) {
2261 rth->dst.output = ip_mc_output;
2262 RT_CACHE_STAT_INC(out_slow_mc);
2263 }
2264 #ifdef CONFIG_IP_MROUTE
2265 if (type == RTN_MULTICAST) {
2266 if (IN_DEV_MFORWARD(in_dev) &&
2267 !ipv4_is_local_multicast(fl4->daddr)) {
2268 rth->dst.input = ip_mr_input;
2269 rth->dst.output = ip_mc_output;
2270 }
2271 }
2272 #endif
2273 }
2274
2275 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2276 set_lwt_redirect(rth);
2277
2278 return rth;
2279 }
2280
2281 /*
2282 * Major route resolver routine.
2283 */
2284
2285 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2286 const struct sk_buff *skb)
2287 {
2288 __u8 tos = RT_FL_TOS(fl4);
2289 struct fib_result res;
2290 struct rtable *rth;
2291
2292 res.tclassid = 0;
2293 res.fi = NULL;
2294 res.table = NULL;
2295
2296 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2297 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2298 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2299 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2300
2301 rcu_read_lock();
2302 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2303 rcu_read_unlock();
2304
2305 return rth;
2306 }
2307 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2308
2309 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2310 struct fib_result *res,
2311 const struct sk_buff *skb)
2312 {
2313 struct net_device *dev_out = NULL;
2314 int orig_oif = fl4->flowi4_oif;
2315 unsigned int flags = 0;
2316 struct rtable *rth;
2317 int err = -ENETUNREACH;
2318
2319 if (fl4->saddr) {
2320 rth = ERR_PTR(-EINVAL);
2321 if (ipv4_is_multicast(fl4->saddr) ||
2322 ipv4_is_lbcast(fl4->saddr) ||
2323 ipv4_is_zeronet(fl4->saddr))
2324 goto out;
2325
2326 /* I removed the check for oif == dev_out->oif here.
2327 It was wrong for two reasons:
2328 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2329 is assigned to multiple interfaces.
2330 2. Moreover, we are allowed to send packets with the saddr
2331 of another iface. --ANK
2332 */
2333
2334 if (fl4->flowi4_oif == 0 &&
2335 (ipv4_is_multicast(fl4->daddr) ||
2336 ipv4_is_lbcast(fl4->daddr))) {
2337 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2338 dev_out = __ip_dev_find(net, fl4->saddr, false);
2339 if (!dev_out)
2340 goto out;
2341
2342 /* Special hack: the user can direct multicasts
2343 and limited broadcast via the necessary interface
2344 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2345 This hack is not just for fun; it allows
2346 vic, vat and friends to work.
2347 They bind the socket to loopback, set the ttl to zero
2348 and expect that it will work.
2349 From the viewpoint of the routing cache they are broken,
2350 because we are not allowed to build a multicast path
2351 with a loopback source addr (look, the routing cache
2352 cannot know that the ttl is zero, so the packet
2353 will not leave this host and the route is valid).
2354 Luckily, this hack is a good workaround.
2355 */
2356
2357 fl4->flowi4_oif = dev_out->ifindex;
2358 goto make_route;
2359 }
2360
2361 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2362 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2363 if (!__ip_dev_find(net, fl4->saddr, false))
2364 goto out;
2365 }
2366 }
2367
2368
2369 if (fl4->flowi4_oif) {
2370 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2371 rth = ERR_PTR(-ENODEV);
2372 if (!dev_out)
2373 goto out;
2374
2375 /* RACE: Check return value of inet_select_addr instead. */
2376 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2377 rth = ERR_PTR(-ENETUNREACH);
2378 goto out;
2379 }
2380 if (ipv4_is_local_multicast(fl4->daddr) ||
2381 ipv4_is_lbcast(fl4->daddr) ||
2382 fl4->flowi4_proto == IPPROTO_IGMP) {
2383 if (!fl4->saddr)
2384 fl4->saddr = inet_select_addr(dev_out, 0,
2385 RT_SCOPE_LINK);
2386 goto make_route;
2387 }
2388 if (!fl4->saddr) {
2389 if (ipv4_is_multicast(fl4->daddr))
2390 fl4->saddr = inet_select_addr(dev_out, 0,
2391 fl4->flowi4_scope);
2392 else if (!fl4->daddr)
2393 fl4->saddr = inet_select_addr(dev_out, 0,
2394 RT_SCOPE_HOST);
2395 }
2396 }
2397
2398 if (!fl4->daddr) {
2399 fl4->daddr = fl4->saddr;
2400 if (!fl4->daddr)
2401 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2402 dev_out = net->loopback_dev;
2403 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2404 res->type = RTN_LOCAL;
2405 flags |= RTCF_LOCAL;
2406 goto make_route;
2407 }
2408
2409 err = fib_lookup(net, fl4, res, 0);
2410 if (err) {
2411 res->fi = NULL;
2412 res->table = NULL;
2413 if (fl4->flowi4_oif &&
2414 (ipv4_is_multicast(fl4->daddr) ||
2415 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2416 /* Apparently, the routing tables are wrong. Assume
2417 that the destination is on link.
2418 
2419 WHY? DW.
2420 Because we are allowed to send to an iface
2421 even if it has NO routes and NO assigned
2422 addresses. When oif is specified, the routing
2423 tables are looked up with only one purpose:
2424 to catch whether the destination is gatewayed rather than
2425 direct. Moreover, if MSG_DONTROUTE is set,
2426 we send the packet, ignoring both routing tables
2427 and ifaddr state. --ANK
2428 
2429 
2430 We could do this even when oif is unknown,
2431 as IPv6 likely does, but we do not.
2432 */
2433
2434 if (fl4->saddr == 0)
2435 fl4->saddr = inet_select_addr(dev_out, 0,
2436 RT_SCOPE_LINK);
2437 res->type = RTN_UNICAST;
2438 goto make_route;
2439 }
2440 rth = ERR_PTR(err);
2441 goto out;
2442 }
2443
2444 if (res->type == RTN_LOCAL) {
2445 if (!fl4->saddr) {
2446 if (res->fi->fib_prefsrc)
2447 fl4->saddr = res->fi->fib_prefsrc;
2448 else
2449 fl4->saddr = fl4->daddr;
2450 }
2451
2452 /* L3 master device is the loopback for that domain */
2453 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2454 net->loopback_dev;
2455
2456 /* make sure orig_oif points to fib result device even
2457 * though packet rx/tx happens over loopback or l3mdev
2458 */
2459 orig_oif = FIB_RES_OIF(*res);
2460
2461 fl4->flowi4_oif = dev_out->ifindex;
2462 flags |= RTCF_LOCAL;
2463 goto make_route;
2464 }
2465
2466 fib_select_path(net, res, fl4, skb);
2467
2468 dev_out = FIB_RES_DEV(*res);
2469 fl4->flowi4_oif = dev_out->ifindex;
2470
2471
2472 make_route:
2473 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2474
2475 out:
2476 return rth;
2477 }
2478
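/* Blackhole dst_ops: every metric/update operation is a no-op and, via
 * ipv4_blackhole_route() below, both dst->input and dst->output are set to
 * discard, so anything routed through such an entry is silently dropped.
 */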
2479 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2480 {
2481 return NULL;
2482 }
2483
2484 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2485 {
2486 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2487
2488 return mtu ? : dst->dev->mtu;
2489 }
2490
2491 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2492 struct sk_buff *skb, u32 mtu)
2493 {
2494 }
2495
2496 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2497 struct sk_buff *skb)
2498 {
2499 }
2500
2501 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2502 unsigned long old)
2503 {
2504 return NULL;
2505 }
2506
2507 static struct dst_ops ipv4_dst_blackhole_ops = {
2508 .family = AF_INET,
2509 .check = ipv4_blackhole_dst_check,
2510 .mtu = ipv4_blackhole_mtu,
2511 .default_advmss = ipv4_default_advmss,
2512 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2513 .redirect = ipv4_rt_blackhole_redirect,
2514 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2515 .neigh_lookup = ipv4_neigh_lookup,
2516 };
2517
2518 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2519 {
2520 struct rtable *ort = (struct rtable *) dst_orig;
2521 struct rtable *rt;
2522
2523 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2524 if (rt) {
2525 struct dst_entry *new = &rt->dst;
2526
2527 new->__use = 1;
2528 new->input = dst_discard;
2529 new->output = dst_discard_out;
2530
2531 new->dev = net->loopback_dev;
2532 if (new->dev)
2533 dev_hold(new->dev);
2534
2535 rt->rt_is_input = ort->rt_is_input;
2536 rt->rt_iif = ort->rt_iif;
2537 rt->rt_pmtu = ort->rt_pmtu;
2538
2539 rt->rt_genid = rt_genid_ipv4(net);
2540 rt->rt_flags = ort->rt_flags;
2541 rt->rt_type = ort->rt_type;
2542 rt->rt_gateway = ort->rt_gateway;
2543 rt->rt_uses_gateway = ort->rt_uses_gateway;
2544
2545 INIT_LIST_HEAD(&rt->rt_uncached);
2546 }
2547
2548 dst_release(dst_orig);
2549
2550 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2551 }
2552
2553 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2554 const struct sock *sk)
2555 {
2556 struct rtable *rt = __ip_route_output_key(net, flp4);
2557
2558 if (IS_ERR(rt))
2559 return rt;
2560
2561 if (flp4->flowi4_proto)
2562 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2563 flowi4_to_flowi(flp4),
2564 sk, 0);
2565
2566 return rt;
2567 }
2568 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2569
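/* Encode the route attached to @skb as an RTM_NEWROUTE netlink message. */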
2570 /* called with rcu_read_lock held */
2571 static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
2572 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2573 u32 seq)
2574 {
2575 struct rtable *rt = skb_rtable(skb);
2576 struct rtmsg *r;
2577 struct nlmsghdr *nlh;
2578 unsigned long expires = 0;
2579 u32 error;
2580 u32 metrics[RTAX_MAX];
2581
2582 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2583 if (!nlh)
2584 return -EMSGSIZE;
2585
2586 r = nlmsg_data(nlh);
2587 r->rtm_family = AF_INET;
2588 r->rtm_dst_len = 32;
2589 r->rtm_src_len = 0;
2590 r->rtm_tos = fl4->flowi4_tos;
2591 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2592 if (nla_put_u32(skb, RTA_TABLE, table_id))
2593 goto nla_put_failure;
2594 r->rtm_type = rt->rt_type;
2595 r->rtm_scope = RT_SCOPE_UNIVERSE;
2596 r->rtm_protocol = RTPROT_UNSPEC;
2597 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2598 if (rt->rt_flags & RTCF_NOTIFY)
2599 r->rtm_flags |= RTM_F_NOTIFY;
2600 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2601 r->rtm_flags |= RTCF_DOREDIRECT;
2602
2603 if (nla_put_in_addr(skb, RTA_DST, dst))
2604 goto nla_put_failure;
2605 if (src) {
2606 r->rtm_src_len = 32;
2607 if (nla_put_in_addr(skb, RTA_SRC, src))
2608 goto nla_put_failure;
2609 }
2610 if (rt->dst.dev &&
2611 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2612 goto nla_put_failure;
2613 #ifdef CONFIG_IP_ROUTE_CLASSID
2614 if (rt->dst.tclassid &&
2615 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2616 goto nla_put_failure;
2617 #endif
2618 if (!rt_is_input_route(rt) &&
2619 fl4->saddr != src) {
2620 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2621 goto nla_put_failure;
2622 }
2623 if (rt->rt_uses_gateway &&
2624 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2625 goto nla_put_failure;
2626
2627 expires = rt->dst.expires;
2628 if (expires) {
2629 unsigned long now = jiffies;
2630
2631 if (time_before(now, expires))
2632 expires -= now;
2633 else
2634 expires = 0;
2635 }
2636
2637 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2638 if (rt->rt_pmtu && expires)
2639 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2640 if (rtnetlink_put_metrics(skb, metrics) < 0)
2641 goto nla_put_failure;
2642
2643 if (fl4->flowi4_mark &&
2644 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2645 goto nla_put_failure;
2646
2647 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2648 nla_put_u32(skb, RTA_UID,
2649 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2650 goto nla_put_failure;
2651
2652 error = rt->dst.error;
2653
2654 if (rt_is_input_route(rt)) {
2655 #ifdef CONFIG_IP_MROUTE
2656 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2657 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2658 int err = ipmr_get_route(net, skb,
2659 fl4->saddr, fl4->daddr,
2660 r, portid);
2661
2662 if (err <= 0) {
2663 if (err == 0)
2664 return 0;
2665 goto nla_put_failure;
2666 }
2667 } else
2668 #endif
2669 if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2670 goto nla_put_failure;
2671 }
2672
2673 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2674 goto nla_put_failure;
2675
2676 nlmsg_end(skb, nlh);
2677 return 0;
2678
2679 nla_put_failure:
2680 nlmsg_cancel(skb, nlh);
2681 return -EMSGSIZE;
2682 }
2683
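/* RTM_GETROUTE handler: resolve a single route for a userspace query (e.g.
 * "ip route get 8.8.8.8") and reply with either the matching FIB entry
 * (RTM_F_FIB_MATCH) or the resulting dst (rt_fill_info).
 */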
2684 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2685 struct netlink_ext_ack *extack)
2686 {
2687 struct net *net = sock_net(in_skb->sk);
2688 struct rtmsg *rtm;
2689 struct nlattr *tb[RTA_MAX+1];
2690 struct fib_result res = {};
2691 struct rtable *rt = NULL;
2692 struct flowi4 fl4;
2693 __be32 dst = 0;
2694 __be32 src = 0;
2695 u32 iif;
2696 int err;
2697 int mark;
2698 struct sk_buff *skb;
2699 u32 table_id = RT_TABLE_MAIN;
2700 kuid_t uid;
2701
2702 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2703 extack);
2704 if (err < 0)
2705 goto errout;
2706
2707 rtm = nlmsg_data(nlh);
2708
2709 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2710 if (!skb) {
2711 err = -ENOBUFS;
2712 goto errout;
2713 }
2714
2715 /* Reserve room for dummy headers; this skb can pass
2716 through a good chunk of the routing engine.
2717 */
2718 skb_reset_mac_header(skb);
2719 skb_reset_network_header(skb);
2720
2721 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2722 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2723 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2724 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2725 if (tb[RTA_UID])
2726 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2727 else
2728 uid = (iif ? INVALID_UID : current_uid());
2729
2730 /* Bugfix: need to give ip_route_input enough of an IP header to
2731 * not gag.
2732 */
2733 ip_hdr(skb)->protocol = IPPROTO_UDP;
2734 ip_hdr(skb)->saddr = src;
2735 ip_hdr(skb)->daddr = dst;
2736
2737 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2738
2739 memset(&fl4, 0, sizeof(fl4));
2740 fl4.daddr = dst;
2741 fl4.saddr = src;
2742 fl4.flowi4_tos = rtm->rtm_tos;
2743 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2744 fl4.flowi4_mark = mark;
2745 fl4.flowi4_uid = uid;
2746
2747 rcu_read_lock();
2748
2749 if (iif) {
2750 struct net_device *dev;
2751
2752 dev = dev_get_by_index_rcu(net, iif);
2753 if (!dev) {
2754 err = -ENODEV;
2755 goto errout_free;
2756 }
2757
2758 skb->protocol = htons(ETH_P_IP);
2759 skb->dev = dev;
2760 skb->mark = mark;
2761 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2762 dev, &res);
2763
2764 rt = skb_rtable(skb);
2765 if (err == 0 && rt->dst.error)
2766 err = -rt->dst.error;
2767 } else {
2768 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2769 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2770 err = 0;
2771 if (IS_ERR(rt))
2772 err = PTR_ERR(rt);
2773 else
2774 skb_dst_set(skb, &rt->dst);
2775 }
2776
2777 if (err)
2778 goto errout_free;
2779
2780 if (rtm->rtm_flags & RTM_F_NOTIFY)
2781 rt->rt_flags |= RTCF_NOTIFY;
2782
2783 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2784 table_id = rt->rt_table_id;
2785
2786 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2787 if (!res.fi) {
2788 err = fib_props[res.type].error;
2789 if (!err)
2790 err = -EHOSTUNREACH;
2791 goto errout_free;
2792 }
2793 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2794 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2795 rt->rt_type, res.prefix, res.prefixlen,
2796 fl4.flowi4_tos, res.fi, 0);
2797 } else {
2798 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2799 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2800 }
2801 if (err < 0)
2802 goto errout_free;
2803
2804 rcu_read_unlock();
2805
2806 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2807 errout:
2808 return err;
2809
2810 errout_free:
2811 rcu_read_unlock();
2812 kfree_skb(skb);
2813 goto errout;
2814 }
2815
2816 void ip_rt_multicast_event(struct in_device *in_dev)
2817 {
2818 rt_cache_flush(dev_net(in_dev->dev));
2819 }
2820
2821 #ifdef CONFIG_SYSCTL
2822 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
2823 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
2824 static int ip_rt_gc_elasticity __read_mostly = 8;
2825
2826 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2827 void __user *buffer,
2828 size_t *lenp, loff_t *ppos)
2829 {
2830 struct net *net = (struct net *)__ctl->extra1;
2831
2832 if (write) {
2833 rt_cache_flush(net);
2834 fnhe_genid_bump(net);
2835 return 0;
2836 }
2837
2838 return -EINVAL;
2839 }
2840
2841 static struct ctl_table ipv4_route_table[] = {
2842 {
2843 .procname = "gc_thresh",
2844 .data = &ipv4_dst_ops.gc_thresh,
2845 .maxlen = sizeof(int),
2846 .mode = 0644,
2847 .proc_handler = proc_dointvec,
2848 },
2849 {
2850 .procname = "max_size",
2851 .data = &ip_rt_max_size,
2852 .maxlen = sizeof(int),
2853 .mode = 0644,
2854 .proc_handler = proc_dointvec,
2855 },
2856 {
2857 /* Deprecated. Use gc_min_interval_ms */
2858
2859 .procname = "gc_min_interval",
2860 .data = &ip_rt_gc_min_interval,
2861 .maxlen = sizeof(int),
2862 .mode = 0644,
2863 .proc_handler = proc_dointvec_jiffies,
2864 },
2865 {
2866 .procname = "gc_min_interval_ms",
2867 .data = &ip_rt_gc_min_interval,
2868 .maxlen = sizeof(int),
2869 .mode = 0644,
2870 .proc_handler = proc_dointvec_ms_jiffies,
2871 },
2872 {
2873 .procname = "gc_timeout",
2874 .data = &ip_rt_gc_timeout,
2875 .maxlen = sizeof(int),
2876 .mode = 0644,
2877 .proc_handler = proc_dointvec_jiffies,
2878 },
2879 {
2880 .procname = "gc_interval",
2881 .data = &ip_rt_gc_interval,
2882 .maxlen = sizeof(int),
2883 .mode = 0644,
2884 .proc_handler = proc_dointvec_jiffies,
2885 },
2886 {
2887 .procname = "redirect_load",
2888 .data = &ip_rt_redirect_load,
2889 .maxlen = sizeof(int),
2890 .mode = 0644,
2891 .proc_handler = proc_dointvec,
2892 },
2893 {
2894 .procname = "redirect_number",
2895 .data = &ip_rt_redirect_number,
2896 .maxlen = sizeof(int),
2897 .mode = 0644,
2898 .proc_handler = proc_dointvec,
2899 },
2900 {
2901 .procname = "redirect_silence",
2902 .data = &ip_rt_redirect_silence,
2903 .maxlen = sizeof(int),
2904 .mode = 0644,
2905 .proc_handler = proc_dointvec,
2906 },
2907 {
2908 .procname = "error_cost",
2909 .data = &ip_rt_error_cost,
2910 .maxlen = sizeof(int),
2911 .mode = 0644,
2912 .proc_handler = proc_dointvec,
2913 },
2914 {
2915 .procname = "error_burst",
2916 .data = &ip_rt_error_burst,
2917 .maxlen = sizeof(int),
2918 .mode = 0644,
2919 .proc_handler = proc_dointvec,
2920 },
2921 {
2922 .procname = "gc_elasticity",
2923 .data = &ip_rt_gc_elasticity,
2924 .maxlen = sizeof(int),
2925 .mode = 0644,
2926 .proc_handler = proc_dointvec,
2927 },
2928 {
2929 .procname = "mtu_expires",
2930 .data = &ip_rt_mtu_expires,
2931 .maxlen = sizeof(int),
2932 .mode = 0644,
2933 .proc_handler = proc_dointvec_jiffies,
2934 },
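/* proc_dointvec_minmax with extra1 = ip_min_valid_pmtu rejects writes below
 * the 68-byte minimum IPv4 MTU, e.g.
 *   # sysctl -w net.ipv4.route.min_pmtu=60   -> "Invalid argument"
 */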
2935 {
2936 .procname = "min_pmtu",
2937 .data = &ip_rt_min_pmtu,
2938 .maxlen = sizeof(int),
2939 .mode = 0644,
2940 .proc_handler = proc_dointvec_minmax,
2941 .extra1 = &ip_min_valid_pmtu,
2942 },
2943 {
2944 .procname = "min_adv_mss",
2945 .data = &ip_rt_min_advmss,
2946 .maxlen = sizeof(int),
2947 .mode = 0644,
2948 .proc_handler = proc_dointvec,
2949 },
2950 { }
2951 };
2952
2953 static struct ctl_table ipv4_route_flush_table[] = {
2954 {
2955 .procname = "flush",
2956 .maxlen = sizeof(int),
2957 .mode = 0200,
2958 .proc_handler = ipv4_sysctl_rtcache_flush,
2959 },
2960 { },
2961 };
2962
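/* Register the per-netns "flush" sysctl. Every netns other than init_net gets
 * its own copy of the table so that tbl[0].extra1 can point back at that netns.
 */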
2963 static __net_init int sysctl_route_net_init(struct net *net)
2964 {
2965 struct ctl_table *tbl;
2966
2967 tbl = ipv4_route_flush_table;
2968 if (!net_eq(net, &init_net)) {
2969 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2970 if (!tbl)
2971 goto err_dup;
2972
2973 /* Don't export sysctls to unprivileged users */
2974 if (net->user_ns != &init_user_ns)
2975 tbl[0].procname = NULL;
2976 }
2977 tbl[0].extra1 = net;
2978
2979 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2980 if (!net->ipv4.route_hdr)
2981 goto err_reg;
2982 return 0;
2983
2984 err_reg:
2985 if (tbl != ipv4_route_flush_table)
2986 kfree(tbl);
2987 err_dup:
2988 return -ENOMEM;
2989 }
2990
2991 static __net_exit void sysctl_route_net_exit(struct net *net)
2992 {
2993 struct ctl_table *tbl;
2994
2995 tbl = net->ipv4.route_hdr->ctl_table_arg;
2996 unregister_net_sysctl_table(net->ipv4.route_hdr);
2997 BUG_ON(tbl == ipv4_route_flush_table);
2998 kfree(tbl);
2999 }
3000
3001 static __net_initdata struct pernet_operations sysctl_route_ops = {
3002 .init = sysctl_route_net_init,
3003 .exit = sysctl_route_net_exit,
3004 };
3005 #endif
3006
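/* Initialise the per-netns generation counters; bumping rt_genid or fnhe_genid
 * later invalidates that netns' cached routes and nexthop exceptions.
 */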
3007 static __net_init int rt_genid_init(struct net *net)
3008 {
3009 atomic_set(&net->ipv4.rt_genid, 0);
3010 atomic_set(&net->fnhe_genid, 0);
3011 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3012 return 0;
3013 }
3014
3015 static __net_initdata struct pernet_operations rt_genid_ops = {
3016 .init = rt_genid_init,
3017 };
3018
3019 static int __net_init ipv4_inetpeer_init(struct net *net)
3020 {
3021 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3022
3023 if (!bp)
3024 return -ENOMEM;
3025 inet_peer_base_init(bp);
3026 net->ipv4.peers = bp;
3027 return 0;
3028 }
3029
3030 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3031 {
3032 struct inet_peer_base *bp = net->ipv4.peers;
3033
3034 net->ipv4.peers = NULL;
3035 inetpeer_invalidate_tree(bp);
3036 kfree(bp);
3037 }
3038
3039 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3040 .init = ipv4_inetpeer_init,
3041 .exit = ipv4_inetpeer_exit,
3042 };
3043
3044 #ifdef CONFIG_IP_ROUTE_CLASSID
3045 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3046 #endif /* CONFIG_IP_ROUTE_CLASSID */
3047
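/* Boot-time setup: allocate the IP-ID arrays and per-cpu uncached-route lists,
 * create the dst slab caches, then register proc, netlink (RTM_GETROUTE) and
 * per-netns hooks.
 */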
3048 int __init ip_rt_init(void)
3049 {
3050 int cpu;
3051
3052 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3053 if (!ip_idents)
3054 panic("IP: failed to allocate ip_idents\n");
3055
3056 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3057
3058 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3059 if (!ip_tstamps)
3060 panic("IP: failed to allocate ip_tstamps\n");
3061
3062 for_each_possible_cpu(cpu) {
3063 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3064
3065 INIT_LIST_HEAD(&ul->head);
3066 spin_lock_init(&ul->lock);
3067 }
3068 #ifdef CONFIG_IP_ROUTE_CLASSID
3069 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3070 if (!ip_rt_acct)
3071 panic("IP: failed to allocate ip_rt_acct\n");
3072 #endif
3073
3074 ipv4_dst_ops.kmem_cachep =
3075 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3076 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3077
3078 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3079
3080 if (dst_entries_init(&ipv4_dst_ops) < 0)
3081 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3082
3083 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3084 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3085
3086 ipv4_dst_ops.gc_thresh = ~0;
3087 ip_rt_max_size = INT_MAX;
3088
3089 devinet_init();
3090 ip_fib_init();
3091
3092 if (ip_rt_proc_init())
3093 pr_err("Unable to create route proc files\n");
3094 #ifdef CONFIG_XFRM
3095 xfrm_init();
3096 xfrm4_init();
3097 #endif
3098 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3099 RTNL_FLAG_DOIT_UNLOCKED);
3100
3101 #ifdef CONFIG_SYSCTL
3102 register_pernet_subsys(&sysctl_route_ops);
3103 #endif
3104 register_pernet_subsys(&rt_genid_ops);
3105 register_pernet_subsys(&ipv4_inetpeer_ops);
3106 return 0;
3107 }
3108
3109 #ifdef CONFIG_SYSCTL
3110 /*
3111 * We really need to sanitize the damn ipv4 init order, then all
3112 * this nonsense will go away.
3113 */
3114 void __init ip_static_sysctl_init(void)
3115 {
3116 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3117 }
3118 #endif