net/ipv4/route.c (mirror_ubuntu-bionic-kernel.git, git.proxmox.com)
net: ipv4: use a dedicated counter for icmp_v4 redirect packets
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
39 *
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <linux/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #include <linux/kmemleak.h>
112 #endif
113 #include <net/secure_seq.h>
114 #include <net/ip_tunnels.h>
115 #include <net/l3mdev.h>
116
117 #include "fib_lookup.h"
118
119 #define RT_FL_TOS(oldflp4) \
120 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
121
122 #define RT_GC_TIMEOUT (300*HZ)
123
124 static int ip_rt_max_size;
125 static int ip_rt_redirect_number __read_mostly = 9;
126 static int ip_rt_redirect_load __read_mostly = HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly = HZ;
129 static int ip_rt_error_burst __read_mostly = 5 * HZ;
130 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
131 static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly = 256;
133
134 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
135
136 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
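
/* The time-based knobs above are in jiffies: with the defaults, a learned
 * PMTU exception expires after ten minutes, and the redirect rate limiter
 * resets after (HZ / 50) << 10 jiffies (about 20 seconds) of quiet.
 * ip_rt_min_pmtu is in bytes: 512 bytes of payload plus a 20 byte IP
 * header and a 20 byte TCP header, i.e. 552.
 */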
137
138 /*
139 * Interface to generic destination cache.
140 */
141
142 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
143 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
144 static unsigned int ipv4_mtu(const struct dst_entry *dst);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void ipv4_link_failure(struct sk_buff *skb);
147 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
148 struct sk_buff *skb, u32 mtu);
149 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
150 struct sk_buff *skb);
151 static void ipv4_dst_destroy(struct dst_entry *dst);
152
153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154 {
155 WARN_ON(1);
156 return NULL;
157 }
158
159 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
160 struct sk_buff *skb,
161 const void *daddr);
162 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
163
164 static struct dst_ops ipv4_dst_ops = {
165 .family = AF_INET,
166 .check = ipv4_dst_check,
167 .default_advmss = ipv4_default_advmss,
168 .mtu = ipv4_mtu,
169 .cow_metrics = ipv4_cow_metrics,
170 .destroy = ipv4_dst_destroy,
171 .negative_advice = ipv4_negative_advice,
172 .link_failure = ipv4_link_failure,
173 .update_pmtu = ip_rt_update_pmtu,
174 .redirect = ip_do_redirect,
175 .local_out = __ip_local_out,
176 .neigh_lookup = ipv4_neigh_lookup,
177 .confirm_neigh = ipv4_confirm_neigh,
178 };
179
180 #define ECN_OR_COST(class) TC_PRIO_##class
181
182 const __u8 ip_tos2prio[16] = {
183 TC_PRIO_BESTEFFORT,
184 ECN_OR_COST(BESTEFFORT),
185 TC_PRIO_BESTEFFORT,
186 ECN_OR_COST(BESTEFFORT),
187 TC_PRIO_BULK,
188 ECN_OR_COST(BULK),
189 TC_PRIO_BULK,
190 ECN_OR_COST(BULK),
191 TC_PRIO_INTERACTIVE,
192 ECN_OR_COST(INTERACTIVE),
193 TC_PRIO_INTERACTIVE,
194 ECN_OR_COST(INTERACTIVE),
195 TC_PRIO_INTERACTIVE_BULK,
196 ECN_OR_COST(INTERACTIVE_BULK),
197 TC_PRIO_INTERACTIVE_BULK,
198 ECN_OR_COST(INTERACTIVE_BULK)
199 };
200 EXPORT_SYMBOL(ip_tos2prio);
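
/* The table above is indexed by the IPv4 TOS bits shifted right by one
 * (see rt_tos2priority()), so, for instance, a socket sending with
 * IPTOS_LOWDELAY (0x10) selects entry 8, TC_PRIO_INTERACTIVE.
 */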
201
202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
203 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
204
205 #ifdef CONFIG_PROC_FS
206 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
207 {
208 if (*pos)
209 return NULL;
210 return SEQ_START_TOKEN;
211 }
212
213 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
214 {
215 ++*pos;
216 return NULL;
217 }
218
219 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
220 {
221 }
222
223 static int rt_cache_seq_show(struct seq_file *seq, void *v)
224 {
225 if (v == SEQ_START_TOKEN)
226 seq_printf(seq, "%-127s\n",
227 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
228 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
229 "HHUptod\tSpecDst");
230 return 0;
231 }
232
233 static const struct seq_operations rt_cache_seq_ops = {
234 .start = rt_cache_seq_start,
235 .next = rt_cache_seq_next,
236 .stop = rt_cache_seq_stop,
237 .show = rt_cache_seq_show,
238 };
239
240 static int rt_cache_seq_open(struct inode *inode, struct file *file)
241 {
242 return seq_open(file, &rt_cache_seq_ops);
243 }
244
245 static const struct file_operations rt_cache_seq_fops = {
246 .owner = THIS_MODULE,
247 .open = rt_cache_seq_open,
248 .read = seq_read,
249 .llseek = seq_lseek,
250 .release = seq_release,
251 };
252
253
254 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
255 {
256 int cpu;
257
258 if (*pos == 0)
259 return SEQ_START_TOKEN;
260
261 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
262 if (!cpu_possible(cpu))
263 continue;
264 *pos = cpu+1;
265 return &per_cpu(rt_cache_stat, cpu);
266 }
267 return NULL;
268 }
269
270 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
271 {
272 int cpu;
273
274 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
275 if (!cpu_possible(cpu))
276 continue;
277 *pos = cpu+1;
278 return &per_cpu(rt_cache_stat, cpu);
279 }
280 return NULL;
281
282 }
283
284 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
285 {
286
287 }
288
289 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
290 {
291 struct rt_cache_stat *st = v;
292
293 if (v == SEQ_START_TOKEN) {
294 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
295 return 0;
296 }
297
298 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
299 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
300 dst_entries_get_slow(&ipv4_dst_ops),
301 0, /* st->in_hit */
302 st->in_slow_tot,
303 st->in_slow_mc,
304 st->in_no_route,
305 st->in_brd,
306 st->in_martian_dst,
307 st->in_martian_src,
308
309 0, /* st->out_hit */
310 st->out_slow_tot,
311 st->out_slow_mc,
312
313 0, /* st->gc_total */
314 0, /* st->gc_ignored */
315 0, /* st->gc_goal_miss */
316 0, /* st->gc_dst_overflow */
317 0, /* st->in_hlist_search */
318 0 /* st->out_hlist_search */
319 );
320 return 0;
321 }
322
323 static const struct seq_operations rt_cpu_seq_ops = {
324 .start = rt_cpu_seq_start,
325 .next = rt_cpu_seq_next,
326 .stop = rt_cpu_seq_stop,
327 .show = rt_cpu_seq_show,
328 };
329
330
331 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
332 {
333 return seq_open(file, &rt_cpu_seq_ops);
334 }
335
336 static const struct file_operations rt_cpu_seq_fops = {
337 .owner = THIS_MODULE,
338 .open = rt_cpu_seq_open,
339 .read = seq_read,
340 .llseek = seq_lseek,
341 .release = seq_release,
342 };
343
344 #ifdef CONFIG_IP_ROUTE_CLASSID
345 static int rt_acct_proc_show(struct seq_file *m, void *v)
346 {
347 struct ip_rt_acct *dst, *src;
348 unsigned int i, j;
349
350 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
351 if (!dst)
352 return -ENOMEM;
353
354 for_each_possible_cpu(i) {
355 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
356 for (j = 0; j < 256; j++) {
357 dst[j].o_bytes += src[j].o_bytes;
358 dst[j].o_packets += src[j].o_packets;
359 dst[j].i_bytes += src[j].i_bytes;
360 dst[j].i_packets += src[j].i_packets;
361 }
362 }
363
364 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
365 kfree(dst);
366 return 0;
367 }
368
369 static int rt_acct_proc_open(struct inode *inode, struct file *file)
370 {
371 return single_open(file, rt_acct_proc_show, NULL);
372 }
373
374 static const struct file_operations rt_acct_proc_fops = {
375 .owner = THIS_MODULE,
376 .open = rt_acct_proc_open,
377 .read = seq_read,
378 .llseek = seq_lseek,
379 .release = single_release,
380 };
381 #endif
382
383 static int __net_init ip_rt_do_proc_init(struct net *net)
384 {
385 struct proc_dir_entry *pde;
386
387 pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
388 &rt_cache_seq_fops);
389 if (!pde)
390 goto err1;
391
392 pde = proc_create("rt_cache", S_IRUGO,
393 net->proc_net_stat, &rt_cpu_seq_fops);
394 if (!pde)
395 goto err2;
396
397 #ifdef CONFIG_IP_ROUTE_CLASSID
398 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
399 if (!pde)
400 goto err3;
401 #endif
402 return 0;
403
404 #ifdef CONFIG_IP_ROUTE_CLASSID
405 err3:
406 remove_proc_entry("rt_cache", net->proc_net_stat);
407 #endif
408 err2:
409 remove_proc_entry("rt_cache", net->proc_net);
410 err1:
411 return -ENOMEM;
412 }
413
414 static void __net_exit ip_rt_do_proc_exit(struct net *net)
415 {
416 remove_proc_entry("rt_cache", net->proc_net_stat);
417 remove_proc_entry("rt_cache", net->proc_net);
418 #ifdef CONFIG_IP_ROUTE_CLASSID
419 remove_proc_entry("rt_acct", net->proc_net);
420 #endif
421 }
422
423 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
424 .init = ip_rt_do_proc_init,
425 .exit = ip_rt_do_proc_exit,
426 };
427
428 static int __init ip_rt_proc_init(void)
429 {
430 return register_pernet_subsys(&ip_rt_proc_ops);
431 }
432
433 #else
434 static inline int ip_rt_proc_init(void)
435 {
436 return 0;
437 }
438 #endif /* CONFIG_PROC_FS */
439
440 static inline bool rt_is_expired(const struct rtable *rth)
441 {
442 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
443 }
444
445 void rt_cache_flush(struct net *net)
446 {
447 rt_genid_bump_ipv4(net);
448 }
449
450 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
451 struct sk_buff *skb,
452 const void *daddr)
453 {
454 struct net_device *dev = dst->dev;
455 const __be32 *pkey = daddr;
456 const struct rtable *rt;
457 struct neighbour *n;
458
459 rt = (const struct rtable *) dst;
460 if (rt->rt_gateway)
461 pkey = (const __be32 *) &rt->rt_gateway;
462 else if (skb)
463 pkey = &ip_hdr(skb)->daddr;
464
465 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
466 if (n)
467 return n;
468 return neigh_create(&arp_tbl, pkey, dev);
469 }
470
471 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
472 {
473 struct net_device *dev = dst->dev;
474 const __be32 *pkey = daddr;
475 const struct rtable *rt;
476
477 rt = (const struct rtable *)dst;
478 if (rt->rt_gateway)
479 pkey = (const __be32 *)&rt->rt_gateway;
480 else if (!daddr ||
481 (rt->rt_flags &
482 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
483 return;
484
485 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
486 }
487
488 #define IP_IDENTS_SZ 2048u
489
490 static atomic_t *ip_idents __read_mostly;
491 static u32 *ip_tstamps __read_mostly;
492
493 /* In order to protect privacy, we add a perturbation to identifiers
494  * if one generator is seldom used. This makes it hard for an attacker
495 * to infer how many packets were sent between two points in time.
496 */
497 u32 ip_idents_reserve(u32 hash, int segs)
498 {
499 u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
500 atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
501 u32 old = READ_ONCE(*p_tstamp);
502 u32 now = (u32)jiffies;
503 u32 new, delta = 0;
504
505 if (old != now && cmpxchg(p_tstamp, old, now) == old)
506 delta = prandom_u32_max(now - old);
507
508 /* Do not use atomic_add_return() as it makes UBSAN unhappy */
509 do {
510 old = (u32)atomic_read(p_id);
511 new = old + delta + segs;
512 } while (atomic_cmpxchg(p_id, old, new) != old);
513
514 return new - segs;
515 }
516 EXPORT_SYMBOL(ip_idents_reserve);
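
/* For example, if a bucket has not been used for 1000 jiffies, the next
 * reservation adds a random offset in [0, 1000) on top of the requested
 * segment count, so IDs sampled at two points in time do not reveal how
 * many packets the host really sent in between.
 */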
517
518 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
519 {
520 static u32 ip_idents_hashrnd __read_mostly;
521 u32 hash, id;
522
523 net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
524
525 hash = jhash_3words((__force u32)iph->daddr,
526 (__force u32)iph->saddr,
527 iph->protocol ^ net_hash_mix(net),
528 ip_idents_hashrnd);
529 id = ip_idents_reserve(hash, segs);
530 iph->id = htons(id);
531 }
532 EXPORT_SYMBOL(__ip_select_ident);
533
534 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
535 const struct sock *sk,
536 const struct iphdr *iph,
537 int oif, u8 tos,
538 u8 prot, u32 mark, int flow_flags)
539 {
540 if (sk) {
541 const struct inet_sock *inet = inet_sk(sk);
542
543 oif = sk->sk_bound_dev_if;
544 mark = sk->sk_mark;
545 tos = RT_CONN_FLAGS(sk);
546 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
547 }
548 flowi4_init_output(fl4, oif, mark, tos,
549 RT_SCOPE_UNIVERSE, prot,
550 flow_flags,
551 iph->daddr, iph->saddr, 0, 0,
552 sock_net_uid(net, sk));
553 }
554
555 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
556 const struct sock *sk)
557 {
558 const struct net *net = dev_net(skb->dev);
559 const struct iphdr *iph = ip_hdr(skb);
560 int oif = skb->dev->ifindex;
561 u8 tos = RT_TOS(iph->tos);
562 u8 prot = iph->protocol;
563 u32 mark = skb->mark;
564
565 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
566 }
567
568 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
569 {
570 const struct inet_sock *inet = inet_sk(sk);
571 const struct ip_options_rcu *inet_opt;
572 __be32 daddr = inet->inet_daddr;
573
574 rcu_read_lock();
575 inet_opt = rcu_dereference(inet->inet_opt);
576 if (inet_opt && inet_opt->opt.srr)
577 daddr = inet_opt->opt.faddr;
578 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
579 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
580 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
581 inet_sk_flowi_flags(sk),
582 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
583 rcu_read_unlock();
584 }
585
586 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
587 const struct sk_buff *skb)
588 {
589 if (skb)
590 build_skb_flow_key(fl4, skb, sk);
591 else
592 build_sk_flow_key(fl4, sk);
593 }
594
595 static DEFINE_SPINLOCK(fnhe_lock);
596
597 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
598 {
599 struct rtable *rt;
600
601 rt = rcu_dereference(fnhe->fnhe_rth_input);
602 if (rt) {
603 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
604 dst_dev_put(&rt->dst);
605 dst_release(&rt->dst);
606 }
607 rt = rcu_dereference(fnhe->fnhe_rth_output);
608 if (rt) {
609 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
610 dst_dev_put(&rt->dst);
611 dst_release(&rt->dst);
612 }
613 }
614
615 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
616 {
617 struct fib_nh_exception *fnhe, *oldest;
618
619 oldest = rcu_dereference(hash->chain);
620 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
621 fnhe = rcu_dereference(fnhe->fnhe_next)) {
622 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
623 oldest = fnhe;
624 }
625 fnhe_flush_routes(oldest);
626 return oldest;
627 }
628
629 static inline u32 fnhe_hashfun(__be32 daddr)
630 {
631 static u32 fnhe_hashrnd __read_mostly;
632 u32 hval;
633
634 net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
635 hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
636 return hash_32(hval, FNHE_HASH_SHIFT);
637 }
638
639 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
640 {
641 rt->rt_pmtu = fnhe->fnhe_pmtu;
642 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
643 rt->dst.expires = fnhe->fnhe_expires;
644
645 if (fnhe->fnhe_gw) {
646 rt->rt_flags |= RTCF_REDIRECTED;
647 rt->rt_gateway = fnhe->fnhe_gw;
648 rt->rt_uses_gateway = 1;
649 }
650 }
651
652 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
653 u32 pmtu, bool lock, unsigned long expires)
654 {
655 struct fnhe_hash_bucket *hash;
656 struct fib_nh_exception *fnhe;
657 struct rtable *rt;
658 u32 genid, hval;
659 unsigned int i;
660 int depth;
661
662 genid = fnhe_genid(dev_net(nh->nh_dev));
663 hval = fnhe_hashfun(daddr);
664
665 spin_lock_bh(&fnhe_lock);
666
667 hash = rcu_dereference(nh->nh_exceptions);
668 if (!hash) {
669 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
670 if (!hash)
671 goto out_unlock;
672 rcu_assign_pointer(nh->nh_exceptions, hash);
673 }
674
675 hash += hval;
676
677 depth = 0;
678 for (fnhe = rcu_dereference(hash->chain); fnhe;
679 fnhe = rcu_dereference(fnhe->fnhe_next)) {
680 if (fnhe->fnhe_daddr == daddr)
681 break;
682 depth++;
683 }
684
685 if (fnhe) {
686 if (fnhe->fnhe_genid != genid)
687 fnhe->fnhe_genid = genid;
688 if (gw)
689 fnhe->fnhe_gw = gw;
690 if (pmtu) {
691 fnhe->fnhe_pmtu = pmtu;
692 fnhe->fnhe_mtu_locked = lock;
693 }
694 fnhe->fnhe_expires = max(1UL, expires);
695 /* Update all cached dsts too */
696 rt = rcu_dereference(fnhe->fnhe_rth_input);
697 if (rt)
698 fill_route_from_fnhe(rt, fnhe);
699 rt = rcu_dereference(fnhe->fnhe_rth_output);
700 if (rt)
701 fill_route_from_fnhe(rt, fnhe);
702 } else {
703 if (depth > FNHE_RECLAIM_DEPTH)
704 fnhe = fnhe_oldest(hash);
705 else {
706 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
707 if (!fnhe)
708 goto out_unlock;
709
710 fnhe->fnhe_next = hash->chain;
711 rcu_assign_pointer(hash->chain, fnhe);
712 }
713 fnhe->fnhe_genid = genid;
714 fnhe->fnhe_daddr = daddr;
715 fnhe->fnhe_gw = gw;
716 fnhe->fnhe_pmtu = pmtu;
717 fnhe->fnhe_mtu_locked = lock;
718 fnhe->fnhe_expires = max(1UL, expires);
719
720 /* Exception created; mark the cached routes for the nexthop
721 * stale, so anyone caching it rechecks if this exception
722 * applies to them.
723 */
724 rt = rcu_dereference(nh->nh_rth_input);
725 if (rt)
726 rt->dst.obsolete = DST_OBSOLETE_KILL;
727
728 for_each_possible_cpu(i) {
729 struct rtable __rcu **prt;
730 prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
731 rt = rcu_dereference(*prt);
732 if (rt)
733 rt->dst.obsolete = DST_OBSOLETE_KILL;
734 }
735 }
736
737 fnhe->fnhe_stamp = jiffies;
738
739 out_unlock:
740 spin_unlock_bh(&fnhe_lock);
741 }
742
743 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
744 bool kill_route)
745 {
746 __be32 new_gw = icmp_hdr(skb)->un.gateway;
747 __be32 old_gw = ip_hdr(skb)->saddr;
748 struct net_device *dev = skb->dev;
749 struct in_device *in_dev;
750 struct fib_result res;
751 struct neighbour *n;
752 struct net *net;
753
754 switch (icmp_hdr(skb)->code & 7) {
755 case ICMP_REDIR_NET:
756 case ICMP_REDIR_NETTOS:
757 case ICMP_REDIR_HOST:
758 case ICMP_REDIR_HOSTTOS:
759 break;
760
761 default:
762 return;
763 }
764
765 if (rt->rt_gateway != old_gw)
766 return;
767
768 in_dev = __in_dev_get_rcu(dev);
769 if (!in_dev)
770 return;
771
772 net = dev_net(dev);
773 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
774 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
775 ipv4_is_zeronet(new_gw))
776 goto reject_redirect;
777
778 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
779 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
780 goto reject_redirect;
781 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
782 goto reject_redirect;
783 } else {
784 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
785 goto reject_redirect;
786 }
787
788 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
789 if (!n)
790 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
791 if (!IS_ERR(n)) {
792 if (!(n->nud_state & NUD_VALID)) {
793 neigh_event_send(n, NULL);
794 } else {
795 if (fib_lookup(net, fl4, &res, 0) == 0) {
796 struct fib_nh *nh = &FIB_RES_NH(res);
797
798 update_or_create_fnhe(nh, fl4->daddr, new_gw,
799 0, false,
800 jiffies + ip_rt_gc_timeout);
801 }
802 if (kill_route)
803 rt->dst.obsolete = DST_OBSOLETE_KILL;
804 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
805 }
806 neigh_release(n);
807 }
808 return;
809
810 reject_redirect:
811 #ifdef CONFIG_IP_ROUTE_VERBOSE
812 if (IN_DEV_LOG_MARTIANS(in_dev)) {
813 const struct iphdr *iph = (const struct iphdr *) skb->data;
814 __be32 daddr = iph->daddr;
815 __be32 saddr = iph->saddr;
816
817 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
818 " Advised path = %pI4 -> %pI4\n",
819 &old_gw, dev->name, &new_gw,
820 &saddr, &daddr);
821 }
822 #endif
823 ;
824 }
825
826 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
827 {
828 struct rtable *rt;
829 struct flowi4 fl4;
830 const struct iphdr *iph = (const struct iphdr *) skb->data;
831 struct net *net = dev_net(skb->dev);
832 int oif = skb->dev->ifindex;
833 u8 tos = RT_TOS(iph->tos);
834 u8 prot = iph->protocol;
835 u32 mark = skb->mark;
836
837 rt = (struct rtable *) dst;
838
839 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
840 __ip_do_redirect(rt, skb, &fl4, true);
841 }
842
843 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
844 {
845 struct rtable *rt = (struct rtable *)dst;
846 struct dst_entry *ret = dst;
847
848 if (rt) {
849 if (dst->obsolete > 0) {
850 ip_rt_put(rt);
851 ret = NULL;
852 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
853 rt->dst.expires) {
854 ip_rt_put(rt);
855 ret = NULL;
856 }
857 }
858 return ret;
859 }
860
861 /*
862 * Algorithm:
863 * 1. The first ip_rt_redirect_number redirects are sent
864 * with exponential backoff, then we stop sending them at all,
865 * assuming that the host ignores our redirects.
866 * 2. If we did not see packets requiring redirects
867 * during ip_rt_redirect_silence, we assume that the host
868  * forgot the redirected route and start sending redirects again.
869 *
870 * This algorithm is much cheaper and more intelligent than dumb load limiting
871 * in icmp.c.
872 *
873 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
874 * and "frag. need" (breaks PMTU discovery) in icmp.c.
875 */
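
/* With the defaults above (ip_rt_redirect_load = HZ/50,
 * ip_rt_redirect_number = 9), each further redirect to a peer is only sent
 * once (HZ/50) << rate_tokens jiffies have elapsed since the previous one,
 * so the gaps double from about 40 ms up to roughly 5 s; after nine
 * redirects we stop completely until ip_rt_redirect_silence (about 20 s)
 * passes without any packet that would need redirecting.
 */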
876
877 void ip_rt_send_redirect(struct sk_buff *skb)
878 {
879 struct rtable *rt = skb_rtable(skb);
880 struct in_device *in_dev;
881 struct inet_peer *peer;
882 struct net *net;
883 int log_martians;
884 int vif;
885
886 rcu_read_lock();
887 in_dev = __in_dev_get_rcu(rt->dst.dev);
888 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
889 rcu_read_unlock();
890 return;
891 }
892 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
893 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
894 rcu_read_unlock();
895
896 net = dev_net(rt->dst.dev);
897 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
898 if (!peer) {
899 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
900 rt_nexthop(rt, ip_hdr(skb)->daddr));
901 return;
902 }
903
904 /* No redirected packets during ip_rt_redirect_silence;
905 * reset the algorithm.
906 */
907 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
908 peer->rate_tokens = 0;
909 peer->n_redirects = 0;
910 }
911
912  /* Too many ignored redirects; do not send anything.
913  * Set peer->rate_last to the last seen redirected packet.
914 */
915 if (peer->n_redirects >= ip_rt_redirect_number) {
916 peer->rate_last = jiffies;
917 goto out_put_peer;
918 }
919
920 /* Check for load limit; set rate_last to the latest sent
921 * redirect.
922 */
923 if (peer->rate_tokens == 0 ||
924 time_after(jiffies,
925 (peer->rate_last +
926 (ip_rt_redirect_load << peer->rate_tokens)))) {
927 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
928
929 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
930 peer->rate_last = jiffies;
931 ++peer->rate_tokens;
932 ++peer->n_redirects;
933 #ifdef CONFIG_IP_ROUTE_VERBOSE
934 if (log_martians &&
935 peer->rate_tokens == ip_rt_redirect_number)
936 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
937 &ip_hdr(skb)->saddr, inet_iif(skb),
938 &ip_hdr(skb)->daddr, &gw);
939 #endif
940 }
941 out_put_peer:
942 inet_putpeer(peer);
943 }
944
945 static int ip_error(struct sk_buff *skb)
946 {
947 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
948 struct rtable *rt = skb_rtable(skb);
949 struct inet_peer *peer;
950 unsigned long now;
951 struct net *net;
952 bool send;
953 int code;
954
955 /* IP on this device is disabled. */
956 if (!in_dev)
957 goto out;
958
959 net = dev_net(rt->dst.dev);
960 if (!IN_DEV_FORWARD(in_dev)) {
961 switch (rt->dst.error) {
962 case EHOSTUNREACH:
963 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
964 break;
965
966 case ENETUNREACH:
967 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
968 break;
969 }
970 goto out;
971 }
972
973 switch (rt->dst.error) {
974 case EINVAL:
975 default:
976 goto out;
977 case EHOSTUNREACH:
978 code = ICMP_HOST_UNREACH;
979 break;
980 case ENETUNREACH:
981 code = ICMP_NET_UNREACH;
982 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
983 break;
984 case EACCES:
985 code = ICMP_PKT_FILTERED;
986 break;
987 }
988
989 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
990 l3mdev_master_ifindex(skb->dev), 1);
991
992 send = true;
993 if (peer) {
994 now = jiffies;
995 peer->rate_tokens += now - peer->rate_last;
996 if (peer->rate_tokens > ip_rt_error_burst)
997 peer->rate_tokens = ip_rt_error_burst;
998 peer->rate_last = now;
999 if (peer->rate_tokens >= ip_rt_error_cost)
1000 peer->rate_tokens -= ip_rt_error_cost;
1001 else
1002 send = false;
1003 inet_putpeer(peer);
1004 }
1005 if (send)
1006 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1007
1008 out: kfree_skb(skb);
1009 return 0;
1010 }
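
/* The inet_peer token bucket above earns one token per jiffy, capped at
 * ip_rt_error_burst (5 * HZ), and every ICMP error sent costs
 * ip_rt_error_cost (HZ) tokens, so a single source is limited to bursts
 * of about five errors and a sustained rate of roughly one per second.
 */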
1011
1012 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1013 {
1014 struct dst_entry *dst = &rt->dst;
1015 u32 old_mtu = ipv4_mtu(dst);
1016 struct fib_result res;
1017 bool lock = false;
1018
1019 if (ip_mtu_locked(dst))
1020 return;
1021
1022 if (old_mtu < mtu)
1023 return;
1024
1025 if (mtu < ip_rt_min_pmtu) {
1026 lock = true;
1027 mtu = min(old_mtu, ip_rt_min_pmtu);
1028 }
1029
1030 if (rt->rt_pmtu == mtu && !lock &&
1031 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1032 return;
1033
1034 rcu_read_lock();
1035 if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1036 struct fib_nh *nh = &FIB_RES_NH(res);
1037
1038 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1039 jiffies + ip_rt_mtu_expires);
1040 }
1041 rcu_read_unlock();
1042 }
1043
1044 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1045 struct sk_buff *skb, u32 mtu)
1046 {
1047 struct rtable *rt = (struct rtable *) dst;
1048 struct flowi4 fl4;
1049
1050 ip_rt_build_flow_key(&fl4, sk, skb);
1051 __ip_rt_update_pmtu(rt, &fl4, mtu);
1052 }
1053
1054 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1055 int oif, u32 mark, u8 protocol, int flow_flags)
1056 {
1057 const struct iphdr *iph = (const struct iphdr *) skb->data;
1058 struct flowi4 fl4;
1059 struct rtable *rt;
1060
1061 if (!mark)
1062 mark = IP4_REPLY_MARK(net, skb->mark);
1063
1064 __build_flow_key(net, &fl4, NULL, iph, oif,
1065 RT_TOS(iph->tos), protocol, mark, flow_flags);
1066 rt = __ip_route_output_key(net, &fl4);
1067 if (!IS_ERR(rt)) {
1068 __ip_rt_update_pmtu(rt, &fl4, mtu);
1069 ip_rt_put(rt);
1070 }
1071 }
1072 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
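
/* Minimal usage sketch (hypothetical caller, not taken from this file):
 * a tunnel driver's ICMP error handler that receives ICMP_FRAG_NEEDED for
 * one of its encapsulated packets could simply do
 *
 *	ipv4_update_pmtu(skb, dev_net(skb->dev), info, 0, 0, ipproto, 0);
 *
 * where "info" is the MTU advertised in the ICMP header and "ipproto" is
 * the outer protocol. The helper rebuilds the flow, performs a route
 * lookup and records the lowered MTU as a nexthop exception through
 * __ip_rt_update_pmtu() above.
 */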
1073
1074 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1075 {
1076 const struct iphdr *iph = (const struct iphdr *) skb->data;
1077 struct flowi4 fl4;
1078 struct rtable *rt;
1079
1080 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1081
1082 if (!fl4.flowi4_mark)
1083 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1084
1085 rt = __ip_route_output_key(sock_net(sk), &fl4);
1086 if (!IS_ERR(rt)) {
1087 __ip_rt_update_pmtu(rt, &fl4, mtu);
1088 ip_rt_put(rt);
1089 }
1090 }
1091
1092 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1093 {
1094 const struct iphdr *iph = (const struct iphdr *) skb->data;
1095 struct flowi4 fl4;
1096 struct rtable *rt;
1097 struct dst_entry *odst = NULL;
1098 bool new = false;
1099 struct net *net = sock_net(sk);
1100
1101 bh_lock_sock(sk);
1102
1103 if (!ip_sk_accept_pmtu(sk))
1104 goto out;
1105
1106 odst = sk_dst_get(sk);
1107
1108 if (sock_owned_by_user(sk) || !odst) {
1109 __ipv4_sk_update_pmtu(skb, sk, mtu);
1110 goto out;
1111 }
1112
1113 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1114
1115 rt = (struct rtable *)odst;
1116 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1117 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1118 if (IS_ERR(rt))
1119 goto out;
1120
1121 new = true;
1122 }
1123
1124 __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1125
1126 if (!dst_check(&rt->dst, 0)) {
1127 if (new)
1128 dst_release(&rt->dst);
1129
1130 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1131 if (IS_ERR(rt))
1132 goto out;
1133
1134 new = true;
1135 }
1136
1137 if (new)
1138 sk_dst_set(sk, &rt->dst);
1139
1140 out:
1141 bh_unlock_sock(sk);
1142 dst_release(odst);
1143 }
1144 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1145
1146 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1147 int oif, u32 mark, u8 protocol, int flow_flags)
1148 {
1149 const struct iphdr *iph = (const struct iphdr *) skb->data;
1150 struct flowi4 fl4;
1151 struct rtable *rt;
1152
1153 __build_flow_key(net, &fl4, NULL, iph, oif,
1154 RT_TOS(iph->tos), protocol, mark, flow_flags);
1155 rt = __ip_route_output_key(net, &fl4);
1156 if (!IS_ERR(rt)) {
1157 __ip_do_redirect(rt, skb, &fl4, false);
1158 ip_rt_put(rt);
1159 }
1160 }
1161 EXPORT_SYMBOL_GPL(ipv4_redirect);
1162
1163 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1164 {
1165 const struct iphdr *iph = (const struct iphdr *) skb->data;
1166 struct flowi4 fl4;
1167 struct rtable *rt;
1168 struct net *net = sock_net(sk);
1169
1170 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1171 rt = __ip_route_output_key(net, &fl4);
1172 if (!IS_ERR(rt)) {
1173 __ip_do_redirect(rt, skb, &fl4, false);
1174 ip_rt_put(rt);
1175 }
1176 }
1177 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1178
1179 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1180 {
1181 struct rtable *rt = (struct rtable *) dst;
1182
1183 /* All IPV4 dsts are created with ->obsolete set to the value
1184 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1185 * into this function always.
1186 *
1187 * When a PMTU/redirect information update invalidates a route,
1188 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1189 * DST_OBSOLETE_DEAD by dst_free().
1190 */
1191 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1192 return NULL;
1193 return dst;
1194 }
1195
1196 static void ipv4_link_failure(struct sk_buff *skb)
1197 {
1198 struct rtable *rt;
1199
1200 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1201
1202 rt = skb_rtable(skb);
1203 if (rt)
1204 dst_set_expires(&rt->dst, 0);
1205 }
1206
1207 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1208 {
1209 pr_debug("%s: %pI4 -> %pI4, %s\n",
1210 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1211 skb->dev ? skb->dev->name : "?");
1212 kfree_skb(skb);
1213 WARN_ON(1);
1214 return 0;
1215 }
1216
1217 /*
1218 	We do not cache the source address of the outgoing interface,
1219 	because it is used only by the IP RR, TS and SRR options,
1220 	so it is out of the fast path.
1221
1222 	BTW remember: "addr" is allowed to be unaligned
1223 in IP options!
1224 */
1225
1226 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1227 {
1228 __be32 src;
1229
1230 if (rt_is_output_route(rt))
1231 src = ip_hdr(skb)->saddr;
1232 else {
1233 struct fib_result res;
1234 struct flowi4 fl4;
1235 struct iphdr *iph;
1236
1237 iph = ip_hdr(skb);
1238
1239 memset(&fl4, 0, sizeof(fl4));
1240 fl4.daddr = iph->daddr;
1241 fl4.saddr = iph->saddr;
1242 fl4.flowi4_tos = RT_TOS(iph->tos);
1243 fl4.flowi4_oif = rt->dst.dev->ifindex;
1244 fl4.flowi4_iif = skb->dev->ifindex;
1245 fl4.flowi4_mark = skb->mark;
1246
1247 rcu_read_lock();
1248 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1249 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1250 else
1251 src = inet_select_addr(rt->dst.dev,
1252 rt_nexthop(rt, iph->daddr),
1253 RT_SCOPE_UNIVERSE);
1254 rcu_read_unlock();
1255 }
1256 memcpy(addr, &src, 4);
1257 }
1258
1259 #ifdef CONFIG_IP_ROUTE_CLASSID
1260 static void set_class_tag(struct rtable *rt, u32 tag)
1261 {
1262 if (!(rt->dst.tclassid & 0xFFFF))
1263 rt->dst.tclassid |= tag & 0xFFFF;
1264 if (!(rt->dst.tclassid & 0xFFFF0000))
1265 rt->dst.tclassid |= tag & 0xFFFF0000;
1266 }
1267 #endif
1268
1269 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1270 {
1271 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1272 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1273 ip_rt_min_advmss);
1274
1275 return min(advmss, IPV4_MAX_PMTU - header_size);
1276 }
1277
1278 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1279 {
1280 const struct rtable *rt = (const struct rtable *) dst;
1281 unsigned int mtu = rt->rt_pmtu;
1282
1283 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1284 mtu = dst_metric_raw(dst, RTAX_MTU);
1285
1286 if (mtu)
1287 return mtu;
1288
1289 mtu = READ_ONCE(dst->dev->mtu);
1290
1291 if (unlikely(ip_mtu_locked(dst))) {
1292 if (rt->rt_uses_gateway && mtu > 576)
1293 mtu = 576;
1294 }
1295
1296 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1297
1298 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1299 }
1300
1301 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1302 {
1303 struct fnhe_hash_bucket *hash;
1304 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1305 u32 hval = fnhe_hashfun(daddr);
1306
1307 spin_lock_bh(&fnhe_lock);
1308
1309 hash = rcu_dereference_protected(nh->nh_exceptions,
1310 lockdep_is_held(&fnhe_lock));
1311 hash += hval;
1312
1313 fnhe_p = &hash->chain;
1314 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1315 while (fnhe) {
1316 if (fnhe->fnhe_daddr == daddr) {
1317 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1318 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1319 fnhe_flush_routes(fnhe);
1320 kfree_rcu(fnhe, rcu);
1321 break;
1322 }
1323 fnhe_p = &fnhe->fnhe_next;
1324 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1325 lockdep_is_held(&fnhe_lock));
1326 }
1327
1328 spin_unlock_bh(&fnhe_lock);
1329 }
1330
1331 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1332 {
1333 struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1334 struct fib_nh_exception *fnhe;
1335 u32 hval;
1336
1337 if (!hash)
1338 return NULL;
1339
1340 hval = fnhe_hashfun(daddr);
1341
1342 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1343 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1344 if (fnhe->fnhe_daddr == daddr) {
1345 if (fnhe->fnhe_expires &&
1346 time_after(jiffies, fnhe->fnhe_expires)) {
1347 ip_del_fnhe(nh, daddr);
1348 break;
1349 }
1350 return fnhe;
1351 }
1352 }
1353 return NULL;
1354 }
1355
1356 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1357 __be32 daddr, const bool do_cache)
1358 {
1359 bool ret = false;
1360
1361 spin_lock_bh(&fnhe_lock);
1362
1363 if (daddr == fnhe->fnhe_daddr) {
1364 struct rtable __rcu **porig;
1365 struct rtable *orig;
1366 int genid = fnhe_genid(dev_net(rt->dst.dev));
1367
1368 if (rt_is_input_route(rt))
1369 porig = &fnhe->fnhe_rth_input;
1370 else
1371 porig = &fnhe->fnhe_rth_output;
1372 orig = rcu_dereference(*porig);
1373
1374 if (fnhe->fnhe_genid != genid) {
1375 fnhe->fnhe_genid = genid;
1376 fnhe->fnhe_gw = 0;
1377 fnhe->fnhe_pmtu = 0;
1378 fnhe->fnhe_expires = 0;
1379 fnhe_flush_routes(fnhe);
1380 orig = NULL;
1381 }
1382 fill_route_from_fnhe(rt, fnhe);
1383 if (!rt->rt_gateway)
1384 rt->rt_gateway = daddr;
1385
1386 if (do_cache) {
1387 dst_hold(&rt->dst);
1388 rcu_assign_pointer(*porig, rt);
1389 if (orig) {
1390 dst_dev_put(&orig->dst);
1391 dst_release(&orig->dst);
1392 }
1393 ret = true;
1394 }
1395
1396 fnhe->fnhe_stamp = jiffies;
1397 }
1398 spin_unlock_bh(&fnhe_lock);
1399
1400 return ret;
1401 }
1402
1403 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1404 {
1405 struct rtable *orig, *prev, **p;
1406 bool ret = true;
1407
1408 if (rt_is_input_route(rt)) {
1409 p = (struct rtable **)&nh->nh_rth_input;
1410 } else {
1411 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1412 }
1413 orig = *p;
1414
1415 /* hold dst before doing cmpxchg() to avoid race condition
1416 * on this dst
1417 */
1418 dst_hold(&rt->dst);
1419 prev = cmpxchg(p, orig, rt);
1420 if (prev == orig) {
1421 if (orig) {
1422 dst_dev_put(&orig->dst);
1423 dst_release(&orig->dst);
1424 }
1425 } else {
1426 dst_release(&rt->dst);
1427 ret = false;
1428 }
1429
1430 return ret;
1431 }
1432
1433 struct uncached_list {
1434 spinlock_t lock;
1435 struct list_head head;
1436 };
1437
1438 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1439
1440 void rt_add_uncached_list(struct rtable *rt)
1441 {
1442 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1443
1444 rt->rt_uncached_list = ul;
1445
1446 spin_lock_bh(&ul->lock);
1447 list_add_tail(&rt->rt_uncached, &ul->head);
1448 spin_unlock_bh(&ul->lock);
1449 }
1450
1451 void rt_del_uncached_list(struct rtable *rt)
1452 {
1453 if (!list_empty(&rt->rt_uncached)) {
1454 struct uncached_list *ul = rt->rt_uncached_list;
1455
1456 spin_lock_bh(&ul->lock);
1457 list_del(&rt->rt_uncached);
1458 spin_unlock_bh(&ul->lock);
1459 }
1460 }
1461
1462 static void ipv4_dst_destroy(struct dst_entry *dst)
1463 {
1464 struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1465 struct rtable *rt = (struct rtable *)dst;
1466
1467 if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1468 kfree(p);
1469
1470 rt_del_uncached_list(rt);
1471 }
1472
1473 void rt_flush_dev(struct net_device *dev)
1474 {
1475 struct net *net = dev_net(dev);
1476 struct rtable *rt;
1477 int cpu;
1478
1479 for_each_possible_cpu(cpu) {
1480 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1481
1482 spin_lock_bh(&ul->lock);
1483 list_for_each_entry(rt, &ul->head, rt_uncached) {
1484 if (rt->dst.dev != dev)
1485 continue;
1486 rt->dst.dev = net->loopback_dev;
1487 dev_hold(rt->dst.dev);
1488 dev_put(dev);
1489 }
1490 spin_unlock_bh(&ul->lock);
1491 }
1492 }
1493
1494 static bool rt_cache_valid(const struct rtable *rt)
1495 {
1496 return rt &&
1497 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1498 !rt_is_expired(rt);
1499 }
1500
1501 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1502 const struct fib_result *res,
1503 struct fib_nh_exception *fnhe,
1504 struct fib_info *fi, u16 type, u32 itag,
1505 const bool do_cache)
1506 {
1507 bool cached = false;
1508
1509 if (fi) {
1510 struct fib_nh *nh = &FIB_RES_NH(*res);
1511
1512 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1513 rt->rt_gateway = nh->nh_gw;
1514 rt->rt_uses_gateway = 1;
1515 }
1516 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1517 if (fi->fib_metrics != &dst_default_metrics) {
1518 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1519 refcount_inc(&fi->fib_metrics->refcnt);
1520 }
1521 #ifdef CONFIG_IP_ROUTE_CLASSID
1522 rt->dst.tclassid = nh->nh_tclassid;
1523 #endif
1524 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1525 if (unlikely(fnhe))
1526 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1527 else if (do_cache)
1528 cached = rt_cache_route(nh, rt);
1529 if (unlikely(!cached)) {
1530 /* Routes we intend to cache in nexthop exception or
1531 * FIB nexthop have the DST_NOCACHE bit clear.
1532 * However, if we are unsuccessful at storing this
1533 * route into the cache we really need to set it.
1534 */
1535 if (!rt->rt_gateway)
1536 rt->rt_gateway = daddr;
1537 rt_add_uncached_list(rt);
1538 }
1539 } else
1540 rt_add_uncached_list(rt);
1541
1542 #ifdef CONFIG_IP_ROUTE_CLASSID
1543 #ifdef CONFIG_IP_MULTIPLE_TABLES
1544 set_class_tag(rt, res->tclassid);
1545 #endif
1546 set_class_tag(rt, itag);
1547 #endif
1548 }
1549
1550 struct rtable *rt_dst_alloc(struct net_device *dev,
1551 unsigned int flags, u16 type,
1552 bool nopolicy, bool noxfrm, bool will_cache)
1553 {
1554 struct rtable *rt;
1555
1556 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1557 (will_cache ? 0 : DST_HOST) |
1558 (nopolicy ? DST_NOPOLICY : 0) |
1559 (noxfrm ? DST_NOXFRM : 0));
1560
1561 if (rt) {
1562 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1563 rt->rt_flags = flags;
1564 rt->rt_type = type;
1565 rt->rt_is_input = 0;
1566 rt->rt_iif = 0;
1567 rt->rt_pmtu = 0;
1568 rt->rt_mtu_locked = 0;
1569 rt->rt_gateway = 0;
1570 rt->rt_uses_gateway = 0;
1571 rt->rt_table_id = 0;
1572 INIT_LIST_HEAD(&rt->rt_uncached);
1573
1574 rt->dst.output = ip_output;
1575 if (flags & RTCF_LOCAL)
1576 rt->dst.input = ip_local_deliver;
1577 }
1578
1579 return rt;
1580 }
1581 EXPORT_SYMBOL(rt_dst_alloc);
1582
1583 /* called in rcu_read_lock() section */
1584 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1585 u8 tos, struct net_device *dev,
1586 struct in_device *in_dev, u32 *itag)
1587 {
1588 int err;
1589
1590 /* Primary sanity checks. */
1591 if (!in_dev)
1592 return -EINVAL;
1593
1594 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1595 skb->protocol != htons(ETH_P_IP))
1596 return -EINVAL;
1597
1598 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1599 return -EINVAL;
1600
1601 if (ipv4_is_zeronet(saddr)) {
1602 if (!ipv4_is_local_multicast(daddr))
1603 return -EINVAL;
1604 } else {
1605 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1606 in_dev, itag);
1607 if (err < 0)
1608 return err;
1609 }
1610 return 0;
1611 }
1612
1613 /* called in rcu_read_lock() section */
1614 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1615 u8 tos, struct net_device *dev, int our)
1616 {
1617 struct in_device *in_dev = __in_dev_get_rcu(dev);
1618 unsigned int flags = RTCF_MULTICAST;
1619 struct rtable *rth;
1620 u32 itag = 0;
1621 int err;
1622
1623 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1624 if (err)
1625 return err;
1626
1627 if (our)
1628 flags |= RTCF_LOCAL;
1629
1630 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1631 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1632 if (!rth)
1633 return -ENOBUFS;
1634
1635 #ifdef CONFIG_IP_ROUTE_CLASSID
1636 rth->dst.tclassid = itag;
1637 #endif
1638 rth->dst.output = ip_rt_bug;
1639 rth->rt_is_input= 1;
1640
1641 #ifdef CONFIG_IP_MROUTE
1642 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1643 rth->dst.input = ip_mr_input;
1644 #endif
1645 RT_CACHE_STAT_INC(in_slow_mc);
1646
1647 skb_dst_set(skb, &rth->dst);
1648 return 0;
1649 }
1650
1651
1652 static void ip_handle_martian_source(struct net_device *dev,
1653 struct in_device *in_dev,
1654 struct sk_buff *skb,
1655 __be32 daddr,
1656 __be32 saddr)
1657 {
1658 RT_CACHE_STAT_INC(in_martian_src);
1659 #ifdef CONFIG_IP_ROUTE_VERBOSE
1660 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1661 /*
1662  * RFC1812 recommendation: if the source is martian,
1663  * the only hint is the MAC header.
1664 */
1665 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1666 &daddr, &saddr, dev->name);
1667 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1668 print_hex_dump(KERN_WARNING, "ll header: ",
1669 DUMP_PREFIX_OFFSET, 16, 1,
1670 skb_mac_header(skb),
1671 dev->hard_header_len, true);
1672 }
1673 }
1674 #endif
1675 }
1676
1677 static void set_lwt_redirect(struct rtable *rth)
1678 {
1679 if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1680 rth->dst.lwtstate->orig_output = rth->dst.output;
1681 rth->dst.output = lwtunnel_output;
1682 }
1683
1684 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1685 rth->dst.lwtstate->orig_input = rth->dst.input;
1686 rth->dst.input = lwtunnel_input;
1687 }
1688 }
1689
1690 /* called in rcu_read_lock() section */
1691 static int __mkroute_input(struct sk_buff *skb,
1692 const struct fib_result *res,
1693 struct in_device *in_dev,
1694 __be32 daddr, __be32 saddr, u32 tos)
1695 {
1696 struct fib_nh_exception *fnhe;
1697 struct rtable *rth;
1698 int err;
1699 struct in_device *out_dev;
1700 bool do_cache;
1701 u32 itag = 0;
1702
1703 /* get a working reference to the output device */
1704 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1705 if (!out_dev) {
1706 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1707 return -EINVAL;
1708 }
1709
1710 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1711 in_dev->dev, in_dev, &itag);
1712 if (err < 0) {
1713 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1714 saddr);
1715
1716 goto cleanup;
1717 }
1718
1719 do_cache = res->fi && !itag;
1720 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1721 skb->protocol == htons(ETH_P_IP) &&
1722 (IN_DEV_SHARED_MEDIA(out_dev) ||
1723 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1724 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1725
1726 if (skb->protocol != htons(ETH_P_IP)) {
1727  /* Not IP (i.e. ARP). Do not create a route if it is
1728 * invalid for proxy arp. DNAT routes are always valid.
1729 *
1730  * The proxy arp feature has been extended to allow ARP
1731  * replies back out the same interface, to support
1732 * Private VLAN switch technologies. See arp.c.
1733 */
1734 if (out_dev == in_dev &&
1735 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1736 err = -EINVAL;
1737 goto cleanup;
1738 }
1739 }
1740
1741 fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1742 if (do_cache) {
1743 if (fnhe)
1744 rth = rcu_dereference(fnhe->fnhe_rth_input);
1745 else
1746 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1747 if (rt_cache_valid(rth)) {
1748 skb_dst_set_noref(skb, &rth->dst);
1749 goto out;
1750 }
1751 }
1752
1753 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1754 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1755 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1756 if (!rth) {
1757 err = -ENOBUFS;
1758 goto cleanup;
1759 }
1760
1761 rth->rt_is_input = 1;
1762 if (res->table)
1763 rth->rt_table_id = res->table->tb_id;
1764 RT_CACHE_STAT_INC(in_slow_tot);
1765
1766 rth->dst.input = ip_forward;
1767
1768 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1769 do_cache);
1770 set_lwt_redirect(rth);
1771 skb_dst_set(skb, &rth->dst);
1772 out:
1773 err = 0;
1774 cleanup:
1775 return err;
1776 }
1777
1778 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1779 /* To make ICMP packets follow the right flow, the multipath hash is
1780 * calculated from the inner IP addresses.
1781 */
1782 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1783 struct flow_keys *hash_keys)
1784 {
1785 const struct iphdr *outer_iph = ip_hdr(skb);
1786 const struct iphdr *inner_iph;
1787 const struct icmphdr *icmph;
1788 struct iphdr _inner_iph;
1789 struct icmphdr _icmph;
1790
1791 hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1792 hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1793 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1794 return;
1795
1796 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1797 return;
1798
1799 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1800 &_icmph);
1801 if (!icmph)
1802 return;
1803
1804 if (icmph->type != ICMP_DEST_UNREACH &&
1805 icmph->type != ICMP_REDIRECT &&
1806 icmph->type != ICMP_TIME_EXCEEDED &&
1807 icmph->type != ICMP_PARAMETERPROB)
1808 return;
1809
1810 inner_iph = skb_header_pointer(skb,
1811 outer_iph->ihl * 4 + sizeof(_icmph),
1812 sizeof(_inner_iph), &_inner_iph);
1813 if (!inner_iph)
1814 return;
1815 hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1816 hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1817 }
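
/* Example: an ICMP_FRAG_NEEDED error quotes the header of the offending
 * packet. By hashing on that embedded (inner) source/destination pair,
 * the error is spread over the multipath nexthops exactly like the flow
 * that triggered it, rather than by the error's own outer addresses.
 */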
1818
1819 /* if skb is set it will be used and fl4 can be NULL */
1820 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1821 const struct sk_buff *skb)
1822 {
1823 struct net *net = fi->fib_net;
1824 struct flow_keys hash_keys;
1825 u32 mhash;
1826
1827 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1828 case 0:
1829 memset(&hash_keys, 0, sizeof(hash_keys));
1830 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1831 if (skb) {
1832 ip_multipath_l3_keys(skb, &hash_keys);
1833 } else {
1834 hash_keys.addrs.v4addrs.src = fl4->saddr;
1835 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1836 }
1837 break;
1838 case 1:
1839 /* skb is currently provided only when forwarding */
1840 if (skb) {
1841 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1842 struct flow_keys keys;
1843
1844 /* short-circuit if we already have L4 hash present */
1845 if (skb->l4_hash)
1846 return skb_get_hash_raw(skb) >> 1;
1847 memset(&hash_keys, 0, sizeof(hash_keys));
1848 skb_flow_dissect_flow_keys(skb, &keys, flag);
1849
1850 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1851 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1852 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1853 hash_keys.ports.src = keys.ports.src;
1854 hash_keys.ports.dst = keys.ports.dst;
1855 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1856 } else {
1857 memset(&hash_keys, 0, sizeof(hash_keys));
1858 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1859 hash_keys.addrs.v4addrs.src = fl4->saddr;
1860 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1861 hash_keys.ports.src = fl4->fl4_sport;
1862 hash_keys.ports.dst = fl4->fl4_dport;
1863 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1864 }
1865 break;
1866 }
1867 mhash = flow_hash_from_keys(&hash_keys);
1868
1869 return mhash >> 1;
1870 }
1871 EXPORT_SYMBOL_GPL(fib_multipath_hash);
1872 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1873
1874 static int ip_mkroute_input(struct sk_buff *skb,
1875 struct fib_result *res,
1876 struct in_device *in_dev,
1877 __be32 daddr, __be32 saddr, u32 tos)
1878 {
1879 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1880 if (res->fi && res->fi->fib_nhs > 1) {
1881 int h = fib_multipath_hash(res->fi, NULL, skb);
1882
1883 fib_select_multipath(res, h);
1884 }
1885 #endif
1886
1887 /* create a routing cache entry */
1888 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1889 }
1890
1891 /*
1892  * NOTE. We drop all packets that have a local source
1893  * address, because every properly looped-back packet
1894  * must already have the correct destination attached by the output routine.
1895  *
1896  * This approach solves two big problems:
1897  * 1. Non-simplex devices are handled properly.
1898  * 2. IP spoofing attempts are filtered with a 100% guarantee.
1899 * called with rcu_read_lock()
1900 */
1901
1902 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1903 u8 tos, struct net_device *dev,
1904 struct fib_result *res)
1905 {
1906 struct in_device *in_dev = __in_dev_get_rcu(dev);
1907 struct ip_tunnel_info *tun_info;
1908 struct flowi4 fl4;
1909 unsigned int flags = 0;
1910 u32 itag = 0;
1911 struct rtable *rth;
1912 int err = -EINVAL;
1913 struct net *net = dev_net(dev);
1914 bool do_cache;
1915
1916 /* IP on this device is disabled. */
1917
1918 if (!in_dev)
1919 goto out;
1920
1921 	/* Check for the most weird martians, which cannot be detected
1922 by fib_lookup.
1923 */
1924
1925 tun_info = skb_tunnel_info(skb);
1926 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1927 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1928 else
1929 fl4.flowi4_tun_key.tun_id = 0;
1930 skb_dst_drop(skb);
1931
1932 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1933 goto martian_source;
1934
1935 res->fi = NULL;
1936 res->table = NULL;
1937 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1938 goto brd_input;
1939
1940 /* Accept zero addresses only to limited broadcast;
1941 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
1942 */
1943 if (ipv4_is_zeronet(saddr))
1944 goto martian_source;
1945
1946 if (ipv4_is_zeronet(daddr))
1947 goto martian_destination;
1948
1949 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1950 	 * more than once, and calls it only if daddr and/or saddr is a loopback address
1951 */
1952 if (ipv4_is_loopback(daddr)) {
1953 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1954 goto martian_destination;
1955 } else if (ipv4_is_loopback(saddr)) {
1956 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1957 goto martian_source;
1958 }
1959
1960 /*
1961 * Now we are ready to route packet.
1962 */
1963 fl4.flowi4_oif = 0;
1964 fl4.flowi4_iif = dev->ifindex;
1965 fl4.flowi4_mark = skb->mark;
1966 fl4.flowi4_tos = tos;
1967 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1968 fl4.flowi4_flags = 0;
1969 fl4.daddr = daddr;
1970 fl4.saddr = saddr;
1971 fl4.flowi4_uid = sock_net_uid(net, NULL);
1972 err = fib_lookup(net, &fl4, res, 0);
1973 if (err != 0) {
1974 if (!IN_DEV_FORWARD(in_dev))
1975 err = -EHOSTUNREACH;
1976 goto no_route;
1977 }
1978
1979 if (res->type == RTN_BROADCAST)
1980 goto brd_input;
1981
1982 if (res->type == RTN_LOCAL) {
1983 err = fib_validate_source(skb, saddr, daddr, tos,
1984 0, dev, in_dev, &itag);
1985 if (err < 0)
1986 goto martian_source;
1987 goto local_input;
1988 }
1989
1990 if (!IN_DEV_FORWARD(in_dev)) {
1991 err = -EHOSTUNREACH;
1992 goto no_route;
1993 }
1994 if (res->type != RTN_UNICAST)
1995 goto martian_destination;
1996
1997 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1998 out: return err;
1999
2000 brd_input:
2001 if (skb->protocol != htons(ETH_P_IP))
2002 goto e_inval;
2003
2004 if (!ipv4_is_zeronet(saddr)) {
2005 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2006 in_dev, &itag);
2007 if (err < 0)
2008 goto martian_source;
2009 }
2010 flags |= RTCF_BROADCAST;
2011 res->type = RTN_BROADCAST;
2012 RT_CACHE_STAT_INC(in_brd);
2013
2014 local_input:
2015 do_cache = false;
2016 if (res->fi) {
2017 if (!itag) {
2018 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2019 if (rt_cache_valid(rth)) {
2020 skb_dst_set_noref(skb, &rth->dst);
2021 err = 0;
2022 goto out;
2023 }
2024 do_cache = true;
2025 }
2026 }
2027
2028 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2029 flags | RTCF_LOCAL, res->type,
2030 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2031 if (!rth)
2032 goto e_nobufs;
2033
2034 rth->dst.output = ip_rt_bug;
2035 #ifdef CONFIG_IP_ROUTE_CLASSID
2036 rth->dst.tclassid = itag;
2037 #endif
2038 rth->rt_is_input = 1;
2039 if (res->table)
2040 rth->rt_table_id = res->table->tb_id;
2041
2042 RT_CACHE_STAT_INC(in_slow_tot);
2043 if (res->type == RTN_UNREACHABLE) {
2044 rth->dst.input = ip_error;
2045 rth->dst.error = -err;
2046 rth->rt_flags &= ~RTCF_LOCAL;
2047 }
2048
2049 if (do_cache) {
2050 struct fib_nh *nh = &FIB_RES_NH(*res);
2051
2052 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2053 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2054 WARN_ON(rth->dst.input == lwtunnel_input);
2055 rth->dst.lwtstate->orig_input = rth->dst.input;
2056 rth->dst.input = lwtunnel_input;
2057 }
2058
2059 if (unlikely(!rt_cache_route(nh, rth)))
2060 rt_add_uncached_list(rth);
2061 }
2062 skb_dst_set(skb, &rth->dst);
2063 err = 0;
2064 goto out;
2065
2066 no_route:
2067 RT_CACHE_STAT_INC(in_no_route);
2068 res->type = RTN_UNREACHABLE;
2069 res->fi = NULL;
2070 res->table = NULL;
2071 goto local_input;
2072
2073 /*
2074 * Do not cache martian addresses: they should be logged (RFC1812)
2075 */
2076 martian_destination:
2077 RT_CACHE_STAT_INC(in_martian_dst);
2078 #ifdef CONFIG_IP_ROUTE_VERBOSE
2079 if (IN_DEV_LOG_MARTIANS(in_dev))
2080 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2081 &daddr, &saddr, dev->name);
2082 #endif
2083
2084 e_inval:
2085 err = -EINVAL;
2086 goto out;
2087
2088 e_nobufs:
2089 err = -ENOBUFS;
2090 goto out;
2091
2092 martian_source:
2093 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2094 goto out;
2095 }
2096
2097 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2098 u8 tos, struct net_device *dev)
2099 {
2100 struct fib_result res;
2101 int err;
2102
2103 tos &= IPTOS_RT_MASK;
2104 rcu_read_lock();
2105 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2106 rcu_read_unlock();
2107
2108 return err;
2109 }
2110 EXPORT_SYMBOL(ip_route_input_noref);
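
/* Illustrative sketch (not part of the original file): a typical receive-path
 * caller resolves the route for an skb roughly as below and then dispatches
 * via the attached dst.  The helper name example_rcv_route() is hypothetical.
 */
#if 0
static int example_rcv_route(struct sk_buff *skb, struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err;

	/* On success a dst is attached to the skb without taking a ref. */
	err = ip_route_input_noref(skb, iph->daddr, iph->saddr, iph->tos, dev);
	if (err)
		return err;

	return dst_input(skb);	/* invoke rth->dst.input, e.g. ip_local_deliver */
}
#endif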
2111
2112 /* called with rcu_read_lock held */
2113 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2114 u8 tos, struct net_device *dev, struct fib_result *res)
2115 {
2116 /* Multicast recognition logic was moved from the route cache to here.
2117 The problem was that too many Ethernet cards have broken/missing
2118 hardware multicast filters :-( As a result, a host on a multicast
2119 network acquires a lot of useless route cache entries, e.g. from
2120 SDR messages from all over the world. Now we try to get rid of them.
2121 Really, provided the software IP multicast filter is organized
2122 reasonably (at least, hashed), this does not result in a slowdown
2123 compared with route cache reject entries.
2124 Note that multicast routers are not affected, because a
2125 route cache entry is created eventually.
2126 */
2127 if (ipv4_is_multicast(daddr)) {
2128 struct in_device *in_dev = __in_dev_get_rcu(dev);
2129 int our = 0;
2130 int err = -EINVAL;
2131
2132 if (in_dev)
2133 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2134 ip_hdr(skb)->protocol);
2135
2136 /* check l3 master if no match yet */
2137 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2138 struct in_device *l3_in_dev;
2139
2140 l3_in_dev = __in_dev_get_rcu(skb->dev);
2141 if (l3_in_dev)
2142 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2143 ip_hdr(skb)->protocol);
2144 }
2145
2146 if (our
2147 #ifdef CONFIG_IP_MROUTE
2148 ||
2149 (!ipv4_is_local_multicast(daddr) &&
2150 IN_DEV_MFORWARD(in_dev))
2151 #endif
2152 ) {
2153 err = ip_route_input_mc(skb, daddr, saddr,
2154 tos, dev, our);
2155 }
2156 return err;
2157 }
2158
2159 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2160 }
2161
2162 /* called with rcu_read_lock() */
2163 static struct rtable *__mkroute_output(const struct fib_result *res,
2164 const struct flowi4 *fl4, int orig_oif,
2165 struct net_device *dev_out,
2166 unsigned int flags)
2167 {
2168 struct fib_info *fi = res->fi;
2169 struct fib_nh_exception *fnhe;
2170 struct in_device *in_dev;
2171 u16 type = res->type;
2172 struct rtable *rth;
2173 bool do_cache;
2174
2175 in_dev = __in_dev_get_rcu(dev_out);
2176 if (!in_dev)
2177 return ERR_PTR(-EINVAL);
2178
2179 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2180 if (ipv4_is_loopback(fl4->saddr) &&
2181 !(dev_out->flags & IFF_LOOPBACK) &&
2182 !netif_is_l3_master(dev_out))
2183 return ERR_PTR(-EINVAL);
2184
2185 if (ipv4_is_lbcast(fl4->daddr))
2186 type = RTN_BROADCAST;
2187 else if (ipv4_is_multicast(fl4->daddr))
2188 type = RTN_MULTICAST;
2189 else if (ipv4_is_zeronet(fl4->daddr))
2190 return ERR_PTR(-EINVAL);
2191
2192 if (dev_out->flags & IFF_LOOPBACK)
2193 flags |= RTCF_LOCAL;
2194
2195 do_cache = true;
2196 if (type == RTN_BROADCAST) {
2197 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2198 fi = NULL;
2199 } else if (type == RTN_MULTICAST) {
2200 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2201 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2202 fl4->flowi4_proto))
2203 flags &= ~RTCF_LOCAL;
2204 else
2205 do_cache = false;
2206 /* If a multicast route does not exist, use
2207 * the default one, but do not use a gateway in this case.
2208 * Yes, it is a hack.
2209 */
2210 if (fi && res->prefixlen < 4)
2211 fi = NULL;
2212 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2213 (orig_oif != dev_out->ifindex)) {
2214 /* For local routes that require a particular output interface
2215 * we do not want to cache the result. Caching the result
2216 * causes incorrect behaviour when there are multiple source
2217 * addresses on the interface, the end result being that if the
2218 * intended recipient is waiting on that interface for the
2219 * packet, it won't receive it because it will be delivered on
2220 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2221 * be set to the loopback interface as well.
2222 */
2223 do_cache = false;
2224 }
2225
2226 fnhe = NULL;
2227 do_cache &= fi != NULL;
2228 if (fi) {
2229 struct rtable __rcu **prth;
2230 struct fib_nh *nh = &FIB_RES_NH(*res);
2231
2232 fnhe = find_exception(nh, fl4->daddr);
2233 if (!do_cache)
2234 goto add;
2235 if (fnhe) {
2236 prth = &fnhe->fnhe_rth_output;
2237 } else {
2238 if (unlikely(fl4->flowi4_flags &
2239 FLOWI_FLAG_KNOWN_NH &&
2240 !(nh->nh_gw &&
2241 nh->nh_scope == RT_SCOPE_LINK))) {
2242 do_cache = false;
2243 goto add;
2244 }
2245 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2246 }
2247 rth = rcu_dereference(*prth);
2248 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2249 return rth;
2250 }
2251
2252 add:
2253 rth = rt_dst_alloc(dev_out, flags, type,
2254 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2255 IN_DEV_CONF_GET(in_dev, NOXFRM),
2256 do_cache);
2257 if (!rth)
2258 return ERR_PTR(-ENOBUFS);
2259
2260 rth->rt_iif = orig_oif;
2261 if (res->table)
2262 rth->rt_table_id = res->table->tb_id;
2263
2264 RT_CACHE_STAT_INC(out_slow_tot);
2265
2266 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2267 if (flags & RTCF_LOCAL &&
2268 !(dev_out->flags & IFF_LOOPBACK)) {
2269 rth->dst.output = ip_mc_output;
2270 RT_CACHE_STAT_INC(out_slow_mc);
2271 }
2272 #ifdef CONFIG_IP_MROUTE
2273 if (type == RTN_MULTICAST) {
2274 if (IN_DEV_MFORWARD(in_dev) &&
2275 !ipv4_is_local_multicast(fl4->daddr)) {
2276 rth->dst.input = ip_mr_input;
2277 rth->dst.output = ip_mc_output;
2278 }
2279 }
2280 #endif
2281 }
2282
2283 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2284 set_lwt_redirect(rth);
2285
2286 return rth;
2287 }
2288
2289 /*
2290 * Major route resolver routine.
2291 */
2292
2293 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2294 const struct sk_buff *skb)
2295 {
2296 __u8 tos = RT_FL_TOS(fl4);
2297 struct fib_result res = {
2298 .type = RTN_UNSPEC,
2299 .fi = NULL,
2300 .table = NULL,
2301 .tclassid = 0,
2302 };
2303 struct rtable *rth;
2304
2305 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2306 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2307 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2308 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2309
2310 rcu_read_lock();
2311 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2312 rcu_read_unlock();
2313
2314 return rth;
2315 }
2316 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2317
2318 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2319 struct fib_result *res,
2320 const struct sk_buff *skb)
2321 {
2322 struct net_device *dev_out = NULL;
2323 int orig_oif = fl4->flowi4_oif;
2324 unsigned int flags = 0;
2325 struct rtable *rth;
2326 int err = -ENETUNREACH;
2327
2328 if (fl4->saddr) {
2329 rth = ERR_PTR(-EINVAL);
2330 if (ipv4_is_multicast(fl4->saddr) ||
2331 ipv4_is_lbcast(fl4->saddr) ||
2332 ipv4_is_zeronet(fl4->saddr))
2333 goto out;
2334
2335 /* I removed the check for oif == dev_out->oif here.
2336 It was wrong for two reasons:
2337 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2338 is assigned to multiple interfaces.
2339 2. Moreover, we are allowed to send packets with the saddr
2340 of another iface. --ANK
2341 */
2342
2343 if (fl4->flowi4_oif == 0 &&
2344 (ipv4_is_multicast(fl4->daddr) ||
2345 ipv4_is_lbcast(fl4->daddr))) {
2346 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2347 dev_out = __ip_dev_find(net, fl4->saddr, false);
2348 if (!dev_out)
2349 goto out;
2350
2351 /* Special hack: the user can direct multicasts
2352 and limited broadcasts via the necessary interface
2353 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2354 This hack is not just for fun; it allows
2355 vic, vat and friends to work.
2356 They bind the socket to loopback, set the ttl to zero
2357 and expect that it will work.
2358 From the viewpoint of the routing cache they are broken,
2359 because we are not allowed to build a multicast path
2360 with a loopback source address (the routing cache
2361 cannot know that the ttl is zero, so that the packet
2362 will not leave this host and the route is valid).
2363 Luckily, this hack is a good workaround.
2364 */
2365
2366 fl4->flowi4_oif = dev_out->ifindex;
2367 goto make_route;
2368 }
2369
2370 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2371 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2372 if (!__ip_dev_find(net, fl4->saddr, false))
2373 goto out;
2374 }
2375 }
2376
2377
2378 if (fl4->flowi4_oif) {
2379 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2380 rth = ERR_PTR(-ENODEV);
2381 if (!dev_out)
2382 goto out;
2383
2384 /* RACE: Check return value of inet_select_addr instead. */
2385 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2386 rth = ERR_PTR(-ENETUNREACH);
2387 goto out;
2388 }
2389 if (ipv4_is_local_multicast(fl4->daddr) ||
2390 ipv4_is_lbcast(fl4->daddr) ||
2391 fl4->flowi4_proto == IPPROTO_IGMP) {
2392 if (!fl4->saddr)
2393 fl4->saddr = inet_select_addr(dev_out, 0,
2394 RT_SCOPE_LINK);
2395 goto make_route;
2396 }
2397 if (!fl4->saddr) {
2398 if (ipv4_is_multicast(fl4->daddr))
2399 fl4->saddr = inet_select_addr(dev_out, 0,
2400 fl4->flowi4_scope);
2401 else if (!fl4->daddr)
2402 fl4->saddr = inet_select_addr(dev_out, 0,
2403 RT_SCOPE_HOST);
2404 }
2405 }
2406
2407 if (!fl4->daddr) {
2408 fl4->daddr = fl4->saddr;
2409 if (!fl4->daddr)
2410 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2411 dev_out = net->loopback_dev;
2412 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2413 res->type = RTN_LOCAL;
2414 flags |= RTCF_LOCAL;
2415 goto make_route;
2416 }
2417
2418 err = fib_lookup(net, fl4, res, 0);
2419 if (err) {
2420 res->fi = NULL;
2421 res->table = NULL;
2422 if (fl4->flowi4_oif &&
2423 (ipv4_is_multicast(fl4->daddr) ||
2424 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2425 /* Apparently, the routing tables are wrong. Assume
2426 that the destination is on-link.
2427
2428 WHY? DW.
2429 Because we are allowed to send to an iface
2430 even if it has NO routes and NO assigned
2431 addresses. When oif is specified, the routing
2432 tables are looked up with only one purpose:
2433 to catch whether the destination is gatewayed, rather
2434 than directly reachable. Moreover, if MSG_DONTROUTE is set,
2435 we send the packet, ignoring both the routing tables
2436 and the ifaddr state. --ANK
2437
2438
2439 We could do this even when oif is unknown,
2440 likely as IPv6 does, but we do not.
2441 */
2442
2443 if (fl4->saddr == 0)
2444 fl4->saddr = inet_select_addr(dev_out, 0,
2445 RT_SCOPE_LINK);
2446 res->type = RTN_UNICAST;
2447 goto make_route;
2448 }
2449 rth = ERR_PTR(err);
2450 goto out;
2451 }
2452
2453 if (res->type == RTN_LOCAL) {
2454 if (!fl4->saddr) {
2455 if (res->fi->fib_prefsrc)
2456 fl4->saddr = res->fi->fib_prefsrc;
2457 else
2458 fl4->saddr = fl4->daddr;
2459 }
2460
2461 /* L3 master device is the loopback for that domain */
2462 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2463 net->loopback_dev;
2464
2465 /* make sure orig_oif points to fib result device even
2466 * though packet rx/tx happens over loopback or l3mdev
2467 */
2468 orig_oif = FIB_RES_OIF(*res);
2469
2470 fl4->flowi4_oif = dev_out->ifindex;
2471 flags |= RTCF_LOCAL;
2472 goto make_route;
2473 }
2474
2475 fib_select_path(net, res, fl4, skb);
2476
2477 dev_out = FIB_RES_DEV(*res);
2478 fl4->flowi4_oif = dev_out->ifindex;
2479
2480
2481 make_route:
2482 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2483
2484 out:
2485 return rth;
2486 }
2487
2488 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2489 {
2490 return NULL;
2491 }
2492
2493 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2494 {
2495 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2496
2497 return mtu ? : dst->dev->mtu;
2498 }
2499
2500 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2501 struct sk_buff *skb, u32 mtu)
2502 {
2503 }
2504
2505 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2506 struct sk_buff *skb)
2507 {
2508 }
2509
2510 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2511 unsigned long old)
2512 {
2513 return NULL;
2514 }
2515
2516 static struct dst_ops ipv4_dst_blackhole_ops = {
2517 .family = AF_INET,
2518 .check = ipv4_blackhole_dst_check,
2519 .mtu = ipv4_blackhole_mtu,
2520 .default_advmss = ipv4_default_advmss,
2521 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2522 .redirect = ipv4_rt_blackhole_redirect,
2523 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2524 .neigh_lookup = ipv4_neigh_lookup,
2525 };
2526
2527 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2528 {
2529 struct rtable *ort = (struct rtable *) dst_orig;
2530 struct rtable *rt;
2531
2532 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2533 if (rt) {
2534 struct dst_entry *new = &rt->dst;
2535
2536 new->__use = 1;
2537 new->input = dst_discard;
2538 new->output = dst_discard_out;
2539
2540 new->dev = net->loopback_dev;
2541 if (new->dev)
2542 dev_hold(new->dev);
2543
2544 rt->rt_is_input = ort->rt_is_input;
2545 rt->rt_iif = ort->rt_iif;
2546 rt->rt_pmtu = ort->rt_pmtu;
2547 rt->rt_mtu_locked = ort->rt_mtu_locked;
2548
2549 rt->rt_genid = rt_genid_ipv4(net);
2550 rt->rt_flags = ort->rt_flags;
2551 rt->rt_type = ort->rt_type;
2552 rt->rt_gateway = ort->rt_gateway;
2553 rt->rt_uses_gateway = ort->rt_uses_gateway;
2554
2555 INIT_LIST_HEAD(&rt->rt_uncached);
2556 }
2557
2558 dst_release(dst_orig);
2559
2560 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2561 }
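
/* Illustrative sketch (an assumption, not from this file): a caller that must
 * stop using a fully resolved route can swap it for a blackhole copy; the
 * reference on dst_orig is consumed by ipv4_blackhole_route() itself.
 * The helper name example_make_blackhole() is hypothetical.
 */
#if 0
static int example_make_blackhole(struct net *net, struct rtable *rt,
				  struct dst_entry **pdst)
{
	struct dst_entry *dst = ipv4_blackhole_route(net, &rt->dst);

	if (IS_ERR(dst))
		return PTR_ERR(dst);

	/* Anything sent via *pdst is now silently dropped (dst_discard),
	 * while rt_flags, gateway and PMTU data from the original survive.
	 */
	*pdst = dst;
	return 0;
}
#endif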
2562
2563 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2564 const struct sock *sk)
2565 {
2566 struct rtable *rt = __ip_route_output_key(net, flp4);
2567
2568 if (IS_ERR(rt))
2569 return rt;
2570
2571 if (flp4->flowi4_proto)
2572 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2573 flowi4_to_flowi(flp4),
2574 sk, 0);
2575
2576 return rt;
2577 }
2578 EXPORT_SYMBOL_GPL(ip_route_output_flow);
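
/* Illustrative sketch (not part of the original file): a typical output-path
 * lookup fills a flowi4 key and calls ip_route_output_flow(); daddr, saddr
 * and oif stand in for whatever the hypothetical caller already knows.
 */
#if 0
static struct rtable *example_output_route(struct net *net, struct sock *sk,
					   __be32 daddr, __be32 saddr, int oif)
{
	struct flowi4 fl4;
	struct rtable *rt;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	fl4.saddr = saddr;		/* may be 0; a source is then selected */
	fl4.flowi4_oif = oif;
	fl4.flowi4_proto = IPPROTO_UDP;

	rt = ip_route_output_flow(net, &fl4, sk);
	if (IS_ERR(rt))
		return rt;

	/* fl4.saddr now holds the selected source address; release the
	 * route with ip_rt_put() once it is no longer needed.
	 */
	return rt;
}
#endif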
2579
2580 /* called with rcu_read_lock held */
2581 static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
2582 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2583 u32 seq)
2584 {
2585 struct rtable *rt = skb_rtable(skb);
2586 struct rtmsg *r;
2587 struct nlmsghdr *nlh;
2588 unsigned long expires = 0;
2589 u32 error;
2590 u32 metrics[RTAX_MAX];
2591
2592 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2593 if (!nlh)
2594 return -EMSGSIZE;
2595
2596 r = nlmsg_data(nlh);
2597 r->rtm_family = AF_INET;
2598 r->rtm_dst_len = 32;
2599 r->rtm_src_len = 0;
2600 r->rtm_tos = fl4->flowi4_tos;
2601 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2602 if (nla_put_u32(skb, RTA_TABLE, table_id))
2603 goto nla_put_failure;
2604 r->rtm_type = rt->rt_type;
2605 r->rtm_scope = RT_SCOPE_UNIVERSE;
2606 r->rtm_protocol = RTPROT_UNSPEC;
2607 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2608 if (rt->rt_flags & RTCF_NOTIFY)
2609 r->rtm_flags |= RTM_F_NOTIFY;
2610 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2611 r->rtm_flags |= RTCF_DOREDIRECT;
2612
2613 if (nla_put_in_addr(skb, RTA_DST, dst))
2614 goto nla_put_failure;
2615 if (src) {
2616 r->rtm_src_len = 32;
2617 if (nla_put_in_addr(skb, RTA_SRC, src))
2618 goto nla_put_failure;
2619 }
2620 if (rt->dst.dev &&
2621 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2622 goto nla_put_failure;
2623 #ifdef CONFIG_IP_ROUTE_CLASSID
2624 if (rt->dst.tclassid &&
2625 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2626 goto nla_put_failure;
2627 #endif
2628 if (!rt_is_input_route(rt) &&
2629 fl4->saddr != src) {
2630 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2631 goto nla_put_failure;
2632 }
2633 if (rt->rt_uses_gateway &&
2634 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2635 goto nla_put_failure;
2636
2637 expires = rt->dst.expires;
2638 if (expires) {
2639 unsigned long now = jiffies;
2640
2641 if (time_before(now, expires))
2642 expires -= now;
2643 else
2644 expires = 0;
2645 }
2646
2647 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2648 if (rt->rt_pmtu && expires)
2649 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2650 if (rt->rt_mtu_locked && expires)
2651 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2652 if (rtnetlink_put_metrics(skb, metrics) < 0)
2653 goto nla_put_failure;
2654
2655 if (fl4->flowi4_mark &&
2656 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2657 goto nla_put_failure;
2658
2659 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2660 nla_put_u32(skb, RTA_UID,
2661 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2662 goto nla_put_failure;
2663
2664 error = rt->dst.error;
2665
2666 if (rt_is_input_route(rt)) {
2667 #ifdef CONFIG_IP_MROUTE
2668 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2669 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2670 int err = ipmr_get_route(net, skb,
2671 fl4->saddr, fl4->daddr,
2672 r, portid);
2673
2674 if (err <= 0) {
2675 if (err == 0)
2676 return 0;
2677 goto nla_put_failure;
2678 }
2679 } else
2680 #endif
2681 if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2682 goto nla_put_failure;
2683 }
2684
2685 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2686 goto nla_put_failure;
2687
2688 nlmsg_end(skb, nlh);
2689 return 0;
2690
2691 nla_put_failure:
2692 nlmsg_cancel(skb, nlh);
2693 return -EMSGSIZE;
2694 }
2695
2696 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2697 struct netlink_ext_ack *extack)
2698 {
2699 struct net *net = sock_net(in_skb->sk);
2700 struct rtmsg *rtm;
2701 struct nlattr *tb[RTA_MAX+1];
2702 struct fib_result res = {};
2703 struct rtable *rt = NULL;
2704 struct flowi4 fl4;
2705 __be32 dst = 0;
2706 __be32 src = 0;
2707 u32 iif;
2708 int err;
2709 int mark;
2710 struct sk_buff *skb;
2711 u32 table_id = RT_TABLE_MAIN;
2712 kuid_t uid;
2713
2714 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2715 extack);
2716 if (err < 0)
2717 goto errout;
2718
2719 rtm = nlmsg_data(nlh);
2720
2721 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2722 if (!skb) {
2723 err = -ENOBUFS;
2724 goto errout;
2725 }
2726
2727 /* Reserve room for dummy headers; this skb can pass
2728 through a good chunk of the routing engine.
2729 */
2730 skb_reset_mac_header(skb);
2731 skb_reset_network_header(skb);
2732
2733 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2734 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2735 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2736 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2737 if (tb[RTA_UID])
2738 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2739 else
2740 uid = (iif ? INVALID_UID : current_uid());
2741
2742 /* Bugfix: need to give ip_route_input() enough of an IP header
2743 * that it does not choke on it.
2744 */
2745 ip_hdr(skb)->protocol = IPPROTO_UDP;
2746 ip_hdr(skb)->saddr = src;
2747 ip_hdr(skb)->daddr = dst;
2748
2749 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2750
2751 memset(&fl4, 0, sizeof(fl4));
2752 fl4.daddr = dst;
2753 fl4.saddr = src;
2754 fl4.flowi4_tos = rtm->rtm_tos;
2755 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2756 fl4.flowi4_mark = mark;
2757 fl4.flowi4_uid = uid;
2758
2759 rcu_read_lock();
2760
2761 if (iif) {
2762 struct net_device *dev;
2763
2764 dev = dev_get_by_index_rcu(net, iif);
2765 if (!dev) {
2766 err = -ENODEV;
2767 goto errout_free;
2768 }
2769
2770 skb->protocol = htons(ETH_P_IP);
2771 skb->dev = dev;
2772 skb->mark = mark;
2773 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2774 dev, &res);
2775
2776 rt = skb_rtable(skb);
2777 if (err == 0 && rt->dst.error)
2778 err = -rt->dst.error;
2779 } else {
2780 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2781 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2782 err = 0;
2783 if (IS_ERR(rt))
2784 err = PTR_ERR(rt);
2785 else
2786 skb_dst_set(skb, &rt->dst);
2787 }
2788
2789 if (err)
2790 goto errout_free;
2791
2792 if (rtm->rtm_flags & RTM_F_NOTIFY)
2793 rt->rt_flags |= RTCF_NOTIFY;
2794
2795 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2796 table_id = rt->rt_table_id;
2797
2798 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2799 if (!res.fi) {
2800 err = fib_props[res.type].error;
2801 if (!err)
2802 err = -EHOSTUNREACH;
2803 goto errout_free;
2804 }
2805 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2806 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2807 rt->rt_type, res.prefix, res.prefixlen,
2808 fl4.flowi4_tos, res.fi, 0);
2809 } else {
2810 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2811 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2812 }
2813 if (err < 0)
2814 goto errout_free;
2815
2816 rcu_read_unlock();
2817
2818 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2819 errout:
2820 return err;
2821
2822 errout_free:
2823 rcu_read_unlock();
2824 kfree_skb(skb);
2825 goto errout;
2826 }
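
/* Usage note (illustrative): inet_rtm_getroute() is the RTM_GETROUTE doit
 * handler registered in ip_rt_init(), i.e. it answers queries such as
 *
 *	$ ip route get 8.8.8.8 from 10.0.0.1 iif eth0
 *
 * (addresses and device here are example values). The reply is built by
 * rt_fill_info(), or by fib_dump_info() when the request carries
 * RTM_F_FIB_MATCH.
 */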
2827
2828 void ip_rt_multicast_event(struct in_device *in_dev)
2829 {
2830 rt_cache_flush(dev_net(in_dev->dev));
2831 }
2832
2833 #ifdef CONFIG_SYSCTL
2834 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
2835 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
2836 static int ip_rt_gc_elasticity __read_mostly = 8;
2837
2838 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2839 void __user *buffer,
2840 size_t *lenp, loff_t *ppos)
2841 {
2842 struct net *net = (struct net *)__ctl->extra1;
2843
2844 if (write) {
2845 rt_cache_flush(net);
2846 fnhe_genid_bump(net);
2847 return 0;
2848 }
2849
2850 return -EINVAL;
2851 }
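
/* Usage note (illustrative): writing any value to this sysctl flushes the
 * routing cache and bumps the fnhe generation id, e.g.
 *
 *	# echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * Reads are rejected with -EINVAL, since the handler only acts on writes.
 */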
2852
2853 static struct ctl_table ipv4_route_table[] = {
2854 {
2855 .procname = "gc_thresh",
2856 .data = &ipv4_dst_ops.gc_thresh,
2857 .maxlen = sizeof(int),
2858 .mode = 0644,
2859 .proc_handler = proc_dointvec,
2860 },
2861 {
2862 .procname = "max_size",
2863 .data = &ip_rt_max_size,
2864 .maxlen = sizeof(int),
2865 .mode = 0644,
2866 .proc_handler = proc_dointvec,
2867 },
2868 {
2869 /* Deprecated. Use gc_min_interval_ms */
2870
2871 .procname = "gc_min_interval",
2872 .data = &ip_rt_gc_min_interval,
2873 .maxlen = sizeof(int),
2874 .mode = 0644,
2875 .proc_handler = proc_dointvec_jiffies,
2876 },
2877 {
2878 .procname = "gc_min_interval_ms",
2879 .data = &ip_rt_gc_min_interval,
2880 .maxlen = sizeof(int),
2881 .mode = 0644,
2882 .proc_handler = proc_dointvec_ms_jiffies,
2883 },
2884 {
2885 .procname = "gc_timeout",
2886 .data = &ip_rt_gc_timeout,
2887 .maxlen = sizeof(int),
2888 .mode = 0644,
2889 .proc_handler = proc_dointvec_jiffies,
2890 },
2891 {
2892 .procname = "gc_interval",
2893 .data = &ip_rt_gc_interval,
2894 .maxlen = sizeof(int),
2895 .mode = 0644,
2896 .proc_handler = proc_dointvec_jiffies,
2897 },
2898 {
2899 .procname = "redirect_load",
2900 .data = &ip_rt_redirect_load,
2901 .maxlen = sizeof(int),
2902 .mode = 0644,
2903 .proc_handler = proc_dointvec,
2904 },
2905 {
2906 .procname = "redirect_number",
2907 .data = &ip_rt_redirect_number,
2908 .maxlen = sizeof(int),
2909 .mode = 0644,
2910 .proc_handler = proc_dointvec,
2911 },
2912 {
2913 .procname = "redirect_silence",
2914 .data = &ip_rt_redirect_silence,
2915 .maxlen = sizeof(int),
2916 .mode = 0644,
2917 .proc_handler = proc_dointvec,
2918 },
2919 {
2920 .procname = "error_cost",
2921 .data = &ip_rt_error_cost,
2922 .maxlen = sizeof(int),
2923 .mode = 0644,
2924 .proc_handler = proc_dointvec,
2925 },
2926 {
2927 .procname = "error_burst",
2928 .data = &ip_rt_error_burst,
2929 .maxlen = sizeof(int),
2930 .mode = 0644,
2931 .proc_handler = proc_dointvec,
2932 },
2933 {
2934 .procname = "gc_elasticity",
2935 .data = &ip_rt_gc_elasticity,
2936 .maxlen = sizeof(int),
2937 .mode = 0644,
2938 .proc_handler = proc_dointvec,
2939 },
2940 {
2941 .procname = "mtu_expires",
2942 .data = &ip_rt_mtu_expires,
2943 .maxlen = sizeof(int),
2944 .mode = 0644,
2945 .proc_handler = proc_dointvec_jiffies,
2946 },
2947 {
2948 .procname = "min_pmtu",
2949 .data = &ip_rt_min_pmtu,
2950 .maxlen = sizeof(int),
2951 .mode = 0644,
2952 .proc_handler = proc_dointvec_minmax,
2953 .extra1 = &ip_min_valid_pmtu,
2954 },
2955 {
2956 .procname = "min_adv_mss",
2957 .data = &ip_rt_min_advmss,
2958 .maxlen = sizeof(int),
2959 .mode = 0644,
2960 .proc_handler = proc_dointvec,
2961 },
2962 { }
2963 };
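
/* Usage note (illustrative): the table above appears under
 * /proc/sys/net/ipv4/route (registered via ip_static_sysctl_init()), so a
 * knob such as the learned-PMTU lifetime can be tuned with e.g.
 *
 *	# sysctl -w net.ipv4.route.mtu_expires=600
 *
 * Entries handled by proc_dointvec_jiffies take their values in seconds.
 */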
2964
2965 static struct ctl_table ipv4_route_flush_table[] = {
2966 {
2967 .procname = "flush",
2968 .maxlen = sizeof(int),
2969 .mode = 0200,
2970 .proc_handler = ipv4_sysctl_rtcache_flush,
2971 },
2972 { },
2973 };
2974
2975 static __net_init int sysctl_route_net_init(struct net *net)
2976 {
2977 struct ctl_table *tbl;
2978
2979 tbl = ipv4_route_flush_table;
2980 if (!net_eq(net, &init_net)) {
2981 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2982 if (!tbl)
2983 goto err_dup;
2984
2985 /* Don't export sysctls to unprivileged users */
2986 if (net->user_ns != &init_user_ns)
2987 tbl[0].procname = NULL;
2988 }
2989 tbl[0].extra1 = net;
2990
2991 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2992 if (!net->ipv4.route_hdr)
2993 goto err_reg;
2994 return 0;
2995
2996 err_reg:
2997 if (tbl != ipv4_route_flush_table)
2998 kfree(tbl);
2999 err_dup:
3000 return -ENOMEM;
3001 }
3002
3003 static __net_exit void sysctl_route_net_exit(struct net *net)
3004 {
3005 struct ctl_table *tbl;
3006
3007 tbl = net->ipv4.route_hdr->ctl_table_arg;
3008 unregister_net_sysctl_table(net->ipv4.route_hdr);
3009 BUG_ON(tbl == ipv4_route_flush_table);
3010 kfree(tbl);
3011 }
3012
3013 static __net_initdata struct pernet_operations sysctl_route_ops = {
3014 .init = sysctl_route_net_init,
3015 .exit = sysctl_route_net_exit,
3016 };
3017 #endif
3018
3019 static __net_init int rt_genid_init(struct net *net)
3020 {
3021 atomic_set(&net->ipv4.rt_genid, 0);
3022 atomic_set(&net->fnhe_genid, 0);
3023 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3024 return 0;
3025 }
3026
3027 static __net_initdata struct pernet_operations rt_genid_ops = {
3028 .init = rt_genid_init,
3029 };
3030
3031 static int __net_init ipv4_inetpeer_init(struct net *net)
3032 {
3033 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3034
3035 if (!bp)
3036 return -ENOMEM;
3037 inet_peer_base_init(bp);
3038 net->ipv4.peers = bp;
3039 return 0;
3040 }
3041
3042 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3043 {
3044 struct inet_peer_base *bp = net->ipv4.peers;
3045
3046 net->ipv4.peers = NULL;
3047 inetpeer_invalidate_tree(bp);
3048 kfree(bp);
3049 }
3050
3051 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3052 .init = ipv4_inetpeer_init,
3053 .exit = ipv4_inetpeer_exit,
3054 };
3055
3056 #ifdef CONFIG_IP_ROUTE_CLASSID
3057 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3058 #endif /* CONFIG_IP_ROUTE_CLASSID */
3059
3060 int __init ip_rt_init(void)
3061 {
3062 int cpu;
3063
3064 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3065 if (!ip_idents)
3066 panic("IP: failed to allocate ip_idents\n");
3067
3068 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3069
3070 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3071 if (!ip_tstamps)
3072 panic("IP: failed to allocate ip_tstamps\n");
3073
3074 for_each_possible_cpu(cpu) {
3075 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3076
3077 INIT_LIST_HEAD(&ul->head);
3078 spin_lock_init(&ul->lock);
3079 }
3080 #ifdef CONFIG_IP_ROUTE_CLASSID
3081 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3082 if (!ip_rt_acct)
3083 panic("IP: failed to allocate ip_rt_acct\n");
3084 #endif
3085
3086 ipv4_dst_ops.kmem_cachep =
3087 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3088 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3089
3090 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3091
3092 if (dst_entries_init(&ipv4_dst_ops) < 0)
3093 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3094
3095 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3096 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3097
3098 ipv4_dst_ops.gc_thresh = ~0;
3099 ip_rt_max_size = INT_MAX;
3100
3101 devinet_init();
3102 ip_fib_init();
3103
3104 if (ip_rt_proc_init())
3105 pr_err("Unable to create route proc files\n");
3106 #ifdef CONFIG_XFRM
3107 xfrm_init();
3108 xfrm4_init();
3109 #endif
3110 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3111 RTNL_FLAG_DOIT_UNLOCKED);
3112
3113 #ifdef CONFIG_SYSCTL
3114 register_pernet_subsys(&sysctl_route_ops);
3115 #endif
3116 register_pernet_subsys(&rt_genid_ops);
3117 register_pernet_subsys(&ipv4_inetpeer_ops);
3118 return 0;
3119 }
3120
3121 #ifdef CONFIG_SYSCTL
3122 /*
3123 * We really need to sanitize the damn ipv4 init order, then all
3124 * this nonsense will go away.
3125 */
3126 void __init ip_static_sysctl_init(void)
3127 {
3128 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3129 }
3130 #endif