net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/bootmem.h>
  74 #include <linux/string.h>
  75 #include <linux/socket.h>
  76 #include <linux/sockios.h>
  77 #include <linux/errno.h>
  78 #include <linux/in.h>
  79 #include <linux/inet.h>
  80 #include <linux/netdevice.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/init.h>
  83 #include <linux/workqueue.h>
  84 #include <linux/skbuff.h>
  85 #include <linux/inetdevice.h>
  86 #include <linux/igmp.h>
  87 #include <linux/pkt_sched.h>
  88 #include <linux/mroute.h>
  89 #include <linux/netfilter_ipv4.h>
  90 #include <linux/random.h>
  91 #include <linux/jhash.h>
  92 #include <linux/rcupdate.h>
  93 #include <linux/times.h>
  94 #include <linux/slab.h>
  95 #include <linux/prefetch.h>
  96 #include <net/dst.h>
  97 #include <net/net_namespace.h>
  98 #include <net/protocol.h>
  99 #include <net/ip.h>
 100 #include <net/route.h>
 101 #include <net/inetpeer.h>
 102 #include <net/sock.h>
 103 #include <net/ip_fib.h>
 104 #include <net/arp.h>
 105 #include <net/tcp.h>
 106 #include <net/icmp.h>
 107 #include <net/xfrm.h>
 108 #include <net/netevent.h>
 109 #include <net/rtnetlink.h>
 110 #ifdef CONFIG_SYSCTL
 111 #include <linux/sysctl.h>
 112 #include <linux/kmemleak.h>
 113 #endif
 114 #include <net/secure_seq.h>
 115
 116 #define RT_FL_TOS(oldflp4) \
 117         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 118
 119 #define IP_MAX_MTU      0xFFF0
 120
 121 #define RT_GC_TIMEOUT (300*HZ)
 122
 123 static int ip_rt_max_size;
 124 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 126 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 127 static int ip_rt_redirect_number __read_mostly  = 9;
 128 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 130 static int ip_rt_error_cost __read_mostly       = HZ;
 131 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 132 static int ip_rt_gc_elasticity __read_mostly    = 8;
 133 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 134 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 135 static int ip_rt_min_advmss __read_mostly       = 256;
 136 static int rt_chain_length_max __read_mostly    = 20;
 137
 138 static struct delayed_work expires_work;
 139 static unsigned long expires_ljiffies;
 140
 141 /*
 142  *      Interface to generic destination cache.
 143  */
 144
 145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 146 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 147 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 148 static void              ipv4_dst_destroy(struct dst_entry *dst);
 149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 150 static void              ipv4_link_failure(struct sk_buff *skb);
 151 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 152                                            struct sk_buff *skb, u32 mtu);
 153 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 154                                         struct sk_buff *skb);
 155 static int rt_garbage_collect(struct dst_ops *ops);
 156
 157 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 158                             int how)
 159 {
 160 }
 161
 162 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 163 {
 164         WARN_ON(1);
 165         return NULL;
 166 }
 167
 168 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 169                                            struct sk_buff *skb,
 170                                            const void *daddr);
 171
 172 static struct dst_ops ipv4_dst_ops = {
 173         .family =               AF_INET,
 174         .protocol =             cpu_to_be16(ETH_P_IP),
 175         .gc =                   rt_garbage_collect,
 176         .check =                ipv4_dst_check,
 177         .default_advmss =       ipv4_default_advmss,
 178         .mtu =                  ipv4_mtu,
 179         .cow_metrics =          ipv4_cow_metrics,
 180         .destroy =              ipv4_dst_destroy,
 181         .ifdown =               ipv4_dst_ifdown,
 182         .negative_advice =      ipv4_negative_advice,
 183         .link_failure =         ipv4_link_failure,
 184         .update_pmtu =          ip_rt_update_pmtu,
 185         .redirect =             ip_do_redirect,
 186         .local_out =            __ip_local_out,
 187         .neigh_lookup =         ipv4_neigh_lookup,
 188 };
 189
 190 #define ECN_OR_COST(class)      TC_PRIO_##class
 191
 192 const __u8 ip_tos2prio[16] = {
 193         TC_PRIO_BESTEFFORT,
 194         ECN_OR_COST(BESTEFFORT),
 195         TC_PRIO_BESTEFFORT,
 196         ECN_OR_COST(BESTEFFORT),
 197         TC_PRIO_BULK,
 198         ECN_OR_COST(BULK),
 199         TC_PRIO_BULK,
 200         ECN_OR_COST(BULK),
 201         TC_PRIO_INTERACTIVE,
 202         ECN_OR_COST(INTERACTIVE),
 203         TC_PRIO_INTERACTIVE,
 204         ECN_OR_COST(INTERACTIVE),
 205         TC_PRIO_INTERACTIVE_BULK,
 206         ECN_OR_COST(INTERACTIVE_BULK),
 207         TC_PRIO_INTERACTIVE_BULK,
 208         ECN_OR_COST(INTERACTIVE_BULK)
 209 };
 210 EXPORT_SYMBOL(ip_tos2prio);
 211
 212 /*
 213  * Route cache.
 214  */
 215
 216 /* The locking scheme is rather straight forward:
 217  *
 218  * 1) Read-Copy Update protects the buckets of the central route hash.
 219  * 2) Only writers remove entries, and they hold the lock
 220  *    as they look at rtable reference counts.
 221  * 3) Only readers acquire references to rtable entries,
 222  *    they do so with atomic increments and with the
 223  *    lock held.
 224  */
 225
 226 struct rt_hash_bucket {
 227         struct rtable __rcu     *chain;
 228 };
 229
 230 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 231         defined(CONFIG_PROVE_LOCKING)
 232 /*
 233  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 234  * The size of this table is a power of two and depends on the number of CPUS.
 235  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 236  */
 237 #ifdef CONFIG_LOCKDEP
 238 # define RT_HASH_LOCK_SZ        256
 239 #else
 240 # if NR_CPUS >= 32
 241 #  define RT_HASH_LOCK_SZ       4096
 242 # elif NR_CPUS >= 16
 243 #  define RT_HASH_LOCK_SZ       2048
 244 # elif NR_CPUS >= 8
 245 #  define RT_HASH_LOCK_SZ       1024
 246 # elif NR_CPUS >= 4
 247 #  define RT_HASH_LOCK_SZ       512
 248 # else
 249 #  define RT_HASH_LOCK_SZ       256
 250 # endif
 251 #endif
 252
 253 static spinlock_t       *rt_hash_locks;
 254 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 255
 256 static __init void rt_hash_lock_init(void)
 257 {
 258         int i;
 259
 260         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 261                         GFP_KERNEL);
 262         if (!rt_hash_locks)
 263                 panic("IP: failed to allocate rt_hash_locks\n");
 264
 265         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 266                 spin_lock_init(&rt_hash_locks[i]);
 267 }
 268 #else
 269 # define rt_hash_lock_addr(slot) NULL
 270
 271 static inline void rt_hash_lock_init(void)
 272 {
 273 }
 274 #endif
 275
 276 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 277 static unsigned int             rt_hash_mask __read_mostly;
 278 static unsigned int             rt_hash_log  __read_mostly;
 279
 280 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 281 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 282
 283 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 284                                    int genid)
 285 {
 286         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 287                             idx, genid)
 288                 & rt_hash_mask;
 289 }
 290
 291 static inline int rt_genid(struct net *net)
 292 {
 293         return atomic_read(&net->ipv4.rt_genid);
 294 }
 295
 296 #ifdef CONFIG_PROC_FS
 297 struct rt_cache_iter_state {
 298         struct seq_net_private p;
 299         int bucket;
 300         int genid;
 301 };
 302
 303 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 304 {
 305         struct rt_cache_iter_state *st = seq->private;
 306         struct rtable *r = NULL;
 307
 308         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 309                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
 310                         continue;
 311                 rcu_read_lock_bh();
 312                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 313                 while (r) {
 314                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 315                             r->rt_genid == st->genid)
 316                                 return r;
 317                         r = rcu_dereference_bh(r->dst.rt_next);
 318                 }
 319                 rcu_read_unlock_bh();
 320         }
 321         return r;
 322 }
 323
 324 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 325                                           struct rtable *r)
 326 {
 327         struct rt_cache_iter_state *st = seq->private;
 328
 329         r = rcu_dereference_bh(r->dst.rt_next);
 330         while (!r) {
 331                 rcu_read_unlock_bh();
 332                 do {
 333                         if (--st->bucket < 0)
 334                                 return NULL;
 335                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
 336                 rcu_read_lock_bh();
 337                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 338         }
 339         return r;
 340 }
 341
 342 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 343                                         struct rtable *r)
 344 {
 345         struct rt_cache_iter_state *st = seq->private;
 346         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 347                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 348                         continue;
 349                 if (r->rt_genid == st->genid)
 350                         break;
 351         }
 352         return r;
 353 }
 354
 355 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 356 {
 357         struct rtable *r = rt_cache_get_first(seq);
 358
 359         if (r)
 360                 while (pos && (r = rt_cache_get_next(seq, r)))
 361                         --pos;
 362         return pos ? NULL : r;
 363 }
 364
 365 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 366 {
 367         struct rt_cache_iter_state *st = seq->private;
 368         if (*pos)
 369                 return rt_cache_get_idx(seq, *pos - 1);
 370         st->genid = rt_genid(seq_file_net(seq));
 371         return SEQ_START_TOKEN;
 372 }
 373
 374 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 375 {
 376         struct rtable *r;
 377
 378         if (v == SEQ_START_TOKEN)
 379                 r = rt_cache_get_first(seq);
 380         else
 381                 r = rt_cache_get_next(seq, v);
 382         ++*pos;
 383         return r;
 384 }
 385
 386 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 387 {
 388         if (v && v != SEQ_START_TOKEN)
 389                 rcu_read_unlock_bh();
 390 }
 391
 392 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 393 {
 394         if (v == SEQ_START_TOKEN)
 395                 seq_printf(seq, "%-127s\n",
 396                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 397                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 398                            "HHUptod\tSpecDst");
 399         else {
 400                 struct rtable *r = v;
 401                 int len;
 402
 403                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 404                            "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 405                            r->dst.dev ? r->dst.dev->name : "*",
 406                            (__force u32)r->rt_dst,
 407                            (__force u32)r->rt_gateway,
 408                            r->rt_flags, atomic_read(&r->dst.__refcnt),
 409                            r->dst.__use, 0, (__force u32)r->rt_src,
 410                            dst_metric_advmss(&r->dst) + 40,
 411                            dst_metric(&r->dst, RTAX_WINDOW), 0,
 412                            r->rt_key_tos,
 413                            -1, 0, 0, &len);
 414
 415                 seq_printf(seq, "%*s\n", 127 - len, "");
 416         }
 417         return 0;
 418 }
 419
 420 static const struct seq_operations rt_cache_seq_ops = {
 421         .start  = rt_cache_seq_start,
 422         .next   = rt_cache_seq_next,
 423         .stop   = rt_cache_seq_stop,
 424         .show   = rt_cache_seq_show,
 425 };
 426
 427 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 428 {
 429         return seq_open_net(inode, file, &rt_cache_seq_ops,
 430                         sizeof(struct rt_cache_iter_state));
 431 }
 432
 433 static const struct file_operations rt_cache_seq_fops = {
 434         .owner   = THIS_MODULE,
 435         .open    = rt_cache_seq_open,
 436         .read    = seq_read,
 437         .llseek  = seq_lseek,
 438         .release = seq_release_net,
 439 };
 440
 441
 442 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 443 {
 444         int cpu;
 445
 446         if (*pos == 0)
 447                 return SEQ_START_TOKEN;
 448
 449         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 450                 if (!cpu_possible(cpu))
 451                         continue;
 452                 *pos = cpu+1;
 453                 return &per_cpu(rt_cache_stat, cpu);
 454         }
 455         return NULL;
 456 }
 457
 458 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 459 {
 460         int cpu;
 461
 462         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 463                 if (!cpu_possible(cpu))
 464                         continue;
 465                 *pos = cpu+1;
 466                 return &per_cpu(rt_cache_stat, cpu);
 467         }
 468         return NULL;
 469
 470 }
 471
 472 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 473 {
 474
 475 }
 476
 477 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 478 {
 479         struct rt_cache_stat *st = v;
 480
 481         if (v == SEQ_START_TOKEN) {
 482                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 483                 return 0;
 484         }
 485
 486         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 487                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 488                    dst_entries_get_slow(&ipv4_dst_ops),
 489                    st->in_hit,
 490                    st->in_slow_tot,
 491                    st->in_slow_mc,
 492                    st->in_no_route,
 493                    st->in_brd,
 494                    st->in_martian_dst,
 495                    st->in_martian_src,
 496
 497                    st->out_hit,
 498                    st->out_slow_tot,
 499                    st->out_slow_mc,
 500
 501                    st->gc_total,
 502                    st->gc_ignored,
 503                    st->gc_goal_miss,
 504                    st->gc_dst_overflow,
 505                    st->in_hlist_search,
 506                    st->out_hlist_search
 507                 );
 508         return 0;
 509 }
 510
 511 static const struct seq_operations rt_cpu_seq_ops = {
 512         .start  = rt_cpu_seq_start,
 513         .next   = rt_cpu_seq_next,
 514         .stop   = rt_cpu_seq_stop,
 515         .show   = rt_cpu_seq_show,
 516 };
 517
 518
 519 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 520 {
 521         return seq_open(file, &rt_cpu_seq_ops);
 522 }
 523
 524 static const struct file_operations rt_cpu_seq_fops = {
 525         .owner   = THIS_MODULE,
 526         .open    = rt_cpu_seq_open,
 527         .read    = seq_read,
 528         .llseek  = seq_lseek,
 529         .release = seq_release,
 530 };
 531
 532 #ifdef CONFIG_IP_ROUTE_CLASSID
 533 static int rt_acct_proc_show(struct seq_file *m, void *v)
 534 {
 535         struct ip_rt_acct *dst, *src;
 536         unsigned int i, j;
 537
 538         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 539         if (!dst)
 540                 return -ENOMEM;
 541
 542         for_each_possible_cpu(i) {
 543                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 544                 for (j = 0; j < 256; j++) {
 545                         dst[j].o_bytes   += src[j].o_bytes;
 546                         dst[j].o_packets += src[j].o_packets;
 547                         dst[j].i_bytes   += src[j].i_bytes;
 548                         dst[j].i_packets += src[j].i_packets;
 549                 }
 550         }
 551
 552         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 553         kfree(dst);
 554         return 0;
 555 }
 556
 557 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 558 {
 559         return single_open(file, rt_acct_proc_show, NULL);
 560 }
 561
 562 static const struct file_operations rt_acct_proc_fops = {
 563         .owner          = THIS_MODULE,
 564         .open           = rt_acct_proc_open,
 565         .read           = seq_read,
 566         .llseek         = seq_lseek,
 567         .release        = single_release,
 568 };
 569 #endif
 570
 571 static int __net_init ip_rt_do_proc_init(struct net *net)
 572 {
 573         struct proc_dir_entry *pde;
 574
 575         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 576                         &rt_cache_seq_fops);
 577         if (!pde)
 578                 goto err1;
 579
 580         pde = proc_create("rt_cache", S_IRUGO,
 581                           net->proc_net_stat, &rt_cpu_seq_fops);
 582         if (!pde)
 583                 goto err2;
 584
 585 #ifdef CONFIG_IP_ROUTE_CLASSID
 586         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 587         if (!pde)
 588                 goto err3;
 589 #endif
 590         return 0;
 591
 592 #ifdef CONFIG_IP_ROUTE_CLASSID
 593 err3:
 594         remove_proc_entry("rt_cache", net->proc_net_stat);
 595 #endif
 596 err2:
 597         remove_proc_entry("rt_cache", net->proc_net);
 598 err1:
 599         return -ENOMEM;
 600 }
 601
 602 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 603 {
 604         remove_proc_entry("rt_cache", net->proc_net_stat);
 605         remove_proc_entry("rt_cache", net->proc_net);
 606 #ifdef CONFIG_IP_ROUTE_CLASSID
 607         remove_proc_entry("rt_acct", net->proc_net);
 608 #endif
 609 }
 610
 611 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 612         .init = ip_rt_do_proc_init,
 613         .exit = ip_rt_do_proc_exit,
 614 };
 615
 616 static int __init ip_rt_proc_init(void)
 617 {
 618         return register_pernet_subsys(&ip_rt_proc_ops);
 619 }
 620
 621 #else
 622 static inline int ip_rt_proc_init(void)
 623 {
 624         return 0;
 625 }
 626 #endif /* CONFIG_PROC_FS */
 627
 628 static inline void rt_free(struct rtable *rt)
 629 {
 630         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 631 }
 632
 633 static inline void rt_drop(struct rtable *rt)
 634 {
 635         ip_rt_put(rt);
 636         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 637 }
 638
 639 static inline int rt_fast_clean(struct rtable *rth)
 640 {
 641         /* Kill broadcast/multicast entries very aggresively, if they
 642            collide in hash table with more useful entries */
 643         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 644                 rt_is_input_route(rth) && rth->dst.rt_next;
 645 }
 646
 647 static inline int rt_valuable(struct rtable *rth)
 648 {
 649         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 650                 rth->dst.expires;
 651 }
 652
 653 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 654 {
 655         unsigned long age;
 656         int ret = 0;
 657
 658         if (atomic_read(&rth->dst.__refcnt))
 659                 goto out;
 660
 661         age = jiffies - rth->dst.lastuse;
 662         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 663             (age <= tmo2 && rt_valuable(rth)))
 664                 goto out;
 665         ret = 1;
 666 out:    return ret;
 667 }
 668
 669 /* Bits of score are:
 670  * 31: very valuable
 671  * 30: not quite useless
 672  * 29..0: usage counter
 673  */
 674 static inline u32 rt_score(struct rtable *rt)
 675 {
 676         u32 score = jiffies - rt->dst.lastuse;
 677
 678         score = ~score & ~(3<<30);
 679
 680         if (rt_valuable(rt))
 681                 score |= (1<<31);
 682
 683         if (rt_is_output_route(rt) ||
 684             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 685                 score |= (1<<30);
 686
 687         return score;
 688 }
 689
 690 static inline bool rt_caching(const struct net *net)
 691 {
 692         return net->ipv4.current_rt_cache_rebuild_count <=
 693                 net->ipv4.sysctl_rt_cache_rebuild_count;
 694 }
 695
 696 static inline bool compare_hash_inputs(const struct rtable *rt1,
 697                                        const struct rtable *rt2)
 698 {
 699         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 700                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 701                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
 702 }
 703
 704 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 705 {
 706         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 707                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 708                 (rt1->rt_mark ^ rt2->rt_mark) |
 709                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
 710                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
 711                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
 712 }
 713
 714 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 715 {
 716         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 717 }
 718
 719 static inline int rt_is_expired(struct rtable *rth)
 720 {
 721         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 722 }
 723
 724 /*
 725  * Perform a full scan of hash table and free all entries.
 726  * Can be called by a softirq or a process.
 727  * In the later case, we want to be reschedule if necessary
 728  */
 729 static void rt_do_flush(struct net *net, int process_context)
 730 {
 731         unsigned int i;
 732         struct rtable *rth, *next;
 733
 734         for (i = 0; i <= rt_hash_mask; i++) {
 735                 struct rtable __rcu **pprev;
 736                 struct rtable *list;
 737
 738                 if (process_context && need_resched())
 739                         cond_resched();
 740                 rth = rcu_access_pointer(rt_hash_table[i].chain);
 741                 if (!rth)
 742                         continue;
 743
 744                 spin_lock_bh(rt_hash_lock_addr(i));
 745
 746                 list = NULL;
 747                 pprev = &rt_hash_table[i].chain;
 748                 rth = rcu_dereference_protected(*pprev,
 749                         lockdep_is_held(rt_hash_lock_addr(i)));
 750
 751                 while (rth) {
 752                         next = rcu_dereference_protected(rth->dst.rt_next,
 753                                 lockdep_is_held(rt_hash_lock_addr(i)));
 754
 755                         if (!net ||
 756                             net_eq(dev_net(rth->dst.dev), net)) {
 757                                 rcu_assign_pointer(*pprev, next);
 758                                 rcu_assign_pointer(rth->dst.rt_next, list);
 759                                 list = rth;
 760                         } else {
 761                                 pprev = &rth->dst.rt_next;
 762                         }
 763                         rth = next;
 764                 }
 765
 766                 spin_unlock_bh(rt_hash_lock_addr(i));
 767
 768                 for (; list; list = next) {
 769                         next = rcu_dereference_protected(list->dst.rt_next, 1);
 770                         rt_free(list);
 771                 }
 772         }
 773 }
 774
 775 /*
 776  * While freeing expired entries, we compute average chain length
 777  * and standard deviation, using fixed-point arithmetic.
 778  * This to have an estimation of rt_chain_length_max
 779  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 780  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 781  */
 782
 783 #define FRACT_BITS 3
 784 #define ONE (1UL << FRACT_BITS)
 785
 786 /*
 787  * Given a hash chain and an item in this hash chain,
 788  * find if a previous entry has the same hash_inputs
 789  * (but differs on tos, mark or oif)
 790  * Returns 0 if an alias is found.
 791  * Returns ONE if rth has no alias before itself.
 792  */
 793 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 794 {
 795         const struct rtable *aux = head;
 796
 797         while (aux != rth) {
 798                 if (compare_hash_inputs(aux, rth))
 799                         return 0;
 800                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 801         }
 802         return ONE;
 803 }
 804
 805 static void rt_check_expire(void)
 806 {
 807         static unsigned int rover;
 808         unsigned int i = rover, goal;
 809         struct rtable *rth;
 810         struct rtable __rcu **rthp;
 811         unsigned long samples = 0;
 812         unsigned long sum = 0, sum2 = 0;
 813         unsigned long delta;
 814         u64 mult;
 815
 816         delta = jiffies - expires_ljiffies;
 817         expires_ljiffies = jiffies;
 818         mult = ((u64)delta) << rt_hash_log;
 819         if (ip_rt_gc_timeout > 1)
 820                 do_div(mult, ip_rt_gc_timeout);
 821         goal = (unsigned int)mult;
 822         if (goal > rt_hash_mask)
 823                 goal = rt_hash_mask + 1;
 824         for (; goal > 0; goal--) {
 825                 unsigned long tmo = ip_rt_gc_timeout;
 826                 unsigned long length;
 827
 828                 i = (i + 1) & rt_hash_mask;
 829                 rthp = &rt_hash_table[i].chain;
 830
 831                 if (need_resched())
 832                         cond_resched();
 833
 834                 samples++;
 835
 836                 if (rcu_dereference_raw(*rthp) == NULL)
 837                         continue;
 838                 length = 0;
 839                 spin_lock_bh(rt_hash_lock_addr(i));
 840                 while ((rth = rcu_dereference_protected(*rthp,
 841                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
 842                         prefetch(rth->dst.rt_next);
 843                         if (rt_is_expired(rth) ||
 844                             rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 845                                 *rthp = rth->dst.rt_next;
 846                                 rt_free(rth);
 847                                 continue;
 848                         }
 849
 850                         /* We only count entries on a chain with equal
 851                          * hash inputs once so that entries for
 852                          * different QOS levels, and other non-hash
 853                          * input attributes don't unfairly skew the
 854                          * length computation
 855                          */
 856                         tmo >>= 1;
 857                         rthp = &rth->dst.rt_next;
 858                         length += has_noalias(rt_hash_table[i].chain, rth);
 859                 }
 860                 spin_unlock_bh(rt_hash_lock_addr(i));
 861                 sum += length;
 862                 sum2 += length*length;
 863         }
 864         if (samples) {
 865                 unsigned long avg = sum / samples;
 866                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 867                 rt_chain_length_max = max_t(unsigned long,
 868                                         ip_rt_gc_elasticity,
 869                                         (avg + 4*sd) >> FRACT_BITS);
 870         }
 871         rover = i;
 872 }
 873
 874 /*
 875  * rt_worker_func() is run in process context.
 876  * we call rt_check_expire() to scan part of the hash table
 877  */
 878 static void rt_worker_func(struct work_struct *work)
 879 {
 880         rt_check_expire();
 881         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 882 }
 883
 884 /*
 885  * Perturbation of rt_genid by a small quantity [1..256]
 886  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 887  * many times (2^24) without giving recent rt_genid.
 888  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 889  */
 890 static void rt_cache_invalidate(struct net *net)
 891 {
 892         unsigned char shuffle;
 893
 894         get_random_bytes(&shuffle, sizeof(shuffle));
 895         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 896 }
 897
 898 /*
 899  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 900  * delay >= 0 : invalidate & flush cache (can be long)
 901  */
 902 void rt_cache_flush(struct net *net, int delay)
 903 {
 904         rt_cache_invalidate(net);
 905         if (delay >= 0)
 906                 rt_do_flush(net, !in_softirq());
 907 }
 908
 909 /* Flush previous cache invalidated entries from the cache */
 910 void rt_cache_flush_batch(struct net *net)
 911 {
 912         rt_do_flush(net, !in_softirq());
 913 }
 914
 915 static void rt_emergency_hash_rebuild(struct net *net)
 916 {
 917         net_warn_ratelimited("Route hash chain too long!\n");
 918         rt_cache_invalidate(net);
 919 }
 920
 921 /*
 922    Short description of GC goals.
 923
 924    We want to build algorithm, which will keep routing cache
 925    at some equilibrium point, when number of aged off entries
 926    is kept approximately equal to newly generated ones.
 927
 928    Current expiration strength is variable "expire".
 929    We try to adjust it dynamically, so that if networking
 930    is idle expires is large enough to keep enough of warm entries,
 931    and when load increases it reduces to limit cache size.
 932  */
 933
 934 static int rt_garbage_collect(struct dst_ops *ops)
 935 {
 936         static unsigned long expire = RT_GC_TIMEOUT;
 937         static unsigned long last_gc;
 938         static int rover;
 939         static int equilibrium;
 940         struct rtable *rth;
 941         struct rtable __rcu **rthp;
 942         unsigned long now = jiffies;
 943         int goal;
 944         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 945
 946         /*
 947          * Garbage collection is pretty expensive,
 948          * do not make it too frequently.
 949          */
 950
 951         RT_CACHE_STAT_INC(gc_total);
 952
 953         if (now - last_gc < ip_rt_gc_min_interval &&
 954             entries < ip_rt_max_size) {
 955                 RT_CACHE_STAT_INC(gc_ignored);
 956                 goto out;
 957         }
 958
 959         entries = dst_entries_get_slow(&ipv4_dst_ops);
 960         /* Calculate number of entries, which we want to expire now. */
 961         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
 962         if (goal <= 0) {
 963                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 964                         equilibrium = ipv4_dst_ops.gc_thresh;
 965                 goal = entries - equilibrium;
 966                 if (goal > 0) {
 967                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 968                         goal = entries - equilibrium;
 969                 }
 970         } else {
 971                 /* We are in dangerous area. Try to reduce cache really
 972                  * aggressively.
 973                  */
 974                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 975                 equilibrium = entries - goal;
 976         }
 977
 978         if (now - last_gc >= ip_rt_gc_min_interval)
 979                 last_gc = now;
 980
 981         if (goal <= 0) {
 982                 equilibrium += goal;
 983                 goto work_done;
 984         }
 985
 986         do {
 987                 int i, k;
 988
 989                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 990                         unsigned long tmo = expire;
 991
 992                         k = (k + 1) & rt_hash_mask;
 993                         rthp = &rt_hash_table[k].chain;
 994                         spin_lock_bh(rt_hash_lock_addr(k));
 995                         while ((rth = rcu_dereference_protected(*rthp,
 996                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
 997                                 if (!rt_is_expired(rth) &&
 998                                         !rt_may_expire(rth, tmo, expire)) {
 999                                         tmo >>= 1;
1000                                         rthp = &rth->dst.rt_next;
1001                                         continue;
1002                                 }
1003                                 *rthp = rth->dst.rt_next;
1004                                 rt_free(rth);
1005                                 goal--;
1006                         }
1007                         spin_unlock_bh(rt_hash_lock_addr(k));
1008                         if (goal <= 0)
1009                                 break;
1010                 }
1011                 rover = k;
1012
1013                 if (goal <= 0)
1014                         goto work_done;
1015
1016                 /* Goal is not achieved. We stop process if:
1017
1018                    - if expire reduced to zero. Otherwise, expire is halfed.
1019                    - if table is not full.
1020                    - if we are called from interrupt.
1021                    - jiffies check is just fallback/debug loop breaker.
1022                      We will not spin here for long time in any case.
1023                  */
1024
1025                 RT_CACHE_STAT_INC(gc_goal_miss);
1026
1027                 if (expire == 0)
1028                         break;
1029
1030                 expire >>= 1;
1031
1032                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1033                         goto out;
1034         } while (!in_softirq() && time_before_eq(jiffies, now));
1035
1036         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1037                 goto out;
1038         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1039                 goto out;
1040         net_warn_ratelimited("dst cache overflow\n");
1041         RT_CACHE_STAT_INC(gc_dst_overflow);
1042         return 1;
1043
1044 work_done:
1045         expire += ip_rt_gc_min_interval;
1046         if (expire > ip_rt_gc_timeout ||
1047             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1048             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1049                 expire = ip_rt_gc_timeout;
1050 out:    return 0;
1051 }
1052
1053 /*
1054  * Returns number of entries in a hash chain that have different hash_inputs
1055  */
1056 static int slow_chain_length(const struct rtable *head)
1057 {
1058         int length = 0;
1059         const struct rtable *rth = head;
1060
1061         while (rth) {
1062                 length += has_noalias(head, rth);
1063                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1064         }
1065         return length >> FRACT_BITS;
1066 }
1067
1068 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
1069                                            struct sk_buff *skb,
1070                                            const void *daddr)
1071 {
1072         struct net_device *dev = dst->dev;
1073         const __be32 *pkey = daddr;
1074         const struct rtable *rt;
1075         struct neighbour *n;
1076
1077         rt = (const struct rtable *) dst;
1078         if (rt->rt_gateway)
1079                 pkey = (const __be32 *) &rt->rt_gateway;
1080         else if (skb)
1081                 pkey = &ip_hdr(skb)->daddr;
1082
1083         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1084         if (n)
1085                 return n;
1086         return neigh_create(&arp_tbl, pkey, dev);
1087 }
1088
1089 static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1090                                      struct sk_buff *skb, int ifindex)
1091 {
1092         struct rtable   *rth, *cand;
1093         struct rtable __rcu **rthp, **candp;
1094         unsigned long   now;
1095         u32             min_score;
1096         int             chain_length;
1097
1098 restart:
1099         chain_length = 0;
1100         min_score = ~(u32)0;
1101         cand = NULL;
1102         candp = NULL;
1103         now = jiffies;
1104
1105         if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
1106                 /*
1107                  * If we're not caching, just tell the caller we
1108                  * were successful and don't touch the route.  The
1109                  * caller hold the sole reference to the cache entry, and
1110                  * it will be released when the caller is done with it.
1111                  * If we drop it here, the callers have no way to resolve routes
1112                  * when we're not caching.  Instead, just point *rp at rt, so
1113                  * the caller gets a single use out of the route
1114                  * Note that we do rt_free on this new route entry, so that
1115                  * once its refcount hits zero, we are still able to reap it
1116                  * (Thanks Alexey)
1117                  * Note: To avoid expensive rcu stuff for this uncached dst,
1118                  * we set DST_NOCACHE so that dst_release() can free dst without
1119                  * waiting a grace period.
1120                  */
1121
1122                 rt->dst.flags |= DST_NOCACHE;
1123                 goto skip_hashing;
1124         }
1125
1126         rthp = &rt_hash_table[hash].chain;
1127
1128         spin_lock_bh(rt_hash_lock_addr(hash));
1129         while ((rth = rcu_dereference_protected(*rthp,
1130                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1131                 if (rt_is_expired(rth)) {
1132                         *rthp = rth->dst.rt_next;
1133                         rt_free(rth);
1134                         continue;
1135                 }
1136                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1137                         /* Put it first */
1138                         *rthp = rth->dst.rt_next;
1139                         /*
1140                          * Since lookup is lockfree, the deletion
1141                          * must be visible to another weakly ordered CPU before
1142                          * the insertion at the start of the hash chain.
1143                          */
1144                         rcu_assign_pointer(rth->dst.rt_next,
1145                                            rt_hash_table[hash].chain);
1146                         /*
1147                          * Since lookup is lockfree, the update writes
1148                          * must be ordered for consistency on SMP.
1149                          */
1150                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1151
1152                         dst_use(&rth->dst, now);
1153                         spin_unlock_bh(rt_hash_lock_addr(hash));
1154
1155                         rt_drop(rt);
1156                         if (skb)
1157                                 skb_dst_set(skb, &rth->dst);
1158                         return rth;
1159                 }
1160
1161                 if (!atomic_read(&rth->dst.__refcnt)) {
1162                         u32 score = rt_score(rth);
1163
1164                         if (score <= min_score) {
1165                                 cand = rth;
1166                                 candp = rthp;
1167                                 min_score = score;
1168                         }
1169                 }
1170
1171                 chain_length++;
1172
1173                 rthp = &rth->dst.rt_next;
1174         }
1175
1176         if (cand) {
1177                 /* ip_rt_gc_elasticity used to be average length of chain
1178                  * length, when exceeded gc becomes really aggressive.
1179                  *
1180                  * The second limit is less certain. At the moment it allows
1181                  * only 2 entries per bucket. We will see.
1182                  */
1183                 if (chain_length > ip_rt_gc_elasticity) {
1184                         *candp = cand->dst.rt_next;
1185                         rt_free(cand);
1186                 }
1187         } else {
1188                 if (chain_length > rt_chain_length_max &&
1189                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1190                         struct net *net = dev_net(rt->dst.dev);
1191                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1192                         if (!rt_caching(net)) {
1193                                 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1194                                         rt->dst.dev->name, num);
1195                         }
1196                         rt_emergency_hash_rebuild(net);
1197                         spin_unlock_bh(rt_hash_lock_addr(hash));
1198
1199                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1200                                         ifindex, rt_genid(net));
1201                         goto restart;
1202                 }
1203         }
1204
1205         rt->dst.rt_next = rt_hash_table[hash].chain;
1206
1207         /*
1208          * Since lookup is lockfree, we must make sure
1209          * previous writes to rt are committed to memory
1210          * before making rt visible to other CPUS.
1211          */
1212         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1213
1214         spin_unlock_bh(rt_hash_lock_addr(hash));
1215
1216 skip_hashing:
1217         if (skb)
1218                 skb_dst_set(skb, &rt->dst);
1219         return rt;
1220 }
1221
1222 /*
1223  * Peer allocation may fail only in serious out-of-memory conditions.  However
1224  * we still can generate some output.
1225  * Random ID selection looks a bit dangerous because we have no chances to
1226  * select ID being unique in a reasonable period of time.
1227  * But broken packet identifier may be better than no packet at all.
1228  */
1229 static void ip_select_fb_ident(struct iphdr *iph)
1230 {
1231         static DEFINE_SPINLOCK(ip_fb_id_lock);
1232         static u32 ip_fallback_id;
1233         u32 salt;
1234
1235         spin_lock_bh(&ip_fb_id_lock);
1236         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1237         iph->id = htons(salt & 0xFFFF);
1238         ip_fallback_id = salt;
1239         spin_unlock_bh(&ip_fb_id_lock);
1240 }
1241
1242 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1243 {
1244         struct net *net = dev_net(dst->dev);
1245         struct inet_peer *peer;
1246
1247         peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
1248         if (peer) {
1249                 iph->id = htons(inet_getid(peer, more));
1250                 inet_putpeer(peer);
1251                 return;
1252         }
1253
1254         ip_select_fb_ident(iph);
1255 }
1256 EXPORT_SYMBOL(__ip_select_ident);
1257
1258 static void rt_del(unsigned int hash, struct rtable *rt)
1259 {
1260         struct rtable __rcu **rthp;
1261         struct rtable *aux;
1262
1263         rthp = &rt_hash_table[hash].chain;
1264         spin_lock_bh(rt_hash_lock_addr(hash));
1265         ip_rt_put(rt);
1266         while ((aux = rcu_dereference_protected(*rthp,
1267                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1268                 if (aux == rt || rt_is_expired(aux)) {
1269                         *rthp = aux->dst.rt_next;
1270                         rt_free(aux);
1271                         continue;
1272                 }
1273                 rthp = &aux->dst.rt_next;
1274         }
1275         spin_unlock_bh(rt_hash_lock_addr(hash));
1276 }
1277
1278 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
1279                              const struct iphdr *iph,
1280                              int oif, u8 tos,
1281                              u8 prot, u32 mark, int flow_flags)
1282 {
1283         if (sk) {
1284                 const struct inet_sock *inet = inet_sk(sk);
1285
1286                 oif = sk->sk_bound_dev_if;
1287                 mark = sk->sk_mark;
1288                 tos = RT_CONN_FLAGS(sk);
1289                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
1290         }
1291         flowi4_init_output(fl4, oif, mark, tos,
1292                            RT_SCOPE_UNIVERSE, prot,
1293                            flow_flags,
1294                            iph->daddr, iph->saddr, 0, 0);
1295 }
1296
1297 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
1298                                const struct sock *sk)
1299 {
1300         const struct iphdr *iph = ip_hdr(skb);
1301         int oif = skb->dev->ifindex;
1302         u8 tos = RT_TOS(iph->tos);
1303         u8 prot = iph->protocol;
1304         u32 mark = skb->mark;
1305
1306         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
1307 }
1308
1309 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
1310 {
1311         const struct inet_sock *inet = inet_sk(sk);
1312         const struct ip_options_rcu *inet_opt;
1313         __be32 daddr = inet->inet_daddr;
1314
1315         rcu_read_lock();
1316         inet_opt = rcu_dereference(inet->inet_opt);
1317         if (inet_opt && inet_opt->opt.srr)
1318                 daddr = inet_opt->opt.faddr;
1319         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
1320                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
1321                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1322                            inet_sk_flowi_flags(sk),
1323                            daddr, inet->inet_saddr, 0, 0);
1324         rcu_read_unlock();
1325 }
1326
1327 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
1328                                  const struct sk_buff *skb)
1329 {
1330         if (skb)
1331                 build_skb_flow_key(fl4, skb, sk);
1332         else
1333                 build_sk_flow_key(fl4, sk);
1334 }
1335
1336 static DEFINE_SPINLOCK(fnhe_lock);
1337
1338 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash, __be32 daddr)
1339 {
1340         struct fib_nh_exception *fnhe, *oldest;
1341
1342         oldest = rcu_dereference(hash->chain);
1343         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
1344              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1345                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
1346                         oldest = fnhe;
1347         }
1348         return oldest;
1349 }
1350
1351 static inline u32 fnhe_hashfun(__be32 daddr)
1352 {
1353         u32 hval;
1354
1355         hval = (__force u32) daddr;
1356         hval ^= (hval >> 11) ^ (hval >> 22);
1357
1358         return hval & (FNHE_HASH_SIZE - 1);
1359 }
1360
1361 static struct fib_nh_exception *find_or_create_fnhe(struct fib_nh *nh, __be32 daddr)
1362 {
1363         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1364         struct fib_nh_exception *fnhe;
1365         int depth;
1366         u32 hval;
1367
1368         if (!hash) {
1369                 hash = nh->nh_exceptions = kzalloc(FNHE_HASH_SIZE * sizeof(*hash),
1370                                                    GFP_ATOMIC);
1371                 if (!hash)
1372                         return NULL;
1373         }
1374
1375         hval = fnhe_hashfun(daddr);
1376         hash += hval;
1377
1378         depth = 0;
1379         for (fnhe = rcu_dereference(hash->chain); fnhe;
1380              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1381                 if (fnhe->fnhe_daddr == daddr)
1382                         goto out;
1383                 depth++;
1384         }
1385
1386         if (depth > FNHE_RECLAIM_DEPTH) {
1387                 fnhe = fnhe_oldest(hash + hval, daddr);
1388                 goto out_daddr;
1389         }
1390         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
1391         if (!fnhe)
1392                 return NULL;
1393
1394         fnhe->fnhe_next = hash->chain;
1395         rcu_assign_pointer(hash->chain, fnhe);
1396
1397 out_daddr:
1398         fnhe->fnhe_daddr = daddr;
1399 out:
1400         fnhe->fnhe_stamp = jiffies;
1401         return fnhe;
1402 }
1403
1404 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
1405 {
1406         __be32 new_gw = icmp_hdr(skb)->un.gateway;
1407         __be32 old_gw = ip_hdr(skb)->saddr;
1408         struct net_device *dev = skb->dev;
1409         struct in_device *in_dev;
1410         struct fib_result res;
1411         struct neighbour *n;
1412         struct net *net;
1413
1414         switch (icmp_hdr(skb)->code & 7) {
1415         case ICMP_REDIR_NET:
1416         case ICMP_REDIR_NETTOS:
1417         case ICMP_REDIR_HOST:
1418         case ICMP_REDIR_HOSTTOS:
1419                 break;
1420
1421         default:
1422                 return;
1423         }
1424
1425         if (rt->rt_gateway != old_gw)
1426                 return;
1427
1428         in_dev = __in_dev_get_rcu(dev);
1429         if (!in_dev)
1430                 return;
1431
1432         net = dev_net(dev);
1433         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1434             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1435             ipv4_is_zeronet(new_gw))
1436                 goto reject_redirect;
1437
1438         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1439                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1440                         goto reject_redirect;
1441                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1442                         goto reject_redirect;
1443         } else {
1444                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1445                         goto reject_redirect;
1446         }
1447
1448         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
1449         if (n) {
1450                 if (!(n->nud_state & NUD_VALID)) {
1451                         neigh_event_send(n, NULL);
1452                 } else {
1453                         if (fib_lookup(net, fl4, &res) == 0) {
1454                                 struct fib_nh *nh = &FIB_RES_NH(res);
1455                                 struct fib_nh_exception *fnhe;
1456
1457                                 spin_lock_bh(&fnhe_lock);
1458                                 fnhe = find_or_create_fnhe(nh, fl4->daddr);
1459                                 if (fnhe)
1460                                         fnhe->fnhe_gw = new_gw;
1461                                 spin_unlock_bh(&fnhe_lock);
1462                         }
1463                         rt->rt_gateway = new_gw;
1464                         rt->rt_flags |= RTCF_REDIRECTED;
1465                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1466                 }
1467                 neigh_release(n);
1468         }
1469         return;
1470
1471 reject_redirect:
1472 #ifdef CONFIG_IP_ROUTE_VERBOSE
1473         if (IN_DEV_LOG_MARTIANS(in_dev)) {
1474                 const struct iphdr *iph = (const struct iphdr *) skb->data;
1475                 __be32 daddr = iph->daddr;
1476                 __be32 saddr = iph->saddr;
1477
1478                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1479                                      "  Advised path = %pI4 -> %pI4\n",
1480                                      &old_gw, dev->name, &new_gw,
1481                                      &saddr, &daddr);
1482         }
1483 #endif
1484         ;
1485 }
1486
1487 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
1488 {
1489         struct rtable *rt;
1490         struct flowi4 fl4;
1491
1492         rt = (struct rtable *) dst;
1493
1494         ip_rt_build_flow_key(&fl4, sk, skb);
1495         __ip_do_redirect(rt, skb, &fl4);
1496 }
1497
1498 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1499 {
1500         struct rtable *rt = (struct rtable *)dst;
1501         struct dst_entry *ret = dst;
1502
1503         if (rt) {
1504                 if (dst->obsolete > 0) {
1505                         ip_rt_put(rt);
1506                         ret = NULL;
1507                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1508                            rt->dst.expires) {
1509                         unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1510                                                 rt->rt_oif,
1511                                                 rt_genid(dev_net(dst->dev)));
1512                         rt_del(hash, rt);
1513                         ret = NULL;
1514                 }
1515         }
1516         return ret;
1517 }
1518
1519 /*
1520  * Algorithm:
1521  *      1. The first ip_rt_redirect_number redirects are sent
1522  *         with exponential backoff, then we stop sending them at all,
1523  *         assuming that the host ignores our redirects.
1524  *      2. If we did not see packets requiring redirects
1525  *         during ip_rt_redirect_silence, we assume that the host
1526  *         forgot redirected route and start to send redirects again.
1527  *
1528  * This algorithm is much cheaper and more intelligent than dumb load limiting
1529  * in icmp.c.
1530  *
1531  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1532  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1533  */
1534
1535 void ip_rt_send_redirect(struct sk_buff *skb)
1536 {
1537         struct rtable *rt = skb_rtable(skb);
1538         struct in_device *in_dev;
1539         struct inet_peer *peer;
1540         struct net *net;
1541         int log_martians;
1542
1543         rcu_read_lock();
1544         in_dev = __in_dev_get_rcu(rt->dst.dev);
1545         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1546                 rcu_read_unlock();
1547                 return;
1548         }
1549         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1550         rcu_read_unlock();
1551
1552         net = dev_net(rt->dst.dev);
1553         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1554         if (!peer) {
1555                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1556                 return;
1557         }
1558
1559         /* No redirected packets during ip_rt_redirect_silence;
1560          * reset the algorithm.
1561          */
1562         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1563                 peer->rate_tokens = 0;
1564
1565         /* Too many ignored redirects; do not send anything
1566          * set dst.rate_last to the last seen redirected packet.
1567          */
1568         if (peer->rate_tokens >= ip_rt_redirect_number) {
1569                 peer->rate_last = jiffies;
1570                 goto out_put_peer;
1571         }
1572
1573         /* Check for load limit; set rate_last to the latest sent
1574          * redirect.
1575          */
1576         if (peer->rate_tokens == 0 ||
1577             time_after(jiffies,
1578                        (peer->rate_last +
1579                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1580                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1581                 peer->rate_last = jiffies;
1582                 ++peer->rate_tokens;
1583 #ifdef CONFIG_IP_ROUTE_VERBOSE
1584                 if (log_martians &&
1585                     peer->rate_tokens == ip_rt_redirect_number)
1586                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1587                                              &ip_hdr(skb)->saddr, rt->rt_iif,
1588                                              &rt->rt_dst, &rt->rt_gateway);
1589 #endif
1590         }
1591 out_put_peer:
1592         inet_putpeer(peer);
1593 }
1594
1595 static int ip_error(struct sk_buff *skb)
1596 {
1597         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
1598         struct rtable *rt = skb_rtable(skb);
1599         struct inet_peer *peer;
1600         unsigned long now;
1601         struct net *net;
1602         bool send;
1603         int code;
1604
1605         net = dev_net(rt->dst.dev);
1606         if (!IN_DEV_FORWARD(in_dev)) {
1607                 switch (rt->dst.error) {
1608                 case EHOSTUNREACH:
1609                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
1610                         break;
1611
1612                 case ENETUNREACH:
1613                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1614                         break;
1615                 }
1616                 goto out;
1617         }
1618
1619         switch (rt->dst.error) {
1620         case EINVAL:
1621         default:
1622                 goto out;
1623         case EHOSTUNREACH:
1624                 code = ICMP_HOST_UNREACH;
1625                 break;
1626         case ENETUNREACH:
1627                 code = ICMP_NET_UNREACH;
1628                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1629                 break;
1630         case EACCES:
1631                 code = ICMP_PKT_FILTERED;
1632                 break;
1633         }
1634
1635         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1636
1637         send = true;
1638         if (peer) {
1639                 now = jiffies;
1640                 peer->rate_tokens += now - peer->rate_last;
1641                 if (peer->rate_tokens > ip_rt_error_burst)
1642                         peer->rate_tokens = ip_rt_error_burst;
1643                 peer->rate_last = now;
1644                 if (peer->rate_tokens >= ip_rt_error_cost)
1645                         peer->rate_tokens -= ip_rt_error_cost;
1646                 else
1647                         send = false;
1648                 inet_putpeer(peer);
1649         }
1650         if (send)
1651                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1652
1653 out:    kfree_skb(skb);
1654         return 0;
1655 }
1656
1657 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1658 {
1659         struct fib_result res;
1660
1661         if (mtu < ip_rt_min_pmtu)
1662                 mtu = ip_rt_min_pmtu;
1663
1664         if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
1665                 struct fib_nh *nh = &FIB_RES_NH(res);
1666                 struct fib_nh_exception *fnhe;
1667
1668                 spin_lock_bh(&fnhe_lock);
1669                 fnhe = find_or_create_fnhe(nh, fl4->daddr);
1670                 if (fnhe) {
1671                         fnhe->fnhe_pmtu = mtu;
1672                         fnhe->fnhe_expires = jiffies + ip_rt_mtu_expires;
1673                 }
1674                 spin_unlock_bh(&fnhe_lock);
1675         }
1676         rt->rt_pmtu = mtu;
1677         dst_set_expires(&rt->dst, ip_rt_mtu_expires);
1678 }
1679
1680 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1681                               struct sk_buff *skb, u32 mtu)
1682 {
1683         struct rtable *rt = (struct rtable *) dst;
1684         struct flowi4 fl4;
1685
1686         ip_rt_build_flow_key(&fl4, sk, skb);
1687         __ip_rt_update_pmtu(rt, &fl4, mtu);
1688 }
1689
1690 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1691                       int oif, u32 mark, u8 protocol, int flow_flags)
1692 {
1693         const struct iphdr *iph = (const struct iphdr *) skb->data;
1694         struct flowi4 fl4;
1695         struct rtable *rt;
1696
1697         __build_flow_key(&fl4, NULL, iph, oif,
1698                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1699         rt = __ip_route_output_key(net, &fl4);
1700         if (!IS_ERR(rt)) {
1701                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1702                 ip_rt_put(rt);
1703         }
1704 }
1705 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1706
1707 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1708 {
1709         const struct iphdr *iph = (const struct iphdr *) skb->data;
1710         struct flowi4 fl4;
1711         struct rtable *rt;
1712
1713         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1714         rt = __ip_route_output_key(sock_net(sk), &fl4);
1715         if (!IS_ERR(rt)) {
1716                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1717                 ip_rt_put(rt);
1718         }
1719 }
1720 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1721
1722 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1723                    int oif, u32 mark, u8 protocol, int flow_flags)
1724 {
1725         const struct iphdr *iph = (const struct iphdr *) skb->data;
1726         struct flowi4 fl4;
1727         struct rtable *rt;
1728
1729         __build_flow_key(&fl4, NULL, iph, oif,
1730                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1731         rt = __ip_route_output_key(net, &fl4);
1732         if (!IS_ERR(rt)) {
1733                 __ip_do_redirect(rt, skb, &fl4);
1734                 ip_rt_put(rt);
1735         }
1736 }
1737 EXPORT_SYMBOL_GPL(ipv4_redirect);
1738
1739 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1740 {
1741         const struct iphdr *iph = (const struct iphdr *) skb->data;
1742         struct flowi4 fl4;
1743         struct rtable *rt;
1744
1745         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1746         rt = __ip_route_output_key(sock_net(sk), &fl4);
1747         if (!IS_ERR(rt)) {
1748                 __ip_do_redirect(rt, skb, &fl4);
1749                 ip_rt_put(rt);
1750         }
1751 }
1752 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1753
1754 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1755 {
1756         struct rtable *rt = (struct rtable *) dst;
1757
1758         if (rt_is_expired(rt))
1759                 return NULL;
1760         return dst;
1761 }
1762
1763 static void ipv4_dst_destroy(struct dst_entry *dst)
1764 {
1765         struct rtable *rt = (struct rtable *) dst;
1766
1767         if (rt->fi) {
1768                 fib_info_put(rt->fi);
1769                 rt->fi = NULL;
1770         }
1771 }
1772
1773
1774 static void ipv4_link_failure(struct sk_buff *skb)
1775 {
1776         struct rtable *rt;
1777
1778         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1779
1780         rt = skb_rtable(skb);
1781         if (rt)
1782                 dst_set_expires(&rt->dst, 0);
1783 }
1784
1785 static int ip_rt_bug(struct sk_buff *skb)
1786 {
1787         pr_debug("%s: %pI4 -> %pI4, %s\n",
1788                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1789                  skb->dev ? skb->dev->name : "?");
1790         kfree_skb(skb);
1791         WARN_ON(1);
1792         return 0;
1793 }
1794
1795 /*
1796    We do not cache source address of outgoing interface,
1797    because it is used only by IP RR, TS and SRR options,
1798    so that it out of fast path.
1799
1800    BTW remember: "addr" is allowed to be not aligned
1801    in IP options!
1802  */
1803
1804 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1805 {
1806         __be32 src;
1807
1808         if (rt_is_output_route(rt))
1809                 src = ip_hdr(skb)->saddr;
1810         else {
1811                 struct fib_result res;
1812                 struct flowi4 fl4;
1813                 struct iphdr *iph;
1814
1815                 iph = ip_hdr(skb);
1816
1817                 memset(&fl4, 0, sizeof(fl4));
1818                 fl4.daddr = iph->daddr;
1819                 fl4.saddr = iph->saddr;
1820                 fl4.flowi4_tos = RT_TOS(iph->tos);
1821                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1822                 fl4.flowi4_iif = skb->dev->ifindex;
1823                 fl4.flowi4_mark = skb->mark;
1824
1825                 rcu_read_lock();
1826                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1827                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1828                 else
1829                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1830                                         RT_SCOPE_UNIVERSE);
1831                 rcu_read_unlock();
1832         }
1833         memcpy(addr, &src, 4);
1834 }
1835
1836 #ifdef CONFIG_IP_ROUTE_CLASSID
1837 static void set_class_tag(struct rtable *rt, u32 tag)
1838 {
1839         if (!(rt->dst.tclassid & 0xFFFF))
1840                 rt->dst.tclassid |= tag & 0xFFFF;
1841         if (!(rt->dst.tclassid & 0xFFFF0000))
1842                 rt->dst.tclassid |= tag & 0xFFFF0000;
1843 }
1844 #endif
1845
1846 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1847 {
1848         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1849
1850         if (advmss == 0) {
1851                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1852                                ip_rt_min_advmss);
1853                 if (advmss > 65535 - 40)
1854                         advmss = 65535 - 40;
1855         }
1856         return advmss;
1857 }
1858
1859 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1860 {
1861         const struct rtable *rt = (const struct rtable *) dst;
1862         unsigned int mtu = rt->rt_pmtu;
1863
1864         if (mtu && time_after_eq(jiffies, rt->dst.expires))
1865                 mtu = 0;
1866
1867         if (!mtu)
1868                 mtu = dst_metric_raw(dst, RTAX_MTU);
1869
1870         if (mtu && rt_is_output_route(rt))
1871                 return mtu;
1872
1873         mtu = dst->dev->mtu;
1874
1875         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1876
1877                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1878                         mtu = 576;
1879         }
1880
1881         if (mtu > IP_MAX_MTU)
1882                 mtu = IP_MAX_MTU;
1883
1884         return mtu;
1885 }
1886
1887 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1888                             struct fib_info *fi)
1889 {
1890         if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1891                 rt->fi = fi;
1892                 atomic_inc(&fi->fib_clntref);
1893         }
1894         dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1895 }
1896
1897 static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr)
1898 {
1899         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1900         struct fib_nh_exception *fnhe;
1901         u32 hval;
1902
1903         hval = fnhe_hashfun(daddr);
1904
1905         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1906              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1907                 if (fnhe->fnhe_daddr == daddr) {
1908                         if (fnhe->fnhe_pmtu) {
1909                                 unsigned long expires = fnhe->fnhe_expires;
1910                                 unsigned long diff = expires - jiffies;
1911
1912                                 if (time_before(jiffies, expires)) {
1913                                         rt->rt_pmtu = fnhe->fnhe_pmtu;
1914                                         dst_set_expires(&rt->dst, diff);
1915                                 }
1916                         }
1917                         if (fnhe->fnhe_gw)
1918                                 rt->rt_gateway = fnhe->fnhe_gw;
1919                         fnhe->fnhe_stamp = jiffies;
1920                         break;
1921                 }
1922         }
1923 }
1924
1925 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1926                            const struct fib_result *res,
1927                            struct fib_info *fi, u16 type, u32 itag)
1928 {
1929         if (fi) {
1930                 struct fib_nh *nh = &FIB_RES_NH(*res);
1931
1932                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1933                         rt->rt_gateway = nh->nh_gw;
1934                 if (unlikely(nh->nh_exceptions))
1935                         rt_bind_exception(rt, nh, fl4->daddr);
1936                 rt_init_metrics(rt, fl4, fi);
1937 #ifdef CONFIG_IP_ROUTE_CLASSID
1938                 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1939 #endif
1940         }
1941
1942 #ifdef CONFIG_IP_ROUTE_CLASSID
1943 #ifdef CONFIG_IP_MULTIPLE_TABLES
1944         set_class_tag(rt, res->tclassid);
1945 #endif
1946         set_class_tag(rt, itag);
1947 #endif
1948 }
1949
1950 static struct rtable *rt_dst_alloc(struct net_device *dev,
1951                                    bool nopolicy, bool noxfrm)
1952 {
1953         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1954                          DST_HOST |
1955                          (nopolicy ? DST_NOPOLICY : 0) |
1956                          (noxfrm ? DST_NOXFRM : 0));
1957 }
1958
1959 /* called in rcu_read_lock() section */
1960 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1961                                 u8 tos, struct net_device *dev, int our)
1962 {
1963         unsigned int hash;
1964         struct rtable *rth;
1965         struct in_device *in_dev = __in_dev_get_rcu(dev);
1966         u32 itag = 0;
1967         int err;
1968
1969         /* Primary sanity checks. */
1970
1971         if (in_dev == NULL)
1972                 return -EINVAL;
1973
1974         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1975             skb->protocol != htons(ETH_P_IP))
1976                 goto e_inval;
1977
1978         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1979                 if (ipv4_is_loopback(saddr))
1980                         goto e_inval;
1981
1982         if (ipv4_is_zeronet(saddr)) {
1983                 if (!ipv4_is_local_multicast(daddr))
1984                         goto e_inval;
1985         } else {
1986                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1987                                           in_dev, &itag);
1988                 if (err < 0)
1989                         goto e_err;
1990         }
1991         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1992                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1993         if (!rth)
1994                 goto e_nobufs;
1995
1996 #ifdef CONFIG_IP_ROUTE_CLASSID
1997         rth->dst.tclassid = itag;
1998 #endif
1999         rth->dst.output = ip_rt_bug;
2000
2001         rth->rt_key_dst = daddr;
2002         rth->rt_key_src = saddr;
2003         rth->rt_genid   = rt_genid(dev_net(dev));
2004         rth->rt_flags   = RTCF_MULTICAST;
2005         rth->rt_type    = RTN_MULTICAST;
2006         rth->rt_key_tos = tos;
2007         rth->rt_dst     = daddr;
2008         rth->rt_src     = saddr;
2009         rth->rt_route_iif = dev->ifindex;
2010         rth->rt_iif     = dev->ifindex;
2011         rth->rt_oif     = 0;
2012         rth->rt_mark    = skb->mark;
2013         rth->rt_pmtu    = 0;
2014         rth->rt_gateway = daddr;
2015         rth->fi = NULL;
2016         if (our) {
2017                 rth->dst.input= ip_local_deliver;
2018                 rth->rt_flags |= RTCF_LOCAL;
2019         }
2020
2021 #ifdef CONFIG_IP_MROUTE
2022         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2023                 rth->dst.input = ip_mr_input;
2024 #endif
2025         RT_CACHE_STAT_INC(in_slow_mc);
2026
2027         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2028         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2029         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2030
2031 e_nobufs:
2032         return -ENOBUFS;
2033 e_inval:
2034         return -EINVAL;
2035 e_err:
2036         return err;
2037 }
2038
2039
2040 static void ip_handle_martian_source(struct net_device *dev,
2041                                      struct in_device *in_dev,
2042                                      struct sk_buff *skb,
2043                                      __be32 daddr,
2044                                      __be32 saddr)
2045 {
2046         RT_CACHE_STAT_INC(in_martian_src);
2047 #ifdef CONFIG_IP_ROUTE_VERBOSE
2048         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2049                 /*
2050                  *      RFC1812 recommendation, if source is martian,
2051                  *      the only hint is MAC header.
2052                  */
2053                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2054                         &daddr, &saddr, dev->name);
2055                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2056                         print_hex_dump(KERN_WARNING, "ll header: ",
2057                                        DUMP_PREFIX_OFFSET, 16, 1,
2058                                        skb_mac_header(skb),
2059                                        dev->hard_header_len, true);
2060                 }
2061         }
2062 #endif
2063 }
2064
2065 /* called in rcu_read_lock() section */
2066 static int __mkroute_input(struct sk_buff *skb,
2067                            const struct fib_result *res,
2068                            struct in_device *in_dev,
2069                            __be32 daddr, __be32 saddr, u32 tos,
2070                            struct rtable **result)
2071 {
2072         struct rtable *rth;
2073         int err;
2074         struct in_device *out_dev;
2075         unsigned int flags = 0;
2076         u32 itag;
2077
2078         /* get a working reference to the output device */
2079         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2080         if (out_dev == NULL) {
2081                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
2082                 return -EINVAL;
2083         }
2084
2085
2086         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2087                                   in_dev->dev, in_dev, &itag);
2088         if (err < 0) {
2089                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2090                                          saddr);
2091
2092                 goto cleanup;
2093         }
2094
2095         if (err)
2096                 flags |= RTCF_DIRECTSRC;
2097
2098         if (out_dev == in_dev && err &&
2099             (IN_DEV_SHARED_MEDIA(out_dev) ||
2100              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2101                 flags |= RTCF_DOREDIRECT;
2102
2103         if (skb->protocol != htons(ETH_P_IP)) {
2104                 /* Not IP (i.e. ARP). Do not create route, if it is
2105                  * invalid for proxy arp. DNAT routes are always valid.
2106                  *
2107                  * Proxy arp feature have been extended to allow, ARP
2108                  * replies back to the same interface, to support
2109                  * Private VLAN switch technologies. See arp.c.
2110                  */
2111                 if (out_dev == in_dev &&
2112                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2113                         err = -EINVAL;
2114                         goto cleanup;
2115                 }
2116         }
2117
2118         rth = rt_dst_alloc(out_dev->dev,
2119                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2120                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2121         if (!rth) {
2122                 err = -ENOBUFS;
2123                 goto cleanup;
2124         }
2125
2126         rth->rt_key_dst = daddr;
2127         rth->rt_key_src = saddr;
2128         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2129         rth->rt_flags = flags;
2130         rth->rt_type = res->type;
2131         rth->rt_key_tos = tos;
2132         rth->rt_dst     = daddr;
2133         rth->rt_src     = saddr;
2134         rth->rt_route_iif = in_dev->dev->ifindex;
2135         rth->rt_iif     = in_dev->dev->ifindex;
2136         rth->rt_oif     = 0;
2137         rth->rt_mark    = skb->mark;
2138         rth->rt_pmtu    = 0;
2139         rth->rt_gateway = daddr;
2140         rth->fi = NULL;
2141
2142         rth->dst.input = ip_forward;
2143         rth->dst.output = ip_output;
2144
2145         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2146
2147         *result = rth;
2148         err = 0;
2149  cleanup:
2150         return err;
2151 }
2152
2153 static int ip_mkroute_input(struct sk_buff *skb,
2154                             struct fib_result *res,
2155                             const struct flowi4 *fl4,
2156                             struct in_device *in_dev,
2157                             __be32 daddr, __be32 saddr, u32 tos)
2158 {
2159         struct rtable *rth = NULL;
2160         int err;
2161         unsigned int hash;
2162
2163 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2164         if (res->fi && res->fi->fib_nhs > 1)
2165                 fib_select_multipath(res);
2166 #endif
2167
2168         /* create a routing cache entry */
2169         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2170         if (err)
2171                 return err;
2172
2173         /* put it into the cache */
2174         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2175                        rt_genid(dev_net(rth->dst.dev)));
2176         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2177         if (IS_ERR(rth))
2178                 return PTR_ERR(rth);
2179         return 0;
2180 }
2181
2182 /*
2183  *      NOTE. We drop all the packets that has local source
2184  *      addresses, because every properly looped back packet
2185  *      must have correct destination already attached by output routine.
2186  *
2187  *      Such approach solves two big problems:
2188  *      1. Not simplex devices are handled properly.
2189  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2190  *      called with rcu_read_lock()
2191  */
2192
2193 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2194                                u8 tos, struct net_device *dev)
2195 {
2196         struct fib_result res;
2197         struct in_device *in_dev = __in_dev_get_rcu(dev);
2198         struct flowi4   fl4;
2199         unsigned int    flags = 0;
2200         u32             itag = 0;
2201         struct rtable   *rth;
2202         unsigned int    hash;
2203         int             err = -EINVAL;
2204         struct net    *net = dev_net(dev);
2205
2206         /* IP on this device is disabled. */
2207
2208         if (!in_dev)
2209                 goto out;
2210
2211         /* Check for the most weird martians, which can be not detected
2212            by fib_lookup.
2213          */
2214
2215         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2216                 goto martian_source;
2217
2218         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2219                 goto brd_input;
2220
2221         /* Accept zero addresses only to limited broadcast;
2222          * I even do not know to fix it or not. Waiting for complains :-)
2223          */
2224         if (ipv4_is_zeronet(saddr))
2225                 goto martian_source;
2226
2227         if (ipv4_is_zeronet(daddr))
2228                 goto martian_destination;
2229
2230         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
2231                 if (ipv4_is_loopback(daddr))
2232                         goto martian_destination;
2233
2234                 if (ipv4_is_loopback(saddr))
2235                         goto martian_source;
2236         }
2237
2238         /*
2239          *      Now we are ready to route packet.
2240          */
2241         fl4.flowi4_oif = 0;
2242         fl4.flowi4_iif = dev->ifindex;
2243         fl4.flowi4_mark = skb->mark;
2244         fl4.flowi4_tos = tos;
2245         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2246         fl4.daddr = daddr;
2247         fl4.saddr = saddr;
2248         err = fib_lookup(net, &fl4, &res);
2249         if (err != 0)
2250                 goto no_route;
2251
2252         RT_CACHE_STAT_INC(in_slow_tot);
2253
2254         if (res.type == RTN_BROADCAST)
2255                 goto brd_input;
2256
2257         if (res.type == RTN_LOCAL) {
2258                 err = fib_validate_source(skb, saddr, daddr, tos,
2259                                           net->loopback_dev->ifindex,
2260                                           dev, in_dev, &itag);
2261                 if (err < 0)
2262                         goto martian_source_keep_err;
2263                 if (err)
2264                         flags |= RTCF_DIRECTSRC;
2265                 goto local_input;
2266         }
2267
2268         if (!IN_DEV_FORWARD(in_dev))
2269                 goto no_route;
2270         if (res.type != RTN_UNICAST)
2271                 goto martian_destination;
2272
2273         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2274 out:    return err;
2275
2276 brd_input:
2277         if (skb->protocol != htons(ETH_P_IP))
2278                 goto e_inval;
2279
2280         if (!ipv4_is_zeronet(saddr)) {
2281                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2282                                           in_dev, &itag);
2283                 if (err < 0)
2284                         goto martian_source_keep_err;
2285                 if (err)
2286                         flags |= RTCF_DIRECTSRC;
2287         }
2288         flags |= RTCF_BROADCAST;
2289         res.type = RTN_BROADCAST;
2290         RT_CACHE_STAT_INC(in_brd);
2291
2292 local_input:
2293         rth = rt_dst_alloc(net->loopback_dev,
2294                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2295         if (!rth)
2296                 goto e_nobufs;
2297
2298         rth->dst.input= ip_local_deliver;
2299         rth->dst.output= ip_rt_bug;
2300 #ifdef CONFIG_IP_ROUTE_CLASSID
2301         rth->dst.tclassid = itag;
2302 #endif
2303
2304         rth->rt_key_dst = daddr;
2305         rth->rt_key_src = saddr;
2306         rth->rt_genid = rt_genid(net);
2307         rth->rt_flags   = flags|RTCF_LOCAL;
2308         rth->rt_type    = res.type;
2309         rth->rt_key_tos = tos;
2310         rth->rt_dst     = daddr;
2311         rth->rt_src     = saddr;
2312         rth->rt_route_iif = dev->ifindex;
2313         rth->rt_iif     = dev->ifindex;
2314         rth->rt_oif     = 0;
2315         rth->rt_mark    = skb->mark;
2316         rth->rt_pmtu    = 0;
2317         rth->rt_gateway = daddr;
2318         rth->fi = NULL;
2319         if (res.type == RTN_UNREACHABLE) {
2320                 rth->dst.input= ip_error;
2321                 rth->dst.error= -err;
2322                 rth->rt_flags   &= ~RTCF_LOCAL;
2323         }
2324         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2325         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2326         err = 0;
2327         if (IS_ERR(rth))
2328                 err = PTR_ERR(rth);
2329         goto out;
2330
2331 no_route:
2332         RT_CACHE_STAT_INC(in_no_route);
2333         res.type = RTN_UNREACHABLE;
2334         if (err == -ESRCH)
2335                 err = -ENETUNREACH;
2336         goto local_input;
2337
2338         /*
2339          *      Do not cache martian addresses: they should be logged (RFC1812)
2340          */
2341 martian_destination:
2342         RT_CACHE_STAT_INC(in_martian_dst);
2343 #ifdef CONFIG_IP_ROUTE_VERBOSE
2344         if (IN_DEV_LOG_MARTIANS(in_dev))
2345                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2346                                      &daddr, &saddr, dev->name);
2347 #endif
2348
2349 e_inval:
2350         err = -EINVAL;
2351         goto out;
2352
2353 e_nobufs:
2354         err = -ENOBUFS;
2355         goto out;
2356
2357 martian_source:
2358         err = -EINVAL;
2359 martian_source_keep_err:
2360         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2361         goto out;
2362 }
2363
2364 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2365                            u8 tos, struct net_device *dev, bool noref)
2366 {
2367         struct rtable   *rth;
2368         unsigned int    hash;
2369         int iif = dev->ifindex;
2370         struct net *net;
2371         int res;
2372
2373         net = dev_net(dev);
2374
2375         rcu_read_lock();
2376
2377         if (!rt_caching(net))
2378                 goto skip_cache;
2379
2380         tos &= IPTOS_RT_MASK;
2381         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2382
2383         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2384              rth = rcu_dereference(rth->dst.rt_next)) {
2385                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2386                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2387                      (rth->rt_route_iif ^ iif) |
2388                      (rth->rt_key_tos ^ tos)) == 0 &&
2389                     rth->rt_mark == skb->mark &&
2390                     net_eq(dev_net(rth->dst.dev), net) &&
2391                     !rt_is_expired(rth)) {
2392                         if (noref) {
2393                                 dst_use_noref(&rth->dst, jiffies);
2394                                 skb_dst_set_noref(skb, &rth->dst);
2395                         } else {
2396                                 dst_use(&rth->dst, jiffies);
2397                                 skb_dst_set(skb, &rth->dst);
2398                         }
2399                         RT_CACHE_STAT_INC(in_hit);
2400                         rcu_read_unlock();
2401                         return 0;
2402                 }
2403                 RT_CACHE_STAT_INC(in_hlist_search);
2404         }
2405
2406 skip_cache:
2407         /* Multicast recognition logic is moved from route cache to here.
2408            The problem was that too many Ethernet cards have broken/missing
2409            hardware multicast filters :-( As result the host on multicasting
2410            network acquires a lot of useless route cache entries, sort of
2411            SDR messages from all the world. Now we try to get rid of them.
2412            Really, provided software IP multicast filter is organized
2413            reasonably (at least, hashed), it does not result in a slowdown
2414            comparing with route cache reject entries.
2415            Note, that multicast routers are not affected, because
2416            route cache entry is created eventually.
2417          */
2418         if (ipv4_is_multicast(daddr)) {
2419                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2420
2421                 if (in_dev) {
2422                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2423                                                   ip_hdr(skb)->protocol);
2424                         if (our
2425 #ifdef CONFIG_IP_MROUTE
2426                                 ||
2427                             (!ipv4_is_local_multicast(daddr) &&
2428                              IN_DEV_MFORWARD(in_dev))
2429 #endif
2430                            ) {
2431                                 int res = ip_route_input_mc(skb, daddr, saddr,
2432                                                             tos, dev, our);
2433                                 rcu_read_unlock();
2434                                 return res;
2435                         }
2436                 }
2437                 rcu_read_unlock();
2438                 return -EINVAL;
2439         }
2440         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2441         rcu_read_unlock();
2442         return res;
2443 }
2444 EXPORT_SYMBOL(ip_route_input_common);
2445
2446 /* called with rcu_read_lock() */
2447 static struct rtable *__mkroute_output(const struct fib_result *res,
2448                                        const struct flowi4 *fl4,
2449                                        __be32 orig_daddr, __be32 orig_saddr,
2450                                        int orig_oif, __u8 orig_rtos,
2451                                        struct net_device *dev_out,
2452                                        unsigned int flags)
2453 {
2454         struct fib_info *fi = res->fi;
2455         struct in_device *in_dev;
2456         u16 type = res->type;
2457         struct rtable *rth;
2458
2459         in_dev = __in_dev_get_rcu(dev_out);
2460         if (!in_dev)
2461                 return ERR_PTR(-EINVAL);
2462
2463         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2464                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2465                         return ERR_PTR(-EINVAL);
2466
2467         if (ipv4_is_lbcast(fl4->daddr))
2468                 type = RTN_BROADCAST;
2469         else if (ipv4_is_multicast(fl4->daddr))
2470                 type = RTN_MULTICAST;
2471         else if (ipv4_is_zeronet(fl4->daddr))
2472                 return ERR_PTR(-EINVAL);
2473
2474         if (dev_out->flags & IFF_LOOPBACK)
2475                 flags |= RTCF_LOCAL;
2476
2477         if (type == RTN_BROADCAST) {
2478                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2479                 fi = NULL;
2480         } else if (type == RTN_MULTICAST) {
2481                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2482                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2483                                      fl4->flowi4_proto))
2484                         flags &= ~RTCF_LOCAL;
2485                 /* If multicast route do not exist use
2486                  * default one, but do not gateway in this case.
2487                  * Yes, it is hack.
2488                  */
2489                 if (fi && res->prefixlen < 4)
2490                         fi = NULL;
2491         }
2492
2493         rth = rt_dst_alloc(dev_out,
2494                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2495                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2496         if (!rth)
2497                 return ERR_PTR(-ENOBUFS);
2498
2499         rth->dst.output = ip_output;
2500
2501         rth->rt_key_dst = orig_daddr;
2502         rth->rt_key_src = orig_saddr;
2503         rth->rt_genid = rt_genid(dev_net(dev_out));
2504         rth->rt_flags   = flags;
2505         rth->rt_type    = type;
2506         rth->rt_key_tos = orig_rtos;
2507         rth->rt_dst     = fl4->daddr;
2508         rth->rt_src     = fl4->saddr;
2509         rth->rt_route_iif = 0;
2510         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2511         rth->rt_oif     = orig_oif;
2512         rth->rt_mark    = fl4->flowi4_mark;
2513         rth->rt_pmtu    = 0;
2514         rth->rt_gateway = fl4->daddr;
2515         rth->fi = NULL;
2516
2517         RT_CACHE_STAT_INC(out_slow_tot);
2518
2519         if (flags & RTCF_LOCAL)
2520                 rth->dst.input = ip_local_deliver;
2521         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2522                 if (flags & RTCF_LOCAL &&
2523                     !(dev_out->flags & IFF_LOOPBACK)) {
2524                         rth->dst.output = ip_mc_output;
2525                         RT_CACHE_STAT_INC(out_slow_mc);
2526                 }
2527 #ifdef CONFIG_IP_MROUTE
2528                 if (type == RTN_MULTICAST) {
2529                         if (IN_DEV_MFORWARD(in_dev) &&
2530                             !ipv4_is_local_multicast(fl4->daddr)) {
2531                                 rth->dst.input = ip_mr_input;
2532                                 rth->dst.output = ip_mc_output;
2533                         }
2534                 }
2535 #endif
2536         }
2537
2538         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2539
2540         if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
2541                 rth->dst.flags |= DST_NOCACHE;
2542
2543         return rth;
2544 }
2545
2546 /*
2547  * Major route resolver routine.
2548  * called with rcu_read_lock();
2549  */
2550
2551 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2552 {
2553         struct net_device *dev_out = NULL;
2554         __u8 tos = RT_FL_TOS(fl4);
2555         unsigned int flags = 0;
2556         struct fib_result res;
2557         struct rtable *rth;
2558         __be32 orig_daddr;
2559         __be32 orig_saddr;
2560         int orig_oif;
2561
2562         res.tclassid    = 0;
2563         res.fi          = NULL;
2564         res.table       = NULL;
2565
2566         orig_daddr = fl4->daddr;
2567         orig_saddr = fl4->saddr;
2568         orig_oif = fl4->flowi4_oif;
2569
2570         fl4->flowi4_iif = net->loopback_dev->ifindex;
2571         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2572         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2573                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2574
2575         rcu_read_lock();
2576         if (fl4->saddr) {
2577                 rth = ERR_PTR(-EINVAL);
2578                 if (ipv4_is_multicast(fl4->saddr) ||
2579                     ipv4_is_lbcast(fl4->saddr) ||
2580                     ipv4_is_zeronet(fl4->saddr))
2581                         goto out;
2582
2583                 /* I removed check for oif == dev_out->oif here.
2584                    It was wrong for two reasons:
2585                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2586                       is assigned to multiple interfaces.
2587                    2. Moreover, we are allowed to send packets with saddr
2588                       of another iface. --ANK
2589                  */
2590
2591                 if (fl4->flowi4_oif == 0 &&
2592                     (ipv4_is_multicast(fl4->daddr) ||
2593                      ipv4_is_lbcast(fl4->daddr))) {
2594                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2595                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2596                         if (dev_out == NULL)
2597                                 goto out;
2598
2599                         /* Special hack: user can direct multicasts
2600                            and limited broadcast via necessary interface
2601                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2602                            This hack is not just for fun, it allows
2603                            vic,vat and friends to work.
2604                            They bind socket to loopback, set ttl to zero
2605                            and expect that it will work.
2606                            From the viewpoint of routing cache they are broken,
2607                            because we are not allowed to build multicast path
2608                            with loopback source addr (look, routing cache
2609                            cannot know, that ttl is zero, so that packet
2610                            will not leave this host and route is valid).
2611                            Luckily, this hack is good workaround.
2612                          */
2613
2614                         fl4->flowi4_oif = dev_out->ifindex;
2615                         goto make_route;
2616                 }
2617
2618                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2619                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2620                         if (!__ip_dev_find(net, fl4->saddr, false))
2621                                 goto out;
2622                 }
2623         }
2624
2625
2626         if (fl4->flowi4_oif) {
2627                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2628                 rth = ERR_PTR(-ENODEV);
2629                 if (dev_out == NULL)
2630                         goto out;
2631
2632                 /* RACE: Check return value of inet_select_addr instead. */
2633                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2634                         rth = ERR_PTR(-ENETUNREACH);
2635                         goto out;
2636                 }
2637                 if (ipv4_is_local_multicast(fl4->daddr) ||
2638                     ipv4_is_lbcast(fl4->daddr)) {
2639                         if (!fl4->saddr)
2640                                 fl4->saddr = inet_select_addr(dev_out, 0,
2641                                                               RT_SCOPE_LINK);
2642                         goto make_route;
2643                 }
2644                 if (fl4->saddr) {
2645                         if (ipv4_is_multicast(fl4->daddr))
2646                                 fl4->saddr = inet_select_addr(dev_out, 0,
2647                                                               fl4->flowi4_scope);
2648                         else if (!fl4->daddr)
2649                                 fl4->saddr = inet_select_addr(dev_out, 0,
2650                                                               RT_SCOPE_HOST);
2651                 }
2652         }
2653
2654         if (!fl4->daddr) {
2655                 fl4->daddr = fl4->saddr;
2656                 if (!fl4->daddr)
2657                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2658                 dev_out = net->loopback_dev;
2659                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2660                 res.type = RTN_LOCAL;
2661                 flags |= RTCF_LOCAL;
2662                 goto make_route;
2663         }
2664
2665         if (fib_lookup(net, fl4, &res)) {
2666                 res.fi = NULL;
2667                 res.table = NULL;
2668                 if (fl4->flowi4_oif) {
2669                         /* Apparently, routing tables are wrong. Assume,
2670                            that the destination is on link.
2671
2672                            WHY? DW.
2673                            Because we are allowed to send to iface
2674                            even if it has NO routes and NO assigned
2675                            addresses. When oif is specified, routing
2676                            tables are looked up with only one purpose:
2677                            to catch if destination is gatewayed, rather than
2678                            direct. Moreover, if MSG_DONTROUTE is set,
2679                            we send packet, ignoring both routing tables
2680                            and ifaddr state. --ANK
2681
2682
2683                            We could make it even if oif is unknown,
2684                            likely IPv6, but we do not.
2685                          */
2686
2687                         if (fl4->saddr == 0)
2688                                 fl4->saddr = inet_select_addr(dev_out, 0,
2689                                                               RT_SCOPE_LINK);
2690                         res.type = RTN_UNICAST;
2691                         goto make_route;
2692                 }
2693                 rth = ERR_PTR(-ENETUNREACH);
2694                 goto out;
2695         }
2696
2697         if (res.type == RTN_LOCAL) {
2698                 if (!fl4->saddr) {
2699                         if (res.fi->fib_prefsrc)
2700                                 fl4->saddr = res.fi->fib_prefsrc;
2701                         else
2702                                 fl4->saddr = fl4->daddr;
2703                 }
2704                 dev_out = net->loopback_dev;
2705                 fl4->flowi4_oif = dev_out->ifindex;
2706                 res.fi = NULL;
2707                 flags |= RTCF_LOCAL;
2708                 goto make_route;
2709         }
2710
2711 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2712         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2713                 fib_select_multipath(&res);
2714         else
2715 #endif
2716         if (!res.prefixlen &&
2717             res.table->tb_num_default > 1 &&
2718             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2719                 fib_select_default(&res);
2720
2721         if (!fl4->saddr)
2722                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2723
2724         dev_out = FIB_RES_DEV(res);
2725         fl4->flowi4_oif = dev_out->ifindex;
2726
2727
2728 make_route:
2729         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2730                                tos, dev_out, flags);
2731         if (!IS_ERR(rth)) {
2732                 unsigned int hash;
2733
2734                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2735                                rt_genid(dev_net(dev_out)));
2736                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2737         }
2738
2739 out:
2740         rcu_read_unlock();
2741         return rth;
2742 }
2743
2744 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2745 {
2746         struct rtable *rth;
2747         unsigned int hash;
2748
2749         if (!rt_caching(net))
2750                 goto slow_output;
2751
2752         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2753
2754         rcu_read_lock_bh();
2755         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2756                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2757                 if (rth->rt_key_dst == flp4->daddr &&
2758                     rth->rt_key_src == flp4->saddr &&
2759                     rt_is_output_route(rth) &&
2760                     rth->rt_oif == flp4->flowi4_oif &&
2761                     rth->rt_mark == flp4->flowi4_mark &&
2762                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2763                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2764                     net_eq(dev_net(rth->dst.dev), net) &&
2765                     !rt_is_expired(rth)) {
2766                         dst_use(&rth->dst, jiffies);
2767                         RT_CACHE_STAT_INC(out_hit);
2768                         rcu_read_unlock_bh();
2769                         if (!flp4->saddr)
2770                                 flp4->saddr = rth->rt_src;
2771                         if (!flp4->daddr)
2772                                 flp4->daddr = rth->rt_dst;
2773                         return rth;
2774                 }
2775                 RT_CACHE_STAT_INC(out_hlist_search);
2776         }
2777         rcu_read_unlock_bh();
2778
2779 slow_output:
2780         return ip_route_output_slow(net, flp4);
2781 }
2782 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2783
2784 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2785 {
2786         return NULL;
2787 }
2788
2789 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2790 {
2791         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2792
2793         return mtu ? : dst->dev->mtu;
2794 }
2795
2796 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2797                                           struct sk_buff *skb, u32 mtu)
2798 {
2799 }
2800
2801 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2802                                        struct sk_buff *skb)
2803 {
2804 }
2805
2806 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2807                                           unsigned long old)
2808 {
2809         return NULL;
2810 }
2811
2812 static struct dst_ops ipv4_dst_blackhole_ops = {
2813         .family                 =       AF_INET,
2814         .protocol               =       cpu_to_be16(ETH_P_IP),
2815         .destroy                =       ipv4_dst_destroy,
2816         .check                  =       ipv4_blackhole_dst_check,
2817         .mtu                    =       ipv4_blackhole_mtu,
2818         .default_advmss         =       ipv4_default_advmss,
2819         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2820         .redirect               =       ipv4_rt_blackhole_redirect,
2821         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2822         .neigh_lookup           =       ipv4_neigh_lookup,
2823 };
2824
2825 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2826 {
2827         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2828         struct rtable *ort = (struct rtable *) dst_orig;
2829
2830         if (rt) {
2831                 struct dst_entry *new = &rt->dst;
2832
2833                 new->__use = 1;
2834                 new->input = dst_discard;
2835                 new->output = dst_discard;
2836
2837                 new->dev = ort->dst.dev;
2838                 if (new->dev)
2839                         dev_hold(new->dev);
2840
2841                 rt->rt_key_dst = ort->rt_key_dst;
2842                 rt->rt_key_src = ort->rt_key_src;
2843                 rt->rt_key_tos = ort->rt_key_tos;
2844                 rt->rt_route_iif = ort->rt_route_iif;
2845                 rt->rt_iif = ort->rt_iif;
2846                 rt->rt_oif = ort->rt_oif;
2847                 rt->rt_mark = ort->rt_mark;
2848                 rt->rt_pmtu = ort->rt_pmtu;
2849
2850                 rt->rt_genid = rt_genid(net);
2851                 rt->rt_flags = ort->rt_flags;
2852                 rt->rt_type = ort->rt_type;
2853                 rt->rt_dst = ort->rt_dst;
2854                 rt->rt_src = ort->rt_src;
2855                 rt->rt_gateway = ort->rt_gateway;
2856                 rt->fi = ort->fi;
2857                 if (rt->fi)
2858                         atomic_inc(&rt->fi->fib_clntref);
2859
2860                 dst_free(new);
2861         }
2862
2863         dst_release(dst_orig);
2864
2865         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2866 }
2867
2868 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2869                                     struct sock *sk)
2870 {
2871         struct rtable *rt = __ip_route_output_key(net, flp4);
2872
2873         if (IS_ERR(rt))
2874                 return rt;
2875
2876         if (flp4->flowi4_proto)
2877                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2878                                                    flowi4_to_flowi(flp4),
2879                                                    sk, 0);
2880
2881         return rt;
2882 }
2883 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2884
2885 static int rt_fill_info(struct net *net,
2886                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2887                         int nowait, unsigned int flags)
2888 {
2889         struct rtable *rt = skb_rtable(skb);
2890         struct rtmsg *r;
2891         struct nlmsghdr *nlh;
2892         unsigned long expires = 0;
2893         u32 error;
2894
2895         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2896         if (nlh == NULL)
2897                 return -EMSGSIZE;
2898
2899         r = nlmsg_data(nlh);
2900         r->rtm_family    = AF_INET;
2901         r->rtm_dst_len  = 32;
2902         r->rtm_src_len  = 0;
2903         r->rtm_tos      = rt->rt_key_tos;
2904         r->rtm_table    = RT_TABLE_MAIN;
2905         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2906                 goto nla_put_failure;
2907         r->rtm_type     = rt->rt_type;
2908         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2909         r->rtm_protocol = RTPROT_UNSPEC;
2910         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2911         if (rt->rt_flags & RTCF_NOTIFY)
2912                 r->rtm_flags |= RTM_F_NOTIFY;
2913
2914         if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2915                 goto nla_put_failure;
2916         if (rt->rt_key_src) {
2917                 r->rtm_src_len = 32;
2918                 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2919                         goto nla_put_failure;
2920         }
2921         if (rt->dst.dev &&
2922             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2923                 goto nla_put_failure;
2924 #ifdef CONFIG_IP_ROUTE_CLASSID
2925         if (rt->dst.tclassid &&
2926             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2927                 goto nla_put_failure;
2928 #endif
2929         if (!rt_is_input_route(rt) &&
2930             rt->rt_src != rt->rt_key_src) {
2931                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2932                         goto nla_put_failure;
2933         }
2934         if (rt->rt_dst != rt->rt_gateway &&
2935             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2936                 goto nla_put_failure;
2937
2938         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2939                 goto nla_put_failure;
2940
2941         if (rt->rt_mark &&
2942             nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2943                 goto nla_put_failure;
2944
2945         error = rt->dst.error;
2946         expires = rt->dst.expires;
2947         if (expires) {
2948                 if (time_before(jiffies, expires))
2949                         expires -= jiffies;
2950                 else
2951                         expires = 0;
2952         }
2953
2954         if (rt_is_input_route(rt)) {
2955 #ifdef CONFIG_IP_MROUTE
2956                 __be32 dst = rt->rt_dst;
2957
2958                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2959                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2960                         int err = ipmr_get_route(net, skb,
2961                                                  rt->rt_src, rt->rt_dst,
2962                                                  r, nowait);
2963                         if (err <= 0) {
2964                                 if (!nowait) {
2965                                         if (err == 0)
2966                                                 return 0;
2967                                         goto nla_put_failure;
2968                                 } else {
2969                                         if (err == -EMSGSIZE)
2970                                                 goto nla_put_failure;
2971                                         error = err;
2972                                 }
2973                         }
2974                 } else
2975 #endif
2976                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2977                                 goto nla_put_failure;
2978         }
2979
2980         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2981                 goto nla_put_failure;
2982
2983         return nlmsg_end(skb, nlh);
2984
2985 nla_put_failure:
2986         nlmsg_cancel(skb, nlh);
2987         return -EMSGSIZE;
2988 }
2989
2990 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2991 {
2992         struct net *net = sock_net(in_skb->sk);
2993         struct rtmsg *rtm;
2994         struct nlattr *tb[RTA_MAX+1];
2995         struct rtable *rt = NULL;
2996         __be32 dst = 0;
2997         __be32 src = 0;
2998         u32 iif;
2999         int err;
3000         int mark;
3001         struct sk_buff *skb;
3002
3003         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3004         if (err < 0)
3005                 goto errout;
3006
3007         rtm = nlmsg_data(nlh);
3008
3009         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3010         if (skb == NULL) {
3011                 err = -ENOBUFS;
3012                 goto errout;
3013         }
3014
3015         /* Reserve room for dummy headers, this skb can pass
3016            through good chunk of routing engine.
3017          */
3018         skb_reset_mac_header(skb);
3019         skb_reset_network_header(skb);
3020
3021         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3022         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3023         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3024
3025         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3026         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3027         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3028         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3029
3030         if (iif) {
3031                 struct net_device *dev;
3032
3033                 dev = __dev_get_by_index(net, iif);
3034                 if (dev == NULL) {
3035                         err = -ENODEV;
3036                         goto errout_free;
3037                 }
3038
3039                 skb->protocol   = htons(ETH_P_IP);
3040                 skb->dev        = dev;
3041                 skb->mark       = mark;
3042                 local_bh_disable();
3043                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3044                 local_bh_enable();
3045
3046                 rt = skb_rtable(skb);
3047                 if (err == 0 && rt->dst.error)
3048                         err = -rt->dst.error;
3049         } else {
3050                 struct flowi4 fl4 = {
3051                         .daddr = dst,
3052                         .saddr = src,
3053                         .flowi4_tos = rtm->rtm_tos,
3054                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3055                         .flowi4_mark = mark,
3056                 };
3057                 rt = ip_route_output_key(net, &fl4);
3058
3059                 err = 0;
3060                 if (IS_ERR(rt))
3061                         err = PTR_ERR(rt);
3062         }
3063
3064         if (err)
3065                 goto errout_free;
3066
3067         skb_dst_set(skb, &rt->dst);
3068         if (rtm->rtm_flags & RTM_F_NOTIFY)
3069                 rt->rt_flags |= RTCF_NOTIFY;
3070
3071         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3072                            RTM_NEWROUTE, 0, 0);
3073         if (err <= 0)
3074                 goto errout_free;
3075
3076         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3077 errout:
3078         return err;
3079
3080 errout_free:
3081         kfree_skb(skb);
3082         goto errout;
3083 }
3084
3085 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3086 {
3087         struct rtable *rt;
3088         int h, s_h;
3089         int idx, s_idx;
3090         struct net *net;
3091
3092         net = sock_net(skb->sk);
3093
3094         s_h = cb->args[0];
3095         if (s_h < 0)
3096                 s_h = 0;
3097         s_idx = idx = cb->args[1];
3098         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3099                 if (!rt_hash_table[h].chain)
3100                         continue;
3101                 rcu_read_lock_bh();
3102                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3103                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3104                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3105                                 continue;
3106                         if (rt_is_expired(rt))
3107                                 continue;
3108                         skb_dst_set_noref(skb, &rt->dst);
3109                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3110                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3111                                          1, NLM_F_MULTI) <= 0) {
3112                                 skb_dst_drop(skb);
3113                                 rcu_read_unlock_bh();
3114                                 goto done;
3115                         }
3116                         skb_dst_drop(skb);
3117                 }
3118                 rcu_read_unlock_bh();
3119         }
3120
3121 done:
3122         cb->args[0] = h;
3123         cb->args[1] = idx;
3124         return skb->len;
3125 }
3126
3127 void ip_rt_multicast_event(struct in_device *in_dev)
3128 {
3129         rt_cache_flush(dev_net(in_dev->dev), 0);
3130 }
3131
3132 #ifdef CONFIG_SYSCTL
3133 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3134                                         void __user *buffer,
3135                                         size_t *lenp, loff_t *ppos)
3136 {
3137         if (write) {
3138                 int flush_delay;
3139                 ctl_table ctl;
3140                 struct net *net;
3141
3142                 memcpy(&ctl, __ctl, sizeof(ctl));
3143                 ctl.data = &flush_delay;
3144                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3145
3146                 net = (struct net *)__ctl->extra1;
3147                 rt_cache_flush(net, flush_delay);
3148                 return 0;
3149         }
3150
3151         return -EINVAL;
3152 }
3153
3154 static ctl_table ipv4_route_table[] = {
3155         {
3156                 .procname       = "gc_thresh",
3157                 .data           = &ipv4_dst_ops.gc_thresh,
3158                 .maxlen         = sizeof(int),
3159                 .mode           = 0644,
3160                 .proc_handler   = proc_dointvec,
3161         },
3162         {
3163                 .procname       = "max_size",
3164                 .data           = &ip_rt_max_size,
3165                 .maxlen         = sizeof(int),
3166                 .mode           = 0644,
3167                 .proc_handler   = proc_dointvec,
3168         },
3169         {
3170                 /*  Deprecated. Use gc_min_interval_ms */
3171
3172                 .procname       = "gc_min_interval",
3173                 .data           = &ip_rt_gc_min_interval,
3174                 .maxlen         = sizeof(int),
3175                 .mode           = 0644,
3176                 .proc_handler   = proc_dointvec_jiffies,
3177         },
3178         {
3179                 .procname       = "gc_min_interval_ms",
3180                 .data           = &ip_rt_gc_min_interval,
3181                 .maxlen         = sizeof(int),
3182                 .mode           = 0644,
3183                 .proc_handler   = proc_dointvec_ms_jiffies,
3184         },
3185         {
3186                 .procname       = "gc_timeout",
3187                 .data           = &ip_rt_gc_timeout,
3188                 .maxlen         = sizeof(int),
3189                 .mode           = 0644,
3190                 .proc_handler   = proc_dointvec_jiffies,
3191         },
3192         {
3193                 .procname       = "gc_interval",
3194                 .data           = &ip_rt_gc_interval,
3195                 .maxlen         = sizeof(int),
3196                 .mode           = 0644,
3197                 .proc_handler   = proc_dointvec_jiffies,
3198         },
3199         {
3200                 .procname       = "redirect_load",
3201                 .data           = &ip_rt_redirect_load,
3202                 .maxlen         = sizeof(int),
3203                 .mode           = 0644,
3204                 .proc_handler   = proc_dointvec,
3205         },
3206         {
3207                 .procname       = "redirect_number",
3208                 .data           = &ip_rt_redirect_number,
3209                 .maxlen         = sizeof(int),
3210                 .mode           = 0644,
3211                 .proc_handler   = proc_dointvec,
3212         },
3213         {
3214                 .procname       = "redirect_silence",
3215                 .data           = &ip_rt_redirect_silence,
3216                 .maxlen         = sizeof(int),
3217                 .mode           = 0644,
3218                 .proc_handler   = proc_dointvec,
3219         },
3220         {
3221                 .procname       = "error_cost",
3222                 .data           = &ip_rt_error_cost,
3223                 .maxlen         = sizeof(int),
3224                 .mode           = 0644,
3225                 .proc_handler   = proc_dointvec,
3226         },
3227         {
3228                 .procname       = "error_burst",
3229                 .data           = &ip_rt_error_burst,
3230                 .maxlen         = sizeof(int),
3231                 .mode           = 0644,
3232                 .proc_handler   = proc_dointvec,
3233         },
3234         {
3235                 .procname       = "gc_elasticity",
3236                 .data           = &ip_rt_gc_elasticity,
3237                 .maxlen         = sizeof(int),
3238                 .mode           = 0644,
3239                 .proc_handler   = proc_dointvec,
3240         },
3241         {
3242                 .procname       = "mtu_expires",
3243                 .data           = &ip_rt_mtu_expires,
3244                 .maxlen         = sizeof(int),
3245                 .mode           = 0644,
3246                 .proc_handler   = proc_dointvec_jiffies,
3247         },
3248         {
3249                 .procname       = "min_pmtu",
3250                 .data           = &ip_rt_min_pmtu,
3251                 .maxlen         = sizeof(int),
3252                 .mode           = 0644,
3253                 .proc_handler   = proc_dointvec,
3254         },
3255         {
3256                 .procname       = "min_adv_mss",
3257                 .data           = &ip_rt_min_advmss,
3258                 .maxlen         = sizeof(int),
3259                 .mode           = 0644,
3260                 .proc_handler   = proc_dointvec,
3261         },
3262         { }
3263 };
3264
3265 static struct ctl_table ipv4_route_flush_table[] = {
3266         {
3267                 .procname       = "flush",
3268                 .maxlen         = sizeof(int),
3269                 .mode           = 0200,
3270                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3271         },
3272         { },
3273 };
3274
3275 static __net_init int sysctl_route_net_init(struct net *net)
3276 {
3277         struct ctl_table *tbl;
3278
3279         tbl = ipv4_route_flush_table;
3280         if (!net_eq(net, &init_net)) {
3281                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3282                 if (tbl == NULL)
3283                         goto err_dup;
3284         }
3285         tbl[0].extra1 = net;
3286
3287         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3288         if (net->ipv4.route_hdr == NULL)
3289                 goto err_reg;
3290         return 0;
3291
3292 err_reg:
3293         if (tbl != ipv4_route_flush_table)
3294                 kfree(tbl);
3295 err_dup:
3296         return -ENOMEM;
3297 }
3298
3299 static __net_exit void sysctl_route_net_exit(struct net *net)
3300 {
3301         struct ctl_table *tbl;
3302
3303         tbl = net->ipv4.route_hdr->ctl_table_arg;
3304         unregister_net_sysctl_table(net->ipv4.route_hdr);
3305         BUG_ON(tbl == ipv4_route_flush_table);
3306         kfree(tbl);
3307 }
3308
3309 static __net_initdata struct pernet_operations sysctl_route_ops = {
3310         .init = sysctl_route_net_init,
3311         .exit = sysctl_route_net_exit,
3312 };
3313 #endif
3314
3315 static __net_init int rt_genid_init(struct net *net)
3316 {
3317         get_random_bytes(&net->ipv4.rt_genid,
3318                          sizeof(net->ipv4.rt_genid));
3319         get_random_bytes(&net->ipv4.dev_addr_genid,
3320                          sizeof(net->ipv4.dev_addr_genid));
3321         return 0;
3322 }
3323
3324 static __net_initdata struct pernet_operations rt_genid_ops = {
3325         .init = rt_genid_init,
3326 };
3327
3328 static int __net_init ipv4_inetpeer_init(struct net *net)
3329 {
3330         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3331
3332         if (!bp)
3333                 return -ENOMEM;
3334         inet_peer_base_init(bp);
3335         net->ipv4.peers = bp;
3336         return 0;
3337 }
3338
3339 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3340 {
3341         struct inet_peer_base *bp = net->ipv4.peers;
3342
3343         net->ipv4.peers = NULL;
3344         inetpeer_invalidate_tree(bp);
3345         kfree(bp);
3346 }
3347
3348 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3349         .init   =       ipv4_inetpeer_init,
3350         .exit   =       ipv4_inetpeer_exit,
3351 };
3352
3353 #ifdef CONFIG_IP_ROUTE_CLASSID
3354 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3355 #endif /* CONFIG_IP_ROUTE_CLASSID */
3356
3357 static __initdata unsigned long rhash_entries;
3358 static int __init set_rhash_entries(char *str)
3359 {
3360         ssize_t ret;
3361
3362         if (!str)
3363                 return 0;
3364
3365         ret = kstrtoul(str, 0, &rhash_entries);
3366         if (ret)
3367                 return 0;
3368
3369         return 1;
3370 }
3371 __setup("rhash_entries=", set_rhash_entries);
3372
3373 int __init ip_rt_init(void)
3374 {
3375         int rc = 0;
3376
3377 #ifdef CONFIG_IP_ROUTE_CLASSID
3378         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3379         if (!ip_rt_acct)
3380                 panic("IP: failed to allocate ip_rt_acct\n");
3381 #endif
3382
3383         ipv4_dst_ops.kmem_cachep =
3384                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3385                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3386
3387         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3388
3389         if (dst_entries_init(&ipv4_dst_ops) < 0)
3390                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3391
3392         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3393                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3394
3395         rt_hash_table = (struct rt_hash_bucket *)
3396                 alloc_large_system_hash("IP route cache",
3397                                         sizeof(struct rt_hash_bucket),
3398                                         rhash_entries,
3399                                         (totalram_pages >= 128 * 1024) ?
3400                                         15 : 17,
3401                                         0,
3402                                         &rt_hash_log,
3403                                         &rt_hash_mask,
3404                                         0,
3405                                         rhash_entries ? 0 : 512 * 1024);
3406         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3407         rt_hash_lock_init();
3408
3409         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3410         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3411
3412         devinet_init();
3413         ip_fib_init();
3414
3415         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3416         expires_ljiffies = jiffies;
3417         schedule_delayed_work(&expires_work,
3418                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3419
3420         if (ip_rt_proc_init())
3421                 pr_err("Unable to create route proc files\n");
3422 #ifdef CONFIG_XFRM
3423         xfrm_init();
3424         xfrm4_init(ip_rt_max_size);
3425 #endif
3426         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3427
3428 #ifdef CONFIG_SYSCTL
3429         register_pernet_subsys(&sysctl_route_ops);
3430 #endif
3431         register_pernet_subsys(&rt_genid_ops);
3432         register_pernet_subsys(&ipv4_inetpeer_ops);
3433         return rc;
3434 }
3435
3436 #ifdef CONFIG_SYSCTL
3437 /*
3438  * We really need to sanitize the damn ipv4 init order, then all
3439  * this nonsense will go away.
3440  */
3441 void __init ip_static_sysctl_init(void)
3442 {
3443         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3444 }
3445 #endif