net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/bootmem.h>
  74 #include <linux/string.h>
  75 #include <linux/socket.h>
  76 #include <linux/sockios.h>
  77 #include <linux/errno.h>
  78 #include <linux/in.h>
  79 #include <linux/inet.h>
  80 #include <linux/netdevice.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/init.h>
  83 #include <linux/workqueue.h>
  84 #include <linux/skbuff.h>
  85 #include <linux/inetdevice.h>
  86 #include <linux/igmp.h>
  87 #include <linux/pkt_sched.h>
  88 #include <linux/mroute.h>
  89 #include <linux/netfilter_ipv4.h>
  90 #include <linux/random.h>
  91 #include <linux/jhash.h>
  92 #include <linux/rcupdate.h>
  93 #include <linux/times.h>
  94 #include <linux/slab.h>
  95 #include <linux/prefetch.h>
  96 #include <net/dst.h>
  97 #include <net/net_namespace.h>
  98 #include <net/protocol.h>
  99 #include <net/ip.h>
 100 #include <net/route.h>
 101 #include <net/inetpeer.h>
 102 #include <net/sock.h>
 103 #include <net/ip_fib.h>
 104 #include <net/arp.h>
 105 #include <net/tcp.h>
 106 #include <net/icmp.h>
 107 #include <net/xfrm.h>
 108 #include <net/netevent.h>
 109 #include <net/rtnetlink.h>
 110 #ifdef CONFIG_SYSCTL
 111 #include <linux/sysctl.h>
 112 #include <linux/kmemleak.h>
 113 #endif
 114 #include <net/secure_seq.h>
 115
 116 #define RT_FL_TOS(oldflp4) \
 117         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 118
 119 #define IP_MAX_MTU      0xFFF0
 120
 121 #define RT_GC_TIMEOUT (300*HZ)
 122
 123 static int ip_rt_max_size;
 124 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 126 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 127 static int ip_rt_redirect_number __read_mostly  = 9;
 128 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 130 static int ip_rt_error_cost __read_mostly       = HZ;
 131 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 132 static int ip_rt_gc_elasticity __read_mostly    = 8;
 133 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 134 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 135 static int ip_rt_min_advmss __read_mostly       = 256;
 136 static int rt_chain_length_max __read_mostly    = 20;
 137
 138 static struct delayed_work expires_work;
 139 static unsigned long expires_ljiffies;
 140
 141 /*
 142  *      Interface to generic destination cache.
 143  */
 144
 145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 146 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 147 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 148 static void              ipv4_dst_destroy(struct dst_entry *dst);
 149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 150 static void              ipv4_link_failure(struct sk_buff *skb);
 151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 152 static int rt_garbage_collect(struct dst_ops *ops);
 153
 154 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 155                             int how)
 156 {
 157 }
 158
 159 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 160 {
 161         struct rtable *rt = (struct rtable *) dst;
 162         struct inet_peer *peer;
 163         u32 *p = NULL;
 164
 165         if (!rt->peer)
 166                 rt_bind_peer(rt, rt->rt_dst, 1);
 167
 168         peer = rt->peer;
 169         if (peer) {
 170                 u32 *old_p = __DST_METRICS_PTR(old);
 171                 unsigned long prev, new;
 172
 173                 p = peer->metrics;
 174                 if (inet_metrics_new(peer))
 175                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 176
 177                 new = (unsigned long) p;
 178                 prev = cmpxchg(&dst->_metrics, old, new);
 179
 180                 if (prev != old) {
 181                         p = __DST_METRICS_PTR(prev);
 182                         if (prev & DST_METRICS_READ_ONLY)
 183                                 p = NULL;
 184                 } else {
 185                         if (rt->fi) {
 186                                 fib_info_put(rt->fi);
 187                                 rt->fi = NULL;
 188                         }
 189                 }
 190         }
 191         return p;
 192 }
 193
 194 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
 195
 196 static struct dst_ops ipv4_dst_ops = {
 197         .family =               AF_INET,
 198         .protocol =             cpu_to_be16(ETH_P_IP),
 199         .gc =                   rt_garbage_collect,
 200         .check =                ipv4_dst_check,
 201         .default_advmss =       ipv4_default_advmss,
 202         .mtu =                  ipv4_mtu,
 203         .cow_metrics =          ipv4_cow_metrics,
 204         .destroy =              ipv4_dst_destroy,
 205         .ifdown =               ipv4_dst_ifdown,
 206         .negative_advice =      ipv4_negative_advice,
 207         .link_failure =         ipv4_link_failure,
 208         .update_pmtu =          ip_rt_update_pmtu,
 209         .local_out =            __ip_local_out,
 210         .neigh_lookup =         ipv4_neigh_lookup,
 211 };
 212
 213 #define ECN_OR_COST(class)      TC_PRIO_##class
 214
 215 const __u8 ip_tos2prio[16] = {
 216         TC_PRIO_BESTEFFORT,
 217         ECN_OR_COST(BESTEFFORT),
 218         TC_PRIO_BESTEFFORT,
 219         ECN_OR_COST(BESTEFFORT),
 220         TC_PRIO_BULK,
 221         ECN_OR_COST(BULK),
 222         TC_PRIO_BULK,
 223         ECN_OR_COST(BULK),
 224         TC_PRIO_INTERACTIVE,
 225         ECN_OR_COST(INTERACTIVE),
 226         TC_PRIO_INTERACTIVE,
 227         ECN_OR_COST(INTERACTIVE),
 228         TC_PRIO_INTERACTIVE_BULK,
 229         ECN_OR_COST(INTERACTIVE_BULK),
 230         TC_PRIO_INTERACTIVE_BULK,
 231         ECN_OR_COST(INTERACTIVE_BULK)
 232 };
 233 EXPORT_SYMBOL(ip_tos2prio);
 234
 235 /*
 236  * Route cache.
 237  */
 238
 239 /* The locking scheme is rather straight forward:
 240  *
 241  * 1) Read-Copy Update protects the buckets of the central route hash.
 242  * 2) Only writers remove entries, and they hold the lock
 243  *    as they look at rtable reference counts.
 244  * 3) Only readers acquire references to rtable entries,
 245  *    they do so with atomic increments and with the
 246  *    lock held.
 247  */
 248
 249 struct rt_hash_bucket {
 250         struct rtable __rcu     *chain;
 251 };
 252
 253 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 254         defined(CONFIG_PROVE_LOCKING)
 255 /*
 256  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 257  * The size of this table is a power of two and depends on the number of CPUS.
 258  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 259  */
 260 #ifdef CONFIG_LOCKDEP
 261 # define RT_HASH_LOCK_SZ        256
 262 #else
 263 # if NR_CPUS >= 32
 264 #  define RT_HASH_LOCK_SZ       4096
 265 # elif NR_CPUS >= 16
 266 #  define RT_HASH_LOCK_SZ       2048
 267 # elif NR_CPUS >= 8
 268 #  define RT_HASH_LOCK_SZ       1024
 269 # elif NR_CPUS >= 4
 270 #  define RT_HASH_LOCK_SZ       512
 271 # else
 272 #  define RT_HASH_LOCK_SZ       256
 273 # endif
 274 #endif
 275
 276 static spinlock_t       *rt_hash_locks;
 277 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 278
 279 static __init void rt_hash_lock_init(void)
 280 {
 281         int i;
 282
 283         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 284                         GFP_KERNEL);
 285         if (!rt_hash_locks)
 286                 panic("IP: failed to allocate rt_hash_locks\n");
 287
 288         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 289                 spin_lock_init(&rt_hash_locks[i]);
 290 }
 291 #else
 292 # define rt_hash_lock_addr(slot) NULL
 293
 294 static inline void rt_hash_lock_init(void)
 295 {
 296 }
 297 #endif
 298
 299 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 300 static unsigned int             rt_hash_mask __read_mostly;
 301 static unsigned int             rt_hash_log  __read_mostly;
 302
 303 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 304 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 305
 306 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 307                                    int genid)
 308 {
 309         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 310                             idx, genid)
 311                 & rt_hash_mask;
 312 }
 313
 314 static inline int rt_genid(struct net *net)
 315 {
 316         return atomic_read(&net->ipv4.rt_genid);
 317 }
 318
 319 #ifdef CONFIG_PROC_FS
 320 struct rt_cache_iter_state {
 321         struct seq_net_private p;
 322         int bucket;
 323         int genid;
 324 };
 325
 326 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 327 {
 328         struct rt_cache_iter_state *st = seq->private;
 329         struct rtable *r = NULL;
 330
 331         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 332                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
 333                         continue;
 334                 rcu_read_lock_bh();
 335                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 336                 while (r) {
 337                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 338                             r->rt_genid == st->genid)
 339                                 return r;
 340                         r = rcu_dereference_bh(r->dst.rt_next);
 341                 }
 342                 rcu_read_unlock_bh();
 343         }
 344         return r;
 345 }
 346
 347 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 348                                           struct rtable *r)
 349 {
 350         struct rt_cache_iter_state *st = seq->private;
 351
 352         r = rcu_dereference_bh(r->dst.rt_next);
 353         while (!r) {
 354                 rcu_read_unlock_bh();
 355                 do {
 356                         if (--st->bucket < 0)
 357                                 return NULL;
 358                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
 359                 rcu_read_lock_bh();
 360                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 361         }
 362         return r;
 363 }
 364
 365 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 366                                         struct rtable *r)
 367 {
 368         struct rt_cache_iter_state *st = seq->private;
 369         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 370                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 371                         continue;
 372                 if (r->rt_genid == st->genid)
 373                         break;
 374         }
 375         return r;
 376 }
 377
 378 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 379 {
 380         struct rtable *r = rt_cache_get_first(seq);
 381
 382         if (r)
 383                 while (pos && (r = rt_cache_get_next(seq, r)))
 384                         --pos;
 385         return pos ? NULL : r;
 386 }
 387
 388 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 389 {
 390         struct rt_cache_iter_state *st = seq->private;
 391         if (*pos)
 392                 return rt_cache_get_idx(seq, *pos - 1);
 393         st->genid = rt_genid(seq_file_net(seq));
 394         return SEQ_START_TOKEN;
 395 }
 396
 397 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 398 {
 399         struct rtable *r;
 400
 401         if (v == SEQ_START_TOKEN)
 402                 r = rt_cache_get_first(seq);
 403         else
 404                 r = rt_cache_get_next(seq, v);
 405         ++*pos;
 406         return r;
 407 }
 408
 409 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 410 {
 411         if (v && v != SEQ_START_TOKEN)
 412                 rcu_read_unlock_bh();
 413 }
 414
 415 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 416 {
 417         if (v == SEQ_START_TOKEN)
 418                 seq_printf(seq, "%-127s\n",
 419                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 420                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 421                            "HHUptod\tSpecDst");
 422         else {
 423                 struct rtable *r = v;
 424                 struct neighbour *n;
 425                 int len, HHUptod;
 426
 427                 rcu_read_lock();
 428                 n = dst_get_neighbour_noref(&r->dst);
 429                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
 430                 rcu_read_unlock();
 431
 432                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 433                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 434                         r->dst.dev ? r->dst.dev->name : "*",
 435                         (__force u32)r->rt_dst,
 436                         (__force u32)r->rt_gateway,
 437                         r->rt_flags, atomic_read(&r->dst.__refcnt),
 438                         r->dst.__use, 0, (__force u32)r->rt_src,
 439                         dst_metric_advmss(&r->dst) + 40,
 440                         dst_metric(&r->dst, RTAX_WINDOW),
 441                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 442                               dst_metric(&r->dst, RTAX_RTTVAR)),
 443                         r->rt_key_tos,
 444                         -1,
 445                         HHUptod,
 446                         r->rt_spec_dst, &len);
 447
 448                 seq_printf(seq, "%*s\n", 127 - len, "");
 449         }
 450         return 0;
 451 }
 452
 453 static const struct seq_operations rt_cache_seq_ops = {
 454         .start  = rt_cache_seq_start,
 455         .next   = rt_cache_seq_next,
 456         .stop   = rt_cache_seq_stop,
 457         .show   = rt_cache_seq_show,
 458 };
 459
 460 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 461 {
 462         return seq_open_net(inode, file, &rt_cache_seq_ops,
 463                         sizeof(struct rt_cache_iter_state));
 464 }
 465
 466 static const struct file_operations rt_cache_seq_fops = {
 467         .owner   = THIS_MODULE,
 468         .open    = rt_cache_seq_open,
 469         .read    = seq_read,
 470         .llseek  = seq_lseek,
 471         .release = seq_release_net,
 472 };
 473
 474
 475 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 476 {
 477         int cpu;
 478
 479         if (*pos == 0)
 480                 return SEQ_START_TOKEN;
 481
 482         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 483                 if (!cpu_possible(cpu))
 484                         continue;
 485                 *pos = cpu+1;
 486                 return &per_cpu(rt_cache_stat, cpu);
 487         }
 488         return NULL;
 489 }
 490
 491 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 492 {
 493         int cpu;
 494
 495         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 496                 if (!cpu_possible(cpu))
 497                         continue;
 498                 *pos = cpu+1;
 499                 return &per_cpu(rt_cache_stat, cpu);
 500         }
 501         return NULL;
 502
 503 }
 504
 505 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 506 {
 507
 508 }
 509
 510 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 511 {
 512         struct rt_cache_stat *st = v;
 513
 514         if (v == SEQ_START_TOKEN) {
 515                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 516                 return 0;
 517         }
 518
 519         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 520                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 521                    dst_entries_get_slow(&ipv4_dst_ops),
 522                    st->in_hit,
 523                    st->in_slow_tot,
 524                    st->in_slow_mc,
 525                    st->in_no_route,
 526                    st->in_brd,
 527                    st->in_martian_dst,
 528                    st->in_martian_src,
 529
 530                    st->out_hit,
 531                    st->out_slow_tot,
 532                    st->out_slow_mc,
 533
 534                    st->gc_total,
 535                    st->gc_ignored,
 536                    st->gc_goal_miss,
 537                    st->gc_dst_overflow,
 538                    st->in_hlist_search,
 539                    st->out_hlist_search
 540                 );
 541         return 0;
 542 }
 543
 544 static const struct seq_operations rt_cpu_seq_ops = {
 545         .start  = rt_cpu_seq_start,
 546         .next   = rt_cpu_seq_next,
 547         .stop   = rt_cpu_seq_stop,
 548         .show   = rt_cpu_seq_show,
 549 };
 550
 551
 552 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 553 {
 554         return seq_open(file, &rt_cpu_seq_ops);
 555 }
 556
 557 static const struct file_operations rt_cpu_seq_fops = {
 558         .owner   = THIS_MODULE,
 559         .open    = rt_cpu_seq_open,
 560         .read    = seq_read,
 561         .llseek  = seq_lseek,
 562         .release = seq_release,
 563 };
 564
 565 #ifdef CONFIG_IP_ROUTE_CLASSID
 566 static int rt_acct_proc_show(struct seq_file *m, void *v)
 567 {
 568         struct ip_rt_acct *dst, *src;
 569         unsigned int i, j;
 570
 571         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 572         if (!dst)
 573                 return -ENOMEM;
 574
 575         for_each_possible_cpu(i) {
 576                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 577                 for (j = 0; j < 256; j++) {
 578                         dst[j].o_bytes   += src[j].o_bytes;
 579                         dst[j].o_packets += src[j].o_packets;
 580                         dst[j].i_bytes   += src[j].i_bytes;
 581                         dst[j].i_packets += src[j].i_packets;
 582                 }
 583         }
 584
 585         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 586         kfree(dst);
 587         return 0;
 588 }
 589
 590 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 591 {
 592         return single_open(file, rt_acct_proc_show, NULL);
 593 }
 594
 595 static const struct file_operations rt_acct_proc_fops = {
 596         .owner          = THIS_MODULE,
 597         .open           = rt_acct_proc_open,
 598         .read           = seq_read,
 599         .llseek         = seq_lseek,
 600         .release        = single_release,
 601 };
 602 #endif
 603
 604 static int __net_init ip_rt_do_proc_init(struct net *net)
 605 {
 606         struct proc_dir_entry *pde;
 607
 608         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 609                         &rt_cache_seq_fops);
 610         if (!pde)
 611                 goto err1;
 612
 613         pde = proc_create("rt_cache", S_IRUGO,
 614                           net->proc_net_stat, &rt_cpu_seq_fops);
 615         if (!pde)
 616                 goto err2;
 617
 618 #ifdef CONFIG_IP_ROUTE_CLASSID
 619         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 620         if (!pde)
 621                 goto err3;
 622 #endif
 623         return 0;
 624
 625 #ifdef CONFIG_IP_ROUTE_CLASSID
 626 err3:
 627         remove_proc_entry("rt_cache", net->proc_net_stat);
 628 #endif
 629 err2:
 630         remove_proc_entry("rt_cache", net->proc_net);
 631 err1:
 632         return -ENOMEM;
 633 }
 634
 635 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 636 {
 637         remove_proc_entry("rt_cache", net->proc_net_stat);
 638         remove_proc_entry("rt_cache", net->proc_net);
 639 #ifdef CONFIG_IP_ROUTE_CLASSID
 640         remove_proc_entry("rt_acct", net->proc_net);
 641 #endif
 642 }
 643
 644 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 645         .init = ip_rt_do_proc_init,
 646         .exit = ip_rt_do_proc_exit,
 647 };
 648
 649 static int __init ip_rt_proc_init(void)
 650 {
 651         return register_pernet_subsys(&ip_rt_proc_ops);
 652 }
 653
 654 #else
 655 static inline int ip_rt_proc_init(void)
 656 {
 657         return 0;
 658 }
 659 #endif /* CONFIG_PROC_FS */
 660
 661 static inline void rt_free(struct rtable *rt)
 662 {
 663         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 664 }
 665
 666 static inline void rt_drop(struct rtable *rt)
 667 {
 668         ip_rt_put(rt);
 669         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 670 }
 671
 672 static inline int rt_fast_clean(struct rtable *rth)
 673 {
 674         /* Kill broadcast/multicast entries very aggresively, if they
 675            collide in hash table with more useful entries */
 676         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 677                 rt_is_input_route(rth) && rth->dst.rt_next;
 678 }
 679
 680 static inline int rt_valuable(struct rtable *rth)
 681 {
 682         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 683                 (rth->peer && rth->peer->pmtu_expires);
 684 }
 685
 686 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 687 {
 688         unsigned long age;
 689         int ret = 0;
 690
 691         if (atomic_read(&rth->dst.__refcnt))
 692                 goto out;
 693
 694         age = jiffies - rth->dst.lastuse;
 695         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 696             (age <= tmo2 && rt_valuable(rth)))
 697                 goto out;
 698         ret = 1;
 699 out:    return ret;
 700 }
 701
 702 /* Bits of score are:
 703  * 31: very valuable
 704  * 30: not quite useless
 705  * 29..0: usage counter
 706  */
 707 static inline u32 rt_score(struct rtable *rt)
 708 {
 709         u32 score = jiffies - rt->dst.lastuse;
 710
 711         score = ~score & ~(3<<30);
 712
 713         if (rt_valuable(rt))
 714                 score |= (1<<31);
 715
 716         if (rt_is_output_route(rt) ||
 717             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 718                 score |= (1<<30);
 719
 720         return score;
 721 }
 722
 723 static inline bool rt_caching(const struct net *net)
 724 {
 725         return net->ipv4.current_rt_cache_rebuild_count <=
 726                 net->ipv4.sysctl_rt_cache_rebuild_count;
 727 }
 728
 729 static inline bool compare_hash_inputs(const struct rtable *rt1,
 730                                        const struct rtable *rt2)
 731 {
 732         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 733                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 734                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
 735 }
 736
 737 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 738 {
 739         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 740                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 741                 (rt1->rt_mark ^ rt2->rt_mark) |
 742                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
 743                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
 744                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
 745 }
 746
 747 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 748 {
 749         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 750 }
 751
 752 static inline int rt_is_expired(struct rtable *rth)
 753 {
 754         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 755 }
 756
 757 /*
 758  * Perform a full scan of hash table and free all entries.
 759  * Can be called by a softirq or a process.
 760  * In the later case, we want to be reschedule if necessary
 761  */
 762 static void rt_do_flush(struct net *net, int process_context)
 763 {
 764         unsigned int i;
 765         struct rtable *rth, *next;
 766
 767         for (i = 0; i <= rt_hash_mask; i++) {
 768                 struct rtable __rcu **pprev;
 769                 struct rtable *list;
 770
 771                 if (process_context && need_resched())
 772                         cond_resched();
 773                 rth = rcu_access_pointer(rt_hash_table[i].chain);
 774                 if (!rth)
 775                         continue;
 776
 777                 spin_lock_bh(rt_hash_lock_addr(i));
 778
 779                 list = NULL;
 780                 pprev = &rt_hash_table[i].chain;
 781                 rth = rcu_dereference_protected(*pprev,
 782                         lockdep_is_held(rt_hash_lock_addr(i)));
 783
 784                 while (rth) {
 785                         next = rcu_dereference_protected(rth->dst.rt_next,
 786                                 lockdep_is_held(rt_hash_lock_addr(i)));
 787
 788                         if (!net ||
 789                             net_eq(dev_net(rth->dst.dev), net)) {
 790                                 rcu_assign_pointer(*pprev, next);
 791                                 rcu_assign_pointer(rth->dst.rt_next, list);
 792                                 list = rth;
 793                         } else {
 794                                 pprev = &rth->dst.rt_next;
 795                         }
 796                         rth = next;
 797                 }
 798
 799                 spin_unlock_bh(rt_hash_lock_addr(i));
 800
 801                 for (; list; list = next) {
 802                         next = rcu_dereference_protected(list->dst.rt_next, 1);
 803                         rt_free(list);
 804                 }
 805         }
 806 }
 807
 808 /*
 809  * While freeing expired entries, we compute average chain length
 810  * and standard deviation, using fixed-point arithmetic.
 811  * This to have an estimation of rt_chain_length_max
 812  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 813  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 814  */
 815
 816 #define FRACT_BITS 3
 817 #define ONE (1UL << FRACT_BITS)
 818
 819 /*
 820  * Given a hash chain and an item in this hash chain,
 821  * find if a previous entry has the same hash_inputs
 822  * (but differs on tos, mark or oif)
 823  * Returns 0 if an alias is found.
 824  * Returns ONE if rth has no alias before itself.
 825  */
 826 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 827 {
 828         const struct rtable *aux = head;
 829
 830         while (aux != rth) {
 831                 if (compare_hash_inputs(aux, rth))
 832                         return 0;
 833                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 834         }
 835         return ONE;
 836 }
 837
     /*
      * Scan part of the route cache hash table and free aged entries.
      *
      * The number of buckets visited ("goal") is the number of jiffies elapsed
      * since the previous call (tracked in expires_ljiffies), shifted left by
      * rt_hash_log and divided by ip_rt_gc_timeout (when that is > 1), capped
      * at the table size (rt_hash_mask + 1).  The scan resumes after the bucket
      * remembered in the static "rover".
      *
      * In every non-empty bucket, with the bucket lock held:
      *  - entries for which rt_is_expired() is true are unlinked and freed;
      *  - entries with dst.expires set are kept while jiffies has not passed
      *    dst.expires, and are unlinked and freed otherwise;
      *  - remaining entries are kept when rt_may_expire() returns false; the
      *    "tmo" handed to rt_may_expire() is halved for every entry kept on
      *    the chain.
      *
      * For each kept entry has_noalias() contributes ONE (or 0 for an alias)
      * to the bucket "length"; the sums of length and length^2 give the
      * average and standard deviation used to update rt_chain_length_max.
      */
 838 static void rt_check_expire(void)
 839 {
 840         static unsigned int rover;
 841         unsigned int i = rover, goal;
 842         struct rtable *rth;
 843         struct rtable __rcu **rthp;
 844         unsigned long samples = 0;
 845         unsigned long sum = 0, sum2 = 0;
 846         unsigned long delta;
 847         u64 mult;
 848
 849         delta = jiffies - expires_ljiffies;
 850         expires_ljiffies = jiffies;
 851         mult = ((u64)delta) << rt_hash_log;
 852         if (ip_rt_gc_timeout > 1)
 853                 do_div(mult, ip_rt_gc_timeout);
 854         goal = (unsigned int)mult;
             /* Never visit more buckets than the table holds. */
 855         if (goal > rt_hash_mask)
 856                 goal = rt_hash_mask + 1;
 857         for (; goal > 0; goal--) {
 858                 unsigned long tmo = ip_rt_gc_timeout;
 859                 unsigned long length;
 860
 861                 i = (i + 1) & rt_hash_mask;
 862                 rthp = &rt_hash_table[i].chain;
 863
 864                 if (need_resched())
 865                         cond_resched();
 866
                     /* Every visited bucket is a sample, empty ones included. */
 867                 samples++;
 868
                     /* Peek without the lock: empty buckets are skipped. */
 869                 if (rcu_dereference_raw(*rthp) == NULL)
 870                         continue;
 871                 length = 0;
 872                 spin_lock_bh(rt_hash_lock_addr(i));
 873                 while ((rth = rcu_dereference_protected(*rthp,
 874                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
 875                         prefetch(rth->dst.rt_next);
 876                         if (rt_is_expired(rth)) {
 877                                 *rthp = rth->dst.rt_next;
 878                                 rt_free(rth);
 879                                 continue;
 880                         }
 881                         if (rth->dst.expires) {
 882                                 /* Entry is expired even if it is in use */
 883                                 if (time_before_eq(jiffies, rth->dst.expires)) {
                                     /*
                                      * Shared "keep this entry" path; also reached by
                                      * the goto below when rt_may_expire() is false.
                                      */
 884 nofree:
 885                                         tmo >>= 1;
 886                                         rthp = &rth->dst.rt_next;
 887                                         /*
 888                                          * We only count entries on
 889                                          * a chain with equal hash inputs once
 890                                          * so that entries for different QOS
 891                                          * levels, and other non-hash input
 892                                          * attributes don't unfairly skew
 893                                          * the length computation
 894                                          */
 895                                         length += has_noalias(rt_hash_table[i].chain, rth);
 896                                         continue;
 897                                 }
 898                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 899                                 goto nofree;
 900
 901                         /* Cleanup aged off entries. */
 902                         *rthp = rth->dst.rt_next;
 903                         rt_free(rth);
 904                 }
 905                 spin_unlock_bh(rt_hash_lock_addr(i));
 906                 sum += length;
 907                 sum2 += length*length;
 908         }
 909         if (samples) {
                     /* avg and sd are still in fixed point; the shift below drops the fraction. */
 910                 unsigned long avg = sum / samples;
 911                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 912                 rt_chain_length_max = max_t(unsigned long,
 913                                         ip_rt_gc_elasticity,
 914                                         (avg + 4*sd) >> FRACT_BITS);
 915         }
 916         rover = i;
 917 }
 918
 919 /*
 920  * rt_worker_func() is run in process context.
 921  * We call rt_check_expire() to scan part of the hash table, then
      * queue expires_work again with a delay of ip_rt_gc_interval.
      * The @work argument itself is not used.
 922  */
 923 static void rt_worker_func(struct work_struct *work)
 924 {
 925         rt_check_expire();
 926         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 927 }
 928
 929 /*
 930  * Perturbation of rt_genid by a small quantity [1..256]
 931  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 932  * many times (2^24) without giving recent rt_genid.
 933  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 934  */
 935 static void rt_cache_invalidate(struct net *net)
 936 {
 937         unsigned char shuffle;
 938
 939         get_random_bytes(&shuffle, sizeof(shuffle));
 940         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 941         inetpeer_invalidate_tree(AF_INET);
 942 }
 943
 944 /*
 945  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 946  * delay >= 0 : invalidate & flush cache (can be long)
 947  */
 948 void rt_cache_flush(struct net *net, int delay)
 949 {
 950         rt_cache_invalidate(net);
 951         if (delay >= 0)
 952                 rt_do_flush(net, !in_softirq());
 953 }
 954
 955 /* Flush previous cache invalidated entries from the cache */
     /* The second rt_do_flush() argument is non-zero when not running in softirq context. */
 956 void rt_cache_flush_batch(struct net *net)
 957 {
 958         rt_do_flush(net, !in_softirq());
 959 }
 960
     /*
      * Emit a rate-limited warning that a route hash chain grew too long,
      * then invalidate the cache of @net through rt_cache_invalidate().
      */
 961 static void rt_emergency_hash_rebuild(struct net *net)
 962 {
 963         net_warn_ratelimited("Route hash chain too long!\n");
 964         rt_cache_invalidate(net);
 965 }
 966
 967 /*
 968    Short description of GC goals.
 969
 970    We want to build an algorithm which will keep the routing cache
 971    at some equilibrium point, where the number of aged off entries
 972    is kept approximately equal to the number of newly generated ones.
 973
 974    Current expiration strength is the variable "expire".
 975    We try to adjust it dynamically, so that if networking
 976    is idle, expire is large enough to keep enough warm entries,
 977    and when load increases it is reduced to limit cache size.

        @ops is not referenced in the body; all accounting goes through
        ipv4_dst_ops directly.

        Returns 0 in every case except one: when the scan loop ends without
        reaching its goal and both the fast and the slow entry counts are
        still >= ip_rt_max_size, "dst cache overflow" is logged and 1 is
        returned.
 978  */
 979
 980 static int rt_garbage_collect(struct dst_ops *ops)
 981 {
             /* State below is static, i.e. carried over between invocations. */
 982         static unsigned long expire = RT_GC_TIMEOUT;
 983         static unsigned long last_gc;
 984         static int rover;
 985         static int equilibrium;
 986         struct rtable *rth;
 987         struct rtable __rcu **rthp;
 988         unsigned long now = jiffies;
 989         int goal;
 990         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 991
 992         /*
 993          * Garbage collection is pretty expensive,
 994          * do not make it too frequently.
 995          */
 996
 997         RT_CACHE_STAT_INC(gc_total);
 998
             /* Skip this run only if it is both too soon and the cache is below its maximum. */
 999         if (now - last_gc < ip_rt_gc_min_interval &&
1000             entries < ip_rt_max_size) {
1001                 RT_CACHE_STAT_INC(gc_ignored);
1002                 goto out;
1003         }
1004
1005         entries = dst_entries_get_slow(&ipv4_dst_ops);
1006         /* Calculate number of entries, which we want to expire now. */
1007         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1008         if (goal <= 0) {
                     /* Not above elasticity * table size: aim for the equilibrium point instead. */
1009                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1010                         equilibrium = ipv4_dst_ops.gc_thresh;
1011                 goal = entries - equilibrium;
1012                 if (goal > 0) {
1013                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1014                         goal = entries - equilibrium;
1015                 }
1016         } else {
1017                 /* We are in dangerous area. Try to reduce cache really
1018                  * aggressively.
1019                  */
1020                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1021                 equilibrium = entries - goal;
1022         }
1023
1024         if (now - last_gc >= ip_rt_gc_min_interval)
1025                 last_gc = now;
1026
1027         if (goal <= 0) {
1028                 equilibrium += goal;
1029                 goto work_done;
1030         }
1031
1032         do {
1033                 int i, k;
1034
                     /* At most one full pass over the table, starting after "rover". */
1035                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1036                         unsigned long tmo = expire;
1037
1038                         k = (k + 1) & rt_hash_mask;
1039                         rthp = &rt_hash_table[k].chain;
1040                         spin_lock_bh(rt_hash_lock_addr(k));
1041                         while ((rth = rcu_dereference_protected(*rthp,
1042                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
                                     /* Kept entry: halve tmo for the entries after it on this chain. */
1043                                 if (!rt_is_expired(rth) &&
1044                                         !rt_may_expire(rth, tmo, expire)) {
1045                                         tmo >>= 1;
1046                                         rthp = &rth->dst.rt_next;
1047                                         continue;
1048                                 }
1049                                 *rthp = rth->dst.rt_next;
1050                                 rt_free(rth);
1051                                 goal--;
1052                         }
1053                         spin_unlock_bh(rt_hash_lock_addr(k));
1054                         if (goal <= 0)
1055                                 break;
1056                 }
1057                 rover = k;
1058
1059                 if (goal <= 0)
1060                         goto work_done;
1061
1062                 /* Goal is not achieved. We stop process if:
1063
1064                    - if expire reduced to zero. Otherwise, expire is halfed.
1065                    - if table is not full.
1066                    - if we are called from interrupt.
1067                    - jiffies check is just fallback/debug loop breaker.
1068                      We will not spin here for long time in any case.
1069                  */
1070
1071                 RT_CACHE_STAT_INC(gc_goal_miss);
1072
1073                 if (expire == 0)
1074                         break;
1075
1076                 expire >>= 1;
1077
1078                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1079                         goto out;
1080         } while (!in_softirq() && time_before_eq(jiffies, now));
1081
             /* Goal missed: report overflow only if both counters confirm the cache is full. */
1082         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1083                 goto out;
1084         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1085                 goto out;
1086         net_warn_ratelimited("dst cache overflow\n");
1087         RT_CACHE_STAT_INC(gc_dst_overflow);
1088         return 1;
1089
1090 work_done:
             /*
              * Goal reached: raise expire by ip_rt_gc_min_interval, and reset it to
              * ip_rt_gc_timeout if it grew past that or the cache is below gc_thresh.
              */
1091         expire += ip_rt_gc_min_interval;
1092         if (expire > ip_rt_gc_timeout ||
1093             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1094             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1095                 expire = ip_rt_gc_timeout;
1096 out:    return 0;
1097 }
1098
1099 /*
1100  * Returns number of entries in a hash chain that have different hash_inputs
1101  */
1102 static int slow_chain_length(const struct rtable *head)
1103 {
1104         int length = 0;
1105         const struct rtable *rth = head;
1106
1107         while (rth) {
1108                 length += has_noalias(head, rth);
1109                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1110         }
1111         return length >> FRACT_BITS;
1112 }
1113
1114 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1115 {
1116         static const __be32 inaddr_any = 0;
1117         struct net_device *dev = dst->dev;
1118         const __be32 *pkey = daddr;
1119         const struct rtable *rt;
1120         struct neighbour *n;
1121
1122         rt = (const struct rtable *) dst;
1123
1124         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1125                 pkey = &inaddr_any;
1126         else if (rt->rt_gateway)
1127                 pkey = (const __be32 *) &rt->rt_gateway;
1128
1129         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1130         if (n)
1131                 return n;
1132         return neigh_create(&arp_tbl, pkey, dev);
1133 }
1134
1135 static int rt_bind_neighbour(struct rtable *rt)
1136 {
1137         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1138         if (IS_ERR(n))
1139                 return PTR_ERR(n);
1140         dst_set_neighbour(&rt->dst, n);
1141
1142         return 0;
1143 }
1144
     /*
      * Insert @rt into the route cache chain selected by @hash.
      *
      * @hash:    index of the bucket @rt belongs to
      * @rt:      the new route entry
      * @skb:     if non-NULL, its dst is set to the route that is returned
      * @ifindex: used to recompute @hash when the cache is rebuilt in here
      *
      * Returns:
      *  - @rt itself, once it has been linked at the head of the chain, or
      *    left unhashed (flagged DST_NOCACHE) when rt_caching() is false;
      *  - an already cached entry matching @rt per compare_keys() and
      *    compare_netns(); that entry is moved to the head of the chain and
      *    @rt is released with rt_drop();
      *  - an ERR_PTR() when binding the neighbour fails; @rt is released
      *    (rt_drop() or ip_rt_put()) before returning.
      *
      * While walking the chain, entries for which rt_is_expired() is true are
      * unlinked and freed, and the unreferenced entry with the lowest
      * rt_score() is remembered as an eviction candidate.
      */
1145 static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1146                                      struct sk_buff *skb, int ifindex)
1147 {
1148         struct rtable   *rth, *cand;
1149         struct rtable __rcu **rthp, **candp;
1150         unsigned long   now;
1151         u32             min_score;
1152         int             chain_length;
             /* One garbage-collect-and-retry is allowed, but only outside softirq. */
1153         int attempts = !in_softirq();
1154
1155 restart:
1156         chain_length = 0;
1157         min_score = ~(u32)0;
1158         cand = NULL;
1159         candp = NULL;
1160         now = jiffies;
1161
1162         if (!rt_caching(dev_net(rt->dst.dev))) {
1163                 /*
1164                  * If we're not caching, just tell the caller we
1165                  * were successful and don't touch the route.  The
1166                  * caller hold the sole reference to the cache entry, and
1167                  * it will be released when the caller is done with it.
1168                  * If we drop it here, the callers have no way to resolve routes
1169                  * when we're not caching.  Instead, just point *rp at rt, so
1170                  * the caller gets a single use out of the route
1171                  * Note that we do rt_free on this new route entry, so that
1172                  * once its refcount hits zero, we are still able to reap it
1173                  * (Thanks Alexey)
1174                  * Note: To avoid expensive rcu stuff for this uncached dst,
1175                  * we set DST_NOCACHE so that dst_release() can free dst without
1176                  * waiting a grace period.
1177                  */
1178
1179                 rt->dst.flags |= DST_NOCACHE;
1180                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1181                         int err = rt_bind_neighbour(rt);
1182                         if (err) {
1183                                 net_warn_ratelimited("Neighbour table failure & not caching routes\n");
1184                                 ip_rt_put(rt);
1185                                 return ERR_PTR(err);
1186                         }
1187                 }
1188
1189                 goto skip_hashing;
1190         }
1191
1192         rthp = &rt_hash_table[hash].chain;
1193
1194         spin_lock_bh(rt_hash_lock_addr(hash));
1195         while ((rth = rcu_dereference_protected(*rthp,
1196                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1197                 if (rt_is_expired(rth)) {
1198                         *rthp = rth->dst.rt_next;
1199                         rt_free(rth);
1200                         continue;
1201                 }
1202                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1203                         /* Put it first */
1204                         *rthp = rth->dst.rt_next;
1205                         /*
1206                          * Since lookup is lockfree, the deletion
1207                          * must be visible to another weakly ordered CPU before
1208                          * the insertion at the start of the hash chain.
1209                          */
1210                         rcu_assign_pointer(rth->dst.rt_next,
1211                                            rt_hash_table[hash].chain);
1212                         /*
1213                          * Since lookup is lockfree, the update writes
1214                          * must be ordered for consistency on SMP.
1215                          */
1216                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1217
1218                         dst_use(&rth->dst, now);
1219                         spin_unlock_bh(rt_hash_lock_addr(hash));
1220
1221                         rt_drop(rt);
1222                         if (skb)
1223                                 skb_dst_set(skb, &rth->dst);
1224                         return rth;
1225                 }
1226
                     /* Only entries with a zero refcount may become the eviction candidate. */
1227                 if (!atomic_read(&rth->dst.__refcnt)) {
1228                         u32 score = rt_score(rth);
1229
1230                         if (score <= min_score) {
1231                                 cand = rth;
1232                                 candp = rthp;
1233                                 min_score = score;
1234                         }
1235                 }
1236
1237                 chain_length++;
1238
1239                 rthp = &rth->dst.rt_next;
1240         }
1241
1242         if (cand) {
1243                 /* ip_rt_gc_elasticity used to be average length of chain
1244                  * length, when exceeded gc becomes really aggressive.
1245                  *
1246                  * The second limit is less certain. At the moment it allows
1247                  * only 2 entries per bucket. We will see.
1248                  */
1249                 if (chain_length > ip_rt_gc_elasticity) {
1250                         *candp = cand->dst.rt_next;
1251                         rt_free(cand);
1252                 }
1253         } else {
                     /*
                      * No evictable entry.  If the chain is too long even when
                      * aliases are counted once, invalidate the cache, recompute
                      * the hash with the new rt_genid and start over.
                      */
1254                 if (chain_length > rt_chain_length_max &&
1255                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1256                         struct net *net = dev_net(rt->dst.dev);
1257                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1258                         if (!rt_caching(net)) {
1259                                 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1260                                         rt->dst.dev->name, num);
1261                         }
1262                         rt_emergency_hash_rebuild(net);
1263                         spin_unlock_bh(rt_hash_lock_addr(hash));
1264
1265                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1266                                         ifindex, rt_genid(net));
1267                         goto restart;
1268                 }
1269         }
1270
1271         /* Try to bind route to arp only if it is output
1272            route or unicast forwarding path.
1273          */
1274         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1275                 int err = rt_bind_neighbour(rt);
1276                 if (err) {
1277                         spin_unlock_bh(rt_hash_lock_addr(hash));
1278
1279                         if (err != -ENOBUFS) {
1280                                 rt_drop(rt);
1281                                 return ERR_PTR(err);
1282                         }
1283
1284                         /* Neighbour tables are full and nothing
1285                            can be released. Try to shrink route cache,
1286                            it is most likely it holds some neighbour records.
1287                          */
1288                         if (attempts-- > 0) {
                                     /*
                                      * Temporarily override the two GC tunables for
                                      * this one collection, then restore them.
                                      */
1289                                 int saved_elasticity = ip_rt_gc_elasticity;
1290                                 int saved_int = ip_rt_gc_min_interval;
1291                                 ip_rt_gc_elasticity     = 1;
1292                                 ip_rt_gc_min_interval   = 0;
1293                                 rt_garbage_collect(&ipv4_dst_ops);
1294                                 ip_rt_gc_min_interval   = saved_int;
1295                                 ip_rt_gc_elasticity     = saved_elasticity;
1296                                 goto restart;
1297                         }
1298
1299                         net_warn_ratelimited("Neighbour table overflow\n");
1300                         rt_drop(rt);
1301                         return ERR_PTR(-ENOBUFS);
1302                 }
1303         }
1304
1305         rt->dst.rt_next = rt_hash_table[hash].chain;
1306
1307         /*
1308          * Since lookup is lockfree, we must make sure
1309          * previous writes to rt are committed to memory
1310          * before making rt visible to other CPUS.
1311          */
1312         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1313
1314         spin_unlock_bh(rt_hash_lock_addr(hash));
1315
1316 skip_hashing:
1317         if (skb)
1318                 skb_dst_set(skb, &rt->dst);
1319         return rt;
1320 }
1321
     /*
      * Peer generation counter.  It is incremented in ip_rt_redirect() when a
      * peer's learned redirect gateway changes, and sampled into
      * rt->rt_peer_genid by rt_bind_peer().
      */
1322 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1323
     /* Return the current value of __rt_peer_genid. */
1324 static u32 rt_peer_genid(void)
1325 {
1326         return atomic_read(&__rt_peer_genid);
1327 }
1328
1329 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1330 {
1331         struct inet_peer *peer;
1332
1333         peer = inet_getpeer_v4(daddr, create);
1334
1335         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1336                 inet_putpeer(peer);
1337         else
1338                 rt->rt_peer_genid = rt_peer_genid();
1339 }
1340
1341 /*
1342  * Peer allocation may fail only in serious out-of-memory conditions.  However
1343  * we still can generate some output.
1344  * Random ID selection looks a bit dangerous because we have no chances to
1345  * select ID being unique in a reasonable period of time.
1346  * But broken packet identifier may be better than no packet at all.
1347  */
1348 static void ip_select_fb_ident(struct iphdr *iph)
1349 {
1350         static DEFINE_SPINLOCK(ip_fb_id_lock);
1351         static u32 ip_fallback_id;
1352         u32 salt;
1353
1354         spin_lock_bh(&ip_fb_id_lock);
1355         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1356         iph->id = htons(salt & 0xFFFF);
1357         ip_fallback_id = salt;
1358         spin_unlock_bh(&ip_fb_id_lock);
1359 }
1360
1361 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1362 {
1363         struct rtable *rt = (struct rtable *) dst;
1364
1365         if (rt && !(rt->dst.flags & DST_NOPEER)) {
1366                 if (rt->peer == NULL)
1367                         rt_bind_peer(rt, rt->rt_dst, 1);
1368
1369                 /* If peer is attached to destination, it is never detached,
1370                    so that we need not to grab a lock to dereference it.
1371                  */
1372                 if (rt->peer) {
1373                         iph->id = htons(inet_getid(rt->peer, more));
1374                         return;
1375                 }
1376         } else if (!rt)
1377                 pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));
1378
1379         ip_select_fb_ident(iph);
1380 }
1381 EXPORT_SYMBOL(__ip_select_ident);
1382
1383 static void rt_del(unsigned int hash, struct rtable *rt)
1384 {
1385         struct rtable __rcu **rthp;
1386         struct rtable *aux;
1387
1388         rthp = &rt_hash_table[hash].chain;
1389         spin_lock_bh(rt_hash_lock_addr(hash));
1390         ip_rt_put(rt);
1391         while ((aux = rcu_dereference_protected(*rthp,
1392                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1393                 if (aux == rt || rt_is_expired(aux)) {
1394                         *rthp = aux->dst.rt_next;
1395                         rt_free(aux);
1396                         continue;
1397                 }
1398                 rthp = &aux->dst.rt_next;
1399         }
1400         spin_unlock_bh(rt_hash_lock_addr(hash));
1401 }
1402
1403 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1404 {
1405         struct rtable *rt = (struct rtable *) dst;
1406         __be32 orig_gw = rt->rt_gateway;
1407         struct neighbour *n, *old_n;
1408
1409         dst_confirm(&rt->dst);
1410
1411         rt->rt_gateway = peer->redirect_learned.a4;
1412
1413         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1414         if (IS_ERR(n)) {
1415                 rt->rt_gateway = orig_gw;
1416                 return;
1417         }
1418         old_n = xchg(&rt->dst._neighbour, n);
1419         if (old_n)
1420                 neigh_release(old_n);
1421         if (!(n->nud_state & NUD_VALID)) {
1422                 neigh_event_send(n, NULL);
1423         } else {
1424                 rt->rt_flags |= RTCF_REDIRECTED;
1425                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1426         }
1427 }
1428
     /*
      * Process a redirect from @old_gw advising @new_gw for destination @daddr,
      * received on @dev for traffic sourced from @saddr.
      *
      * The redirect is rejected (and, with CONFIG_IP_ROUTE_VERBOSE and martian
      * logging enabled, reported) when @new_gw equals @old_gw, redirects are
      * not accepted on the device, @new_gw is multicast, limited broadcast or
      * zeronet, or it fails the on-link / address-type checks below.
      *
      * Otherwise the cache chains for every combination of source key
      * {@saddr, 0} and oif key {@dev->ifindex, 0} are walked.  For each live
      * output route to @daddr via @old_gw on @dev, a peer is bound if needed,
      * @new_gw is stored in peer->redirect_learned.a4 (bumping __rt_peer_genid
      * when the value changed) and check_peer_redir() is applied to the route.
      */
1429 /* called in rcu_read_lock() section */
1430 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1431                     __be32 saddr, struct net_device *dev)
1432 {
1433         int s, i;
1434         struct in_device *in_dev = __in_dev_get_rcu(dev);
1435         __be32 skeys[2] = { saddr, 0 };
1436         int    ikeys[2] = { dev->ifindex, 0 };
1437         struct inet_peer *peer;
1438         struct net *net;
1439
1440         if (!in_dev)
1441                 return;
1442
1443         net = dev_net(dev);
1444         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1445             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1446             ipv4_is_zeronet(new_gw))
1447                 goto reject_redirect;
1448
1449         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1450                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1451                         goto reject_redirect;
1452                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1453                         goto reject_redirect;
1454         } else {
1455                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1456                         goto reject_redirect;
1457         }
1458
1459         for (s = 0; s < 2; s++) {
1460                 for (i = 0; i < 2; i++) {
1461                         unsigned int hash;
1462                         struct rtable __rcu **rthp;
1463                         struct rtable *rt;
1464
1465                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1466
1467                         rthp = &rt_hash_table[hash].chain;
1468
                             /* Walk the chain with rcu_dereference() only; no bucket lock is taken here. */
1469                         while ((rt = rcu_dereference(*rthp)) != NULL) {
                                     /* Advance first so that "continue" below moves to the next entry. */
1470                                 rthp = &rt->dst.rt_next;
1471
1472                                 if (rt->rt_key_dst != daddr ||
1473                                     rt->rt_key_src != skeys[s] ||
1474                                     rt->rt_oif != ikeys[i] ||
1475                                     rt_is_input_route(rt) ||
1476                                     rt_is_expired(rt) ||
1477                                     !net_eq(dev_net(rt->dst.dev), net) ||
1478                                     rt->dst.error ||
1479                                     rt->dst.dev != dev ||
1480                                     rt->rt_gateway != old_gw)
1481                                         continue;
1482
1483                                 if (!rt->peer)
1484                                         rt_bind_peer(rt, rt->rt_dst, 1);
1485
1486                                 peer = rt->peer;
1487                                 if (peer) {
1488                                         if (peer->redirect_learned.a4 != new_gw) {
1489                                                 peer->redirect_learned.a4 = new_gw;
1490                                                 atomic_inc(&__rt_peer_genid);
1491                                         }
1492                                         check_peer_redir(&rt->dst, peer);
1493                                 }
1494                         }
1495                 }
1496         }
1497         return;
1498
1499 reject_redirect:
1500 #ifdef CONFIG_IP_ROUTE_VERBOSE
1501         if (IN_DEV_LOG_MARTIANS(in_dev))
1502                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1503                                      "  Advised path = %pI4 -> %pI4\n",
1504                                      &old_gw, dev->name, &new_gw,
1505                                      &saddr, &daddr);
1506 #endif
             /* Empty statement so the label is still followed by a statement when the #ifdef body is compiled out. */
1507         ;
1508 }
1509
1510 static bool peer_pmtu_expired(struct inet_peer *peer)
1511 {
1512         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1513
1514         return orig &&
1515                time_after_eq(jiffies, orig) &&
1516                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1517 }
1518
1519 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1520 {
1521         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1522
1523         return orig &&
1524                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1525 }
1526
1527 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1528 {
1529         struct rtable *rt = (struct rtable *)dst;
1530         struct dst_entry *ret = dst;
1531
1532         if (rt) {
1533                 if (dst->obsolete > 0) {
1534                         ip_rt_put(rt);
1535                         ret = NULL;
1536                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1537                         unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1538                                                 rt->rt_oif,
1539                                                 rt_genid(dev_net(dst->dev)));
1540                         rt_del(hash, rt);
1541                         ret = NULL;
1542                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1543                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1544                 }
1545         }
1546         return ret;
1547 }
1548
1549 /*
1550  * Algorithm:
1551  *      1. The first ip_rt_redirect_number redirects are sent
1552  *         with exponential backoff, then we stop sending them at all,
1553  *         assuming that the host ignores our redirects.
1554  *      2. If we did not see packets requiring redirects
1555  *         during ip_rt_redirect_silence, we assume that the host
1556  *         forgot redirected route and start to send redirects again.
1557  *
1558  * This algorithm is much cheaper and more intelligent than dumb load limiting
1559  * in icmp.c.
1560  *
1561  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1562  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1563  */
1564
1565 void ip_rt_send_redirect(struct sk_buff *skb)
1566 {
1567         struct rtable *rt = skb_rtable(skb);
1568         struct in_device *in_dev;
1569         struct inet_peer *peer;
1570         int log_martians;
1571
1572         rcu_read_lock();
1573         in_dev = __in_dev_get_rcu(rt->dst.dev);
1574         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1575                 rcu_read_unlock();
1576                 return;
1577         }
1578         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1579         rcu_read_unlock();
1580
1581         if (!rt->peer)
1582                 rt_bind_peer(rt, rt->rt_dst, 1);
1583         peer = rt->peer;
1584         if (!peer) {
1585                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1586                 return;
1587         }
1588
1589         /* No redirected packets during ip_rt_redirect_silence;
1590          * reset the algorithm.
1591          */
1592         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1593                 peer->rate_tokens = 0;
1594
1595         /* Too many ignored redirects; do not send anything
1596          * set dst.rate_last to the last seen redirected packet.
1597          */
1598         if (peer->rate_tokens >= ip_rt_redirect_number) {
1599                 peer->rate_last = jiffies;
1600                 return;
1601         }
1602
1603         /* Check for load limit; set rate_last to the latest sent
1604          * redirect.
1605          */
1606         if (peer->rate_tokens == 0 ||
1607             time_after(jiffies,
1608                        (peer->rate_last +
1609                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1610                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1611                 peer->rate_last = jiffies;
1612                 ++peer->rate_tokens;
1613 #ifdef CONFIG_IP_ROUTE_VERBOSE
1614                 if (log_martians &&
1615                     peer->rate_tokens == ip_rt_redirect_number)
1616                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1617                                              &ip_hdr(skb)->saddr, rt->rt_iif,
1618                                              &rt->rt_dst, &rt->rt_gateway);
1619 #endif
1620         }
1621 }
1622
1623 static int ip_error(struct sk_buff *skb)
1624 {
1625         struct rtable *rt = skb_rtable(skb);
1626         struct inet_peer *peer;
1627         unsigned long now;
1628         bool send;
1629         int code;
1630
1631         switch (rt->dst.error) {
1632         case EINVAL:
1633         default:
1634                 goto out;
1635         case EHOSTUNREACH:
1636                 code = ICMP_HOST_UNREACH;
1637                 break;
1638         case ENETUNREACH:
1639                 code = ICMP_NET_UNREACH;
1640                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1641                                 IPSTATS_MIB_INNOROUTES);
1642                 break;
1643         case EACCES:
1644                 code = ICMP_PKT_FILTERED;
1645                 break;
1646         }
1647
1648         if (!rt->peer)
1649                 rt_bind_peer(rt, rt->rt_dst, 1);
1650         peer = rt->peer;
1651
1652         send = true;
1653         if (peer) {
1654                 now = jiffies;
1655                 peer->rate_tokens += now - peer->rate_last;
1656                 if (peer->rate_tokens > ip_rt_error_burst)
1657                         peer->rate_tokens = ip_rt_error_burst;
1658                 peer->rate_last = now;
1659                 if (peer->rate_tokens >= ip_rt_error_cost)
1660                         peer->rate_tokens -= ip_rt_error_cost;
1661                 else
1662                         send = false;
1663         }
1664         if (send)
1665                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1666
1667 out:    kfree_skb(skb);
1668         return 0;
1669 }
1670
1671 /*
1672  *      The last two values are not from the RFC but
1673  *      are needed for AMPRnet AX.25 paths.
1674  */
1675
/* Candidate MTU values, in strictly descending order. */
static const unsigned short mtu_plateau[] = {
        32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest mtu_plateau[] entry that is strictly smaller than
 * old_mtu, or 68 when old_mtu is not larger than any entry.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
        const unsigned short *plateau = mtu_plateau;
        const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

        while (plateau < end) {
                if (*plateau < old_mtu)
                        return *plateau;
                plateau++;
        }
        return 68;
}
1688
1689 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1690                                  unsigned short new_mtu,
1691                                  struct net_device *dev)
1692 {
1693         unsigned short old_mtu = ntohs(iph->tot_len);
1694         unsigned short est_mtu = 0;
1695         struct inet_peer *peer;
1696
1697         peer = inet_getpeer_v4(iph->daddr, 1);
1698         if (peer) {
1699                 unsigned short mtu = new_mtu;
1700
1701                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1702                         /* BSD 4.2 derived systems incorrectly adjust
1703                          * tot_len by the IP header length, and report
1704                          * a zero MTU in the ICMP message.
1705                          */
1706                         if (mtu == 0 &&
1707                             old_mtu >= 68 + (iph->ihl << 2))
1708                                 old_mtu -= iph->ihl << 2;
1709                         mtu = guess_mtu(old_mtu);
1710                 }
1711
1712                 if (mtu < ip_rt_min_pmtu)
1713                         mtu = ip_rt_min_pmtu;
1714                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1715                         unsigned long pmtu_expires;
1716
1717                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1718                         if (!pmtu_expires)
1719                                 pmtu_expires = 1UL;
1720
1721                         est_mtu = mtu;
1722                         peer->pmtu_learned = mtu;
1723                         peer->pmtu_expires = pmtu_expires;
1724                         atomic_inc(&__rt_peer_genid);
1725                 }
1726
1727                 inet_putpeer(peer);
1728         }
1729         return est_mtu ? : new_mtu;
1730 }
1731
1732 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1733 {
1734         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1735
1736         if (!expires)
1737                 return;
1738         if (time_before(jiffies, expires)) {
1739                 u32 orig_dst_mtu = dst_mtu(dst);
1740                 if (peer->pmtu_learned < orig_dst_mtu) {
1741                         if (!peer->pmtu_orig)
1742                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1743                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1744                 }
1745         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1746                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1747 }
1748
1749 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1750 {
1751         struct rtable *rt = (struct rtable *) dst;
1752         struct inet_peer *peer;
1753
1754         dst_confirm(dst);
1755
1756         if (!rt->peer)
1757                 rt_bind_peer(rt, rt->rt_dst, 1);
1758         peer = rt->peer;
1759         if (peer) {
1760                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1761
1762                 if (mtu < ip_rt_min_pmtu)
1763                         mtu = ip_rt_min_pmtu;
1764                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1765
1766                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1767                         if (!pmtu_expires)
1768                                 pmtu_expires = 1UL;
1769
1770                         peer->pmtu_learned = mtu;
1771                         peer->pmtu_expires = pmtu_expires;
1772
1773                         atomic_inc(&__rt_peer_genid);
1774                         rt->rt_peer_genid = rt_peer_genid();
1775                 }
1776                 check_peer_pmtu(dst, peer);
1777         }
1778 }
1779
1780
1781 static void ipv4_validate_peer(struct rtable *rt)
1782 {
1783         if (rt->rt_peer_genid != rt_peer_genid()) {
1784                 struct inet_peer *peer;
1785
1786                 if (!rt->peer)
1787                         rt_bind_peer(rt, rt->rt_dst, 0);
1788
1789                 peer = rt->peer;
1790                 if (peer) {
1791                         check_peer_pmtu(&rt->dst, peer);
1792
1793                         if (peer->redirect_learned.a4 &&
1794                             peer->redirect_learned.a4 != rt->rt_gateway)
1795                                 check_peer_redir(&rt->dst, peer);
1796                 }
1797
1798                 rt->rt_peer_genid = rt_peer_genid();
1799         }
1800 }
1801
1802 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1803 {
1804         struct rtable *rt = (struct rtable *) dst;
1805
1806         if (rt_is_expired(rt))
1807                 return NULL;
1808         ipv4_validate_peer(rt);
1809         return dst;
1810 }
1811
1812 static void ipv4_dst_destroy(struct dst_entry *dst)
1813 {
1814         struct rtable *rt = (struct rtable *) dst;
1815         struct inet_peer *peer = rt->peer;
1816
1817         if (rt->fi) {
1818                 fib_info_put(rt->fi);
1819                 rt->fi = NULL;
1820         }
1821         if (peer) {
1822                 rt->peer = NULL;
1823                 inet_putpeer(peer);
1824         }
1825 }
1826
1827
1828 static void ipv4_link_failure(struct sk_buff *skb)
1829 {
1830         struct rtable *rt;
1831
1832         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1833
1834         rt = skb_rtable(skb);
1835         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1836                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1837 }
1838
1839 static int ip_rt_bug(struct sk_buff *skb)
1840 {
1841         pr_debug("%s: %pI4 -> %pI4, %s\n",
1842                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1843                  skb->dev ? skb->dev->name : "?");
1844         kfree_skb(skb);
1845         WARN_ON(1);
1846         return 0;
1847 }
1848
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is off the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1857
1858 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1859 {
1860         __be32 src;
1861
1862         if (rt_is_output_route(rt))
1863                 src = ip_hdr(skb)->saddr;
1864         else {
1865                 struct fib_result res;
1866                 struct flowi4 fl4;
1867                 struct iphdr *iph;
1868
1869                 iph = ip_hdr(skb);
1870
1871                 memset(&fl4, 0, sizeof(fl4));
1872                 fl4.daddr = iph->daddr;
1873                 fl4.saddr = iph->saddr;
1874                 fl4.flowi4_tos = RT_TOS(iph->tos);
1875                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1876                 fl4.flowi4_iif = skb->dev->ifindex;
1877                 fl4.flowi4_mark = skb->mark;
1878
1879                 rcu_read_lock();
1880                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1881                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1882                 else
1883                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1884                                         RT_SCOPE_UNIVERSE);
1885                 rcu_read_unlock();
1886         }
1887         memcpy(addr, &src, 4);
1888 }
1889
1890 #ifdef CONFIG_IP_ROUTE_CLASSID
1891 static void set_class_tag(struct rtable *rt, u32 tag)
1892 {
1893         if (!(rt->dst.tclassid & 0xFFFF))
1894                 rt->dst.tclassid |= tag & 0xFFFF;
1895         if (!(rt->dst.tclassid & 0xFFFF0000))
1896                 rt->dst.tclassid |= tag & 0xFFFF0000;
1897 }
1898 #endif
1899
1900 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1901 {
1902         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1903
1904         if (advmss == 0) {
1905                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1906                                ip_rt_min_advmss);
1907                 if (advmss > 65535 - 40)
1908                         advmss = 65535 - 40;
1909         }
1910         return advmss;
1911 }
1912
1913 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1914 {
1915         const struct rtable *rt = (const struct rtable *) dst;
1916         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1917
1918         if (mtu && rt_is_output_route(rt))
1919                 return mtu;
1920
1921         mtu = dst->dev->mtu;
1922
1923         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1924
1925                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1926                         mtu = 576;
1927         }
1928
1929         if (mtu > IP_MAX_MTU)
1930                 mtu = IP_MAX_MTU;
1931
1932         return mtu;
1933 }
1934
1935 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1936                             struct fib_info *fi)
1937 {
1938         struct inet_peer *peer;
1939         int create = 0;
1940
1941         /* If a peer entry exists for this destination, we must hook
1942          * it up in order to get at cached metrics.
1943          */
1944         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1945                 create = 1;
1946
1947         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1948         if (peer) {
1949                 rt->rt_peer_genid = rt_peer_genid();
1950                 if (inet_metrics_new(peer))
1951                         memcpy(peer->metrics, fi->fib_metrics,
1952                                sizeof(u32) * RTAX_MAX);
1953                 dst_init_metrics(&rt->dst, peer->metrics, false);
1954
1955                 check_peer_pmtu(&rt->dst, peer);
1956
1957                 if (peer->redirect_learned.a4 &&
1958                     peer->redirect_learned.a4 != rt->rt_gateway) {
1959                         rt->rt_gateway = peer->redirect_learned.a4;
1960                         rt->rt_flags |= RTCF_REDIRECTED;
1961                 }
1962         } else {
1963                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1964                         rt->fi = fi;
1965                         atomic_inc(&fi->fib_clntref);
1966                 }
1967                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1968         }
1969 }
1970
1971 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1972                            const struct fib_result *res,
1973                            struct fib_info *fi, u16 type, u32 itag)
1974 {
1975         struct dst_entry *dst = &rt->dst;
1976
1977         if (fi) {
1978                 if (FIB_RES_GW(*res) &&
1979                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1980                         rt->rt_gateway = FIB_RES_GW(*res);
1981                 rt_init_metrics(rt, fl4, fi);
1982 #ifdef CONFIG_IP_ROUTE_CLASSID
1983                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1984 #endif
1985         }
1986
1987         if (dst_mtu(dst) > IP_MAX_MTU)
1988                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1989         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1990                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1991
1992 #ifdef CONFIG_IP_ROUTE_CLASSID
1993 #ifdef CONFIG_IP_MULTIPLE_TABLES
1994         set_class_tag(rt, fib_rules_tclass(res));
1995 #endif
1996         set_class_tag(rt, itag);
1997 #endif
1998 }
1999
2000 static struct rtable *rt_dst_alloc(struct net_device *dev,
2001                                    bool nopolicy, bool noxfrm)
2002 {
2003         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2004                          DST_HOST |
2005                          (nopolicy ? DST_NOPOLICY : 0) |
2006                          (noxfrm ? DST_NOXFRM : 0));
2007 }
2008
2009 /* called in rcu_read_lock() section */
2010 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2011                                 u8 tos, struct net_device *dev, int our)
2012 {
2013         unsigned int hash;
2014         struct rtable *rth;
2015         __be32 spec_dst;
2016         struct in_device *in_dev = __in_dev_get_rcu(dev);
2017         u32 itag = 0;
2018         int err;
2019
2020         /* Primary sanity checks. */
2021
2022         if (in_dev == NULL)
2023                 return -EINVAL;
2024
2025         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2026             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2027                 goto e_inval;
2028
2029         if (ipv4_is_zeronet(saddr)) {
2030                 if (!ipv4_is_local_multicast(daddr))
2031                         goto e_inval;
2032                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2033         } else {
2034                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2035                                           &itag);
2036                 if (err < 0)
2037                         goto e_err;
2038         }
2039         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
2040                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2041         if (!rth)
2042                 goto e_nobufs;
2043
2044 #ifdef CONFIG_IP_ROUTE_CLASSID
2045         rth->dst.tclassid = itag;
2046 #endif
2047         rth->dst.output = ip_rt_bug;
2048
2049         rth->rt_key_dst = daddr;
2050         rth->rt_key_src = saddr;
2051         rth->rt_genid   = rt_genid(dev_net(dev));
2052         rth->rt_flags   = RTCF_MULTICAST;
2053         rth->rt_type    = RTN_MULTICAST;
2054         rth->rt_key_tos = tos;
2055         rth->rt_dst     = daddr;
2056         rth->rt_src     = saddr;
2057         rth->rt_route_iif = dev->ifindex;
2058         rth->rt_iif     = dev->ifindex;
2059         rth->rt_oif     = 0;
2060         rth->rt_mark    = skb->mark;
2061         rth->rt_gateway = daddr;
2062         rth->rt_spec_dst= spec_dst;
2063         rth->rt_peer_genid = 0;
2064         rth->peer = NULL;
2065         rth->fi = NULL;
2066         if (our) {
2067                 rth->dst.input= ip_local_deliver;
2068                 rth->rt_flags |= RTCF_LOCAL;
2069         }
2070
2071 #ifdef CONFIG_IP_MROUTE
2072         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2073                 rth->dst.input = ip_mr_input;
2074 #endif
2075         RT_CACHE_STAT_INC(in_slow_mc);
2076
2077         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2078         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2079         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2080
2081 e_nobufs:
2082         return -ENOBUFS;
2083 e_inval:
2084         return -EINVAL;
2085 e_err:
2086         return err;
2087 }
2088
2089
2090 static void ip_handle_martian_source(struct net_device *dev,
2091                                      struct in_device *in_dev,
2092                                      struct sk_buff *skb,
2093                                      __be32 daddr,
2094                                      __be32 saddr)
2095 {
2096         RT_CACHE_STAT_INC(in_martian_src);
2097 #ifdef CONFIG_IP_ROUTE_VERBOSE
2098         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2099                 /*
2100                  *      RFC1812 recommendation, if source is martian,
2101                  *      the only hint is MAC header.
2102                  */
2103                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2104                         &daddr, &saddr, dev->name);
2105                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2106                         print_hex_dump(KERN_WARNING, "ll header: ",
2107                                        DUMP_PREFIX_OFFSET, 16, 1,
2108                                        skb_mac_header(skb),
2109                                        dev->hard_header_len, true);
2110                 }
2111         }
2112 #endif
2113 }
2114
2115 /* called in rcu_read_lock() section */
2116 static int __mkroute_input(struct sk_buff *skb,
2117                            const struct fib_result *res,
2118                            struct in_device *in_dev,
2119                            __be32 daddr, __be32 saddr, u32 tos,
2120                            struct rtable **result)
2121 {
2122         struct rtable *rth;
2123         int err;
2124         struct in_device *out_dev;
2125         unsigned int flags = 0;
2126         __be32 spec_dst;
2127         u32 itag;
2128
2129         /* get a working reference to the output device */
2130         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2131         if (out_dev == NULL) {
2132                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
2133                 return -EINVAL;
2134         }
2135
2136
2137         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2138                                   in_dev->dev, &spec_dst, &itag);
2139         if (err < 0) {
2140                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2141                                          saddr);
2142
2143                 goto cleanup;
2144         }
2145
2146         if (err)
2147                 flags |= RTCF_DIRECTSRC;
2148
2149         if (out_dev == in_dev && err &&
2150             (IN_DEV_SHARED_MEDIA(out_dev) ||
2151              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2152                 flags |= RTCF_DOREDIRECT;
2153
2154         if (skb->protocol != htons(ETH_P_IP)) {
2155                 /* Not IP (i.e. ARP). Do not create route, if it is
2156                  * invalid for proxy arp. DNAT routes are always valid.
2157                  *
2158                  * Proxy arp feature have been extended to allow, ARP
2159                  * replies back to the same interface, to support
2160                  * Private VLAN switch technologies. See arp.c.
2161                  */
2162                 if (out_dev == in_dev &&
2163                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2164                         err = -EINVAL;
2165                         goto cleanup;
2166                 }
2167         }
2168
2169         rth = rt_dst_alloc(out_dev->dev,
2170                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2171                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2172         if (!rth) {
2173                 err = -ENOBUFS;
2174                 goto cleanup;
2175         }
2176
2177         rth->rt_key_dst = daddr;
2178         rth->rt_key_src = saddr;
2179         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2180         rth->rt_flags = flags;
2181         rth->rt_type = res->type;
2182         rth->rt_key_tos = tos;
2183         rth->rt_dst     = daddr;
2184         rth->rt_src     = saddr;
2185         rth->rt_route_iif = in_dev->dev->ifindex;
2186         rth->rt_iif     = in_dev->dev->ifindex;
2187         rth->rt_oif     = 0;
2188         rth->rt_mark    = skb->mark;
2189         rth->rt_gateway = daddr;
2190         rth->rt_spec_dst= spec_dst;
2191         rth->rt_peer_genid = 0;
2192         rth->peer = NULL;
2193         rth->fi = NULL;
2194
2195         rth->dst.input = ip_forward;
2196         rth->dst.output = ip_output;
2197
2198         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2199
2200         *result = rth;
2201         err = 0;
2202  cleanup:
2203         return err;
2204 }
2205
2206 static int ip_mkroute_input(struct sk_buff *skb,
2207                             struct fib_result *res,
2208                             const struct flowi4 *fl4,
2209                             struct in_device *in_dev,
2210                             __be32 daddr, __be32 saddr, u32 tos)
2211 {
2212         struct rtable *rth = NULL;
2213         int err;
2214         unsigned int hash;
2215
2216 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2217         if (res->fi && res->fi->fib_nhs > 1)
2218                 fib_select_multipath(res);
2219 #endif
2220
2221         /* create a routing cache entry */
2222         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2223         if (err)
2224                 return err;
2225
2226         /* put it into the cache */
2227         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2228                        rt_genid(dev_net(rth->dst.dev)));
2229         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2230         if (IS_ERR(rth))
2231                 return PTR_ERR(rth);
2232         return 0;
2233 }
2234
/*
 *      NOTE. We drop all packets that have a local source
 *      address, because every properly looped-back packet
 *      must already have the correct destination attached by the
 *      output routine.
 *
 *      This approach solves two big problems:
 *      1. Non-simplex devices are handled properly.
 *      2. IP spoofing attempts are filtered with a 100% guarantee.
 *      Called with rcu_read_lock().
 */
2245
2246 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2247                                u8 tos, struct net_device *dev)
2248 {
2249         struct fib_result res;
2250         struct in_device *in_dev = __in_dev_get_rcu(dev);
2251         struct flowi4   fl4;
2252         unsigned int    flags = 0;
2253         u32             itag = 0;
2254         struct rtable   *rth;
2255         unsigned int    hash;
2256         __be32          spec_dst;
2257         int             err = -EINVAL;
2258         struct net    *net = dev_net(dev);
2259
2260         /* IP on this device is disabled. */
2261
2262         if (!in_dev)
2263                 goto out;
2264
2265         /* Check for the most weird martians, which can be not detected
2266            by fib_lookup.
2267          */
2268
2269         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2270             ipv4_is_loopback(saddr))
2271                 goto martian_source;
2272
2273         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2274                 goto brd_input;
2275
2276         /* Accept zero addresses only to limited broadcast;
2277          * I even do not know to fix it or not. Waiting for complains :-)
2278          */
2279         if (ipv4_is_zeronet(saddr))
2280                 goto martian_source;
2281
2282         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2283                 goto martian_destination;
2284
2285         /*
2286          *      Now we are ready to route packet.
2287          */
2288         fl4.flowi4_oif = 0;
2289         fl4.flowi4_iif = dev->ifindex;
2290         fl4.flowi4_mark = skb->mark;
2291         fl4.flowi4_tos = tos;
2292         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2293         fl4.daddr = daddr;
2294         fl4.saddr = saddr;
2295         err = fib_lookup(net, &fl4, &res);
2296         if (err != 0) {
2297                 if (!IN_DEV_FORWARD(in_dev))
2298                         goto e_hostunreach;
2299                 goto no_route;
2300         }
2301
2302         RT_CACHE_STAT_INC(in_slow_tot);
2303
2304         if (res.type == RTN_BROADCAST)
2305                 goto brd_input;
2306
2307         if (res.type == RTN_LOCAL) {
2308                 err = fib_validate_source(skb, saddr, daddr, tos,
2309                                           net->loopback_dev->ifindex,
2310                                           dev, &spec_dst, &itag);
2311                 if (err < 0)
2312                         goto martian_source_keep_err;
2313                 if (err)
2314                         flags |= RTCF_DIRECTSRC;
2315                 spec_dst = daddr;
2316                 goto local_input;
2317         }
2318
2319         if (!IN_DEV_FORWARD(in_dev))
2320                 goto e_hostunreach;
2321         if (res.type != RTN_UNICAST)
2322                 goto martian_destination;
2323
2324         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2325 out:    return err;
2326
2327 brd_input:
2328         if (skb->protocol != htons(ETH_P_IP))
2329                 goto e_inval;
2330
2331         if (ipv4_is_zeronet(saddr))
2332                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2333         else {
2334                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2335                                           &itag);
2336                 if (err < 0)
2337                         goto martian_source_keep_err;
2338                 if (err)
2339                         flags |= RTCF_DIRECTSRC;
2340         }
2341         flags |= RTCF_BROADCAST;
2342         res.type = RTN_BROADCAST;
2343         RT_CACHE_STAT_INC(in_brd);
2344
2345 local_input:
2346         rth = rt_dst_alloc(net->loopback_dev,
2347                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2348         if (!rth)
2349                 goto e_nobufs;
2350
2351         rth->dst.input= ip_local_deliver;
2352         rth->dst.output= ip_rt_bug;
2353 #ifdef CONFIG_IP_ROUTE_CLASSID
2354         rth->dst.tclassid = itag;
2355 #endif
2356
2357         rth->rt_key_dst = daddr;
2358         rth->rt_key_src = saddr;
2359         rth->rt_genid = rt_genid(net);
2360         rth->rt_flags   = flags|RTCF_LOCAL;
2361         rth->rt_type    = res.type;
2362         rth->rt_key_tos = tos;
2363         rth->rt_dst     = daddr;
2364         rth->rt_src     = saddr;
2365 #ifdef CONFIG_IP_ROUTE_CLASSID
2366         rth->dst.tclassid = itag;
2367 #endif
2368         rth->rt_route_iif = dev->ifindex;
2369         rth->rt_iif     = dev->ifindex;
2370         rth->rt_oif     = 0;
2371         rth->rt_mark    = skb->mark;
2372         rth->rt_gateway = daddr;
2373         rth->rt_spec_dst= spec_dst;
2374         rth->rt_peer_genid = 0;
2375         rth->peer = NULL;
2376         rth->fi = NULL;
2377         if (res.type == RTN_UNREACHABLE) {
2378                 rth->dst.input= ip_error;
2379                 rth->dst.error= -err;
2380                 rth->rt_flags   &= ~RTCF_LOCAL;
2381         }
2382         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2383         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2384         err = 0;
2385         if (IS_ERR(rth))
2386                 err = PTR_ERR(rth);
2387         goto out;
2388
2389 no_route:
2390         RT_CACHE_STAT_INC(in_no_route);
2391         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2392         res.type = RTN_UNREACHABLE;
2393         if (err == -ESRCH)
2394                 err = -ENETUNREACH;
2395         goto local_input;
2396
2397         /*
2398          *      Do not cache martian addresses: they should be logged (RFC1812)
2399          */
2400 martian_destination:
2401         RT_CACHE_STAT_INC(in_martian_dst);
2402 #ifdef CONFIG_IP_ROUTE_VERBOSE
2403         if (IN_DEV_LOG_MARTIANS(in_dev))
2404                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2405                                      &daddr, &saddr, dev->name);
2406 #endif
2407
2408 e_hostunreach:
2409         err = -EHOSTUNREACH;
2410         goto out;
2411
2412 e_inval:
2413         err = -EINVAL;
2414         goto out;
2415
2416 e_nobufs:
2417         err = -ENOBUFS;
2418         goto out;
2419
2420 martian_source:
2421         err = -EINVAL;
2422 martian_source_keep_err:
2423         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2424         goto out;
2425 }
2426
2427 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2428                            u8 tos, struct net_device *dev, bool noref)
2429 {
2430         struct rtable   *rth;
2431         unsigned int    hash;
2432         int iif = dev->ifindex;
2433         struct net *net;
2434         int res;
2435
2436         net = dev_net(dev);
2437
2438         rcu_read_lock();
2439
2440         if (!rt_caching(net))
2441                 goto skip_cache;
2442
2443         tos &= IPTOS_RT_MASK;
2444         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2445
2446         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2447              rth = rcu_dereference(rth->dst.rt_next)) {
2448                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2449                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2450                      (rth->rt_route_iif ^ iif) |
2451                      (rth->rt_key_tos ^ tos)) == 0 &&
2452                     rth->rt_mark == skb->mark &&
2453                     net_eq(dev_net(rth->dst.dev), net) &&
2454                     !rt_is_expired(rth)) {
2455                         ipv4_validate_peer(rth);
2456                         if (noref) {
2457                                 dst_use_noref(&rth->dst, jiffies);
2458                                 skb_dst_set_noref(skb, &rth->dst);
2459                         } else {
2460                                 dst_use(&rth->dst, jiffies);
2461                                 skb_dst_set(skb, &rth->dst);
2462                         }
2463                         RT_CACHE_STAT_INC(in_hit);
2464                         rcu_read_unlock();
2465                         return 0;
2466                 }
2467                 RT_CACHE_STAT_INC(in_hlist_search);
2468         }
2469
2470 skip_cache:
2471         /* Multicast recognition logic is moved from route cache to here.
2472            The problem was that too many Ethernet cards have broken/missing
2473            hardware multicast filters :-( As result the host on multicasting
2474            network acquires a lot of useless route cache entries, sort of
2475            SDR messages from all the world. Now we try to get rid of them.
2476            Really, provided software IP multicast filter is organized
2477            reasonably (at least, hashed), it does not result in a slowdown
2478            comparing with route cache reject entries.
2479            Note, that multicast routers are not affected, because
2480            route cache entry is created eventually.
2481          */
2482         if (ipv4_is_multicast(daddr)) {
2483                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2484
2485                 if (in_dev) {
2486                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2487                                                   ip_hdr(skb)->protocol);
2488                         if (our
2489 #ifdef CONFIG_IP_MROUTE
2490                                 ||
2491                             (!ipv4_is_local_multicast(daddr) &&
2492                              IN_DEV_MFORWARD(in_dev))
2493 #endif
2494                            ) {
2495                                 int res = ip_route_input_mc(skb, daddr, saddr,
2496                                                             tos, dev, our);
2497                                 rcu_read_unlock();
2498                                 return res;
2499                         }
2500                 }
2501                 rcu_read_unlock();
2502                 return -EINVAL;
2503         }
2504         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2505         rcu_read_unlock();
2506         return res;
2507 }
2508 EXPORT_SYMBOL(ip_route_input_common);
2509
2510 /* called with rcu_read_lock() */
2511 static struct rtable *__mkroute_output(const struct fib_result *res,
2512                                        const struct flowi4 *fl4,
2513                                        __be32 orig_daddr, __be32 orig_saddr,
2514                                        int orig_oif, __u8 orig_rtos,
2515                                        struct net_device *dev_out,
2516                                        unsigned int flags)
2517 {
2518         struct fib_info *fi = res->fi;
2519         struct in_device *in_dev;
2520         u16 type = res->type;
2521         struct rtable *rth;
2522
2523         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2524                 return ERR_PTR(-EINVAL);
2525
2526         if (ipv4_is_lbcast(fl4->daddr))
2527                 type = RTN_BROADCAST;
2528         else if (ipv4_is_multicast(fl4->daddr))
2529                 type = RTN_MULTICAST;
2530         else if (ipv4_is_zeronet(fl4->daddr))
2531                 return ERR_PTR(-EINVAL);
2532
2533         if (dev_out->flags & IFF_LOOPBACK)
2534                 flags |= RTCF_LOCAL;
2535
2536         in_dev = __in_dev_get_rcu(dev_out);
2537         if (!in_dev)
2538                 return ERR_PTR(-EINVAL);
2539
2540         if (type == RTN_BROADCAST) {
2541                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2542                 fi = NULL;
2543         } else if (type == RTN_MULTICAST) {
2544                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2545                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2546                                      fl4->flowi4_proto))
2547                         flags &= ~RTCF_LOCAL;
2548                 /* If multicast route do not exist use
2549                  * default one, but do not gateway in this case.
2550                  * Yes, it is hack.
2551                  */
2552                 if (fi && res->prefixlen < 4)
2553                         fi = NULL;
2554         }
2555
2556         rth = rt_dst_alloc(dev_out,
2557                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2558                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2559         if (!rth)
2560                 return ERR_PTR(-ENOBUFS);
2561
2562         rth->dst.output = ip_output;
2563
2564         rth->rt_key_dst = orig_daddr;
2565         rth->rt_key_src = orig_saddr;
2566         rth->rt_genid = rt_genid(dev_net(dev_out));
2567         rth->rt_flags   = flags;
2568         rth->rt_type    = type;
2569         rth->rt_key_tos = orig_rtos;
2570         rth->rt_dst     = fl4->daddr;
2571         rth->rt_src     = fl4->saddr;
2572         rth->rt_route_iif = 0;
2573         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2574         rth->rt_oif     = orig_oif;
2575         rth->rt_mark    = fl4->flowi4_mark;
2576         rth->rt_gateway = fl4->daddr;
2577         rth->rt_spec_dst= fl4->saddr;
2578         rth->rt_peer_genid = 0;
2579         rth->peer = NULL;
2580         rth->fi = NULL;
2581
2582         RT_CACHE_STAT_INC(out_slow_tot);
2583
2584         if (flags & RTCF_LOCAL) {
2585                 rth->dst.input = ip_local_deliver;
2586                 rth->rt_spec_dst = fl4->daddr;
2587         }
2588         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2589                 rth->rt_spec_dst = fl4->saddr;
2590                 if (flags & RTCF_LOCAL &&
2591                     !(dev_out->flags & IFF_LOOPBACK)) {
2592                         rth->dst.output = ip_mc_output;
2593                         RT_CACHE_STAT_INC(out_slow_mc);
2594                 }
2595 #ifdef CONFIG_IP_MROUTE
2596                 if (type == RTN_MULTICAST) {
2597                         if (IN_DEV_MFORWARD(in_dev) &&
2598                             !ipv4_is_local_multicast(fl4->daddr)) {
2599                                 rth->dst.input = ip_mr_input;
2600                                 rth->dst.output = ip_mc_output;
2601                         }
2602                 }
2603 #endif
2604         }
2605
2606         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2607
2608         return rth;
2609 }
2610
2611 /*
2612  * Major route resolver routine.
2613  * called with rcu_read_lock();
2614  */
2615
2616 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2617 {
2618         struct net_device *dev_out = NULL;
2619         __u8 tos = RT_FL_TOS(fl4);
2620         unsigned int flags = 0;
2621         struct fib_result res;
2622         struct rtable *rth;
2623         __be32 orig_daddr;
2624         __be32 orig_saddr;
2625         int orig_oif;
2626
2627         res.fi          = NULL;
2628 #ifdef CONFIG_IP_MULTIPLE_TABLES
2629         res.r           = NULL;
2630 #endif
2631
2632         orig_daddr = fl4->daddr;
2633         orig_saddr = fl4->saddr;
2634         orig_oif = fl4->flowi4_oif;
2635
2636         fl4->flowi4_iif = net->loopback_dev->ifindex;
2637         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2638         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2639                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2640
2641         rcu_read_lock();
2642         if (fl4->saddr) {
2643                 rth = ERR_PTR(-EINVAL);
2644                 if (ipv4_is_multicast(fl4->saddr) ||
2645                     ipv4_is_lbcast(fl4->saddr) ||
2646                     ipv4_is_zeronet(fl4->saddr))
2647                         goto out;
2648
2649                 /* I removed check for oif == dev_out->oif here.
2650                    It was wrong for two reasons:
2651                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2652                       is assigned to multiple interfaces.
2653                    2. Moreover, we are allowed to send packets with saddr
2654                       of another iface. --ANK
2655                  */
2656
2657                 if (fl4->flowi4_oif == 0 &&
2658                     (ipv4_is_multicast(fl4->daddr) ||
2659                      ipv4_is_lbcast(fl4->daddr))) {
2660                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2661                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2662                         if (dev_out == NULL)
2663                                 goto out;
2664
2665                         /* Special hack: user can direct multicasts
2666                            and limited broadcast via necessary interface
2667                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2668                            This hack is not just for fun, it allows
2669                            vic,vat and friends to work.
2670                            They bind socket to loopback, set ttl to zero
2671                            and expect that it will work.
2672                            From the viewpoint of routing cache they are broken,
2673                            because we are not allowed to build multicast path
2674                            with loopback source addr (look, routing cache
2675                            cannot know, that ttl is zero, so that packet
2676                            will not leave this host and route is valid).
2677                            Luckily, this hack is good workaround.
2678                          */
2679
2680                         fl4->flowi4_oif = dev_out->ifindex;
2681                         goto make_route;
2682                 }
2683
2684                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2685                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2686                         if (!__ip_dev_find(net, fl4->saddr, false))
2687                                 goto out;
2688                 }
2689         }
2690
2691
2692         if (fl4->flowi4_oif) {
2693                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2694                 rth = ERR_PTR(-ENODEV);
2695                 if (dev_out == NULL)
2696                         goto out;
2697
2698                 /* RACE: Check return value of inet_select_addr instead. */
2699                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2700                         rth = ERR_PTR(-ENETUNREACH);
2701                         goto out;
2702                 }
2703                 if (ipv4_is_local_multicast(fl4->daddr) ||
2704                     ipv4_is_lbcast(fl4->daddr)) {
2705                         if (!fl4->saddr)
2706                                 fl4->saddr = inet_select_addr(dev_out, 0,
2707                                                               RT_SCOPE_LINK);
2708                         goto make_route;
2709                 }
2710                 if (fl4->saddr) {
2711                         if (ipv4_is_multicast(fl4->daddr))
2712                                 fl4->saddr = inet_select_addr(dev_out, 0,
2713                                                               fl4->flowi4_scope);
2714                         else if (!fl4->daddr)
2715                                 fl4->saddr = inet_select_addr(dev_out, 0,
2716                                                               RT_SCOPE_HOST);
2717                 }
2718         }
2719
2720         if (!fl4->daddr) {
2721                 fl4->daddr = fl4->saddr;
2722                 if (!fl4->daddr)
2723                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2724                 dev_out = net->loopback_dev;
2725                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2726                 res.type = RTN_LOCAL;
2727                 flags |= RTCF_LOCAL;
2728                 goto make_route;
2729         }
2730
2731         if (fib_lookup(net, fl4, &res)) {
2732                 res.fi = NULL;
2733                 if (fl4->flowi4_oif) {
2734                         /* Apparently, routing tables are wrong. Assume,
2735                            that the destination is on link.
2736
2737                            WHY? DW.
2738                            Because we are allowed to send to iface
2739                            even if it has NO routes and NO assigned
2740                            addresses. When oif is specified, routing
2741                            tables are looked up with only one purpose:
2742                            to catch if destination is gatewayed, rather than
2743                            direct. Moreover, if MSG_DONTROUTE is set,
2744                            we send packet, ignoring both routing tables
2745                            and ifaddr state. --ANK
2746
2747
2748                            We could make it even if oif is unknown,
2749                            likely IPv6, but we do not.
2750                          */
2751
2752                         if (fl4->saddr == 0)
2753                                 fl4->saddr = inet_select_addr(dev_out, 0,
2754                                                               RT_SCOPE_LINK);
2755                         res.type = RTN_UNICAST;
2756                         goto make_route;
2757                 }
2758                 rth = ERR_PTR(-ENETUNREACH);
2759                 goto out;
2760         }
2761
2762         if (res.type == RTN_LOCAL) {
2763                 if (!fl4->saddr) {
2764                         if (res.fi->fib_prefsrc)
2765                                 fl4->saddr = res.fi->fib_prefsrc;
2766                         else
2767                                 fl4->saddr = fl4->daddr;
2768                 }
2769                 dev_out = net->loopback_dev;
2770                 fl4->flowi4_oif = dev_out->ifindex;
2771                 res.fi = NULL;
2772                 flags |= RTCF_LOCAL;
2773                 goto make_route;
2774         }
2775
2776 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2777         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2778                 fib_select_multipath(&res);
2779         else
2780 #endif
2781         if (!res.prefixlen &&
2782             res.table->tb_num_default > 1 &&
2783             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2784                 fib_select_default(&res);
2785
2786         if (!fl4->saddr)
2787                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2788
2789         dev_out = FIB_RES_DEV(res);
2790         fl4->flowi4_oif = dev_out->ifindex;
2791
2792
2793 make_route:
2794         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2795                                tos, dev_out, flags);
2796         if (!IS_ERR(rth)) {
2797                 unsigned int hash;
2798
2799                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2800                                rt_genid(dev_net(dev_out)));
2801                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2802         }
2803
2804 out:
2805         rcu_read_unlock();
2806         return rth;
2807 }
2808
2809 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2810 {
2811         struct rtable *rth;
2812         unsigned int hash;
2813
2814         if (!rt_caching(net))
2815                 goto slow_output;
2816
2817         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2818
2819         rcu_read_lock_bh();
2820         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2821                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2822                 if (rth->rt_key_dst == flp4->daddr &&
2823                     rth->rt_key_src == flp4->saddr &&
2824                     rt_is_output_route(rth) &&
2825                     rth->rt_oif == flp4->flowi4_oif &&
2826                     rth->rt_mark == flp4->flowi4_mark &&
2827                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2828                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2829                     net_eq(dev_net(rth->dst.dev), net) &&
2830                     !rt_is_expired(rth)) {
2831                         ipv4_validate_peer(rth);
2832                         dst_use(&rth->dst, jiffies);
2833                         RT_CACHE_STAT_INC(out_hit);
2834                         rcu_read_unlock_bh();
2835                         if (!flp4->saddr)
2836                                 flp4->saddr = rth->rt_src;
2837                         if (!flp4->daddr)
2838                                 flp4->daddr = rth->rt_dst;
2839                         return rth;
2840                 }
2841                 RT_CACHE_STAT_INC(out_hlist_search);
2842         }
2843         rcu_read_unlock_bh();
2844
2845 slow_output:
2846         return ip_route_output_slow(net, flp4);
2847 }
2848 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2849
2850 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2851 {
2852         return NULL;
2853 }
2854
2855 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2856 {
2857         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2858
2859         return mtu ? : dst->dev->mtu;
2860 }
2861
2862 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2863 {
2864 }
2865
2866 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2867                                           unsigned long old)
2868 {
2869         return NULL;
2870 }
2871
2872 static struct dst_ops ipv4_dst_blackhole_ops = {
2873         .family                 =       AF_INET,
2874         .protocol               =       cpu_to_be16(ETH_P_IP),
2875         .destroy                =       ipv4_dst_destroy,
2876         .check                  =       ipv4_blackhole_dst_check,
2877         .mtu                    =       ipv4_blackhole_mtu,
2878         .default_advmss         =       ipv4_default_advmss,
2879         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2880         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2881         .neigh_lookup           =       ipv4_neigh_lookup,
2882 };
2883
2884 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2885 {
2886         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2887         struct rtable *ort = (struct rtable *) dst_orig;
2888
2889         if (rt) {
2890                 struct dst_entry *new = &rt->dst;
2891
2892                 new->__use = 1;
2893                 new->input = dst_discard;
2894                 new->output = dst_discard;
2895                 dst_copy_metrics(new, &ort->dst);
2896
2897                 new->dev = ort->dst.dev;
2898                 if (new->dev)
2899                         dev_hold(new->dev);
2900
2901                 rt->rt_key_dst = ort->rt_key_dst;
2902                 rt->rt_key_src = ort->rt_key_src;
2903                 rt->rt_key_tos = ort->rt_key_tos;
2904                 rt->rt_route_iif = ort->rt_route_iif;
2905                 rt->rt_iif = ort->rt_iif;
2906                 rt->rt_oif = ort->rt_oif;
2907                 rt->rt_mark = ort->rt_mark;
2908
2909                 rt->rt_genid = rt_genid(net);
2910                 rt->rt_flags = ort->rt_flags;
2911                 rt->rt_type = ort->rt_type;
2912                 rt->rt_dst = ort->rt_dst;
2913                 rt->rt_src = ort->rt_src;
2914                 rt->rt_gateway = ort->rt_gateway;
2915                 rt->rt_spec_dst = ort->rt_spec_dst;
2916                 rt->peer = ort->peer;
2917                 if (rt->peer)
2918                         atomic_inc(&rt->peer->refcnt);
2919                 rt->fi = ort->fi;
2920                 if (rt->fi)
2921                         atomic_inc(&rt->fi->fib_clntref);
2922
2923                 dst_free(new);
2924         }
2925
2926         dst_release(dst_orig);
2927
2928         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2929 }
2930
2931 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2932                                     struct sock *sk)
2933 {
2934         struct rtable *rt = __ip_route_output_key(net, flp4);
2935
2936         if (IS_ERR(rt))
2937                 return rt;
2938
2939         if (flp4->flowi4_proto)
2940                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2941                                                    flowi4_to_flowi(flp4),
2942                                                    sk, 0);
2943
2944         return rt;
2945 }
2946 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2947
2948 static int rt_fill_info(struct net *net,
2949                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2950                         int nowait, unsigned int flags)
2951 {
2952         struct rtable *rt = skb_rtable(skb);
2953         struct rtmsg *r;
2954         struct nlmsghdr *nlh;
2955         unsigned long expires = 0;
2956         const struct inet_peer *peer = rt->peer;
2957         u32 id = 0, ts = 0, tsage = 0, error;
2958
2959         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2960         if (nlh == NULL)
2961                 return -EMSGSIZE;
2962
2963         r = nlmsg_data(nlh);
2964         r->rtm_family    = AF_INET;
2965         r->rtm_dst_len  = 32;
2966         r->rtm_src_len  = 0;
2967         r->rtm_tos      = rt->rt_key_tos;
2968         r->rtm_table    = RT_TABLE_MAIN;
2969         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2970                 goto nla_put_failure;
2971         r->rtm_type     = rt->rt_type;
2972         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2973         r->rtm_protocol = RTPROT_UNSPEC;
2974         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2975         if (rt->rt_flags & RTCF_NOTIFY)
2976                 r->rtm_flags |= RTM_F_NOTIFY;
2977
2978         if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2979                 goto nla_put_failure;
2980         if (rt->rt_key_src) {
2981                 r->rtm_src_len = 32;
2982                 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2983                         goto nla_put_failure;
2984         }
2985         if (rt->dst.dev &&
2986             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2987                 goto nla_put_failure;
2988 #ifdef CONFIG_IP_ROUTE_CLASSID
2989         if (rt->dst.tclassid &&
2990             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2991                 goto nla_put_failure;
2992 #endif
2993         if (rt_is_input_route(rt)) {
2994                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst))
2995                         goto nla_put_failure;
2996         } else if (rt->rt_src != rt->rt_key_src) {
2997                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2998                         goto nla_put_failure;
2999         }
3000         if (rt->rt_dst != rt->rt_gateway &&
3001             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
3002                 goto nla_put_failure;
3003
3004         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3005                 goto nla_put_failure;
3006
3007         if (rt->rt_mark &&
3008             nla_put_be32(skb, RTA_MARK, rt->rt_mark))
3009                 goto nla_put_failure;
3010
3011         error = rt->dst.error;
3012         if (peer) {
3013                 inet_peer_refcheck(rt->peer);
3014                 id = atomic_read(&peer->ip_id_count) & 0xffff;
3015                 if (peer->tcp_ts_stamp) {
3016                         ts = peer->tcp_ts;
3017                         tsage = get_seconds() - peer->tcp_ts_stamp;
3018                 }
3019                 expires = ACCESS_ONCE(peer->pmtu_expires);
3020                 if (expires) {
3021                         if (time_before(jiffies, expires))
3022                                 expires -= jiffies;
3023                         else
3024                                 expires = 0;
3025                 }
3026         }
3027
3028         if (rt_is_input_route(rt)) {
3029 #ifdef CONFIG_IP_MROUTE
3030                 __be32 dst = rt->rt_dst;
3031
3032                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3033                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3034                         int err = ipmr_get_route(net, skb,
3035                                                  rt->rt_src, rt->rt_dst,
3036                                                  r, nowait);
3037                         if (err <= 0) {
3038                                 if (!nowait) {
3039                                         if (err == 0)
3040                                                 return 0;
3041                                         goto nla_put_failure;
3042                                 } else {
3043                                         if (err == -EMSGSIZE)
3044                                                 goto nla_put_failure;
3045                                         error = err;
3046                                 }
3047                         }
3048                 } else
3049 #endif
3050                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
3051                                 goto nla_put_failure;
3052         }
3053
3054         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3055                                expires, error) < 0)
3056                 goto nla_put_failure;
3057
3058         return nlmsg_end(skb, nlh);
3059
3060 nla_put_failure:
3061         nlmsg_cancel(skb, nlh);
3062         return -EMSGSIZE;
3063 }
3064
3065 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3066 {
3067         struct net *net = sock_net(in_skb->sk);
3068         struct rtmsg *rtm;
3069         struct nlattr *tb[RTA_MAX+1];
3070         struct rtable *rt = NULL;
3071         __be32 dst = 0;
3072         __be32 src = 0;
3073         u32 iif;
3074         int err;
3075         int mark;
3076         struct sk_buff *skb;
3077
3078         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3079         if (err < 0)
3080                 goto errout;
3081
3082         rtm = nlmsg_data(nlh);
3083
3084         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3085         if (skb == NULL) {
3086                 err = -ENOBUFS;
3087                 goto errout;
3088         }
3089
3090         /* Reserve room for dummy headers, this skb can pass
3091            through good chunk of routing engine.
3092          */
3093         skb_reset_mac_header(skb);
3094         skb_reset_network_header(skb);
3095
3096         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3097         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3098         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3099
3100         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3101         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3102         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3103         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3104
3105         if (iif) {
3106                 struct net_device *dev;
3107
3108                 dev = __dev_get_by_index(net, iif);
3109                 if (dev == NULL) {
3110                         err = -ENODEV;
3111                         goto errout_free;
3112                 }
3113
3114                 skb->protocol   = htons(ETH_P_IP);
3115                 skb->dev        = dev;
3116                 skb->mark       = mark;
3117                 local_bh_disable();
3118                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3119                 local_bh_enable();
3120
3121                 rt = skb_rtable(skb);
3122                 if (err == 0 && rt->dst.error)
3123                         err = -rt->dst.error;
3124         } else {
3125                 struct flowi4 fl4 = {
3126                         .daddr = dst,
3127                         .saddr = src,
3128                         .flowi4_tos = rtm->rtm_tos,
3129                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3130                         .flowi4_mark = mark,
3131                 };
3132                 rt = ip_route_output_key(net, &fl4);
3133
3134                 err = 0;
3135                 if (IS_ERR(rt))
3136                         err = PTR_ERR(rt);
3137         }
3138
3139         if (err)
3140                 goto errout_free;
3141
3142         skb_dst_set(skb, &rt->dst);
3143         if (rtm->rtm_flags & RTM_F_NOTIFY)
3144                 rt->rt_flags |= RTCF_NOTIFY;
3145
3146         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3147                            RTM_NEWROUTE, 0, 0);
3148         if (err <= 0)
3149                 goto errout_free;
3150
3151         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3152 errout:
3153         return err;
3154
3155 errout_free:
3156         kfree_skb(skb);
3157         goto errout;
3158 }
3159
3160 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3161 {
3162         struct rtable *rt;
3163         int h, s_h;
3164         int idx, s_idx;
3165         struct net *net;
3166
3167         net = sock_net(skb->sk);
3168
3169         s_h = cb->args[0];
3170         if (s_h < 0)
3171                 s_h = 0;
3172         s_idx = idx = cb->args[1];
3173         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3174                 if (!rt_hash_table[h].chain)
3175                         continue;
3176                 rcu_read_lock_bh();
3177                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3178                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3179                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3180                                 continue;
3181                         if (rt_is_expired(rt))
3182                                 continue;
3183                         skb_dst_set_noref(skb, &rt->dst);
3184                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3185                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3186                                          1, NLM_F_MULTI) <= 0) {
3187                                 skb_dst_drop(skb);
3188                                 rcu_read_unlock_bh();
3189                                 goto done;
3190                         }
3191                         skb_dst_drop(skb);
3192                 }
3193                 rcu_read_unlock_bh();
3194         }
3195
3196 done:
3197         cb->args[0] = h;
3198         cb->args[1] = idx;
3199         return skb->len;
3200 }
3201
3202 void ip_rt_multicast_event(struct in_device *in_dev)
3203 {
3204         rt_cache_flush(dev_net(in_dev->dev), 0);
3205 }
3206
3207 #ifdef CONFIG_SYSCTL
3208 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3209                                         void __user *buffer,
3210                                         size_t *lenp, loff_t *ppos)
3211 {
3212         if (write) {
3213                 int flush_delay;
3214                 ctl_table ctl;
3215                 struct net *net;
3216
3217                 memcpy(&ctl, __ctl, sizeof(ctl));
3218                 ctl.data = &flush_delay;
3219                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3220
3221                 net = (struct net *)__ctl->extra1;
3222                 rt_cache_flush(net, flush_delay);
3223                 return 0;
3224         }
3225
3226         return -EINVAL;
3227 }
3228
3229 static ctl_table ipv4_route_table[] = {
3230         {
3231                 .procname       = "gc_thresh",
3232                 .data           = &ipv4_dst_ops.gc_thresh,
3233                 .maxlen         = sizeof(int),
3234                 .mode           = 0644,
3235                 .proc_handler   = proc_dointvec,
3236         },
3237         {
3238                 .procname       = "max_size",
3239                 .data           = &ip_rt_max_size,
3240                 .maxlen         = sizeof(int),
3241                 .mode           = 0644,
3242                 .proc_handler   = proc_dointvec,
3243         },
3244         {
3245                 /*  Deprecated. Use gc_min_interval_ms */
3246
3247                 .procname       = "gc_min_interval",
3248                 .data           = &ip_rt_gc_min_interval,
3249                 .maxlen         = sizeof(int),
3250                 .mode           = 0644,
3251                 .proc_handler   = proc_dointvec_jiffies,
3252         },
3253         {
3254                 .procname       = "gc_min_interval_ms",
3255                 .data           = &ip_rt_gc_min_interval,
3256                 .maxlen         = sizeof(int),
3257                 .mode           = 0644,
3258                 .proc_handler   = proc_dointvec_ms_jiffies,
3259         },
3260         {
3261                 .procname       = "gc_timeout",
3262                 .data           = &ip_rt_gc_timeout,
3263                 .maxlen         = sizeof(int),
3264                 .mode           = 0644,
3265                 .proc_handler   = proc_dointvec_jiffies,
3266         },
3267         {
3268                 .procname       = "gc_interval",
3269                 .data           = &ip_rt_gc_interval,
3270                 .maxlen         = sizeof(int),
3271                 .mode           = 0644,
3272                 .proc_handler   = proc_dointvec_jiffies,
3273         },
3274         {
3275                 .procname       = "redirect_load",
3276                 .data           = &ip_rt_redirect_load,
3277                 .maxlen         = sizeof(int),
3278                 .mode           = 0644,
3279                 .proc_handler   = proc_dointvec,
3280         },
3281         {
3282                 .procname       = "redirect_number",
3283                 .data           = &ip_rt_redirect_number,
3284                 .maxlen         = sizeof(int),
3285                 .mode           = 0644,
3286                 .proc_handler   = proc_dointvec,
3287         },
3288         {
3289                 .procname       = "redirect_silence",
3290                 .data           = &ip_rt_redirect_silence,
3291                 .maxlen         = sizeof(int),
3292                 .mode           = 0644,
3293                 .proc_handler   = proc_dointvec,
3294         },
3295         {
3296                 .procname       = "error_cost",
3297                 .data           = &ip_rt_error_cost,
3298                 .maxlen         = sizeof(int),
3299                 .mode           = 0644,
3300                 .proc_handler   = proc_dointvec,
3301         },
3302         {
3303                 .procname       = "error_burst",
3304                 .data           = &ip_rt_error_burst,
3305                 .maxlen         = sizeof(int),
3306                 .mode           = 0644,
3307                 .proc_handler   = proc_dointvec,
3308         },
3309         {
3310                 .procname       = "gc_elasticity",
3311                 .data           = &ip_rt_gc_elasticity,
3312                 .maxlen         = sizeof(int),
3313                 .mode           = 0644,
3314                 .proc_handler   = proc_dointvec,
3315         },
3316         {
3317                 .procname       = "mtu_expires",
3318                 .data           = &ip_rt_mtu_expires,
3319                 .maxlen         = sizeof(int),
3320                 .mode           = 0644,
3321                 .proc_handler   = proc_dointvec_jiffies,
3322         },
3323         {
3324                 .procname       = "min_pmtu",
3325                 .data           = &ip_rt_min_pmtu,
3326                 .maxlen         = sizeof(int),
3327                 .mode           = 0644,
3328                 .proc_handler   = proc_dointvec,
3329         },
3330         {
3331                 .procname       = "min_adv_mss",
3332                 .data           = &ip_rt_min_advmss,
3333                 .maxlen         = sizeof(int),
3334                 .mode           = 0644,
3335                 .proc_handler   = proc_dointvec,
3336         },
3337         { }
3338 };
3339
3340 static struct ctl_table ipv4_route_flush_table[] = {
3341         {
3342                 .procname       = "flush",
3343                 .maxlen         = sizeof(int),
3344                 .mode           = 0200,
3345                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3346         },
3347         { },
3348 };
3349
3350 static __net_init int sysctl_route_net_init(struct net *net)
3351 {
3352         struct ctl_table *tbl;
3353
3354         tbl = ipv4_route_flush_table;
3355         if (!net_eq(net, &init_net)) {
3356                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3357                 if (tbl == NULL)
3358                         goto err_dup;
3359         }
3360         tbl[0].extra1 = net;
3361
3362         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3363         if (net->ipv4.route_hdr == NULL)
3364                 goto err_reg;
3365         return 0;
3366
3367 err_reg:
3368         if (tbl != ipv4_route_flush_table)
3369                 kfree(tbl);
3370 err_dup:
3371         return -ENOMEM;
3372 }
3373
3374 static __net_exit void sysctl_route_net_exit(struct net *net)
3375 {
3376         struct ctl_table *tbl;
3377
3378         tbl = net->ipv4.route_hdr->ctl_table_arg;
3379         unregister_net_sysctl_table(net->ipv4.route_hdr);
3380         BUG_ON(tbl == ipv4_route_flush_table);
3381         kfree(tbl);
3382 }
3383
3384 static __net_initdata struct pernet_operations sysctl_route_ops = {
3385         .init = sysctl_route_net_init,
3386         .exit = sysctl_route_net_exit,
3387 };
3388 #endif
3389
3390 static __net_init int rt_genid_init(struct net *net)
3391 {
3392         get_random_bytes(&net->ipv4.rt_genid,
3393                          sizeof(net->ipv4.rt_genid));
3394         get_random_bytes(&net->ipv4.dev_addr_genid,
3395                          sizeof(net->ipv4.dev_addr_genid));
3396         return 0;
3397 }
3398
3399 static __net_initdata struct pernet_operations rt_genid_ops = {
3400         .init = rt_genid_init,
3401 };
3402
3403
3404 #ifdef CONFIG_IP_ROUTE_CLASSID
3405 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3406 #endif /* CONFIG_IP_ROUTE_CLASSID */
3407
3408 static __initdata unsigned long rhash_entries;
3409 static int __init set_rhash_entries(char *str)
3410 {
3411         ssize_t ret;
3412
3413         if (!str)
3414                 return 0;
3415
3416         ret = kstrtoul(str, 0, &rhash_entries);
3417         if (ret)
3418                 return 0;
3419
3420         return 1;
3421 }
3422 __setup("rhash_entries=", set_rhash_entries);
3423
3424 int __init ip_rt_init(void)
3425 {
3426         int rc = 0;
3427
3428 #ifdef CONFIG_IP_ROUTE_CLASSID
3429         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3430         if (!ip_rt_acct)
3431                 panic("IP: failed to allocate ip_rt_acct\n");
3432 #endif
3433
3434         ipv4_dst_ops.kmem_cachep =
3435                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3436                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3437
3438         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3439
3440         if (dst_entries_init(&ipv4_dst_ops) < 0)
3441                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3442
3443         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3444                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3445
3446         rt_hash_table = (struct rt_hash_bucket *)
3447                 alloc_large_system_hash("IP route cache",
3448                                         sizeof(struct rt_hash_bucket),
3449                                         rhash_entries,
3450                                         (totalram_pages >= 128 * 1024) ?
3451                                         15 : 17,
3452                                         0,
3453                                         &rt_hash_log,
3454                                         &rt_hash_mask,
3455                                         0,
3456                                         rhash_entries ? 0 : 512 * 1024);
3457         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3458         rt_hash_lock_init();
3459
3460         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3461         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3462
3463         devinet_init();
3464         ip_fib_init();
3465
3466         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3467         expires_ljiffies = jiffies;
3468         schedule_delayed_work(&expires_work,
3469                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3470
3471         if (ip_rt_proc_init())
3472                 pr_err("Unable to create route proc files\n");
3473 #ifdef CONFIG_XFRM
3474         xfrm_init();
3475         xfrm4_init(ip_rt_max_size);
3476 #endif
3477         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3478
3479 #ifdef CONFIG_SYSCTL
3480         register_pernet_subsys(&sysctl_route_ops);
3481 #endif
3482         register_pernet_subsys(&rt_genid_ops);
3483         return rc;
3484 }
3485
3486 #ifdef CONFIG_SYSCTL
3487 /*
3488  * We really need to sanitize the damn ipv4 init order, then all
3489  * this nonsense will go away.
3490  */
3491 void __init ip_static_sysctl_init(void)
3492 {
3493         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3494 }
3495 #endif