net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/bootmem.h>
  74 #include <linux/string.h>
  75 #include <linux/socket.h>
  76 #include <linux/sockios.h>
  77 #include <linux/errno.h>
  78 #include <linux/in.h>
  79 #include <linux/inet.h>
  80 #include <linux/netdevice.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/init.h>
  83 #include <linux/workqueue.h>
  84 #include <linux/skbuff.h>
  85 #include <linux/inetdevice.h>
  86 #include <linux/igmp.h>
  87 #include <linux/pkt_sched.h>
  88 #include <linux/mroute.h>
  89 #include <linux/netfilter_ipv4.h>
  90 #include <linux/random.h>
  91 #include <linux/jhash.h>
  92 #include <linux/rcupdate.h>
  93 #include <linux/times.h>
  94 #include <linux/slab.h>
  95 #include <linux/prefetch.h>
  96 #include <net/dst.h>
  97 #include <net/net_namespace.h>
  98 #include <net/protocol.h>
  99 #include <net/ip.h>
 100 #include <net/route.h>
 101 #include <net/inetpeer.h>
 102 #include <net/sock.h>
 103 #include <net/ip_fib.h>
 104 #include <net/arp.h>
 105 #include <net/tcp.h>
 106 #include <net/icmp.h>
 107 #include <net/xfrm.h>
 108 #include <net/netevent.h>
 109 #include <net/rtnetlink.h>
 110 #ifdef CONFIG_SYSCTL
 111 #include <linux/sysctl.h>
 112 #include <linux/kmemleak.h>
 113 #endif
 114 #include <net/secure_seq.h>
 115
 116 #define RT_FL_TOS(oldflp4) \
 117         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 118
 119 #define IP_MAX_MTU      0xFFF0
 120
 121 #define RT_GC_TIMEOUT (300*HZ)
 122
 123 static int ip_rt_max_size;
 124 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 126 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 127 static int ip_rt_redirect_number __read_mostly  = 9;
 128 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 130 static int ip_rt_error_cost __read_mostly       = HZ;
 131 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 132 static int ip_rt_gc_elasticity __read_mostly    = 8;
 133 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 134 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 135 static int ip_rt_min_advmss __read_mostly       = 256;
 136 static int rt_chain_length_max __read_mostly    = 20;
 137
 138 static struct delayed_work expires_work;
 139 static unsigned long expires_ljiffies;
 140
 141 /*
 142  *      Interface to generic destination cache.
 143  */
 144
 145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 146 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 147 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 148 static void              ipv4_dst_destroy(struct dst_entry *dst);
 149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 150 static void              ipv4_link_failure(struct sk_buff *skb);
 151 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 152                                            struct sk_buff *skb, u32 mtu);
 153 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 154                                         struct sk_buff *skb);
 155 static int rt_garbage_collect(struct dst_ops *ops);
 156
 157 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 158                             int how)
 159 {
 160 }
 161
 162 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 163 {
 164         WARN_ON(1);
 165         return NULL;
 166 }
 167
 168 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 169                                            struct sk_buff *skb,
 170                                            const void *daddr);
 171
 172 static struct dst_ops ipv4_dst_ops = {
 173         .family =               AF_INET,
 174         .protocol =             cpu_to_be16(ETH_P_IP),
 175         .gc =                   rt_garbage_collect,
 176         .check =                ipv4_dst_check,
 177         .default_advmss =       ipv4_default_advmss,
 178         .mtu =                  ipv4_mtu,
 179         .cow_metrics =          ipv4_cow_metrics,
 180         .destroy =              ipv4_dst_destroy,
 181         .ifdown =               ipv4_dst_ifdown,
 182         .negative_advice =      ipv4_negative_advice,
 183         .link_failure =         ipv4_link_failure,
 184         .update_pmtu =          ip_rt_update_pmtu,
 185         .redirect =             ip_do_redirect,
 186         .local_out =            __ip_local_out,
 187         .neigh_lookup =         ipv4_neigh_lookup,
 188 };
 189
 190 #define ECN_OR_COST(class)      TC_PRIO_##class
 191
 192 const __u8 ip_tos2prio[16] = {
 193         TC_PRIO_BESTEFFORT,
 194         ECN_OR_COST(BESTEFFORT),
 195         TC_PRIO_BESTEFFORT,
 196         ECN_OR_COST(BESTEFFORT),
 197         TC_PRIO_BULK,
 198         ECN_OR_COST(BULK),
 199         TC_PRIO_BULK,
 200         ECN_OR_COST(BULK),
 201         TC_PRIO_INTERACTIVE,
 202         ECN_OR_COST(INTERACTIVE),
 203         TC_PRIO_INTERACTIVE,
 204         ECN_OR_COST(INTERACTIVE),
 205         TC_PRIO_INTERACTIVE_BULK,
 206         ECN_OR_COST(INTERACTIVE_BULK),
 207         TC_PRIO_INTERACTIVE_BULK,
 208         ECN_OR_COST(INTERACTIVE_BULK)
 209 };
 210 EXPORT_SYMBOL(ip_tos2prio);
 211
 212 /*
 213  * Route cache.
 214  */
 215
 216 /* The locking scheme is rather straight forward:
 217  *
 218  * 1) Read-Copy Update protects the buckets of the central route hash.
 219  * 2) Only writers remove entries, and they hold the lock
 220  *    as they look at rtable reference counts.
 221  * 3) Only readers acquire references to rtable entries,
 222  *    they do so with atomic increments and with the
 223  *    lock held.
 224  */
 225
 226 struct rt_hash_bucket {
 227         struct rtable __rcu     *chain;
 228 };
 229
 230 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 231         defined(CONFIG_PROVE_LOCKING)
 232 /*
 233  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 234  * The size of this table is a power of two and depends on the number of CPUS.
 235  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 236  */
 237 #ifdef CONFIG_LOCKDEP
 238 # define RT_HASH_LOCK_SZ        256
 239 #else
 240 # if NR_CPUS >= 32
 241 #  define RT_HASH_LOCK_SZ       4096
 242 # elif NR_CPUS >= 16
 243 #  define RT_HASH_LOCK_SZ       2048
 244 # elif NR_CPUS >= 8
 245 #  define RT_HASH_LOCK_SZ       1024
 246 # elif NR_CPUS >= 4
 247 #  define RT_HASH_LOCK_SZ       512
 248 # else
 249 #  define RT_HASH_LOCK_SZ       256
 250 # endif
 251 #endif
 252
 253 static spinlock_t       *rt_hash_locks;
 254 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 255
 256 static __init void rt_hash_lock_init(void)
 257 {
 258         int i;
 259
 260         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 261                         GFP_KERNEL);
 262         if (!rt_hash_locks)
 263                 panic("IP: failed to allocate rt_hash_locks\n");
 264
 265         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 266                 spin_lock_init(&rt_hash_locks[i]);
 267 }
 268 #else
 269 # define rt_hash_lock_addr(slot) NULL
 270
 271 static inline void rt_hash_lock_init(void)
 272 {
 273 }
 274 #endif
 275
 276 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 277 static unsigned int             rt_hash_mask __read_mostly;
 278 static unsigned int             rt_hash_log  __read_mostly;
 279
 280 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 281 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 282
 283 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 284                                    int genid)
 285 {
 286         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 287                             idx, genid)
 288                 & rt_hash_mask;
 289 }
 290
 291 static inline int rt_genid(struct net *net)
 292 {
 293         return atomic_read(&net->ipv4.rt_genid);
 294 }
 295
 296 #ifdef CONFIG_PROC_FS
 297 struct rt_cache_iter_state {
 298         struct seq_net_private p;
 299         int bucket;
 300         int genid;
 301 };
 302
 303 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 304 {
 305         struct rt_cache_iter_state *st = seq->private;
 306         struct rtable *r = NULL;
 307
 308         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 309                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
 310                         continue;
 311                 rcu_read_lock_bh();
 312                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 313                 while (r) {
 314                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 315                             r->rt_genid == st->genid)
 316                                 return r;
 317                         r = rcu_dereference_bh(r->dst.rt_next);
 318                 }
 319                 rcu_read_unlock_bh();
 320         }
 321         return r;
 322 }
 323
 324 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 325                                           struct rtable *r)
 326 {
 327         struct rt_cache_iter_state *st = seq->private;
 328
 329         r = rcu_dereference_bh(r->dst.rt_next);
 330         while (!r) {
 331                 rcu_read_unlock_bh();
 332                 do {
 333                         if (--st->bucket < 0)
 334                                 return NULL;
 335                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
 336                 rcu_read_lock_bh();
 337                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 338         }
 339         return r;
 340 }
 341
 342 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 343                                         struct rtable *r)
 344 {
 345         struct rt_cache_iter_state *st = seq->private;
 346         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 347                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 348                         continue;
 349                 if (r->rt_genid == st->genid)
 350                         break;
 351         }
 352         return r;
 353 }
 354
 355 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 356 {
 357         struct rtable *r = rt_cache_get_first(seq);
 358
 359         if (r)
 360                 while (pos && (r = rt_cache_get_next(seq, r)))
 361                         --pos;
 362         return pos ? NULL : r;
 363 }
 364
 365 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 366 {
 367         struct rt_cache_iter_state *st = seq->private;
 368         if (*pos)
 369                 return rt_cache_get_idx(seq, *pos - 1);
 370         st->genid = rt_genid(seq_file_net(seq));
 371         return SEQ_START_TOKEN;
 372 }
 373
 374 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 375 {
 376         struct rtable *r;
 377
 378         if (v == SEQ_START_TOKEN)
 379                 r = rt_cache_get_first(seq);
 380         else
 381                 r = rt_cache_get_next(seq, v);
 382         ++*pos;
 383         return r;
 384 }
 385
 386 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 387 {
 388         if (v && v != SEQ_START_TOKEN)
 389                 rcu_read_unlock_bh();
 390 }
 391
 392 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 393 {
 394         if (v == SEQ_START_TOKEN)
 395                 seq_printf(seq, "%-127s\n",
 396                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 397                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 398                            "HHUptod\tSpecDst");
 399         else {
 400                 struct rtable *r = v;
 401                 int len;
 402
 403                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 404                            "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 405                            r->dst.dev ? r->dst.dev->name : "*",
 406                            (__force u32)r->rt_dst,
 407                            (__force u32)r->rt_gateway,
 408                            r->rt_flags, atomic_read(&r->dst.__refcnt),
 409                            r->dst.__use, 0, (__force u32)r->rt_src,
 410                            dst_metric_advmss(&r->dst) + 40,
 411                            dst_metric(&r->dst, RTAX_WINDOW), 0,
 412                            r->rt_key_tos,
 413                            -1, 0, 0, &len);
 414
 415                 seq_printf(seq, "%*s\n", 127 - len, "");
 416         }
 417         return 0;
 418 }
 419
 420 static const struct seq_operations rt_cache_seq_ops = {
 421         .start  = rt_cache_seq_start,
 422         .next   = rt_cache_seq_next,
 423         .stop   = rt_cache_seq_stop,
 424         .show   = rt_cache_seq_show,
 425 };
 426
 427 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 428 {
 429         return seq_open_net(inode, file, &rt_cache_seq_ops,
 430                         sizeof(struct rt_cache_iter_state));
 431 }
 432
 433 static const struct file_operations rt_cache_seq_fops = {
 434         .owner   = THIS_MODULE,
 435         .open    = rt_cache_seq_open,
 436         .read    = seq_read,
 437         .llseek  = seq_lseek,
 438         .release = seq_release_net,
 439 };
 440
 441
 442 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 443 {
 444         int cpu;
 445
 446         if (*pos == 0)
 447                 return SEQ_START_TOKEN;
 448
 449         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 450                 if (!cpu_possible(cpu))
 451                         continue;
 452                 *pos = cpu+1;
 453                 return &per_cpu(rt_cache_stat, cpu);
 454         }
 455         return NULL;
 456 }
 457
 458 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 459 {
 460         int cpu;
 461
 462         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 463                 if (!cpu_possible(cpu))
 464                         continue;
 465                 *pos = cpu+1;
 466                 return &per_cpu(rt_cache_stat, cpu);
 467         }
 468         return NULL;
 469
 470 }
 471
 472 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 473 {
 474
 475 }
 476
 477 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 478 {
 479         struct rt_cache_stat *st = v;
 480
 481         if (v == SEQ_START_TOKEN) {
 482                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 483                 return 0;
 484         }
 485
 486         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 487                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 488                    dst_entries_get_slow(&ipv4_dst_ops),
 489                    st->in_hit,
 490                    st->in_slow_tot,
 491                    st->in_slow_mc,
 492                    st->in_no_route,
 493                    st->in_brd,
 494                    st->in_martian_dst,
 495                    st->in_martian_src,
 496
 497                    st->out_hit,
 498                    st->out_slow_tot,
 499                    st->out_slow_mc,
 500
 501                    st->gc_total,
 502                    st->gc_ignored,
 503                    st->gc_goal_miss,
 504                    st->gc_dst_overflow,
 505                    st->in_hlist_search,
 506                    st->out_hlist_search
 507                 );
 508         return 0;
 509 }
 510
 511 static const struct seq_operations rt_cpu_seq_ops = {
 512         .start  = rt_cpu_seq_start,
 513         .next   = rt_cpu_seq_next,
 514         .stop   = rt_cpu_seq_stop,
 515         .show   = rt_cpu_seq_show,
 516 };
 517
 518
 519 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 520 {
 521         return seq_open(file, &rt_cpu_seq_ops);
 522 }
 523
 524 static const struct file_operations rt_cpu_seq_fops = {
 525         .owner   = THIS_MODULE,
 526         .open    = rt_cpu_seq_open,
 527         .read    = seq_read,
 528         .llseek  = seq_lseek,
 529         .release = seq_release,
 530 };
 531
 532 #ifdef CONFIG_IP_ROUTE_CLASSID
 533 static int rt_acct_proc_show(struct seq_file *m, void *v)
 534 {
 535         struct ip_rt_acct *dst, *src;
 536         unsigned int i, j;
 537
 538         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 539         if (!dst)
 540                 return -ENOMEM;
 541
 542         for_each_possible_cpu(i) {
 543                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 544                 for (j = 0; j < 256; j++) {
 545                         dst[j].o_bytes   += src[j].o_bytes;
 546                         dst[j].o_packets += src[j].o_packets;
 547                         dst[j].i_bytes   += src[j].i_bytes;
 548                         dst[j].i_packets += src[j].i_packets;
 549                 }
 550         }
 551
 552         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 553         kfree(dst);
 554         return 0;
 555 }
 556
 557 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 558 {
 559         return single_open(file, rt_acct_proc_show, NULL);
 560 }
 561
 562 static const struct file_operations rt_acct_proc_fops = {
 563         .owner          = THIS_MODULE,
 564         .open           = rt_acct_proc_open,
 565         .read           = seq_read,
 566         .llseek         = seq_lseek,
 567         .release        = single_release,
 568 };
 569 #endif
 570
 571 static int __net_init ip_rt_do_proc_init(struct net *net)
 572 {
 573         struct proc_dir_entry *pde;
 574
 575         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 576                         &rt_cache_seq_fops);
 577         if (!pde)
 578                 goto err1;
 579
 580         pde = proc_create("rt_cache", S_IRUGO,
 581                           net->proc_net_stat, &rt_cpu_seq_fops);
 582         if (!pde)
 583                 goto err2;
 584
 585 #ifdef CONFIG_IP_ROUTE_CLASSID
 586         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 587         if (!pde)
 588                 goto err3;
 589 #endif
 590         return 0;
 591
 592 #ifdef CONFIG_IP_ROUTE_CLASSID
 593 err3:
 594         remove_proc_entry("rt_cache", net->proc_net_stat);
 595 #endif
 596 err2:
 597         remove_proc_entry("rt_cache", net->proc_net);
 598 err1:
 599         return -ENOMEM;
 600 }
 601
 602 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 603 {
 604         remove_proc_entry("rt_cache", net->proc_net_stat);
 605         remove_proc_entry("rt_cache", net->proc_net);
 606 #ifdef CONFIG_IP_ROUTE_CLASSID
 607         remove_proc_entry("rt_acct", net->proc_net);
 608 #endif
 609 }
 610
 611 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 612         .init = ip_rt_do_proc_init,
 613         .exit = ip_rt_do_proc_exit,
 614 };
 615
 616 static int __init ip_rt_proc_init(void)
 617 {
 618         return register_pernet_subsys(&ip_rt_proc_ops);
 619 }
 620
 621 #else
 622 static inline int ip_rt_proc_init(void)
 623 {
 624         return 0;
 625 }
 626 #endif /* CONFIG_PROC_FS */
 627
 628 static inline void rt_free(struct rtable *rt)
 629 {
 630         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 631 }
 632
 633 static inline void rt_drop(struct rtable *rt)
 634 {
 635         ip_rt_put(rt);
 636         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 637 }
 638
 639 static inline int rt_fast_clean(struct rtable *rth)
 640 {
 641         /* Kill broadcast/multicast entries very aggresively, if they
 642            collide in hash table with more useful entries */
 643         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 644                 rt_is_input_route(rth) && rth->dst.rt_next;
 645 }
 646
 647 static inline int rt_valuable(struct rtable *rth)
 648 {
 649         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 650                 rth->dst.expires;
 651 }
 652
 653 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 654 {
 655         unsigned long age;
 656         int ret = 0;
 657
 658         if (atomic_read(&rth->dst.__refcnt))
 659                 goto out;
 660
 661         age = jiffies - rth->dst.lastuse;
 662         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 663             (age <= tmo2 && rt_valuable(rth)))
 664                 goto out;
 665         ret = 1;
 666 out:    return ret;
 667 }
 668
 669 /* Bits of score are:
 670  * 31: very valuable
 671  * 30: not quite useless
 672  * 29..0: usage counter
 673  */
 674 static inline u32 rt_score(struct rtable *rt)
 675 {
 676         u32 score = jiffies - rt->dst.lastuse;
 677
 678         score = ~score & ~(3<<30);
 679
 680         if (rt_valuable(rt))
 681                 score |= (1<<31);
 682
 683         if (rt_is_output_route(rt) ||
 684             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 685                 score |= (1<<30);
 686
 687         return score;
 688 }
 689
 690 static inline bool rt_caching(const struct net *net)
 691 {
 692         return net->ipv4.current_rt_cache_rebuild_count <=
 693                 net->ipv4.sysctl_rt_cache_rebuild_count;
 694 }
 695
 696 static inline bool compare_hash_inputs(const struct rtable *rt1,
 697                                        const struct rtable *rt2)
 698 {
 699         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 700                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 701                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
 702 }
 703
 704 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 705 {
 706         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 707                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 708                 (rt1->rt_mark ^ rt2->rt_mark) |
 709                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
 710                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
 711                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
 712 }
 713
 714 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 715 {
 716         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 717 }
 718
 719 static inline int rt_is_expired(struct rtable *rth)
 720 {
 721         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 722 }
 723
 724 /*
 725  * Perform a full scan of hash table and free all entries.
 726  * Can be called by a softirq or a process.
 727  * In the later case, we want to be reschedule if necessary
 728  */
 729 static void rt_do_flush(struct net *net, int process_context)
 730 {
 731         unsigned int i;
 732         struct rtable *rth, *next;
 733
 734         for (i = 0; i <= rt_hash_mask; i++) {
 735                 struct rtable __rcu **pprev;
 736                 struct rtable *list;
 737
 738                 if (process_context && need_resched())
 739                         cond_resched();
 740                 rth = rcu_access_pointer(rt_hash_table[i].chain);
 741                 if (!rth)
 742                         continue;
 743
 744                 spin_lock_bh(rt_hash_lock_addr(i));
 745
 746                 list = NULL;
 747                 pprev = &rt_hash_table[i].chain;
 748                 rth = rcu_dereference_protected(*pprev,
 749                         lockdep_is_held(rt_hash_lock_addr(i)));
 750
 751                 while (rth) {
 752                         next = rcu_dereference_protected(rth->dst.rt_next,
 753                                 lockdep_is_held(rt_hash_lock_addr(i)));
 754
 755                         if (!net ||
 756                             net_eq(dev_net(rth->dst.dev), net)) {
 757                                 rcu_assign_pointer(*pprev, next);
 758                                 rcu_assign_pointer(rth->dst.rt_next, list);
 759                                 list = rth;
 760                         } else {
 761                                 pprev = &rth->dst.rt_next;
 762                         }
 763                         rth = next;
 764                 }
 765
 766                 spin_unlock_bh(rt_hash_lock_addr(i));
 767
 768                 for (; list; list = next) {
 769                         next = rcu_dereference_protected(list->dst.rt_next, 1);
 770                         rt_free(list);
 771                 }
 772         }
 773 }
 774
 775 /*
 776  * While freeing expired entries, we compute average chain length
 777  * and standard deviation, using fixed-point arithmetic.
 778  * This to have an estimation of rt_chain_length_max
 779  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 780  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 781  */
 782
 783 #define FRACT_BITS 3
 784 #define ONE (1UL << FRACT_BITS)
 785
 786 /*
 787  * Given a hash chain and an item in this hash chain,
 788  * find if a previous entry has the same hash_inputs
 789  * (but differs on tos, mark or oif)
 790  * Returns 0 if an alias is found.
 791  * Returns ONE if rth has no alias before itself.
 792  */
 793 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 794 {
 795         const struct rtable *aux = head;
 796
 797         while (aux != rth) {
 798                 if (compare_hash_inputs(aux, rth))
 799                         return 0;
 800                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 801         }
 802         return ONE;
 803 }
 804
 805 static void rt_check_expire(void)
 806 {
 807         static unsigned int rover;
 808         unsigned int i = rover, goal;
 809         struct rtable *rth;
 810         struct rtable __rcu **rthp;
 811         unsigned long samples = 0;
 812         unsigned long sum = 0, sum2 = 0;
 813         unsigned long delta;
 814         u64 mult;
 815
 816         delta = jiffies - expires_ljiffies;
 817         expires_ljiffies = jiffies;
 818         mult = ((u64)delta) << rt_hash_log;
 819         if (ip_rt_gc_timeout > 1)
 820                 do_div(mult, ip_rt_gc_timeout);
 821         goal = (unsigned int)mult;
 822         if (goal > rt_hash_mask)
 823                 goal = rt_hash_mask + 1;
 824         for (; goal > 0; goal--) {
 825                 unsigned long tmo = ip_rt_gc_timeout;
 826                 unsigned long length;
 827
 828                 i = (i + 1) & rt_hash_mask;
 829                 rthp = &rt_hash_table[i].chain;
 830
 831                 if (need_resched())
 832                         cond_resched();
 833
 834                 samples++;
 835
 836                 if (rcu_dereference_raw(*rthp) == NULL)
 837                         continue;
 838                 length = 0;
 839                 spin_lock_bh(rt_hash_lock_addr(i));
 840                 while ((rth = rcu_dereference_protected(*rthp,
 841                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
 842                         prefetch(rth->dst.rt_next);
 843                         if (rt_is_expired(rth) ||
 844                             rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 845                                 *rthp = rth->dst.rt_next;
 846                                 rt_free(rth);
 847                                 continue;
 848                         }
 849
 850                         /* We only count entries on a chain with equal
 851                          * hash inputs once so that entries for
 852                          * different QOS levels, and other non-hash
 853                          * input attributes don't unfairly skew the
 854                          * length computation
 855                          */
 856                         tmo >>= 1;
 857                         rthp = &rth->dst.rt_next;
 858                         length += has_noalias(rt_hash_table[i].chain, rth);
 859                 }
 860                 spin_unlock_bh(rt_hash_lock_addr(i));
 861                 sum += length;
 862                 sum2 += length*length;
 863         }
 864         if (samples) {
 865                 unsigned long avg = sum / samples;
 866                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 867                 rt_chain_length_max = max_t(unsigned long,
 868                                         ip_rt_gc_elasticity,
 869                                         (avg + 4*sd) >> FRACT_BITS);
 870         }
 871         rover = i;
 872 }
 873
 874 /*
 875  * rt_worker_func() is run in process context.
 876  * we call rt_check_expire() to scan part of the hash table
 877  */
 878 static void rt_worker_func(struct work_struct *work)
 879 {
 880         rt_check_expire();
 881         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 882 }
 883
 884 /*
 885  * Perturbation of rt_genid by a small quantity [1..256]
 886  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 887  * many times (2^24) without giving recent rt_genid.
 888  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 889  */
 890 static void rt_cache_invalidate(struct net *net)
 891 {
 892         unsigned char shuffle;
 893
 894         get_random_bytes(&shuffle, sizeof(shuffle));
 895         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 896 }
 897
 898 /*
 899  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 900  * delay >= 0 : invalidate & flush cache (can be long)
 901  */
 902 void rt_cache_flush(struct net *net, int delay)
 903 {
 904         rt_cache_invalidate(net);
 905         if (delay >= 0)
 906                 rt_do_flush(net, !in_softirq());
 907 }
 908
 909 /* Flush previous cache invalidated entries from the cache */
 910 void rt_cache_flush_batch(struct net *net)
 911 {
 912         rt_do_flush(net, !in_softirq());
 913 }
 914
 915 static void rt_emergency_hash_rebuild(struct net *net)
 916 {
 917         net_warn_ratelimited("Route hash chain too long!\n");
 918         rt_cache_invalidate(net);
 919 }
 920
 921 /*
 922    Short description of GC goals.
 923
 924    We want to build algorithm, which will keep routing cache
 925    at some equilibrium point, when number of aged off entries
 926    is kept approximately equal to newly generated ones.
 927
 928    Current expiration strength is variable "expire".
 929    We try to adjust it dynamically, so that if networking
 930    is idle expires is large enough to keep enough of warm entries,
 931    and when load increases it reduces to limit cache size.
 932  */
 933
 934 static int rt_garbage_collect(struct dst_ops *ops)
 935 {
 936         static unsigned long expire = RT_GC_TIMEOUT;
 937         static unsigned long last_gc;
 938         static int rover;
 939         static int equilibrium;
 940         struct rtable *rth;
 941         struct rtable __rcu **rthp;
 942         unsigned long now = jiffies;
 943         int goal;
 944         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 945
 946         /*
 947          * Garbage collection is pretty expensive,
 948          * do not make it too frequently.
 949          */
 950
 951         RT_CACHE_STAT_INC(gc_total);
 952
 953         if (now - last_gc < ip_rt_gc_min_interval &&
 954             entries < ip_rt_max_size) {
 955                 RT_CACHE_STAT_INC(gc_ignored);
 956                 goto out;
 957         }
 958
 959         entries = dst_entries_get_slow(&ipv4_dst_ops);
 960         /* Calculate number of entries, which we want to expire now. */
 961         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
 962         if (goal <= 0) {
 963                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 964                         equilibrium = ipv4_dst_ops.gc_thresh;
 965                 goal = entries - equilibrium;
 966                 if (goal > 0) {
 967                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 968                         goal = entries - equilibrium;
 969                 }
 970         } else {
 971                 /* We are in dangerous area. Try to reduce cache really
 972                  * aggressively.
 973                  */
 974                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 975                 equilibrium = entries - goal;
 976         }
 977
 978         if (now - last_gc >= ip_rt_gc_min_interval)
 979                 last_gc = now;
 980
 981         if (goal <= 0) {
 982                 equilibrium += goal;
 983                 goto work_done;
 984         }
 985
 986         do {
 987                 int i, k;
 988
 989                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 990                         unsigned long tmo = expire;
 991
 992                         k = (k + 1) & rt_hash_mask;
 993                         rthp = &rt_hash_table[k].chain;
 994                         spin_lock_bh(rt_hash_lock_addr(k));
 995                         while ((rth = rcu_dereference_protected(*rthp,
 996                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
 997                                 if (!rt_is_expired(rth) &&
 998                                         !rt_may_expire(rth, tmo, expire)) {
 999                                         tmo >>= 1;
1000                                         rthp = &rth->dst.rt_next;
1001                                         continue;
1002                                 }
1003                                 *rthp = rth->dst.rt_next;
1004                                 rt_free(rth);
1005                                 goal--;
1006                         }
1007                         spin_unlock_bh(rt_hash_lock_addr(k));
1008                         if (goal <= 0)
1009                                 break;
1010                 }
1011                 rover = k;
1012
1013                 if (goal <= 0)
1014                         goto work_done;
1015
1016                 /* Goal is not achieved. We stop process if:
1017
1018                    - if expire reduced to zero. Otherwise, expire is halfed.
1019                    - if table is not full.
1020                    - if we are called from interrupt.
1021                    - jiffies check is just fallback/debug loop breaker.
1022                      We will not spin here for long time in any case.
1023                  */
1024
1025                 RT_CACHE_STAT_INC(gc_goal_miss);
1026
1027                 if (expire == 0)
1028                         break;
1029
1030                 expire >>= 1;
1031
1032                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1033                         goto out;
1034         } while (!in_softirq() && time_before_eq(jiffies, now));
1035
1036         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1037                 goto out;
1038         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1039                 goto out;
1040         net_warn_ratelimited("dst cache overflow\n");
1041         RT_CACHE_STAT_INC(gc_dst_overflow);
1042         return 1;
1043
1044 work_done:
1045         expire += ip_rt_gc_min_interval;
1046         if (expire > ip_rt_gc_timeout ||
1047             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1048             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1049                 expire = ip_rt_gc_timeout;
1050 out:    return 0;
1051 }
1052
1053 /*
1054  * Returns number of entries in a hash chain that have different hash_inputs
1055  */
1056 static int slow_chain_length(const struct rtable *head)
1057 {
1058         int length = 0;
1059         const struct rtable *rth = head;
1060
1061         while (rth) {
1062                 length += has_noalias(head, rth);
1063                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1064         }
1065         return length >> FRACT_BITS;
1066 }
1067
1068 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
1069                                            struct sk_buff *skb,
1070                                            const void *daddr)
1071 {
1072         struct net_device *dev = dst->dev;
1073         const __be32 *pkey = daddr;
1074         const struct rtable *rt;
1075         struct neighbour *n;
1076
1077         rt = (const struct rtable *) dst;
1078         if (rt->rt_gateway)
1079                 pkey = (const __be32 *) &rt->rt_gateway;
1080         else if (skb)
1081                 pkey = &ip_hdr(skb)->daddr;
1082
1083         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1084         if (n)
1085                 return n;
1086         return neigh_create(&arp_tbl, pkey, dev);
1087 }
1088
1089 static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1090                                      struct sk_buff *skb, int ifindex)
1091 {
1092         struct rtable   *rth, *cand;
1093         struct rtable __rcu **rthp, **candp;
1094         unsigned long   now;
1095         u32             min_score;
1096         int             chain_length;
1097
1098 restart:
1099         chain_length = 0;
1100         min_score = ~(u32)0;
1101         cand = NULL;
1102         candp = NULL;
1103         now = jiffies;
1104
1105         if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
1106                 /*
1107                  * If we're not caching, just tell the caller we
1108                  * were successful and don't touch the route.  The
1109                  * caller hold the sole reference to the cache entry, and
1110                  * it will be released when the caller is done with it.
1111                  * If we drop it here, the callers have no way to resolve routes
1112                  * when we're not caching.  Instead, just point *rp at rt, so
1113                  * the caller gets a single use out of the route
1114                  * Note that we do rt_free on this new route entry, so that
1115                  * once its refcount hits zero, we are still able to reap it
1116                  * (Thanks Alexey)
1117                  * Note: To avoid expensive rcu stuff for this uncached dst,
1118                  * we set DST_NOCACHE so that dst_release() can free dst without
1119                  * waiting a grace period.
1120                  */
1121
1122                 rt->dst.flags |= DST_NOCACHE;
1123                 goto skip_hashing;
1124         }
1125
1126         rthp = &rt_hash_table[hash].chain;
1127
1128         spin_lock_bh(rt_hash_lock_addr(hash));
1129         while ((rth = rcu_dereference_protected(*rthp,
1130                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1131                 if (rt_is_expired(rth)) {
1132                         *rthp = rth->dst.rt_next;
1133                         rt_free(rth);
1134                         continue;
1135                 }
1136                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1137                         /* Put it first */
1138                         *rthp = rth->dst.rt_next;
1139                         /*
1140                          * Since lookup is lockfree, the deletion
1141                          * must be visible to another weakly ordered CPU before
1142                          * the insertion at the start of the hash chain.
1143                          */
1144                         rcu_assign_pointer(rth->dst.rt_next,
1145                                            rt_hash_table[hash].chain);
1146                         /*
1147                          * Since lookup is lockfree, the update writes
1148                          * must be ordered for consistency on SMP.
1149                          */
1150                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1151
1152                         dst_use(&rth->dst, now);
1153                         spin_unlock_bh(rt_hash_lock_addr(hash));
1154
1155                         rt_drop(rt);
1156                         if (skb)
1157                                 skb_dst_set(skb, &rth->dst);
1158                         return rth;
1159                 }
1160
1161                 if (!atomic_read(&rth->dst.__refcnt)) {
1162                         u32 score = rt_score(rth);
1163
1164                         if (score <= min_score) {
1165                                 cand = rth;
1166                                 candp = rthp;
1167                                 min_score = score;
1168                         }
1169                 }
1170
1171                 chain_length++;
1172
1173                 rthp = &rth->dst.rt_next;
1174         }
1175
1176         if (cand) {
1177                 /* ip_rt_gc_elasticity used to be average length of chain
1178                  * length, when exceeded gc becomes really aggressive.
1179                  *
1180                  * The second limit is less certain. At the moment it allows
1181                  * only 2 entries per bucket. We will see.
1182                  */
1183                 if (chain_length > ip_rt_gc_elasticity) {
1184                         *candp = cand->dst.rt_next;
1185                         rt_free(cand);
1186                 }
1187         } else {
1188                 if (chain_length > rt_chain_length_max &&
1189                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1190                         struct net *net = dev_net(rt->dst.dev);
1191                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1192                         if (!rt_caching(net)) {
1193                                 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1194                                         rt->dst.dev->name, num);
1195                         }
1196                         rt_emergency_hash_rebuild(net);
1197                         spin_unlock_bh(rt_hash_lock_addr(hash));
1198
1199                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1200                                         ifindex, rt_genid(net));
1201                         goto restart;
1202                 }
1203         }
1204
1205         rt->dst.rt_next = rt_hash_table[hash].chain;
1206
1207         /*
1208          * Since lookup is lockfree, we must make sure
1209          * previous writes to rt are committed to memory
1210          * before making rt visible to other CPUS.
1211          */
1212         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1213
1214         spin_unlock_bh(rt_hash_lock_addr(hash));
1215
1216 skip_hashing:
1217         if (skb)
1218                 skb_dst_set(skb, &rt->dst);
1219         return rt;
1220 }
1221
1222 /*
1223  * Peer allocation may fail only in serious out-of-memory conditions.  However
1224  * we still can generate some output.
1225  * Random ID selection looks a bit dangerous because we have no chances to
1226  * select ID being unique in a reasonable period of time.
1227  * But broken packet identifier may be better than no packet at all.
1228  */
1229 static void ip_select_fb_ident(struct iphdr *iph)
1230 {
1231         static DEFINE_SPINLOCK(ip_fb_id_lock);
1232         static u32 ip_fallback_id;
1233         u32 salt;
1234
1235         spin_lock_bh(&ip_fb_id_lock);
1236         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1237         iph->id = htons(salt & 0xFFFF);
1238         ip_fallback_id = salt;
1239         spin_unlock_bh(&ip_fb_id_lock);
1240 }
1241
1242 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1243 {
1244         struct net *net = dev_net(dst->dev);
1245         struct inet_peer *peer;
1246
1247         peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
1248         if (peer) {
1249                 iph->id = htons(inet_getid(peer, more));
1250                 inet_putpeer(peer);
1251                 return;
1252         }
1253
1254         ip_select_fb_ident(iph);
1255 }
1256 EXPORT_SYMBOL(__ip_select_ident);
1257
1258 static void rt_del(unsigned int hash, struct rtable *rt)
1259 {
1260         struct rtable __rcu **rthp;
1261         struct rtable *aux;
1262
1263         rthp = &rt_hash_table[hash].chain;
1264         spin_lock_bh(rt_hash_lock_addr(hash));
1265         ip_rt_put(rt);
1266         while ((aux = rcu_dereference_protected(*rthp,
1267                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1268                 if (aux == rt || rt_is_expired(aux)) {
1269                         *rthp = aux->dst.rt_next;
1270                         rt_free(aux);
1271                         continue;
1272                 }
1273                 rthp = &aux->dst.rt_next;
1274         }
1275         spin_unlock_bh(rt_hash_lock_addr(hash));
1276 }
1277
1278 static void __build_flow_key(struct flowi4 *fl4, struct sock *sk,
1279                              const struct iphdr *iph,
1280                              int oif, u8 tos,
1281                              u8 prot, u32 mark, int flow_flags)
1282 {
1283         if (sk) {
1284                 const struct inet_sock *inet = inet_sk(sk);
1285
1286                 oif = sk->sk_bound_dev_if;
1287                 mark = sk->sk_mark;
1288                 tos = RT_CONN_FLAGS(sk);
1289                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
1290         }
1291         flowi4_init_output(fl4, oif, mark, tos,
1292                            RT_SCOPE_UNIVERSE, prot,
1293                            flow_flags,
1294                            iph->daddr, iph->saddr, 0, 0);
1295 }
1296
1297 static void build_skb_flow_key(struct flowi4 *fl4, struct sk_buff *skb, struct sock *sk)
1298 {
1299         const struct iphdr *iph = ip_hdr(skb);
1300         int oif = skb->dev->ifindex;
1301         u8 tos = RT_TOS(iph->tos);
1302         u8 prot = iph->protocol;
1303         u32 mark = skb->mark;
1304
1305         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
1306 }
1307
1308 static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk)
1309 {
1310         const struct inet_sock *inet = inet_sk(sk);
1311         struct ip_options_rcu *inet_opt;
1312         __be32 daddr = inet->inet_daddr;
1313
1314         rcu_read_lock();
1315         inet_opt = rcu_dereference(inet->inet_opt);
1316         if (inet_opt && inet_opt->opt.srr)
1317                 daddr = inet_opt->opt.faddr;
1318         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
1319                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
1320                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1321                            inet_sk_flowi_flags(sk),
1322                            daddr, inet->inet_saddr, 0, 0);
1323         rcu_read_unlock();
1324 }
1325
1326 static void ip_rt_build_flow_key(struct flowi4 *fl4, struct sock *sk,
1327                                  struct sk_buff *skb)
1328 {
1329         if (skb)
1330                 build_skb_flow_key(fl4, skb, sk);
1331         else
1332                 build_sk_flow_key(fl4, sk);
1333 }
1334
1335 static DEFINE_SPINLOCK(fnhe_lock);
1336
1337 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash, __be32 daddr)
1338 {
1339         struct fib_nh_exception *fnhe, *oldest;
1340
1341         oldest = rcu_dereference(hash->chain);
1342         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
1343              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1344                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
1345                         oldest = fnhe;
1346         }
1347         return oldest;
1348 }
1349
1350 static inline u32 fnhe_hashfun(__be32 daddr)
1351 {
1352         u32 hval;
1353
1354         hval = (__force u32) daddr;
1355         hval ^= (hval >> 11) ^ (hval >> 22);
1356
1357         return hval & (FNHE_HASH_SIZE - 1);
1358 }
1359
1360 static struct fib_nh_exception *find_or_create_fnhe(struct fib_nh *nh, __be32 daddr)
1361 {
1362         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1363         struct fib_nh_exception *fnhe;
1364         int depth;
1365         u32 hval;
1366
1367         if (!hash) {
1368                 hash = nh->nh_exceptions = kzalloc(FNHE_HASH_SIZE * sizeof(*hash),
1369                                                    GFP_ATOMIC);
1370                 if (!hash)
1371                         return NULL;
1372         }
1373
1374         hval = fnhe_hashfun(daddr);
1375         hash += hval;
1376
1377         depth = 0;
1378         for (fnhe = rcu_dereference(hash->chain); fnhe;
1379              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1380                 if (fnhe->fnhe_daddr == daddr)
1381                         goto out;
1382                 depth++;
1383         }
1384
1385         if (depth > FNHE_RECLAIM_DEPTH) {
1386                 fnhe = fnhe_oldest(hash + hval, daddr);
1387                 goto out_daddr;
1388         }
1389         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
1390         if (!fnhe)
1391                 return NULL;
1392
1393         fnhe->fnhe_next = hash->chain;
1394         rcu_assign_pointer(hash->chain, fnhe);
1395
1396 out_daddr:
1397         fnhe->fnhe_daddr = daddr;
1398 out:
1399         fnhe->fnhe_stamp = jiffies;
1400         return fnhe;
1401 }
1402
1403 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
1404 {
1405         __be32 new_gw = icmp_hdr(skb)->un.gateway;
1406         __be32 old_gw = ip_hdr(skb)->saddr;
1407         struct net_device *dev = skb->dev;
1408         struct in_device *in_dev;
1409         struct fib_result res;
1410         struct neighbour *n;
1411         struct net *net;
1412
1413         switch (icmp_hdr(skb)->code & 7) {
1414         case ICMP_REDIR_NET:
1415         case ICMP_REDIR_NETTOS:
1416         case ICMP_REDIR_HOST:
1417         case ICMP_REDIR_HOSTTOS:
1418                 break;
1419
1420         default:
1421                 return;
1422         }
1423
1424         if (rt->rt_gateway != old_gw)
1425                 return;
1426
1427         in_dev = __in_dev_get_rcu(dev);
1428         if (!in_dev)
1429                 return;
1430
1431         net = dev_net(dev);
1432         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1433             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1434             ipv4_is_zeronet(new_gw))
1435                 goto reject_redirect;
1436
1437         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1438                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1439                         goto reject_redirect;
1440                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1441                         goto reject_redirect;
1442         } else {
1443                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1444                         goto reject_redirect;
1445         }
1446
1447         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
1448         if (n) {
1449                 if (!(n->nud_state & NUD_VALID)) {
1450                         neigh_event_send(n, NULL);
1451                 } else {
1452                         if (fib_lookup(net, fl4, &res) == 0) {
1453                                 struct fib_nh *nh = &FIB_RES_NH(res);
1454                                 struct fib_nh_exception *fnhe;
1455
1456                                 spin_lock_bh(&fnhe_lock);
1457                                 fnhe = find_or_create_fnhe(nh, fl4->daddr);
1458                                 if (fnhe)
1459                                         fnhe->fnhe_gw = new_gw;
1460                                 spin_unlock_bh(&fnhe_lock);
1461                         }
1462                         rt->rt_gateway = new_gw;
1463                         rt->rt_flags |= RTCF_REDIRECTED;
1464                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1465                 }
1466                 neigh_release(n);
1467         }
1468         return;
1469
1470 reject_redirect:
1471 #ifdef CONFIG_IP_ROUTE_VERBOSE
1472         if (IN_DEV_LOG_MARTIANS(in_dev)) {
1473                 const struct iphdr *iph = (const struct iphdr *) skb->data;
1474                 __be32 daddr = iph->daddr;
1475                 __be32 saddr = iph->saddr;
1476
1477                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1478                                      "  Advised path = %pI4 -> %pI4\n",
1479                                      &old_gw, dev->name, &new_gw,
1480                                      &saddr, &daddr);
1481         }
1482 #endif
1483         ;
1484 }
1485
1486 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
1487 {
1488         struct rtable *rt;
1489         struct flowi4 fl4;
1490
1491         rt = (struct rtable *) dst;
1492
1493         ip_rt_build_flow_key(&fl4, sk, skb);
1494         __ip_do_redirect(rt, skb, &fl4);
1495 }
1496
1497 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1498 {
1499         struct rtable *rt = (struct rtable *)dst;
1500         struct dst_entry *ret = dst;
1501
1502         if (rt) {
1503                 if (dst->obsolete > 0) {
1504                         ip_rt_put(rt);
1505                         ret = NULL;
1506                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1507                            rt->dst.expires) {
1508                         unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1509                                                 rt->rt_oif,
1510                                                 rt_genid(dev_net(dst->dev)));
1511                         rt_del(hash, rt);
1512                         ret = NULL;
1513                 }
1514         }
1515         return ret;
1516 }
1517
1518 /*
1519  * Algorithm:
1520  *      1. The first ip_rt_redirect_number redirects are sent
1521  *         with exponential backoff, then we stop sending them at all,
1522  *         assuming that the host ignores our redirects.
1523  *      2. If we did not see packets requiring redirects
1524  *         during ip_rt_redirect_silence, we assume that the host
1525  *         forgot redirected route and start to send redirects again.
1526  *
1527  * This algorithm is much cheaper and more intelligent than dumb load limiting
1528  * in icmp.c.
1529  *
1530  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1531  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1532  */
1533
1534 void ip_rt_send_redirect(struct sk_buff *skb)
1535 {
1536         struct rtable *rt = skb_rtable(skb);
1537         struct in_device *in_dev;
1538         struct inet_peer *peer;
1539         struct net *net;
1540         int log_martians;
1541
1542         rcu_read_lock();
1543         in_dev = __in_dev_get_rcu(rt->dst.dev);
1544         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1545                 rcu_read_unlock();
1546                 return;
1547         }
1548         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1549         rcu_read_unlock();
1550
1551         net = dev_net(rt->dst.dev);
1552         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1553         if (!peer) {
1554                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1555                 return;
1556         }
1557
1558         /* No redirected packets during ip_rt_redirect_silence;
1559          * reset the algorithm.
1560          */
1561         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1562                 peer->rate_tokens = 0;
1563
1564         /* Too many ignored redirects; do not send anything
1565          * set dst.rate_last to the last seen redirected packet.
1566          */
1567         if (peer->rate_tokens >= ip_rt_redirect_number) {
1568                 peer->rate_last = jiffies;
1569                 goto out_put_peer;
1570         }
1571
1572         /* Check for load limit; set rate_last to the latest sent
1573          * redirect.
1574          */
1575         if (peer->rate_tokens == 0 ||
1576             time_after(jiffies,
1577                        (peer->rate_last +
1578                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1579                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1580                 peer->rate_last = jiffies;
1581                 ++peer->rate_tokens;
1582 #ifdef CONFIG_IP_ROUTE_VERBOSE
1583                 if (log_martians &&
1584                     peer->rate_tokens == ip_rt_redirect_number)
1585                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1586                                              &ip_hdr(skb)->saddr, rt->rt_iif,
1587                                              &rt->rt_dst, &rt->rt_gateway);
1588 #endif
1589         }
1590 out_put_peer:
1591         inet_putpeer(peer);
1592 }
1593
1594 static int ip_error(struct sk_buff *skb)
1595 {
1596         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
1597         struct rtable *rt = skb_rtable(skb);
1598         struct inet_peer *peer;
1599         unsigned long now;
1600         struct net *net;
1601         bool send;
1602         int code;
1603
1604         net = dev_net(rt->dst.dev);
1605         if (!IN_DEV_FORWARD(in_dev)) {
1606                 switch (rt->dst.error) {
1607                 case EHOSTUNREACH:
1608                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
1609                         break;
1610
1611                 case ENETUNREACH:
1612                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1613                         break;
1614                 }
1615                 goto out;
1616         }
1617
1618         switch (rt->dst.error) {
1619         case EINVAL:
1620         default:
1621                 goto out;
1622         case EHOSTUNREACH:
1623                 code = ICMP_HOST_UNREACH;
1624                 break;
1625         case ENETUNREACH:
1626                 code = ICMP_NET_UNREACH;
1627                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1628                 break;
1629         case EACCES:
1630                 code = ICMP_PKT_FILTERED;
1631                 break;
1632         }
1633
1634         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1635
1636         send = true;
1637         if (peer) {
1638                 now = jiffies;
1639                 peer->rate_tokens += now - peer->rate_last;
1640                 if (peer->rate_tokens > ip_rt_error_burst)
1641                         peer->rate_tokens = ip_rt_error_burst;
1642                 peer->rate_last = now;
1643                 if (peer->rate_tokens >= ip_rt_error_cost)
1644                         peer->rate_tokens -= ip_rt_error_cost;
1645                 else
1646                         send = false;
1647                 inet_putpeer(peer);
1648         }
1649         if (send)
1650                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1651
1652 out:    kfree_skb(skb);
1653         return 0;
1654 }
1655
1656 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1657 {
1658         struct fib_result res;
1659
1660         if (mtu < ip_rt_min_pmtu)
1661                 mtu = ip_rt_min_pmtu;
1662
1663         if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
1664                 struct fib_nh *nh = &FIB_RES_NH(res);
1665                 struct fib_nh_exception *fnhe;
1666
1667                 spin_lock_bh(&fnhe_lock);
1668                 fnhe = find_or_create_fnhe(nh, fl4->daddr);
1669                 if (fnhe) {
1670                         fnhe->fnhe_pmtu = mtu;
1671                         fnhe->fnhe_expires = jiffies + ip_rt_mtu_expires;
1672                 }
1673                 spin_unlock_bh(&fnhe_lock);
1674         }
1675         rt->rt_pmtu = mtu;
1676         dst_set_expires(&rt->dst, ip_rt_mtu_expires);
1677 }
1678
1679 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1680                               struct sk_buff *skb, u32 mtu)
1681 {
1682         struct rtable *rt = (struct rtable *) dst;
1683         struct flowi4 fl4;
1684
1685         ip_rt_build_flow_key(&fl4, sk, skb);
1686         __ip_rt_update_pmtu(rt, &fl4, mtu);
1687 }
1688
1689 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1690                       int oif, u32 mark, u8 protocol, int flow_flags)
1691 {
1692         const struct iphdr *iph = (const struct iphdr *) skb->data;
1693         struct flowi4 fl4;
1694         struct rtable *rt;
1695
1696         __build_flow_key(&fl4, NULL, iph, oif,
1697                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1698         rt = __ip_route_output_key(net, &fl4);
1699         if (!IS_ERR(rt)) {
1700                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1701                 ip_rt_put(rt);
1702         }
1703 }
1704 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1705
1706 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1707 {
1708         const struct iphdr *iph = (const struct iphdr *) skb->data;
1709         struct flowi4 fl4;
1710         struct rtable *rt;
1711
1712         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1713         rt = __ip_route_output_key(sock_net(sk), &fl4);
1714         if (!IS_ERR(rt)) {
1715                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1716                 ip_rt_put(rt);
1717         }
1718 }
1719 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1720
1721 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1722                    int oif, u32 mark, u8 protocol, int flow_flags)
1723 {
1724         const struct iphdr *iph = (const struct iphdr *) skb->data;
1725         struct flowi4 fl4;
1726         struct rtable *rt;
1727
1728         __build_flow_key(&fl4, NULL, iph, oif,
1729                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1730         rt = __ip_route_output_key(net, &fl4);
1731         if (!IS_ERR(rt)) {
1732                 __ip_do_redirect(rt, skb, &fl4);
1733                 ip_rt_put(rt);
1734         }
1735 }
1736 EXPORT_SYMBOL_GPL(ipv4_redirect);
1737
1738 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1739 {
1740         const struct iphdr *iph = (const struct iphdr *) skb->data;
1741         struct flowi4 fl4;
1742         struct rtable *rt;
1743
1744         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1745         rt = __ip_route_output_key(sock_net(sk), &fl4);
1746         if (!IS_ERR(rt)) {
1747                 __ip_do_redirect(rt, skb, &fl4);
1748                 ip_rt_put(rt);
1749         }
1750 }
1751 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1752
1753 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1754 {
1755         struct rtable *rt = (struct rtable *) dst;
1756
1757         if (rt_is_expired(rt))
1758                 return NULL;
1759         return dst;
1760 }
1761
1762 static void ipv4_dst_destroy(struct dst_entry *dst)
1763 {
1764         struct rtable *rt = (struct rtable *) dst;
1765
1766         if (rt->fi) {
1767                 fib_info_put(rt->fi);
1768                 rt->fi = NULL;
1769         }
1770 }
1771
1772
1773 static void ipv4_link_failure(struct sk_buff *skb)
1774 {
1775         struct rtable *rt;
1776
1777         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1778
1779         rt = skb_rtable(skb);
1780         if (rt)
1781                 dst_set_expires(&rt->dst, 0);
1782 }
1783
1784 static int ip_rt_bug(struct sk_buff *skb)
1785 {
1786         pr_debug("%s: %pI4 -> %pI4, %s\n",
1787                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1788                  skb->dev ? skb->dev->name : "?");
1789         kfree_skb(skb);
1790         WARN_ON(1);
1791         return 0;
1792 }
1793
1794 /*
1795    We do not cache source address of outgoing interface,
1796    because it is used only by IP RR, TS and SRR options,
1797    so that it out of fast path.
1798
1799    BTW remember: "addr" is allowed to be not aligned
1800    in IP options!
1801  */
1802
1803 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1804 {
1805         __be32 src;
1806
1807         if (rt_is_output_route(rt))
1808                 src = ip_hdr(skb)->saddr;
1809         else {
1810                 struct fib_result res;
1811                 struct flowi4 fl4;
1812                 struct iphdr *iph;
1813
1814                 iph = ip_hdr(skb);
1815
1816                 memset(&fl4, 0, sizeof(fl4));
1817                 fl4.daddr = iph->daddr;
1818                 fl4.saddr = iph->saddr;
1819                 fl4.flowi4_tos = RT_TOS(iph->tos);
1820                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1821                 fl4.flowi4_iif = skb->dev->ifindex;
1822                 fl4.flowi4_mark = skb->mark;
1823
1824                 rcu_read_lock();
1825                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1826                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1827                 else
1828                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1829                                         RT_SCOPE_UNIVERSE);
1830                 rcu_read_unlock();
1831         }
1832         memcpy(addr, &src, 4);
1833 }
1834
1835 #ifdef CONFIG_IP_ROUTE_CLASSID
1836 static void set_class_tag(struct rtable *rt, u32 tag)
1837 {
1838         if (!(rt->dst.tclassid & 0xFFFF))
1839                 rt->dst.tclassid |= tag & 0xFFFF;
1840         if (!(rt->dst.tclassid & 0xFFFF0000))
1841                 rt->dst.tclassid |= tag & 0xFFFF0000;
1842 }
1843 #endif
1844
1845 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1846 {
1847         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1848
1849         if (advmss == 0) {
1850                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1851                                ip_rt_min_advmss);
1852                 if (advmss > 65535 - 40)
1853                         advmss = 65535 - 40;
1854         }
1855         return advmss;
1856 }
1857
1858 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1859 {
1860         const struct rtable *rt = (const struct rtable *) dst;
1861         unsigned int mtu = rt->rt_pmtu;
1862
1863         if (mtu && time_after_eq(jiffies, rt->dst.expires))
1864                 mtu = 0;
1865
1866         if (!mtu)
1867                 mtu = dst_metric_raw(dst, RTAX_MTU);
1868
1869         if (mtu && rt_is_output_route(rt))
1870                 return mtu;
1871
1872         mtu = dst->dev->mtu;
1873
1874         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1875
1876                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1877                         mtu = 576;
1878         }
1879
1880         if (mtu > IP_MAX_MTU)
1881                 mtu = IP_MAX_MTU;
1882
1883         return mtu;
1884 }
1885
1886 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1887                             struct fib_info *fi)
1888 {
1889         if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1890                 rt->fi = fi;
1891                 atomic_inc(&fi->fib_clntref);
1892         }
1893         dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1894 }
1895
1896 static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr)
1897 {
1898         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1899         struct fib_nh_exception *fnhe;
1900         u32 hval;
1901
1902         hval = fnhe_hashfun(daddr);
1903
1904         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1905              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1906                 if (fnhe->fnhe_daddr == daddr) {
1907                         if (fnhe->fnhe_pmtu) {
1908                                 unsigned long expires = fnhe->fnhe_expires;
1909                                 unsigned long diff = jiffies - expires;
1910
1911                                 if (time_before(jiffies, expires)) {
1912                                         rt->rt_pmtu = fnhe->fnhe_pmtu;
1913                                         dst_set_expires(&rt->dst, diff);
1914                                 }
1915                         }
1916                         if (fnhe->fnhe_gw)
1917                                 rt->rt_gateway = fnhe->fnhe_gw;
1918                         fnhe->fnhe_stamp = jiffies;
1919                         break;
1920                 }
1921         }
1922 }
1923
1924 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1925                            const struct fib_result *res,
1926                            struct fib_info *fi, u16 type, u32 itag)
1927 {
1928         if (fi) {
1929                 struct fib_nh *nh = &FIB_RES_NH(*res);
1930
1931                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1932                         rt->rt_gateway = nh->nh_gw;
1933                 if (unlikely(nh->nh_exceptions))
1934                         rt_bind_exception(rt, nh, fl4->daddr);
1935                 rt_init_metrics(rt, fl4, fi);
1936 #ifdef CONFIG_IP_ROUTE_CLASSID
1937                 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1938 #endif
1939         }
1940
1941 #ifdef CONFIG_IP_ROUTE_CLASSID
1942 #ifdef CONFIG_IP_MULTIPLE_TABLES
1943         set_class_tag(rt, res->tclassid);
1944 #endif
1945         set_class_tag(rt, itag);
1946 #endif
1947 }
1948
1949 static struct rtable *rt_dst_alloc(struct net_device *dev,
1950                                    bool nopolicy, bool noxfrm)
1951 {
1952         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1953                          DST_HOST |
1954                          (nopolicy ? DST_NOPOLICY : 0) |
1955                          (noxfrm ? DST_NOXFRM : 0));
1956 }
1957
1958 /* called in rcu_read_lock() section */
1959 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1960                                 u8 tos, struct net_device *dev, int our)
1961 {
1962         unsigned int hash;
1963         struct rtable *rth;
1964         struct in_device *in_dev = __in_dev_get_rcu(dev);
1965         u32 itag = 0;
1966         int err;
1967
1968         /* Primary sanity checks. */
1969
1970         if (in_dev == NULL)
1971                 return -EINVAL;
1972
1973         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1974             skb->protocol != htons(ETH_P_IP))
1975                 goto e_inval;
1976
1977         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1978                 if (ipv4_is_loopback(saddr))
1979                         goto e_inval;
1980
1981         if (ipv4_is_zeronet(saddr)) {
1982                 if (!ipv4_is_local_multicast(daddr))
1983                         goto e_inval;
1984         } else {
1985                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1986                                           in_dev, &itag);
1987                 if (err < 0)
1988                         goto e_err;
1989         }
1990         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1991                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1992         if (!rth)
1993                 goto e_nobufs;
1994
1995 #ifdef CONFIG_IP_ROUTE_CLASSID
1996         rth->dst.tclassid = itag;
1997 #endif
1998         rth->dst.output = ip_rt_bug;
1999
2000         rth->rt_key_dst = daddr;
2001         rth->rt_key_src = saddr;
2002         rth->rt_genid   = rt_genid(dev_net(dev));
2003         rth->rt_flags   = RTCF_MULTICAST;
2004         rth->rt_type    = RTN_MULTICAST;
2005         rth->rt_key_tos = tos;
2006         rth->rt_dst     = daddr;
2007         rth->rt_src     = saddr;
2008         rth->rt_route_iif = dev->ifindex;
2009         rth->rt_iif     = dev->ifindex;
2010         rth->rt_oif     = 0;
2011         rth->rt_mark    = skb->mark;
2012         rth->rt_pmtu    = 0;
2013         rth->rt_gateway = daddr;
2014         rth->fi = NULL;
2015         if (our) {
2016                 rth->dst.input= ip_local_deliver;
2017                 rth->rt_flags |= RTCF_LOCAL;
2018         }
2019
2020 #ifdef CONFIG_IP_MROUTE
2021         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2022                 rth->dst.input = ip_mr_input;
2023 #endif
2024         RT_CACHE_STAT_INC(in_slow_mc);
2025
2026         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2027         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2028         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2029
2030 e_nobufs:
2031         return -ENOBUFS;
2032 e_inval:
2033         return -EINVAL;
2034 e_err:
2035         return err;
2036 }
2037
2038
2039 static void ip_handle_martian_source(struct net_device *dev,
2040                                      struct in_device *in_dev,
2041                                      struct sk_buff *skb,
2042                                      __be32 daddr,
2043                                      __be32 saddr)
2044 {
2045         RT_CACHE_STAT_INC(in_martian_src);
2046 #ifdef CONFIG_IP_ROUTE_VERBOSE
2047         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2048                 /*
2049                  *      RFC1812 recommendation, if source is martian,
2050                  *      the only hint is MAC header.
2051                  */
2052                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2053                         &daddr, &saddr, dev->name);
2054                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2055                         print_hex_dump(KERN_WARNING, "ll header: ",
2056                                        DUMP_PREFIX_OFFSET, 16, 1,
2057                                        skb_mac_header(skb),
2058                                        dev->hard_header_len, true);
2059                 }
2060         }
2061 #endif
2062 }
2063
2064 /* called in rcu_read_lock() section */
2065 static int __mkroute_input(struct sk_buff *skb,
2066                            const struct fib_result *res,
2067                            struct in_device *in_dev,
2068                            __be32 daddr, __be32 saddr, u32 tos,
2069                            struct rtable **result)
2070 {
2071         struct rtable *rth;
2072         int err;
2073         struct in_device *out_dev;
2074         unsigned int flags = 0;
2075         u32 itag;
2076
2077         /* get a working reference to the output device */
2078         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2079         if (out_dev == NULL) {
2080                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
2081                 return -EINVAL;
2082         }
2083
2084
2085         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2086                                   in_dev->dev, in_dev, &itag);
2087         if (err < 0) {
2088                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2089                                          saddr);
2090
2091                 goto cleanup;
2092         }
2093
2094         if (err)
2095                 flags |= RTCF_DIRECTSRC;
2096
2097         if (out_dev == in_dev && err &&
2098             (IN_DEV_SHARED_MEDIA(out_dev) ||
2099              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2100                 flags |= RTCF_DOREDIRECT;
2101
2102         if (skb->protocol != htons(ETH_P_IP)) {
2103                 /* Not IP (i.e. ARP). Do not create route, if it is
2104                  * invalid for proxy arp. DNAT routes are always valid.
2105                  *
2106                  * Proxy arp feature have been extended to allow, ARP
2107                  * replies back to the same interface, to support
2108                  * Private VLAN switch technologies. See arp.c.
2109                  */
2110                 if (out_dev == in_dev &&
2111                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2112                         err = -EINVAL;
2113                         goto cleanup;
2114                 }
2115         }
2116
2117         rth = rt_dst_alloc(out_dev->dev,
2118                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2119                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2120         if (!rth) {
2121                 err = -ENOBUFS;
2122                 goto cleanup;
2123         }
2124
2125         rth->rt_key_dst = daddr;
2126         rth->rt_key_src = saddr;
2127         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2128         rth->rt_flags = flags;
2129         rth->rt_type = res->type;
2130         rth->rt_key_tos = tos;
2131         rth->rt_dst     = daddr;
2132         rth->rt_src     = saddr;
2133         rth->rt_route_iif = in_dev->dev->ifindex;
2134         rth->rt_iif     = in_dev->dev->ifindex;
2135         rth->rt_oif     = 0;
2136         rth->rt_mark    = skb->mark;
2137         rth->rt_pmtu    = 0;
2138         rth->rt_gateway = daddr;
2139         rth->fi = NULL;
2140
2141         rth->dst.input = ip_forward;
2142         rth->dst.output = ip_output;
2143
2144         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2145
2146         *result = rth;
2147         err = 0;
2148  cleanup:
2149         return err;
2150 }
2151
2152 static int ip_mkroute_input(struct sk_buff *skb,
2153                             struct fib_result *res,
2154                             const struct flowi4 *fl4,
2155                             struct in_device *in_dev,
2156                             __be32 daddr, __be32 saddr, u32 tos)
2157 {
2158         struct rtable *rth = NULL;
2159         int err;
2160         unsigned int hash;
2161
2162 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2163         if (res->fi && res->fi->fib_nhs > 1)
2164                 fib_select_multipath(res);
2165 #endif
2166
2167         /* create a routing cache entry */
2168         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2169         if (err)
2170                 return err;
2171
2172         /* put it into the cache */
2173         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2174                        rt_genid(dev_net(rth->dst.dev)));
2175         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2176         if (IS_ERR(rth))
2177                 return PTR_ERR(rth);
2178         return 0;
2179 }
2180
2181 /*
2182  *      NOTE. We drop all the packets that has local source
2183  *      addresses, because every properly looped back packet
2184  *      must have correct destination already attached by output routine.
2185  *
2186  *      Such approach solves two big problems:
2187  *      1. Not simplex devices are handled properly.
2188  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2189  *      called with rcu_read_lock()
2190  */
2191
2192 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2193                                u8 tos, struct net_device *dev)
2194 {
2195         struct fib_result res;
2196         struct in_device *in_dev = __in_dev_get_rcu(dev);
2197         struct flowi4   fl4;
2198         unsigned int    flags = 0;
2199         u32             itag = 0;
2200         struct rtable   *rth;
2201         unsigned int    hash;
2202         int             err = -EINVAL;
2203         struct net    *net = dev_net(dev);
2204
2205         /* IP on this device is disabled. */
2206
2207         if (!in_dev)
2208                 goto out;
2209
2210         /* Check for the most weird martians, which can be not detected
2211            by fib_lookup.
2212          */
2213
2214         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2215                 goto martian_source;
2216
2217         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2218                 goto brd_input;
2219
2220         /* Accept zero addresses only to limited broadcast;
2221          * I even do not know to fix it or not. Waiting for complains :-)
2222          */
2223         if (ipv4_is_zeronet(saddr))
2224                 goto martian_source;
2225
2226         if (ipv4_is_zeronet(daddr))
2227                 goto martian_destination;
2228
2229         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
2230                 if (ipv4_is_loopback(daddr))
2231                         goto martian_destination;
2232
2233                 if (ipv4_is_loopback(saddr))
2234                         goto martian_source;
2235         }
2236
2237         /*
2238          *      Now we are ready to route packet.
2239          */
2240         fl4.flowi4_oif = 0;
2241         fl4.flowi4_iif = dev->ifindex;
2242         fl4.flowi4_mark = skb->mark;
2243         fl4.flowi4_tos = tos;
2244         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2245         fl4.daddr = daddr;
2246         fl4.saddr = saddr;
2247         err = fib_lookup(net, &fl4, &res);
2248         if (err != 0)
2249                 goto no_route;
2250
2251         RT_CACHE_STAT_INC(in_slow_tot);
2252
2253         if (res.type == RTN_BROADCAST)
2254                 goto brd_input;
2255
2256         if (res.type == RTN_LOCAL) {
2257                 err = fib_validate_source(skb, saddr, daddr, tos,
2258                                           net->loopback_dev->ifindex,
2259                                           dev, in_dev, &itag);
2260                 if (err < 0)
2261                         goto martian_source_keep_err;
2262                 if (err)
2263                         flags |= RTCF_DIRECTSRC;
2264                 goto local_input;
2265         }
2266
2267         if (!IN_DEV_FORWARD(in_dev))
2268                 goto no_route;
2269         if (res.type != RTN_UNICAST)
2270                 goto martian_destination;
2271
2272         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2273 out:    return err;
2274
2275 brd_input:
2276         if (skb->protocol != htons(ETH_P_IP))
2277                 goto e_inval;
2278
2279         if (!ipv4_is_zeronet(saddr)) {
2280                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2281                                           in_dev, &itag);
2282                 if (err < 0)
2283                         goto martian_source_keep_err;
2284                 if (err)
2285                         flags |= RTCF_DIRECTSRC;
2286         }
2287         flags |= RTCF_BROADCAST;
2288         res.type = RTN_BROADCAST;
2289         RT_CACHE_STAT_INC(in_brd);
2290
2291 local_input:
2292         rth = rt_dst_alloc(net->loopback_dev,
2293                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2294         if (!rth)
2295                 goto e_nobufs;
2296
2297         rth->dst.input= ip_local_deliver;
2298         rth->dst.output= ip_rt_bug;
2299 #ifdef CONFIG_IP_ROUTE_CLASSID
2300         rth->dst.tclassid = itag;
2301 #endif
2302
2303         rth->rt_key_dst = daddr;
2304         rth->rt_key_src = saddr;
2305         rth->rt_genid = rt_genid(net);
2306         rth->rt_flags   = flags|RTCF_LOCAL;
2307         rth->rt_type    = res.type;
2308         rth->rt_key_tos = tos;
2309         rth->rt_dst     = daddr;
2310         rth->rt_src     = saddr;
2311         rth->rt_route_iif = dev->ifindex;
2312         rth->rt_iif     = dev->ifindex;
2313         rth->rt_oif     = 0;
2314         rth->rt_mark    = skb->mark;
2315         rth->rt_pmtu    = 0;
2316         rth->rt_gateway = daddr;
2317         rth->fi = NULL;
2318         if (res.type == RTN_UNREACHABLE) {
2319                 rth->dst.input= ip_error;
2320                 rth->dst.error= -err;
2321                 rth->rt_flags   &= ~RTCF_LOCAL;
2322         }
2323         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2324         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2325         err = 0;
2326         if (IS_ERR(rth))
2327                 err = PTR_ERR(rth);
2328         goto out;
2329
2330 no_route:
2331         RT_CACHE_STAT_INC(in_no_route);
2332         res.type = RTN_UNREACHABLE;
2333         if (err == -ESRCH)
2334                 err = -ENETUNREACH;
2335         goto local_input;
2336
2337         /*
2338          *      Do not cache martian addresses: they should be logged (RFC1812)
2339          */
2340 martian_destination:
2341         RT_CACHE_STAT_INC(in_martian_dst);
2342 #ifdef CONFIG_IP_ROUTE_VERBOSE
2343         if (IN_DEV_LOG_MARTIANS(in_dev))
2344                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2345                                      &daddr, &saddr, dev->name);
2346 #endif
2347
2348 e_inval:
2349         err = -EINVAL;
2350         goto out;
2351
2352 e_nobufs:
2353         err = -ENOBUFS;
2354         goto out;
2355
2356 martian_source:
2357         err = -EINVAL;
2358 martian_source_keep_err:
2359         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2360         goto out;
2361 }
2362
2363 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2364                            u8 tos, struct net_device *dev, bool noref)
2365 {
2366         struct rtable   *rth;
2367         unsigned int    hash;
2368         int iif = dev->ifindex;
2369         struct net *net;
2370         int res;
2371
2372         net = dev_net(dev);
2373
2374         rcu_read_lock();
2375
2376         if (!rt_caching(net))
2377                 goto skip_cache;
2378
2379         tos &= IPTOS_RT_MASK;
2380         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2381
2382         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2383              rth = rcu_dereference(rth->dst.rt_next)) {
2384                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2385                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2386                      (rth->rt_route_iif ^ iif) |
2387                      (rth->rt_key_tos ^ tos)) == 0 &&
2388                     rth->rt_mark == skb->mark &&
2389                     net_eq(dev_net(rth->dst.dev), net) &&
2390                     !rt_is_expired(rth)) {
2391                         if (noref) {
2392                                 dst_use_noref(&rth->dst, jiffies);
2393                                 skb_dst_set_noref(skb, &rth->dst);
2394                         } else {
2395                                 dst_use(&rth->dst, jiffies);
2396                                 skb_dst_set(skb, &rth->dst);
2397                         }
2398                         RT_CACHE_STAT_INC(in_hit);
2399                         rcu_read_unlock();
2400                         return 0;
2401                 }
2402                 RT_CACHE_STAT_INC(in_hlist_search);
2403         }
2404
2405 skip_cache:
2406         /* Multicast recognition logic is moved from route cache to here.
2407            The problem was that too many Ethernet cards have broken/missing
2408            hardware multicast filters :-( As result the host on multicasting
2409            network acquires a lot of useless route cache entries, sort of
2410            SDR messages from all the world. Now we try to get rid of them.
2411            Really, provided software IP multicast filter is organized
2412            reasonably (at least, hashed), it does not result in a slowdown
2413            comparing with route cache reject entries.
2414            Note, that multicast routers are not affected, because
2415            route cache entry is created eventually.
2416          */
2417         if (ipv4_is_multicast(daddr)) {
2418                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2419
2420                 if (in_dev) {
2421                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2422                                                   ip_hdr(skb)->protocol);
2423                         if (our
2424 #ifdef CONFIG_IP_MROUTE
2425                                 ||
2426                             (!ipv4_is_local_multicast(daddr) &&
2427                              IN_DEV_MFORWARD(in_dev))
2428 #endif
2429                            ) {
2430                                 int res = ip_route_input_mc(skb, daddr, saddr,
2431                                                             tos, dev, our);
2432                                 rcu_read_unlock();
2433                                 return res;
2434                         }
2435                 }
2436                 rcu_read_unlock();
2437                 return -EINVAL;
2438         }
2439         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2440         rcu_read_unlock();
2441         return res;
2442 }
2443 EXPORT_SYMBOL(ip_route_input_common);
2444
2445 /* called with rcu_read_lock() */
2446 static struct rtable *__mkroute_output(const struct fib_result *res,
2447                                        const struct flowi4 *fl4,
2448                                        __be32 orig_daddr, __be32 orig_saddr,
2449                                        int orig_oif, __u8 orig_rtos,
2450                                        struct net_device *dev_out,
2451                                        unsigned int flags)
2452 {
2453         struct fib_info *fi = res->fi;
2454         struct in_device *in_dev;
2455         u16 type = res->type;
2456         struct rtable *rth;
2457
2458         in_dev = __in_dev_get_rcu(dev_out);
2459         if (!in_dev)
2460                 return ERR_PTR(-EINVAL);
2461
2462         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2463                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2464                         return ERR_PTR(-EINVAL);
2465
2466         if (ipv4_is_lbcast(fl4->daddr))
2467                 type = RTN_BROADCAST;
2468         else if (ipv4_is_multicast(fl4->daddr))
2469                 type = RTN_MULTICAST;
2470         else if (ipv4_is_zeronet(fl4->daddr))
2471                 return ERR_PTR(-EINVAL);
2472
2473         if (dev_out->flags & IFF_LOOPBACK)
2474                 flags |= RTCF_LOCAL;
2475
2476         if (type == RTN_BROADCAST) {
2477                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2478                 fi = NULL;
2479         } else if (type == RTN_MULTICAST) {
2480                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2481                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2482                                      fl4->flowi4_proto))
2483                         flags &= ~RTCF_LOCAL;
2484                 /* If multicast route do not exist use
2485                  * default one, but do not gateway in this case.
2486                  * Yes, it is hack.
2487                  */
2488                 if (fi && res->prefixlen < 4)
2489                         fi = NULL;
2490         }
2491
2492         rth = rt_dst_alloc(dev_out,
2493                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2494                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2495         if (!rth)
2496                 return ERR_PTR(-ENOBUFS);
2497
2498         rth->dst.output = ip_output;
2499
2500         rth->rt_key_dst = orig_daddr;
2501         rth->rt_key_src = orig_saddr;
2502         rth->rt_genid = rt_genid(dev_net(dev_out));
2503         rth->rt_flags   = flags;
2504         rth->rt_type    = type;
2505         rth->rt_key_tos = orig_rtos;
2506         rth->rt_dst     = fl4->daddr;
2507         rth->rt_src     = fl4->saddr;
2508         rth->rt_route_iif = 0;
2509         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2510         rth->rt_oif     = orig_oif;
2511         rth->rt_mark    = fl4->flowi4_mark;
2512         rth->rt_pmtu    = 0;
2513         rth->rt_gateway = fl4->daddr;
2514         rth->fi = NULL;
2515
2516         RT_CACHE_STAT_INC(out_slow_tot);
2517
2518         if (flags & RTCF_LOCAL)
2519                 rth->dst.input = ip_local_deliver;
2520         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2521                 if (flags & RTCF_LOCAL &&
2522                     !(dev_out->flags & IFF_LOOPBACK)) {
2523                         rth->dst.output = ip_mc_output;
2524                         RT_CACHE_STAT_INC(out_slow_mc);
2525                 }
2526 #ifdef CONFIG_IP_MROUTE
2527                 if (type == RTN_MULTICAST) {
2528                         if (IN_DEV_MFORWARD(in_dev) &&
2529                             !ipv4_is_local_multicast(fl4->daddr)) {
2530                                 rth->dst.input = ip_mr_input;
2531                                 rth->dst.output = ip_mc_output;
2532                         }
2533                 }
2534 #endif
2535         }
2536
2537         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2538
2539         if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
2540                 rth->dst.flags |= DST_NOCACHE;
2541
2542         return rth;
2543 }
2544
2545 /*
2546  * Major route resolver routine.
2547  * called with rcu_read_lock();
2548  */
2549
2550 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2551 {
2552         struct net_device *dev_out = NULL;
2553         __u8 tos = RT_FL_TOS(fl4);
2554         unsigned int flags = 0;
2555         struct fib_result res;
2556         struct rtable *rth;
2557         __be32 orig_daddr;
2558         __be32 orig_saddr;
2559         int orig_oif;
2560
2561         res.tclassid    = 0;
2562         res.fi          = NULL;
2563         res.table       = NULL;
2564
2565         orig_daddr = fl4->daddr;
2566         orig_saddr = fl4->saddr;
2567         orig_oif = fl4->flowi4_oif;
2568
2569         fl4->flowi4_iif = net->loopback_dev->ifindex;
2570         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2571         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2572                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2573
2574         rcu_read_lock();
2575         if (fl4->saddr) {
2576                 rth = ERR_PTR(-EINVAL);
2577                 if (ipv4_is_multicast(fl4->saddr) ||
2578                     ipv4_is_lbcast(fl4->saddr) ||
2579                     ipv4_is_zeronet(fl4->saddr))
2580                         goto out;
2581
2582                 /* I removed check for oif == dev_out->oif here.
2583                    It was wrong for two reasons:
2584                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2585                       is assigned to multiple interfaces.
2586                    2. Moreover, we are allowed to send packets with saddr
2587                       of another iface. --ANK
2588                  */
2589
2590                 if (fl4->flowi4_oif == 0 &&
2591                     (ipv4_is_multicast(fl4->daddr) ||
2592                      ipv4_is_lbcast(fl4->daddr))) {
2593                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2594                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2595                         if (dev_out == NULL)
2596                                 goto out;
2597
2598                         /* Special hack: user can direct multicasts
2599                            and limited broadcast via necessary interface
2600                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2601                            This hack is not just for fun, it allows
2602                            vic,vat and friends to work.
2603                            They bind socket to loopback, set ttl to zero
2604                            and expect that it will work.
2605                            From the viewpoint of routing cache they are broken,
2606                            because we are not allowed to build multicast path
2607                            with loopback source addr (look, routing cache
2608                            cannot know, that ttl is zero, so that packet
2609                            will not leave this host and route is valid).
2610                            Luckily, this hack is good workaround.
2611                          */
2612
2613                         fl4->flowi4_oif = dev_out->ifindex;
2614                         goto make_route;
2615                 }
2616
2617                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2618                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2619                         if (!__ip_dev_find(net, fl4->saddr, false))
2620                                 goto out;
2621                 }
2622         }
2623
2624
2625         if (fl4->flowi4_oif) {
2626                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2627                 rth = ERR_PTR(-ENODEV);
2628                 if (dev_out == NULL)
2629                         goto out;
2630
2631                 /* RACE: Check return value of inet_select_addr instead. */
2632                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2633                         rth = ERR_PTR(-ENETUNREACH);
2634                         goto out;
2635                 }
2636                 if (ipv4_is_local_multicast(fl4->daddr) ||
2637                     ipv4_is_lbcast(fl4->daddr)) {
2638                         if (!fl4->saddr)
2639                                 fl4->saddr = inet_select_addr(dev_out, 0,
2640                                                               RT_SCOPE_LINK);
2641                         goto make_route;
2642                 }
2643                 if (fl4->saddr) {
2644                         if (ipv4_is_multicast(fl4->daddr))
2645                                 fl4->saddr = inet_select_addr(dev_out, 0,
2646                                                               fl4->flowi4_scope);
2647                         else if (!fl4->daddr)
2648                                 fl4->saddr = inet_select_addr(dev_out, 0,
2649                                                               RT_SCOPE_HOST);
2650                 }
2651         }
2652
2653         if (!fl4->daddr) {
2654                 fl4->daddr = fl4->saddr;
2655                 if (!fl4->daddr)
2656                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2657                 dev_out = net->loopback_dev;
2658                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2659                 res.type = RTN_LOCAL;
2660                 flags |= RTCF_LOCAL;
2661                 goto make_route;
2662         }
2663
2664         if (fib_lookup(net, fl4, &res)) {
2665                 res.fi = NULL;
2666                 res.table = NULL;
2667                 if (fl4->flowi4_oif) {
2668                         /* Apparently, routing tables are wrong. Assume,
2669                            that the destination is on link.
2670
2671                            WHY? DW.
2672                            Because we are allowed to send to iface
2673                            even if it has NO routes and NO assigned
2674                            addresses. When oif is specified, routing
2675                            tables are looked up with only one purpose:
2676                            to catch if destination is gatewayed, rather than
2677                            direct. Moreover, if MSG_DONTROUTE is set,
2678                            we send packet, ignoring both routing tables
2679                            and ifaddr state. --ANK
2680
2681
2682                            We could make it even if oif is unknown,
2683                            likely IPv6, but we do not.
2684                          */
2685
2686                         if (fl4->saddr == 0)
2687                                 fl4->saddr = inet_select_addr(dev_out, 0,
2688                                                               RT_SCOPE_LINK);
2689                         res.type = RTN_UNICAST;
2690                         goto make_route;
2691                 }
2692                 rth = ERR_PTR(-ENETUNREACH);
2693                 goto out;
2694         }
2695
2696         if (res.type == RTN_LOCAL) {
2697                 if (!fl4->saddr) {
2698                         if (res.fi->fib_prefsrc)
2699                                 fl4->saddr = res.fi->fib_prefsrc;
2700                         else
2701                                 fl4->saddr = fl4->daddr;
2702                 }
2703                 dev_out = net->loopback_dev;
2704                 fl4->flowi4_oif = dev_out->ifindex;
2705                 res.fi = NULL;
2706                 flags |= RTCF_LOCAL;
2707                 goto make_route;
2708         }
2709
2710 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2711         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2712                 fib_select_multipath(&res);
2713         else
2714 #endif
2715         if (!res.prefixlen &&
2716             res.table->tb_num_default > 1 &&
2717             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2718                 fib_select_default(&res);
2719
2720         if (!fl4->saddr)
2721                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2722
2723         dev_out = FIB_RES_DEV(res);
2724         fl4->flowi4_oif = dev_out->ifindex;
2725
2726
2727 make_route:
2728         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2729                                tos, dev_out, flags);
2730         if (!IS_ERR(rth)) {
2731                 unsigned int hash;
2732
2733                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2734                                rt_genid(dev_net(dev_out)));
2735                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2736         }
2737
2738 out:
2739         rcu_read_unlock();
2740         return rth;
2741 }
2742
2743 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2744 {
2745         struct rtable *rth;
2746         unsigned int hash;
2747
2748         if (!rt_caching(net))
2749                 goto slow_output;
2750
2751         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2752
2753         rcu_read_lock_bh();
2754         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2755                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2756                 if (rth->rt_key_dst == flp4->daddr &&
2757                     rth->rt_key_src == flp4->saddr &&
2758                     rt_is_output_route(rth) &&
2759                     rth->rt_oif == flp4->flowi4_oif &&
2760                     rth->rt_mark == flp4->flowi4_mark &&
2761                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2762                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2763                     net_eq(dev_net(rth->dst.dev), net) &&
2764                     !rt_is_expired(rth)) {
2765                         dst_use(&rth->dst, jiffies);
2766                         RT_CACHE_STAT_INC(out_hit);
2767                         rcu_read_unlock_bh();
2768                         if (!flp4->saddr)
2769                                 flp4->saddr = rth->rt_src;
2770                         if (!flp4->daddr)
2771                                 flp4->daddr = rth->rt_dst;
2772                         return rth;
2773                 }
2774                 RT_CACHE_STAT_INC(out_hlist_search);
2775         }
2776         rcu_read_unlock_bh();
2777
2778 slow_output:
2779         return ip_route_output_slow(net, flp4);
2780 }
2781 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2782
2783 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2784 {
2785         return NULL;
2786 }
2787
2788 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2789 {
2790         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2791
2792         return mtu ? : dst->dev->mtu;
2793 }
2794
2795 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2796                                           struct sk_buff *skb, u32 mtu)
2797 {
2798 }
2799
2800 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2801                                        struct sk_buff *skb)
2802 {
2803 }
2804
2805 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2806                                           unsigned long old)
2807 {
2808         return NULL;
2809 }
2810
2811 static struct dst_ops ipv4_dst_blackhole_ops = {
2812         .family                 =       AF_INET,
2813         .protocol               =       cpu_to_be16(ETH_P_IP),
2814         .destroy                =       ipv4_dst_destroy,
2815         .check                  =       ipv4_blackhole_dst_check,
2816         .mtu                    =       ipv4_blackhole_mtu,
2817         .default_advmss         =       ipv4_default_advmss,
2818         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2819         .redirect               =       ipv4_rt_blackhole_redirect,
2820         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2821         .neigh_lookup           =       ipv4_neigh_lookup,
2822 };
2823
2824 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2825 {
2826         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2827         struct rtable *ort = (struct rtable *) dst_orig;
2828
2829         if (rt) {
2830                 struct dst_entry *new = &rt->dst;
2831
2832                 new->__use = 1;
2833                 new->input = dst_discard;
2834                 new->output = dst_discard;
2835
2836                 new->dev = ort->dst.dev;
2837                 if (new->dev)
2838                         dev_hold(new->dev);
2839
2840                 rt->rt_key_dst = ort->rt_key_dst;
2841                 rt->rt_key_src = ort->rt_key_src;
2842                 rt->rt_key_tos = ort->rt_key_tos;
2843                 rt->rt_route_iif = ort->rt_route_iif;
2844                 rt->rt_iif = ort->rt_iif;
2845                 rt->rt_oif = ort->rt_oif;
2846                 rt->rt_mark = ort->rt_mark;
2847                 rt->rt_pmtu = ort->rt_pmtu;
2848
2849                 rt->rt_genid = rt_genid(net);
2850                 rt->rt_flags = ort->rt_flags;
2851                 rt->rt_type = ort->rt_type;
2852                 rt->rt_dst = ort->rt_dst;
2853                 rt->rt_src = ort->rt_src;
2854                 rt->rt_gateway = ort->rt_gateway;
2855                 rt->fi = ort->fi;
2856                 if (rt->fi)
2857                         atomic_inc(&rt->fi->fib_clntref);
2858
2859                 dst_free(new);
2860         }
2861
2862         dst_release(dst_orig);
2863
2864         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2865 }
2866
2867 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2868                                     struct sock *sk)
2869 {
2870         struct rtable *rt = __ip_route_output_key(net, flp4);
2871
2872         if (IS_ERR(rt))
2873                 return rt;
2874
2875         if (flp4->flowi4_proto)
2876                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2877                                                    flowi4_to_flowi(flp4),
2878                                                    sk, 0);
2879
2880         return rt;
2881 }
2882 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2883
2884 static int rt_fill_info(struct net *net,
2885                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2886                         int nowait, unsigned int flags)
2887 {
2888         struct rtable *rt = skb_rtable(skb);
2889         struct rtmsg *r;
2890         struct nlmsghdr *nlh;
2891         unsigned long expires = 0;
2892         u32 error;
2893
2894         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2895         if (nlh == NULL)
2896                 return -EMSGSIZE;
2897
2898         r = nlmsg_data(nlh);
2899         r->rtm_family    = AF_INET;
2900         r->rtm_dst_len  = 32;
2901         r->rtm_src_len  = 0;
2902         r->rtm_tos      = rt->rt_key_tos;
2903         r->rtm_table    = RT_TABLE_MAIN;
2904         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2905                 goto nla_put_failure;
2906         r->rtm_type     = rt->rt_type;
2907         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2908         r->rtm_protocol = RTPROT_UNSPEC;
2909         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2910         if (rt->rt_flags & RTCF_NOTIFY)
2911                 r->rtm_flags |= RTM_F_NOTIFY;
2912
2913         if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2914                 goto nla_put_failure;
2915         if (rt->rt_key_src) {
2916                 r->rtm_src_len = 32;
2917                 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2918                         goto nla_put_failure;
2919         }
2920         if (rt->dst.dev &&
2921             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2922                 goto nla_put_failure;
2923 #ifdef CONFIG_IP_ROUTE_CLASSID
2924         if (rt->dst.tclassid &&
2925             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2926                 goto nla_put_failure;
2927 #endif
2928         if (!rt_is_input_route(rt) &&
2929             rt->rt_src != rt->rt_key_src) {
2930                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2931                         goto nla_put_failure;
2932         }
2933         if (rt->rt_dst != rt->rt_gateway &&
2934             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2935                 goto nla_put_failure;
2936
2937         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2938                 goto nla_put_failure;
2939
2940         if (rt->rt_mark &&
2941             nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2942                 goto nla_put_failure;
2943
2944         error = rt->dst.error;
2945         expires = rt->dst.expires;
2946         if (expires) {
2947                 if (time_before(jiffies, expires))
2948                         expires -= jiffies;
2949                 else
2950                         expires = 0;
2951         }
2952
2953         if (rt_is_input_route(rt)) {
2954 #ifdef CONFIG_IP_MROUTE
2955                 __be32 dst = rt->rt_dst;
2956
2957                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2958                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2959                         int err = ipmr_get_route(net, skb,
2960                                                  rt->rt_src, rt->rt_dst,
2961                                                  r, nowait);
2962                         if (err <= 0) {
2963                                 if (!nowait) {
2964                                         if (err == 0)
2965                                                 return 0;
2966                                         goto nla_put_failure;
2967                                 } else {
2968                                         if (err == -EMSGSIZE)
2969                                                 goto nla_put_failure;
2970                                         error = err;
2971                                 }
2972                         }
2973                 } else
2974 #endif
2975                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2976                                 goto nla_put_failure;
2977         }
2978
2979         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2980                 goto nla_put_failure;
2981
2982         return nlmsg_end(skb, nlh);
2983
2984 nla_put_failure:
2985         nlmsg_cancel(skb, nlh);
2986         return -EMSGSIZE;
2987 }
2988
2989 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2990 {
2991         struct net *net = sock_net(in_skb->sk);
2992         struct rtmsg *rtm;
2993         struct nlattr *tb[RTA_MAX+1];
2994         struct rtable *rt = NULL;
2995         __be32 dst = 0;
2996         __be32 src = 0;
2997         u32 iif;
2998         int err;
2999         int mark;
3000         struct sk_buff *skb;
3001
3002         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3003         if (err < 0)
3004                 goto errout;
3005
3006         rtm = nlmsg_data(nlh);
3007
3008         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3009         if (skb == NULL) {
3010                 err = -ENOBUFS;
3011                 goto errout;
3012         }
3013
3014         /* Reserve room for dummy headers, this skb can pass
3015            through good chunk of routing engine.
3016          */
3017         skb_reset_mac_header(skb);
3018         skb_reset_network_header(skb);
3019
3020         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3021         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3022         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3023
3024         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3025         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3026         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3027         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3028
3029         if (iif) {
3030                 struct net_device *dev;
3031
3032                 dev = __dev_get_by_index(net, iif);
3033                 if (dev == NULL) {
3034                         err = -ENODEV;
3035                         goto errout_free;
3036                 }
3037
3038                 skb->protocol   = htons(ETH_P_IP);
3039                 skb->dev        = dev;
3040                 skb->mark       = mark;
3041                 local_bh_disable();
3042                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3043                 local_bh_enable();
3044
3045                 rt = skb_rtable(skb);
3046                 if (err == 0 && rt->dst.error)
3047                         err = -rt->dst.error;
3048         } else {
3049                 struct flowi4 fl4 = {
3050                         .daddr = dst,
3051                         .saddr = src,
3052                         .flowi4_tos = rtm->rtm_tos,
3053                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3054                         .flowi4_mark = mark,
3055                 };
3056                 rt = ip_route_output_key(net, &fl4);
3057
3058                 err = 0;
3059                 if (IS_ERR(rt))
3060                         err = PTR_ERR(rt);
3061         }
3062
3063         if (err)
3064                 goto errout_free;
3065
3066         skb_dst_set(skb, &rt->dst);
3067         if (rtm->rtm_flags & RTM_F_NOTIFY)
3068                 rt->rt_flags |= RTCF_NOTIFY;
3069
3070         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3071                            RTM_NEWROUTE, 0, 0);
3072         if (err <= 0)
3073                 goto errout_free;
3074
3075         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3076 errout:
3077         return err;
3078
3079 errout_free:
3080         kfree_skb(skb);
3081         goto errout;
3082 }
3083
3084 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3085 {
3086         struct rtable *rt;
3087         int h, s_h;
3088         int idx, s_idx;
3089         struct net *net;
3090
3091         net = sock_net(skb->sk);
3092
3093         s_h = cb->args[0];
3094         if (s_h < 0)
3095                 s_h = 0;
3096         s_idx = idx = cb->args[1];
3097         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3098                 if (!rt_hash_table[h].chain)
3099                         continue;
3100                 rcu_read_lock_bh();
3101                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3102                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3103                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3104                                 continue;
3105                         if (rt_is_expired(rt))
3106                                 continue;
3107                         skb_dst_set_noref(skb, &rt->dst);
3108                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3109                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3110                                          1, NLM_F_MULTI) <= 0) {
3111                                 skb_dst_drop(skb);
3112                                 rcu_read_unlock_bh();
3113                                 goto done;
3114                         }
3115                         skb_dst_drop(skb);
3116                 }
3117                 rcu_read_unlock_bh();
3118         }
3119
3120 done:
3121         cb->args[0] = h;
3122         cb->args[1] = idx;
3123         return skb->len;
3124 }
3125
3126 void ip_rt_multicast_event(struct in_device *in_dev)
3127 {
3128         rt_cache_flush(dev_net(in_dev->dev), 0);
3129 }
3130
3131 #ifdef CONFIG_SYSCTL
3132 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3133                                         void __user *buffer,
3134                                         size_t *lenp, loff_t *ppos)
3135 {
3136         if (write) {
3137                 int flush_delay;
3138                 ctl_table ctl;
3139                 struct net *net;
3140
3141                 memcpy(&ctl, __ctl, sizeof(ctl));
3142                 ctl.data = &flush_delay;
3143                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3144
3145                 net = (struct net *)__ctl->extra1;
3146                 rt_cache_flush(net, flush_delay);
3147                 return 0;
3148         }
3149
3150         return -EINVAL;
3151 }
3152
3153 static ctl_table ipv4_route_table[] = {
3154         {
3155                 .procname       = "gc_thresh",
3156                 .data           = &ipv4_dst_ops.gc_thresh,
3157                 .maxlen         = sizeof(int),
3158                 .mode           = 0644,
3159                 .proc_handler   = proc_dointvec,
3160         },
3161         {
3162                 .procname       = "max_size",
3163                 .data           = &ip_rt_max_size,
3164                 .maxlen         = sizeof(int),
3165                 .mode           = 0644,
3166                 .proc_handler   = proc_dointvec,
3167         },
3168         {
3169                 /*  Deprecated. Use gc_min_interval_ms */
3170
3171                 .procname       = "gc_min_interval",
3172                 .data           = &ip_rt_gc_min_interval,
3173                 .maxlen         = sizeof(int),
3174                 .mode           = 0644,
3175                 .proc_handler   = proc_dointvec_jiffies,
3176         },
3177         {
3178                 .procname       = "gc_min_interval_ms",
3179                 .data           = &ip_rt_gc_min_interval,
3180                 .maxlen         = sizeof(int),
3181                 .mode           = 0644,
3182                 .proc_handler   = proc_dointvec_ms_jiffies,
3183         },
3184         {
3185                 .procname       = "gc_timeout",
3186                 .data           = &ip_rt_gc_timeout,
3187                 .maxlen         = sizeof(int),
3188                 .mode           = 0644,
3189                 .proc_handler   = proc_dointvec_jiffies,
3190         },
3191         {
3192                 .procname       = "gc_interval",
3193                 .data           = &ip_rt_gc_interval,
3194                 .maxlen         = sizeof(int),
3195                 .mode           = 0644,
3196                 .proc_handler   = proc_dointvec_jiffies,
3197         },
3198         {
3199                 .procname       = "redirect_load",
3200                 .data           = &ip_rt_redirect_load,
3201                 .maxlen         = sizeof(int),
3202                 .mode           = 0644,
3203                 .proc_handler   = proc_dointvec,
3204         },
3205         {
3206                 .procname       = "redirect_number",
3207                 .data           = &ip_rt_redirect_number,
3208                 .maxlen         = sizeof(int),
3209                 .mode           = 0644,
3210                 .proc_handler   = proc_dointvec,
3211         },
3212         {
3213                 .procname       = "redirect_silence",
3214                 .data           = &ip_rt_redirect_silence,
3215                 .maxlen         = sizeof(int),
3216                 .mode           = 0644,
3217                 .proc_handler   = proc_dointvec,
3218         },
3219         {
3220                 .procname       = "error_cost",
3221                 .data           = &ip_rt_error_cost,
3222                 .maxlen         = sizeof(int),
3223                 .mode           = 0644,
3224                 .proc_handler   = proc_dointvec,
3225         },
3226         {
3227                 .procname       = "error_burst",
3228                 .data           = &ip_rt_error_burst,
3229                 .maxlen         = sizeof(int),
3230                 .mode           = 0644,
3231                 .proc_handler   = proc_dointvec,
3232         },
3233         {
3234                 .procname       = "gc_elasticity",
3235                 .data           = &ip_rt_gc_elasticity,
3236                 .maxlen         = sizeof(int),
3237                 .mode           = 0644,
3238                 .proc_handler   = proc_dointvec,
3239         },
3240         {
3241                 .procname       = "mtu_expires",
3242                 .data           = &ip_rt_mtu_expires,
3243                 .maxlen         = sizeof(int),
3244                 .mode           = 0644,
3245                 .proc_handler   = proc_dointvec_jiffies,
3246         },
3247         {
3248                 .procname       = "min_pmtu",
3249                 .data           = &ip_rt_min_pmtu,
3250                 .maxlen         = sizeof(int),
3251                 .mode           = 0644,
3252                 .proc_handler   = proc_dointvec,
3253         },
3254         {
3255                 .procname       = "min_adv_mss",
3256                 .data           = &ip_rt_min_advmss,
3257                 .maxlen         = sizeof(int),
3258                 .mode           = 0644,
3259                 .proc_handler   = proc_dointvec,
3260         },
3261         { }
3262 };
3263
3264 static struct ctl_table ipv4_route_flush_table[] = {
3265         {
3266                 .procname       = "flush",
3267                 .maxlen         = sizeof(int),
3268                 .mode           = 0200,
3269                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3270         },
3271         { },
3272 };
3273
3274 static __net_init int sysctl_route_net_init(struct net *net)
3275 {
3276         struct ctl_table *tbl;
3277
3278         tbl = ipv4_route_flush_table;
3279         if (!net_eq(net, &init_net)) {
3280                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3281                 if (tbl == NULL)
3282                         goto err_dup;
3283         }
3284         tbl[0].extra1 = net;
3285
3286         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3287         if (net->ipv4.route_hdr == NULL)
3288                 goto err_reg;
3289         return 0;
3290
3291 err_reg:
3292         if (tbl != ipv4_route_flush_table)
3293                 kfree(tbl);
3294 err_dup:
3295         return -ENOMEM;
3296 }
3297
3298 static __net_exit void sysctl_route_net_exit(struct net *net)
3299 {
3300         struct ctl_table *tbl;
3301
3302         tbl = net->ipv4.route_hdr->ctl_table_arg;
3303         unregister_net_sysctl_table(net->ipv4.route_hdr);
3304         BUG_ON(tbl == ipv4_route_flush_table);
3305         kfree(tbl);
3306 }
3307
3308 static __net_initdata struct pernet_operations sysctl_route_ops = {
3309         .init = sysctl_route_net_init,
3310         .exit = sysctl_route_net_exit,
3311 };
3312 #endif
3313
3314 static __net_init int rt_genid_init(struct net *net)
3315 {
3316         get_random_bytes(&net->ipv4.rt_genid,
3317                          sizeof(net->ipv4.rt_genid));
3318         get_random_bytes(&net->ipv4.dev_addr_genid,
3319                          sizeof(net->ipv4.dev_addr_genid));
3320         return 0;
3321 }
3322
3323 static __net_initdata struct pernet_operations rt_genid_ops = {
3324         .init = rt_genid_init,
3325 };
3326
3327 static int __net_init ipv4_inetpeer_init(struct net *net)
3328 {
3329         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3330
3331         if (!bp)
3332                 return -ENOMEM;
3333         inet_peer_base_init(bp);
3334         net->ipv4.peers = bp;
3335         return 0;
3336 }
3337
3338 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3339 {
3340         struct inet_peer_base *bp = net->ipv4.peers;
3341
3342         net->ipv4.peers = NULL;
3343         inetpeer_invalidate_tree(bp);
3344         kfree(bp);
3345 }
3346
3347 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3348         .init   =       ipv4_inetpeer_init,
3349         .exit   =       ipv4_inetpeer_exit,
3350 };
3351
3352 #ifdef CONFIG_IP_ROUTE_CLASSID
3353 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3354 #endif /* CONFIG_IP_ROUTE_CLASSID */
3355
3356 static __initdata unsigned long rhash_entries;
3357 static int __init set_rhash_entries(char *str)
3358 {
3359         ssize_t ret;
3360
3361         if (!str)
3362                 return 0;
3363
3364         ret = kstrtoul(str, 0, &rhash_entries);
3365         if (ret)
3366                 return 0;
3367
3368         return 1;
3369 }
3370 __setup("rhash_entries=", set_rhash_entries);
3371
3372 int __init ip_rt_init(void)
3373 {
3374         int rc = 0;
3375
3376 #ifdef CONFIG_IP_ROUTE_CLASSID
3377         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3378         if (!ip_rt_acct)
3379                 panic("IP: failed to allocate ip_rt_acct\n");
3380 #endif
3381
3382         ipv4_dst_ops.kmem_cachep =
3383                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3384                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3385
3386         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3387
3388         if (dst_entries_init(&ipv4_dst_ops) < 0)
3389                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3390
3391         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3392                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3393
3394         rt_hash_table = (struct rt_hash_bucket *)
3395                 alloc_large_system_hash("IP route cache",
3396                                         sizeof(struct rt_hash_bucket),
3397                                         rhash_entries,
3398                                         (totalram_pages >= 128 * 1024) ?
3399                                         15 : 17,
3400                                         0,
3401                                         &rt_hash_log,
3402                                         &rt_hash_mask,
3403                                         0,
3404                                         rhash_entries ? 0 : 512 * 1024);
3405         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3406         rt_hash_lock_init();
3407
3408         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3409         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3410
3411         devinet_init();
3412         ip_fib_init();
3413
3414         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3415         expires_ljiffies = jiffies;
3416         schedule_delayed_work(&expires_work,
3417                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3418
3419         if (ip_rt_proc_init())
3420                 pr_err("Unable to create route proc files\n");
3421 #ifdef CONFIG_XFRM
3422         xfrm_init();
3423         xfrm4_init(ip_rt_max_size);
3424 #endif
3425         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3426
3427 #ifdef CONFIG_SYSCTL
3428         register_pernet_subsys(&sysctl_route_ops);
3429 #endif
3430         register_pernet_subsys(&rt_genid_ops);
3431         register_pernet_subsys(&ipv4_inetpeer_ops);
3432         return rc;
3433 }
3434
3435 #ifdef CONFIG_SYSCTL
3436 /*
3437  * We really need to sanitize the damn ipv4 init order, then all
3438  * this nonsense will go away.
3439  */
3440 void __init ip_static_sysctl_init(void)
3441 {
3442         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3443 }
3444 #endif