net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/bootmem.h>
  74 #include <linux/string.h>
  75 #include <linux/socket.h>
  76 #include <linux/sockios.h>
  77 #include <linux/errno.h>
  78 #include <linux/in.h>
  79 #include <linux/inet.h>
  80 #include <linux/netdevice.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/init.h>
  83 #include <linux/workqueue.h>
  84 #include <linux/skbuff.h>
  85 #include <linux/inetdevice.h>
  86 #include <linux/igmp.h>
  87 #include <linux/pkt_sched.h>
  88 #include <linux/mroute.h>
  89 #include <linux/netfilter_ipv4.h>
  90 #include <linux/random.h>
  91 #include <linux/jhash.h>
  92 #include <linux/rcupdate.h>
  93 #include <linux/times.h>
  94 #include <linux/slab.h>
  95 #include <linux/prefetch.h>
  96 #include <net/dst.h>
  97 #include <net/net_namespace.h>
  98 #include <net/protocol.h>
  99 #include <net/ip.h>
 100 #include <net/route.h>
 101 #include <net/inetpeer.h>
 102 #include <net/sock.h>
 103 #include <net/ip_fib.h>
 104 #include <net/arp.h>
 105 #include <net/tcp.h>
 106 #include <net/icmp.h>
 107 #include <net/xfrm.h>
 108 #include <net/netevent.h>
 109 #include <net/rtnetlink.h>
 110 #ifdef CONFIG_SYSCTL
 111 #include <linux/sysctl.h>
 112 #include <linux/kmemleak.h>
 113 #endif
 114 #include <net/secure_seq.h>
 115
 116 #define RT_FL_TOS(oldflp4) \
 117         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 118
 119 #define IP_MAX_MTU      0xFFF0
 120
 121 #define RT_GC_TIMEOUT (300*HZ)
 122
 123 static int ip_rt_max_size;
 124 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 126 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 127 static int ip_rt_redirect_number __read_mostly  = 9;
 128 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 130 static int ip_rt_error_cost __read_mostly       = HZ;
 131 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 132 static int ip_rt_gc_elasticity __read_mostly    = 8;
 133 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 134 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 135 static int ip_rt_min_advmss __read_mostly       = 256;
 136 static int rt_chain_length_max __read_mostly    = 20;
 137
 138 static struct delayed_work expires_work;
 139 static unsigned long expires_ljiffies;
 140
 141 /*
 142  *      Interface to generic destination cache.
 143  */
 144
 145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 146 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 147 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 148 static void              ipv4_dst_destroy(struct dst_entry *dst);
 149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 150 static void              ipv4_link_failure(struct sk_buff *skb);
 151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 152 static int rt_garbage_collect(struct dst_ops *ops);
 153
 154 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 155                             int how)
 156 {
 157 }
 158
 159 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 160 {
 161         struct rtable *rt = (struct rtable *) dst;
 162         struct inet_peer *peer;
 163         u32 *p = NULL;
 164
 165         peer = rt_get_peer_create(rt, rt->rt_dst);
 166         if (peer) {
 167                 u32 *old_p = __DST_METRICS_PTR(old);
 168                 unsigned long prev, new;
 169
 170                 p = peer->metrics;
 171                 if (inet_metrics_new(peer))
 172                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 173
 174                 new = (unsigned long) p;
 175                 prev = cmpxchg(&dst->_metrics, old, new);
 176
 177                 if (prev != old) {
 178                         p = __DST_METRICS_PTR(prev);
 179                         if (prev & DST_METRICS_READ_ONLY)
 180                                 p = NULL;
 181                 } else {
 182                         if (rt->fi) {
 183                                 fib_info_put(rt->fi);
 184                                 rt->fi = NULL;
 185                         }
 186                 }
 187         }
 188         return p;
 189 }
 190
 191 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
 192
 193 static struct dst_ops ipv4_dst_ops = {
 194         .family =               AF_INET,
 195         .protocol =             cpu_to_be16(ETH_P_IP),
 196         .gc =                   rt_garbage_collect,
 197         .check =                ipv4_dst_check,
 198         .default_advmss =       ipv4_default_advmss,
 199         .mtu =                  ipv4_mtu,
 200         .cow_metrics =          ipv4_cow_metrics,
 201         .destroy =              ipv4_dst_destroy,
 202         .ifdown =               ipv4_dst_ifdown,
 203         .negative_advice =      ipv4_negative_advice,
 204         .link_failure =         ipv4_link_failure,
 205         .update_pmtu =          ip_rt_update_pmtu,
 206         .local_out =            __ip_local_out,
 207         .neigh_lookup =         ipv4_neigh_lookup,
 208 };
 209
 210 #define ECN_OR_COST(class)      TC_PRIO_##class
 211
 212 const __u8 ip_tos2prio[16] = {
 213         TC_PRIO_BESTEFFORT,
 214         ECN_OR_COST(BESTEFFORT),
 215         TC_PRIO_BESTEFFORT,
 216         ECN_OR_COST(BESTEFFORT),
 217         TC_PRIO_BULK,
 218         ECN_OR_COST(BULK),
 219         TC_PRIO_BULK,
 220         ECN_OR_COST(BULK),
 221         TC_PRIO_INTERACTIVE,
 222         ECN_OR_COST(INTERACTIVE),
 223         TC_PRIO_INTERACTIVE,
 224         ECN_OR_COST(INTERACTIVE),
 225         TC_PRIO_INTERACTIVE_BULK,
 226         ECN_OR_COST(INTERACTIVE_BULK),
 227         TC_PRIO_INTERACTIVE_BULK,
 228         ECN_OR_COST(INTERACTIVE_BULK)
 229 };
 230 EXPORT_SYMBOL(ip_tos2prio);
 231
 232 /*
 233  * Route cache.
 234  */
 235
 236 /* The locking scheme is rather straight forward:
 237  *
 238  * 1) Read-Copy Update protects the buckets of the central route hash.
 239  * 2) Only writers remove entries, and they hold the lock
 240  *    as they look at rtable reference counts.
 241  * 3) Only readers acquire references to rtable entries,
 242  *    they do so with atomic increments and with the
 243  *    lock held.
 244  */
 245
 246 struct rt_hash_bucket {
 247         struct rtable __rcu     *chain;
 248 };
 249
 250 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 251         defined(CONFIG_PROVE_LOCKING)
 252 /*
 253  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 254  * The size of this table is a power of two and depends on the number of CPUS.
 255  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 256  */
 257 #ifdef CONFIG_LOCKDEP
 258 # define RT_HASH_LOCK_SZ        256
 259 #else
 260 # if NR_CPUS >= 32
 261 #  define RT_HASH_LOCK_SZ       4096
 262 # elif NR_CPUS >= 16
 263 #  define RT_HASH_LOCK_SZ       2048
 264 # elif NR_CPUS >= 8
 265 #  define RT_HASH_LOCK_SZ       1024
 266 # elif NR_CPUS >= 4
 267 #  define RT_HASH_LOCK_SZ       512
 268 # else
 269 #  define RT_HASH_LOCK_SZ       256
 270 # endif
 271 #endif
 272
 273 static spinlock_t       *rt_hash_locks;
 274 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 275
 276 static __init void rt_hash_lock_init(void)
 277 {
 278         int i;
 279
 280         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 281                         GFP_KERNEL);
 282         if (!rt_hash_locks)
 283                 panic("IP: failed to allocate rt_hash_locks\n");
 284
 285         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 286                 spin_lock_init(&rt_hash_locks[i]);
 287 }
 288 #else
 289 # define rt_hash_lock_addr(slot) NULL
 290
 291 static inline void rt_hash_lock_init(void)
 292 {
 293 }
 294 #endif
 295
 296 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 297 static unsigned int             rt_hash_mask __read_mostly;
 298 static unsigned int             rt_hash_log  __read_mostly;
 299
 300 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 301 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 302
 303 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 304                                    int genid)
 305 {
 306         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 307                             idx, genid)
 308                 & rt_hash_mask;
 309 }
 310
 311 static inline int rt_genid(struct net *net)
 312 {
 313         return atomic_read(&net->ipv4.rt_genid);
 314 }
 315
 316 #ifdef CONFIG_PROC_FS
 317 struct rt_cache_iter_state {
 318         struct seq_net_private p;
 319         int bucket;
 320         int genid;
 321 };
 322
 323 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 324 {
 325         struct rt_cache_iter_state *st = seq->private;
 326         struct rtable *r = NULL;
 327
 328         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 329                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
 330                         continue;
 331                 rcu_read_lock_bh();
 332                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 333                 while (r) {
 334                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 335                             r->rt_genid == st->genid)
 336                                 return r;
 337                         r = rcu_dereference_bh(r->dst.rt_next);
 338                 }
 339                 rcu_read_unlock_bh();
 340         }
 341         return r;
 342 }
 343
 344 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 345                                           struct rtable *r)
 346 {
 347         struct rt_cache_iter_state *st = seq->private;
 348
 349         r = rcu_dereference_bh(r->dst.rt_next);
 350         while (!r) {
 351                 rcu_read_unlock_bh();
 352                 do {
 353                         if (--st->bucket < 0)
 354                                 return NULL;
 355                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
 356                 rcu_read_lock_bh();
 357                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 358         }
 359         return r;
 360 }
 361
 362 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 363                                         struct rtable *r)
 364 {
 365         struct rt_cache_iter_state *st = seq->private;
 366         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 367                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 368                         continue;
 369                 if (r->rt_genid == st->genid)
 370                         break;
 371         }
 372         return r;
 373 }
 374
 375 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 376 {
 377         struct rtable *r = rt_cache_get_first(seq);
 378
 379         if (r)
 380                 while (pos && (r = rt_cache_get_next(seq, r)))
 381                         --pos;
 382         return pos ? NULL : r;
 383 }
 384
 385 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 386 {
 387         struct rt_cache_iter_state *st = seq->private;
 388         if (*pos)
 389                 return rt_cache_get_idx(seq, *pos - 1);
 390         st->genid = rt_genid(seq_file_net(seq));
 391         return SEQ_START_TOKEN;
 392 }
 393
 394 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 395 {
 396         struct rtable *r;
 397
 398         if (v == SEQ_START_TOKEN)
 399                 r = rt_cache_get_first(seq);
 400         else
 401                 r = rt_cache_get_next(seq, v);
 402         ++*pos;
 403         return r;
 404 }
 405
 406 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 407 {
 408         if (v && v != SEQ_START_TOKEN)
 409                 rcu_read_unlock_bh();
 410 }
 411
 412 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 413 {
 414         if (v == SEQ_START_TOKEN)
 415                 seq_printf(seq, "%-127s\n",
 416                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 417                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 418                            "HHUptod\tSpecDst");
 419         else {
 420                 struct rtable *r = v;
 421                 int len;
 422
 423                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 424                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 425                         r->dst.dev ? r->dst.dev->name : "*",
 426                         (__force u32)r->rt_dst,
 427                         (__force u32)r->rt_gateway,
 428                         r->rt_flags, atomic_read(&r->dst.__refcnt),
 429                         r->dst.__use, 0, (__force u32)r->rt_src,
 430                         dst_metric_advmss(&r->dst) + 40,
 431                         dst_metric(&r->dst, RTAX_WINDOW),
 432                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 433                               dst_metric(&r->dst, RTAX_RTTVAR)),
 434                         r->rt_key_tos,
 435                         -1, 0, 0, &len);
 436
 437                 seq_printf(seq, "%*s\n", 127 - len, "");
 438         }
 439         return 0;
 440 }
 441
 442 static const struct seq_operations rt_cache_seq_ops = {
 443         .start  = rt_cache_seq_start,
 444         .next   = rt_cache_seq_next,
 445         .stop   = rt_cache_seq_stop,
 446         .show   = rt_cache_seq_show,
 447 };
 448
 449 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 450 {
 451         return seq_open_net(inode, file, &rt_cache_seq_ops,
 452                         sizeof(struct rt_cache_iter_state));
 453 }
 454
 455 static const struct file_operations rt_cache_seq_fops = {
 456         .owner   = THIS_MODULE,
 457         .open    = rt_cache_seq_open,
 458         .read    = seq_read,
 459         .llseek  = seq_lseek,
 460         .release = seq_release_net,
 461 };
 462
 463
 464 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 465 {
 466         int cpu;
 467
 468         if (*pos == 0)
 469                 return SEQ_START_TOKEN;
 470
 471         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 472                 if (!cpu_possible(cpu))
 473                         continue;
 474                 *pos = cpu+1;
 475                 return &per_cpu(rt_cache_stat, cpu);
 476         }
 477         return NULL;
 478 }
 479
 480 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 481 {
 482         int cpu;
 483
 484         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 485                 if (!cpu_possible(cpu))
 486                         continue;
 487                 *pos = cpu+1;
 488                 return &per_cpu(rt_cache_stat, cpu);
 489         }
 490         return NULL;
 491
 492 }
 493
 494 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 495 {
 496
 497 }
 498
 499 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 500 {
 501         struct rt_cache_stat *st = v;
 502
 503         if (v == SEQ_START_TOKEN) {
 504                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 505                 return 0;
 506         }
 507
 508         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 509                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 510                    dst_entries_get_slow(&ipv4_dst_ops),
 511                    st->in_hit,
 512                    st->in_slow_tot,
 513                    st->in_slow_mc,
 514                    st->in_no_route,
 515                    st->in_brd,
 516                    st->in_martian_dst,
 517                    st->in_martian_src,
 518
 519                    st->out_hit,
 520                    st->out_slow_tot,
 521                    st->out_slow_mc,
 522
 523                    st->gc_total,
 524                    st->gc_ignored,
 525                    st->gc_goal_miss,
 526                    st->gc_dst_overflow,
 527                    st->in_hlist_search,
 528                    st->out_hlist_search
 529                 );
 530         return 0;
 531 }
 532
 533 static const struct seq_operations rt_cpu_seq_ops = {
 534         .start  = rt_cpu_seq_start,
 535         .next   = rt_cpu_seq_next,
 536         .stop   = rt_cpu_seq_stop,
 537         .show   = rt_cpu_seq_show,
 538 };
 539
 540
 541 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 542 {
 543         return seq_open(file, &rt_cpu_seq_ops);
 544 }
 545
 546 static const struct file_operations rt_cpu_seq_fops = {
 547         .owner   = THIS_MODULE,
 548         .open    = rt_cpu_seq_open,
 549         .read    = seq_read,
 550         .llseek  = seq_lseek,
 551         .release = seq_release,
 552 };
 553
 554 #ifdef CONFIG_IP_ROUTE_CLASSID
 555 static int rt_acct_proc_show(struct seq_file *m, void *v)
 556 {
 557         struct ip_rt_acct *dst, *src;
 558         unsigned int i, j;
 559
 560         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 561         if (!dst)
 562                 return -ENOMEM;
 563
 564         for_each_possible_cpu(i) {
 565                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 566                 for (j = 0; j < 256; j++) {
 567                         dst[j].o_bytes   += src[j].o_bytes;
 568                         dst[j].o_packets += src[j].o_packets;
 569                         dst[j].i_bytes   += src[j].i_bytes;
 570                         dst[j].i_packets += src[j].i_packets;
 571                 }
 572         }
 573
 574         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 575         kfree(dst);
 576         return 0;
 577 }
 578
 579 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 580 {
 581         return single_open(file, rt_acct_proc_show, NULL);
 582 }
 583
 584 static const struct file_operations rt_acct_proc_fops = {
 585         .owner          = THIS_MODULE,
 586         .open           = rt_acct_proc_open,
 587         .read           = seq_read,
 588         .llseek         = seq_lseek,
 589         .release        = single_release,
 590 };
 591 #endif
 592
 593 static int __net_init ip_rt_do_proc_init(struct net *net)
 594 {
 595         struct proc_dir_entry *pde;
 596
 597         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 598                         &rt_cache_seq_fops);
 599         if (!pde)
 600                 goto err1;
 601
 602         pde = proc_create("rt_cache", S_IRUGO,
 603                           net->proc_net_stat, &rt_cpu_seq_fops);
 604         if (!pde)
 605                 goto err2;
 606
 607 #ifdef CONFIG_IP_ROUTE_CLASSID
 608         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 609         if (!pde)
 610                 goto err3;
 611 #endif
 612         return 0;
 613
 614 #ifdef CONFIG_IP_ROUTE_CLASSID
 615 err3:
 616         remove_proc_entry("rt_cache", net->proc_net_stat);
 617 #endif
 618 err2:
 619         remove_proc_entry("rt_cache", net->proc_net);
 620 err1:
 621         return -ENOMEM;
 622 }
 623
 624 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 625 {
 626         remove_proc_entry("rt_cache", net->proc_net_stat);
 627         remove_proc_entry("rt_cache", net->proc_net);
 628 #ifdef CONFIG_IP_ROUTE_CLASSID
 629         remove_proc_entry("rt_acct", net->proc_net);
 630 #endif
 631 }
 632
 633 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 634         .init = ip_rt_do_proc_init,
 635         .exit = ip_rt_do_proc_exit,
 636 };
 637
 638 static int __init ip_rt_proc_init(void)
 639 {
 640         return register_pernet_subsys(&ip_rt_proc_ops);
 641 }
 642
 643 #else
 644 static inline int ip_rt_proc_init(void)
 645 {
 646         return 0;
 647 }
 648 #endif /* CONFIG_PROC_FS */
 649
 650 static inline void rt_free(struct rtable *rt)
 651 {
 652         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 653 }
 654
 655 static inline void rt_drop(struct rtable *rt)
 656 {
 657         ip_rt_put(rt);
 658         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 659 }
 660
 661 static inline int rt_fast_clean(struct rtable *rth)
 662 {
 663         /* Kill broadcast/multicast entries very aggresively, if they
 664            collide in hash table with more useful entries */
 665         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 666                 rt_is_input_route(rth) && rth->dst.rt_next;
 667 }
 668
 669 static inline int rt_valuable(struct rtable *rth)
 670 {
 671         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 672                 (rt_has_peer(rth) && rt_peer_ptr(rth)->pmtu_expires);
 673 }
 674
 675 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 676 {
 677         unsigned long age;
 678         int ret = 0;
 679
 680         if (atomic_read(&rth->dst.__refcnt))
 681                 goto out;
 682
 683         age = jiffies - rth->dst.lastuse;
 684         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 685             (age <= tmo2 && rt_valuable(rth)))
 686                 goto out;
 687         ret = 1;
 688 out:    return ret;
 689 }
 690
 691 /* Bits of score are:
 692  * 31: very valuable
 693  * 30: not quite useless
 694  * 29..0: usage counter
 695  */
 696 static inline u32 rt_score(struct rtable *rt)
 697 {
 698         u32 score = jiffies - rt->dst.lastuse;
 699
 700         score = ~score & ~(3<<30);
 701
 702         if (rt_valuable(rt))
 703                 score |= (1<<31);
 704
 705         if (rt_is_output_route(rt) ||
 706             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 707                 score |= (1<<30);
 708
 709         return score;
 710 }
 711
 712 static inline bool rt_caching(const struct net *net)
 713 {
 714         return net->ipv4.current_rt_cache_rebuild_count <=
 715                 net->ipv4.sysctl_rt_cache_rebuild_count;
 716 }
 717
 718 static inline bool compare_hash_inputs(const struct rtable *rt1,
 719                                        const struct rtable *rt2)
 720 {
 721         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 722                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 723                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
 724 }
 725
 726 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 727 {
 728         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 729                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 730                 (rt1->rt_mark ^ rt2->rt_mark) |
 731                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
 732                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
 733                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
 734 }
 735
 736 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 737 {
 738         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 739 }
 740
 741 static inline int rt_is_expired(struct rtable *rth)
 742 {
 743         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 744 }
 745
 746 /*
 747  * Perform a full scan of hash table and free all entries.
 748  * Can be called by a softirq or a process.
 749  * In the later case, we want to be reschedule if necessary
 750  */
 751 static void rt_do_flush(struct net *net, int process_context)
 752 {
 753         unsigned int i;
 754         struct rtable *rth, *next;
 755
 756         for (i = 0; i <= rt_hash_mask; i++) {
 757                 struct rtable __rcu **pprev;
 758                 struct rtable *list;
 759
 760                 if (process_context && need_resched())
 761                         cond_resched();
 762                 rth = rcu_access_pointer(rt_hash_table[i].chain);
 763                 if (!rth)
 764                         continue;
 765
 766                 spin_lock_bh(rt_hash_lock_addr(i));
 767
 768                 list = NULL;
 769                 pprev = &rt_hash_table[i].chain;
 770                 rth = rcu_dereference_protected(*pprev,
 771                         lockdep_is_held(rt_hash_lock_addr(i)));
 772
 773                 while (rth) {
 774                         next = rcu_dereference_protected(rth->dst.rt_next,
 775                                 lockdep_is_held(rt_hash_lock_addr(i)));
 776
 777                         if (!net ||
 778                             net_eq(dev_net(rth->dst.dev), net)) {
 779                                 rcu_assign_pointer(*pprev, next);
 780                                 rcu_assign_pointer(rth->dst.rt_next, list);
 781                                 list = rth;
 782                         } else {
 783                                 pprev = &rth->dst.rt_next;
 784                         }
 785                         rth = next;
 786                 }
 787
 788                 spin_unlock_bh(rt_hash_lock_addr(i));
 789
 790                 for (; list; list = next) {
 791                         next = rcu_dereference_protected(list->dst.rt_next, 1);
 792                         rt_free(list);
 793                 }
 794         }
 795 }
 796
 797 /*
 798  * While freeing expired entries, we compute average chain length
 799  * and standard deviation, using fixed-point arithmetic.
 800  * This to have an estimation of rt_chain_length_max
 801  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 802  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 803  */
 804
 805 #define FRACT_BITS 3
 806 #define ONE (1UL << FRACT_BITS)
 807
 808 /*
 809  * Given a hash chain and an item in this hash chain,
 810  * find if a previous entry has the same hash_inputs
 811  * (but differs on tos, mark or oif)
 812  * Returns 0 if an alias is found.
 813  * Returns ONE if rth has no alias before itself.
 814  */
 815 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 816 {
 817         const struct rtable *aux = head;
 818
 819         while (aux != rth) {
 820                 if (compare_hash_inputs(aux, rth))
 821                         return 0;
 822                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 823         }
 824         return ONE;
 825 }
 826
 827 static void rt_check_expire(void)
 828 {
 829         static unsigned int rover;
 830         unsigned int i = rover, goal;
 831         struct rtable *rth;
 832         struct rtable __rcu **rthp;
 833         unsigned long samples = 0;
 834         unsigned long sum = 0, sum2 = 0;
 835         unsigned long delta;
 836         u64 mult;
 837
 838         delta = jiffies - expires_ljiffies;
 839         expires_ljiffies = jiffies;
 840         mult = ((u64)delta) << rt_hash_log;
 841         if (ip_rt_gc_timeout > 1)
 842                 do_div(mult, ip_rt_gc_timeout);
 843         goal = (unsigned int)mult;
 844         if (goal > rt_hash_mask)
 845                 goal = rt_hash_mask + 1;
 846         for (; goal > 0; goal--) {
 847                 unsigned long tmo = ip_rt_gc_timeout;
 848                 unsigned long length;
 849
 850                 i = (i + 1) & rt_hash_mask;
 851                 rthp = &rt_hash_table[i].chain;
 852
 853                 if (need_resched())
 854                         cond_resched();
 855
 856                 samples++;
 857
 858                 if (rcu_dereference_raw(*rthp) == NULL)
 859                         continue;
 860                 length = 0;
 861                 spin_lock_bh(rt_hash_lock_addr(i));
 862                 while ((rth = rcu_dereference_protected(*rthp,
 863                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
 864                         prefetch(rth->dst.rt_next);
 865                         if (rt_is_expired(rth) ||
 866                             rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 867                                 *rthp = rth->dst.rt_next;
 868                                 rt_free(rth);
 869                                 continue;
 870                         }
 871
 872                         /* We only count entries on a chain with equal
 873                          * hash inputs once so that entries for
 874                          * different QOS levels, and other non-hash
 875                          * input attributes don't unfairly skew the
 876                          * length computation
 877                          */
 878                         tmo >>= 1;
 879                         rthp = &rth->dst.rt_next;
 880                         length += has_noalias(rt_hash_table[i].chain, rth);
 881                 }
 882                 spin_unlock_bh(rt_hash_lock_addr(i));
 883                 sum += length;
 884                 sum2 += length*length;
 885         }
 886         if (samples) {
 887                 unsigned long avg = sum / samples;
 888                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 889                 rt_chain_length_max = max_t(unsigned long,
 890                                         ip_rt_gc_elasticity,
 891                                         (avg + 4*sd) >> FRACT_BITS);
 892         }
 893         rover = i;
 894 }
 895
 896 /*
 897  * rt_worker_func() is run in process context.
 898  * we call rt_check_expire() to scan part of the hash table
 899  */
 900 static void rt_worker_func(struct work_struct *work)
 901 {
 902         rt_check_expire();
 903         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 904 }
 905
 906 /*
 907  * Perturbation of rt_genid by a small quantity [1..256]
 908  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 909  * many times (2^24) without giving recent rt_genid.
 910  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 911  */
 912 static void rt_cache_invalidate(struct net *net)
 913 {
 914         unsigned char shuffle;
 915
 916         get_random_bytes(&shuffle, sizeof(shuffle));
 917         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 918         inetpeer_invalidate_family(AF_INET);
 919 }
 920
 921 /*
 922  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 923  * delay >= 0 : invalidate & flush cache (can be long)
 924  */
 925 void rt_cache_flush(struct net *net, int delay)
 926 {
 927         rt_cache_invalidate(net);
 928         if (delay >= 0)
 929                 rt_do_flush(net, !in_softirq());
 930 }
 931
 932 /* Flush previous cache invalidated entries from the cache */
 933 void rt_cache_flush_batch(struct net *net)
 934 {
 935         rt_do_flush(net, !in_softirq());
 936 }
 937
 938 static void rt_emergency_hash_rebuild(struct net *net)
 939 {
 940         net_warn_ratelimited("Route hash chain too long!\n");
 941         rt_cache_invalidate(net);
 942 }
 943
 944 /*
 945    Short description of GC goals.
 946
 947    We want to build algorithm, which will keep routing cache
 948    at some equilibrium point, when number of aged off entries
 949    is kept approximately equal to newly generated ones.
 950
 951    Current expiration strength is variable "expire".
 952    We try to adjust it dynamically, so that if networking
 953    is idle expires is large enough to keep enough of warm entries,
 954    and when load increases it reduces to limit cache size.
 955  */
 956
 957 static int rt_garbage_collect(struct dst_ops *ops)
 958 {
 959         static unsigned long expire = RT_GC_TIMEOUT;
 960         static unsigned long last_gc;
 961         static int rover;
 962         static int equilibrium;
 963         struct rtable *rth;
 964         struct rtable __rcu **rthp;
 965         unsigned long now = jiffies;
 966         int goal;
 967         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 968
 969         /*
 970          * Garbage collection is pretty expensive,
 971          * do not make it too frequently.
 972          */
 973
 974         RT_CACHE_STAT_INC(gc_total);
 975
 976         if (now - last_gc < ip_rt_gc_min_interval &&
 977             entries < ip_rt_max_size) {
 978                 RT_CACHE_STAT_INC(gc_ignored);
 979                 goto out;
 980         }
 981
 982         entries = dst_entries_get_slow(&ipv4_dst_ops);
 983         /* Calculate number of entries, which we want to expire now. */
 984         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
 985         if (goal <= 0) {
 986                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 987                         equilibrium = ipv4_dst_ops.gc_thresh;
 988                 goal = entries - equilibrium;
 989                 if (goal > 0) {
 990                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 991                         goal = entries - equilibrium;
 992                 }
 993         } else {
 994                 /* We are in dangerous area. Try to reduce cache really
 995                  * aggressively.
 996                  */
 997                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 998                 equilibrium = entries - goal;
 999         }
1000
1001         if (now - last_gc >= ip_rt_gc_min_interval)
1002                 last_gc = now;
1003
1004         if (goal <= 0) {
1005                 equilibrium += goal;
1006                 goto work_done;
1007         }
1008
1009         do {
1010                 int i, k;
1011
1012                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1013                         unsigned long tmo = expire;
1014
1015                         k = (k + 1) & rt_hash_mask;
1016                         rthp = &rt_hash_table[k].chain;
1017                         spin_lock_bh(rt_hash_lock_addr(k));
1018                         while ((rth = rcu_dereference_protected(*rthp,
1019                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1020                                 if (!rt_is_expired(rth) &&
1021                                         !rt_may_expire(rth, tmo, expire)) {
1022                                         tmo >>= 1;
1023                                         rthp = &rth->dst.rt_next;
1024                                         continue;
1025                                 }
1026                                 *rthp = rth->dst.rt_next;
1027                                 rt_free(rth);
1028                                 goal--;
1029                         }
1030                         spin_unlock_bh(rt_hash_lock_addr(k));
1031                         if (goal <= 0)
1032                                 break;
1033                 }
1034                 rover = k;
1035
1036                 if (goal <= 0)
1037                         goto work_done;
1038
1039                 /* Goal is not achieved. We stop process if:
1040
1041                    - if expire reduced to zero. Otherwise, expire is halfed.
1042                    - if table is not full.
1043                    - if we are called from interrupt.
1044                    - jiffies check is just fallback/debug loop breaker.
1045                      We will not spin here for long time in any case.
1046                  */
1047
1048                 RT_CACHE_STAT_INC(gc_goal_miss);
1049
1050                 if (expire == 0)
1051                         break;
1052
1053                 expire >>= 1;
1054
1055                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1056                         goto out;
1057         } while (!in_softirq() && time_before_eq(jiffies, now));
1058
1059         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1060                 goto out;
1061         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1062                 goto out;
1063         net_warn_ratelimited("dst cache overflow\n");
1064         RT_CACHE_STAT_INC(gc_dst_overflow);
1065         return 1;
1066
1067 work_done:
1068         expire += ip_rt_gc_min_interval;
1069         if (expire > ip_rt_gc_timeout ||
1070             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1071             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1072                 expire = ip_rt_gc_timeout;
1073 out:    return 0;
1074 }
1075
1076 /*
1077  * Returns number of entries in a hash chain that have different hash_inputs
1078  */
1079 static int slow_chain_length(const struct rtable *head)
1080 {
1081         int length = 0;
1082         const struct rtable *rth = head;
1083
1084         while (rth) {
1085                 length += has_noalias(head, rth);
1086                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1087         }
1088         return length >> FRACT_BITS;
1089 }
1090
1091 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1092 {
1093         struct net_device *dev = dst->dev;
1094         const __be32 *pkey = daddr;
1095         const struct rtable *rt;
1096         struct neighbour *n;
1097
1098         rt = (const struct rtable *) dst;
1099         if (rt->rt_gateway)
1100                 pkey = (const __be32 *) &rt->rt_gateway;
1101
1102         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1103         if (n)
1104                 return n;
1105         return neigh_create(&arp_tbl, pkey, dev);
1106 }
1107
1108 static int rt_bind_neighbour(struct rtable *rt)
1109 {
1110         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1111         if (IS_ERR(n))
1112                 return PTR_ERR(n);
1113         dst_set_neighbour(&rt->dst, n);
1114
1115         return 0;
1116 }
1117
1118 static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1119                                      struct sk_buff *skb, int ifindex)
1120 {
1121         struct rtable   *rth, *cand;
1122         struct rtable __rcu **rthp, **candp;
1123         unsigned long   now;
1124         u32             min_score;
1125         int             chain_length;
1126         int attempts = !in_softirq();
1127
1128 restart:
1129         chain_length = 0;
1130         min_score = ~(u32)0;
1131         cand = NULL;
1132         candp = NULL;
1133         now = jiffies;
1134
1135         if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
1136                 /*
1137                  * If we're not caching, just tell the caller we
1138                  * were successful and don't touch the route.  The
1139                  * caller hold the sole reference to the cache entry, and
1140                  * it will be released when the caller is done with it.
1141                  * If we drop it here, the callers have no way to resolve routes
1142                  * when we're not caching.  Instead, just point *rp at rt, so
1143                  * the caller gets a single use out of the route
1144                  * Note that we do rt_free on this new route entry, so that
1145                  * once its refcount hits zero, we are still able to reap it
1146                  * (Thanks Alexey)
1147                  * Note: To avoid expensive rcu stuff for this uncached dst,
1148                  * we set DST_NOCACHE so that dst_release() can free dst without
1149                  * waiting a grace period.
1150                  */
1151
1152                 rt->dst.flags |= DST_NOCACHE;
1153                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1154                         int err = rt_bind_neighbour(rt);
1155                         if (err) {
1156                                 net_warn_ratelimited("Neighbour table failure & not caching routes\n");
1157                                 ip_rt_put(rt);
1158                                 return ERR_PTR(err);
1159                         }
1160                 }
1161
1162                 goto skip_hashing;
1163         }
1164
1165         rthp = &rt_hash_table[hash].chain;
1166
1167         spin_lock_bh(rt_hash_lock_addr(hash));
1168         while ((rth = rcu_dereference_protected(*rthp,
1169                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1170                 if (rt_is_expired(rth)) {
1171                         *rthp = rth->dst.rt_next;
1172                         rt_free(rth);
1173                         continue;
1174                 }
1175                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1176                         /* Put it first */
1177                         *rthp = rth->dst.rt_next;
1178                         /*
1179                          * Since lookup is lockfree, the deletion
1180                          * must be visible to another weakly ordered CPU before
1181                          * the insertion at the start of the hash chain.
1182                          */
1183                         rcu_assign_pointer(rth->dst.rt_next,
1184                                            rt_hash_table[hash].chain);
1185                         /*
1186                          * Since lookup is lockfree, the update writes
1187                          * must be ordered for consistency on SMP.
1188                          */
1189                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1190
1191                         dst_use(&rth->dst, now);
1192                         spin_unlock_bh(rt_hash_lock_addr(hash));
1193
1194                         rt_drop(rt);
1195                         if (skb)
1196                                 skb_dst_set(skb, &rth->dst);
1197                         return rth;
1198                 }
1199
1200                 if (!atomic_read(&rth->dst.__refcnt)) {
1201                         u32 score = rt_score(rth);
1202
1203                         if (score <= min_score) {
1204                                 cand = rth;
1205                                 candp = rthp;
1206                                 min_score = score;
1207                         }
1208                 }
1209
1210                 chain_length++;
1211
1212                 rthp = &rth->dst.rt_next;
1213         }
1214
1215         if (cand) {
1216                 /* ip_rt_gc_elasticity used to be average length of chain
1217                  * length, when exceeded gc becomes really aggressive.
1218                  *
1219                  * The second limit is less certain. At the moment it allows
1220                  * only 2 entries per bucket. We will see.
1221                  */
1222                 if (chain_length > ip_rt_gc_elasticity) {
1223                         *candp = cand->dst.rt_next;
1224                         rt_free(cand);
1225                 }
1226         } else {
1227                 if (chain_length > rt_chain_length_max &&
1228                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1229                         struct net *net = dev_net(rt->dst.dev);
1230                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1231                         if (!rt_caching(net)) {
1232                                 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1233                                         rt->dst.dev->name, num);
1234                         }
1235                         rt_emergency_hash_rebuild(net);
1236                         spin_unlock_bh(rt_hash_lock_addr(hash));
1237
1238                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1239                                         ifindex, rt_genid(net));
1240                         goto restart;
1241                 }
1242         }
1243
1244         /* Try to bind route to arp only if it is output
1245            route or unicast forwarding path.
1246          */
1247         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1248                 int err = rt_bind_neighbour(rt);
1249                 if (err) {
1250                         spin_unlock_bh(rt_hash_lock_addr(hash));
1251
1252                         if (err != -ENOBUFS) {
1253                                 rt_drop(rt);
1254                                 return ERR_PTR(err);
1255                         }
1256
1257                         /* Neighbour tables are full and nothing
1258                            can be released. Try to shrink route cache,
1259                            it is most likely it holds some neighbour records.
1260                          */
1261                         if (attempts-- > 0) {
1262                                 int saved_elasticity = ip_rt_gc_elasticity;
1263                                 int saved_int = ip_rt_gc_min_interval;
1264                                 ip_rt_gc_elasticity     = 1;
1265                                 ip_rt_gc_min_interval   = 0;
1266                                 rt_garbage_collect(&ipv4_dst_ops);
1267                                 ip_rt_gc_min_interval   = saved_int;
1268                                 ip_rt_gc_elasticity     = saved_elasticity;
1269                                 goto restart;
1270                         }
1271
1272                         net_warn_ratelimited("Neighbour table overflow\n");
1273                         rt_drop(rt);
1274                         return ERR_PTR(-ENOBUFS);
1275                 }
1276         }
1277
1278         rt->dst.rt_next = rt_hash_table[hash].chain;
1279
1280         /*
1281          * Since lookup is lockfree, we must make sure
1282          * previous writes to rt are committed to memory
1283          * before making rt visible to other CPUS.
1284          */
1285         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1286
1287         spin_unlock_bh(rt_hash_lock_addr(hash));
1288
1289 skip_hashing:
1290         if (skb)
1291                 skb_dst_set(skb, &rt->dst);
1292         return rt;
1293 }
1294
1295 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1296
1297 static u32 rt_peer_genid(void)
1298 {
1299         return atomic_read(&__rt_peer_genid);
1300 }
1301
1302 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1303 {
1304         struct inet_peer_base *base;
1305         struct inet_peer *peer;
1306
1307         base = inetpeer_base_ptr(rt->_peer);
1308         if (!base)
1309                 return;
1310
1311         peer = inet_getpeer_v4(base, daddr, create);
1312         if (peer) {
1313                 if (!rt_set_peer(rt, peer))
1314                         inet_putpeer(peer);
1315                 else
1316                         rt->rt_peer_genid = rt_peer_genid();
1317         }
1318 }
1319
1320 /*
1321  * Peer allocation may fail only in serious out-of-memory conditions.  However
1322  * we still can generate some output.
1323  * Random ID selection looks a bit dangerous because we have no chances to
1324  * select ID being unique in a reasonable period of time.
1325  * But broken packet identifier may be better than no packet at all.
1326  */
1327 static void ip_select_fb_ident(struct iphdr *iph)
1328 {
1329         static DEFINE_SPINLOCK(ip_fb_id_lock);
1330         static u32 ip_fallback_id;
1331         u32 salt;
1332
1333         spin_lock_bh(&ip_fb_id_lock);
1334         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1335         iph->id = htons(salt & 0xFFFF);
1336         ip_fallback_id = salt;
1337         spin_unlock_bh(&ip_fb_id_lock);
1338 }
1339
1340 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1341 {
1342         struct rtable *rt = (struct rtable *) dst;
1343
1344         if (rt && !(rt->dst.flags & DST_NOPEER)) {
1345                 struct inet_peer *peer = rt_get_peer_create(rt, rt->rt_dst);
1346
1347                 /* If peer is attached to destination, it is never detached,
1348                    so that we need not to grab a lock to dereference it.
1349                  */
1350                 if (peer) {
1351                         iph->id = htons(inet_getid(peer, more));
1352                         return;
1353                 }
1354         } else if (!rt)
1355                 pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));
1356
1357         ip_select_fb_ident(iph);
1358 }
1359 EXPORT_SYMBOL(__ip_select_ident);
1360
1361 static void rt_del(unsigned int hash, struct rtable *rt)
1362 {
1363         struct rtable __rcu **rthp;
1364         struct rtable *aux;
1365
1366         rthp = &rt_hash_table[hash].chain;
1367         spin_lock_bh(rt_hash_lock_addr(hash));
1368         ip_rt_put(rt);
1369         while ((aux = rcu_dereference_protected(*rthp,
1370                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1371                 if (aux == rt || rt_is_expired(aux)) {
1372                         *rthp = aux->dst.rt_next;
1373                         rt_free(aux);
1374                         continue;
1375                 }
1376                 rthp = &aux->dst.rt_next;
1377         }
1378         spin_unlock_bh(rt_hash_lock_addr(hash));
1379 }
1380
1381 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1382 {
1383         struct rtable *rt = (struct rtable *) dst;
1384         __be32 orig_gw = rt->rt_gateway;
1385         struct neighbour *n, *old_n;
1386
1387         dst_confirm(&rt->dst);
1388
1389         rt->rt_gateway = peer->redirect_learned.a4;
1390
1391         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1392         if (IS_ERR(n)) {
1393                 rt->rt_gateway = orig_gw;
1394                 return;
1395         }
1396         old_n = xchg(&rt->dst._neighbour, n);
1397         if (old_n)
1398                 neigh_release(old_n);
1399         if (!(n->nud_state & NUD_VALID)) {
1400                 neigh_event_send(n, NULL);
1401         } else {
1402                 rt->rt_flags |= RTCF_REDIRECTED;
1403                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1404         }
1405 }
1406
1407 /* called in rcu_read_lock() section */
1408 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1409                     __be32 saddr, struct net_device *dev)
1410 {
1411         int s, i;
1412         struct in_device *in_dev = __in_dev_get_rcu(dev);
1413         __be32 skeys[2] = { saddr, 0 };
1414         int    ikeys[2] = { dev->ifindex, 0 };
1415         struct inet_peer *peer;
1416         struct net *net;
1417
1418         if (!in_dev)
1419                 return;
1420
1421         net = dev_net(dev);
1422         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1423             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1424             ipv4_is_zeronet(new_gw))
1425                 goto reject_redirect;
1426
1427         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1428                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1429                         goto reject_redirect;
1430                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1431                         goto reject_redirect;
1432         } else {
1433                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1434                         goto reject_redirect;
1435         }
1436
1437         for (s = 0; s < 2; s++) {
1438                 for (i = 0; i < 2; i++) {
1439                         unsigned int hash;
1440                         struct rtable __rcu **rthp;
1441                         struct rtable *rt;
1442
1443                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1444
1445                         rthp = &rt_hash_table[hash].chain;
1446
1447                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1448                                 rthp = &rt->dst.rt_next;
1449
1450                                 if (rt->rt_key_dst != daddr ||
1451                                     rt->rt_key_src != skeys[s] ||
1452                                     rt->rt_oif != ikeys[i] ||
1453                                     rt_is_input_route(rt) ||
1454                                     rt_is_expired(rt) ||
1455                                     !net_eq(dev_net(rt->dst.dev), net) ||
1456                                     rt->dst.error ||
1457                                     rt->dst.dev != dev ||
1458                                     rt->rt_gateway != old_gw)
1459                                         continue;
1460
1461                                 peer = rt_get_peer_create(rt, rt->rt_dst);
1462                                 if (peer) {
1463                                         if (peer->redirect_learned.a4 != new_gw) {
1464                                                 peer->redirect_learned.a4 = new_gw;
1465                                                 atomic_inc(&__rt_peer_genid);
1466                                         }
1467                                         check_peer_redir(&rt->dst, peer);
1468                                 }
1469                         }
1470                 }
1471         }
1472         return;
1473
1474 reject_redirect:
1475 #ifdef CONFIG_IP_ROUTE_VERBOSE
1476         if (IN_DEV_LOG_MARTIANS(in_dev))
1477                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1478                                      "  Advised path = %pI4 -> %pI4\n",
1479                                      &old_gw, dev->name, &new_gw,
1480                                      &saddr, &daddr);
1481 #endif
1482         ;
1483 }
1484
1485 static bool peer_pmtu_expired(struct inet_peer *peer)
1486 {
1487         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1488
1489         return orig &&
1490                time_after_eq(jiffies, orig) &&
1491                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1492 }
1493
1494 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1495 {
1496         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1497
1498         return orig &&
1499                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1500 }
1501
1502 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1503 {
1504         struct rtable *rt = (struct rtable *)dst;
1505         struct dst_entry *ret = dst;
1506
1507         if (rt) {
1508                 if (dst->obsolete > 0) {
1509                         ip_rt_put(rt);
1510                         ret = NULL;
1511                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1512                         unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1513                                                 rt->rt_oif,
1514                                                 rt_genid(dev_net(dst->dev)));
1515                         rt_del(hash, rt);
1516                         ret = NULL;
1517                 } else if (rt_has_peer(rt)) {
1518                         struct inet_peer *peer = rt_peer_ptr(rt);
1519                         if (peer_pmtu_expired(peer))
1520                                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1521                 }
1522         }
1523         return ret;
1524 }
1525
1526 /*
1527  * Algorithm:
1528  *      1. The first ip_rt_redirect_number redirects are sent
1529  *         with exponential backoff, then we stop sending them at all,
1530  *         assuming that the host ignores our redirects.
1531  *      2. If we did not see packets requiring redirects
1532  *         during ip_rt_redirect_silence, we assume that the host
1533  *         forgot redirected route and start to send redirects again.
1534  *
1535  * This algorithm is much cheaper and more intelligent than dumb load limiting
1536  * in icmp.c.
1537  *
1538  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1539  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1540  */
1541
1542 void ip_rt_send_redirect(struct sk_buff *skb)
1543 {
1544         struct rtable *rt = skb_rtable(skb);
1545         struct in_device *in_dev;
1546         struct inet_peer *peer;
1547         int log_martians;
1548
1549         rcu_read_lock();
1550         in_dev = __in_dev_get_rcu(rt->dst.dev);
1551         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1552                 rcu_read_unlock();
1553                 return;
1554         }
1555         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1556         rcu_read_unlock();
1557
1558         peer = rt_get_peer_create(rt, rt->rt_dst);
1559         if (!peer) {
1560                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1561                 return;
1562         }
1563
1564         /* No redirected packets during ip_rt_redirect_silence;
1565          * reset the algorithm.
1566          */
1567         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1568                 peer->rate_tokens = 0;
1569
1570         /* Too many ignored redirects; do not send anything
1571          * set dst.rate_last to the last seen redirected packet.
1572          */
1573         if (peer->rate_tokens >= ip_rt_redirect_number) {
1574                 peer->rate_last = jiffies;
1575                 return;
1576         }
1577
1578         /* Check for load limit; set rate_last to the latest sent
1579          * redirect.
1580          */
1581         if (peer->rate_tokens == 0 ||
1582             time_after(jiffies,
1583                        (peer->rate_last +
1584                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1585                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1586                 peer->rate_last = jiffies;
1587                 ++peer->rate_tokens;
1588 #ifdef CONFIG_IP_ROUTE_VERBOSE
1589                 if (log_martians &&
1590                     peer->rate_tokens == ip_rt_redirect_number)
1591                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1592                                              &ip_hdr(skb)->saddr, rt->rt_iif,
1593                                              &rt->rt_dst, &rt->rt_gateway);
1594 #endif
1595         }
1596 }
1597
1598 static int ip_error(struct sk_buff *skb)
1599 {
1600         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
1601         struct rtable *rt = skb_rtable(skb);
1602         struct inet_peer *peer;
1603         unsigned long now;
1604         struct net *net;
1605         bool send;
1606         int code;
1607
1608         net = dev_net(rt->dst.dev);
1609         if (!IN_DEV_FORWARD(in_dev)) {
1610                 switch (rt->dst.error) {
1611                 case EHOSTUNREACH:
1612                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
1613                         break;
1614
1615                 case ENETUNREACH:
1616                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1617                         break;
1618                 }
1619                 goto out;
1620         }
1621
1622         switch (rt->dst.error) {
1623         case EINVAL:
1624         default:
1625                 goto out;
1626         case EHOSTUNREACH:
1627                 code = ICMP_HOST_UNREACH;
1628                 break;
1629         case ENETUNREACH:
1630                 code = ICMP_NET_UNREACH;
1631                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1632                 break;
1633         case EACCES:
1634                 code = ICMP_PKT_FILTERED;
1635                 break;
1636         }
1637
1638         peer = rt_get_peer_create(rt, rt->rt_dst);
1639
1640         send = true;
1641         if (peer) {
1642                 now = jiffies;
1643                 peer->rate_tokens += now - peer->rate_last;
1644                 if (peer->rate_tokens > ip_rt_error_burst)
1645                         peer->rate_tokens = ip_rt_error_burst;
1646                 peer->rate_last = now;
1647                 if (peer->rate_tokens >= ip_rt_error_cost)
1648                         peer->rate_tokens -= ip_rt_error_cost;
1649                 else
1650                         send = false;
1651         }
1652         if (send)
1653                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1654
1655 out:    kfree_skb(skb);
1656         return 0;
1657 }
1658
1659 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1660 {
1661         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1662
1663         if (!expires)
1664                 return;
1665         if (time_before(jiffies, expires)) {
1666                 u32 orig_dst_mtu = dst_mtu(dst);
1667                 if (peer->pmtu_learned < orig_dst_mtu) {
1668                         if (!peer->pmtu_orig)
1669                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1670                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1671                 }
1672         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1673                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1674 }
1675
1676 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1677 {
1678         struct rtable *rt = (struct rtable *) dst;
1679         struct inet_peer *peer;
1680
1681         dst_confirm(dst);
1682
1683         peer = rt_get_peer_create(rt, rt->rt_dst);
1684         if (peer) {
1685                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1686
1687                 if (mtu < ip_rt_min_pmtu)
1688                         mtu = ip_rt_min_pmtu;
1689                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1690
1691                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1692                         if (!pmtu_expires)
1693                                 pmtu_expires = 1UL;
1694
1695                         peer->pmtu_learned = mtu;
1696                         peer->pmtu_expires = pmtu_expires;
1697
1698                         atomic_inc(&__rt_peer_genid);
1699                         rt->rt_peer_genid = rt_peer_genid();
1700                 }
1701                 check_peer_pmtu(dst, peer);
1702         }
1703 }
1704
1705 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1706                       int oif, u32 mark, u8 protocol, int flow_flags)
1707 {
1708         const struct iphdr *iph = (const struct iphdr *)skb->data;
1709         struct flowi4 fl4;
1710         struct rtable *rt;
1711
1712         flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
1713                            protocol, flow_flags | FLOWI_FLAG_PRECOW_METRICS,
1714                            iph->daddr, iph->saddr, 0, 0);
1715         rt = __ip_route_output_key(net, &fl4);
1716         if (!IS_ERR(rt)) {
1717                 ip_rt_update_pmtu(&rt->dst, mtu);
1718                 ip_rt_put(rt);
1719         }
1720 }
1721 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1722
1723 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1724 {
1725         const struct inet_sock *inet = inet_sk(sk);
1726
1727         return ipv4_update_pmtu(skb, sock_net(sk), mtu,
1728                                 sk->sk_bound_dev_if, sk->sk_mark,
1729                                 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1730                                 inet_sk_flowi_flags(sk));
1731 }
1732 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1733
1734 static void ipv4_validate_peer(struct rtable *rt)
1735 {
1736         if (rt->rt_peer_genid != rt_peer_genid()) {
1737                 struct inet_peer *peer = rt_get_peer(rt, rt->rt_dst);
1738
1739                 if (peer) {
1740                         check_peer_pmtu(&rt->dst, peer);
1741
1742                         if (peer->redirect_learned.a4 &&
1743                             peer->redirect_learned.a4 != rt->rt_gateway)
1744                                 check_peer_redir(&rt->dst, peer);
1745                 }
1746
1747                 rt->rt_peer_genid = rt_peer_genid();
1748         }
1749 }
1750
1751 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1752 {
1753         struct rtable *rt = (struct rtable *) dst;
1754
1755         if (rt_is_expired(rt))
1756                 return NULL;
1757         ipv4_validate_peer(rt);
1758         return dst;
1759 }
1760
1761 static void ipv4_dst_destroy(struct dst_entry *dst)
1762 {
1763         struct rtable *rt = (struct rtable *) dst;
1764
1765         if (rt->fi) {
1766                 fib_info_put(rt->fi);
1767                 rt->fi = NULL;
1768         }
1769         if (rt_has_peer(rt)) {
1770                 struct inet_peer *peer = rt_peer_ptr(rt);
1771                 inet_putpeer(peer);
1772         }
1773 }
1774
1775
1776 static void ipv4_link_failure(struct sk_buff *skb)
1777 {
1778         struct rtable *rt;
1779
1780         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1781
1782         rt = skb_rtable(skb);
1783         if (rt && rt_has_peer(rt)) {
1784                 struct inet_peer *peer = rt_peer_ptr(rt);
1785                 if (peer_pmtu_cleaned(peer))
1786                         dst_metric_set(&rt->dst, RTAX_MTU, peer->pmtu_orig);
1787         }
1788 }
1789
1790 static int ip_rt_bug(struct sk_buff *skb)
1791 {
1792         pr_debug("%s: %pI4 -> %pI4, %s\n",
1793                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1794                  skb->dev ? skb->dev->name : "?");
1795         kfree_skb(skb);
1796         WARN_ON(1);
1797         return 0;
1798 }
1799
1800 /*
1801    We do not cache source address of outgoing interface,
1802    because it is used only by IP RR, TS and SRR options,
1803    so that it out of fast path.
1804
1805    BTW remember: "addr" is allowed to be not aligned
1806    in IP options!
1807  */
1808
1809 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1810 {
1811         __be32 src;
1812
1813         if (rt_is_output_route(rt))
1814                 src = ip_hdr(skb)->saddr;
1815         else {
1816                 struct fib_result res;
1817                 struct flowi4 fl4;
1818                 struct iphdr *iph;
1819
1820                 iph = ip_hdr(skb);
1821
1822                 memset(&fl4, 0, sizeof(fl4));
1823                 fl4.daddr = iph->daddr;
1824                 fl4.saddr = iph->saddr;
1825                 fl4.flowi4_tos = RT_TOS(iph->tos);
1826                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1827                 fl4.flowi4_iif = skb->dev->ifindex;
1828                 fl4.flowi4_mark = skb->mark;
1829
1830                 rcu_read_lock();
1831                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1832                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1833                 else
1834                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1835                                         RT_SCOPE_UNIVERSE);
1836                 rcu_read_unlock();
1837         }
1838         memcpy(addr, &src, 4);
1839 }
1840
1841 #ifdef CONFIG_IP_ROUTE_CLASSID
1842 static void set_class_tag(struct rtable *rt, u32 tag)
1843 {
1844         if (!(rt->dst.tclassid & 0xFFFF))
1845                 rt->dst.tclassid |= tag & 0xFFFF;
1846         if (!(rt->dst.tclassid & 0xFFFF0000))
1847                 rt->dst.tclassid |= tag & 0xFFFF0000;
1848 }
1849 #endif
1850
1851 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1852 {
1853         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1854
1855         if (advmss == 0) {
1856                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1857                                ip_rt_min_advmss);
1858                 if (advmss > 65535 - 40)
1859                         advmss = 65535 - 40;
1860         }
1861         return advmss;
1862 }
1863
1864 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1865 {
1866         const struct rtable *rt = (const struct rtable *) dst;
1867         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1868
1869         if (mtu && rt_is_output_route(rt))
1870                 return mtu;
1871
1872         mtu = dst->dev->mtu;
1873
1874         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1875
1876                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1877                         mtu = 576;
1878         }
1879
1880         if (mtu > IP_MAX_MTU)
1881                 mtu = IP_MAX_MTU;
1882
1883         return mtu;
1884 }
1885
1886 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1887                             struct fib_info *fi)
1888 {
1889         struct inet_peer_base *base;
1890         struct inet_peer *peer;
1891         int create = 0;
1892
1893         /* If a peer entry exists for this destination, we must hook
1894          * it up in order to get at cached metrics.
1895          */
1896         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1897                 create = 1;
1898
1899         base = inetpeer_base_ptr(rt->_peer);
1900         BUG_ON(!base);
1901
1902         peer = inet_getpeer_v4(base, rt->rt_dst, create);
1903         if (peer) {
1904                 __rt_set_peer(rt, peer);
1905                 rt->rt_peer_genid = rt_peer_genid();
1906                 if (inet_metrics_new(peer))
1907                         memcpy(peer->metrics, fi->fib_metrics,
1908                                sizeof(u32) * RTAX_MAX);
1909                 dst_init_metrics(&rt->dst, peer->metrics, false);
1910
1911                 check_peer_pmtu(&rt->dst, peer);
1912
1913                 if (peer->redirect_learned.a4 &&
1914                     peer->redirect_learned.a4 != rt->rt_gateway) {
1915                         rt->rt_gateway = peer->redirect_learned.a4;
1916                         rt->rt_flags |= RTCF_REDIRECTED;
1917                 }
1918         } else {
1919                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1920                         rt->fi = fi;
1921                         atomic_inc(&fi->fib_clntref);
1922                 }
1923                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1924         }
1925 }
1926
1927 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1928                            const struct fib_result *res,
1929                            struct fib_info *fi, u16 type, u32 itag)
1930 {
1931         struct dst_entry *dst = &rt->dst;
1932
1933         if (fi) {
1934                 if (FIB_RES_GW(*res) &&
1935                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1936                         rt->rt_gateway = FIB_RES_GW(*res);
1937                 rt_init_metrics(rt, fl4, fi);
1938 #ifdef CONFIG_IP_ROUTE_CLASSID
1939                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1940 #endif
1941         }
1942
1943         if (dst_mtu(dst) > IP_MAX_MTU)
1944                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1945
1946 #ifdef CONFIG_IP_ROUTE_CLASSID
1947 #ifdef CONFIG_IP_MULTIPLE_TABLES
1948         set_class_tag(rt, fib_rules_tclass(res));
1949 #endif
1950         set_class_tag(rt, itag);
1951 #endif
1952 }
1953
1954 static struct rtable *rt_dst_alloc(struct net_device *dev,
1955                                    bool nopolicy, bool noxfrm)
1956 {
1957         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1958                          DST_HOST |
1959                          (nopolicy ? DST_NOPOLICY : 0) |
1960                          (noxfrm ? DST_NOXFRM : 0));
1961 }
1962
1963 /* called in rcu_read_lock() section */
1964 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1965                                 u8 tos, struct net_device *dev, int our)
1966 {
1967         unsigned int hash;
1968         struct rtable *rth;
1969         struct in_device *in_dev = __in_dev_get_rcu(dev);
1970         u32 itag = 0;
1971         int err;
1972
1973         /* Primary sanity checks. */
1974
1975         if (in_dev == NULL)
1976                 return -EINVAL;
1977
1978         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1979             skb->protocol != htons(ETH_P_IP))
1980                 goto e_inval;
1981
1982         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1983                 if (ipv4_is_loopback(saddr))
1984                         goto e_inval;
1985
1986         if (ipv4_is_zeronet(saddr)) {
1987                 if (!ipv4_is_local_multicast(daddr))
1988                         goto e_inval;
1989         } else {
1990                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1991                                           in_dev, &itag);
1992                 if (err < 0)
1993                         goto e_err;
1994         }
1995         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1996                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1997         if (!rth)
1998                 goto e_nobufs;
1999
2000 #ifdef CONFIG_IP_ROUTE_CLASSID
2001         rth->dst.tclassid = itag;
2002 #endif
2003         rth->dst.output = ip_rt_bug;
2004
2005         rth->rt_key_dst = daddr;
2006         rth->rt_key_src = saddr;
2007         rth->rt_genid   = rt_genid(dev_net(dev));
2008         rth->rt_flags   = RTCF_MULTICAST;
2009         rth->rt_type    = RTN_MULTICAST;
2010         rth->rt_key_tos = tos;
2011         rth->rt_dst     = daddr;
2012         rth->rt_src     = saddr;
2013         rth->rt_route_iif = dev->ifindex;
2014         rth->rt_iif     = dev->ifindex;
2015         rth->rt_oif     = 0;
2016         rth->rt_mark    = skb->mark;
2017         rth->rt_gateway = daddr;
2018         rth->rt_peer_genid = 0;
2019         rt_init_peer(rth, dev_net(dev)->ipv4.peers);
2020         rth->fi = NULL;
2021         if (our) {
2022                 rth->dst.input= ip_local_deliver;
2023                 rth->rt_flags |= RTCF_LOCAL;
2024         }
2025
2026 #ifdef CONFIG_IP_MROUTE
2027         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2028                 rth->dst.input = ip_mr_input;
2029 #endif
2030         RT_CACHE_STAT_INC(in_slow_mc);
2031
2032         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2033         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2034         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2035
2036 e_nobufs:
2037         return -ENOBUFS;
2038 e_inval:
2039         return -EINVAL;
2040 e_err:
2041         return err;
2042 }
2043
2044
2045 static void ip_handle_martian_source(struct net_device *dev,
2046                                      struct in_device *in_dev,
2047                                      struct sk_buff *skb,
2048                                      __be32 daddr,
2049                                      __be32 saddr)
2050 {
2051         RT_CACHE_STAT_INC(in_martian_src);
2052 #ifdef CONFIG_IP_ROUTE_VERBOSE
2053         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2054                 /*
2055                  *      RFC1812 recommendation, if source is martian,
2056                  *      the only hint is MAC header.
2057                  */
2058                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2059                         &daddr, &saddr, dev->name);
2060                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2061                         print_hex_dump(KERN_WARNING, "ll header: ",
2062                                        DUMP_PREFIX_OFFSET, 16, 1,
2063                                        skb_mac_header(skb),
2064                                        dev->hard_header_len, true);
2065                 }
2066         }
2067 #endif
2068 }
2069
2070 /* called in rcu_read_lock() section */
2071 static int __mkroute_input(struct sk_buff *skb,
2072                            const struct fib_result *res,
2073                            struct in_device *in_dev,
2074                            __be32 daddr, __be32 saddr, u32 tos,
2075                            struct rtable **result)
2076 {
2077         struct rtable *rth;
2078         int err;
2079         struct in_device *out_dev;
2080         unsigned int flags = 0;
2081         u32 itag;
2082
2083         /* get a working reference to the output device */
2084         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2085         if (out_dev == NULL) {
2086                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
2087                 return -EINVAL;
2088         }
2089
2090
2091         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2092                                   in_dev->dev, in_dev, &itag);
2093         if (err < 0) {
2094                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2095                                          saddr);
2096
2097                 goto cleanup;
2098         }
2099
2100         if (err)
2101                 flags |= RTCF_DIRECTSRC;
2102
2103         if (out_dev == in_dev && err &&
2104             (IN_DEV_SHARED_MEDIA(out_dev) ||
2105              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2106                 flags |= RTCF_DOREDIRECT;
2107
2108         if (skb->protocol != htons(ETH_P_IP)) {
2109                 /* Not IP (i.e. ARP). Do not create route, if it is
2110                  * invalid for proxy arp. DNAT routes are always valid.
2111                  *
2112                  * Proxy arp feature have been extended to allow, ARP
2113                  * replies back to the same interface, to support
2114                  * Private VLAN switch technologies. See arp.c.
2115                  */
2116                 if (out_dev == in_dev &&
2117                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2118                         err = -EINVAL;
2119                         goto cleanup;
2120                 }
2121         }
2122
2123         rth = rt_dst_alloc(out_dev->dev,
2124                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2125                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2126         if (!rth) {
2127                 err = -ENOBUFS;
2128                 goto cleanup;
2129         }
2130
2131         rth->rt_key_dst = daddr;
2132         rth->rt_key_src = saddr;
2133         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2134         rth->rt_flags = flags;
2135         rth->rt_type = res->type;
2136         rth->rt_key_tos = tos;
2137         rth->rt_dst     = daddr;
2138         rth->rt_src     = saddr;
2139         rth->rt_route_iif = in_dev->dev->ifindex;
2140         rth->rt_iif     = in_dev->dev->ifindex;
2141         rth->rt_oif     = 0;
2142         rth->rt_mark    = skb->mark;
2143         rth->rt_gateway = daddr;
2144         rth->rt_peer_genid = 0;
2145         rt_init_peer(rth, &res->table->tb_peers);
2146         rth->fi = NULL;
2147
2148         rth->dst.input = ip_forward;
2149         rth->dst.output = ip_output;
2150
2151         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2152
2153         *result = rth;
2154         err = 0;
2155  cleanup:
2156         return err;
2157 }
2158
2159 static int ip_mkroute_input(struct sk_buff *skb,
2160                             struct fib_result *res,
2161                             const struct flowi4 *fl4,
2162                             struct in_device *in_dev,
2163                             __be32 daddr, __be32 saddr, u32 tos)
2164 {
2165         struct rtable *rth = NULL;
2166         int err;
2167         unsigned int hash;
2168
2169 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2170         if (res->fi && res->fi->fib_nhs > 1)
2171                 fib_select_multipath(res);
2172 #endif
2173
2174         /* create a routing cache entry */
2175         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2176         if (err)
2177                 return err;
2178
2179         /* put it into the cache */
2180         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2181                        rt_genid(dev_net(rth->dst.dev)));
2182         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2183         if (IS_ERR(rth))
2184                 return PTR_ERR(rth);
2185         return 0;
2186 }
2187
2188 /*
2189  *      NOTE. We drop all the packets that has local source
2190  *      addresses, because every properly looped back packet
2191  *      must have correct destination already attached by output routine.
2192  *
2193  *      Such approach solves two big problems:
2194  *      1. Not simplex devices are handled properly.
2195  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2196  *      called with rcu_read_lock()
2197  */
2198
2199 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2200                                u8 tos, struct net_device *dev)
2201 {
2202         struct fib_result res;
2203         struct in_device *in_dev = __in_dev_get_rcu(dev);
2204         struct flowi4   fl4;
2205         unsigned int    flags = 0;
2206         u32             itag = 0;
2207         struct rtable   *rth;
2208         unsigned int    hash;
2209         int             err = -EINVAL;
2210         struct net    *net = dev_net(dev);
2211
2212         /* IP on this device is disabled. */
2213
2214         if (!in_dev)
2215                 goto out;
2216
2217         /* Check for the most weird martians, which can be not detected
2218            by fib_lookup.
2219          */
2220
2221         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2222                 goto martian_source;
2223
2224         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2225                 goto brd_input;
2226
2227         /* Accept zero addresses only to limited broadcast;
2228          * I even do not know to fix it or not. Waiting for complains :-)
2229          */
2230         if (ipv4_is_zeronet(saddr))
2231                 goto martian_source;
2232
2233         if (ipv4_is_zeronet(daddr))
2234                 goto martian_destination;
2235
2236         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
2237                 if (ipv4_is_loopback(daddr))
2238                         goto martian_destination;
2239
2240                 if (ipv4_is_loopback(saddr))
2241                         goto martian_source;
2242         }
2243
2244         /*
2245          *      Now we are ready to route packet.
2246          */
2247         fl4.flowi4_oif = 0;
2248         fl4.flowi4_iif = dev->ifindex;
2249         fl4.flowi4_mark = skb->mark;
2250         fl4.flowi4_tos = tos;
2251         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2252         fl4.daddr = daddr;
2253         fl4.saddr = saddr;
2254         err = fib_lookup(net, &fl4, &res);
2255         if (err != 0)
2256                 goto no_route;
2257
2258         RT_CACHE_STAT_INC(in_slow_tot);
2259
2260         if (res.type == RTN_BROADCAST)
2261                 goto brd_input;
2262
2263         if (res.type == RTN_LOCAL) {
2264                 err = fib_validate_source(skb, saddr, daddr, tos,
2265                                           net->loopback_dev->ifindex,
2266                                           dev, in_dev, &itag);
2267                 if (err < 0)
2268                         goto martian_source_keep_err;
2269                 if (err)
2270                         flags |= RTCF_DIRECTSRC;
2271                 goto local_input;
2272         }
2273
2274         if (!IN_DEV_FORWARD(in_dev))
2275                 goto no_route;
2276         if (res.type != RTN_UNICAST)
2277                 goto martian_destination;
2278
2279         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2280 out:    return err;
2281
2282 brd_input:
2283         if (skb->protocol != htons(ETH_P_IP))
2284                 goto e_inval;
2285
2286         if (!ipv4_is_zeronet(saddr)) {
2287                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2288                                           in_dev, &itag);
2289                 if (err < 0)
2290                         goto martian_source_keep_err;
2291                 if (err)
2292                         flags |= RTCF_DIRECTSRC;
2293         }
2294         flags |= RTCF_BROADCAST;
2295         res.type = RTN_BROADCAST;
2296         RT_CACHE_STAT_INC(in_brd);
2297
2298 local_input:
2299         rth = rt_dst_alloc(net->loopback_dev,
2300                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2301         if (!rth)
2302                 goto e_nobufs;
2303
2304         rth->dst.input= ip_local_deliver;
2305         rth->dst.output= ip_rt_bug;
2306 #ifdef CONFIG_IP_ROUTE_CLASSID
2307         rth->dst.tclassid = itag;
2308 #endif
2309
2310         rth->rt_key_dst = daddr;
2311         rth->rt_key_src = saddr;
2312         rth->rt_genid = rt_genid(net);
2313         rth->rt_flags   = flags|RTCF_LOCAL;
2314         rth->rt_type    = res.type;
2315         rth->rt_key_tos = tos;
2316         rth->rt_dst     = daddr;
2317         rth->rt_src     = saddr;
2318         rth->rt_route_iif = dev->ifindex;
2319         rth->rt_iif     = dev->ifindex;
2320         rth->rt_oif     = 0;
2321         rth->rt_mark    = skb->mark;
2322         rth->rt_gateway = daddr;
2323         rth->rt_peer_genid = 0;
2324         rt_init_peer(rth, net->ipv4.peers);
2325         rth->fi = NULL;
2326         if (res.type == RTN_UNREACHABLE) {
2327                 rth->dst.input= ip_error;
2328                 rth->dst.error= -err;
2329                 rth->rt_flags   &= ~RTCF_LOCAL;
2330         }
2331         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2332         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2333         err = 0;
2334         if (IS_ERR(rth))
2335                 err = PTR_ERR(rth);
2336         goto out;
2337
2338 no_route:
2339         RT_CACHE_STAT_INC(in_no_route);
2340         res.type = RTN_UNREACHABLE;
2341         if (err == -ESRCH)
2342                 err = -ENETUNREACH;
2343         goto local_input;
2344
2345         /*
2346          *      Do not cache martian addresses: they should be logged (RFC1812)
2347          */
2348 martian_destination:
2349         RT_CACHE_STAT_INC(in_martian_dst);
2350 #ifdef CONFIG_IP_ROUTE_VERBOSE
2351         if (IN_DEV_LOG_MARTIANS(in_dev))
2352                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2353                                      &daddr, &saddr, dev->name);
2354 #endif
2355
2356 e_inval:
2357         err = -EINVAL;
2358         goto out;
2359
2360 e_nobufs:
2361         err = -ENOBUFS;
2362         goto out;
2363
2364 martian_source:
2365         err = -EINVAL;
2366 martian_source_keep_err:
2367         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2368         goto out;
2369 }
2370
2371 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2372                            u8 tos, struct net_device *dev, bool noref)
2373 {
2374         struct rtable   *rth;
2375         unsigned int    hash;
2376         int iif = dev->ifindex;
2377         struct net *net;
2378         int res;
2379
2380         net = dev_net(dev);
2381
2382         rcu_read_lock();
2383
2384         if (!rt_caching(net))
2385                 goto skip_cache;
2386
2387         tos &= IPTOS_RT_MASK;
2388         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2389
2390         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2391              rth = rcu_dereference(rth->dst.rt_next)) {
2392                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2393                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2394                      (rth->rt_route_iif ^ iif) |
2395                      (rth->rt_key_tos ^ tos)) == 0 &&
2396                     rth->rt_mark == skb->mark &&
2397                     net_eq(dev_net(rth->dst.dev), net) &&
2398                     !rt_is_expired(rth)) {
2399                         ipv4_validate_peer(rth);
2400                         if (noref) {
2401                                 dst_use_noref(&rth->dst, jiffies);
2402                                 skb_dst_set_noref(skb, &rth->dst);
2403                         } else {
2404                                 dst_use(&rth->dst, jiffies);
2405                                 skb_dst_set(skb, &rth->dst);
2406                         }
2407                         RT_CACHE_STAT_INC(in_hit);
2408                         rcu_read_unlock();
2409                         return 0;
2410                 }
2411                 RT_CACHE_STAT_INC(in_hlist_search);
2412         }
2413
2414 skip_cache:
2415         /* Multicast recognition logic is moved from route cache to here.
2416            The problem was that too many Ethernet cards have broken/missing
2417            hardware multicast filters :-( As result the host on multicasting
2418            network acquires a lot of useless route cache entries, sort of
2419            SDR messages from all the world. Now we try to get rid of them.
2420            Really, provided software IP multicast filter is organized
2421            reasonably (at least, hashed), it does not result in a slowdown
2422            comparing with route cache reject entries.
2423            Note, that multicast routers are not affected, because
2424            route cache entry is created eventually.
2425          */
2426         if (ipv4_is_multicast(daddr)) {
2427                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2428
2429                 if (in_dev) {
2430                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2431                                                   ip_hdr(skb)->protocol);
2432                         if (our
2433 #ifdef CONFIG_IP_MROUTE
2434                                 ||
2435                             (!ipv4_is_local_multicast(daddr) &&
2436                              IN_DEV_MFORWARD(in_dev))
2437 #endif
2438                            ) {
2439                                 int res = ip_route_input_mc(skb, daddr, saddr,
2440                                                             tos, dev, our);
2441                                 rcu_read_unlock();
2442                                 return res;
2443                         }
2444                 }
2445                 rcu_read_unlock();
2446                 return -EINVAL;
2447         }
2448         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2449         rcu_read_unlock();
2450         return res;
2451 }
2452 EXPORT_SYMBOL(ip_route_input_common);
2453
2454 /* called with rcu_read_lock() */
2455 static struct rtable *__mkroute_output(const struct fib_result *res,
2456                                        const struct flowi4 *fl4,
2457                                        __be32 orig_daddr, __be32 orig_saddr,
2458                                        int orig_oif, __u8 orig_rtos,
2459                                        struct net_device *dev_out,
2460                                        unsigned int flags)
2461 {
2462         struct fib_info *fi = res->fi;
2463         struct in_device *in_dev;
2464         u16 type = res->type;
2465         struct rtable *rth;
2466
2467         in_dev = __in_dev_get_rcu(dev_out);
2468         if (!in_dev)
2469                 return ERR_PTR(-EINVAL);
2470
2471         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2472                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2473                         return ERR_PTR(-EINVAL);
2474
2475         if (ipv4_is_lbcast(fl4->daddr))
2476                 type = RTN_BROADCAST;
2477         else if (ipv4_is_multicast(fl4->daddr))
2478                 type = RTN_MULTICAST;
2479         else if (ipv4_is_zeronet(fl4->daddr))
2480                 return ERR_PTR(-EINVAL);
2481
2482         if (dev_out->flags & IFF_LOOPBACK)
2483                 flags |= RTCF_LOCAL;
2484
2485         if (type == RTN_BROADCAST) {
2486                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2487                 fi = NULL;
2488         } else if (type == RTN_MULTICAST) {
2489                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2490                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2491                                      fl4->flowi4_proto))
2492                         flags &= ~RTCF_LOCAL;
2493                 /* If multicast route do not exist use
2494                  * default one, but do not gateway in this case.
2495                  * Yes, it is hack.
2496                  */
2497                 if (fi && res->prefixlen < 4)
2498                         fi = NULL;
2499         }
2500
2501         rth = rt_dst_alloc(dev_out,
2502                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2503                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2504         if (!rth)
2505                 return ERR_PTR(-ENOBUFS);
2506
2507         rth->dst.output = ip_output;
2508
2509         rth->rt_key_dst = orig_daddr;
2510         rth->rt_key_src = orig_saddr;
2511         rth->rt_genid = rt_genid(dev_net(dev_out));
2512         rth->rt_flags   = flags;
2513         rth->rt_type    = type;
2514         rth->rt_key_tos = orig_rtos;
2515         rth->rt_dst     = fl4->daddr;
2516         rth->rt_src     = fl4->saddr;
2517         rth->rt_route_iif = 0;
2518         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2519         rth->rt_oif     = orig_oif;
2520         rth->rt_mark    = fl4->flowi4_mark;
2521         rth->rt_gateway = fl4->daddr;
2522         rth->rt_peer_genid = 0;
2523         rt_init_peer(rth, (res->table ?
2524                            &res->table->tb_peers :
2525                            dev_net(dev_out)->ipv4.peers));
2526         rth->fi = NULL;
2527
2528         RT_CACHE_STAT_INC(out_slow_tot);
2529
2530         if (flags & RTCF_LOCAL)
2531                 rth->dst.input = ip_local_deliver;
2532         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2533                 if (flags & RTCF_LOCAL &&
2534                     !(dev_out->flags & IFF_LOOPBACK)) {
2535                         rth->dst.output = ip_mc_output;
2536                         RT_CACHE_STAT_INC(out_slow_mc);
2537                 }
2538 #ifdef CONFIG_IP_MROUTE
2539                 if (type == RTN_MULTICAST) {
2540                         if (IN_DEV_MFORWARD(in_dev) &&
2541                             !ipv4_is_local_multicast(fl4->daddr)) {
2542                                 rth->dst.input = ip_mr_input;
2543                                 rth->dst.output = ip_mc_output;
2544                         }
2545                 }
2546 #endif
2547         }
2548
2549         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2550
2551         if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
2552                 rth->dst.flags |= DST_NOCACHE;
2553
2554         return rth;
2555 }
2556
2557 /*
2558  * Major route resolver routine.
2559  * called with rcu_read_lock();
2560  */
2561
2562 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2563 {
2564         struct net_device *dev_out = NULL;
2565         __u8 tos = RT_FL_TOS(fl4);
2566         unsigned int flags = 0;
2567         struct fib_result res;
2568         struct rtable *rth;
2569         __be32 orig_daddr;
2570         __be32 orig_saddr;
2571         int orig_oif;
2572
2573         res.fi          = NULL;
2574         res.table       = NULL;
2575 #ifdef CONFIG_IP_MULTIPLE_TABLES
2576         res.r           = NULL;
2577 #endif
2578
2579         orig_daddr = fl4->daddr;
2580         orig_saddr = fl4->saddr;
2581         orig_oif = fl4->flowi4_oif;
2582
2583         fl4->flowi4_iif = net->loopback_dev->ifindex;
2584         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2585         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2586                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2587
2588         rcu_read_lock();
2589         if (fl4->saddr) {
2590                 rth = ERR_PTR(-EINVAL);
2591                 if (ipv4_is_multicast(fl4->saddr) ||
2592                     ipv4_is_lbcast(fl4->saddr) ||
2593                     ipv4_is_zeronet(fl4->saddr))
2594                         goto out;
2595
2596                 /* I removed check for oif == dev_out->oif here.
2597                    It was wrong for two reasons:
2598                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2599                       is assigned to multiple interfaces.
2600                    2. Moreover, we are allowed to send packets with saddr
2601                       of another iface. --ANK
2602                  */
2603
2604                 if (fl4->flowi4_oif == 0 &&
2605                     (ipv4_is_multicast(fl4->daddr) ||
2606                      ipv4_is_lbcast(fl4->daddr))) {
2607                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2608                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2609                         if (dev_out == NULL)
2610                                 goto out;
2611
2612                         /* Special hack: user can direct multicasts
2613                            and limited broadcast via necessary interface
2614                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2615                            This hack is not just for fun, it allows
2616                            vic,vat and friends to work.
2617                            They bind socket to loopback, set ttl to zero
2618                            and expect that it will work.
2619                            From the viewpoint of routing cache they are broken,
2620                            because we are not allowed to build multicast path
2621                            with loopback source addr (look, routing cache
2622                            cannot know, that ttl is zero, so that packet
2623                            will not leave this host and route is valid).
2624                            Luckily, this hack is good workaround.
2625                          */
2626
2627                         fl4->flowi4_oif = dev_out->ifindex;
2628                         goto make_route;
2629                 }
2630
2631                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2632                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2633                         if (!__ip_dev_find(net, fl4->saddr, false))
2634                                 goto out;
2635                 }
2636         }
2637
2638
2639         if (fl4->flowi4_oif) {
2640                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2641                 rth = ERR_PTR(-ENODEV);
2642                 if (dev_out == NULL)
2643                         goto out;
2644
2645                 /* RACE: Check return value of inet_select_addr instead. */
2646                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2647                         rth = ERR_PTR(-ENETUNREACH);
2648                         goto out;
2649                 }
2650                 if (ipv4_is_local_multicast(fl4->daddr) ||
2651                     ipv4_is_lbcast(fl4->daddr)) {
2652                         if (!fl4->saddr)
2653                                 fl4->saddr = inet_select_addr(dev_out, 0,
2654                                                               RT_SCOPE_LINK);
2655                         goto make_route;
2656                 }
2657                 if (fl4->saddr) {
2658                         if (ipv4_is_multicast(fl4->daddr))
2659                                 fl4->saddr = inet_select_addr(dev_out, 0,
2660                                                               fl4->flowi4_scope);
2661                         else if (!fl4->daddr)
2662                                 fl4->saddr = inet_select_addr(dev_out, 0,
2663                                                               RT_SCOPE_HOST);
2664                 }
2665         }
2666
2667         if (!fl4->daddr) {
2668                 fl4->daddr = fl4->saddr;
2669                 if (!fl4->daddr)
2670                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2671                 dev_out = net->loopback_dev;
2672                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2673                 res.type = RTN_LOCAL;
2674                 flags |= RTCF_LOCAL;
2675                 goto make_route;
2676         }
2677
2678         if (fib_lookup(net, fl4, &res)) {
2679                 res.fi = NULL;
2680                 res.table = NULL;
2681                 if (fl4->flowi4_oif) {
2682                         /* Apparently, routing tables are wrong. Assume,
2683                            that the destination is on link.
2684
2685                            WHY? DW.
2686                            Because we are allowed to send to iface
2687                            even if it has NO routes and NO assigned
2688                            addresses. When oif is specified, routing
2689                            tables are looked up with only one purpose:
2690                            to catch if destination is gatewayed, rather than
2691                            direct. Moreover, if MSG_DONTROUTE is set,
2692                            we send packet, ignoring both routing tables
2693                            and ifaddr state. --ANK
2694
2695
2696                            We could make it even if oif is unknown,
2697                            likely IPv6, but we do not.
2698                          */
2699
2700                         if (fl4->saddr == 0)
2701                                 fl4->saddr = inet_select_addr(dev_out, 0,
2702                                                               RT_SCOPE_LINK);
2703                         res.type = RTN_UNICAST;
2704                         goto make_route;
2705                 }
2706                 rth = ERR_PTR(-ENETUNREACH);
2707                 goto out;
2708         }
2709
2710         if (res.type == RTN_LOCAL) {
2711                 if (!fl4->saddr) {
2712                         if (res.fi->fib_prefsrc)
2713                                 fl4->saddr = res.fi->fib_prefsrc;
2714                         else
2715                                 fl4->saddr = fl4->daddr;
2716                 }
2717                 dev_out = net->loopback_dev;
2718                 fl4->flowi4_oif = dev_out->ifindex;
2719                 res.fi = NULL;
2720                 flags |= RTCF_LOCAL;
2721                 goto make_route;
2722         }
2723
2724 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2725         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2726                 fib_select_multipath(&res);
2727         else
2728 #endif
2729         if (!res.prefixlen &&
2730             res.table->tb_num_default > 1 &&
2731             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2732                 fib_select_default(&res);
2733
2734         if (!fl4->saddr)
2735                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2736
2737         dev_out = FIB_RES_DEV(res);
2738         fl4->flowi4_oif = dev_out->ifindex;
2739
2740
2741 make_route:
2742         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2743                                tos, dev_out, flags);
2744         if (!IS_ERR(rth)) {
2745                 unsigned int hash;
2746
2747                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2748                                rt_genid(dev_net(dev_out)));
2749                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2750         }
2751
2752 out:
2753         rcu_read_unlock();
2754         return rth;
2755 }
2756
2757 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2758 {
2759         struct rtable *rth;
2760         unsigned int hash;
2761
2762         if (!rt_caching(net))
2763                 goto slow_output;
2764
2765         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2766
2767         rcu_read_lock_bh();
2768         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2769                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2770                 if (rth->rt_key_dst == flp4->daddr &&
2771                     rth->rt_key_src == flp4->saddr &&
2772                     rt_is_output_route(rth) &&
2773                     rth->rt_oif == flp4->flowi4_oif &&
2774                     rth->rt_mark == flp4->flowi4_mark &&
2775                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2776                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2777                     net_eq(dev_net(rth->dst.dev), net) &&
2778                     !rt_is_expired(rth)) {
2779                         ipv4_validate_peer(rth);
2780                         dst_use(&rth->dst, jiffies);
2781                         RT_CACHE_STAT_INC(out_hit);
2782                         rcu_read_unlock_bh();
2783                         if (!flp4->saddr)
2784                                 flp4->saddr = rth->rt_src;
2785                         if (!flp4->daddr)
2786                                 flp4->daddr = rth->rt_dst;
2787                         return rth;
2788                 }
2789                 RT_CACHE_STAT_INC(out_hlist_search);
2790         }
2791         rcu_read_unlock_bh();
2792
2793 slow_output:
2794         return ip_route_output_slow(net, flp4);
2795 }
2796 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2797
2798 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2799 {
2800         return NULL;
2801 }
2802
2803 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2804 {
2805         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2806
2807         return mtu ? : dst->dev->mtu;
2808 }
2809
2810 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2811 {
2812 }
2813
2814 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2815                                           unsigned long old)
2816 {
2817         return NULL;
2818 }
2819
2820 static struct dst_ops ipv4_dst_blackhole_ops = {
2821         .family                 =       AF_INET,
2822         .protocol               =       cpu_to_be16(ETH_P_IP),
2823         .destroy                =       ipv4_dst_destroy,
2824         .check                  =       ipv4_blackhole_dst_check,
2825         .mtu                    =       ipv4_blackhole_mtu,
2826         .default_advmss         =       ipv4_default_advmss,
2827         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2828         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2829         .neigh_lookup           =       ipv4_neigh_lookup,
2830 };
2831
2832 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2833 {
2834         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2835         struct rtable *ort = (struct rtable *) dst_orig;
2836
2837         if (rt) {
2838                 struct dst_entry *new = &rt->dst;
2839
2840                 new->__use = 1;
2841                 new->input = dst_discard;
2842                 new->output = dst_discard;
2843                 dst_copy_metrics(new, &ort->dst);
2844
2845                 new->dev = ort->dst.dev;
2846                 if (new->dev)
2847                         dev_hold(new->dev);
2848
2849                 rt->rt_key_dst = ort->rt_key_dst;
2850                 rt->rt_key_src = ort->rt_key_src;
2851                 rt->rt_key_tos = ort->rt_key_tos;
2852                 rt->rt_route_iif = ort->rt_route_iif;
2853                 rt->rt_iif = ort->rt_iif;
2854                 rt->rt_oif = ort->rt_oif;
2855                 rt->rt_mark = ort->rt_mark;
2856
2857                 rt->rt_genid = rt_genid(net);
2858                 rt->rt_flags = ort->rt_flags;
2859                 rt->rt_type = ort->rt_type;
2860                 rt->rt_dst = ort->rt_dst;
2861                 rt->rt_src = ort->rt_src;
2862                 rt->rt_gateway = ort->rt_gateway;
2863                 rt_transfer_peer(rt, ort);
2864                 rt->fi = ort->fi;
2865                 if (rt->fi)
2866                         atomic_inc(&rt->fi->fib_clntref);
2867
2868                 dst_free(new);
2869         }
2870
2871         dst_release(dst_orig);
2872
2873         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2874 }
2875
2876 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2877                                     struct sock *sk)
2878 {
2879         struct rtable *rt = __ip_route_output_key(net, flp4);
2880
2881         if (IS_ERR(rt))
2882                 return rt;
2883
2884         if (flp4->flowi4_proto)
2885                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2886                                                    flowi4_to_flowi(flp4),
2887                                                    sk, 0);
2888
2889         return rt;
2890 }
2891 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2892
2893 static int rt_fill_info(struct net *net,
2894                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2895                         int nowait, unsigned int flags)
2896 {
2897         struct rtable *rt = skb_rtable(skb);
2898         struct rtmsg *r;
2899         struct nlmsghdr *nlh;
2900         unsigned long expires = 0;
2901         u32 id = 0, ts = 0, tsage = 0, error;
2902
2903         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2904         if (nlh == NULL)
2905                 return -EMSGSIZE;
2906
2907         r = nlmsg_data(nlh);
2908         r->rtm_family    = AF_INET;
2909         r->rtm_dst_len  = 32;
2910         r->rtm_src_len  = 0;
2911         r->rtm_tos      = rt->rt_key_tos;
2912         r->rtm_table    = RT_TABLE_MAIN;
2913         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2914                 goto nla_put_failure;
2915         r->rtm_type     = rt->rt_type;
2916         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2917         r->rtm_protocol = RTPROT_UNSPEC;
2918         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2919         if (rt->rt_flags & RTCF_NOTIFY)
2920                 r->rtm_flags |= RTM_F_NOTIFY;
2921
2922         if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2923                 goto nla_put_failure;
2924         if (rt->rt_key_src) {
2925                 r->rtm_src_len = 32;
2926                 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2927                         goto nla_put_failure;
2928         }
2929         if (rt->dst.dev &&
2930             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2931                 goto nla_put_failure;
2932 #ifdef CONFIG_IP_ROUTE_CLASSID
2933         if (rt->dst.tclassid &&
2934             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2935                 goto nla_put_failure;
2936 #endif
2937         if (!rt_is_input_route(rt) &&
2938             rt->rt_src != rt->rt_key_src) {
2939                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2940                         goto nla_put_failure;
2941         }
2942         if (rt->rt_dst != rt->rt_gateway &&
2943             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2944                 goto nla_put_failure;
2945
2946         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2947                 goto nla_put_failure;
2948
2949         if (rt->rt_mark &&
2950             nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2951                 goto nla_put_failure;
2952
2953         error = rt->dst.error;
2954         if (rt_has_peer(rt)) {
2955                 const struct inet_peer *peer = rt_peer_ptr(rt);
2956                 inet_peer_refcheck(peer);
2957                 id = atomic_read(&peer->ip_id_count) & 0xffff;
2958                 if (peer->tcp_ts_stamp) {
2959                         ts = peer->tcp_ts;
2960                         tsage = get_seconds() - peer->tcp_ts_stamp;
2961                 }
2962                 expires = ACCESS_ONCE(peer->pmtu_expires);
2963                 if (expires) {
2964                         if (time_before(jiffies, expires))
2965                                 expires -= jiffies;
2966                         else
2967                                 expires = 0;
2968                 }
2969         }
2970
2971         if (rt_is_input_route(rt)) {
2972 #ifdef CONFIG_IP_MROUTE
2973                 __be32 dst = rt->rt_dst;
2974
2975                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2976                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2977                         int err = ipmr_get_route(net, skb,
2978                                                  rt->rt_src, rt->rt_dst,
2979                                                  r, nowait);
2980                         if (err <= 0) {
2981                                 if (!nowait) {
2982                                         if (err == 0)
2983                                                 return 0;
2984                                         goto nla_put_failure;
2985                                 } else {
2986                                         if (err == -EMSGSIZE)
2987                                                 goto nla_put_failure;
2988                                         error = err;
2989                                 }
2990                         }
2991                 } else
2992 #endif
2993                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2994                                 goto nla_put_failure;
2995         }
2996
2997         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2998                                expires, error) < 0)
2999                 goto nla_put_failure;
3000
3001         return nlmsg_end(skb, nlh);
3002
3003 nla_put_failure:
3004         nlmsg_cancel(skb, nlh);
3005         return -EMSGSIZE;
3006 }
3007
3008 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3009 {
3010         struct net *net = sock_net(in_skb->sk);
3011         struct rtmsg *rtm;
3012         struct nlattr *tb[RTA_MAX+1];
3013         struct rtable *rt = NULL;
3014         __be32 dst = 0;
3015         __be32 src = 0;
3016         u32 iif;
3017         int err;
3018         int mark;
3019         struct sk_buff *skb;
3020
3021         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3022         if (err < 0)
3023                 goto errout;
3024
3025         rtm = nlmsg_data(nlh);
3026
3027         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3028         if (skb == NULL) {
3029                 err = -ENOBUFS;
3030                 goto errout;
3031         }
3032
3033         /* Reserve room for dummy headers, this skb can pass
3034            through good chunk of routing engine.
3035          */
3036         skb_reset_mac_header(skb);
3037         skb_reset_network_header(skb);
3038
3039         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3040         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3041         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3042
3043         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3044         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3045         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3046         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3047
3048         if (iif) {
3049                 struct net_device *dev;
3050
3051                 dev = __dev_get_by_index(net, iif);
3052                 if (dev == NULL) {
3053                         err = -ENODEV;
3054                         goto errout_free;
3055                 }
3056
3057                 skb->protocol   = htons(ETH_P_IP);
3058                 skb->dev        = dev;
3059                 skb->mark       = mark;
3060                 local_bh_disable();
3061                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3062                 local_bh_enable();
3063
3064                 rt = skb_rtable(skb);
3065                 if (err == 0 && rt->dst.error)
3066                         err = -rt->dst.error;
3067         } else {
3068                 struct flowi4 fl4 = {
3069                         .daddr = dst,
3070                         .saddr = src,
3071                         .flowi4_tos = rtm->rtm_tos,
3072                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3073                         .flowi4_mark = mark,
3074                 };
3075                 rt = ip_route_output_key(net, &fl4);
3076
3077                 err = 0;
3078                 if (IS_ERR(rt))
3079                         err = PTR_ERR(rt);
3080         }
3081
3082         if (err)
3083                 goto errout_free;
3084
3085         skb_dst_set(skb, &rt->dst);
3086         if (rtm->rtm_flags & RTM_F_NOTIFY)
3087                 rt->rt_flags |= RTCF_NOTIFY;
3088
3089         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3090                            RTM_NEWROUTE, 0, 0);
3091         if (err <= 0)
3092                 goto errout_free;
3093
3094         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3095 errout:
3096         return err;
3097
3098 errout_free:
3099         kfree_skb(skb);
3100         goto errout;
3101 }
3102
3103 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3104 {
3105         struct rtable *rt;
3106         int h, s_h;
3107         int idx, s_idx;
3108         struct net *net;
3109
3110         net = sock_net(skb->sk);
3111
3112         s_h = cb->args[0];
3113         if (s_h < 0)
3114                 s_h = 0;
3115         s_idx = idx = cb->args[1];
3116         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3117                 if (!rt_hash_table[h].chain)
3118                         continue;
3119                 rcu_read_lock_bh();
3120                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3121                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3122                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3123                                 continue;
3124                         if (rt_is_expired(rt))
3125                                 continue;
3126                         skb_dst_set_noref(skb, &rt->dst);
3127                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3128                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3129                                          1, NLM_F_MULTI) <= 0) {
3130                                 skb_dst_drop(skb);
3131                                 rcu_read_unlock_bh();
3132                                 goto done;
3133                         }
3134                         skb_dst_drop(skb);
3135                 }
3136                 rcu_read_unlock_bh();
3137         }
3138
3139 done:
3140         cb->args[0] = h;
3141         cb->args[1] = idx;
3142         return skb->len;
3143 }
3144
3145 void ip_rt_multicast_event(struct in_device *in_dev)
3146 {
3147         rt_cache_flush(dev_net(in_dev->dev), 0);
3148 }
3149
3150 #ifdef CONFIG_SYSCTL
3151 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3152                                         void __user *buffer,
3153                                         size_t *lenp, loff_t *ppos)
3154 {
3155         if (write) {
3156                 int flush_delay;
3157                 ctl_table ctl;
3158                 struct net *net;
3159
3160                 memcpy(&ctl, __ctl, sizeof(ctl));
3161                 ctl.data = &flush_delay;
3162                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3163
3164                 net = (struct net *)__ctl->extra1;
3165                 rt_cache_flush(net, flush_delay);
3166                 return 0;
3167         }
3168
3169         return -EINVAL;
3170 }
3171
3172 static ctl_table ipv4_route_table[] = {
3173         {
3174                 .procname       = "gc_thresh",
3175                 .data           = &ipv4_dst_ops.gc_thresh,
3176                 .maxlen         = sizeof(int),
3177                 .mode           = 0644,
3178                 .proc_handler   = proc_dointvec,
3179         },
3180         {
3181                 .procname       = "max_size",
3182                 .data           = &ip_rt_max_size,
3183                 .maxlen         = sizeof(int),
3184                 .mode           = 0644,
3185                 .proc_handler   = proc_dointvec,
3186         },
3187         {
3188                 /*  Deprecated. Use gc_min_interval_ms */
3189
3190                 .procname       = "gc_min_interval",
3191                 .data           = &ip_rt_gc_min_interval,
3192                 .maxlen         = sizeof(int),
3193                 .mode           = 0644,
3194                 .proc_handler   = proc_dointvec_jiffies,
3195         },
3196         {
3197                 .procname       = "gc_min_interval_ms",
3198                 .data           = &ip_rt_gc_min_interval,
3199                 .maxlen         = sizeof(int),
3200                 .mode           = 0644,
3201                 .proc_handler   = proc_dointvec_ms_jiffies,
3202         },
3203         {
3204                 .procname       = "gc_timeout",
3205                 .data           = &ip_rt_gc_timeout,
3206                 .maxlen         = sizeof(int),
3207                 .mode           = 0644,
3208                 .proc_handler   = proc_dointvec_jiffies,
3209         },
3210         {
3211                 .procname       = "gc_interval",
3212                 .data           = &ip_rt_gc_interval,
3213                 .maxlen         = sizeof(int),
3214                 .mode           = 0644,
3215                 .proc_handler   = proc_dointvec_jiffies,
3216         },
3217         {
3218                 .procname       = "redirect_load",
3219                 .data           = &ip_rt_redirect_load,
3220                 .maxlen         = sizeof(int),
3221                 .mode           = 0644,
3222                 .proc_handler   = proc_dointvec,
3223         },
3224         {
3225                 .procname       = "redirect_number",
3226                 .data           = &ip_rt_redirect_number,
3227                 .maxlen         = sizeof(int),
3228                 .mode           = 0644,
3229                 .proc_handler   = proc_dointvec,
3230         },
3231         {
3232                 .procname       = "redirect_silence",
3233                 .data           = &ip_rt_redirect_silence,
3234                 .maxlen         = sizeof(int),
3235                 .mode           = 0644,
3236                 .proc_handler   = proc_dointvec,
3237         },
3238         {
3239                 .procname       = "error_cost",
3240                 .data           = &ip_rt_error_cost,
3241                 .maxlen         = sizeof(int),
3242                 .mode           = 0644,
3243                 .proc_handler   = proc_dointvec,
3244         },
3245         {
3246                 .procname       = "error_burst",
3247                 .data           = &ip_rt_error_burst,
3248                 .maxlen         = sizeof(int),
3249                 .mode           = 0644,
3250                 .proc_handler   = proc_dointvec,
3251         },
3252         {
3253                 .procname       = "gc_elasticity",
3254                 .data           = &ip_rt_gc_elasticity,
3255                 .maxlen         = sizeof(int),
3256                 .mode           = 0644,
3257                 .proc_handler   = proc_dointvec,
3258         },
3259         {
3260                 .procname       = "mtu_expires",
3261                 .data           = &ip_rt_mtu_expires,
3262                 .maxlen         = sizeof(int),
3263                 .mode           = 0644,
3264                 .proc_handler   = proc_dointvec_jiffies,
3265         },
3266         {
3267                 .procname       = "min_pmtu",
3268                 .data           = &ip_rt_min_pmtu,
3269                 .maxlen         = sizeof(int),
3270                 .mode           = 0644,
3271                 .proc_handler   = proc_dointvec,
3272         },
3273         {
3274                 .procname       = "min_adv_mss",
3275                 .data           = &ip_rt_min_advmss,
3276                 .maxlen         = sizeof(int),
3277                 .mode           = 0644,
3278                 .proc_handler   = proc_dointvec,
3279         },
3280         { }
3281 };
3282
3283 static struct ctl_table ipv4_route_flush_table[] = {
3284         {
3285                 .procname       = "flush",
3286                 .maxlen         = sizeof(int),
3287                 .mode           = 0200,
3288                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3289         },
3290         { },
3291 };
3292
3293 static __net_init int sysctl_route_net_init(struct net *net)
3294 {
3295         struct ctl_table *tbl;
3296
3297         tbl = ipv4_route_flush_table;
3298         if (!net_eq(net, &init_net)) {
3299                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3300                 if (tbl == NULL)
3301                         goto err_dup;
3302         }
3303         tbl[0].extra1 = net;
3304
3305         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3306         if (net->ipv4.route_hdr == NULL)
3307                 goto err_reg;
3308         return 0;
3309
3310 err_reg:
3311         if (tbl != ipv4_route_flush_table)
3312                 kfree(tbl);
3313 err_dup:
3314         return -ENOMEM;
3315 }
3316
3317 static __net_exit void sysctl_route_net_exit(struct net *net)
3318 {
3319         struct ctl_table *tbl;
3320
3321         tbl = net->ipv4.route_hdr->ctl_table_arg;
3322         unregister_net_sysctl_table(net->ipv4.route_hdr);
3323         BUG_ON(tbl == ipv4_route_flush_table);
3324         kfree(tbl);
3325 }
3326
3327 static __net_initdata struct pernet_operations sysctl_route_ops = {
3328         .init = sysctl_route_net_init,
3329         .exit = sysctl_route_net_exit,
3330 };
3331 #endif
3332
3333 static __net_init int rt_genid_init(struct net *net)
3334 {
3335         get_random_bytes(&net->ipv4.rt_genid,
3336                          sizeof(net->ipv4.rt_genid));
3337         get_random_bytes(&net->ipv4.dev_addr_genid,
3338                          sizeof(net->ipv4.dev_addr_genid));
3339         return 0;
3340 }
3341
3342 static __net_initdata struct pernet_operations rt_genid_ops = {
3343         .init = rt_genid_init,
3344 };
3345
3346 static int __net_init ipv4_inetpeer_init(struct net *net)
3347 {
3348         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3349
3350         if (!bp)
3351                 return -ENOMEM;
3352         inet_peer_base_init(bp);
3353         net->ipv4.peers = bp;
3354         return 0;
3355 }
3356
3357 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3358 {
3359         struct inet_peer_base *bp = net->ipv4.peers;
3360
3361         net->ipv4.peers = NULL;
3362         inetpeer_invalidate_tree(bp);
3363         kfree(bp);
3364 }
3365
3366 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3367         .init   =       ipv4_inetpeer_init,
3368         .exit   =       ipv4_inetpeer_exit,
3369 };
3370
3371 #ifdef CONFIG_IP_ROUTE_CLASSID
3372 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3373 #endif /* CONFIG_IP_ROUTE_CLASSID */
3374
3375 static __initdata unsigned long rhash_entries;
3376 static int __init set_rhash_entries(char *str)
3377 {
3378         ssize_t ret;
3379
3380         if (!str)
3381                 return 0;
3382
3383         ret = kstrtoul(str, 0, &rhash_entries);
3384         if (ret)
3385                 return 0;
3386
3387         return 1;
3388 }
3389 __setup("rhash_entries=", set_rhash_entries);
3390
3391 int __init ip_rt_init(void)
3392 {
3393         int rc = 0;
3394
3395 #ifdef CONFIG_IP_ROUTE_CLASSID
3396         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3397         if (!ip_rt_acct)
3398                 panic("IP: failed to allocate ip_rt_acct\n");
3399 #endif
3400
3401         ipv4_dst_ops.kmem_cachep =
3402                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3403                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3404
3405         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3406
3407         if (dst_entries_init(&ipv4_dst_ops) < 0)
3408                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3409
3410         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3411                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3412
3413         rt_hash_table = (struct rt_hash_bucket *)
3414                 alloc_large_system_hash("IP route cache",
3415                                         sizeof(struct rt_hash_bucket),
3416                                         rhash_entries,
3417                                         (totalram_pages >= 128 * 1024) ?
3418                                         15 : 17,
3419                                         0,
3420                                         &rt_hash_log,
3421                                         &rt_hash_mask,
3422                                         0,
3423                                         rhash_entries ? 0 : 512 * 1024);
3424         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3425         rt_hash_lock_init();
3426
3427         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3428         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3429
3430         devinet_init();
3431         ip_fib_init();
3432
3433         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3434         expires_ljiffies = jiffies;
3435         schedule_delayed_work(&expires_work,
3436                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3437
3438         if (ip_rt_proc_init())
3439                 pr_err("Unable to create route proc files\n");
3440 #ifdef CONFIG_XFRM
3441         xfrm_init();
3442         xfrm4_init(ip_rt_max_size);
3443 #endif
3444         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3445
3446 #ifdef CONFIG_SYSCTL
3447         register_pernet_subsys(&sysctl_route_ops);
3448 #endif
3449         register_pernet_subsys(&rt_genid_ops);
3450         register_pernet_subsys(&ipv4_inetpeer_ops);
3451         return rc;
3452 }
3453
3454 #ifdef CONFIG_SYSCTL
3455 /*
3456  * We really need to sanitize the damn ipv4 init order, then all
3457  * this nonsense will go away.
3458  */
3459 void __init ip_static_sysctl_init(void)
3460 {
3461         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3462 }
3463 #endif