net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/skbuff.h>
  83 #include <linux/inetdevice.h>
  84 #include <linux/igmp.h>
  85 #include <linux/pkt_sched.h>
  86 #include <linux/mroute.h>
  87 #include <linux/netfilter_ipv4.h>
  88 #include <linux/random.h>
  89 #include <linux/rcupdate.h>
  90 #include <linux/times.h>
  91 #include <linux/slab.h>
  92 #include <linux/jhash.h>
  93 #include <net/dst.h>
  94 #include <net/dst_metadata.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/lwtunnel.h>
 107 #include <net/netevent.h>
 108 #include <net/rtnetlink.h>
 109 #ifdef CONFIG_SYSCTL
 110 #include <linux/sysctl.h>
 111 #endif
 112 #include <net/secure_seq.h>
 113 #include <net/ip_tunnels.h>
 114 #include <net/l3mdev.h>
 115
 116 #include "fib_lookup.h"
 117
 118 #define RT_FL_TOS(oldflp4) \
 119         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 120
 121 #define RT_GC_TIMEOUT (300*HZ)
 122
 123 static int ip_rt_max_size;
 124 static int ip_rt_redirect_number __read_mostly  = 9;
 125 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 127 static int ip_rt_error_cost __read_mostly       = HZ;
 128 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 130 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 131 static int ip_rt_min_advmss __read_mostly       = 256;
 132
 133 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 134
 135 /*
 136  *      Interface to generic destination cache.
 137  */
 138
 139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 140 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 141 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 142 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 143 static void              ipv4_link_failure(struct sk_buff *skb);
 144 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 145                                            struct sk_buff *skb, u32 mtu);
 146 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 147                                         struct sk_buff *skb);
 148 static void             ipv4_dst_destroy(struct dst_entry *dst);
 149
 150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 151 {
 152         WARN_ON(1);
 153         return NULL;
 154 }
 155
 156 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 157                                            struct sk_buff *skb,
 158                                            const void *daddr);
 159 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 160
 161 static struct dst_ops ipv4_dst_ops = {
 162         .family =               AF_INET,
 163         .check =                ipv4_dst_check,
 164         .default_advmss =       ipv4_default_advmss,
 165         .mtu =                  ipv4_mtu,
 166         .cow_metrics =          ipv4_cow_metrics,
 167         .destroy =              ipv4_dst_destroy,
 168         .negative_advice =      ipv4_negative_advice,
 169         .link_failure =         ipv4_link_failure,
 170         .update_pmtu =          ip_rt_update_pmtu,
 171         .redirect =             ip_do_redirect,
 172         .local_out =            __ip_local_out,
 173         .neigh_lookup =         ipv4_neigh_lookup,
 174         .confirm_neigh =        ipv4_confirm_neigh,
 175 };
 176
 177 #define ECN_OR_COST(class)      TC_PRIO_##class
 178
 179 const __u8 ip_tos2prio[16] = {
 180         TC_PRIO_BESTEFFORT,
 181         ECN_OR_COST(BESTEFFORT),
 182         TC_PRIO_BESTEFFORT,
 183         ECN_OR_COST(BESTEFFORT),
 184         TC_PRIO_BULK,
 185         ECN_OR_COST(BULK),
 186         TC_PRIO_BULK,
 187         ECN_OR_COST(BULK),
 188         TC_PRIO_INTERACTIVE,
 189         ECN_OR_COST(INTERACTIVE),
 190         TC_PRIO_INTERACTIVE,
 191         ECN_OR_COST(INTERACTIVE),
 192         TC_PRIO_INTERACTIVE_BULK,
 193         ECN_OR_COST(INTERACTIVE_BULK),
 194         TC_PRIO_INTERACTIVE_BULK,
 195         ECN_OR_COST(INTERACTIVE_BULK)
 196 };
 197 EXPORT_SYMBOL(ip_tos2prio);
 198
 199 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 200 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 201
 202 #ifdef CONFIG_PROC_FS
 203 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 204 {
 205         if (*pos)
 206                 return NULL;
 207         return SEQ_START_TOKEN;
 208 }
 209
 210 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 211 {
 212         ++*pos;
 213         return NULL;
 214 }
 215
 216 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 217 {
 218 }
 219
 220 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 221 {
 222         if (v == SEQ_START_TOKEN)
 223                 seq_printf(seq, "%-127s\n",
 224                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 225                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 226                            "HHUptod\tSpecDst");
 227         return 0;
 228 }
 229
 230 static const struct seq_operations rt_cache_seq_ops = {
 231         .start  = rt_cache_seq_start,
 232         .next   = rt_cache_seq_next,
 233         .stop   = rt_cache_seq_stop,
 234         .show   = rt_cache_seq_show,
 235 };
 236
 237 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 238 {
 239         return seq_open(file, &rt_cache_seq_ops);
 240 }
 241
 242 static const struct file_operations rt_cache_seq_fops = {
 243         .open    = rt_cache_seq_open,
 244         .read    = seq_read,
 245         .llseek  = seq_lseek,
 246         .release = seq_release,
 247 };
 248
 249
 250 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 251 {
 252         int cpu;
 253
 254         if (*pos == 0)
 255                 return SEQ_START_TOKEN;
 256
 257         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 258                 if (!cpu_possible(cpu))
 259                         continue;
 260                 *pos = cpu+1;
 261                 return &per_cpu(rt_cache_stat, cpu);
 262         }
 263         return NULL;
 264 }
 265
 266 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 267 {
 268         int cpu;
 269
 270         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 271                 if (!cpu_possible(cpu))
 272                         continue;
 273                 *pos = cpu+1;
 274                 return &per_cpu(rt_cache_stat, cpu);
 275         }
 276         return NULL;
 277
 278 }
 279
 280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 281 {
 282
 283 }
 284
 285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 286 {
 287         struct rt_cache_stat *st = v;
 288
 289         if (v == SEQ_START_TOKEN) {
 290                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 291                 return 0;
 292         }
 293
 294         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 295                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 296                    dst_entries_get_slow(&ipv4_dst_ops),
 297                    0, /* st->in_hit */
 298                    st->in_slow_tot,
 299                    st->in_slow_mc,
 300                    st->in_no_route,
 301                    st->in_brd,
 302                    st->in_martian_dst,
 303                    st->in_martian_src,
 304
 305                    0, /* st->out_hit */
 306                    st->out_slow_tot,
 307                    st->out_slow_mc,
 308
 309                    0, /* st->gc_total */
 310                    0, /* st->gc_ignored */
 311                    0, /* st->gc_goal_miss */
 312                    0, /* st->gc_dst_overflow */
 313                    0, /* st->in_hlist_search */
 314                    0  /* st->out_hlist_search */
 315                 );
 316         return 0;
 317 }
 318
 319 static const struct seq_operations rt_cpu_seq_ops = {
 320         .start  = rt_cpu_seq_start,
 321         .next   = rt_cpu_seq_next,
 322         .stop   = rt_cpu_seq_stop,
 323         .show   = rt_cpu_seq_show,
 324 };
 325
 326
 327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 328 {
 329         return seq_open(file, &rt_cpu_seq_ops);
 330 }
 331
 332 static const struct file_operations rt_cpu_seq_fops = {
 333         .open    = rt_cpu_seq_open,
 334         .read    = seq_read,
 335         .llseek  = seq_lseek,
 336         .release = seq_release,
 337 };
 338
 339 #ifdef CONFIG_IP_ROUTE_CLASSID
 340 static int rt_acct_proc_show(struct seq_file *m, void *v)
 341 {
 342         struct ip_rt_acct *dst, *src;
 343         unsigned int i, j;
 344
 345         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 346         if (!dst)
 347                 return -ENOMEM;
 348
 349         for_each_possible_cpu(i) {
 350                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 351                 for (j = 0; j < 256; j++) {
 352                         dst[j].o_bytes   += src[j].o_bytes;
 353                         dst[j].o_packets += src[j].o_packets;
 354                         dst[j].i_bytes   += src[j].i_bytes;
 355                         dst[j].i_packets += src[j].i_packets;
 356                 }
 357         }
 358
 359         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 360         kfree(dst);
 361         return 0;
 362 }
 363
 364 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 365 {
 366         return single_open(file, rt_acct_proc_show, NULL);
 367 }
 368
 369 static const struct file_operations rt_acct_proc_fops = {
 370         .open           = rt_acct_proc_open,
 371         .read           = seq_read,
 372         .llseek         = seq_lseek,
 373         .release        = single_release,
 374 };
 375 #endif
 376
 377 static int __net_init ip_rt_do_proc_init(struct net *net)
 378 {
 379         struct proc_dir_entry *pde;
 380
 381         pde = proc_create("rt_cache", 0444, net->proc_net,
 382                           &rt_cache_seq_fops);
 383         if (!pde)
 384                 goto err1;
 385
 386         pde = proc_create("rt_cache", 0444,
 387                           net->proc_net_stat, &rt_cpu_seq_fops);
 388         if (!pde)
 389                 goto err2;
 390
 391 #ifdef CONFIG_IP_ROUTE_CLASSID
 392         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 393         if (!pde)
 394                 goto err3;
 395 #endif
 396         return 0;
 397
 398 #ifdef CONFIG_IP_ROUTE_CLASSID
 399 err3:
 400         remove_proc_entry("rt_cache", net->proc_net_stat);
 401 #endif
 402 err2:
 403         remove_proc_entry("rt_cache", net->proc_net);
 404 err1:
 405         return -ENOMEM;
 406 }
 407
 408 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 409 {
 410         remove_proc_entry("rt_cache", net->proc_net_stat);
 411         remove_proc_entry("rt_cache", net->proc_net);
 412 #ifdef CONFIG_IP_ROUTE_CLASSID
 413         remove_proc_entry("rt_acct", net->proc_net);
 414 #endif
 415 }
 416
 417 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 418         .init = ip_rt_do_proc_init,
 419         .exit = ip_rt_do_proc_exit,
 420 };
 421
 422 static int __init ip_rt_proc_init(void)
 423 {
 424         return register_pernet_subsys(&ip_rt_proc_ops);
 425 }
 426
 427 #else
 428 static inline int ip_rt_proc_init(void)
 429 {
 430         return 0;
 431 }
 432 #endif /* CONFIG_PROC_FS */
 433
 434 static inline bool rt_is_expired(const struct rtable *rth)
 435 {
 436         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 437 }
 438
 439 void rt_cache_flush(struct net *net)
 440 {
 441         rt_genid_bump_ipv4(net);
 442 }
 443
 444 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 445                                            struct sk_buff *skb,
 446                                            const void *daddr)
 447 {
 448         struct net_device *dev = dst->dev;
 449         const __be32 *pkey = daddr;
 450         const struct rtable *rt;
 451         struct neighbour *n;
 452
 453         rt = (const struct rtable *) dst;
 454         if (rt->rt_gateway)
 455                 pkey = (const __be32 *) &rt->rt_gateway;
 456         else if (skb)
 457                 pkey = &ip_hdr(skb)->daddr;
 458
 459         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 460         if (n)
 461                 return n;
 462         return neigh_create(&arp_tbl, pkey, dev);
 463 }
 464
 465 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 466 {
 467         struct net_device *dev = dst->dev;
 468         const __be32 *pkey = daddr;
 469         const struct rtable *rt;
 470
 471         rt = (const struct rtable *)dst;
 472         if (rt->rt_gateway)
 473                 pkey = (const __be32 *)&rt->rt_gateway;
 474         else if (!daddr ||
 475                  (rt->rt_flags &
 476                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
 477                 return;
 478
 479         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 480 }
 481
 482 #define IP_IDENTS_SZ 2048u
 483
 484 static atomic_t *ip_idents __read_mostly;
 485 static u32 *ip_tstamps __read_mostly;
 486
 487 /* In order to protect privacy, we add a perturbation to identifiers
 488  * if one generator is seldom used. This makes hard for an attacker
 489  * to infer how many packets were sent between two points in time.
 490  */
 491 u32 ip_idents_reserve(u32 hash, int segs)
 492 {
 493         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 494         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 495         u32 old = READ_ONCE(*p_tstamp);
 496         u32 now = (u32)jiffies;
 497         u32 new, delta = 0;
 498
 499         if (old != now && cmpxchg(p_tstamp, old, now) == old)
 500                 delta = prandom_u32_max(now - old);
 501
 502         /* Do not use atomic_add_return() as it makes UBSAN unhappy */
 503         do {
 504                 old = (u32)atomic_read(p_id);
 505                 new = old + delta + segs;
 506         } while (atomic_cmpxchg(p_id, old, new) != old);
 507
 508         return new - segs;
 509 }
 510 EXPORT_SYMBOL(ip_idents_reserve);
 511
 512 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 513 {
 514         static u32 ip_idents_hashrnd __read_mostly;
 515         u32 hash, id;
 516
 517         net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
 518
 519         hash = jhash_3words((__force u32)iph->daddr,
 520                             (__force u32)iph->saddr,
 521                             iph->protocol ^ net_hash_mix(net),
 522                             ip_idents_hashrnd);
 523         id = ip_idents_reserve(hash, segs);
 524         iph->id = htons(id);
 525 }
 526 EXPORT_SYMBOL(__ip_select_ident);
 527
 528 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 529                              const struct sock *sk,
 530                              const struct iphdr *iph,
 531                              int oif, u8 tos,
 532                              u8 prot, u32 mark, int flow_flags)
 533 {
 534         if (sk) {
 535                 const struct inet_sock *inet = inet_sk(sk);
 536
 537                 oif = sk->sk_bound_dev_if;
 538                 mark = sk->sk_mark;
 539                 tos = RT_CONN_FLAGS(sk);
 540                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 541         }
 542         flowi4_init_output(fl4, oif, mark, tos,
 543                            RT_SCOPE_UNIVERSE, prot,
 544                            flow_flags,
 545                            iph->daddr, iph->saddr, 0, 0,
 546                            sock_net_uid(net, sk));
 547 }
 548
 549 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 550                                const struct sock *sk)
 551 {
 552         const struct net *net = dev_net(skb->dev);
 553         const struct iphdr *iph = ip_hdr(skb);
 554         int oif = skb->dev->ifindex;
 555         u8 tos = RT_TOS(iph->tos);
 556         u8 prot = iph->protocol;
 557         u32 mark = skb->mark;
 558
 559         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 560 }
 561
 562 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 563 {
 564         const struct inet_sock *inet = inet_sk(sk);
 565         const struct ip_options_rcu *inet_opt;
 566         __be32 daddr = inet->inet_daddr;
 567
 568         rcu_read_lock();
 569         inet_opt = rcu_dereference(inet->inet_opt);
 570         if (inet_opt && inet_opt->opt.srr)
 571                 daddr = inet_opt->opt.faddr;
 572         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 573                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 574                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 575                            inet_sk_flowi_flags(sk),
 576                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 577         rcu_read_unlock();
 578 }
 579
 580 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 581                                  const struct sk_buff *skb)
 582 {
 583         if (skb)
 584                 build_skb_flow_key(fl4, skb, sk);
 585         else
 586                 build_sk_flow_key(fl4, sk);
 587 }
 588
 589 static DEFINE_SPINLOCK(fnhe_lock);
 590
 591 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 592 {
 593         struct rtable *rt;
 594
 595         rt = rcu_dereference(fnhe->fnhe_rth_input);
 596         if (rt) {
 597                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 598                 dst_dev_put(&rt->dst);
 599                 dst_release(&rt->dst);
 600         }
 601         rt = rcu_dereference(fnhe->fnhe_rth_output);
 602         if (rt) {
 603                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 604                 dst_dev_put(&rt->dst);
 605                 dst_release(&rt->dst);
 606         }
 607 }
 608
 609 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 610 {
 611         struct fib_nh_exception *fnhe, *oldest;
 612
 613         oldest = rcu_dereference(hash->chain);
 614         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 615              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 616                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 617                         oldest = fnhe;
 618         }
 619         fnhe_flush_routes(oldest);
 620         return oldest;
 621 }
 622
 623 static inline u32 fnhe_hashfun(__be32 daddr)
 624 {
 625         static u32 fnhe_hashrnd __read_mostly;
 626         u32 hval;
 627
 628         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 629         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 630         return hash_32(hval, FNHE_HASH_SHIFT);
 631 }
 632
 633 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 634 {
 635         rt->rt_pmtu = fnhe->fnhe_pmtu;
 636         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 637         rt->dst.expires = fnhe->fnhe_expires;
 638
 639         if (fnhe->fnhe_gw) {
 640                 rt->rt_flags |= RTCF_REDIRECTED;
 641                 rt->rt_gateway = fnhe->fnhe_gw;
 642                 rt->rt_uses_gateway = 1;
 643         }
 644 }
 645
 646 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 647                                   u32 pmtu, bool lock, unsigned long expires)
 648 {
 649         struct fnhe_hash_bucket *hash;
 650         struct fib_nh_exception *fnhe;
 651         struct rtable *rt;
 652         u32 genid, hval;
 653         unsigned int i;
 654         int depth;
 655
 656         genid = fnhe_genid(dev_net(nh->nh_dev));
 657         hval = fnhe_hashfun(daddr);
 658
 659         spin_lock_bh(&fnhe_lock);
 660
 661         hash = rcu_dereference(nh->nh_exceptions);
 662         if (!hash) {
 663                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 664                 if (!hash)
 665                         goto out_unlock;
 666                 rcu_assign_pointer(nh->nh_exceptions, hash);
 667         }
 668
 669         hash += hval;
 670
 671         depth = 0;
 672         for (fnhe = rcu_dereference(hash->chain); fnhe;
 673              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 674                 if (fnhe->fnhe_daddr == daddr)
 675                         break;
 676                 depth++;
 677         }
 678
 679         if (fnhe) {
 680                 if (fnhe->fnhe_genid != genid)
 681                         fnhe->fnhe_genid = genid;
 682                 if (gw)
 683                         fnhe->fnhe_gw = gw;
 684                 if (pmtu) {
 685                         fnhe->fnhe_pmtu = pmtu;
 686                         fnhe->fnhe_mtu_locked = lock;
 687                 }
 688                 fnhe->fnhe_expires = max(1UL, expires);
 689                 /* Update all cached dsts too */
 690                 rt = rcu_dereference(fnhe->fnhe_rth_input);
 691                 if (rt)
 692                         fill_route_from_fnhe(rt, fnhe);
 693                 rt = rcu_dereference(fnhe->fnhe_rth_output);
 694                 if (rt)
 695                         fill_route_from_fnhe(rt, fnhe);
 696         } else {
 697                 if (depth > FNHE_RECLAIM_DEPTH)
 698                         fnhe = fnhe_oldest(hash);
 699                 else {
 700                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 701                         if (!fnhe)
 702                                 goto out_unlock;
 703
 704                         fnhe->fnhe_next = hash->chain;
 705                         rcu_assign_pointer(hash->chain, fnhe);
 706                 }
 707                 fnhe->fnhe_genid = genid;
 708                 fnhe->fnhe_daddr = daddr;
 709                 fnhe->fnhe_gw = gw;
 710                 fnhe->fnhe_pmtu = pmtu;
 711                 fnhe->fnhe_mtu_locked = lock;
 712                 fnhe->fnhe_expires = expires;
 713
 714                 /* Exception created; mark the cached routes for the nexthop
 715                  * stale, so anyone caching it rechecks if this exception
 716                  * applies to them.
 717                  */
 718                 rt = rcu_dereference(nh->nh_rth_input);
 719                 if (rt)
 720                         rt->dst.obsolete = DST_OBSOLETE_KILL;
 721
 722                 for_each_possible_cpu(i) {
 723                         struct rtable __rcu **prt;
 724                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 725                         rt = rcu_dereference(*prt);
 726                         if (rt)
 727                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 728                 }
 729         }
 730
 731         fnhe->fnhe_stamp = jiffies;
 732
 733 out_unlock:
 734         spin_unlock_bh(&fnhe_lock);
 735 }
 736
 737 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 738                              bool kill_route)
 739 {
 740         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 741         __be32 old_gw = ip_hdr(skb)->saddr;
 742         struct net_device *dev = skb->dev;
 743         struct in_device *in_dev;
 744         struct fib_result res;
 745         struct neighbour *n;
 746         struct net *net;
 747
 748         switch (icmp_hdr(skb)->code & 7) {
 749         case ICMP_REDIR_NET:
 750         case ICMP_REDIR_NETTOS:
 751         case ICMP_REDIR_HOST:
 752         case ICMP_REDIR_HOSTTOS:
 753                 break;
 754
 755         default:
 756                 return;
 757         }
 758
 759         if (rt->rt_gateway != old_gw)
 760                 return;
 761
 762         in_dev = __in_dev_get_rcu(dev);
 763         if (!in_dev)
 764                 return;
 765
 766         net = dev_net(dev);
 767         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 768             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 769             ipv4_is_zeronet(new_gw))
 770                 goto reject_redirect;
 771
 772         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 773                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 774                         goto reject_redirect;
 775                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 776                         goto reject_redirect;
 777         } else {
 778                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 779                         goto reject_redirect;
 780         }
 781
 782         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 783         if (!n)
 784                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 785         if (!IS_ERR(n)) {
 786                 if (!(n->nud_state & NUD_VALID)) {
 787                         neigh_event_send(n, NULL);
 788                 } else {
 789                         if (fib_lookup(net, fl4, &res, 0) == 0) {
 790                                 struct fib_nh *nh = &FIB_RES_NH(res);
 791
 792                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 793                                                 0, false,
 794                                                 jiffies + ip_rt_gc_timeout);
 795                         }
 796                         if (kill_route)
 797                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 798                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 799                 }
 800                 neigh_release(n);
 801         }
 802         return;
 803
 804 reject_redirect:
 805 #ifdef CONFIG_IP_ROUTE_VERBOSE
 806         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 807                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 808                 __be32 daddr = iph->daddr;
 809                 __be32 saddr = iph->saddr;
 810
 811                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 812                                      "  Advised path = %pI4 -> %pI4\n",
 813                                      &old_gw, dev->name, &new_gw,
 814                                      &saddr, &daddr);
 815         }
 816 #endif
 817         ;
 818 }
 819
 820 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 821 {
 822         struct rtable *rt;
 823         struct flowi4 fl4;
 824         const struct iphdr *iph = (const struct iphdr *) skb->data;
 825         struct net *net = dev_net(skb->dev);
 826         int oif = skb->dev->ifindex;
 827         u8 tos = RT_TOS(iph->tos);
 828         u8 prot = iph->protocol;
 829         u32 mark = skb->mark;
 830
 831         rt = (struct rtable *) dst;
 832
 833         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 834         __ip_do_redirect(rt, skb, &fl4, true);
 835 }
 836
 837 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 838 {
 839         struct rtable *rt = (struct rtable *)dst;
 840         struct dst_entry *ret = dst;
 841
 842         if (rt) {
 843                 if (dst->obsolete > 0) {
 844                         ip_rt_put(rt);
 845                         ret = NULL;
 846                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 847                            rt->dst.expires) {
 848                         ip_rt_put(rt);
 849                         ret = NULL;
 850                 }
 851         }
 852         return ret;
 853 }
 854
 855 /*
 856  * Algorithm:
 857  *      1. The first ip_rt_redirect_number redirects are sent
 858  *         with exponential backoff, then we stop sending them at all,
 859  *         assuming that the host ignores our redirects.
 860  *      2. If we did not see packets requiring redirects
 861  *         during ip_rt_redirect_silence, we assume that the host
 862  *         forgot redirected route and start to send redirects again.
 863  *
 864  * This algorithm is much cheaper and more intelligent than dumb load limiting
 865  * in icmp.c.
 866  *
 867  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 868  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 869  */
 870
 871 void ip_rt_send_redirect(struct sk_buff *skb)
 872 {
 873         struct rtable *rt = skb_rtable(skb);
 874         struct in_device *in_dev;
 875         struct inet_peer *peer;
 876         struct net *net;
 877         int log_martians;
 878         int vif;
 879
 880         rcu_read_lock();
 881         in_dev = __in_dev_get_rcu(rt->dst.dev);
 882         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 883                 rcu_read_unlock();
 884                 return;
 885         }
 886         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 887         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 888         rcu_read_unlock();
 889
 890         net = dev_net(rt->dst.dev);
 891         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 892         if (!peer) {
 893                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 894                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 895                 return;
 896         }
 897
 898         /* No redirected packets during ip_rt_redirect_silence;
 899          * reset the algorithm.
 900          */
 901         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 902                 peer->rate_tokens = 0;
 903
 904         /* Too many ignored redirects; do not send anything
 905          * set dst.rate_last to the last seen redirected packet.
 906          */
 907         if (peer->rate_tokens >= ip_rt_redirect_number) {
 908                 peer->rate_last = jiffies;
 909                 goto out_put_peer;
 910         }
 911
 912         /* Check for load limit; set rate_last to the latest sent
 913          * redirect.
 914          */
 915         if (peer->rate_tokens == 0 ||
 916             time_after(jiffies,
 917                        (peer->rate_last +
 918                         (ip_rt_redirect_load << peer->rate_tokens)))) {
 919                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 920
 921                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 922                 peer->rate_last = jiffies;
 923                 ++peer->rate_tokens;
 924 #ifdef CONFIG_IP_ROUTE_VERBOSE
 925                 if (log_martians &&
 926                     peer->rate_tokens == ip_rt_redirect_number)
 927                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 928                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 929                                              &ip_hdr(skb)->daddr, &gw);
 930 #endif
 931         }
 932 out_put_peer:
 933         inet_putpeer(peer);
 934 }
 935
 936 static int ip_error(struct sk_buff *skb)
 937 {
 938         struct rtable *rt = skb_rtable(skb);
 939         struct net_device *dev = skb->dev;
 940         struct in_device *in_dev;
 941         struct inet_peer *peer;
 942         unsigned long now;
 943         struct net *net;
 944         bool send;
 945         int code;
 946
 947         if (netif_is_l3_master(skb->dev)) {
 948                 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
 949                 if (!dev)
 950                         goto out;
 951         }
 952
 953         in_dev = __in_dev_get_rcu(dev);
 954
 955         /* IP on this device is disabled. */
 956         if (!in_dev)
 957                 goto out;
 958
 959         net = dev_net(rt->dst.dev);
 960         if (!IN_DEV_FORWARD(in_dev)) {
 961                 switch (rt->dst.error) {
 962                 case EHOSTUNREACH:
 963                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 964                         break;
 965
 966                 case ENETUNREACH:
 967                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 968                         break;
 969                 }
 970                 goto out;
 971         }
 972
 973         switch (rt->dst.error) {
 974         case EINVAL:
 975         default:
 976                 goto out;
 977         case EHOSTUNREACH:
 978                 code = ICMP_HOST_UNREACH;
 979                 break;
 980         case ENETUNREACH:
 981                 code = ICMP_NET_UNREACH;
 982                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 983                 break;
 984         case EACCES:
 985                 code = ICMP_PKT_FILTERED;
 986                 break;
 987         }
 988
 989         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 990                                l3mdev_master_ifindex(skb->dev), 1);
 991
 992         send = true;
 993         if (peer) {
 994                 now = jiffies;
 995                 peer->rate_tokens += now - peer->rate_last;
 996                 if (peer->rate_tokens > ip_rt_error_burst)
 997                         peer->rate_tokens = ip_rt_error_burst;
 998                 peer->rate_last = now;
 999                 if (peer->rate_tokens >= ip_rt_error_cost)
1000                         peer->rate_tokens -= ip_rt_error_cost;
1001                 else
1002                         send = false;
1003                 inet_putpeer(peer);
1004         }
1005         if (send)
1006                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1007
1008 out:    kfree_skb(skb);
1009         return 0;
1010 }
1011
1012 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1013 {
1014         struct dst_entry *dst = &rt->dst;
1015         struct fib_result res;
1016         bool lock = false;
1017
1018         if (ip_mtu_locked(dst))
1019                 return;
1020
1021         if (ipv4_mtu(dst) < mtu)
1022                 return;
1023
1024         if (mtu < ip_rt_min_pmtu) {
1025                 lock = true;
1026                 mtu = ip_rt_min_pmtu;
1027         }
1028
1029         if (rt->rt_pmtu == mtu &&
1030             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1031                 return;
1032
1033         rcu_read_lock();
1034         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1035                 struct fib_nh *nh = &FIB_RES_NH(res);
1036
1037                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1038                                       jiffies + ip_rt_mtu_expires);
1039         }
1040         rcu_read_unlock();
1041 }
1042
1043 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1044                               struct sk_buff *skb, u32 mtu)
1045 {
1046         struct rtable *rt = (struct rtable *) dst;
1047         struct flowi4 fl4;
1048
1049         ip_rt_build_flow_key(&fl4, sk, skb);
1050         __ip_rt_update_pmtu(rt, &fl4, mtu);
1051 }
1052
1053 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1054                       int oif, u32 mark, u8 protocol, int flow_flags)
1055 {
1056         const struct iphdr *iph = (const struct iphdr *) skb->data;
1057         struct flowi4 fl4;
1058         struct rtable *rt;
1059
1060         if (!mark)
1061                 mark = IP4_REPLY_MARK(net, skb->mark);
1062
1063         __build_flow_key(net, &fl4, NULL, iph, oif,
1064                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1065         rt = __ip_route_output_key(net, &fl4);
1066         if (!IS_ERR(rt)) {
1067                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1068                 ip_rt_put(rt);
1069         }
1070 }
1071 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1072
1073 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1074 {
1075         const struct iphdr *iph = (const struct iphdr *) skb->data;
1076         struct flowi4 fl4;
1077         struct rtable *rt;
1078
1079         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1080
1081         if (!fl4.flowi4_mark)
1082                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1083
1084         rt = __ip_route_output_key(sock_net(sk), &fl4);
1085         if (!IS_ERR(rt)) {
1086                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1087                 ip_rt_put(rt);
1088         }
1089 }
1090
1091 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1092 {
1093         const struct iphdr *iph = (const struct iphdr *) skb->data;
1094         struct flowi4 fl4;
1095         struct rtable *rt;
1096         struct dst_entry *odst = NULL;
1097         bool new = false;
1098         struct net *net = sock_net(sk);
1099
1100         bh_lock_sock(sk);
1101
1102         if (!ip_sk_accept_pmtu(sk))
1103                 goto out;
1104
1105         odst = sk_dst_get(sk);
1106
1107         if (sock_owned_by_user(sk) || !odst) {
1108                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1109                 goto out;
1110         }
1111
1112         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1113
1114         rt = (struct rtable *)odst;
1115         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1116                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1117                 if (IS_ERR(rt))
1118                         goto out;
1119
1120                 new = true;
1121         }
1122
1123         __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1124
1125         if (!dst_check(&rt->dst, 0)) {
1126                 if (new)
1127                         dst_release(&rt->dst);
1128
1129                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1130                 if (IS_ERR(rt))
1131                         goto out;
1132
1133                 new = true;
1134         }
1135
1136         if (new)
1137                 sk_dst_set(sk, &rt->dst);
1138
1139 out:
1140         bh_unlock_sock(sk);
1141         dst_release(odst);
1142 }
1143 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1144
1145 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1146                    int oif, u32 mark, u8 protocol, int flow_flags)
1147 {
1148         const struct iphdr *iph = (const struct iphdr *) skb->data;
1149         struct flowi4 fl4;
1150         struct rtable *rt;
1151
1152         __build_flow_key(net, &fl4, NULL, iph, oif,
1153                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1154         rt = __ip_route_output_key(net, &fl4);
1155         if (!IS_ERR(rt)) {
1156                 __ip_do_redirect(rt, skb, &fl4, false);
1157                 ip_rt_put(rt);
1158         }
1159 }
1160 EXPORT_SYMBOL_GPL(ipv4_redirect);
1161
1162 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1163 {
1164         const struct iphdr *iph = (const struct iphdr *) skb->data;
1165         struct flowi4 fl4;
1166         struct rtable *rt;
1167         struct net *net = sock_net(sk);
1168
1169         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1170         rt = __ip_route_output_key(net, &fl4);
1171         if (!IS_ERR(rt)) {
1172                 __ip_do_redirect(rt, skb, &fl4, false);
1173                 ip_rt_put(rt);
1174         }
1175 }
1176 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1177
1178 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1179 {
1180         struct rtable *rt = (struct rtable *) dst;
1181
1182         /* All IPV4 dsts are created with ->obsolete set to the value
1183          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1184          * into this function always.
1185          *
1186          * When a PMTU/redirect information update invalidates a route,
1187          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1188          * DST_OBSOLETE_DEAD by dst_free().
1189          */
1190         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1191                 return NULL;
1192         return dst;
1193 }
1194
1195 static void ipv4_link_failure(struct sk_buff *skb)
1196 {
1197         struct rtable *rt;
1198
1199         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1200
1201         rt = skb_rtable(skb);
1202         if (rt)
1203                 dst_set_expires(&rt->dst, 0);
1204 }
1205
1206 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1207 {
1208         pr_debug("%s: %pI4 -> %pI4, %s\n",
1209                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1210                  skb->dev ? skb->dev->name : "?");
1211         kfree_skb(skb);
1212         WARN_ON(1);
1213         return 0;
1214 }
1215
1216 /*
1217    We do not cache source address of outgoing interface,
1218    because it is used only by IP RR, TS and SRR options,
1219    so that it out of fast path.
1220
1221    BTW remember: "addr" is allowed to be not aligned
1222    in IP options!
1223  */
1224
1225 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1226 {
1227         __be32 src;
1228
1229         if (rt_is_output_route(rt))
1230                 src = ip_hdr(skb)->saddr;
1231         else {
1232                 struct fib_result res;
1233                 struct flowi4 fl4;
1234                 struct iphdr *iph;
1235
1236                 iph = ip_hdr(skb);
1237
1238                 memset(&fl4, 0, sizeof(fl4));
1239                 fl4.daddr = iph->daddr;
1240                 fl4.saddr = iph->saddr;
1241                 fl4.flowi4_tos = RT_TOS(iph->tos);
1242                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1243                 fl4.flowi4_iif = skb->dev->ifindex;
1244                 fl4.flowi4_mark = skb->mark;
1245
1246                 rcu_read_lock();
1247                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1248                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1249                 else
1250                         src = inet_select_addr(rt->dst.dev,
1251                                                rt_nexthop(rt, iph->daddr),
1252                                                RT_SCOPE_UNIVERSE);
1253                 rcu_read_unlock();
1254         }
1255         memcpy(addr, &src, 4);
1256 }
1257
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid.  The low and the high 16 bits are
 * handled independently, and each half is only filled in from @tag when it
 * is currently zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->dst.tclassid & low_half) == 0)
		rt->dst.tclassid |= tag & low_half;

	if ((rt->dst.tclassid & high_half) == 0)
		rt->dst.tclassid |= tag & high_half;
}
#endif
1267
1268 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1269 {
1270         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1271         unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1272                                     ip_rt_min_advmss);
1273
1274         return min(advmss, IPV4_MAX_PMTU - header_size);
1275 }
1276
1277 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1278 {
1279         const struct rtable *rt = (const struct rtable *) dst;
1280         unsigned int mtu = rt->rt_pmtu;
1281
1282         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1283                 mtu = dst_metric_raw(dst, RTAX_MTU);
1284
1285         if (mtu)
1286                 return mtu;
1287
1288         mtu = READ_ONCE(dst->dev->mtu);
1289
1290         if (unlikely(ip_mtu_locked(dst))) {
1291                 if (rt->rt_uses_gateway && mtu > 576)
1292                         mtu = 576;
1293         }
1294
1295         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1296
1297         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1298 }
1299
1300 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1301 {
1302         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1303         struct fib_nh_exception *fnhe;
1304         u32 hval;
1305
1306         if (!hash)
1307                 return NULL;
1308
1309         hval = fnhe_hashfun(daddr);
1310
1311         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1312              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1313                 if (fnhe->fnhe_daddr == daddr)
1314                         return fnhe;
1315         }
1316         return NULL;
1317 }
1318
1319 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1320                               __be32 daddr, const bool do_cache)
1321 {
1322         bool ret = false;
1323
1324         spin_lock_bh(&fnhe_lock);
1325
1326         if (daddr == fnhe->fnhe_daddr) {
1327                 struct rtable __rcu **porig;
1328                 struct rtable *orig;
1329                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1330
1331                 if (rt_is_input_route(rt))
1332                         porig = &fnhe->fnhe_rth_input;
1333                 else
1334                         porig = &fnhe->fnhe_rth_output;
1335                 orig = rcu_dereference(*porig);
1336
1337                 if (fnhe->fnhe_genid != genid) {
1338                         fnhe->fnhe_genid = genid;
1339                         fnhe->fnhe_gw = 0;
1340                         fnhe->fnhe_pmtu = 0;
1341                         fnhe->fnhe_expires = 0;
1342                         fnhe_flush_routes(fnhe);
1343                         orig = NULL;
1344                 }
1345                 fill_route_from_fnhe(rt, fnhe);
1346                 if (!rt->rt_gateway)
1347                         rt->rt_gateway = daddr;
1348
1349                 if (do_cache) {
1350                         dst_hold(&rt->dst);
1351                         rcu_assign_pointer(*porig, rt);
1352                         if (orig) {
1353                                 dst_dev_put(&orig->dst);
1354                                 dst_release(&orig->dst);
1355                         }
1356                         ret = true;
1357                 }
1358
1359                 fnhe->fnhe_stamp = jiffies;
1360         }
1361         spin_unlock_bh(&fnhe_lock);
1362
1363         return ret;
1364 }
1365
1366 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1367 {
1368         struct rtable *orig, *prev, **p;
1369         bool ret = true;
1370
1371         if (rt_is_input_route(rt)) {
1372                 p = (struct rtable **)&nh->nh_rth_input;
1373         } else {
1374                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1375         }
1376         orig = *p;
1377
1378         /* hold dst before doing cmpxchg() to avoid race condition
1379          * on this dst
1380          */
1381         dst_hold(&rt->dst);
1382         prev = cmpxchg(p, orig, rt);
1383         if (prev == orig) {
1384                 if (orig) {
1385                         dst_dev_put(&orig->dst);
1386                         dst_release(&orig->dst);
1387                 }
1388         } else {
1389                 dst_release(&rt->dst);
1390                 ret = false;
1391         }
1392
1393         return ret;
1394 }
1395
1396 struct uncached_list {
1397         spinlock_t              lock;
1398         struct list_head        head;
1399 };
1400
1401 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1402
1403 void rt_add_uncached_list(struct rtable *rt)
1404 {
1405         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1406
1407         rt->rt_uncached_list = ul;
1408
1409         spin_lock_bh(&ul->lock);
1410         list_add_tail(&rt->rt_uncached, &ul->head);
1411         spin_unlock_bh(&ul->lock);
1412 }
1413
1414 void rt_del_uncached_list(struct rtable *rt)
1415 {
1416         if (!list_empty(&rt->rt_uncached)) {
1417                 struct uncached_list *ul = rt->rt_uncached_list;
1418
1419                 spin_lock_bh(&ul->lock);
1420                 list_del(&rt->rt_uncached);
1421                 spin_unlock_bh(&ul->lock);
1422         }
1423 }
1424
1425 static void ipv4_dst_destroy(struct dst_entry *dst)
1426 {
1427         struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1428         struct rtable *rt = (struct rtable *)dst;
1429
1430         if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1431                 kfree(p);
1432
1433         rt_del_uncached_list(rt);
1434 }
1435
1436 void rt_flush_dev(struct net_device *dev)
1437 {
1438         struct net *net = dev_net(dev);
1439         struct rtable *rt;
1440         int cpu;
1441
1442         for_each_possible_cpu(cpu) {
1443                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1444
1445                 spin_lock_bh(&ul->lock);
1446                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1447                         if (rt->dst.dev != dev)
1448                                 continue;
1449                         rt->dst.dev = net->loopback_dev;
1450                         dev_hold(rt->dst.dev);
1451                         dev_put(dev);
1452                 }
1453                 spin_unlock_bh(&ul->lock);
1454         }
1455 }
1456
1457 static bool rt_cache_valid(const struct rtable *rt)
1458 {
1459         return  rt &&
1460                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1461                 !rt_is_expired(rt);
1462 }
1463
1464 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1465                            const struct fib_result *res,
1466                            struct fib_nh_exception *fnhe,
1467                            struct fib_info *fi, u16 type, u32 itag,
1468                            const bool do_cache)
1469 {
1470         bool cached = false;
1471
1472         if (fi) {
1473                 struct fib_nh *nh = &FIB_RES_NH(*res);
1474
1475                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1476                         rt->rt_gateway = nh->nh_gw;
1477                         rt->rt_uses_gateway = 1;
1478                 }
1479                 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1480                 if (fi->fib_metrics != &dst_default_metrics) {
1481                         rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1482                         refcount_inc(&fi->fib_metrics->refcnt);
1483                 }
1484 #ifdef CONFIG_IP_ROUTE_CLASSID
1485                 rt->dst.tclassid = nh->nh_tclassid;
1486 #endif
1487                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1488                 if (unlikely(fnhe))
1489                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1490                 else if (do_cache)
1491                         cached = rt_cache_route(nh, rt);
1492                 if (unlikely(!cached)) {
1493                         /* Routes we intend to cache in nexthop exception or
1494                          * FIB nexthop have the DST_NOCACHE bit clear.
1495                          * However, if we are unsuccessful at storing this
1496                          * route into the cache we really need to set it.
1497                          */
1498                         if (!rt->rt_gateway)
1499                                 rt->rt_gateway = daddr;
1500                         rt_add_uncached_list(rt);
1501                 }
1502         } else
1503                 rt_add_uncached_list(rt);
1504
1505 #ifdef CONFIG_IP_ROUTE_CLASSID
1506 #ifdef CONFIG_IP_MULTIPLE_TABLES
1507         set_class_tag(rt, res->tclassid);
1508 #endif
1509         set_class_tag(rt, itag);
1510 #endif
1511 }
1512
1513 struct rtable *rt_dst_alloc(struct net_device *dev,
1514                             unsigned int flags, u16 type,
1515                             bool nopolicy, bool noxfrm, bool will_cache)
1516 {
1517         struct rtable *rt;
1518
1519         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1520                        (will_cache ? 0 : DST_HOST) |
1521                        (nopolicy ? DST_NOPOLICY : 0) |
1522                        (noxfrm ? DST_NOXFRM : 0));
1523
1524         if (rt) {
1525                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1526                 rt->rt_flags = flags;
1527                 rt->rt_type = type;
1528                 rt->rt_is_input = 0;
1529                 rt->rt_iif = 0;
1530                 rt->rt_pmtu = 0;
1531                 rt->rt_mtu_locked = 0;
1532                 rt->rt_gateway = 0;
1533                 rt->rt_uses_gateway = 0;
1534                 INIT_LIST_HEAD(&rt->rt_uncached);
1535
1536                 rt->dst.output = ip_output;
1537                 if (flags & RTCF_LOCAL)
1538                         rt->dst.input = ip_local_deliver;
1539         }
1540
1541         return rt;
1542 }
1543 EXPORT_SYMBOL(rt_dst_alloc);
1544
1545 /* called in rcu_read_lock() section */
1546 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1547                           u8 tos, struct net_device *dev,
1548                           struct in_device *in_dev, u32 *itag)
1549 {
1550         int err;
1551
1552         /* Primary sanity checks. */
1553         if (!in_dev)
1554                 return -EINVAL;
1555
1556         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1557             skb->protocol != htons(ETH_P_IP))
1558                 return -EINVAL;
1559
1560         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1561                 return -EINVAL;
1562
1563         if (ipv4_is_zeronet(saddr)) {
1564                 if (!ipv4_is_local_multicast(daddr))
1565                         return -EINVAL;
1566         } else {
1567                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1568                                           in_dev, itag);
1569                 if (err < 0)
1570                         return err;
1571         }
1572         return 0;
1573 }
1574
1575 /* called in rcu_read_lock() section */
1576 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1577                              u8 tos, struct net_device *dev, int our)
1578 {
1579         struct in_device *in_dev = __in_dev_get_rcu(dev);
1580         unsigned int flags = RTCF_MULTICAST;
1581         struct rtable *rth;
1582         u32 itag = 0;
1583         int err;
1584
1585         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1586         if (err)
1587                 return err;
1588
1589         if (our)
1590                 flags |= RTCF_LOCAL;
1591
1592         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1593                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1594         if (!rth)
1595                 return -ENOBUFS;
1596
1597 #ifdef CONFIG_IP_ROUTE_CLASSID
1598         rth->dst.tclassid = itag;
1599 #endif
1600         rth->dst.output = ip_rt_bug;
1601         rth->rt_is_input= 1;
1602
1603 #ifdef CONFIG_IP_MROUTE
1604         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1605                 rth->dst.input = ip_mr_input;
1606 #endif
1607         RT_CACHE_STAT_INC(in_slow_mc);
1608
1609         skb_dst_set(skb, &rth->dst);
1610         return 0;
1611 }
1612
1613
1614 static void ip_handle_martian_source(struct net_device *dev,
1615                                      struct in_device *in_dev,
1616                                      struct sk_buff *skb,
1617                                      __be32 daddr,
1618                                      __be32 saddr)
1619 {
1620         RT_CACHE_STAT_INC(in_martian_src);
1621 #ifdef CONFIG_IP_ROUTE_VERBOSE
1622         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1623                 /*
1624                  *      RFC1812 recommendation, if source is martian,
1625                  *      the only hint is MAC header.
1626                  */
1627                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1628                         &daddr, &saddr, dev->name);
1629                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1630                         print_hex_dump(KERN_WARNING, "ll header: ",
1631                                        DUMP_PREFIX_OFFSET, 16, 1,
1632                                        skb_mac_header(skb),
1633                                        dev->hard_header_len, true);
1634                 }
1635         }
1636 #endif
1637 }
1638
1639 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1640 {
1641         struct fnhe_hash_bucket *hash;
1642         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1643         u32 hval = fnhe_hashfun(daddr);
1644
1645         spin_lock_bh(&fnhe_lock);
1646
1647         hash = rcu_dereference_protected(nh->nh_exceptions,
1648                                          lockdep_is_held(&fnhe_lock));
1649         hash += hval;
1650
1651         fnhe_p = &hash->chain;
1652         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1653         while (fnhe) {
1654                 if (fnhe->fnhe_daddr == daddr) {
1655                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1656                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1657                         fnhe_flush_routes(fnhe);
1658                         kfree_rcu(fnhe, rcu);
1659                         break;
1660                 }
1661                 fnhe_p = &fnhe->fnhe_next;
1662                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1663                                                  lockdep_is_held(&fnhe_lock));
1664         }
1665
1666         spin_unlock_bh(&fnhe_lock);
1667 }
1668
1669 /* called in rcu_read_lock() section
      *
      * Resolve the forwarding route for @skb towards the nexthop in @res and
      * attach it as the skb's dst.  A cached input route (from a matching
      * nexthop exception, else from the nexthop itself) is reused when caching
      * is allowed; otherwise a new rtable with dst.input = ip_forward is
      * allocated.
      *
      * Returns 0 on success or a negative errno: -EINVAL if the output device
      * has no in_device, or if a non-IP packet would leave through its ingress
      * device while IN_DEV_PROXY_ARP_PVLAN is off; the negative result of
      * fib_validate_source(); -ENOBUFS if rt_dst_alloc() fails.
      */
1670 static int __mkroute_input(struct sk_buff *skb,
1671                            const struct fib_result *res,
1672                            struct in_device *in_dev,
1673                            __be32 daddr, __be32 saddr, u32 tos)
1674 {
1675         struct fib_nh_exception *fnhe;
1676         struct rtable *rth;
1677         int err;
1678         struct in_device *out_dev;
1679         bool do_cache;
1680         u32 itag = 0;
1681
1682         /* get a working reference to the output device */
1683         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1684         if (!out_dev) {
1685                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1686                 return -EINVAL;
1687         }
1688
1689         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1690                                   in_dev->dev, in_dev, &itag);
1691         if (err < 0) {
1692                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1693                                          saddr);
1694
1695                 goto cleanup;
1696         }
1697
             /* Caching needs a fib_info and no itag from fib_validate_source(). */
1698         do_cache = res->fi && !itag;
             /* The packet would leave on the device it arrived on and source
              * validation returned a non-zero (positive) value: flag the skb
              * with IPSKB_DOREDIRECT when redirects are enabled on the device
              * and the sender is on shared media or on-link with the gateway.
              */
1699         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1700             skb->protocol == htons(ETH_P_IP) &&
1701             (IN_DEV_SHARED_MEDIA(out_dev) ||
1702              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1703                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1704
1705         if (skb->protocol != htons(ETH_P_IP)) {
1706                 /* Not IP (i.e. ARP). Do not create route, if it is
1707                  * invalid for proxy arp. DNAT routes are always valid.
1708                  *
1709                  * Proxy arp feature have been extended to allow, ARP
1710                  * replies back to the same interface, to support
1711                  * Private VLAN switch technologies. See arp.c.
1712                  */
1713                 if (out_dev == in_dev &&
1714                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1715                         err = -EINVAL;
1716                         goto cleanup;
1717                 }
1718         }
1719
             /* The exception is looked up even when caching is off, because it
              * is also handed to rt_set_nexthop() below.
              */
1720         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1721         if (do_cache) {
1722                 if (fnhe) {
1723                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1724                         if (rth && rth->dst.expires &&
1725                             time_after(jiffies, rth->dst.expires)) {
                                     /* The exception's cached route has expired:
                                      * delete the exception and fall back to the
                                      * nexthop's own cached input route.
                                      */
1726                                 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1727                                 fnhe = NULL;
1728                         } else {
                                     /* rth may be NULL here; rt_cache_valid()
                                      * below decides whether it can be used.
                                      */
1729                                 goto rt_cache;
1730                         }
1731                 }
1732
1733                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1734
1735 rt_cache:
1736                 if (rt_cache_valid(rth)) {
1737                         skb_dst_set_noref(skb, &rth->dst);
1738                         goto out;
1739                 }
1740         }
1741
             /* No usable cached route: allocate a new one on the output device. */
1742         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1743                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1744                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1745         if (!rth) {
1746                 err = -ENOBUFS;
1747                 goto cleanup;
1748         }
1749
1750         rth->rt_is_input = 1;
1751         RT_CACHE_STAT_INC(in_slow_tot);
1752
1753         rth->dst.input = ip_forward;
1754
1755         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1756                        do_cache);
1757         lwtunnel_set_redirect(&rth->dst);
1758         skb_dst_set(skb, &rth->dst);
1759 out:
1760         err = 0;
1761  cleanup:
1762         return err;
1763 }
1764
1765 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1766 /* To make ICMP packets follow the right flow, the multipath hash is
1767  * calculated from the inner IP addresses.
      *
      * Fills hash_keys->addrs.v4addrs from the IP header quoted inside an ICMP
      * message of one of the types checked below when that header can be read,
      * and from the packet's own (outer) IP header in every other case.
1768  */
1769 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1770                                  struct flow_keys *hash_keys)
1771 {
1772         const struct iphdr *outer_iph = ip_hdr(skb);
             /* default: hash on the outer header unless an inner one is found */
1773         const struct iphdr *key_iph = outer_iph;
1774         const struct iphdr *inner_iph;
1775         const struct icmphdr *icmph;
             /* local buffers handed to skb_header_pointer() below */
1776         struct iphdr _inner_iph;
1777         struct icmphdr _icmph;
1778
1779         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1780                 goto out;
1781
             /* non-zero fragment offset: keep the outer header */
1782         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1783                 goto out;
1784
             /* the ICMP header starts right after the outer IP header (ihl * 4) */
1785         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1786                                    &_icmph);
1787         if (!icmph)
1788                 goto out;
1789
             /* Only these four ICMP types are considered; per RFC 792 they
              * quote the IP header of the datagram that triggered them.
              */
1790         if (icmph->type != ICMP_DEST_UNREACH &&
1791             icmph->type != ICMP_REDIRECT &&
1792             icmph->type != ICMP_TIME_EXCEEDED &&
1793             icmph->type != ICMP_PARAMETERPROB)
1794                 goto out;
1795
             /* the quoted IP header follows the ICMP header */
1796         inner_iph = skb_header_pointer(skb,
1797                                        outer_iph->ihl * 4 + sizeof(_icmph),
1798                                        sizeof(_inner_iph), &_inner_iph);
1799         if (!inner_iph)
1800                 goto out;
1801
1802         key_iph = inner_iph;
1803 out:
1804         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1805         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1806 }
1807
1808 /* if skb is set it will be used and fl4 can be NULL
      *
      * Compute the multipath hash according to
      * net->ipv4.sysctl_fib_multipath_hash_policy:
      *   0 - source and destination address only;
      *   1 - addresses, ports and IP protocol.
      * The 32-bit flow hash is shifted right by one before being returned, so
      * the int result is never negative.
      *
      * NOTE(review): the switch has no default case, so hash_keys would be
      * hashed uninitialised for any other policy value - presumably the
      * sysctl handler limits the value to 0 or 1; confirm.
      */
1809 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1810                        const struct sk_buff *skb, struct flow_keys *flkeys)
1811 {
1812         struct flow_keys hash_keys;
1813         u32 mhash;
1814
1815         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1816         case 0:
1817                 memset(&hash_keys, 0, sizeof(hash_keys));
1818                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1819                 if (skb) {
                             /* addresses come from the packet (inner header for
                              * the ICMP types handled by ip_multipath_l3_keys())
                              */
1820                         ip_multipath_l3_keys(skb, &hash_keys);
1821                 } else {
1822                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1823                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1824                 }
1825                 break;
1826         case 1:
1827                 /* skb is currently provided only when forwarding */
1828                 if (skb) {
1829                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1830                         struct flow_keys keys;
1831
1832                         /* short-circuit if we already have L4 hash present */
1833                         if (skb->l4_hash)
1834                                 return skb_get_hash_raw(skb) >> 1;
1835
1836                         memset(&hash_keys, 0, sizeof(hash_keys));
1837
                             /* dissect the skb only when the caller did not
                              * already supply flow keys
                              */
1838                         if (!flkeys) {
1839                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1840                                 flkeys = &keys;
1841                         }
1842
1843                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1844                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1845                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1846                         hash_keys.ports.src = flkeys->ports.src;
1847                         hash_keys.ports.dst = flkeys->ports.dst;
1848                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1849                 } else {
                             /* no packet: take the same fields from the flow */
1850                         memset(&hash_keys, 0, sizeof(hash_keys));
1851                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1852                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1853                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1854                         hash_keys.ports.src = fl4->fl4_sport;
1855                         hash_keys.ports.dst = fl4->fl4_dport;
1856                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1857                 }
1858                 break;
1859         }
1860         mhash = flow_hash_from_keys(&hash_keys);
1861
1862         return mhash >> 1;
1863 }
1864 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1865
1866 static int ip_mkroute_input(struct sk_buff *skb,
1867                             struct fib_result *res,
1868                             struct in_device *in_dev,
1869                             __be32 daddr, __be32 saddr, u32 tos,
1870                             struct flow_keys *hkeys)
1871 {
             /* With CONFIG_IP_ROUTE_MULTIPATH, first pick one nexthop when the
              * route has several; then build the input route through
              * __mkroute_input() and return its result.
              */
1872 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1873         if (res->fi && res->fi->fib_nhs > 1) {
                     /* no flowi4 is passed, so the hash is derived from the skb
                      * (and from hkeys when the caller provides them)
                      */
1874                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1875
1876                 fib_select_multipath(res, h);
1877         }
1878 #endif
1879
1880         /* create a routing cache entry */
1881         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1882 }
1883
1884 /*
1885  *      NOTE. We drop all packets that have local source
1886  *      addresses, because every properly looped back packet
1887  *      must already have the correct destination attached by the output routine.
1888  *
1889  *      This approach solves two big problems:
1890  *      1. Non-simplex devices are handled properly.
1891  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1892  *      Called with rcu_read_lock() held.
1893  */
1894
1895 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1896                                u8 tos, struct net_device *dev,
1897                                struct fib_result *res)
1898 {
             /* Slow-path input routing for @skb received on @dev.  Rejects
              * martian source/destination addresses, looks the flow up with
              * fib_lookup() and then either attaches a broadcast, local or
              * unreachable route here or hands forwarding to ip_mkroute_input().
              * Returns 0 once a dst is attached to the skb, else a negative errno.
              */
1899         struct in_device *in_dev = __in_dev_get_rcu(dev);
1900         struct flow_keys *flkeys = NULL, _flkeys;
1901         struct net    *net = dev_net(dev);
1902         struct ip_tunnel_info *tun_info;
1903         int             err = -EINVAL;
1904         unsigned int    flags = 0;
1905         u32             itag = 0;
1906         struct rtable   *rth;
1907         struct flowi4   fl4;
1908         bool do_cache;
1909
1910         /* IP on this device is disabled. */
1911
1912         if (!in_dev)
1913                 goto out;
1914
1915         /* Check for the most weird martians, which can be not detected
1916            by fib_lookup.
1917          */
1918
             /* Copy the tunnel id of received (non-TX) tunnel metadata into the
              * flow key before the skb's current dst is dropped.
              */
1919         tun_info = skb_tunnel_info(skb);
1920         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1921                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1922         else
1923                 fl4.flowi4_tun_key.tun_id = 0;
1924         skb_dst_drop(skb);
1925
             /* err is still -EINVAL here, which is what martian_source returns */
1926         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1927                 goto martian_source;
1928
             /* brd_input/local_input read res->fi, so clear it before any jump
              * that bypasses fib_lookup()
              */
1929         res->fi = NULL;
1930         res->table = NULL;
1931         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1932                 goto brd_input;
1933
1934         /* Accept zero addresses only to limited broadcast;
1935          * I even do not know to fix it or not. Waiting for complains :-)
1936          */
1937         if (ipv4_is_zeronet(saddr))
1938                 goto martian_source;
1939
1940         if (ipv4_is_zeronet(daddr))
1941                 goto martian_destination;
1942
1943         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1944          * and call it once if daddr or/and saddr are loopback addresses
1945          */
1946         if (ipv4_is_loopback(daddr)) {
1947                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1948                         goto martian_destination;
1949         } else if (ipv4_is_loopback(saddr)) {
1950                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1951                         goto martian_source;
1952         }
1953
1954         /*
1955          *      Now we are ready to route packet.
1956          */
             /* NOTE(review): fl4 is filled field by field; members not assigned
              * in this function (e.g. protocol and ports) are only set if
              * fib4_rules_early_flow_dissect() fills them - confirm that
              * fib_lookup() never reads them uninitialised.
              */
1957         fl4.flowi4_oif = 0;
1958         fl4.flowi4_iif = dev->ifindex;
1959         fl4.flowi4_mark = skb->mark;
1960         fl4.flowi4_tos = tos;
1961         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1962         fl4.flowi4_flags = 0;
1963         fl4.daddr = daddr;
1964         fl4.saddr = saddr;
1965         fl4.flowi4_uid = sock_net_uid(net, NULL);
1966
             /* keep the dissected keys so the multipath hash can reuse them */
1967         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys))
1968                 flkeys = &_flkeys;
1969
1970         err = fib_lookup(net, &fl4, res, 0);
1971         if (err != 0) {
                     /* Lookup failed.  Without forwarding enabled the error
                      * recorded in the unreachable route becomes -EHOSTUNREACH
                      * instead of the lookup error.
                      */
1972                 if (!IN_DEV_FORWARD(in_dev))
1973                         err = -EHOSTUNREACH;
1974                 goto no_route;
1975         }
1976
1977         if (res->type == RTN_BROADCAST)
1978                 goto brd_input;
1979
1980         if (res->type == RTN_LOCAL) {
1981                 err = fib_validate_source(skb, saddr, daddr, tos,
1982                                           0, dev, in_dev, &itag);
1983                 if (err < 0)
1984                         goto martian_source;
1985                 goto local_input;
1986         }
1987
             /* everything below is forwarding */
1988         if (!IN_DEV_FORWARD(in_dev)) {
1989                 err = -EHOSTUNREACH;
1990                 goto no_route;
1991         }
1992         if (res->type != RTN_UNICAST)
1993                 goto martian_destination;
1994
1995         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
1996 out:    return err;
1997
1998 brd_input:
             /* broadcast routes are only created for IP packets */
1999         if (skb->protocol != htons(ETH_P_IP))
2000                 goto e_inval;
2001
             /* a zero source address is accepted without validation */
2002         if (!ipv4_is_zeronet(saddr)) {
2003                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2004                                           in_dev, &itag);
2005                 if (err < 0)
2006                         goto martian_source;
2007         }
2008         flags |= RTCF_BROADCAST;
2009         res->type = RTN_BROADCAST;
2010         RT_CACHE_STAT_INC(in_brd);
2011
2012 local_input:
             /* Reuse the nexthop's cached input route when the result carries a
              * fib_info and no itag; if that cache entry is not valid, the route
              * built below is cached instead.
              */
2013         do_cache = false;
2014         if (res->fi) {
2015                 if (!itag) {
2016                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2017                         if (rt_cache_valid(rth)) {
2018                                 skb_dst_set_noref(skb, &rth->dst);
2019                                 err = 0;
2020                                 goto out;
2021                         }
2022                         do_cache = true;
2023                 }
2024         }
2025
             /* the route is bound to the l3 master device when there is one,
              * else to the namespace's loopback device
              */
2026         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2027                            flags | RTCF_LOCAL, res->type,
2028                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2029         if (!rth)
2030                 goto e_nobufs;
2031
             /* presumably ip_rt_bug flags any attempt to transmit through this
              * input route - confirm against its definition
              */
2032         rth->dst.output= ip_rt_bug;
2033 #ifdef CONFIG_IP_ROUTE_CLASSID
2034         rth->dst.tclassid = itag;
2035 #endif
2036         rth->rt_is_input = 1;
2037
2038         RT_CACHE_STAT_INC(in_slow_tot);
2039         if (res->type == RTN_UNREACHABLE) {
                     /* reached via no_route: err still holds the failure code,
                      * stored negated in dst.error for ip_error
                      */
2040                 rth->dst.input= ip_error;
2041                 rth->dst.error= -err;
2042                 rth->rt_flags   &= ~RTCF_LOCAL;
2043         }
2044
2045         if (do_cache) {
2046                 struct fib_nh *nh = &FIB_RES_NH(*res);
2047
                     /* Inherit the nexthop's lwtunnel state; when it redirects
                      * input, the current input handler is saved as orig_input
                      * and replaced by lwtunnel_input.
                      */
2048                 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2049                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2050                         WARN_ON(rth->dst.input == lwtunnel_input);
2051                         rth->dst.lwtstate->orig_input = rth->dst.input;
2052                         rth->dst.input = lwtunnel_input;
2053                 }
2054
                     /* could not be stored in the nexthop cache: put the route
                      * on the uncached list instead
                      */
2055                 if (unlikely(!rt_cache_route(nh, rth)))
2056                         rt_add_uncached_list(rth);
2057         }
2058         skb_dst_set(skb, &rth->dst);
2059         err = 0;
2060         goto out;
2061
2062 no_route:
             /* Build an (uncached, res->fi is cleared) unreachable route through
              * the local_input path.
              */
2063         RT_CACHE_STAT_INC(in_no_route);
2064         res->type = RTN_UNREACHABLE;
2065         res->fi = NULL;
2066         res->table = NULL;
2067         goto local_input;
2068
2069         /*
2070          *      Do not cache martian addresses: they should be logged (RFC1812)
2071          */
2072 martian_destination:
2073         RT_CACHE_STAT_INC(in_martian_dst);
2074 #ifdef CONFIG_IP_ROUTE_VERBOSE
2075         if (IN_DEV_LOG_MARTIANS(in_dev))
2076                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2077                                      &daddr, &saddr, dev->name);
2078 #endif
2079
2080 e_inval:
2081         err = -EINVAL;
2082         goto out;
2083
2084 e_nobufs:
2085         err = -ENOBUFS;
2086         goto out;
2087
2088 martian_source:
             /* err is left as set by the caller of this label */
2089         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2090         goto out;
2091 }
2092
2093 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2094                          u8 tos, struct net_device *dev)
2095 {
             /* Wrapper around ip_route_input_rcu(): masks @tos with
              * IPTOS_RT_MASK, takes the RCU read lock for the lookup and
              * returns the lookup's result.  The fib_result is a local that is
              * not used after the call.
              */
2096         struct fib_result res;
2097         int err;
2098
2099         tos &= IPTOS_RT_MASK;
2100         rcu_read_lock();
2101         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2102         rcu_read_unlock();
2103
2104         return err;
2105 }
2106 EXPORT_SYMBOL(ip_route_input_noref);
2107
2108 /* called with rcu_read_lock held
      *
      * Resolve the input route for @skb.  Multicast destinations are handled
      * here: the packet is passed to ip_route_input_mc() when
      * ip_check_mc_rcu() reports a match on the receiving device (or, for an
      * l3 slave, on the device in skb->dev), or - with CONFIG_IP_MROUTE - when
      * IN_DEV_MFORWARD() is set on the receiving device and @daddr is not a
      * local multicast address; otherwise -EINVAL is returned.  Every other
      * destination goes through ip_route_input_slow().
      */
2109 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2110                        u8 tos, struct net_device *dev, struct fib_result *res)
2111 {
2112         /* Multicast recognition logic is moved from route cache to here.
2113            The problem was that too many Ethernet cards have broken/missing
2114            hardware multicast filters :-( As result the host on multicasting
2115            network acquires a lot of useless route cache entries, sort of
2116            SDR messages from all the world. Now we try to get rid of them.
2117            Really, provided software IP multicast filter is organized
2118            reasonably (at least, hashed), it does not result in a slowdown
2119            comparing with route cache reject entries.
2120            Note, that multicast routers are not affected, because
2121            route cache entry is created eventually.
2122          */
2123         if (ipv4_is_multicast(daddr)) {
2124                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2125                 int our = 0;
2126                 int err = -EINVAL;
2127
2128                 if (in_dev)
2129                         our = ip_check_mc_rcu(in_dev, daddr, saddr,
2130                                               ip_hdr(skb)->protocol);
2131
2132                 /* check l3 master if no match yet */
2133                 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2134                         struct in_device *l3_in_dev;
2135
2136                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2137                         if (l3_in_dev)
2138                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2139                                                       ip_hdr(skb)->protocol);
2140                 }
2141
                     /* in_dev can still be NULL at this point (the checks above
                      * allow for it) and IN_DEV_MFORWARD() reads through the
                      * pointer, so the forwarding test must only be evaluated
                      * when an in_device exists.
                      */
2142                 if (our
2143 #ifdef CONFIG_IP_MROUTE
2144                         ||
2145                     (in_dev && !ipv4_is_local_multicast(daddr) &&
2146                      IN_DEV_MFORWARD(in_dev))
2147 #endif
2148                    ) {
2149                         err = ip_route_input_mc(skb, daddr, saddr,
2150                                                 tos, dev, our);
2151                 }
2152                 return err;
2153         }
2154
2155         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2156 }
2157
2158 /* called with rcu_read_lock()
      *
      * Build, or fetch from the nexthop caches, the output route for @fl4
      * through @dev_out.
      *
      * Returns the rtable - a cached one on which dst_hold_safe() succeeded,
      * or a newly allocated one - or an ERR_PTR(): -EINVAL when @dev_out has
      * no in_device, when the source is a loopback address on a device that
      * is neither loopback nor an l3 master while IN_DEV_ROUTE_LOCALNET is
      * off, or when the destination is in the zero network; -ENOBUFS when
      * rt_dst_alloc() fails.
      */
2159 static struct rtable *__mkroute_output(const struct fib_result *res,
2160                                        const struct flowi4 *fl4, int orig_oif,
2161                                        struct net_device *dev_out,
2162                                        unsigned int flags)
2163 {
2164         struct fib_info *fi = res->fi;
2165         struct fib_nh_exception *fnhe;
2166         struct in_device *in_dev;
2167         u16 type = res->type;
2168         struct rtable *rth;
2169         bool do_cache;
2170
2171         in_dev = __in_dev_get_rcu(dev_out);
2172         if (!in_dev)
2173                 return ERR_PTR(-EINVAL);
2174
2175         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2176                 if (ipv4_is_loopback(fl4->saddr) &&
2177                     !(dev_out->flags & IFF_LOOPBACK) &&
2178                     !netif_is_l3_master(dev_out))
2179                         return ERR_PTR(-EINVAL);
2180
             /* the destination address class overrides the type from the lookup */
2181         if (ipv4_is_lbcast(fl4->daddr))
2182                 type = RTN_BROADCAST;
2183         else if (ipv4_is_multicast(fl4->daddr))
2184                 type = RTN_MULTICAST;
2185         else if (ipv4_is_zeronet(fl4->daddr))
2186                 return ERR_PTR(-EINVAL);
2187
2188         if (dev_out->flags & IFF_LOOPBACK)
2189                 flags |= RTCF_LOCAL;
2190
             /* fi is cleared below for the cases that must not use the
              * fib_info; that also disables caching further down
              */
2191         do_cache = true;
2192         if (type == RTN_BROADCAST) {
2193                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2194                 fi = NULL;
2195         } else if (type == RTN_MULTICAST) {
2196                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
                     /* RTCF_LOCAL stays set, and the route is left uncached,
                      * only when ip_check_mc_rcu() reports a match on the
                      * output device
                      */
2197                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2198                                      fl4->flowi4_proto))
2199                         flags &= ~RTCF_LOCAL;
2200                 else
2201                         do_cache = false;
2202                 /* If multicast route do not exist use
2203                  * default one, but do not gateway in this case.
2204                  * Yes, it is hack.
2205                  */
2206                 if (fi && res->prefixlen < 4)
2207                         fi = NULL;
2208         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2209                    (orig_oif != dev_out->ifindex)) {
2210                 /* For local routes that require a particular output interface
2211                  * we do not want to cache the result.  Caching the result
2212                  * causes incorrect behaviour when there are multiple source
2213                  * addresses on the interface, the end result being that if the
2214                  * intended recipient is waiting on that interface for the
2215                  * packet he won't receive it because it will be delivered on
2216                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2217                  * be set to the loopback interface as well.
2218                  */
2219                 fi = NULL;
2220         }
2221
2222         fnhe = NULL;
             /* caching requires a fib_info */
2223         do_cache &= fi != NULL;
2224         if (do_cache) {
2225                 struct rtable __rcu **prth;
2226                 struct fib_nh *nh = &FIB_RES_NH(*res);
2227
2228                 fnhe = find_exception(nh, fl4->daddr);
2229                 if (fnhe) {
2230                         prth = &fnhe->fnhe_rth_output;
2231                         rth = rcu_dereference(*prth);
2232                         if (rth && rth->dst.expires &&
2233                             time_after(jiffies, rth->dst.expires)) {
                                     /* expired exception route: delete the
                                      * exception and use the nexthop cache
                                      */
2234                                 ip_del_fnhe(nh, fl4->daddr);
2235                                 fnhe = NULL;
2236                         } else {
2237                                 goto rt_cache;
2238                         }
2239                 }
2240
                     /* With FLOWI_FLAG_KNOWN_NH set, the per-nexthop cache is
                      * only used when the nexthop has a gateway with link
                      * scope; otherwise an uncached route is built.
                      */
2241                 if (unlikely(fl4->flowi4_flags &
2242                              FLOWI_FLAG_KNOWN_NH &&
2243                              !(nh->nh_gw &&
2244                                nh->nh_scope == RT_SCOPE_LINK))) {
2245                         do_cache = false;
2246                         goto add;
2247                 }
                     /* this CPU's slot of the nexthop's output route cache */
2248                 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2249                 rth = rcu_dereference(*prth);
2250
2251 rt_cache:
                     /* use the cached route only if a reference could be taken */
2252                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2253                         return rth;
2254         }
2255
2256 add:
2257         rth = rt_dst_alloc(dev_out, flags, type,
2258                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2259                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2260                            do_cache);
2261         if (!rth)
2262                 return ERR_PTR(-ENOBUFS);
2263
2264         rth->rt_iif = orig_oif;
2265
2266         RT_CACHE_STAT_INC(out_slow_tot);
2267
2268         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                     /* broadcast/multicast flagged local on a non-loopback
                      * device is sent through ip_mc_output
                      */
2269                 if (flags & RTCF_LOCAL &&
2270                     !(dev_out->flags & IFF_LOOPBACK)) {
2271                         rth->dst.output = ip_mc_output;
2272                         RT_CACHE_STAT_INC(out_slow_mc);
2273                 }
2274 #ifdef CONFIG_IP_MROUTE
2275                 if (type == RTN_MULTICAST) {
2276                         if (IN_DEV_MFORWARD(in_dev) &&
2277                             !ipv4_is_local_multicast(fl4->daddr)) {
2278                                 rth->dst.input = ip_mr_input;
2279                                 rth->dst.output = ip_mc_output;
2280                         }
2281                 }
2282 #endif
2283         }
2284
2285         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2286         lwtunnel_set_redirect(&rth->dst);
2287
2288         return rth;
2289 }
2290
2291 /*
2292  * Major route resolver routine.
      *
      * Prepares @fl4 for an output lookup - input interface set to
      * LOOPBACK_IFINDEX, TOS masked with IPTOS_RT_MASK, scope RT_SCOPE_LINK
      * when RTO_ONLINK is set in the TOS and RT_SCOPE_UNIVERSE otherwise -
      * and runs ip_route_output_key_hash_rcu() under the RCU read lock.
      * Returns whatever that function returns (an rtable or an ERR_PTR()).
2293  */
2294
2295 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2296                                         const struct sk_buff *skb)
2297 {
2298         __u8 tos = RT_FL_TOS(fl4);
             /* start from a defined, empty result: some paths of the lookup
              * reach __mkroute_output() without calling fib_lookup()
              */
2299         struct fib_result res = {
2300                 .type           = RTN_UNSPEC,
2301                 .fi             = NULL,
2302                 .table          = NULL,
2303                 .tclassid       = 0,
2304         };
2305         struct rtable *rth;
2306
2307         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2308         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2309         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2310                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2311
2312         rcu_read_lock();
2313         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2314         rcu_read_unlock();
2315
2316         return rth;
2317 }
2318 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2319
2320 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2321                                             struct fib_result *res,
2322                                             const struct sk_buff *skb)
2323 {
             /* Output route resolution proper; ip_route_output_key_hash() calls
              * it with the RCU read lock held.  Validates the requested source
              * address and output interface, fills in missing @fl4 fields, does
              * the FIB lookup and lets __mkroute_output() build or reuse the
              * rtable.  Returns the rtable or an ERR_PTR().
              *
              * NOTE(review): the -ENETUNREACH initialiser of err is never read;
              * err is always overwritten by fib_lookup() before it is used.
              */
2324         struct net_device *dev_out = NULL;
2325         int orig_oif = fl4->flowi4_oif;
2326         unsigned int flags = 0;
2327         struct rtable *rth;
2328         int err = -ENETUNREACH;
2329
2330         if (fl4->saddr) {
                     /* preset so that every "goto out" inside this block
                      * returns -EINVAL
                      */
2331                 rth = ERR_PTR(-EINVAL);
2332                 if (ipv4_is_multicast(fl4->saddr) ||
2333                     ipv4_is_lbcast(fl4->saddr) ||
2334                     ipv4_is_zeronet(fl4->saddr))
2335                         goto out;
2336
2337                 /* I removed check for oif == dev_out->oif here.
2338                    It was wrong for two reasons:
2339                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2340                       is assigned to multiple interfaces.
2341                    2. Moreover, we are allowed to send packets with saddr
2342                       of another iface. --ANK
2343                  */
2344
2345                 if (fl4->flowi4_oif == 0 &&
2346                     (ipv4_is_multicast(fl4->daddr) ||
2347                      ipv4_is_lbcast(fl4->daddr))) {
2348                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2349                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2350                         if (!dev_out)
2351                                 goto out;
2352
2353                         /* Special hack: user can direct multicasts
2354                            and limited broadcast via necessary interface
2355                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2356                            This hack is not just for fun, it allows
2357                            vic,vat and friends to work.
2358                            They bind socket to loopback, set ttl to zero
2359                            and expect that it will work.
2360                            From the viewpoint of routing cache they are broken,
2361                            because we are not allowed to build multicast path
2362                            with loopback source addr (look, routing cache
2363                            cannot know, that ttl is zero, so that packet
2364                            will not leave this host and route is valid).
2365                            Luckily, this hack is good workaround.
2366                          */
2367
2368                         fl4->flowi4_oif = dev_out->ifindex;
2369                         goto make_route;
2370                 }
2371
                     /* unless FLOWI_FLAG_ANYSRC is set, the source address must
                      * be found on a device by __ip_dev_find()
                      */
2372                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2373                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2374                         if (!__ip_dev_find(net, fl4->saddr, false))
2375                                 goto out;
2376                 }
2377         }
2378
2379
2380         if (fl4->flowi4_oif) {
                     /* an explicit output interface was requested */
2381                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2382                 rth = ERR_PTR(-ENODEV);
2383                 if (!dev_out)
2384                         goto out;
2385
2386                 /* RACE: Check return value of inet_select_addr instead. */
2387                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2388                         rth = ERR_PTR(-ENETUNREACH);
2389                         goto out;
2390                 }
                     /* local multicast, limited broadcast and IGMP go straight
                      * out of the requested device, without a FIB lookup
                      */
2391                 if (ipv4_is_local_multicast(fl4->daddr) ||
2392                     ipv4_is_lbcast(fl4->daddr) ||
2393                     fl4->flowi4_proto == IPPROTO_IGMP) {
2394                         if (!fl4->saddr)
2395                                 fl4->saddr = inet_select_addr(dev_out, 0,
2396                                                               RT_SCOPE_LINK);
2397                         goto make_route;
2398                 }
2399                 if (!fl4->saddr) {
2400                         if (ipv4_is_multicast(fl4->daddr))
2401                                 fl4->saddr = inet_select_addr(dev_out, 0,
2402                                                               fl4->flowi4_scope);
2403                         else if (!fl4->daddr)
2404                                 fl4->saddr = inet_select_addr(dev_out, 0,
2405                                                               RT_SCOPE_HOST);
2406                 }
2407         }
2408
2409         if (!fl4->daddr) {
                     /* No destination given: use the source address as the
                      * destination, or INADDR_LOOPBACK for both when there is
                      * no source either, and route over the loopback device.
                      */
2410                 fl4->daddr = fl4->saddr;
2411                 if (!fl4->daddr)
2412                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2413                 dev_out = net->loopback_dev;
2414                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2415                 res->type = RTN_LOCAL;
2416                 flags |= RTCF_LOCAL;
2417                 goto make_route;
2418         }
2419
2420         err = fib_lookup(net, fl4, res, 0);
2421         if (err) {
2422                 res->fi = NULL;
2423                 res->table = NULL;
                     /* flowi4_oif is non-zero here, so dev_out was set by the
                      * interface block above
                      */
2424                 if (fl4->flowi4_oif &&
2425                     (ipv4_is_multicast(fl4->daddr) ||
2426                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2427                         /* Apparently, routing tables are wrong. Assume,
2428                            that the destination is on link.
2429
2430                            WHY? DW.
2431                            Because we are allowed to send to iface
2432                            even if it has NO routes and NO assigned
2433                            addresses. When oif is specified, routing
2434                            tables are looked up with only one purpose:
2435                            to catch if destination is gatewayed, rather than
2436                            direct. Moreover, if MSG_DONTROUTE is set,
2437                            we send packet, ignoring both routing tables
2438                            and ifaddr state. --ANK
2439
2440
2441                            We could make it even if oif is unknown,
2442                            likely IPv6, but we do not.
2443                          */
2444
2445                         if (fl4->saddr == 0)
2446                                 fl4->saddr = inet_select_addr(dev_out, 0,
2447                                                               RT_SCOPE_LINK);
2448                         res->type = RTN_UNICAST;
2449                         goto make_route;
2450                 }
2451                 rth = ERR_PTR(err);
2452                 goto out;
2453         }
2454
2455         if (res->type == RTN_LOCAL) {
                     /* destination is a local address: pick the source from the
                      * route's preferred source, else reuse the destination
                      */
2456                 if (!fl4->saddr) {
2457                         if (res->fi->fib_prefsrc)
2458                                 fl4->saddr = res->fi->fib_prefsrc;
2459                         else
2460                                 fl4->saddr = fl4->daddr;
2461                 }
2462
2463                 /* L3 master device is the loopback for that domain */
2464                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2465                         net->loopback_dev;
2466
2467                 /* make sure orig_oif points to fib result device even
2468                  * though packet rx/tx happens over loopback or l3mdev
2469                  */
2470                 orig_oif = FIB_RES_OIF(*res);
2471
2472                 fl4->flowi4_oif = dev_out->ifindex;
2473                 flags |= RTCF_LOCAL;
2474                 goto make_route;
2475         }
2476
             /* the output device is read from res only after fib_select_path()
              * has run
              */
2477         fib_select_path(net, res, fl4, skb);
2478
2479         dev_out = FIB_RES_DEV(*res);
2480         fl4->flowi4_oif = dev_out->ifindex;
2481
2482
2483 make_route:
2484         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2485
2486 out:
2487         return rth;
2488 }
2489
2490 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2491 {
2492         return NULL;
2493 }
2494
2495 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2496 {
2497         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2498
2499         return mtu ? : dst->dev->mtu;
2500 }
2501
2502 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2503                                           struct sk_buff *skb, u32 mtu)
2504 {
2505 }
2506
/*
 * ->redirect handler of ipv4_dst_blackhole_ops.
 *
 * Deliberately empty: a redirect on a blackhole route is ignored.
 */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
				       struct sock *sk,
				       struct sk_buff *skb)
{
}
2511
2512 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2513                                           unsigned long old)
2514 {
2515         return NULL;
2516 }
2517
2518 static struct dst_ops ipv4_dst_blackhole_ops = {
2519         .family                 =       AF_INET,
2520         .check                  =       ipv4_blackhole_dst_check,
2521         .mtu                    =       ipv4_blackhole_mtu,
2522         .default_advmss         =       ipv4_default_advmss,
2523         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2524         .redirect               =       ipv4_rt_blackhole_redirect,
2525         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2526         .neigh_lookup           =       ipv4_neigh_lookup,
2527 };
2528
2529 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2530 {
2531         struct rtable *ort = (struct rtable *) dst_orig;
2532         struct rtable *rt;
2533
2534         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2535         if (rt) {
2536                 struct dst_entry *new = &rt->dst;
2537
2538                 new->__use = 1;
2539                 new->input = dst_discard;
2540                 new->output = dst_discard_out;
2541
2542                 new->dev = net->loopback_dev;
2543                 if (new->dev)
2544                         dev_hold(new->dev);
2545
2546                 rt->rt_is_input = ort->rt_is_input;
2547                 rt->rt_iif = ort->rt_iif;
2548                 rt->rt_pmtu = ort->rt_pmtu;
2549                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2550
2551                 rt->rt_genid = rt_genid_ipv4(net);
2552                 rt->rt_flags = ort->rt_flags;
2553                 rt->rt_type = ort->rt_type;
2554                 rt->rt_gateway = ort->rt_gateway;
2555                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2556
2557                 INIT_LIST_HEAD(&rt->rt_uncached);
2558         }
2559
2560         dst_release(dst_orig);
2561
2562         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2563 }
2564
2565 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2566                                     const struct sock *sk)
2567 {
2568         struct rtable *rt = __ip_route_output_key(net, flp4);
2569
2570         if (IS_ERR(rt))
2571                 return rt;
2572
2573         if (flp4->flowi4_proto)
2574                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2575                                                         flowi4_to_flowi(flp4),
2576                                                         sk, 0);
2577
2578         return rt;
2579 }
2580 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2581
2582 /* called with rcu_read_lock held */
2583 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2584                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2585                         u32 seq)
2586 {
2587         struct rtable *rt = skb_rtable(skb);
2588         struct rtmsg *r;
2589         struct nlmsghdr *nlh;
2590         unsigned long expires = 0;
2591         u32 error;
2592         u32 metrics[RTAX_MAX];
2593
2594         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2595         if (!nlh)
2596                 return -EMSGSIZE;
2597
2598         r = nlmsg_data(nlh);
2599         r->rtm_family    = AF_INET;
2600         r->rtm_dst_len  = 32;
2601         r->rtm_src_len  = 0;
2602         r->rtm_tos      = fl4->flowi4_tos;
2603         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2604         if (nla_put_u32(skb, RTA_TABLE, table_id))
2605                 goto nla_put_failure;
2606         r->rtm_type     = rt->rt_type;
2607         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2608         r->rtm_protocol = RTPROT_UNSPEC;
2609         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2610         if (rt->rt_flags & RTCF_NOTIFY)
2611                 r->rtm_flags |= RTM_F_NOTIFY;
2612         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2613                 r->rtm_flags |= RTCF_DOREDIRECT;
2614
2615         if (nla_put_in_addr(skb, RTA_DST, dst))
2616                 goto nla_put_failure;
2617         if (src) {
2618                 r->rtm_src_len = 32;
2619                 if (nla_put_in_addr(skb, RTA_SRC, src))
2620                         goto nla_put_failure;
2621         }
2622         if (rt->dst.dev &&
2623             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2624                 goto nla_put_failure;
2625 #ifdef CONFIG_IP_ROUTE_CLASSID
2626         if (rt->dst.tclassid &&
2627             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2628                 goto nla_put_failure;
2629 #endif
2630         if (!rt_is_input_route(rt) &&
2631             fl4->saddr != src) {
2632                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2633                         goto nla_put_failure;
2634         }
2635         if (rt->rt_uses_gateway &&
2636             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2637                 goto nla_put_failure;
2638
2639         expires = rt->dst.expires;
2640         if (expires) {
2641                 unsigned long now = jiffies;
2642
2643                 if (time_before(now, expires))
2644                         expires -= now;
2645                 else
2646                         expires = 0;
2647         }
2648
2649         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2650         if (rt->rt_pmtu && expires)
2651                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2652         if (rt->rt_mtu_locked && expires)
2653                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2654         if (rtnetlink_put_metrics(skb, metrics) < 0)
2655                 goto nla_put_failure;
2656
2657         if (fl4->flowi4_mark &&
2658             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2659                 goto nla_put_failure;
2660
2661         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2662             nla_put_u32(skb, RTA_UID,
2663                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2664                 goto nla_put_failure;
2665
2666         error = rt->dst.error;
2667
2668         if (rt_is_input_route(rt)) {
2669 #ifdef CONFIG_IP_MROUTE
2670                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2671                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2672                         int err = ipmr_get_route(net, skb,
2673                                                  fl4->saddr, fl4->daddr,
2674                                                  r, portid);
2675
2676                         if (err <= 0) {
2677                                 if (err == 0)
2678                                         return 0;
2679                                 goto nla_put_failure;
2680                         }
2681                 } else
2682 #endif
2683                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2684                                 goto nla_put_failure;
2685         }
2686
2687         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2688                 goto nla_put_failure;
2689
2690         nlmsg_end(skb, nlh);
2691         return 0;
2692
2693 nla_put_failure:
2694         nlmsg_cancel(skb, nlh);
2695         return -EMSGSIZE;
2696 }
2697
2698 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2699                              struct netlink_ext_ack *extack)
2700 {
2701         struct net *net = sock_net(in_skb->sk);
2702         struct rtmsg *rtm;
2703         struct nlattr *tb[RTA_MAX+1];
2704         struct fib_result res = {};
2705         struct rtable *rt = NULL;
2706         struct flowi4 fl4;
2707         __be32 dst = 0;
2708         __be32 src = 0;
2709         u32 iif;
2710         int err;
2711         int mark;
2712         struct sk_buff *skb;
2713         u32 table_id = RT_TABLE_MAIN;
2714         kuid_t uid;
2715
2716         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2717                           extack);
2718         if (err < 0)
2719                 goto errout;
2720
2721         rtm = nlmsg_data(nlh);
2722
2723         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2724         if (!skb) {
2725                 err = -ENOBUFS;
2726                 goto errout;
2727         }
2728
2729         /* Reserve room for dummy headers, this skb can pass
2730            through good chunk of routing engine.
2731          */
2732         skb_reset_mac_header(skb);
2733         skb_reset_network_header(skb);
2734
2735         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2736         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2737         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2738         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2739         if (tb[RTA_UID])
2740                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2741         else
2742                 uid = (iif ? INVALID_UID : current_uid());
2743
2744         /* Bugfix: need to give ip_route_input enough of an IP header to
2745          * not gag.
2746          */
2747         ip_hdr(skb)->protocol = IPPROTO_UDP;
2748         ip_hdr(skb)->saddr = src;
2749         ip_hdr(skb)->daddr = dst;
2750
2751         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2752
2753         memset(&fl4, 0, sizeof(fl4));
2754         fl4.daddr = dst;
2755         fl4.saddr = src;
2756         fl4.flowi4_tos = rtm->rtm_tos;
2757         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2758         fl4.flowi4_mark = mark;
2759         fl4.flowi4_uid = uid;
2760
2761         rcu_read_lock();
2762
2763         if (iif) {
2764                 struct net_device *dev;
2765
2766                 dev = dev_get_by_index_rcu(net, iif);
2767                 if (!dev) {
2768                         err = -ENODEV;
2769                         goto errout_free;
2770                 }
2771
2772                 skb->protocol   = htons(ETH_P_IP);
2773                 skb->dev        = dev;
2774                 skb->mark       = mark;
2775                 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2776                                          dev, &res);
2777
2778                 rt = skb_rtable(skb);
2779                 if (err == 0 && rt->dst.error)
2780                         err = -rt->dst.error;
2781         } else {
2782                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2783                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2784                 err = 0;
2785                 if (IS_ERR(rt))
2786                         err = PTR_ERR(rt);
2787                 else
2788                         skb_dst_set(skb, &rt->dst);
2789         }
2790
2791         if (err)
2792                 goto errout_free;
2793
2794         if (rtm->rtm_flags & RTM_F_NOTIFY)
2795                 rt->rt_flags |= RTCF_NOTIFY;
2796
2797         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2798                 table_id = res.table ? res.table->tb_id : 0;
2799
2800         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2801                 if (!res.fi) {
2802                         err = fib_props[res.type].error;
2803                         if (!err)
2804                                 err = -EHOSTUNREACH;
2805                         goto errout_free;
2806                 }
2807                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2808                                     nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2809                                     rt->rt_type, res.prefix, res.prefixlen,
2810                                     fl4.flowi4_tos, res.fi, 0);
2811         } else {
2812                 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2813                                    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2814         }
2815         if (err < 0)
2816                 goto errout_free;
2817
2818         rcu_read_unlock();
2819
2820         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2821 errout:
2822         return err;
2823
2824 errout_free:
2825         rcu_read_unlock();
2826         kfree_skb(skb);
2827         goto errout;
2828 }
2829
2830 void ip_rt_multicast_event(struct in_device *in_dev)
2831 {
2832         rt_cache_flush(dev_net(in_dev->dev));
2833 }
2834
2835 #ifdef CONFIG_SYSCTL
2836 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2837 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2838 static int ip_rt_gc_elasticity __read_mostly    = 8;
2839 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
2840
2841 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2842                                         void __user *buffer,
2843                                         size_t *lenp, loff_t *ppos)
2844 {
2845         struct net *net = (struct net *)__ctl->extra1;
2846
2847         if (write) {
2848                 rt_cache_flush(net);
2849                 fnhe_genid_bump(net);
2850                 return 0;
2851         }
2852
2853         return -EINVAL;
2854 }
2855
2856 static struct ctl_table ipv4_route_table[] = {
2857         {
2858                 .procname       = "gc_thresh",
2859                 .data           = &ipv4_dst_ops.gc_thresh,
2860                 .maxlen         = sizeof(int),
2861                 .mode           = 0644,
2862                 .proc_handler   = proc_dointvec,
2863         },
2864         {
2865                 .procname       = "max_size",
2866                 .data           = &ip_rt_max_size,
2867                 .maxlen         = sizeof(int),
2868                 .mode           = 0644,
2869                 .proc_handler   = proc_dointvec,
2870         },
2871         {
2872                 /*  Deprecated. Use gc_min_interval_ms */
2873
2874                 .procname       = "gc_min_interval",
2875                 .data           = &ip_rt_gc_min_interval,
2876                 .maxlen         = sizeof(int),
2877                 .mode           = 0644,
2878                 .proc_handler   = proc_dointvec_jiffies,
2879         },
2880         {
2881                 .procname       = "gc_min_interval_ms",
2882                 .data           = &ip_rt_gc_min_interval,
2883                 .maxlen         = sizeof(int),
2884                 .mode           = 0644,
2885                 .proc_handler   = proc_dointvec_ms_jiffies,
2886         },
2887         {
2888                 .procname       = "gc_timeout",
2889                 .data           = &ip_rt_gc_timeout,
2890                 .maxlen         = sizeof(int),
2891                 .mode           = 0644,
2892                 .proc_handler   = proc_dointvec_jiffies,
2893         },
2894         {
2895                 .procname       = "gc_interval",
2896                 .data           = &ip_rt_gc_interval,
2897                 .maxlen         = sizeof(int),
2898                 .mode           = 0644,
2899                 .proc_handler   = proc_dointvec_jiffies,
2900         },
2901         {
2902                 .procname       = "redirect_load",
2903                 .data           = &ip_rt_redirect_load,
2904                 .maxlen         = sizeof(int),
2905                 .mode           = 0644,
2906                 .proc_handler   = proc_dointvec,
2907         },
2908         {
2909                 .procname       = "redirect_number",
2910                 .data           = &ip_rt_redirect_number,
2911                 .maxlen         = sizeof(int),
2912                 .mode           = 0644,
2913                 .proc_handler   = proc_dointvec,
2914         },
2915         {
2916                 .procname       = "redirect_silence",
2917                 .data           = &ip_rt_redirect_silence,
2918                 .maxlen         = sizeof(int),
2919                 .mode           = 0644,
2920                 .proc_handler   = proc_dointvec,
2921         },
2922         {
2923                 .procname       = "error_cost",
2924                 .data           = &ip_rt_error_cost,
2925                 .maxlen         = sizeof(int),
2926                 .mode           = 0644,
2927                 .proc_handler   = proc_dointvec,
2928         },
2929         {
2930                 .procname       = "error_burst",
2931                 .data           = &ip_rt_error_burst,
2932                 .maxlen         = sizeof(int),
2933                 .mode           = 0644,
2934                 .proc_handler   = proc_dointvec,
2935         },
2936         {
2937                 .procname       = "gc_elasticity",
2938                 .data           = &ip_rt_gc_elasticity,
2939                 .maxlen         = sizeof(int),
2940                 .mode           = 0644,
2941                 .proc_handler   = proc_dointvec,
2942         },
2943         {
2944                 .procname       = "mtu_expires",
2945                 .data           = &ip_rt_mtu_expires,
2946                 .maxlen         = sizeof(int),
2947                 .mode           = 0644,
2948                 .proc_handler   = proc_dointvec_jiffies,
2949         },
2950         {
2951                 .procname       = "min_pmtu",
2952                 .data           = &ip_rt_min_pmtu,
2953                 .maxlen         = sizeof(int),
2954                 .mode           = 0644,
2955                 .proc_handler   = proc_dointvec_minmax,
2956                 .extra1         = &ip_min_valid_pmtu,
2957         },
2958         {
2959                 .procname       = "min_adv_mss",
2960                 .data           = &ip_rt_min_advmss,
2961                 .maxlen         = sizeof(int),
2962                 .mode           = 0644,
2963                 .proc_handler   = proc_dointvec,
2964         },
2965         { }
2966 };
2967
2968 static struct ctl_table ipv4_route_flush_table[] = {
2969         {
2970                 .procname       = "flush",
2971                 .maxlen         = sizeof(int),
2972                 .mode           = 0200,
2973                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2974         },
2975         { },
2976 };
2977
2978 static __net_init int sysctl_route_net_init(struct net *net)
2979 {
2980         struct ctl_table *tbl;
2981
2982         tbl = ipv4_route_flush_table;
2983         if (!net_eq(net, &init_net)) {
2984                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2985                 if (!tbl)
2986                         goto err_dup;
2987
2988                 /* Don't export sysctls to unprivileged users */
2989                 if (net->user_ns != &init_user_ns)
2990                         tbl[0].procname = NULL;
2991         }
2992         tbl[0].extra1 = net;
2993
2994         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2995         if (!net->ipv4.route_hdr)
2996                 goto err_reg;
2997         return 0;
2998
2999 err_reg:
3000         if (tbl != ipv4_route_flush_table)
3001                 kfree(tbl);
3002 err_dup:
3003         return -ENOMEM;
3004 }
3005
3006 static __net_exit void sysctl_route_net_exit(struct net *net)
3007 {
3008         struct ctl_table *tbl;
3009
3010         tbl = net->ipv4.route_hdr->ctl_table_arg;
3011         unregister_net_sysctl_table(net->ipv4.route_hdr);
3012         BUG_ON(tbl == ipv4_route_flush_table);
3013         kfree(tbl);
3014 }
3015
3016 static __net_initdata struct pernet_operations sysctl_route_ops = {
3017         .init = sysctl_route_net_init,
3018         .exit = sysctl_route_net_exit,
3019 };
3020 #endif
3021
3022 static __net_init int rt_genid_init(struct net *net)
3023 {
3024         atomic_set(&net->ipv4.rt_genid, 0);
3025         atomic_set(&net->fnhe_genid, 0);
3026         atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3027         return 0;
3028 }
3029
3030 static __net_initdata struct pernet_operations rt_genid_ops = {
3031         .init = rt_genid_init,
3032 };
3033
3034 static int __net_init ipv4_inetpeer_init(struct net *net)
3035 {
3036         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3037
3038         if (!bp)
3039                 return -ENOMEM;
3040         inet_peer_base_init(bp);
3041         net->ipv4.peers = bp;
3042         return 0;
3043 }
3044
3045 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3046 {
3047         struct inet_peer_base *bp = net->ipv4.peers;
3048
3049         net->ipv4.peers = NULL;
3050         inetpeer_invalidate_tree(bp);
3051         kfree(bp);
3052 }
3053
3054 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3055         .init   =       ipv4_inetpeer_init,
3056         .exit   =       ipv4_inetpeer_exit,
3057 };
3058
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu accounting area; allocated in ip_rt_init() with room for 256
 * struct ip_rt_acct entries.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3062
3063 int __init ip_rt_init(void)
3064 {
3065         int cpu;
3066
3067         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3068         if (!ip_idents)
3069                 panic("IP: failed to allocate ip_idents\n");
3070
3071         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3072
3073         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3074         if (!ip_tstamps)
3075                 panic("IP: failed to allocate ip_tstamps\n");
3076
3077         for_each_possible_cpu(cpu) {
3078                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3079
3080                 INIT_LIST_HEAD(&ul->head);
3081                 spin_lock_init(&ul->lock);
3082         }
3083 #ifdef CONFIG_IP_ROUTE_CLASSID
3084         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3085         if (!ip_rt_acct)
3086                 panic("IP: failed to allocate ip_rt_acct\n");
3087 #endif
3088
3089         ipv4_dst_ops.kmem_cachep =
3090                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3091                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3092
3093         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3094
3095         if (dst_entries_init(&ipv4_dst_ops) < 0)
3096                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3097
3098         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3099                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3100
3101         ipv4_dst_ops.gc_thresh = ~0;
3102         ip_rt_max_size = INT_MAX;
3103
3104         devinet_init();
3105         ip_fib_init();
3106
3107         if (ip_rt_proc_init())
3108                 pr_err("Unable to create route proc files\n");
3109 #ifdef CONFIG_XFRM
3110         xfrm_init();
3111         xfrm4_init();
3112 #endif
3113         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3114                       RTNL_FLAG_DOIT_UNLOCKED);
3115
3116 #ifdef CONFIG_SYSCTL
3117         register_pernet_subsys(&sysctl_route_ops);
3118 #endif
3119         register_pernet_subsys(&rt_genid_ops);
3120         register_pernet_subsys(&ipv4_inetpeer_ops);
3121         return 0;
3122 }
3123
#ifdef CONFIG_SYSCTL
/*
 * Register the static ipv4_route_table entries for init_net under
 * "net/ipv4/route".  The result of the registration is not checked.
 *
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif /* CONFIG_SYSCTL */