net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/skbuff.h>
  83 #include <linux/inetdevice.h>
  84 #include <linux/igmp.h>
  85 #include <linux/pkt_sched.h>
  86 #include <linux/mroute.h>
  87 #include <linux/netfilter_ipv4.h>
  88 #include <linux/random.h>
  89 #include <linux/rcupdate.h>
  90 #include <linux/times.h>
  91 #include <linux/slab.h>
  92 #include <linux/jhash.h>
  93 #include <net/dst.h>
  94 #include <net/dst_metadata.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/lwtunnel.h>
 107 #include <net/netevent.h>
 108 #include <net/rtnetlink.h>
 109 #ifdef CONFIG_SYSCTL
 110 #include <linux/sysctl.h>
 111 #include <linux/kmemleak.h>
 112 #endif
 113 #include <net/secure_seq.h>
 114 #include <net/ip_tunnels.h>
 115 #include <net/l3mdev.h>
 116
 117 #include "fib_lookup.h"
 118
 119 #define RT_FL_TOS(oldflp4) \
 120         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 121
 122 #define RT_GC_TIMEOUT (300*HZ)
 123
 124 static int ip_rt_max_size;
 125 static int ip_rt_redirect_number __read_mostly  = 9;
 126 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 128 static int ip_rt_error_cost __read_mostly       = HZ;
 129 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 130 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 131 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 132 static int ip_rt_min_advmss __read_mostly       = 256;
 133
 134 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 135
 136 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
 137
 138 /*
 139  *      Interface to generic destination cache.
 140  */
 141
 142 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 143 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 144 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 146 static void              ipv4_link_failure(struct sk_buff *skb);
 147 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 148                                            struct sk_buff *skb, u32 mtu);
 149 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 150                                         struct sk_buff *skb);
 151 static void             ipv4_dst_destroy(struct dst_entry *dst);
 152
 153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 154 {
 155         WARN_ON(1);
 156         return NULL;
 157 }
 158
 159 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 160                                            struct sk_buff *skb,
 161                                            const void *daddr);
 162 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 163
 164 static struct dst_ops ipv4_dst_ops = {
 165         .family =               AF_INET,
 166         .check =                ipv4_dst_check,
 167         .default_advmss =       ipv4_default_advmss,
 168         .mtu =                  ipv4_mtu,
 169         .cow_metrics =          ipv4_cow_metrics,
 170         .destroy =              ipv4_dst_destroy,
 171         .negative_advice =      ipv4_negative_advice,
 172         .link_failure =         ipv4_link_failure,
 173         .update_pmtu =          ip_rt_update_pmtu,
 174         .redirect =             ip_do_redirect,
 175         .local_out =            __ip_local_out,
 176         .neigh_lookup =         ipv4_neigh_lookup,
 177         .confirm_neigh =        ipv4_confirm_neigh,
 178 };
 179
 180 #define ECN_OR_COST(class)      TC_PRIO_##class
 181
 182 const __u8 ip_tos2prio[16] = {
 183         TC_PRIO_BESTEFFORT,
 184         ECN_OR_COST(BESTEFFORT),
 185         TC_PRIO_BESTEFFORT,
 186         ECN_OR_COST(BESTEFFORT),
 187         TC_PRIO_BULK,
 188         ECN_OR_COST(BULK),
 189         TC_PRIO_BULK,
 190         ECN_OR_COST(BULK),
 191         TC_PRIO_INTERACTIVE,
 192         ECN_OR_COST(INTERACTIVE),
 193         TC_PRIO_INTERACTIVE,
 194         ECN_OR_COST(INTERACTIVE),
 195         TC_PRIO_INTERACTIVE_BULK,
 196         ECN_OR_COST(INTERACTIVE_BULK),
 197         TC_PRIO_INTERACTIVE_BULK,
 198         ECN_OR_COST(INTERACTIVE_BULK)
 199 };
 200 EXPORT_SYMBOL(ip_tos2prio);
 201
 202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 203 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 204
 205 #ifdef CONFIG_PROC_FS
 206 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 207 {
 208         if (*pos)
 209                 return NULL;
 210         return SEQ_START_TOKEN;
 211 }
 212
 213 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 214 {
 215         ++*pos;
 216         return NULL;
 217 }
 218
 219 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 220 {
 221 }
 222
 223 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 224 {
 225         if (v == SEQ_START_TOKEN)
 226                 seq_printf(seq, "%-127s\n",
 227                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 228                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 229                            "HHUptod\tSpecDst");
 230         return 0;
 231 }
 232
 233 static const struct seq_operations rt_cache_seq_ops = {
 234         .start  = rt_cache_seq_start,
 235         .next   = rt_cache_seq_next,
 236         .stop   = rt_cache_seq_stop,
 237         .show   = rt_cache_seq_show,
 238 };
 239
 240 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 241 {
 242         return seq_open(file, &rt_cache_seq_ops);
 243 }
 244
 245 static const struct file_operations rt_cache_seq_fops = {
 246         .owner   = THIS_MODULE,
 247         .open    = rt_cache_seq_open,
 248         .read    = seq_read,
 249         .llseek  = seq_lseek,
 250         .release = seq_release,
 251 };
 252
 253
 254 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 255 {
 256         int cpu;
 257
 258         if (*pos == 0)
 259                 return SEQ_START_TOKEN;
 260
 261         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 262                 if (!cpu_possible(cpu))
 263                         continue;
 264                 *pos = cpu+1;
 265                 return &per_cpu(rt_cache_stat, cpu);
 266         }
 267         return NULL;
 268 }
 269
 270 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 271 {
 272         int cpu;
 273
 274         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 275                 if (!cpu_possible(cpu))
 276                         continue;
 277                 *pos = cpu+1;
 278                 return &per_cpu(rt_cache_stat, cpu);
 279         }
 280         return NULL;
 281
 282 }
 283
 284 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 285 {
 286
 287 }
 288
 289 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 290 {
 291         struct rt_cache_stat *st = v;
 292
 293         if (v == SEQ_START_TOKEN) {
 294                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 295                 return 0;
 296         }
 297
 298         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 299                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 300                    dst_entries_get_slow(&ipv4_dst_ops),
 301                    0, /* st->in_hit */
 302                    st->in_slow_tot,
 303                    st->in_slow_mc,
 304                    st->in_no_route,
 305                    st->in_brd,
 306                    st->in_martian_dst,
 307                    st->in_martian_src,
 308
 309                    0, /* st->out_hit */
 310                    st->out_slow_tot,
 311                    st->out_slow_mc,
 312
 313                    0, /* st->gc_total */
 314                    0, /* st->gc_ignored */
 315                    0, /* st->gc_goal_miss */
 316                    0, /* st->gc_dst_overflow */
 317                    0, /* st->in_hlist_search */
 318                    0  /* st->out_hlist_search */
 319                 );
 320         return 0;
 321 }
 322
 323 static const struct seq_operations rt_cpu_seq_ops = {
 324         .start  = rt_cpu_seq_start,
 325         .next   = rt_cpu_seq_next,
 326         .stop   = rt_cpu_seq_stop,
 327         .show   = rt_cpu_seq_show,
 328 };
 329
 330
 331 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 332 {
 333         return seq_open(file, &rt_cpu_seq_ops);
 334 }
 335
 336 static const struct file_operations rt_cpu_seq_fops = {
 337         .owner   = THIS_MODULE,
 338         .open    = rt_cpu_seq_open,
 339         .read    = seq_read,
 340         .llseek  = seq_lseek,
 341         .release = seq_release,
 342 };
 343
 344 #ifdef CONFIG_IP_ROUTE_CLASSID
 345 static int rt_acct_proc_show(struct seq_file *m, void *v)
 346 {
 347         struct ip_rt_acct *dst, *src;
 348         unsigned int i, j;
 349
 350         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 351         if (!dst)
 352                 return -ENOMEM;
 353
 354         for_each_possible_cpu(i) {
 355                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 356                 for (j = 0; j < 256; j++) {
 357                         dst[j].o_bytes   += src[j].o_bytes;
 358                         dst[j].o_packets += src[j].o_packets;
 359                         dst[j].i_bytes   += src[j].i_bytes;
 360                         dst[j].i_packets += src[j].i_packets;
 361                 }
 362         }
 363
 364         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 365         kfree(dst);
 366         return 0;
 367 }
 368
 369 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 370 {
 371         return single_open(file, rt_acct_proc_show, NULL);
 372 }
 373
 374 static const struct file_operations rt_acct_proc_fops = {
 375         .owner          = THIS_MODULE,
 376         .open           = rt_acct_proc_open,
 377         .read           = seq_read,
 378         .llseek         = seq_lseek,
 379         .release        = single_release,
 380 };
 381 #endif
 382
 383 static int __net_init ip_rt_do_proc_init(struct net *net)
 384 {
 385         struct proc_dir_entry *pde;
 386
 387         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 388                           &rt_cache_seq_fops);
 389         if (!pde)
 390                 goto err1;
 391
 392         pde = proc_create("rt_cache", S_IRUGO,
 393                           net->proc_net_stat, &rt_cpu_seq_fops);
 394         if (!pde)
 395                 goto err2;
 396
 397 #ifdef CONFIG_IP_ROUTE_CLASSID
 398         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 399         if (!pde)
 400                 goto err3;
 401 #endif
 402         return 0;
 403
 404 #ifdef CONFIG_IP_ROUTE_CLASSID
 405 err3:
 406         remove_proc_entry("rt_cache", net->proc_net_stat);
 407 #endif
 408 err2:
 409         remove_proc_entry("rt_cache", net->proc_net);
 410 err1:
 411         return -ENOMEM;
 412 }
 413
 414 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 415 {
 416         remove_proc_entry("rt_cache", net->proc_net_stat);
 417         remove_proc_entry("rt_cache", net->proc_net);
 418 #ifdef CONFIG_IP_ROUTE_CLASSID
 419         remove_proc_entry("rt_acct", net->proc_net);
 420 #endif
 421 }
 422
 423 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 424         .init = ip_rt_do_proc_init,
 425         .exit = ip_rt_do_proc_exit,
 426 };
 427
 428 static int __init ip_rt_proc_init(void)
 429 {
 430         return register_pernet_subsys(&ip_rt_proc_ops);
 431 }
 432
 433 #else
 434 static inline int ip_rt_proc_init(void)
 435 {
 436         return 0;
 437 }
 438 #endif /* CONFIG_PROC_FS */
 439
 440 static inline bool rt_is_expired(const struct rtable *rth)
 441 {
 442         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 443 }
 444
 445 void rt_cache_flush(struct net *net)
 446 {
 447         rt_genid_bump_ipv4(net);
 448 }
 449
 450 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 451                                            struct sk_buff *skb,
 452                                            const void *daddr)
 453 {
 454         struct net_device *dev = dst->dev;
 455         const __be32 *pkey = daddr;
 456         const struct rtable *rt;
 457         struct neighbour *n;
 458
 459         rt = (const struct rtable *) dst;
 460         if (rt->rt_gateway)
 461                 pkey = (const __be32 *) &rt->rt_gateway;
 462         else if (skb)
 463                 pkey = &ip_hdr(skb)->daddr;
 464
 465         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 466         if (n)
 467                 return n;
 468         return neigh_create(&arp_tbl, pkey, dev);
 469 }
 470
 471 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 472 {
 473         struct net_device *dev = dst->dev;
 474         const __be32 *pkey = daddr;
 475         const struct rtable *rt;
 476
 477         rt = (const struct rtable *)dst;
 478         if (rt->rt_gateway)
 479                 pkey = (const __be32 *)&rt->rt_gateway;
 480         else if (!daddr ||
 481                  (rt->rt_flags &
 482                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
 483                 return;
 484
 485         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 486 }
 487
 488 #define IP_IDENTS_SZ 2048u
 489
 490 static atomic_t *ip_idents __read_mostly;
 491 static u32 *ip_tstamps __read_mostly;
 492
 493 /* In order to protect privacy, we add a perturbation to identifiers
 494  * if one generator is seldom used. This makes hard for an attacker
 495  * to infer how many packets were sent between two points in time.
 496  */
 497 u32 ip_idents_reserve(u32 hash, int segs)
 498 {
 499         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 500         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 501         u32 old = READ_ONCE(*p_tstamp);
 502         u32 now = (u32)jiffies;
 503         u32 new, delta = 0;
 504
 505         if (old != now && cmpxchg(p_tstamp, old, now) == old)
 506                 delta = prandom_u32_max(now - old);
 507
 508         /* Do not use atomic_add_return() as it makes UBSAN unhappy */
 509         do {
 510                 old = (u32)atomic_read(p_id);
 511                 new = old + delta + segs;
 512         } while (atomic_cmpxchg(p_id, old, new) != old);
 513
 514         return new - segs;
 515 }
 516 EXPORT_SYMBOL(ip_idents_reserve);
 517
 518 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 519 {
 520         static u32 ip_idents_hashrnd __read_mostly;
 521         u32 hash, id;
 522
 523         net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
 524
 525         hash = jhash_3words((__force u32)iph->daddr,
 526                             (__force u32)iph->saddr,
 527                             iph->protocol ^ net_hash_mix(net),
 528                             ip_idents_hashrnd);
 529         id = ip_idents_reserve(hash, segs);
 530         iph->id = htons(id);
 531 }
 532 EXPORT_SYMBOL(__ip_select_ident);
 533
 534 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 535                              const struct sock *sk,
 536                              const struct iphdr *iph,
 537                              int oif, u8 tos,
 538                              u8 prot, u32 mark, int flow_flags)
 539 {
 540         if (sk) {
 541                 const struct inet_sock *inet = inet_sk(sk);
 542
 543                 oif = sk->sk_bound_dev_if;
 544                 mark = sk->sk_mark;
 545                 tos = RT_CONN_FLAGS(sk);
 546                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 547         }
 548         flowi4_init_output(fl4, oif, mark, tos,
 549                            RT_SCOPE_UNIVERSE, prot,
 550                            flow_flags,
 551                            iph->daddr, iph->saddr, 0, 0,
 552                            sock_net_uid(net, sk));
 553 }
 554
 555 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 556                                const struct sock *sk)
 557 {
 558         const struct net *net = dev_net(skb->dev);
 559         const struct iphdr *iph = ip_hdr(skb);
 560         int oif = skb->dev->ifindex;
 561         u8 tos = RT_TOS(iph->tos);
 562         u8 prot = iph->protocol;
 563         u32 mark = skb->mark;
 564
 565         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 566 }
 567
 568 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 569 {
 570         const struct inet_sock *inet = inet_sk(sk);
 571         const struct ip_options_rcu *inet_opt;
 572         __be32 daddr = inet->inet_daddr;
 573
 574         rcu_read_lock();
 575         inet_opt = rcu_dereference(inet->inet_opt);
 576         if (inet_opt && inet_opt->opt.srr)
 577                 daddr = inet_opt->opt.faddr;
 578         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 579                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 580                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 581                            inet_sk_flowi_flags(sk),
 582                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 583         rcu_read_unlock();
 584 }
 585
 586 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 587                                  const struct sk_buff *skb)
 588 {
 589         if (skb)
 590                 build_skb_flow_key(fl4, skb, sk);
 591         else
 592                 build_sk_flow_key(fl4, sk);
 593 }
 594
 595 static DEFINE_SPINLOCK(fnhe_lock);
 596
 597 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 598 {
 599         struct rtable *rt;
 600
 601         rt = rcu_dereference(fnhe->fnhe_rth_input);
 602         if (rt) {
 603                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 604                 dst_dev_put(&rt->dst);
 605                 dst_release(&rt->dst);
 606         }
 607         rt = rcu_dereference(fnhe->fnhe_rth_output);
 608         if (rt) {
 609                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 610                 dst_dev_put(&rt->dst);
 611                 dst_release(&rt->dst);
 612         }
 613 }
 614
 615 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 616 {
 617         struct fib_nh_exception *fnhe, *oldest;
 618
 619         oldest = rcu_dereference(hash->chain);
 620         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 621              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 622                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 623                         oldest = fnhe;
 624         }
 625         fnhe_flush_routes(oldest);
 626         return oldest;
 627 }
 628
 629 static inline u32 fnhe_hashfun(__be32 daddr)
 630 {
 631         static u32 fnhe_hashrnd __read_mostly;
 632         u32 hval;
 633
 634         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 635         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 636         return hash_32(hval, FNHE_HASH_SHIFT);
 637 }
 638
 639 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 640 {
 641         rt->rt_pmtu = fnhe->fnhe_pmtu;
 642         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 643         rt->dst.expires = fnhe->fnhe_expires;
 644
 645         if (fnhe->fnhe_gw) {
 646                 rt->rt_flags |= RTCF_REDIRECTED;
 647                 rt->rt_gateway = fnhe->fnhe_gw;
 648                 rt->rt_uses_gateway = 1;
 649         }
 650 }
 651
 652 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 653                                   u32 pmtu, bool lock, unsigned long expires)
 654 {
 655         struct fnhe_hash_bucket *hash;
 656         struct fib_nh_exception *fnhe;
 657         struct rtable *rt;
 658         u32 genid, hval;
 659         unsigned int i;
 660         int depth;
 661
 662         genid = fnhe_genid(dev_net(nh->nh_dev));
 663         hval = fnhe_hashfun(daddr);
 664
 665         spin_lock_bh(&fnhe_lock);
 666
 667         hash = rcu_dereference(nh->nh_exceptions);
 668         if (!hash) {
 669                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 670                 if (!hash)
 671                         goto out_unlock;
 672                 rcu_assign_pointer(nh->nh_exceptions, hash);
 673         }
 674
 675         hash += hval;
 676
 677         depth = 0;
 678         for (fnhe = rcu_dereference(hash->chain); fnhe;
 679              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 680                 if (fnhe->fnhe_daddr == daddr)
 681                         break;
 682                 depth++;
 683         }
 684
 685         if (fnhe) {
 686                 if (fnhe->fnhe_genid != genid)
 687                         fnhe->fnhe_genid = genid;
 688                 if (gw)
 689                         fnhe->fnhe_gw = gw;
 690                 if (pmtu) {
 691                         fnhe->fnhe_pmtu = pmtu;
 692                         fnhe->fnhe_mtu_locked = lock;
 693                 }
 694                 fnhe->fnhe_expires = max(1UL, expires);
 695                 /* Update all cached dsts too */
 696                 rt = rcu_dereference(fnhe->fnhe_rth_input);
 697                 if (rt)
 698                         fill_route_from_fnhe(rt, fnhe);
 699                 rt = rcu_dereference(fnhe->fnhe_rth_output);
 700                 if (rt)
 701                         fill_route_from_fnhe(rt, fnhe);
 702         } else {
 703                 if (depth > FNHE_RECLAIM_DEPTH)
 704                         fnhe = fnhe_oldest(hash);
 705                 else {
 706                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 707                         if (!fnhe)
 708                                 goto out_unlock;
 709
 710                         fnhe->fnhe_next = hash->chain;
 711                         rcu_assign_pointer(hash->chain, fnhe);
 712                 }
 713                 fnhe->fnhe_genid = genid;
 714                 fnhe->fnhe_daddr = daddr;
 715                 fnhe->fnhe_gw = gw;
 716                 fnhe->fnhe_pmtu = pmtu;
 717                 fnhe->fnhe_mtu_locked = lock;
 718                 fnhe->fnhe_expires = max(1UL, expires);
 719
 720                 /* Exception created; mark the cached routes for the nexthop
 721                  * stale, so anyone caching it rechecks if this exception
 722                  * applies to them.
 723                  */
 724                 rt = rcu_dereference(nh->nh_rth_input);
 725                 if (rt)
 726                         rt->dst.obsolete = DST_OBSOLETE_KILL;
 727
 728                 for_each_possible_cpu(i) {
 729                         struct rtable __rcu **prt;
 730                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 731                         rt = rcu_dereference(*prt);
 732                         if (rt)
 733                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 734                 }
 735         }
 736
 737         fnhe->fnhe_stamp = jiffies;
 738
 739 out_unlock:
 740         spin_unlock_bh(&fnhe_lock);
 741 }
 742
 743 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 744                              bool kill_route)
 745 {
 746         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 747         __be32 old_gw = ip_hdr(skb)->saddr;
 748         struct net_device *dev = skb->dev;
 749         struct in_device *in_dev;
 750         struct fib_result res;
 751         struct neighbour *n;
 752         struct net *net;
 753
 754         switch (icmp_hdr(skb)->code & 7) {
 755         case ICMP_REDIR_NET:
 756         case ICMP_REDIR_NETTOS:
 757         case ICMP_REDIR_HOST:
 758         case ICMP_REDIR_HOSTTOS:
 759                 break;
 760
 761         default:
 762                 return;
 763         }
 764
 765         if (rt->rt_gateway != old_gw)
 766                 return;
 767
 768         in_dev = __in_dev_get_rcu(dev);
 769         if (!in_dev)
 770                 return;
 771
 772         net = dev_net(dev);
 773         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 774             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 775             ipv4_is_zeronet(new_gw))
 776                 goto reject_redirect;
 777
 778         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 779                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 780                         goto reject_redirect;
 781                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 782                         goto reject_redirect;
 783         } else {
 784                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 785                         goto reject_redirect;
 786         }
 787
 788         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 789         if (!n)
 790                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 791         if (!IS_ERR(n)) {
 792                 if (!(n->nud_state & NUD_VALID)) {
 793                         neigh_event_send(n, NULL);
 794                 } else {
 795                         if (fib_lookup(net, fl4, &res, 0) == 0) {
 796                                 struct fib_nh *nh = &FIB_RES_NH(res);
 797
 798                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 799                                                 0, false,
 800                                                 jiffies + ip_rt_gc_timeout);
 801                         }
 802                         if (kill_route)
 803                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 804                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 805                 }
 806                 neigh_release(n);
 807         }
 808         return;
 809
 810 reject_redirect:
 811 #ifdef CONFIG_IP_ROUTE_VERBOSE
 812         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 813                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 814                 __be32 daddr = iph->daddr;
 815                 __be32 saddr = iph->saddr;
 816
 817                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 818                                      "  Advised path = %pI4 -> %pI4\n",
 819                                      &old_gw, dev->name, &new_gw,
 820                                      &saddr, &daddr);
 821         }
 822 #endif
 823         ;
 824 }
 825
 826 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 827 {
 828         struct rtable *rt;
 829         struct flowi4 fl4;
 830         const struct iphdr *iph = (const struct iphdr *) skb->data;
 831         struct net *net = dev_net(skb->dev);
 832         int oif = skb->dev->ifindex;
 833         u8 tos = RT_TOS(iph->tos);
 834         u8 prot = iph->protocol;
 835         u32 mark = skb->mark;
 836
 837         rt = (struct rtable *) dst;
 838
 839         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 840         __ip_do_redirect(rt, skb, &fl4, true);
 841 }
 842
 843 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 844 {
 845         struct rtable *rt = (struct rtable *)dst;
 846         struct dst_entry *ret = dst;
 847
 848         if (rt) {
 849                 if (dst->obsolete > 0) {
 850                         ip_rt_put(rt);
 851                         ret = NULL;
 852                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 853                            rt->dst.expires) {
 854                         ip_rt_put(rt);
 855                         ret = NULL;
 856                 }
 857         }
 858         return ret;
 859 }
 860
 861 /*
 862  * Algorithm:
 863  *      1. The first ip_rt_redirect_number redirects are sent
 864  *         with exponential backoff, then we stop sending them at all,
 865  *         assuming that the host ignores our redirects.
 866  *      2. If we did not see packets requiring redirects
 867  *         during ip_rt_redirect_silence, we assume that the host
 868  *         forgot redirected route and start to send redirects again.
 869  *
 870  * This algorithm is much cheaper and more intelligent than dumb load limiting
 871  * in icmp.c.
 872  *
 873  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 874  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 875  */
 876
 877 void ip_rt_send_redirect(struct sk_buff *skb)
 878 {
 879         struct rtable *rt = skb_rtable(skb);
 880         struct in_device *in_dev;
 881         struct inet_peer *peer;
 882         struct net *net;
 883         int log_martians;
 884         int vif;
 885
 886         rcu_read_lock();
 887         in_dev = __in_dev_get_rcu(rt->dst.dev);
 888         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 889                 rcu_read_unlock();
 890                 return;
 891         }
 892         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 893         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 894         rcu_read_unlock();
 895
 896         net = dev_net(rt->dst.dev);
 897         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 898         if (!peer) {
 899                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 900                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 901                 return;
 902         }
 903
 904         /* No redirected packets during ip_rt_redirect_silence;
 905          * reset the algorithm.
 906          */
 907         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 908                 peer->rate_tokens = 0;
 909
 910         /* Too many ignored redirects; do not send anything
 911          * set dst.rate_last to the last seen redirected packet.
 912          */
 913         if (peer->rate_tokens >= ip_rt_redirect_number) {
 914                 peer->rate_last = jiffies;
 915                 goto out_put_peer;
 916         }
 917
 918         /* Check for load limit; set rate_last to the latest sent
 919          * redirect.
 920          */
 921         if (peer->rate_tokens == 0 ||
 922             time_after(jiffies,
 923                        (peer->rate_last +
 924                         (ip_rt_redirect_load << peer->rate_tokens)))) {
 925                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 926
 927                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 928                 peer->rate_last = jiffies;
 929                 ++peer->rate_tokens;
 930 #ifdef CONFIG_IP_ROUTE_VERBOSE
 931                 if (log_martians &&
 932                     peer->rate_tokens == ip_rt_redirect_number)
 933                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 934                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 935                                              &ip_hdr(skb)->daddr, &gw);
 936 #endif
 937         }
 938 out_put_peer:
 939         inet_putpeer(peer);
 940 }
 941
 942 static int ip_error(struct sk_buff *skb)
 943 {
 944         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 945         struct rtable *rt = skb_rtable(skb);
 946         struct inet_peer *peer;
 947         unsigned long now;
 948         struct net *net;
 949         bool send;
 950         int code;
 951
 952         /* IP on this device is disabled. */
 953         if (!in_dev)
 954                 goto out;
 955
 956         net = dev_net(rt->dst.dev);
 957         if (!IN_DEV_FORWARD(in_dev)) {
 958                 switch (rt->dst.error) {
 959                 case EHOSTUNREACH:
 960                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 961                         break;
 962
 963                 case ENETUNREACH:
 964                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 965                         break;
 966                 }
 967                 goto out;
 968         }
 969
 970         switch (rt->dst.error) {
 971         case EINVAL:
 972         default:
 973                 goto out;
 974         case EHOSTUNREACH:
 975                 code = ICMP_HOST_UNREACH;
 976                 break;
 977         case ENETUNREACH:
 978                 code = ICMP_NET_UNREACH;
 979                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 980                 break;
 981         case EACCES:
 982                 code = ICMP_PKT_FILTERED;
 983                 break;
 984         }
 985
 986         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 987                                l3mdev_master_ifindex(skb->dev), 1);
 988
 989         send = true;
 990         if (peer) {
 991                 now = jiffies;
 992                 peer->rate_tokens += now - peer->rate_last;
 993                 if (peer->rate_tokens > ip_rt_error_burst)
 994                         peer->rate_tokens = ip_rt_error_burst;
 995                 peer->rate_last = now;
 996                 if (peer->rate_tokens >= ip_rt_error_cost)
 997                         peer->rate_tokens -= ip_rt_error_cost;
 998                 else
 999                         send = false;
1000                 inet_putpeer(peer);
1001         }
1002         if (send)
1003                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1004
1005 out:    kfree_skb(skb);
1006         return 0;
1007 }
1008
1009 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1010 {
1011         struct dst_entry *dst = &rt->dst;
1012         struct fib_result res;
1013         bool lock = false;
1014
1015         if (ip_mtu_locked(dst))
1016                 return;
1017
1018         if (ipv4_mtu(dst) < mtu)
1019                 return;
1020
1021         if (mtu < ip_rt_min_pmtu) {
1022                 lock = true;
1023                 mtu = ip_rt_min_pmtu;
1024         }
1025
1026         if (rt->rt_pmtu == mtu &&
1027             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1028                 return;
1029
1030         rcu_read_lock();
1031         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1032                 struct fib_nh *nh = &FIB_RES_NH(res);
1033
1034                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1035                                       jiffies + ip_rt_mtu_expires);
1036         }
1037         rcu_read_unlock();
1038 }
1039
1040 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1041                               struct sk_buff *skb, u32 mtu)
1042 {
1043         struct rtable *rt = (struct rtable *) dst;
1044         struct flowi4 fl4;
1045
1046         ip_rt_build_flow_key(&fl4, sk, skb);
1047         __ip_rt_update_pmtu(rt, &fl4, mtu);
1048 }
1049
1050 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1051                       int oif, u32 mark, u8 protocol, int flow_flags)
1052 {
1053         const struct iphdr *iph = (const struct iphdr *) skb->data;
1054         struct flowi4 fl4;
1055         struct rtable *rt;
1056
1057         if (!mark)
1058                 mark = IP4_REPLY_MARK(net, skb->mark);
1059
1060         __build_flow_key(net, &fl4, NULL, iph, oif,
1061                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1062         rt = __ip_route_output_key(net, &fl4);
1063         if (!IS_ERR(rt)) {
1064                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1065                 ip_rt_put(rt);
1066         }
1067 }
1068 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1069
1070 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1071 {
1072         const struct iphdr *iph = (const struct iphdr *) skb->data;
1073         struct flowi4 fl4;
1074         struct rtable *rt;
1075
1076         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1077
1078         if (!fl4.flowi4_mark)
1079                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1080
1081         rt = __ip_route_output_key(sock_net(sk), &fl4);
1082         if (!IS_ERR(rt)) {
1083                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1084                 ip_rt_put(rt);
1085         }
1086 }
1087
1088 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1089 {
1090         const struct iphdr *iph = (const struct iphdr *) skb->data;
1091         struct flowi4 fl4;
1092         struct rtable *rt;
1093         struct dst_entry *odst = NULL;
1094         bool new = false;
1095         struct net *net = sock_net(sk);
1096
1097         bh_lock_sock(sk);
1098
1099         if (!ip_sk_accept_pmtu(sk))
1100                 goto out;
1101
1102         odst = sk_dst_get(sk);
1103
1104         if (sock_owned_by_user(sk) || !odst) {
1105                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1106                 goto out;
1107         }
1108
1109         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1110
1111         rt = (struct rtable *)odst;
1112         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1113                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1114                 if (IS_ERR(rt))
1115                         goto out;
1116
1117                 new = true;
1118         }
1119
1120         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1121
1122         if (!dst_check(&rt->dst, 0)) {
1123                 if (new)
1124                         dst_release(&rt->dst);
1125
1126                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1127                 if (IS_ERR(rt))
1128                         goto out;
1129
1130                 new = true;
1131         }
1132
1133         if (new)
1134                 sk_dst_set(sk, &rt->dst);
1135
1136 out:
1137         bh_unlock_sock(sk);
1138         dst_release(odst);
1139 }
1140 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1141
1142 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1143                    int oif, u32 mark, u8 protocol, int flow_flags)
1144 {
1145         const struct iphdr *iph = (const struct iphdr *) skb->data;
1146         struct flowi4 fl4;
1147         struct rtable *rt;
1148
1149         __build_flow_key(net, &fl4, NULL, iph, oif,
1150                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1151         rt = __ip_route_output_key(net, &fl4);
1152         if (!IS_ERR(rt)) {
1153                 __ip_do_redirect(rt, skb, &fl4, false);
1154                 ip_rt_put(rt);
1155         }
1156 }
1157 EXPORT_SYMBOL_GPL(ipv4_redirect);
1158
1159 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1160 {
1161         const struct iphdr *iph = (const struct iphdr *) skb->data;
1162         struct flowi4 fl4;
1163         struct rtable *rt;
1164         struct net *net = sock_net(sk);
1165
1166         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1167         rt = __ip_route_output_key(net, &fl4);
1168         if (!IS_ERR(rt)) {
1169                 __ip_do_redirect(rt, skb, &fl4, false);
1170                 ip_rt_put(rt);
1171         }
1172 }
1173 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1174
1175 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1176 {
1177         struct rtable *rt = (struct rtable *) dst;
1178
1179         /* All IPV4 dsts are created with ->obsolete set to the value
1180          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1181          * into this function always.
1182          *
1183          * When a PMTU/redirect information update invalidates a route,
1184          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1185          * DST_OBSOLETE_DEAD by dst_free().
1186          */
1187         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1188                 return NULL;
1189         return dst;
1190 }
1191
1192 static void ipv4_link_failure(struct sk_buff *skb)
1193 {
1194         struct rtable *rt;
1195
1196         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1197
1198         rt = skb_rtable(skb);
1199         if (rt)
1200                 dst_set_expires(&rt->dst, 0);
1201 }
1202
1203 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1204 {
1205         pr_debug("%s: %pI4 -> %pI4, %s\n",
1206                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1207                  skb->dev ? skb->dev->name : "?");
1208         kfree_skb(skb);
1209         WARN_ON(1);
1210         return 0;
1211 }
1212
1213 /*
1214    We do not cache source address of outgoing interface,
1215    because it is used only by IP RR, TS and SRR options,
1216    so that it out of fast path.
1217
1218    BTW remember: "addr" is allowed to be not aligned
1219    in IP options!
1220  */
1221
1222 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1223 {
1224         __be32 src;
1225
1226         if (rt_is_output_route(rt))
1227                 src = ip_hdr(skb)->saddr;
1228         else {
1229                 struct fib_result res;
1230                 struct flowi4 fl4;
1231                 struct iphdr *iph;
1232
1233                 iph = ip_hdr(skb);
1234
1235                 memset(&fl4, 0, sizeof(fl4));
1236                 fl4.daddr = iph->daddr;
1237                 fl4.saddr = iph->saddr;
1238                 fl4.flowi4_tos = RT_TOS(iph->tos);
1239                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1240                 fl4.flowi4_iif = skb->dev->ifindex;
1241                 fl4.flowi4_mark = skb->mark;
1242
1243                 rcu_read_lock();
1244                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1245                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1246                 else
1247                         src = inet_select_addr(rt->dst.dev,
1248                                                rt_nexthop(rt, iph->daddr),
1249                                                RT_SCOPE_UNIVERSE);
1250                 rcu_read_unlock();
1251         }
1252         memcpy(addr, &src, 4);
1253 }
1254
1255 #ifdef CONFIG_IP_ROUTE_CLASSID
1256 static void set_class_tag(struct rtable *rt, u32 tag)
1257 {
1258         if (!(rt->dst.tclassid & 0xFFFF))
1259                 rt->dst.tclassid |= tag & 0xFFFF;
1260         if (!(rt->dst.tclassid & 0xFFFF0000))
1261                 rt->dst.tclassid |= tag & 0xFFFF0000;
1262 }
1263 #endif
1264
1265 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1266 {
1267         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1268         unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1269                                     ip_rt_min_advmss);
1270
1271         return min(advmss, IPV4_MAX_PMTU - header_size);
1272 }
1273
1274 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1275 {
1276         const struct rtable *rt = (const struct rtable *) dst;
1277         unsigned int mtu = rt->rt_pmtu;
1278
1279         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1280                 mtu = dst_metric_raw(dst, RTAX_MTU);
1281
1282         if (mtu)
1283                 return mtu;
1284
1285         mtu = READ_ONCE(dst->dev->mtu);
1286
1287         if (unlikely(ip_mtu_locked(dst))) {
1288                 if (rt->rt_uses_gateway && mtu > 576)
1289                         mtu = 576;
1290         }
1291
1292         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1293
1294         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1295 }
1296
1297 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1298 {
1299         struct fnhe_hash_bucket *hash;
1300         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1301         u32 hval = fnhe_hashfun(daddr);
1302
1303         spin_lock_bh(&fnhe_lock);
1304
1305         hash = rcu_dereference_protected(nh->nh_exceptions,
1306                                          lockdep_is_held(&fnhe_lock));
1307         hash += hval;
1308
1309         fnhe_p = &hash->chain;
1310         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1311         while (fnhe) {
1312                 if (fnhe->fnhe_daddr == daddr) {
1313                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1314                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1315                         fnhe_flush_routes(fnhe);
1316                         kfree_rcu(fnhe, rcu);
1317                         break;
1318                 }
1319                 fnhe_p = &fnhe->fnhe_next;
1320                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1321                                                  lockdep_is_held(&fnhe_lock));
1322         }
1323
1324         spin_unlock_bh(&fnhe_lock);
1325 }
1326
1327 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1328 {
1329         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1330         struct fib_nh_exception *fnhe;
1331         u32 hval;
1332
1333         if (!hash)
1334                 return NULL;
1335
1336         hval = fnhe_hashfun(daddr);
1337
1338         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1339              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1340                 if (fnhe->fnhe_daddr == daddr) {
1341                         if (fnhe->fnhe_expires &&
1342                             time_after(jiffies, fnhe->fnhe_expires)) {
1343                                 ip_del_fnhe(nh, daddr);
1344                                 break;
1345                         }
1346                         return fnhe;
1347                 }
1348         }
1349         return NULL;
1350 }
1351
1352 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1353                               __be32 daddr, const bool do_cache)
1354 {
1355         bool ret = false;
1356
1357         spin_lock_bh(&fnhe_lock);
1358
1359         if (daddr == fnhe->fnhe_daddr) {
1360                 struct rtable __rcu **porig;
1361                 struct rtable *orig;
1362                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1363
1364                 if (rt_is_input_route(rt))
1365                         porig = &fnhe->fnhe_rth_input;
1366                 else
1367                         porig = &fnhe->fnhe_rth_output;
1368                 orig = rcu_dereference(*porig);
1369
1370                 if (fnhe->fnhe_genid != genid) {
1371                         fnhe->fnhe_genid = genid;
1372                         fnhe->fnhe_gw = 0;
1373                         fnhe->fnhe_pmtu = 0;
1374                         fnhe->fnhe_expires = 0;
1375                         fnhe_flush_routes(fnhe);
1376                         orig = NULL;
1377                 }
1378                 fill_route_from_fnhe(rt, fnhe);
1379                 if (!rt->rt_gateway)
1380                         rt->rt_gateway = daddr;
1381
1382                 if (do_cache) {
1383                         dst_hold(&rt->dst);
1384                         rcu_assign_pointer(*porig, rt);
1385                         if (orig) {
1386                                 dst_dev_put(&orig->dst);
1387                                 dst_release(&orig->dst);
1388                         }
1389                         ret = true;
1390                 }
1391
1392                 fnhe->fnhe_stamp = jiffies;
1393         }
1394         spin_unlock_bh(&fnhe_lock);
1395
1396         return ret;
1397 }
1398
1399 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1400 {
1401         struct rtable *orig, *prev, **p;
1402         bool ret = true;
1403
1404         if (rt_is_input_route(rt)) {
1405                 p = (struct rtable **)&nh->nh_rth_input;
1406         } else {
1407                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1408         }
1409         orig = *p;
1410
1411         /* hold dst before doing cmpxchg() to avoid race condition
1412          * on this dst
1413          */
1414         dst_hold(&rt->dst);
1415         prev = cmpxchg(p, orig, rt);
1416         if (prev == orig) {
1417                 if (orig) {
1418                         dst_dev_put(&orig->dst);
1419                         dst_release(&orig->dst);
1420                 }
1421         } else {
1422                 dst_release(&rt->dst);
1423                 ret = false;
1424         }
1425
1426         return ret;
1427 }
1428
1429 struct uncached_list {
1430         spinlock_t              lock;
1431         struct list_head        head;
1432 };
1433
1434 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1435
1436 void rt_add_uncached_list(struct rtable *rt)
1437 {
1438         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1439
1440         rt->rt_uncached_list = ul;
1441
1442         spin_lock_bh(&ul->lock);
1443         list_add_tail(&rt->rt_uncached, &ul->head);
1444         spin_unlock_bh(&ul->lock);
1445 }
1446
1447 void rt_del_uncached_list(struct rtable *rt)
1448 {
1449         if (!list_empty(&rt->rt_uncached)) {
1450                 struct uncached_list *ul = rt->rt_uncached_list;
1451
1452                 spin_lock_bh(&ul->lock);
1453                 list_del(&rt->rt_uncached);
1454                 spin_unlock_bh(&ul->lock);
1455         }
1456 }
1457
1458 static void ipv4_dst_destroy(struct dst_entry *dst)
1459 {
1460         struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1461         struct rtable *rt = (struct rtable *)dst;
1462
1463         if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1464                 kfree(p);
1465
1466         rt_del_uncached_list(rt);
1467 }
1468
1469 void rt_flush_dev(struct net_device *dev)
1470 {
1471         struct net *net = dev_net(dev);
1472         struct rtable *rt;
1473         int cpu;
1474
1475         for_each_possible_cpu(cpu) {
1476                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1477
1478                 spin_lock_bh(&ul->lock);
1479                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1480                         if (rt->dst.dev != dev)
1481                                 continue;
1482                         rt->dst.dev = net->loopback_dev;
1483                         dev_hold(rt->dst.dev);
1484                         dev_put(dev);
1485                 }
1486                 spin_unlock_bh(&ul->lock);
1487         }
1488 }
1489
1490 static bool rt_cache_valid(const struct rtable *rt)
1491 {
1492         return  rt &&
1493                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1494                 !rt_is_expired(rt);
1495 }
1496
1497 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1498                            const struct fib_result *res,
1499                            struct fib_nh_exception *fnhe,
1500                            struct fib_info *fi, u16 type, u32 itag,
1501                            const bool do_cache)
1502 {
1503         bool cached = false;
1504
1505         if (fi) {
1506                 struct fib_nh *nh = &FIB_RES_NH(*res);
1507
1508                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1509                         rt->rt_gateway = nh->nh_gw;
1510                         rt->rt_uses_gateway = 1;
1511                 }
1512                 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1513                 if (fi->fib_metrics != &dst_default_metrics) {
1514                         rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1515                         refcount_inc(&fi->fib_metrics->refcnt);
1516                 }
1517 #ifdef CONFIG_IP_ROUTE_CLASSID
1518                 rt->dst.tclassid = nh->nh_tclassid;
1519 #endif
1520                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1521                 if (unlikely(fnhe))
1522                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1523                 else if (do_cache)
1524                         cached = rt_cache_route(nh, rt);
1525                 if (unlikely(!cached)) {
1526                         /* Routes we intend to cache in nexthop exception or
1527                          * FIB nexthop have the DST_NOCACHE bit clear.
1528                          * However, if we are unsuccessful at storing this
1529                          * route into the cache we really need to set it.
1530                          */
1531                         if (!rt->rt_gateway)
1532                                 rt->rt_gateway = daddr;
1533                         rt_add_uncached_list(rt);
1534                 }
1535         } else
1536                 rt_add_uncached_list(rt);
1537
1538 #ifdef CONFIG_IP_ROUTE_CLASSID
1539 #ifdef CONFIG_IP_MULTIPLE_TABLES
1540         set_class_tag(rt, res->tclassid);
1541 #endif
1542         set_class_tag(rt, itag);
1543 #endif
1544 }
1545
1546 struct rtable *rt_dst_alloc(struct net_device *dev,
1547                             unsigned int flags, u16 type,
1548                             bool nopolicy, bool noxfrm, bool will_cache)
1549 {
1550         struct rtable *rt;
1551
1552         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1553                        (will_cache ? 0 : DST_HOST) |
1554                        (nopolicy ? DST_NOPOLICY : 0) |
1555                        (noxfrm ? DST_NOXFRM : 0));
1556
1557         if (rt) {
1558                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1559                 rt->rt_flags = flags;
1560                 rt->rt_type = type;
1561                 rt->rt_is_input = 0;
1562                 rt->rt_iif = 0;
1563                 rt->rt_pmtu = 0;
1564                 rt->rt_mtu_locked = 0;
1565                 rt->rt_gateway = 0;
1566                 rt->rt_uses_gateway = 0;
1567                 rt->rt_table_id = 0;
1568                 INIT_LIST_HEAD(&rt->rt_uncached);
1569
1570                 rt->dst.output = ip_output;
1571                 if (flags & RTCF_LOCAL)
1572                         rt->dst.input = ip_local_deliver;
1573         }
1574
1575         return rt;
1576 }
1577 EXPORT_SYMBOL(rt_dst_alloc);
1578
1579 /* called in rcu_read_lock() section */
1580 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1581                           u8 tos, struct net_device *dev,
1582                           struct in_device *in_dev, u32 *itag)
1583 {
1584         int err;
1585
1586         /* Primary sanity checks. */
1587         if (!in_dev)
1588                 return -EINVAL;
1589
1590         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1591             skb->protocol != htons(ETH_P_IP))
1592                 return -EINVAL;
1593
1594         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1595                 return -EINVAL;
1596
1597         if (ipv4_is_zeronet(saddr)) {
1598                 if (!ipv4_is_local_multicast(daddr))
1599                         return -EINVAL;
1600         } else {
1601                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1602                                           in_dev, itag);
1603                 if (err < 0)
1604                         return err;
1605         }
1606         return 0;
1607 }
1608
1609 /* called in rcu_read_lock() section */
1610 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1611                              u8 tos, struct net_device *dev, int our)
1612 {
1613         struct in_device *in_dev = __in_dev_get_rcu(dev);
1614         unsigned int flags = RTCF_MULTICAST;
1615         struct rtable *rth;
1616         u32 itag = 0;
1617         int err;
1618
1619         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1620         if (err)
1621                 return err;
1622
1623         if (our)
1624                 flags |= RTCF_LOCAL;
1625
1626         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1627                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1628         if (!rth)
1629                 return -ENOBUFS;
1630
1631 #ifdef CONFIG_IP_ROUTE_CLASSID
1632         rth->dst.tclassid = itag;
1633 #endif
1634         rth->dst.output = ip_rt_bug;
1635         rth->rt_is_input= 1;
1636
1637 #ifdef CONFIG_IP_MROUTE
1638         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1639                 rth->dst.input = ip_mr_input;
1640 #endif
1641         RT_CACHE_STAT_INC(in_slow_mc);
1642
1643         skb_dst_set(skb, &rth->dst);
1644         return 0;
1645 }
1646
1647
1648 static void ip_handle_martian_source(struct net_device *dev,
1649                                      struct in_device *in_dev,
1650                                      struct sk_buff *skb,
1651                                      __be32 daddr,
1652                                      __be32 saddr)
1653 {
1654         RT_CACHE_STAT_INC(in_martian_src);
1655 #ifdef CONFIG_IP_ROUTE_VERBOSE
1656         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1657                 /*
1658                  *      RFC1812 recommendation, if source is martian,
1659                  *      the only hint is MAC header.
1660                  */
1661                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1662                         &daddr, &saddr, dev->name);
1663                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1664                         print_hex_dump(KERN_WARNING, "ll header: ",
1665                                        DUMP_PREFIX_OFFSET, 16, 1,
1666                                        skb_mac_header(skb),
1667                                        dev->hard_header_len, true);
1668                 }
1669         }
1670 #endif
1671 }
1672
1673 static void set_lwt_redirect(struct rtable *rth)
1674 {
1675         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1676                 rth->dst.lwtstate->orig_output = rth->dst.output;
1677                 rth->dst.output = lwtunnel_output;
1678         }
1679
1680         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1681                 rth->dst.lwtstate->orig_input = rth->dst.input;
1682                 rth->dst.input = lwtunnel_input;
1683         }
1684 }
1685
1686 /* called in rcu_read_lock() section */
1687 static int __mkroute_input(struct sk_buff *skb,
1688                            const struct fib_result *res,
1689                            struct in_device *in_dev,
1690                            __be32 daddr, __be32 saddr, u32 tos)
1691 {
1692         struct fib_nh_exception *fnhe;
1693         struct rtable *rth;
1694         int err;
1695         struct in_device *out_dev;
1696         bool do_cache;
1697         u32 itag = 0;
1698
1699         /* get a working reference to the output device */
1700         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1701         if (!out_dev) {
1702                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1703                 return -EINVAL;
1704         }
1705
1706         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1707                                   in_dev->dev, in_dev, &itag);
1708         if (err < 0) {
1709                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1710                                          saddr);
1711
1712                 goto cleanup;
1713         }
1714
1715         do_cache = res->fi && !itag;
1716         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1717             skb->protocol == htons(ETH_P_IP) &&
1718             (IN_DEV_SHARED_MEDIA(out_dev) ||
1719              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1720                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1721
1722         if (skb->protocol != htons(ETH_P_IP)) {
1723                 /* Not IP (i.e. ARP). Do not create route, if it is
1724                  * invalid for proxy arp. DNAT routes are always valid.
1725                  *
1726                  * Proxy arp feature have been extended to allow, ARP
1727                  * replies back to the same interface, to support
1728                  * Private VLAN switch technologies. See arp.c.
1729                  */
1730                 if (out_dev == in_dev &&
1731                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1732                         err = -EINVAL;
1733                         goto cleanup;
1734                 }
1735         }
1736
1737         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1738         if (do_cache) {
1739                 if (fnhe)
1740                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1741                 else
1742                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1743                 if (rt_cache_valid(rth)) {
1744                         skb_dst_set_noref(skb, &rth->dst);
1745                         goto out;
1746                 }
1747         }
1748
1749         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1750                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1751                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1752         if (!rth) {
1753                 err = -ENOBUFS;
1754                 goto cleanup;
1755         }
1756
1757         rth->rt_is_input = 1;
1758         if (res->table)
1759                 rth->rt_table_id = res->table->tb_id;
1760         RT_CACHE_STAT_INC(in_slow_tot);
1761
1762         rth->dst.input = ip_forward;
1763
1764         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1765                        do_cache);
1766         set_lwt_redirect(rth);
1767         skb_dst_set(skb, &rth->dst);
1768 out:
1769         err = 0;
1770  cleanup:
1771         return err;
1772 }
1773
1774 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1775 /* To make ICMP packets follow the right flow, the multipath hash is
1776  * calculated from the inner IP addresses.
1777  */
1778 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1779                                  struct flow_keys *hash_keys)
1780 {
1781         const struct iphdr *outer_iph = ip_hdr(skb);
1782         const struct iphdr *inner_iph;
1783         const struct icmphdr *icmph;
1784         struct iphdr _inner_iph;
1785         struct icmphdr _icmph;
1786
1787         hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1788         hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1789         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1790                 return;
1791
1792         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1793                 return;
1794
1795         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1796                                    &_icmph);
1797         if (!icmph)
1798                 return;
1799
1800         if (icmph->type != ICMP_DEST_UNREACH &&
1801             icmph->type != ICMP_REDIRECT &&
1802             icmph->type != ICMP_TIME_EXCEEDED &&
1803             icmph->type != ICMP_PARAMETERPROB)
1804                 return;
1805
1806         inner_iph = skb_header_pointer(skb,
1807                                        outer_iph->ihl * 4 + sizeof(_icmph),
1808                                        sizeof(_inner_iph), &_inner_iph);
1809         if (!inner_iph)
1810                 return;
1811         hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1812         hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1813 }
1814
1815 /* if skb is set it will be used and fl4 can be NULL */
1816 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1817                        const struct sk_buff *skb)
1818 {
1819         struct net *net = fi->fib_net;
1820         struct flow_keys hash_keys;
1821         u32 mhash;
1822
1823         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1824         case 0:
1825                 memset(&hash_keys, 0, sizeof(hash_keys));
1826                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1827                 if (skb) {
1828                         ip_multipath_l3_keys(skb, &hash_keys);
1829                 } else {
1830                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1831                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1832                 }
1833                 break;
1834         case 1:
1835                 /* skb is currently provided only when forwarding */
1836                 if (skb) {
1837                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1838                         struct flow_keys keys;
1839
1840                         /* short-circuit if we already have L4 hash present */
1841                         if (skb->l4_hash)
1842                                 return skb_get_hash_raw(skb) >> 1;
1843                         memset(&hash_keys, 0, sizeof(hash_keys));
1844                         skb_flow_dissect_flow_keys(skb, &keys, flag);
1845
1846                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1847                         hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1848                         hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1849                         hash_keys.ports.src = keys.ports.src;
1850                         hash_keys.ports.dst = keys.ports.dst;
1851                         hash_keys.basic.ip_proto = keys.basic.ip_proto;
1852                 } else {
1853                         memset(&hash_keys, 0, sizeof(hash_keys));
1854                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1855                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1856                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1857                         hash_keys.ports.src = fl4->fl4_sport;
1858                         hash_keys.ports.dst = fl4->fl4_dport;
1859                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1860                 }
1861                 break;
1862         }
1863         mhash = flow_hash_from_keys(&hash_keys);
1864
1865         return mhash >> 1;
1866 }
1867 EXPORT_SYMBOL_GPL(fib_multipath_hash);
1868 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1869
1870 static int ip_mkroute_input(struct sk_buff *skb,
1871                             struct fib_result *res,
1872                             struct in_device *in_dev,
1873                             __be32 daddr, __be32 saddr, u32 tos)
1874 {
1875 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1876         if (res->fi && res->fi->fib_nhs > 1) {
1877                 int h = fib_multipath_hash(res->fi, NULL, skb);
1878
1879                 fib_select_multipath(res, h);
1880         }
1881 #endif
1882
1883         /* create a routing cache entry */
1884         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1885 }
1886
1887 /*
1888  *      NOTE. We drop all the packets that has local source
1889  *      addresses, because every properly looped back packet
1890  *      must have correct destination already attached by output routine.
1891  *
1892  *      Such approach solves two big problems:
1893  *      1. Not simplex devices are handled properly.
1894  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1895  *      called with rcu_read_lock()
1896  */
1897
1898 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1899                                u8 tos, struct net_device *dev,
1900                                struct fib_result *res)
1901 {
1902         struct in_device *in_dev = __in_dev_get_rcu(dev);
1903         struct ip_tunnel_info *tun_info;
1904         struct flowi4   fl4;
1905         unsigned int    flags = 0;
1906         u32             itag = 0;
1907         struct rtable   *rth;
1908         int             err = -EINVAL;
1909         struct net    *net = dev_net(dev);
1910         bool do_cache;
1911
1912         /* IP on this device is disabled. */
1913
1914         if (!in_dev)
1915                 goto out;
1916
1917         /* Check for the most weird martians, which can be not detected
1918            by fib_lookup.
1919          */
1920
1921         tun_info = skb_tunnel_info(skb);
1922         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1923                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1924         else
1925                 fl4.flowi4_tun_key.tun_id = 0;
1926         skb_dst_drop(skb);
1927
1928         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1929                 goto martian_source;
1930
1931         res->fi = NULL;
1932         res->table = NULL;
1933         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1934                 goto brd_input;
1935
1936         /* Accept zero addresses only to limited broadcast;
1937          * I even do not know to fix it or not. Waiting for complains :-)
1938          */
1939         if (ipv4_is_zeronet(saddr))
1940                 goto martian_source;
1941
1942         if (ipv4_is_zeronet(daddr))
1943                 goto martian_destination;
1944
1945         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1946          * and call it once if daddr or/and saddr are loopback addresses
1947          */
1948         if (ipv4_is_loopback(daddr)) {
1949                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1950                         goto martian_destination;
1951         } else if (ipv4_is_loopback(saddr)) {
1952                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1953                         goto martian_source;
1954         }
1955
1956         /*
1957          *      Now we are ready to route packet.
1958          */
1959         fl4.flowi4_oif = 0;
1960         fl4.flowi4_iif = dev->ifindex;
1961         fl4.flowi4_mark = skb->mark;
1962         fl4.flowi4_tos = tos;
1963         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1964         fl4.flowi4_flags = 0;
1965         fl4.daddr = daddr;
1966         fl4.saddr = saddr;
1967         fl4.flowi4_uid = sock_net_uid(net, NULL);
1968         err = fib_lookup(net, &fl4, res, 0);
1969         if (err != 0) {
1970                 if (!IN_DEV_FORWARD(in_dev))
1971                         err = -EHOSTUNREACH;
1972                 goto no_route;
1973         }
1974
1975         if (res->type == RTN_BROADCAST)
1976                 goto brd_input;
1977
1978         if (res->type == RTN_LOCAL) {
1979                 err = fib_validate_source(skb, saddr, daddr, tos,
1980                                           0, dev, in_dev, &itag);
1981                 if (err < 0)
1982                         goto martian_source;
1983                 goto local_input;
1984         }
1985
1986         if (!IN_DEV_FORWARD(in_dev)) {
1987                 err = -EHOSTUNREACH;
1988                 goto no_route;
1989         }
1990         if (res->type != RTN_UNICAST)
1991                 goto martian_destination;
1992
1993         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1994 out:    return err;
1995
1996 brd_input:
1997         if (skb->protocol != htons(ETH_P_IP))
1998                 goto e_inval;
1999
2000         if (!ipv4_is_zeronet(saddr)) {
2001                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2002                                           in_dev, &itag);
2003                 if (err < 0)
2004                         goto martian_source;
2005         }
2006         flags |= RTCF_BROADCAST;
2007         res->type = RTN_BROADCAST;
2008         RT_CACHE_STAT_INC(in_brd);
2009
2010 local_input:
2011         do_cache = false;
2012         if (res->fi) {
2013                 if (!itag) {
2014                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2015                         if (rt_cache_valid(rth)) {
2016                                 skb_dst_set_noref(skb, &rth->dst);
2017                                 err = 0;
2018                                 goto out;
2019                         }
2020                         do_cache = true;
2021                 }
2022         }
2023
2024         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2025                            flags | RTCF_LOCAL, res->type,
2026                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2027         if (!rth)
2028                 goto e_nobufs;
2029
2030         rth->dst.output= ip_rt_bug;
2031 #ifdef CONFIG_IP_ROUTE_CLASSID
2032         rth->dst.tclassid = itag;
2033 #endif
2034         rth->rt_is_input = 1;
2035         if (res->table)
2036                 rth->rt_table_id = res->table->tb_id;
2037
2038         RT_CACHE_STAT_INC(in_slow_tot);
2039         if (res->type == RTN_UNREACHABLE) {
2040                 rth->dst.input= ip_error;
2041                 rth->dst.error= -err;
2042                 rth->rt_flags   &= ~RTCF_LOCAL;
2043         }
2044
2045         if (do_cache) {
2046                 struct fib_nh *nh = &FIB_RES_NH(*res);
2047
2048                 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2049                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2050                         WARN_ON(rth->dst.input == lwtunnel_input);
2051                         rth->dst.lwtstate->orig_input = rth->dst.input;
2052                         rth->dst.input = lwtunnel_input;
2053                 }
2054
2055                 if (unlikely(!rt_cache_route(nh, rth)))
2056                         rt_add_uncached_list(rth);
2057         }
2058         skb_dst_set(skb, &rth->dst);
2059         err = 0;
2060         goto out;
2061
2062 no_route:
2063         RT_CACHE_STAT_INC(in_no_route);
2064         res->type = RTN_UNREACHABLE;
2065         res->fi = NULL;
2066         res->table = NULL;
2067         goto local_input;
2068
2069         /*
2070          *      Do not cache martian addresses: they should be logged (RFC1812)
2071          */
2072 martian_destination:
2073         RT_CACHE_STAT_INC(in_martian_dst);
2074 #ifdef CONFIG_IP_ROUTE_VERBOSE
2075         if (IN_DEV_LOG_MARTIANS(in_dev))
2076                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2077                                      &daddr, &saddr, dev->name);
2078 #endif
2079
2080 e_inval:
2081         err = -EINVAL;
2082         goto out;
2083
2084 e_nobufs:
2085         err = -ENOBUFS;
2086         goto out;
2087
2088 martian_source:
2089         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2090         goto out;
2091 }
2092
2093 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2094                          u8 tos, struct net_device *dev)
2095 {
2096         struct fib_result res;
2097         int err;
2098
2099         tos &= IPTOS_RT_MASK;
2100         rcu_read_lock();
2101         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2102         rcu_read_unlock();
2103
2104         return err;
2105 }
2106 EXPORT_SYMBOL(ip_route_input_noref);
2107
2108 /* called with rcu_read_lock held */
2109 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2110                        u8 tos, struct net_device *dev, struct fib_result *res)
2111 {
2112         /* Multicast recognition logic is moved from route cache to here.
2113            The problem was that too many Ethernet cards have broken/missing
2114            hardware multicast filters :-( As result the host on multicasting
2115            network acquires a lot of useless route cache entries, sort of
2116            SDR messages from all the world. Now we try to get rid of them.
2117            Really, provided software IP multicast filter is organized
2118            reasonably (at least, hashed), it does not result in a slowdown
2119            comparing with route cache reject entries.
2120            Note, that multicast routers are not affected, because
2121            route cache entry is created eventually.
2122          */
2123         if (ipv4_is_multicast(daddr)) {
2124                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2125                 int our = 0;
2126                 int err = -EINVAL;
2127
2128                 if (in_dev)
2129                         our = ip_check_mc_rcu(in_dev, daddr, saddr,
2130                                               ip_hdr(skb)->protocol);
2131
2132                 /* check l3 master if no match yet */
2133                 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2134                         struct in_device *l3_in_dev;
2135
2136                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2137                         if (l3_in_dev)
2138                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2139                                                       ip_hdr(skb)->protocol);
2140                 }
2141
2142                 if (our
2143 #ifdef CONFIG_IP_MROUTE
2144                         ||
2145                     (!ipv4_is_local_multicast(daddr) &&
2146                      IN_DEV_MFORWARD(in_dev))
2147 #endif
2148                    ) {
2149                         err = ip_route_input_mc(skb, daddr, saddr,
2150                                                 tos, dev, our);
2151                 }
2152                 return err;
2153         }
2154
2155         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2156 }
2157
2158 /* called with rcu_read_lock() */
2159 static struct rtable *__mkroute_output(const struct fib_result *res,
2160                                        const struct flowi4 *fl4, int orig_oif,
2161                                        struct net_device *dev_out,
2162                                        unsigned int flags)
2163 {
2164         struct fib_info *fi = res->fi;
2165         struct fib_nh_exception *fnhe;
2166         struct in_device *in_dev;
2167         u16 type = res->type;
2168         struct rtable *rth;
2169         bool do_cache;
2170
2171         in_dev = __in_dev_get_rcu(dev_out);
2172         if (!in_dev)
2173                 return ERR_PTR(-EINVAL);
2174
2175         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2176                 if (ipv4_is_loopback(fl4->saddr) &&
2177                     !(dev_out->flags & IFF_LOOPBACK) &&
2178                     !netif_is_l3_master(dev_out))
2179                         return ERR_PTR(-EINVAL);
2180
2181         if (ipv4_is_lbcast(fl4->daddr))
2182                 type = RTN_BROADCAST;
2183         else if (ipv4_is_multicast(fl4->daddr))
2184                 type = RTN_MULTICAST;
2185         else if (ipv4_is_zeronet(fl4->daddr))
2186                 return ERR_PTR(-EINVAL);
2187
2188         if (dev_out->flags & IFF_LOOPBACK)
2189                 flags |= RTCF_LOCAL;
2190
2191         do_cache = true;
2192         if (type == RTN_BROADCAST) {
2193                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2194                 fi = NULL;
2195         } else if (type == RTN_MULTICAST) {
2196                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2197                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2198                                      fl4->flowi4_proto))
2199                         flags &= ~RTCF_LOCAL;
2200                 else
2201                         do_cache = false;
2202                 /* If multicast route do not exist use
2203                  * default one, but do not gateway in this case.
2204                  * Yes, it is hack.
2205                  */
2206                 if (fi && res->prefixlen < 4)
2207                         fi = NULL;
2208         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2209                    (orig_oif != dev_out->ifindex)) {
2210                 /* For local routes that require a particular output interface
2211                  * we do not want to cache the result.  Caching the result
2212                  * causes incorrect behaviour when there are multiple source
2213                  * addresses on the interface, the end result being that if the
2214                  * intended recipient is waiting on that interface for the
2215                  * packet he won't receive it because it will be delivered on
2216                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2217                  * be set to the loopback interface as well.
2218                  */
2219                 do_cache = false;
2220         }
2221
2222         fnhe = NULL;
2223         do_cache &= fi != NULL;
2224         if (fi) {
2225                 struct rtable __rcu **prth;
2226                 struct fib_nh *nh = &FIB_RES_NH(*res);
2227
2228                 fnhe = find_exception(nh, fl4->daddr);
2229                 if (!do_cache)
2230                         goto add;
2231                 if (fnhe) {
2232                         prth = &fnhe->fnhe_rth_output;
2233                 } else {
2234                         if (unlikely(fl4->flowi4_flags &
2235                                      FLOWI_FLAG_KNOWN_NH &&
2236                                      !(nh->nh_gw &&
2237                                        nh->nh_scope == RT_SCOPE_LINK))) {
2238                                 do_cache = false;
2239                                 goto add;
2240                         }
2241                         prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2242                 }
2243                 rth = rcu_dereference(*prth);
2244                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2245                         return rth;
2246         }
2247
2248 add:
2249         rth = rt_dst_alloc(dev_out, flags, type,
2250                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2251                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2252                            do_cache);
2253         if (!rth)
2254                 return ERR_PTR(-ENOBUFS);
2255
2256         rth->rt_iif = orig_oif;
2257         if (res->table)
2258                 rth->rt_table_id = res->table->tb_id;
2259
2260         RT_CACHE_STAT_INC(out_slow_tot);
2261
2262         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2263                 if (flags & RTCF_LOCAL &&
2264                     !(dev_out->flags & IFF_LOOPBACK)) {
2265                         rth->dst.output = ip_mc_output;
2266                         RT_CACHE_STAT_INC(out_slow_mc);
2267                 }
2268 #ifdef CONFIG_IP_MROUTE
2269                 if (type == RTN_MULTICAST) {
2270                         if (IN_DEV_MFORWARD(in_dev) &&
2271                             !ipv4_is_local_multicast(fl4->daddr)) {
2272                                 rth->dst.input = ip_mr_input;
2273                                 rth->dst.output = ip_mc_output;
2274                         }
2275                 }
2276 #endif
2277         }
2278
2279         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2280         set_lwt_redirect(rth);
2281
2282         return rth;
2283 }
2284
2285 /*
2286  * Major route resolver routine.
2287  */
2288
2289 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2290                                         const struct sk_buff *skb)
2291 {
2292         __u8 tos = RT_FL_TOS(fl4);
2293         struct fib_result res = {
2294                 .type           = RTN_UNSPEC,
2295                 .fi             = NULL,
2296                 .table          = NULL,
2297                 .tclassid       = 0,
2298         };
2299         struct rtable *rth;
2300
2301         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2302         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2303         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2304                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2305
2306         rcu_read_lock();
2307         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2308         rcu_read_unlock();
2309
2310         return rth;
2311 }
2312 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2313
2314 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2315                                             struct fib_result *res,
2316                                             const struct sk_buff *skb)
2317 {
2318         struct net_device *dev_out = NULL;
2319         int orig_oif = fl4->flowi4_oif;
2320         unsigned int flags = 0;
2321         struct rtable *rth;
2322         int err = -ENETUNREACH;
2323
2324         if (fl4->saddr) {
2325                 rth = ERR_PTR(-EINVAL);
2326                 if (ipv4_is_multicast(fl4->saddr) ||
2327                     ipv4_is_lbcast(fl4->saddr) ||
2328                     ipv4_is_zeronet(fl4->saddr))
2329                         goto out;
2330
2331                 /* I removed check for oif == dev_out->oif here.
2332                    It was wrong for two reasons:
2333                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2334                       is assigned to multiple interfaces.
2335                    2. Moreover, we are allowed to send packets with saddr
2336                       of another iface. --ANK
2337                  */
2338
2339                 if (fl4->flowi4_oif == 0 &&
2340                     (ipv4_is_multicast(fl4->daddr) ||
2341                      ipv4_is_lbcast(fl4->daddr))) {
2342                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2343                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2344                         if (!dev_out)
2345                                 goto out;
2346
2347                         /* Special hack: user can direct multicasts
2348                            and limited broadcast via necessary interface
2349                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2350                            This hack is not just for fun, it allows
2351                            vic,vat and friends to work.
2352                            They bind socket to loopback, set ttl to zero
2353                            and expect that it will work.
2354                            From the viewpoint of routing cache they are broken,
2355                            because we are not allowed to build multicast path
2356                            with loopback source addr (look, routing cache
2357                            cannot know, that ttl is zero, so that packet
2358                            will not leave this host and route is valid).
2359                            Luckily, this hack is good workaround.
2360                          */
2361
2362                         fl4->flowi4_oif = dev_out->ifindex;
2363                         goto make_route;
2364                 }
2365
2366                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2367                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2368                         if (!__ip_dev_find(net, fl4->saddr, false))
2369                                 goto out;
2370                 }
2371         }
2372
2373
2374         if (fl4->flowi4_oif) {
2375                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2376                 rth = ERR_PTR(-ENODEV);
2377                 if (!dev_out)
2378                         goto out;
2379
2380                 /* RACE: Check return value of inet_select_addr instead. */
2381                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2382                         rth = ERR_PTR(-ENETUNREACH);
2383                         goto out;
2384                 }
2385                 if (ipv4_is_local_multicast(fl4->daddr) ||
2386                     ipv4_is_lbcast(fl4->daddr) ||
2387                     fl4->flowi4_proto == IPPROTO_IGMP) {
2388                         if (!fl4->saddr)
2389                                 fl4->saddr = inet_select_addr(dev_out, 0,
2390                                                               RT_SCOPE_LINK);
2391                         goto make_route;
2392                 }
2393                 if (!fl4->saddr) {
2394                         if (ipv4_is_multicast(fl4->daddr))
2395                                 fl4->saddr = inet_select_addr(dev_out, 0,
2396                                                               fl4->flowi4_scope);
2397                         else if (!fl4->daddr)
2398                                 fl4->saddr = inet_select_addr(dev_out, 0,
2399                                                               RT_SCOPE_HOST);
2400                 }
2401         }
2402
2403         if (!fl4->daddr) {
2404                 fl4->daddr = fl4->saddr;
2405                 if (!fl4->daddr)
2406                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2407                 dev_out = net->loopback_dev;
2408                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2409                 res->type = RTN_LOCAL;
2410                 flags |= RTCF_LOCAL;
2411                 goto make_route;
2412         }
2413
2414         err = fib_lookup(net, fl4, res, 0);
2415         if (err) {
2416                 res->fi = NULL;
2417                 res->table = NULL;
2418                 if (fl4->flowi4_oif &&
2419                     (ipv4_is_multicast(fl4->daddr) ||
2420                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2421                         /* Apparently, routing tables are wrong. Assume,
2422                            that the destination is on link.
2423
2424                            WHY? DW.
2425                            Because we are allowed to send to iface
2426                            even if it has NO routes and NO assigned
2427                            addresses. When oif is specified, routing
2428                            tables are looked up with only one purpose:
2429                            to catch if destination is gatewayed, rather than
2430                            direct. Moreover, if MSG_DONTROUTE is set,
2431                            we send packet, ignoring both routing tables
2432                            and ifaddr state. --ANK
2433
2434
2435                            We could make it even if oif is unknown,
2436                            likely IPv6, but we do not.
2437                          */
2438
2439                         if (fl4->saddr == 0)
2440                                 fl4->saddr = inet_select_addr(dev_out, 0,
2441                                                               RT_SCOPE_LINK);
2442                         res->type = RTN_UNICAST;
2443                         goto make_route;
2444                 }
2445                 rth = ERR_PTR(err);
2446                 goto out;
2447         }
2448
2449         if (res->type == RTN_LOCAL) {
2450                 if (!fl4->saddr) {
2451                         if (res->fi->fib_prefsrc)
2452                                 fl4->saddr = res->fi->fib_prefsrc;
2453                         else
2454                                 fl4->saddr = fl4->daddr;
2455                 }
2456
2457                 /* L3 master device is the loopback for that domain */
2458                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2459                         net->loopback_dev;
2460
2461                 /* make sure orig_oif points to fib result device even
2462                  * though packet rx/tx happens over loopback or l3mdev
2463                  */
2464                 orig_oif = FIB_RES_OIF(*res);
2465
2466                 fl4->flowi4_oif = dev_out->ifindex;
2467                 flags |= RTCF_LOCAL;
2468                 goto make_route;
2469         }
2470
2471         fib_select_path(net, res, fl4, skb);
2472
2473         dev_out = FIB_RES_DEV(*res);
2474         fl4->flowi4_oif = dev_out->ifindex;
2475
2476
2477 make_route:
2478         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2479
2480 out:
2481         return rth;
2482 }
2483
2484 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2485 {
2486         return NULL;
2487 }
2488
2489 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2490 {
2491         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2492
2493         return mtu ? : dst->dev->mtu;
2494 }
2495
2496 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2497                                           struct sk_buff *skb, u32 mtu)
2498 {
2499 }
2500
2501 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2502                                        struct sk_buff *skb)
2503 {
2504 }
2505
2506 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2507                                           unsigned long old)
2508 {
2509         return NULL;
2510 }
2511
2512 static struct dst_ops ipv4_dst_blackhole_ops = {
2513         .family                 =       AF_INET,
2514         .check                  =       ipv4_blackhole_dst_check,
2515         .mtu                    =       ipv4_blackhole_mtu,
2516         .default_advmss         =       ipv4_default_advmss,
2517         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2518         .redirect               =       ipv4_rt_blackhole_redirect,
2519         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2520         .neigh_lookup           =       ipv4_neigh_lookup,
2521 };
2522
2523 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2524 {
2525         struct rtable *ort = (struct rtable *) dst_orig;
2526         struct rtable *rt;
2527
2528         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2529         if (rt) {
2530                 struct dst_entry *new = &rt->dst;
2531
2532                 new->__use = 1;
2533                 new->input = dst_discard;
2534                 new->output = dst_discard_out;
2535
2536                 new->dev = net->loopback_dev;
2537                 if (new->dev)
2538                         dev_hold(new->dev);
2539
2540                 rt->rt_is_input = ort->rt_is_input;
2541                 rt->rt_iif = ort->rt_iif;
2542                 rt->rt_pmtu = ort->rt_pmtu;
2543                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2544
2545                 rt->rt_genid = rt_genid_ipv4(net);
2546                 rt->rt_flags = ort->rt_flags;
2547                 rt->rt_type = ort->rt_type;
2548                 rt->rt_gateway = ort->rt_gateway;
2549                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2550
2551                 INIT_LIST_HEAD(&rt->rt_uncached);
2552         }
2553
2554         dst_release(dst_orig);
2555
2556         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2557 }
2558
2559 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2560                                     const struct sock *sk)
2561 {
2562         struct rtable *rt = __ip_route_output_key(net, flp4);
2563
2564         if (IS_ERR(rt))
2565                 return rt;
2566
2567         if (flp4->flowi4_proto)
2568                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2569                                                         flowi4_to_flowi(flp4),
2570                                                         sk, 0);
2571
2572         return rt;
2573 }
2574 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2575
2576 /* called with rcu_read_lock held */
2577 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2578                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2579                         u32 seq)
2580 {
2581         struct rtable *rt = skb_rtable(skb);
2582         struct rtmsg *r;
2583         struct nlmsghdr *nlh;
2584         unsigned long expires = 0;
2585         u32 error;
2586         u32 metrics[RTAX_MAX];
2587
2588         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2589         if (!nlh)
2590                 return -EMSGSIZE;
2591
2592         r = nlmsg_data(nlh);
2593         r->rtm_family    = AF_INET;
2594         r->rtm_dst_len  = 32;
2595         r->rtm_src_len  = 0;
2596         r->rtm_tos      = fl4->flowi4_tos;
2597         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2598         if (nla_put_u32(skb, RTA_TABLE, table_id))
2599                 goto nla_put_failure;
2600         r->rtm_type     = rt->rt_type;
2601         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2602         r->rtm_protocol = RTPROT_UNSPEC;
2603         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2604         if (rt->rt_flags & RTCF_NOTIFY)
2605                 r->rtm_flags |= RTM_F_NOTIFY;
2606         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2607                 r->rtm_flags |= RTCF_DOREDIRECT;
2608
2609         if (nla_put_in_addr(skb, RTA_DST, dst))
2610                 goto nla_put_failure;
2611         if (src) {
2612                 r->rtm_src_len = 32;
2613                 if (nla_put_in_addr(skb, RTA_SRC, src))
2614                         goto nla_put_failure;
2615         }
2616         if (rt->dst.dev &&
2617             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2618                 goto nla_put_failure;
2619 #ifdef CONFIG_IP_ROUTE_CLASSID
2620         if (rt->dst.tclassid &&
2621             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2622                 goto nla_put_failure;
2623 #endif
2624         if (!rt_is_input_route(rt) &&
2625             fl4->saddr != src) {
2626                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2627                         goto nla_put_failure;
2628         }
2629         if (rt->rt_uses_gateway &&
2630             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2631                 goto nla_put_failure;
2632
2633         expires = rt->dst.expires;
2634         if (expires) {
2635                 unsigned long now = jiffies;
2636
2637                 if (time_before(now, expires))
2638                         expires -= now;
2639                 else
2640                         expires = 0;
2641         }
2642
2643         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2644         if (rt->rt_pmtu && expires)
2645                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2646         if (rt->rt_mtu_locked && expires)
2647                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2648         if (rtnetlink_put_metrics(skb, metrics) < 0)
2649                 goto nla_put_failure;
2650
2651         if (fl4->flowi4_mark &&
2652             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2653                 goto nla_put_failure;
2654
2655         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2656             nla_put_u32(skb, RTA_UID,
2657                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2658                 goto nla_put_failure;
2659
2660         error = rt->dst.error;
2661
2662         if (rt_is_input_route(rt)) {
2663 #ifdef CONFIG_IP_MROUTE
2664                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2665                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2666                         int err = ipmr_get_route(net, skb,
2667                                                  fl4->saddr, fl4->daddr,
2668                                                  r, portid);
2669
2670                         if (err <= 0) {
2671                                 if (err == 0)
2672                                         return 0;
2673                                 goto nla_put_failure;
2674                         }
2675                 } else
2676 #endif
2677                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2678                                 goto nla_put_failure;
2679         }
2680
2681         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2682                 goto nla_put_failure;
2683
2684         nlmsg_end(skb, nlh);
2685         return 0;
2686
2687 nla_put_failure:
2688         nlmsg_cancel(skb, nlh);
2689         return -EMSGSIZE;
2690 }
2691
2692 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2693                              struct netlink_ext_ack *extack)
2694 {
2695         struct net *net = sock_net(in_skb->sk);
2696         struct rtmsg *rtm;
2697         struct nlattr *tb[RTA_MAX+1];
2698         struct fib_result res = {};
2699         struct rtable *rt = NULL;
2700         struct flowi4 fl4;
2701         __be32 dst = 0;
2702         __be32 src = 0;
2703         u32 iif;
2704         int err;
2705         int mark;
2706         struct sk_buff *skb;
2707         u32 table_id = RT_TABLE_MAIN;
2708         kuid_t uid;
2709
2710         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2711                           extack);
2712         if (err < 0)
2713                 goto errout;
2714
2715         rtm = nlmsg_data(nlh);
2716
2717         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2718         if (!skb) {
2719                 err = -ENOBUFS;
2720                 goto errout;
2721         }
2722
2723         /* Reserve room for dummy headers, this skb can pass
2724            through good chunk of routing engine.
2725          */
2726         skb_reset_mac_header(skb);
2727         skb_reset_network_header(skb);
2728
2729         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2730         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2731         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2732         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2733         if (tb[RTA_UID])
2734                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2735         else
2736                 uid = (iif ? INVALID_UID : current_uid());
2737
2738         /* Bugfix: need to give ip_route_input enough of an IP header to
2739          * not gag.
2740          */
2741         ip_hdr(skb)->protocol = IPPROTO_UDP;
2742         ip_hdr(skb)->saddr = src;
2743         ip_hdr(skb)->daddr = dst;
2744
2745         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2746
2747         memset(&fl4, 0, sizeof(fl4));
2748         fl4.daddr = dst;
2749         fl4.saddr = src;
2750         fl4.flowi4_tos = rtm->rtm_tos;
2751         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2752         fl4.flowi4_mark = mark;
2753         fl4.flowi4_uid = uid;
2754
2755         rcu_read_lock();
2756
2757         if (iif) {
2758                 struct net_device *dev;
2759
2760                 dev = dev_get_by_index_rcu(net, iif);
2761                 if (!dev) {
2762                         err = -ENODEV;
2763                         goto errout_free;
2764                 }
2765
2766                 skb->protocol   = htons(ETH_P_IP);
2767                 skb->dev        = dev;
2768                 skb->mark       = mark;
2769                 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2770                                          dev, &res);
2771
2772                 rt = skb_rtable(skb);
2773                 if (err == 0 && rt->dst.error)
2774                         err = -rt->dst.error;
2775         } else {
2776                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2777                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2778                 err = 0;
2779                 if (IS_ERR(rt))
2780                         err = PTR_ERR(rt);
2781                 else
2782                         skb_dst_set(skb, &rt->dst);
2783         }
2784
2785         if (err)
2786                 goto errout_free;
2787
2788         if (rtm->rtm_flags & RTM_F_NOTIFY)
2789                 rt->rt_flags |= RTCF_NOTIFY;
2790
2791         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2792                 table_id = rt->rt_table_id;
2793
2794         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2795                 if (!res.fi) {
2796                         err = fib_props[res.type].error;
2797                         if (!err)
2798                                 err = -EHOSTUNREACH;
2799                         goto errout_free;
2800                 }
2801                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2802                                     nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2803                                     rt->rt_type, res.prefix, res.prefixlen,
2804                                     fl4.flowi4_tos, res.fi, 0);
2805         } else {
2806                 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2807                                    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2808         }
2809         if (err < 0)
2810                 goto errout_free;
2811
2812         rcu_read_unlock();
2813
2814         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2815 errout:
2816         return err;
2817
2818 errout_free:
2819         rcu_read_unlock();
2820         kfree_skb(skb);
2821         goto errout;
2822 }
2823
2824 void ip_rt_multicast_event(struct in_device *in_dev)
2825 {
2826         rt_cache_flush(dev_net(in_dev->dev));
2827 }
2828
2829 #ifdef CONFIG_SYSCTL
2830 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2831 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2832 static int ip_rt_gc_elasticity __read_mostly    = 8;
2833
2834 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2835                                         void __user *buffer,
2836                                         size_t *lenp, loff_t *ppos)
2837 {
2838         struct net *net = (struct net *)__ctl->extra1;
2839
2840         if (write) {
2841                 rt_cache_flush(net);
2842                 fnhe_genid_bump(net);
2843                 return 0;
2844         }
2845
2846         return -EINVAL;
2847 }
2848
2849 static struct ctl_table ipv4_route_table[] = {
2850         {
2851                 .procname       = "gc_thresh",
2852                 .data           = &ipv4_dst_ops.gc_thresh,
2853                 .maxlen         = sizeof(int),
2854                 .mode           = 0644,
2855                 .proc_handler   = proc_dointvec,
2856         },
2857         {
2858                 .procname       = "max_size",
2859                 .data           = &ip_rt_max_size,
2860                 .maxlen         = sizeof(int),
2861                 .mode           = 0644,
2862                 .proc_handler   = proc_dointvec,
2863         },
2864         {
2865                 /*  Deprecated. Use gc_min_interval_ms */
2866
2867                 .procname       = "gc_min_interval",
2868                 .data           = &ip_rt_gc_min_interval,
2869                 .maxlen         = sizeof(int),
2870                 .mode           = 0644,
2871                 .proc_handler   = proc_dointvec_jiffies,
2872         },
2873         {
2874                 .procname       = "gc_min_interval_ms",
2875                 .data           = &ip_rt_gc_min_interval,
2876                 .maxlen         = sizeof(int),
2877                 .mode           = 0644,
2878                 .proc_handler   = proc_dointvec_ms_jiffies,
2879         },
2880         {
2881                 .procname       = "gc_timeout",
2882                 .data           = &ip_rt_gc_timeout,
2883                 .maxlen         = sizeof(int),
2884                 .mode           = 0644,
2885                 .proc_handler   = proc_dointvec_jiffies,
2886         },
2887         {
2888                 .procname       = "gc_interval",
2889                 .data           = &ip_rt_gc_interval,
2890                 .maxlen         = sizeof(int),
2891                 .mode           = 0644,
2892                 .proc_handler   = proc_dointvec_jiffies,
2893         },
2894         {
2895                 .procname       = "redirect_load",
2896                 .data           = &ip_rt_redirect_load,
2897                 .maxlen         = sizeof(int),
2898                 .mode           = 0644,
2899                 .proc_handler   = proc_dointvec,
2900         },
2901         {
2902                 .procname       = "redirect_number",
2903                 .data           = &ip_rt_redirect_number,
2904                 .maxlen         = sizeof(int),
2905                 .mode           = 0644,
2906                 .proc_handler   = proc_dointvec,
2907         },
2908         {
2909                 .procname       = "redirect_silence",
2910                 .data           = &ip_rt_redirect_silence,
2911                 .maxlen         = sizeof(int),
2912                 .mode           = 0644,
2913                 .proc_handler   = proc_dointvec,
2914         },
2915         {
2916                 .procname       = "error_cost",
2917                 .data           = &ip_rt_error_cost,
2918                 .maxlen         = sizeof(int),
2919                 .mode           = 0644,
2920                 .proc_handler   = proc_dointvec,
2921         },
2922         {
2923                 .procname       = "error_burst",
2924                 .data           = &ip_rt_error_burst,
2925                 .maxlen         = sizeof(int),
2926                 .mode           = 0644,
2927                 .proc_handler   = proc_dointvec,
2928         },
2929         {
2930                 .procname       = "gc_elasticity",
2931                 .data           = &ip_rt_gc_elasticity,
2932                 .maxlen         = sizeof(int),
2933                 .mode           = 0644,
2934                 .proc_handler   = proc_dointvec,
2935         },
2936         {
2937                 .procname       = "mtu_expires",
2938                 .data           = &ip_rt_mtu_expires,
2939                 .maxlen         = sizeof(int),
2940                 .mode           = 0644,
2941                 .proc_handler   = proc_dointvec_jiffies,
2942         },
2943         {
2944                 .procname       = "min_pmtu",
2945                 .data           = &ip_rt_min_pmtu,
2946                 .maxlen         = sizeof(int),
2947                 .mode           = 0644,
2948                 .proc_handler   = proc_dointvec_minmax,
2949                 .extra1         = &ip_min_valid_pmtu,
2950         },
2951         {
2952                 .procname       = "min_adv_mss",
2953                 .data           = &ip_rt_min_advmss,
2954                 .maxlen         = sizeof(int),
2955                 .mode           = 0644,
2956                 .proc_handler   = proc_dointvec,
2957         },
2958         { }
2959 };
2960
2961 static struct ctl_table ipv4_route_flush_table[] = {
2962         {
2963                 .procname       = "flush",
2964                 .maxlen         = sizeof(int),
2965                 .mode           = 0200,
2966                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2967         },
2968         { },
2969 };
2970
2971 static __net_init int sysctl_route_net_init(struct net *net)
2972 {
2973         struct ctl_table *tbl;
2974
2975         tbl = ipv4_route_flush_table;
2976         if (!net_eq(net, &init_net)) {
2977                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2978                 if (!tbl)
2979                         goto err_dup;
2980
2981                 /* Don't export sysctls to unprivileged users */
2982                 if (net->user_ns != &init_user_ns)
2983                         tbl[0].procname = NULL;
2984         }
2985         tbl[0].extra1 = net;
2986
2987         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2988         if (!net->ipv4.route_hdr)
2989                 goto err_reg;
2990         return 0;
2991
2992 err_reg:
2993         if (tbl != ipv4_route_flush_table)
2994                 kfree(tbl);
2995 err_dup:
2996         return -ENOMEM;
2997 }
2998
2999 static __net_exit void sysctl_route_net_exit(struct net *net)
3000 {
3001         struct ctl_table *tbl;
3002
3003         tbl = net->ipv4.route_hdr->ctl_table_arg;
3004         unregister_net_sysctl_table(net->ipv4.route_hdr);
3005         BUG_ON(tbl == ipv4_route_flush_table);
3006         kfree(tbl);
3007 }
3008
3009 static __net_initdata struct pernet_operations sysctl_route_ops = {
3010         .init = sysctl_route_net_init,
3011         .exit = sysctl_route_net_exit,
3012 };
3013 #endif
3014
3015 static __net_init int rt_genid_init(struct net *net)
3016 {
3017         atomic_set(&net->ipv4.rt_genid, 0);
3018         atomic_set(&net->fnhe_genid, 0);
3019         atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3020         return 0;
3021 }
3022
3023 static __net_initdata struct pernet_operations rt_genid_ops = {
3024         .init = rt_genid_init,
3025 };
3026
3027 static int __net_init ipv4_inetpeer_init(struct net *net)
3028 {
3029         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3030
3031         if (!bp)
3032                 return -ENOMEM;
3033         inet_peer_base_init(bp);
3034         net->ipv4.peers = bp;
3035         return 0;
3036 }
3037
3038 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3039 {
3040         struct inet_peer_base *bp = net->ipv4.peers;
3041
3042         net->ipv4.peers = NULL;
3043         inetpeer_invalidate_tree(bp);
3044         kfree(bp);
3045 }
3046
3047 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3048         .init   =       ipv4_inetpeer_init,
3049         .exit   =       ipv4_inetpeer_exit,
3050 };
3051
3052 #ifdef CONFIG_IP_ROUTE_CLASSID
3053 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3054 #endif /* CONFIG_IP_ROUTE_CLASSID */
3055
3056 int __init ip_rt_init(void)
3057 {
3058         int cpu;
3059
3060         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3061         if (!ip_idents)
3062                 panic("IP: failed to allocate ip_idents\n");
3063
3064         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3065
3066         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3067         if (!ip_tstamps)
3068                 panic("IP: failed to allocate ip_tstamps\n");
3069
3070         for_each_possible_cpu(cpu) {
3071                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3072
3073                 INIT_LIST_HEAD(&ul->head);
3074                 spin_lock_init(&ul->lock);
3075         }
3076 #ifdef CONFIG_IP_ROUTE_CLASSID
3077         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3078         if (!ip_rt_acct)
3079                 panic("IP: failed to allocate ip_rt_acct\n");
3080 #endif
3081
3082         ipv4_dst_ops.kmem_cachep =
3083                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3084                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3085
3086         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3087
3088         if (dst_entries_init(&ipv4_dst_ops) < 0)
3089                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3090
3091         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3092                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3093
3094         ipv4_dst_ops.gc_thresh = ~0;
3095         ip_rt_max_size = INT_MAX;
3096
3097         devinet_init();
3098         ip_fib_init();
3099
3100         if (ip_rt_proc_init())
3101                 pr_err("Unable to create route proc files\n");
3102 #ifdef CONFIG_XFRM
3103         xfrm_init();
3104         xfrm4_init();
3105 #endif
3106         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3107                       RTNL_FLAG_DOIT_UNLOCKED);
3108
3109 #ifdef CONFIG_SYSCTL
3110         register_pernet_subsys(&sysctl_route_ops);
3111 #endif
3112         register_pernet_subsys(&rt_genid_ops);
3113         register_pernet_subsys(&ipv4_inetpeer_ops);
3114         return 0;
3115 }
3116
3117 #ifdef CONFIG_SYSCTL
3118 /*
3119  * We really need to sanitize the damn ipv4 init order, then all
3120  * this nonsense will go away.
3121  */
3122 void __init ip_static_sysctl_init(void)
3123 {
3124         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3125 }
3126 #endif