net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/skbuff.h>
  83 #include <linux/inetdevice.h>
  84 #include <linux/igmp.h>
  85 #include <linux/pkt_sched.h>
  86 #include <linux/mroute.h>
  87 #include <linux/netfilter_ipv4.h>
  88 #include <linux/random.h>
  89 #include <linux/rcupdate.h>
  90 #include <linux/times.h>
  91 #include <linux/slab.h>
  92 #include <linux/jhash.h>
  93 #include <net/dst.h>
  94 #include <net/dst_metadata.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/lwtunnel.h>
 107 #include <net/netevent.h>
 108 #include <net/rtnetlink.h>
 109 #ifdef CONFIG_SYSCTL
 110 #include <linux/sysctl.h>
 111 #endif
 112 #include <net/secure_seq.h>
 113 #include <net/ip_tunnels.h>
 114 #include <net/l3mdev.h>
 115
 116 #include "fib_lookup.h"
 117
 118 #define RT_FL_TOS(oldflp4) \
 119         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 120
 121 #define RT_GC_TIMEOUT (300*HZ)
 122
 123 static int ip_rt_max_size;
 124 static int ip_rt_redirect_number __read_mostly  = 9;
 125 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 127 static int ip_rt_error_cost __read_mostly       = HZ;
 128 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 130 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 131 static int ip_rt_min_advmss __read_mostly       = 256;
 132
 133 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 134
 135 /*
 136  *      Interface to generic destination cache.
 137  */
 138
 139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 140 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 141 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 142 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 143 static void              ipv4_link_failure(struct sk_buff *skb);
 144 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 145                                            struct sk_buff *skb, u32 mtu);
 146 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 147                                         struct sk_buff *skb);
 148 static void             ipv4_dst_destroy(struct dst_entry *dst);
 149
 150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 151 {
 152         WARN_ON(1);
 153         return NULL;
 154 }
 155
 156 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 157                                            struct sk_buff *skb,
 158                                            const void *daddr);
 159 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 160
 161 static struct dst_ops ipv4_dst_ops = {
 162         .family =               AF_INET,
 163         .check =                ipv4_dst_check,
 164         .default_advmss =       ipv4_default_advmss,
 165         .mtu =                  ipv4_mtu,
 166         .cow_metrics =          ipv4_cow_metrics,
 167         .destroy =              ipv4_dst_destroy,
 168         .negative_advice =      ipv4_negative_advice,
 169         .link_failure =         ipv4_link_failure,
 170         .update_pmtu =          ip_rt_update_pmtu,
 171         .redirect =             ip_do_redirect,
 172         .local_out =            __ip_local_out,
 173         .neigh_lookup =         ipv4_neigh_lookup,
 174         .confirm_neigh =        ipv4_confirm_neigh,
 175 };
 176
 177 #define ECN_OR_COST(class)      TC_PRIO_##class
 178
 179 const __u8 ip_tos2prio[16] = {
 180         TC_PRIO_BESTEFFORT,
 181         ECN_OR_COST(BESTEFFORT),
 182         TC_PRIO_BESTEFFORT,
 183         ECN_OR_COST(BESTEFFORT),
 184         TC_PRIO_BULK,
 185         ECN_OR_COST(BULK),
 186         TC_PRIO_BULK,
 187         ECN_OR_COST(BULK),
 188         TC_PRIO_INTERACTIVE,
 189         ECN_OR_COST(INTERACTIVE),
 190         TC_PRIO_INTERACTIVE,
 191         ECN_OR_COST(INTERACTIVE),
 192         TC_PRIO_INTERACTIVE_BULK,
 193         ECN_OR_COST(INTERACTIVE_BULK),
 194         TC_PRIO_INTERACTIVE_BULK,
 195         ECN_OR_COST(INTERACTIVE_BULK)
 196 };
 197 EXPORT_SYMBOL(ip_tos2prio);
 198
 199 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 200 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 201
 202 #ifdef CONFIG_PROC_FS
 203 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 204 {
 205         if (*pos)
 206                 return NULL;
 207         return SEQ_START_TOKEN;
 208 }
 209
 210 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 211 {
 212         ++*pos;
 213         return NULL;
 214 }
 215
 216 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 217 {
 218 }
 219
 220 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 221 {
 222         if (v == SEQ_START_TOKEN)
 223                 seq_printf(seq, "%-127s\n",
 224                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 225                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 226                            "HHUptod\tSpecDst");
 227         return 0;
 228 }
 229
 230 static const struct seq_operations rt_cache_seq_ops = {
 231         .start  = rt_cache_seq_start,
 232         .next   = rt_cache_seq_next,
 233         .stop   = rt_cache_seq_stop,
 234         .show   = rt_cache_seq_show,
 235 };
 236
 237 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 238 {
 239         return seq_open(file, &rt_cache_seq_ops);
 240 }
 241
 242 static const struct file_operations rt_cache_seq_fops = {
 243         .open    = rt_cache_seq_open,
 244         .read    = seq_read,
 245         .llseek  = seq_lseek,
 246         .release = seq_release,
 247 };
 248
 249
 250 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 251 {
 252         int cpu;
 253
 254         if (*pos == 0)
 255                 return SEQ_START_TOKEN;
 256
 257         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 258                 if (!cpu_possible(cpu))
 259                         continue;
 260                 *pos = cpu+1;
 261                 return &per_cpu(rt_cache_stat, cpu);
 262         }
 263         return NULL;
 264 }
 265
 266 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 267 {
 268         int cpu;
 269
 270         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 271                 if (!cpu_possible(cpu))
 272                         continue;
 273                 *pos = cpu+1;
 274                 return &per_cpu(rt_cache_stat, cpu);
 275         }
 276         return NULL;
 277
 278 }
 279
 280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 281 {
 282
 283 }
 284
 285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 286 {
 287         struct rt_cache_stat *st = v;
 288
 289         if (v == SEQ_START_TOKEN) {
 290                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 291                 return 0;
 292         }
 293
 294         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 295                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 296                    dst_entries_get_slow(&ipv4_dst_ops),
 297                    0, /* st->in_hit */
 298                    st->in_slow_tot,
 299                    st->in_slow_mc,
 300                    st->in_no_route,
 301                    st->in_brd,
 302                    st->in_martian_dst,
 303                    st->in_martian_src,
 304
 305                    0, /* st->out_hit */
 306                    st->out_slow_tot,
 307                    st->out_slow_mc,
 308
 309                    0, /* st->gc_total */
 310                    0, /* st->gc_ignored */
 311                    0, /* st->gc_goal_miss */
 312                    0, /* st->gc_dst_overflow */
 313                    0, /* st->in_hlist_search */
 314                    0  /* st->out_hlist_search */
 315                 );
 316         return 0;
 317 }
 318
 319 static const struct seq_operations rt_cpu_seq_ops = {
 320         .start  = rt_cpu_seq_start,
 321         .next   = rt_cpu_seq_next,
 322         .stop   = rt_cpu_seq_stop,
 323         .show   = rt_cpu_seq_show,
 324 };
 325
 326
 327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 328 {
 329         return seq_open(file, &rt_cpu_seq_ops);
 330 }
 331
 332 static const struct file_operations rt_cpu_seq_fops = {
 333         .open    = rt_cpu_seq_open,
 334         .read    = seq_read,
 335         .llseek  = seq_lseek,
 336         .release = seq_release,
 337 };
 338
 339 #ifdef CONFIG_IP_ROUTE_CLASSID
 340 static int rt_acct_proc_show(struct seq_file *m, void *v)
 341 {
 342         struct ip_rt_acct *dst, *src;
 343         unsigned int i, j;
 344
 345         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 346         if (!dst)
 347                 return -ENOMEM;
 348
 349         for_each_possible_cpu(i) {
 350                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 351                 for (j = 0; j < 256; j++) {
 352                         dst[j].o_bytes   += src[j].o_bytes;
 353                         dst[j].o_packets += src[j].o_packets;
 354                         dst[j].i_bytes   += src[j].i_bytes;
 355                         dst[j].i_packets += src[j].i_packets;
 356                 }
 357         }
 358
 359         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 360         kfree(dst);
 361         return 0;
 362 }
 363
 364 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 365 {
 366         return single_open(file, rt_acct_proc_show, NULL);
 367 }
 368
 369 static const struct file_operations rt_acct_proc_fops = {
 370         .open           = rt_acct_proc_open,
 371         .read           = seq_read,
 372         .llseek         = seq_lseek,
 373         .release        = single_release,
 374 };
 375 #endif
 376
 377 static int __net_init ip_rt_do_proc_init(struct net *net)
 378 {
 379         struct proc_dir_entry *pde;
 380
 381         pde = proc_create("rt_cache", 0444, net->proc_net,
 382                           &rt_cache_seq_fops);
 383         if (!pde)
 384                 goto err1;
 385
 386         pde = proc_create("rt_cache", 0444,
 387                           net->proc_net_stat, &rt_cpu_seq_fops);
 388         if (!pde)
 389                 goto err2;
 390
 391 #ifdef CONFIG_IP_ROUTE_CLASSID
 392         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 393         if (!pde)
 394                 goto err3;
 395 #endif
 396         return 0;
 397
 398 #ifdef CONFIG_IP_ROUTE_CLASSID
 399 err3:
 400         remove_proc_entry("rt_cache", net->proc_net_stat);
 401 #endif
 402 err2:
 403         remove_proc_entry("rt_cache", net->proc_net);
 404 err1:
 405         return -ENOMEM;
 406 }
 407
 408 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 409 {
 410         remove_proc_entry("rt_cache", net->proc_net_stat);
 411         remove_proc_entry("rt_cache", net->proc_net);
 412 #ifdef CONFIG_IP_ROUTE_CLASSID
 413         remove_proc_entry("rt_acct", net->proc_net);
 414 #endif
 415 }
 416
 417 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 418         .init = ip_rt_do_proc_init,
 419         .exit = ip_rt_do_proc_exit,
 420 };
 421
 422 static int __init ip_rt_proc_init(void)
 423 {
 424         return register_pernet_subsys(&ip_rt_proc_ops);
 425 }
 426
 427 #else
 428 static inline int ip_rt_proc_init(void)
 429 {
 430         return 0;
 431 }
 432 #endif /* CONFIG_PROC_FS */
 433
 434 static inline bool rt_is_expired(const struct rtable *rth)
 435 {
 436         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 437 }
 438
 439 void rt_cache_flush(struct net *net)
 440 {
 441         rt_genid_bump_ipv4(net);
 442 }
 443
 444 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 445                                            struct sk_buff *skb,
 446                                            const void *daddr)
 447 {
 448         struct net_device *dev = dst->dev;
 449         const __be32 *pkey = daddr;
 450         const struct rtable *rt;
 451         struct neighbour *n;
 452
 453         rt = (const struct rtable *) dst;
 454         if (rt->rt_gateway)
 455                 pkey = (const __be32 *) &rt->rt_gateway;
 456         else if (skb)
 457                 pkey = &ip_hdr(skb)->daddr;
 458
 459         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 460         if (n)
 461                 return n;
 462         return neigh_create(&arp_tbl, pkey, dev);
 463 }
 464
 465 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 466 {
 467         struct net_device *dev = dst->dev;
 468         const __be32 *pkey = daddr;
 469         const struct rtable *rt;
 470
 471         rt = (const struct rtable *)dst;
 472         if (rt->rt_gateway)
 473                 pkey = (const __be32 *)&rt->rt_gateway;
 474         else if (!daddr ||
 475                  (rt->rt_flags &
 476                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
 477                 return;
 478
 479         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 480 }
 481
 482 #define IP_IDENTS_SZ 2048u
 483
 484 static atomic_t *ip_idents __read_mostly;
 485 static u32 *ip_tstamps __read_mostly;
 486
 487 /* In order to protect privacy, we add a perturbation to identifiers
 488  * if one generator is seldom used. This makes hard for an attacker
 489  * to infer how many packets were sent between two points in time.
 490  */
 491 u32 ip_idents_reserve(u32 hash, int segs)
 492 {
 493         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 494         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 495         u32 old = READ_ONCE(*p_tstamp);
 496         u32 now = (u32)jiffies;
 497         u32 new, delta = 0;
 498
 499         if (old != now && cmpxchg(p_tstamp, old, now) == old)
 500                 delta = prandom_u32_max(now - old);
 501
 502         /* Do not use atomic_add_return() as it makes UBSAN unhappy */
 503         do {
 504                 old = (u32)atomic_read(p_id);
 505                 new = old + delta + segs;
 506         } while (atomic_cmpxchg(p_id, old, new) != old);
 507
 508         return new - segs;
 509 }
 510 EXPORT_SYMBOL(ip_idents_reserve);
 511
 512 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 513 {
 514         static u32 ip_idents_hashrnd __read_mostly;
 515         u32 hash, id;
 516
 517         net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
 518
 519         hash = jhash_3words((__force u32)iph->daddr,
 520                             (__force u32)iph->saddr,
 521                             iph->protocol ^ net_hash_mix(net),
 522                             ip_idents_hashrnd);
 523         id = ip_idents_reserve(hash, segs);
 524         iph->id = htons(id);
 525 }
 526 EXPORT_SYMBOL(__ip_select_ident);
 527
 528 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 529                              const struct sock *sk,
 530                              const struct iphdr *iph,
 531                              int oif, u8 tos,
 532                              u8 prot, u32 mark, int flow_flags)
 533 {
 534         if (sk) {
 535                 const struct inet_sock *inet = inet_sk(sk);
 536
 537                 oif = sk->sk_bound_dev_if;
 538                 mark = sk->sk_mark;
 539                 tos = RT_CONN_FLAGS(sk);
 540                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 541         }
 542         flowi4_init_output(fl4, oif, mark, tos,
 543                            RT_SCOPE_UNIVERSE, prot,
 544                            flow_flags,
 545                            iph->daddr, iph->saddr, 0, 0,
 546                            sock_net_uid(net, sk));
 547 }
 548
 549 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 550                                const struct sock *sk)
 551 {
 552         const struct net *net = dev_net(skb->dev);
 553         const struct iphdr *iph = ip_hdr(skb);
 554         int oif = skb->dev->ifindex;
 555         u8 tos = RT_TOS(iph->tos);
 556         u8 prot = iph->protocol;
 557         u32 mark = skb->mark;
 558
 559         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 560 }
 561
 562 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 563 {
 564         const struct inet_sock *inet = inet_sk(sk);
 565         const struct ip_options_rcu *inet_opt;
 566         __be32 daddr = inet->inet_daddr;
 567
 568         rcu_read_lock();
 569         inet_opt = rcu_dereference(inet->inet_opt);
 570         if (inet_opt && inet_opt->opt.srr)
 571                 daddr = inet_opt->opt.faddr;
 572         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 573                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 574                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 575                            inet_sk_flowi_flags(sk),
 576                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 577         rcu_read_unlock();
 578 }
 579
 580 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 581                                  const struct sk_buff *skb)
 582 {
 583         if (skb)
 584                 build_skb_flow_key(fl4, skb, sk);
 585         else
 586                 build_sk_flow_key(fl4, sk);
 587 }
 588
 589 static DEFINE_SPINLOCK(fnhe_lock);
 590
 591 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 592 {
 593         struct rtable *rt;
 594
 595         rt = rcu_dereference(fnhe->fnhe_rth_input);
 596         if (rt) {
 597                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 598                 dst_dev_put(&rt->dst);
 599                 dst_release(&rt->dst);
 600         }
 601         rt = rcu_dereference(fnhe->fnhe_rth_output);
 602         if (rt) {
 603                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 604                 dst_dev_put(&rt->dst);
 605                 dst_release(&rt->dst);
 606         }
 607 }
 608
 609 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 610 {
 611         struct fib_nh_exception *fnhe, *oldest;
 612
 613         oldest = rcu_dereference(hash->chain);
 614         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 615              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 616                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 617                         oldest = fnhe;
 618         }
 619         fnhe_flush_routes(oldest);
 620         return oldest;
 621 }
 622
 623 static inline u32 fnhe_hashfun(__be32 daddr)
 624 {
 625         static u32 fnhe_hashrnd __read_mostly;
 626         u32 hval;
 627
 628         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 629         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 630         return hash_32(hval, FNHE_HASH_SHIFT);
 631 }
 632
 633 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 634 {
 635         rt->rt_pmtu = fnhe->fnhe_pmtu;
 636         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 637         rt->dst.expires = fnhe->fnhe_expires;
 638
 639         if (fnhe->fnhe_gw) {
 640                 rt->rt_flags |= RTCF_REDIRECTED;
 641                 rt->rt_gateway = fnhe->fnhe_gw;
 642                 rt->rt_uses_gateway = 1;
 643         }
 644 }
 645
 646 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 647                                   u32 pmtu, bool lock, unsigned long expires)
 648 {
 649         struct fnhe_hash_bucket *hash;
 650         struct fib_nh_exception *fnhe;
 651         struct rtable *rt;
 652         u32 genid, hval;
 653         unsigned int i;
 654         int depth;
 655
 656         genid = fnhe_genid(dev_net(nh->nh_dev));
 657         hval = fnhe_hashfun(daddr);
 658
 659         spin_lock_bh(&fnhe_lock);
 660
 661         hash = rcu_dereference(nh->nh_exceptions);
 662         if (!hash) {
 663                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 664                 if (!hash)
 665                         goto out_unlock;
 666                 rcu_assign_pointer(nh->nh_exceptions, hash);
 667         }
 668
 669         hash += hval;
 670
 671         depth = 0;
 672         for (fnhe = rcu_dereference(hash->chain); fnhe;
 673              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 674                 if (fnhe->fnhe_daddr == daddr)
 675                         break;
 676                 depth++;
 677         }
 678
 679         if (fnhe) {
 680                 if (fnhe->fnhe_genid != genid)
 681                         fnhe->fnhe_genid = genid;
 682                 if (gw)
 683                         fnhe->fnhe_gw = gw;
 684                 if (pmtu) {
 685                         fnhe->fnhe_pmtu = pmtu;
 686                         fnhe->fnhe_mtu_locked = lock;
 687                 }
 688                 fnhe->fnhe_expires = max(1UL, expires);
 689                 /* Update all cached dsts too */
 690                 rt = rcu_dereference(fnhe->fnhe_rth_input);
 691                 if (rt)
 692                         fill_route_from_fnhe(rt, fnhe);
 693                 rt = rcu_dereference(fnhe->fnhe_rth_output);
 694                 if (rt)
 695                         fill_route_from_fnhe(rt, fnhe);
 696         } else {
 697                 if (depth > FNHE_RECLAIM_DEPTH)
 698                         fnhe = fnhe_oldest(hash);
 699                 else {
 700                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 701                         if (!fnhe)
 702                                 goto out_unlock;
 703
 704                         fnhe->fnhe_next = hash->chain;
 705                         rcu_assign_pointer(hash->chain, fnhe);
 706                 }
 707                 fnhe->fnhe_genid = genid;
 708                 fnhe->fnhe_daddr = daddr;
 709                 fnhe->fnhe_gw = gw;
 710                 fnhe->fnhe_pmtu = pmtu;
 711                 fnhe->fnhe_mtu_locked = lock;
 712                 fnhe->fnhe_expires = max(1UL, expires);
 713
 714                 /* Exception created; mark the cached routes for the nexthop
 715                  * stale, so anyone caching it rechecks if this exception
 716                  * applies to them.
 717                  */
 718                 rt = rcu_dereference(nh->nh_rth_input);
 719                 if (rt)
 720                         rt->dst.obsolete = DST_OBSOLETE_KILL;
 721
 722                 for_each_possible_cpu(i) {
 723                         struct rtable __rcu **prt;
 724                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 725                         rt = rcu_dereference(*prt);
 726                         if (rt)
 727                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 728                 }
 729         }
 730
 731         fnhe->fnhe_stamp = jiffies;
 732
 733 out_unlock:
 734         spin_unlock_bh(&fnhe_lock);
 735 }
 736
 737 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 738                              bool kill_route)
 739 {
 740         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 741         __be32 old_gw = ip_hdr(skb)->saddr;
 742         struct net_device *dev = skb->dev;
 743         struct in_device *in_dev;
 744         struct fib_result res;
 745         struct neighbour *n;
 746         struct net *net;
 747
 748         switch (icmp_hdr(skb)->code & 7) {
 749         case ICMP_REDIR_NET:
 750         case ICMP_REDIR_NETTOS:
 751         case ICMP_REDIR_HOST:
 752         case ICMP_REDIR_HOSTTOS:
 753                 break;
 754
 755         default:
 756                 return;
 757         }
 758
 759         if (rt->rt_gateway != old_gw)
 760                 return;
 761
 762         in_dev = __in_dev_get_rcu(dev);
 763         if (!in_dev)
 764                 return;
 765
 766         net = dev_net(dev);
 767         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 768             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 769             ipv4_is_zeronet(new_gw))
 770                 goto reject_redirect;
 771
 772         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 773                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 774                         goto reject_redirect;
 775                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 776                         goto reject_redirect;
 777         } else {
 778                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 779                         goto reject_redirect;
 780         }
 781
 782         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 783         if (!n)
 784                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 785         if (!IS_ERR(n)) {
 786                 if (!(n->nud_state & NUD_VALID)) {
 787                         neigh_event_send(n, NULL);
 788                 } else {
 789                         if (fib_lookup(net, fl4, &res, 0) == 0) {
 790                                 struct fib_nh *nh = &FIB_RES_NH(res);
 791
 792                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 793                                                 0, false,
 794                                                 jiffies + ip_rt_gc_timeout);
 795                         }
 796                         if (kill_route)
 797                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 798                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 799                 }
 800                 neigh_release(n);
 801         }
 802         return;
 803
 804 reject_redirect:
 805 #ifdef CONFIG_IP_ROUTE_VERBOSE
 806         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 807                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 808                 __be32 daddr = iph->daddr;
 809                 __be32 saddr = iph->saddr;
 810
 811                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 812                                      "  Advised path = %pI4 -> %pI4\n",
 813                                      &old_gw, dev->name, &new_gw,
 814                                      &saddr, &daddr);
 815         }
 816 #endif
 817         ;
 818 }
 819
 820 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 821 {
 822         struct rtable *rt;
 823         struct flowi4 fl4;
 824         const struct iphdr *iph = (const struct iphdr *) skb->data;
 825         struct net *net = dev_net(skb->dev);
 826         int oif = skb->dev->ifindex;
 827         u8 tos = RT_TOS(iph->tos);
 828         u8 prot = iph->protocol;
 829         u32 mark = skb->mark;
 830
 831         rt = (struct rtable *) dst;
 832
 833         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 834         __ip_do_redirect(rt, skb, &fl4, true);
 835 }
 836
 837 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 838 {
 839         struct rtable *rt = (struct rtable *)dst;
 840         struct dst_entry *ret = dst;
 841
 842         if (rt) {
 843                 if (dst->obsolete > 0) {
 844                         ip_rt_put(rt);
 845                         ret = NULL;
 846                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 847                            rt->dst.expires) {
 848                         ip_rt_put(rt);
 849                         ret = NULL;
 850                 }
 851         }
 852         return ret;
 853 }
 854
 855 /*
 856  * Algorithm:
 857  *      1. The first ip_rt_redirect_number redirects are sent
 858  *         with exponential backoff, then we stop sending them at all,
 859  *         assuming that the host ignores our redirects.
 860  *      2. If we did not see packets requiring redirects
 861  *         during ip_rt_redirect_silence, we assume that the host
 862  *         forgot redirected route and start to send redirects again.
 863  *
 864  * This algorithm is much cheaper and more intelligent than dumb load limiting
 865  * in icmp.c.
 866  *
 867  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 868  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 869  */
 870
 871 void ip_rt_send_redirect(struct sk_buff *skb)
 872 {
 873         struct rtable *rt = skb_rtable(skb);
 874         struct in_device *in_dev;
 875         struct inet_peer *peer;
 876         struct net *net;
 877         int log_martians;
 878         int vif;
 879
 880         rcu_read_lock();
 881         in_dev = __in_dev_get_rcu(rt->dst.dev);
 882         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 883                 rcu_read_unlock();
 884                 return;
 885         }
 886         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 887         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 888         rcu_read_unlock();
 889
 890         net = dev_net(rt->dst.dev);
 891         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 892         if (!peer) {
 893                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 894                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 895                 return;
 896         }
 897
 898         /* No redirected packets during ip_rt_redirect_silence;
 899          * reset the algorithm.
 900          */
 901         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 902                 peer->rate_tokens = 0;
 903
 904         /* Too many ignored redirects; do not send anything
 905          * set dst.rate_last to the last seen redirected packet.
 906          */
 907         if (peer->rate_tokens >= ip_rt_redirect_number) {
 908                 peer->rate_last = jiffies;
 909                 goto out_put_peer;
 910         }
 911
 912         /* Check for load limit; set rate_last to the latest sent
 913          * redirect.
 914          */
 915         if (peer->rate_tokens == 0 ||
 916             time_after(jiffies,
 917                        (peer->rate_last +
 918                         (ip_rt_redirect_load << peer->rate_tokens)))) {
 919                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 920
 921                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 922                 peer->rate_last = jiffies;
 923                 ++peer->rate_tokens;
 924 #ifdef CONFIG_IP_ROUTE_VERBOSE
 925                 if (log_martians &&
 926                     peer->rate_tokens == ip_rt_redirect_number)
 927                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 928                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 929                                              &ip_hdr(skb)->daddr, &gw);
 930 #endif
 931         }
 932 out_put_peer:
 933         inet_putpeer(peer);
 934 }
 935
 936 static int ip_error(struct sk_buff *skb)
 937 {
 938         struct rtable *rt = skb_rtable(skb);
 939         struct net_device *dev = skb->dev;
 940         struct in_device *in_dev;
 941         struct inet_peer *peer;
 942         unsigned long now;
 943         struct net *net;
 944         bool send;
 945         int code;
 946
 947         if (netif_is_l3_master(skb->dev)) {
 948                 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
 949                 if (!dev)
 950                         goto out;
 951         }
 952
 953         in_dev = __in_dev_get_rcu(dev);
 954
 955         /* IP on this device is disabled. */
 956         if (!in_dev)
 957                 goto out;
 958
 959         net = dev_net(rt->dst.dev);
 960         if (!IN_DEV_FORWARD(in_dev)) {
 961                 switch (rt->dst.error) {
 962                 case EHOSTUNREACH:
 963                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 964                         break;
 965
 966                 case ENETUNREACH:
 967                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 968                         break;
 969                 }
 970                 goto out;
 971         }
 972
 973         switch (rt->dst.error) {
 974         case EINVAL:
 975         default:
 976                 goto out;
 977         case EHOSTUNREACH:
 978                 code = ICMP_HOST_UNREACH;
 979                 break;
 980         case ENETUNREACH:
 981                 code = ICMP_NET_UNREACH;
 982                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 983                 break;
 984         case EACCES:
 985                 code = ICMP_PKT_FILTERED;
 986                 break;
 987         }
 988
 989         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 990                                l3mdev_master_ifindex(skb->dev), 1);
 991
 992         send = true;
 993         if (peer) {
 994                 now = jiffies;
 995                 peer->rate_tokens += now - peer->rate_last;
 996                 if (peer->rate_tokens > ip_rt_error_burst)
 997                         peer->rate_tokens = ip_rt_error_burst;
 998                 peer->rate_last = now;
 999                 if (peer->rate_tokens >= ip_rt_error_cost)
1000                         peer->rate_tokens -= ip_rt_error_cost;
1001                 else
1002                         send = false;
1003                 inet_putpeer(peer);
1004         }
1005         if (send)
1006                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1007
1008 out:    kfree_skb(skb);
1009         return 0;
1010 }
1011
1012 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1013 {
1014         struct dst_entry *dst = &rt->dst;
1015         struct fib_result res;
1016         bool lock = false;
1017
1018         if (ip_mtu_locked(dst))
1019                 return;
1020
1021         if (ipv4_mtu(dst) < mtu)
1022                 return;
1023
1024         if (mtu < ip_rt_min_pmtu) {
1025                 lock = true;
1026                 mtu = ip_rt_min_pmtu;
1027         }
1028
1029         if (rt->rt_pmtu == mtu &&
1030             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1031                 return;
1032
1033         rcu_read_lock();
1034         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1035                 struct fib_nh *nh = &FIB_RES_NH(res);
1036
1037                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1038                                       jiffies + ip_rt_mtu_expires);
1039         }
1040         rcu_read_unlock();
1041 }
1042
1043 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1044                               struct sk_buff *skb, u32 mtu)
1045 {
1046         struct rtable *rt = (struct rtable *) dst;
1047         struct flowi4 fl4;
1048
1049         ip_rt_build_flow_key(&fl4, sk, skb);
1050         __ip_rt_update_pmtu(rt, &fl4, mtu);
1051 }
1052
1053 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1054                       int oif, u32 mark, u8 protocol, int flow_flags)
1055 {
1056         const struct iphdr *iph = (const struct iphdr *) skb->data;
1057         struct flowi4 fl4;
1058         struct rtable *rt;
1059
1060         if (!mark)
1061                 mark = IP4_REPLY_MARK(net, skb->mark);
1062
1063         __build_flow_key(net, &fl4, NULL, iph, oif,
1064                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1065         rt = __ip_route_output_key(net, &fl4);
1066         if (!IS_ERR(rt)) {
1067                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1068                 ip_rt_put(rt);
1069         }
1070 }
1071 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1072
1073 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1074 {
1075         const struct iphdr *iph = (const struct iphdr *) skb->data;
1076         struct flowi4 fl4;
1077         struct rtable *rt;
1078
1079         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1080
1081         if (!fl4.flowi4_mark)
1082                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1083
1084         rt = __ip_route_output_key(sock_net(sk), &fl4);
1085         if (!IS_ERR(rt)) {
1086                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1087                 ip_rt_put(rt);
1088         }
1089 }
1090
1091 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1092 {
1093         const struct iphdr *iph = (const struct iphdr *) skb->data;
1094         struct flowi4 fl4;
1095         struct rtable *rt;
1096         struct dst_entry *odst = NULL;
1097         bool new = false;
1098         struct net *net = sock_net(sk);
1099
1100         bh_lock_sock(sk);
1101
1102         if (!ip_sk_accept_pmtu(sk))
1103                 goto out;
1104
1105         odst = sk_dst_get(sk);
1106
1107         if (sock_owned_by_user(sk) || !odst) {
1108                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1109                 goto out;
1110         }
1111
1112         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1113
1114         rt = (struct rtable *)odst;
1115         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1116                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1117                 if (IS_ERR(rt))
1118                         goto out;
1119
1120                 new = true;
1121         }
1122
1123         __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1124
1125         if (!dst_check(&rt->dst, 0)) {
1126                 if (new)
1127                         dst_release(&rt->dst);
1128
1129                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1130                 if (IS_ERR(rt))
1131                         goto out;
1132
1133                 new = true;
1134         }
1135
1136         if (new)
1137                 sk_dst_set(sk, &rt->dst);
1138
1139 out:
1140         bh_unlock_sock(sk);
1141         dst_release(odst);
1142 }
1143 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1144
1145 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1146                    int oif, u32 mark, u8 protocol, int flow_flags)
1147 {
1148         const struct iphdr *iph = (const struct iphdr *) skb->data;
1149         struct flowi4 fl4;
1150         struct rtable *rt;
1151
1152         __build_flow_key(net, &fl4, NULL, iph, oif,
1153                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1154         rt = __ip_route_output_key(net, &fl4);
1155         if (!IS_ERR(rt)) {
1156                 __ip_do_redirect(rt, skb, &fl4, false);
1157                 ip_rt_put(rt);
1158         }
1159 }
1160 EXPORT_SYMBOL_GPL(ipv4_redirect);
1161
1162 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1163 {
1164         const struct iphdr *iph = (const struct iphdr *) skb->data;
1165         struct flowi4 fl4;
1166         struct rtable *rt;
1167         struct net *net = sock_net(sk);
1168
1169         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1170         rt = __ip_route_output_key(net, &fl4);
1171         if (!IS_ERR(rt)) {
1172                 __ip_do_redirect(rt, skb, &fl4, false);
1173                 ip_rt_put(rt);
1174         }
1175 }
1176 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1177
1178 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1179 {
1180         struct rtable *rt = (struct rtable *) dst;
1181
1182         /* All IPV4 dsts are created with ->obsolete set to the value
1183          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1184          * into this function always.
1185          *
1186          * When a PMTU/redirect information update invalidates a route,
1187          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1188          * DST_OBSOLETE_DEAD by dst_free().
1189          */
1190         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1191                 return NULL;
1192         return dst;
1193 }
1194
1195 static void ipv4_link_failure(struct sk_buff *skb)
1196 {
1197         struct rtable *rt;
1198
1199         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1200
1201         rt = skb_rtable(skb);
1202         if (rt)
1203                 dst_set_expires(&rt->dst, 0);
1204 }
1205
1206 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1207 {
1208         pr_debug("%s: %pI4 -> %pI4, %s\n",
1209                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1210                  skb->dev ? skb->dev->name : "?");
1211         kfree_skb(skb);
1212         WARN_ON(1);
1213         return 0;
1214 }
1215
1216 /*
1217    We do not cache source address of outgoing interface,
1218    because it is used only by IP RR, TS and SRR options,
1219    so that it out of fast path.
1220
1221    BTW remember: "addr" is allowed to be not aligned
1222    in IP options!
1223  */
1224
1225 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1226 {
1227         __be32 src;
1228
1229         if (rt_is_output_route(rt))
1230                 src = ip_hdr(skb)->saddr;
1231         else {
1232                 struct fib_result res;
1233                 struct flowi4 fl4;
1234                 struct iphdr *iph;
1235
1236                 iph = ip_hdr(skb);
1237
1238                 memset(&fl4, 0, sizeof(fl4));
1239                 fl4.daddr = iph->daddr;
1240                 fl4.saddr = iph->saddr;
1241                 fl4.flowi4_tos = RT_TOS(iph->tos);
1242                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1243                 fl4.flowi4_iif = skb->dev->ifindex;
1244                 fl4.flowi4_mark = skb->mark;
1245
1246                 rcu_read_lock();
1247                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1248                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1249                 else
1250                         src = inet_select_addr(rt->dst.dev,
1251                                                rt_nexthop(rt, iph->daddr),
1252                                                RT_SCOPE_UNIVERSE);
1253                 rcu_read_unlock();
1254         }
1255         memcpy(addr, &src, 4);
1256 }
1257
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid.  The low and the high 16 bits are
 * handled independently, and a half is only filled in from @tag when it
 * is still entirely zero in the route.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->dst.tclassid & low_half) == 0)
		rt->dst.tclassid |= tag & low_half;

	if ((rt->dst.tclassid & high_half) == 0)
		rt->dst.tclassid |= tag & high_half;
}
#endif
1267
1268 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1269 {
1270         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1271         unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1272                                     ip_rt_min_advmss);
1273
1274         return min(advmss, IPV4_MAX_PMTU - header_size);
1275 }
1276
1277 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1278 {
1279         const struct rtable *rt = (const struct rtable *) dst;
1280         unsigned int mtu = rt->rt_pmtu;
1281
1282         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1283                 mtu = dst_metric_raw(dst, RTAX_MTU);
1284
1285         if (mtu)
1286                 return mtu;
1287
1288         mtu = READ_ONCE(dst->dev->mtu);
1289
1290         if (unlikely(ip_mtu_locked(dst))) {
1291                 if (rt->rt_uses_gateway && mtu > 576)
1292                         mtu = 576;
1293         }
1294
1295         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1296
1297         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1298 }
1299
1300 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1301 {
1302         struct fnhe_hash_bucket *hash;
1303         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1304         u32 hval = fnhe_hashfun(daddr);
1305
1306         spin_lock_bh(&fnhe_lock);
1307
1308         hash = rcu_dereference_protected(nh->nh_exceptions,
1309                                          lockdep_is_held(&fnhe_lock));
1310         hash += hval;
1311
1312         fnhe_p = &hash->chain;
1313         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1314         while (fnhe) {
1315                 if (fnhe->fnhe_daddr == daddr) {
1316                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1317                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1318                         fnhe_flush_routes(fnhe);
1319                         kfree_rcu(fnhe, rcu);
1320                         break;
1321                 }
1322                 fnhe_p = &fnhe->fnhe_next;
1323                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1324                                                  lockdep_is_held(&fnhe_lock));
1325         }
1326
1327         spin_unlock_bh(&fnhe_lock);
1328 }
1329
1330 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1331 {
1332         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1333         struct fib_nh_exception *fnhe;
1334         u32 hval;
1335
1336         if (!hash)
1337                 return NULL;
1338
1339         hval = fnhe_hashfun(daddr);
1340
1341         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1342              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1343                 if (fnhe->fnhe_daddr == daddr) {
1344                         if (fnhe->fnhe_expires &&
1345                             time_after(jiffies, fnhe->fnhe_expires)) {
1346                                 ip_del_fnhe(nh, daddr);
1347                                 break;
1348                         }
1349                         return fnhe;
1350                 }
1351         }
1352         return NULL;
1353 }
1354
1355 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1356                               __be32 daddr, const bool do_cache)
1357 {
1358         bool ret = false;
1359
1360         spin_lock_bh(&fnhe_lock);
1361
1362         if (daddr == fnhe->fnhe_daddr) {
1363                 struct rtable __rcu **porig;
1364                 struct rtable *orig;
1365                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1366
1367                 if (rt_is_input_route(rt))
1368                         porig = &fnhe->fnhe_rth_input;
1369                 else
1370                         porig = &fnhe->fnhe_rth_output;
1371                 orig = rcu_dereference(*porig);
1372
1373                 if (fnhe->fnhe_genid != genid) {
1374                         fnhe->fnhe_genid = genid;
1375                         fnhe->fnhe_gw = 0;
1376                         fnhe->fnhe_pmtu = 0;
1377                         fnhe->fnhe_expires = 0;
1378                         fnhe_flush_routes(fnhe);
1379                         orig = NULL;
1380                 }
1381                 fill_route_from_fnhe(rt, fnhe);
1382                 if (!rt->rt_gateway)
1383                         rt->rt_gateway = daddr;
1384
1385                 if (do_cache) {
1386                         dst_hold(&rt->dst);
1387                         rcu_assign_pointer(*porig, rt);
1388                         if (orig) {
1389                                 dst_dev_put(&orig->dst);
1390                                 dst_release(&orig->dst);
1391                         }
1392                         ret = true;
1393                 }
1394
1395                 fnhe->fnhe_stamp = jiffies;
1396         }
1397         spin_unlock_bh(&fnhe_lock);
1398
1399         return ret;
1400 }
1401
1402 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1403 {
1404         struct rtable *orig, *prev, **p;
1405         bool ret = true;
1406
1407         if (rt_is_input_route(rt)) {
1408                 p = (struct rtable **)&nh->nh_rth_input;
1409         } else {
1410                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1411         }
1412         orig = *p;
1413
1414         /* hold dst before doing cmpxchg() to avoid race condition
1415          * on this dst
1416          */
1417         dst_hold(&rt->dst);
1418         prev = cmpxchg(p, orig, rt);
1419         if (prev == orig) {
1420                 if (orig) {
1421                         dst_dev_put(&orig->dst);
1422                         dst_release(&orig->dst);
1423                 }
1424         } else {
1425                 dst_release(&rt->dst);
1426                 ret = false;
1427         }
1428
1429         return ret;
1430 }
1431
1432 struct uncached_list {
1433         spinlock_t              lock;
1434         struct list_head        head;
1435 };
1436
1437 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1438
1439 void rt_add_uncached_list(struct rtable *rt)
1440 {
1441         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1442
1443         rt->rt_uncached_list = ul;
1444
1445         spin_lock_bh(&ul->lock);
1446         list_add_tail(&rt->rt_uncached, &ul->head);
1447         spin_unlock_bh(&ul->lock);
1448 }
1449
1450 void rt_del_uncached_list(struct rtable *rt)
1451 {
1452         if (!list_empty(&rt->rt_uncached)) {
1453                 struct uncached_list *ul = rt->rt_uncached_list;
1454
1455                 spin_lock_bh(&ul->lock);
1456                 list_del(&rt->rt_uncached);
1457                 spin_unlock_bh(&ul->lock);
1458         }
1459 }
1460
1461 static void ipv4_dst_destroy(struct dst_entry *dst)
1462 {
1463         struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1464         struct rtable *rt = (struct rtable *)dst;
1465
1466         if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1467                 kfree(p);
1468
1469         rt_del_uncached_list(rt);
1470 }
1471
1472 void rt_flush_dev(struct net_device *dev)
1473 {
1474         struct net *net = dev_net(dev);
1475         struct rtable *rt;
1476         int cpu;
1477
1478         for_each_possible_cpu(cpu) {
1479                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1480
1481                 spin_lock_bh(&ul->lock);
1482                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1483                         if (rt->dst.dev != dev)
1484                                 continue;
1485                         rt->dst.dev = net->loopback_dev;
1486                         dev_hold(rt->dst.dev);
1487                         dev_put(dev);
1488                 }
1489                 spin_unlock_bh(&ul->lock);
1490         }
1491 }
1492
1493 static bool rt_cache_valid(const struct rtable *rt)
1494 {
1495         return  rt &&
1496                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1497                 !rt_is_expired(rt);
1498 }
1499
1500 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1501                            const struct fib_result *res,
1502                            struct fib_nh_exception *fnhe,
1503                            struct fib_info *fi, u16 type, u32 itag,
1504                            const bool do_cache)
1505 {
1506         bool cached = false;
1507
1508         if (fi) {
1509                 struct fib_nh *nh = &FIB_RES_NH(*res);
1510
1511                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1512                         rt->rt_gateway = nh->nh_gw;
1513                         rt->rt_uses_gateway = 1;
1514                 }
1515                 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1516                 if (fi->fib_metrics != &dst_default_metrics) {
1517                         rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1518                         refcount_inc(&fi->fib_metrics->refcnt);
1519                 }
1520 #ifdef CONFIG_IP_ROUTE_CLASSID
1521                 rt->dst.tclassid = nh->nh_tclassid;
1522 #endif
1523                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1524                 if (unlikely(fnhe))
1525                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1526                 else if (do_cache)
1527                         cached = rt_cache_route(nh, rt);
1528                 if (unlikely(!cached)) {
1529                         /* Routes we intend to cache in nexthop exception or
1530                          * FIB nexthop have the DST_NOCACHE bit clear.
1531                          * However, if we are unsuccessful at storing this
1532                          * route into the cache we really need to set it.
1533                          */
1534                         if (!rt->rt_gateway)
1535                                 rt->rt_gateway = daddr;
1536                         rt_add_uncached_list(rt);
1537                 }
1538         } else
1539                 rt_add_uncached_list(rt);
1540
1541 #ifdef CONFIG_IP_ROUTE_CLASSID
1542 #ifdef CONFIG_IP_MULTIPLE_TABLES
1543         set_class_tag(rt, res->tclassid);
1544 #endif
1545         set_class_tag(rt, itag);
1546 #endif
1547 }
1548
1549 struct rtable *rt_dst_alloc(struct net_device *dev,
1550                             unsigned int flags, u16 type,
1551                             bool nopolicy, bool noxfrm, bool will_cache)
1552 {
1553         struct rtable *rt;
1554
1555         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1556                        (will_cache ? 0 : DST_HOST) |
1557                        (nopolicy ? DST_NOPOLICY : 0) |
1558                        (noxfrm ? DST_NOXFRM : 0));
1559
1560         if (rt) {
1561                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1562                 rt->rt_flags = flags;
1563                 rt->rt_type = type;
1564                 rt->rt_is_input = 0;
1565                 rt->rt_iif = 0;
1566                 rt->rt_pmtu = 0;
1567                 rt->rt_mtu_locked = 0;
1568                 rt->rt_gateway = 0;
1569                 rt->rt_uses_gateway = 0;
1570                 INIT_LIST_HEAD(&rt->rt_uncached);
1571
1572                 rt->dst.output = ip_output;
1573                 if (flags & RTCF_LOCAL)
1574                         rt->dst.input = ip_local_deliver;
1575         }
1576
1577         return rt;
1578 }
1579 EXPORT_SYMBOL(rt_dst_alloc);
1580
1581 /* called in rcu_read_lock() section */
1582 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1583                           u8 tos, struct net_device *dev,
1584                           struct in_device *in_dev, u32 *itag)
1585 {
1586         int err;
1587
1588         /* Primary sanity checks. */
1589         if (!in_dev)
1590                 return -EINVAL;
1591
1592         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1593             skb->protocol != htons(ETH_P_IP))
1594                 return -EINVAL;
1595
1596         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1597                 return -EINVAL;
1598
1599         if (ipv4_is_zeronet(saddr)) {
1600                 if (!ipv4_is_local_multicast(daddr))
1601                         return -EINVAL;
1602         } else {
1603                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1604                                           in_dev, itag);
1605                 if (err < 0)
1606                         return err;
1607         }
1608         return 0;
1609 }
1610
1611 /* called in rcu_read_lock() section */
1612 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1613                              u8 tos, struct net_device *dev, int our)
1614 {
1615         struct in_device *in_dev = __in_dev_get_rcu(dev);
1616         unsigned int flags = RTCF_MULTICAST;
1617         struct rtable *rth;
1618         u32 itag = 0;
1619         int err;
1620
1621         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1622         if (err)
1623                 return err;
1624
1625         if (our)
1626                 flags |= RTCF_LOCAL;
1627
1628         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1629                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1630         if (!rth)
1631                 return -ENOBUFS;
1632
1633 #ifdef CONFIG_IP_ROUTE_CLASSID
1634         rth->dst.tclassid = itag;
1635 #endif
1636         rth->dst.output = ip_rt_bug;
1637         rth->rt_is_input= 1;
1638
1639 #ifdef CONFIG_IP_MROUTE
1640         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1641                 rth->dst.input = ip_mr_input;
1642 #endif
1643         RT_CACHE_STAT_INC(in_slow_mc);
1644
1645         skb_dst_set(skb, &rth->dst);
1646         return 0;
1647 }
1648
1649
1650 static void ip_handle_martian_source(struct net_device *dev,
1651                                      struct in_device *in_dev,
1652                                      struct sk_buff *skb,
1653                                      __be32 daddr,
1654                                      __be32 saddr)
1655 {
1656         RT_CACHE_STAT_INC(in_martian_src);
1657 #ifdef CONFIG_IP_ROUTE_VERBOSE
1658         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1659                 /*
1660                  *      RFC1812 recommendation, if source is martian,
1661                  *      the only hint is MAC header.
1662                  */
1663                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1664                         &daddr, &saddr, dev->name);
1665                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1666                         print_hex_dump(KERN_WARNING, "ll header: ",
1667                                        DUMP_PREFIX_OFFSET, 16, 1,
1668                                        skb_mac_header(skb),
1669                                        dev->hard_header_len, true);
1670                 }
1671         }
1672 #endif
1673 }
1674
1675 /* Attach the route used to forward an input packet to @skb.
      *
      * Validates the source address with fib_validate_source() and then
      * either reuses a still-valid cached input route (attached without
      * taking a reference, via skb_dst_set_noref()) or allocates a new
      * rtable on the output device whose dst.input handler is ip_forward.
      *
      * Returns 0 on success; -EINVAL if the FIB result's device has no
      * in_device, or if a non-IP packet would go back out of the device it
      * arrived on while IN_DEV_PROXY_ARP_PVLAN is 0; -ENOBUFS if
      * rt_dst_alloc() fails; otherwise the negative value returned by
      * fib_validate_source().
      *
      * called in rcu_read_lock() section
      */
1676 static int __mkroute_input(struct sk_buff *skb,
1677                            const struct fib_result *res,
1678                            struct in_device *in_dev,
1679                            __be32 daddr, __be32 saddr, u32 tos)
1680 {
1681         struct fib_nh_exception *fnhe;
1682         struct rtable *rth;
1683         int err;
1684         struct in_device *out_dev;
1685         bool do_cache;
1686         u32 itag = 0;
1687
1688         /* get a working reference to the output device */
1689         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1690         if (!out_dev) {
1691                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1692                 return -EINVAL;
1693         }
1694
1695         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1696                                   in_dev->dev, in_dev, &itag);
1697         if (err < 0) {
1698                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1699                                          saddr);
1700
1701                 goto cleanup;
1702         }
1703
             /* The route is cacheable only when the result carries a fib_info
              * and source validation left itag at zero.
              */
1704         do_cache = res->fi && !itag;
             /* Ingress and egress device are the same and validation returned
              * a non-zero value: flag the skb with IPSKB_DOREDIRECT when
              * redirects are enabled and the sender is on shared media or
              * on-link with the gateway. NOTE(review): presumably consumed by
              * the forwarding path to emit an ICMP redirect - confirm there.
              */
1705         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1706             skb->protocol == htons(ETH_P_IP) &&
1707             (IN_DEV_SHARED_MEDIA(out_dev) ||
1708              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1709                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1710
1711         if (skb->protocol != htons(ETH_P_IP)) {
1712                 /* Not IP (i.e. ARP). Do not create route, if it is
1713                  * invalid for proxy arp. DNAT routes are always valid.
1714                  *
1715                  * Proxy arp feature have been extended to allow, ARP
1716                  * replies back to the same interface, to support
1717                  * Private VLAN switch technologies. See arp.c.
1718                  */
1719                 if (out_dev == in_dev &&
1720                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1721                         err = -EINVAL;
1722                         goto cleanup;
1723                 }
1724         }
1725
             /* A per-destination exception, when present, supplies the cached
              * input route in preference to the nexthop's own cache slot.
              */
1726         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1727         if (do_cache) {
1728                 if (fnhe)
1729                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1730                 else
1731                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1732                 if (rt_cache_valid(rth)) {
1733                         skb_dst_set_noref(skb, &rth->dst);
1734                         goto out;
1735                 }
1736         }
1737
             /* No usable cached entry: allocate a fresh route on the output
              * device.
              */
1738         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1739                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1740                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1741         if (!rth) {
1742                 err = -ENOBUFS;
1743                 goto cleanup;
1744         }
1745
1746         rth->rt_is_input = 1;
1747         RT_CACHE_STAT_INC(in_slow_tot);
1748
1749         rth->dst.input = ip_forward;
1750
1751         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1752                        do_cache);
1753         lwtunnel_set_redirect(&rth->dst);
1754         skb_dst_set(skb, &rth->dst);
             /* Success, whether the route was taken from the cache or newly
              * built.
              */
1755 out:
1756         err = 0;
1757  cleanup:
1758         return err;
1759 }
1760
1761 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1762 /* To make ICMP packets follow the right flow, the multipath hash is
1763  * calculated from the inner IP addresses.
1764  */
1765 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1766                                  struct flow_keys *hash_keys)
1767 {
1768         const struct iphdr *outer_iph = ip_hdr(skb);
1769         const struct iphdr *key_iph = outer_iph;
1770         const struct iphdr *inner_iph;
1771         const struct icmphdr *icmph;
1772         struct iphdr _inner_iph;
1773         struct icmphdr _icmph;
1774
1775         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1776                 goto out;
1777
1778         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1779                 goto out;
1780
1781         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1782                                    &_icmph);
1783         if (!icmph)
1784                 goto out;
1785
1786         if (icmph->type != ICMP_DEST_UNREACH &&
1787             icmph->type != ICMP_REDIRECT &&
1788             icmph->type != ICMP_TIME_EXCEEDED &&
1789             icmph->type != ICMP_PARAMETERPROB)
1790                 goto out;
1791
1792         inner_iph = skb_header_pointer(skb,
1793                                        outer_iph->ihl * 4 + sizeof(_icmph),
1794                                        sizeof(_inner_iph), &_inner_iph);
1795         if (!inner_iph)
1796                 goto out;
1797
1798         key_iph = inner_iph;
1799 out:
1800         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1801         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1802 }
1803
1804 /* if skb is set it will be used and fl4 can be NULL */
1805 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1806                        const struct sk_buff *skb, struct flow_keys *flkeys)
1807 {
1808         struct flow_keys hash_keys;
1809         u32 mhash;
1810
1811         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1812         case 0:
1813                 memset(&hash_keys, 0, sizeof(hash_keys));
1814                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1815                 if (skb) {
1816                         ip_multipath_l3_keys(skb, &hash_keys);
1817                 } else {
1818                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1819                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1820                 }
1821                 break;
1822         case 1:
1823                 /* skb is currently provided only when forwarding */
1824                 if (skb) {
1825                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1826                         struct flow_keys keys;
1827
1828                         /* short-circuit if we already have L4 hash present */
1829                         if (skb->l4_hash)
1830                                 return skb_get_hash_raw(skb) >> 1;
1831
1832                         memset(&hash_keys, 0, sizeof(hash_keys));
1833
1834                         if (!flkeys) {
1835                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1836                                 flkeys = &keys;
1837                         }
1838
1839                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1840                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1841                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1842                         hash_keys.ports.src = flkeys->ports.src;
1843                         hash_keys.ports.dst = flkeys->ports.dst;
1844                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1845                 } else {
1846                         memset(&hash_keys, 0, sizeof(hash_keys));
1847                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1848                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1849                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1850                         hash_keys.ports.src = fl4->fl4_sport;
1851                         hash_keys.ports.dst = fl4->fl4_dport;
1852                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1853                 }
1854                 break;
1855         }
1856         mhash = flow_hash_from_keys(&hash_keys);
1857
1858         return mhash >> 1;
1859 }
1860 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1861
1862 static int ip_mkroute_input(struct sk_buff *skb,
1863                             struct fib_result *res,
1864                             struct in_device *in_dev,
1865                             __be32 daddr, __be32 saddr, u32 tos,
1866                             struct flow_keys *hkeys)
1867 {
1868 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1869         if (res->fi && res->fi->fib_nhs > 1) {
1870                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1871
1872                 fib_select_multipath(res, h);
1873         }
1874 #endif
1875
1876         /* create a routing cache entry */
1877         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1878 }
1879
1880 /*
1881  *      NOTE. We drop all the packets that has local source
1882  *      addresses, because every properly looped back packet
1883  *      must have correct destination already attached by output routine.
1884  *
1885  *      Such approach solves two big problems:
1886  *      1. Not simplex devices are handled properly.
1887  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1888  *      called with rcu_read_lock()
1889  */
1890
1891 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1892                                u8 tos, struct net_device *dev,
1893                                struct fib_result *res)
1894 {
1895         struct in_device *in_dev = __in_dev_get_rcu(dev);
1896         struct flow_keys *flkeys = NULL, _flkeys;
1897         struct net    *net = dev_net(dev);
1898         struct ip_tunnel_info *tun_info;
1899         int             err = -EINVAL;
1900         unsigned int    flags = 0;
1901         u32             itag = 0;
1902         struct rtable   *rth;
1903         struct flowi4   fl4;
1904         bool do_cache;
1905
1906         /* IP on this device is disabled. */
1907
1908         if (!in_dev)
1909                 goto out;
1910
1911         /* Check for the most weird martians, which can be not detected
1912            by fib_lookup.
1913          */
1914
1915         tun_info = skb_tunnel_info(skb);
1916         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1917                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1918         else
1919                 fl4.flowi4_tun_key.tun_id = 0;
1920         skb_dst_drop(skb);
1921
1922         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1923                 goto martian_source;
1924
1925         res->fi = NULL;
1926         res->table = NULL;
1927         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1928                 goto brd_input;
1929
1930         /* Accept zero addresses only to limited broadcast;
1931          * I even do not know to fix it or not. Waiting for complains :-)
1932          */
1933         if (ipv4_is_zeronet(saddr))
1934                 goto martian_source;
1935
1936         if (ipv4_is_zeronet(daddr))
1937                 goto martian_destination;
1938
1939         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1940          * and call it once if daddr or/and saddr are loopback addresses
1941          */
1942         if (ipv4_is_loopback(daddr)) {
1943                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1944                         goto martian_destination;
1945         } else if (ipv4_is_loopback(saddr)) {
1946                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1947                         goto martian_source;
1948         }
1949
1950         /*
1951          *      Now we are ready to route packet.
1952          */
1953         fl4.flowi4_oif = 0;
1954         fl4.flowi4_iif = dev->ifindex;
1955         fl4.flowi4_mark = skb->mark;
1956         fl4.flowi4_tos = tos;
1957         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1958         fl4.flowi4_flags = 0;
1959         fl4.daddr = daddr;
1960         fl4.saddr = saddr;
1961         fl4.flowi4_uid = sock_net_uid(net, NULL);
1962
1963         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys))
1964                 flkeys = &_flkeys;
1965
1966         err = fib_lookup(net, &fl4, res, 0);
1967         if (err != 0) {
1968                 if (!IN_DEV_FORWARD(in_dev))
1969                         err = -EHOSTUNREACH;
1970                 goto no_route;
1971         }
1972
1973         if (res->type == RTN_BROADCAST)
1974                 goto brd_input;
1975
1976         if (res->type == RTN_LOCAL) {
1977                 err = fib_validate_source(skb, saddr, daddr, tos,
1978                                           0, dev, in_dev, &itag);
1979                 if (err < 0)
1980                         goto martian_source;
1981                 goto local_input;
1982         }
1983
1984         if (!IN_DEV_FORWARD(in_dev)) {
1985                 err = -EHOSTUNREACH;
1986                 goto no_route;
1987         }
1988         if (res->type != RTN_UNICAST)
1989                 goto martian_destination;
1990
1991         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
1992 out:    return err;
1993
1994 brd_input:
1995         if (skb->protocol != htons(ETH_P_IP))
1996                 goto e_inval;
1997
1998         if (!ipv4_is_zeronet(saddr)) {
1999                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2000                                           in_dev, &itag);
2001                 if (err < 0)
2002                         goto martian_source;
2003         }
2004         flags |= RTCF_BROADCAST;
2005         res->type = RTN_BROADCAST;
2006         RT_CACHE_STAT_INC(in_brd);
2007
2008 local_input:
2009         do_cache = false;
2010         if (res->fi) {
2011                 if (!itag) {
2012                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2013                         if (rt_cache_valid(rth)) {
2014                                 skb_dst_set_noref(skb, &rth->dst);
2015                                 err = 0;
2016                                 goto out;
2017                         }
2018                         do_cache = true;
2019                 }
2020         }
2021
2022         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2023                            flags | RTCF_LOCAL, res->type,
2024                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2025         if (!rth)
2026                 goto e_nobufs;
2027
2028         rth->dst.output= ip_rt_bug;
2029 #ifdef CONFIG_IP_ROUTE_CLASSID
2030         rth->dst.tclassid = itag;
2031 #endif
2032         rth->rt_is_input = 1;
2033
2034         RT_CACHE_STAT_INC(in_slow_tot);
2035         if (res->type == RTN_UNREACHABLE) {
2036                 rth->dst.input= ip_error;
2037                 rth->dst.error= -err;
2038                 rth->rt_flags   &= ~RTCF_LOCAL;
2039         }
2040
2041         if (do_cache) {
2042                 struct fib_nh *nh = &FIB_RES_NH(*res);
2043
2044                 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2045                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2046                         WARN_ON(rth->dst.input == lwtunnel_input);
2047                         rth->dst.lwtstate->orig_input = rth->dst.input;
2048                         rth->dst.input = lwtunnel_input;
2049                 }
2050
2051                 if (unlikely(!rt_cache_route(nh, rth)))
2052                         rt_add_uncached_list(rth);
2053         }
2054         skb_dst_set(skb, &rth->dst);
2055         err = 0;
2056         goto out;
2057
2058 no_route:
2059         RT_CACHE_STAT_INC(in_no_route);
2060         res->type = RTN_UNREACHABLE;
2061         res->fi = NULL;
2062         res->table = NULL;
2063         goto local_input;
2064
2065         /*
2066          *      Do not cache martian addresses: they should be logged (RFC1812)
2067          */
2068 martian_destination:
2069         RT_CACHE_STAT_INC(in_martian_dst);
2070 #ifdef CONFIG_IP_ROUTE_VERBOSE
2071         if (IN_DEV_LOG_MARTIANS(in_dev))
2072                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2073                                      &daddr, &saddr, dev->name);
2074 #endif
2075
2076 e_inval:
2077         err = -EINVAL;
2078         goto out;
2079
2080 e_nobufs:
2081         err = -ENOBUFS;
2082         goto out;
2083
2084 martian_source:
2085         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2086         goto out;
2087 }
2088
2089 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2090                          u8 tos, struct net_device *dev)
2091 {
2092         struct fib_result res;
2093         int err;
2094
2095         tos &= IPTOS_RT_MASK;
2096         rcu_read_lock();
2097         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2098         rcu_read_unlock();
2099
2100         return err;
2101 }
2102 EXPORT_SYMBOL(ip_route_input_noref);
2103
2104 /* called with rcu_read_lock held */
2105 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2106                        u8 tos, struct net_device *dev, struct fib_result *res)
2107 {
2108         /* Multicast recognition logic is moved from route cache to here.
2109            The problem was that too many Ethernet cards have broken/missing
2110            hardware multicast filters :-( As result the host on multicasting
2111            network acquires a lot of useless route cache entries, sort of
2112            SDR messages from all the world. Now we try to get rid of them.
2113            Really, provided software IP multicast filter is organized
2114            reasonably (at least, hashed), it does not result in a slowdown
2115            comparing with route cache reject entries.
2116            Note, that multicast routers are not affected, because
2117            route cache entry is created eventually.
2118          */
2119         if (ipv4_is_multicast(daddr)) {
2120                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2121                 int our = 0;
2122                 int err = -EINVAL;
2123
2124                 if (in_dev)
2125                         our = ip_check_mc_rcu(in_dev, daddr, saddr,
2126                                               ip_hdr(skb)->protocol);
2127
2128                 /* check l3 master if no match yet */
2129                 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2130                         struct in_device *l3_in_dev;
2131
2132                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2133                         if (l3_in_dev)
2134                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2135                                                       ip_hdr(skb)->protocol);
2136                 }
2137
2138                 if (our
2139 #ifdef CONFIG_IP_MROUTE
2140                         ||
2141                     (!ipv4_is_local_multicast(daddr) &&
2142                      IN_DEV_MFORWARD(in_dev))
2143 #endif
2144                    ) {
2145                         err = ip_route_input_mc(skb, daddr, saddr,
2146                                                 tos, dev, our);
2147                 }
2148                 return err;
2149         }
2150
2151         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2152 }
2153
2154 /* Produce the rtable for an output lookup on @dev_out.
      *
      * Classifies the destination (broadcast, multicast or the type given
      * in @res), decides whether the route may be cached, and returns a
      * still-valid cached route when one exists. Otherwise a new rtable is
      * allocated, its output/input handlers are set for broadcast and
      * multicast, and its nexthop is filled in by rt_set_nexthop().
      * @orig_oif is recorded in the new route's rt_iif.
      *
      * Returns the rtable, ERR_PTR(-EINVAL) for a device without an
      * in_device, a disallowed loopback source or a zeronet destination,
      * or ERR_PTR(-ENOBUFS) when allocation fails.
      *
      * called with rcu_read_lock()
      */
2155 static struct rtable *__mkroute_output(const struct fib_result *res,
2156                                        const struct flowi4 *fl4, int orig_oif,
2157                                        struct net_device *dev_out,
2158                                        unsigned int flags)
2159 {
2160         struct fib_info *fi = res->fi;
2161         struct fib_nh_exception *fnhe;
2162         struct in_device *in_dev;
2163         u16 type = res->type;
2164         struct rtable *rth;
2165         bool do_cache;
2166
2167         in_dev = __in_dev_get_rcu(dev_out);
2168         if (!in_dev)
2169                 return ERR_PTR(-EINVAL);
2170
             /* Unless IN_DEV_ROUTE_LOCALNET is set, a loopback source address
              * is refused on a device that is neither loopback nor an L3
              * master.
              */
2171         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2172                 if (ipv4_is_loopback(fl4->saddr) &&
2173                     !(dev_out->flags & IFF_LOOPBACK) &&
2174                     !netif_is_l3_master(dev_out))
2175                         return ERR_PTR(-EINVAL);
2176
             /* The destination address overrides the type from the lookup. */
2177         if (ipv4_is_lbcast(fl4->daddr))
2178                 type = RTN_BROADCAST;
2179         else if (ipv4_is_multicast(fl4->daddr))
2180                 type = RTN_MULTICAST;
2181         else if (ipv4_is_zeronet(fl4->daddr))
2182                 return ERR_PTR(-EINVAL);
2183
2184         if (dev_out->flags & IFF_LOOPBACK)
2185                 flags |= RTCF_LOCAL;
2186
2187         do_cache = true;
2188         if (type == RTN_BROADCAST) {
2189                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2190                 fi = NULL;
2191         } else if (type == RTN_MULTICAST) {
2192                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
                     /* RTCF_LOCAL stays set only when ip_check_mc_rcu()
                      * reports a match on this device; such routes are not
                      * cached.
                      */
2193                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2194                                      fl4->flowi4_proto))
2195                         flags &= ~RTCF_LOCAL;
2196                 else
2197                         do_cache = false;
2198                 /* If multicast route do not exist use
2199                  * default one, but do not gateway in this case.
2200                  * Yes, it is hack.
2201                  */
2202                 if (fi && res->prefixlen < 4)
2203                         fi = NULL;
2204         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2205                    (orig_oif != dev_out->ifindex)) {
2206                 /* For local routes that require a particular output interface
2207                  * we do not want to cache the result.  Caching the result
2208                  * causes incorrect behaviour when there are multiple source
2209                  * addresses on the interface, the end result being that if the
2210                  * intended recipient is waiting on that interface for the
2211                  * packet he won't receive it because it will be delivered on
2212                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2213                  * be set to the loopback interface as well.
2214                  */
2215                 do_cache = false;
2216         }
2217
2218         fnhe = NULL;
             /* Without a fib_info there is no nexthop to cache against. */
2219         do_cache &= fi != NULL;
2220         if (fi) {
2221                 struct rtable __rcu **prth;
2222                 struct fib_nh *nh = &FIB_RES_NH(*res);
2223
2224                 fnhe = find_exception(nh, fl4->daddr);
2225                 if (!do_cache)
2226                         goto add;
                     /* The cached route is held either by the exception
                      * entry or by the nexthop's per-CPU output slot.
                      */
2227                 if (fnhe) {
2228                         prth = &fnhe->fnhe_rth_output;
2229                 } else {
2230                         if (unlikely(fl4->flowi4_flags &
2231                                      FLOWI_FLAG_KNOWN_NH &&
2232                                      !(nh->nh_gw &&
2233                                        nh->nh_scope == RT_SCOPE_LINK))) {
2234                                 do_cache = false;
2235                                 goto add;
2236                         }
2237                         prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2238                 }
2239                 rth = rcu_dereference(*prth);
                     /* Hand out the cached route only if it is still valid
                      * and dst_hold_safe() succeeds.
                      */
2240                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2241                         return rth;
2242         }
2243
2244 add:
2245         rth = rt_dst_alloc(dev_out, flags, type,
2246                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2247                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2248                            do_cache);
2249         if (!rth)
2250                 return ERR_PTR(-ENOBUFS);
2251
2252         rth->rt_iif = orig_oif;
2253
2254         RT_CACHE_STAT_INC(out_slow_tot);
2255
2256         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                     /* Local broadcast/multicast on a non-loopback device
                      * is sent through ip_mc_output.
                      */
2257                 if (flags & RTCF_LOCAL &&
2258                     !(dev_out->flags & IFF_LOOPBACK)) {
2259                         rth->dst.output = ip_mc_output;
2260                         RT_CACHE_STAT_INC(out_slow_mc);
2261                 }
2262 #ifdef CONFIG_IP_MROUTE
2263                 if (type == RTN_MULTICAST) {
2264                         if (IN_DEV_MFORWARD(in_dev) &&
2265                             !ipv4_is_local_multicast(fl4->daddr)) {
2266                                 rth->dst.input = ip_mr_input;
2267                                 rth->dst.output = ip_mc_output;
2268                         }
2269                 }
2270 #endif
2271         }
2272
2273         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2274         lwtunnel_set_redirect(&rth->dst);
2275
2276         return rth;
2277 }
2278
2279 /*
2280  * Major route resolver routine.
2281  */
2282
2283 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2284                                         const struct sk_buff *skb)
2285 {
2286         __u8 tos = RT_FL_TOS(fl4);
2287         struct fib_result res = {
2288                 .type           = RTN_UNSPEC,
2289                 .fi             = NULL,
2290                 .table          = NULL,
2291                 .tclassid       = 0,
2292         };
2293         struct rtable *rth;
2294
2295         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2296         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2297         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2298                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2299
2300         rcu_read_lock();
2301         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2302         rcu_read_unlock();
2303
2304         return rth;
2305 }
2306 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2307
/*
 * Resolve an output route for the flow described by @fl4.
 *
 * Validates the requested source address and output interface, fills in
 * missing fields of @fl4 (source address, destination, output interface)
 * as the lookup proceeds, consults fib_lookup() unless a shortcut applies,
 * and finally builds the route with __mkroute_output(). @res receives the
 * lookup result; @skb is only passed on to fib_select_path().
 *
 * Returns the rtable or an ERR_PTR(): -EINVAL for an unusable source
 * address, -ENODEV when the output interface does not exist,
 * -ENETUNREACH when it is down or has no in_device, the fib_lookup()
 * error when no fallback applies, or an error from __mkroute_output().
 *
 * NOTE(review): callers are presumably expected to hold rcu_read_lock(),
 * as the _rcu helpers used here suggest - confirm against the callers.
 */
2308 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2309                                             struct fib_result *res,
2310                                             const struct sk_buff *skb)
2311 {
2312         struct net_device *dev_out = NULL;
2313         int orig_oif = fl4->flowi4_oif;
2314         unsigned int flags = 0;
2315         struct rtable *rth;
2316         int err = -ENETUNREACH;
2317
2318         if (fl4->saddr) {
                     /* Every "goto out" in this branch returns -EINVAL. */
2319                 rth = ERR_PTR(-EINVAL);
2320                 if (ipv4_is_multicast(fl4->saddr) ||
2321                     ipv4_is_lbcast(fl4->saddr) ||
2322                     ipv4_is_zeronet(fl4->saddr))
2323                         goto out;
2324
2325                 /* I removed check for oif == dev_out->oif here.
2326                    It was wrong for two reasons:
2327                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2328                       is assigned to multiple interfaces.
2329                    2. Moreover, we are allowed to send packets with saddr
2330                       of another iface. --ANK
2331                  */
2332
2333                 if (fl4->flowi4_oif == 0 &&
2334                     (ipv4_is_multicast(fl4->daddr) ||
2335                      ipv4_is_lbcast(fl4->daddr))) {
2336                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2337                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2338                         if (!dev_out)
2339                                 goto out;
2340
2341                         /* Special hack: user can direct multicasts
2342                            and limited broadcast via necessary interface
2343                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2344                            This hack is not just for fun, it allows
2345                            vic,vat and friends to work.
2346                            They bind socket to loopback, set ttl to zero
2347                            and expect that it will work.
2348                            From the viewpoint of routing cache they are broken,
2349                            because we are not allowed to build multicast path
2350                            with loopback source addr (look, routing cache
2351                            cannot know, that ttl is zero, so that packet
2352                            will not leave this host and route is valid).
2353                            Luckily, this hack is good workaround.
2354                          */
2355
2356                         fl4->flowi4_oif = dev_out->ifindex;
2357                         goto make_route;
2358                 }
2359
2360                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2361                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2362                         if (!__ip_dev_find(net, fl4->saddr, false))
2363                                 goto out;
2364                 }
2365         }
2366
2367
2368         if (fl4->flowi4_oif) {
2369                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2370                 rth = ERR_PTR(-ENODEV);
2371                 if (!dev_out)
2372                         goto out;
2373
2374                 /* RACE: Check return value of inet_select_addr instead. */
2375                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2376                         rth = ERR_PTR(-ENETUNREACH);
2377                         goto out;
2378                 }
                     /* Link-local multicast, limited broadcast and IGMP skip
                      * the FIB lookup and go straight out of the requested
                      * device.
                      */
2379                 if (ipv4_is_local_multicast(fl4->daddr) ||
2380                     ipv4_is_lbcast(fl4->daddr) ||
2381                     fl4->flowi4_proto == IPPROTO_IGMP) {
2382                         if (!fl4->saddr)
2383                                 fl4->saddr = inet_select_addr(dev_out, 0,
2384                                                               RT_SCOPE_LINK);
2385                         goto make_route;
2386                 }
2387                 if (!fl4->saddr) {
2388                         if (ipv4_is_multicast(fl4->daddr))
2389                                 fl4->saddr = inet_select_addr(dev_out, 0,
2390                                                               fl4->flowi4_scope);
2391                         else if (!fl4->daddr)
2392                                 fl4->saddr = inet_select_addr(dev_out, 0,
2393                                                               RT_SCOPE_HOST);
2394                 }
2395         }
2396
             /* No destination given: send to the source address, or to
              * INADDR_LOOPBACK when that is unset too, through the loopback
              * device as a local route.
              */
2397         if (!fl4->daddr) {
2398                 fl4->daddr = fl4->saddr;
2399                 if (!fl4->daddr)
2400                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2401                 dev_out = net->loopback_dev;
2402                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2403                 res->type = RTN_LOCAL;
2404                 flags |= RTCF_LOCAL;
2405                 goto make_route;
2406         }
2407
2408         err = fib_lookup(net, fl4, res, 0);
2409         if (err) {
2410                 res->fi = NULL;
2411                 res->table = NULL;
2412                 if (fl4->flowi4_oif &&
2413                     (ipv4_is_multicast(fl4->daddr) ||
2414                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2415                         /* Apparently, routing tables are wrong. Assume,
2416                            that the destination is on link.
2417
2418                            WHY? DW.
2419                            Because we are allowed to send to iface
2420                            even if it has NO routes and NO assigned
2421                            addresses. When oif is specified, routing
2422                            tables are looked up with only one purpose:
2423                            to catch if destination is gatewayed, rather than
2424                            direct. Moreover, if MSG_DONTROUTE is set,
2425                            we send packet, ignoring both routing tables
2426                            and ifaddr state. --ANK
2427
2428
2429                            We could make it even if oif is unknown,
2430                            likely IPv6, but we do not.
2431                          */
2432
2433                         if (fl4->saddr == 0)
2434                                 fl4->saddr = inet_select_addr(dev_out, 0,
2435                                                               RT_SCOPE_LINK);
2436                         res->type = RTN_UNICAST;
2437                         goto make_route;
2438                 }
2439                 rth = ERR_PTR(err);
2440                 goto out;
2441         }
2442
2443         if (res->type == RTN_LOCAL) {
                     /* Pick a source: the route's preferred source if it has
                      * one, otherwise the destination itself.
                      */
2444                 if (!fl4->saddr) {
2445                         if (res->fi->fib_prefsrc)
2446                                 fl4->saddr = res->fi->fib_prefsrc;
2447                         else
2448                                 fl4->saddr = fl4->daddr;
2449                 }
2450
2451                 /* L3 master device is the loopback for that domain */
2452                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2453                         net->loopback_dev;
2454
2455                 /* make sure orig_oif points to fib result device even
2456                  * though packet rx/tx happens over loopback or l3mdev
2457                  */
2458                 orig_oif = FIB_RES_OIF(*res);
2459
2460                 fl4->flowi4_oif = dev_out->ifindex;
2461                 flags |= RTCF_LOCAL;
2462                 goto make_route;
2463         }
2464
2465         fib_select_path(net, res, fl4, skb);
2466
2467         dev_out = FIB_RES_DEV(*res);
2468         fl4->flowi4_oif = dev_out->ifindex;
2469
2470
2471 make_route:
2472         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2473
2474 out:
2475         return rth;
2476 }
2477
/*
 * .check callback of ipv4_dst_blackhole_ops.
 *
 * Ignores both @dst and @cookie and unconditionally returns NULL.
 */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
        return NULL;
}
2482
2483 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2484 {
2485         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2486
2487         return mtu ? : dst->dev->mtu;
2488 }
2489
/*
 * .update_pmtu callback of ipv4_dst_blackhole_ops.
 *
 * Intentionally empty: a PMTU update on a blackhole route is ignored.
 */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                          struct sk_buff *skb, u32 mtu)
{
}
2494
/*
 * .redirect callback of ipv4_dst_blackhole_ops.
 *
 * Intentionally empty: a redirect on a blackhole route is ignored.
 */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
                                       struct sk_buff *skb)
{
}
2499
/*
 * .cow_metrics callback of ipv4_dst_blackhole_ops.
 *
 * Always returns NULL; neither @dst nor @old is looked at.
 */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
                                          unsigned long old)
{
        return NULL;
}
2505
/*
 * dst_ops handed to dst_alloc() by ipv4_blackhole_route().
 *
 * check, mtu, update_pmtu, redirect and cow_metrics point at the
 * blackhole-specific helpers defined just above this table;
 * default_advmss and neigh_lookup reuse the regular IPv4 helpers.
 * The kmem_cachep member is not set here: ip_rt_init() points it at the
 * same cache as ipv4_dst_ops.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
        .family                 =       AF_INET,
        .check                  =       ipv4_blackhole_dst_check,
        .mtu                    =       ipv4_blackhole_mtu,
        .default_advmss         =       ipv4_default_advmss,
        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
        .redirect               =       ipv4_rt_blackhole_redirect,
        .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
        .neigh_lookup           =       ipv4_neigh_lookup,
};
2516
2517 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2518 {
2519         struct rtable *ort = (struct rtable *) dst_orig;
2520         struct rtable *rt;
2521
2522         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2523         if (rt) {
2524                 struct dst_entry *new = &rt->dst;
2525
2526                 new->__use = 1;
2527                 new->input = dst_discard;
2528                 new->output = dst_discard_out;
2529
2530                 new->dev = net->loopback_dev;
2531                 if (new->dev)
2532                         dev_hold(new->dev);
2533
2534                 rt->rt_is_input = ort->rt_is_input;
2535                 rt->rt_iif = ort->rt_iif;
2536                 rt->rt_pmtu = ort->rt_pmtu;
2537                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2538
2539                 rt->rt_genid = rt_genid_ipv4(net);
2540                 rt->rt_flags = ort->rt_flags;
2541                 rt->rt_type = ort->rt_type;
2542                 rt->rt_gateway = ort->rt_gateway;
2543                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2544
2545                 INIT_LIST_HEAD(&rt->rt_uncached);
2546         }
2547
2548         dst_release(dst_orig);
2549
2550         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2551 }
2552
2553 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2554                                     const struct sock *sk)
2555 {
2556         struct rtable *rt = __ip_route_output_key(net, flp4);
2557
2558         if (IS_ERR(rt))
2559                 return rt;
2560
2561         if (flp4->flowi4_proto)
2562                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2563                                                         flowi4_to_flowi(flp4),
2564                                                         sk, 0);
2565
2566         return rt;
2567 }
2568 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2569
/*
 * Append an RTM_NEWROUTE netlink message to @skb that describes the route
 * attached to @skb itself (skb_rtable(skb)).
 *
 * @net:      network namespace; consulted for the MC_FORWARDING devconf
 * @dst:      destination address, always emitted as RTA_DST
 * @src:      source address, emitted as RTA_SRC only when non-zero
 * @table_id: table id reported in rtm_table and RTA_TABLE
 * @fl4:      flow key; supplies tos, saddr (for RTA_PREFSRC), mark and uid
 * @skb:      message buffer, which also carries the route being described
 * @portid:   netlink port id for the message header
 * @seq:      netlink sequence number for the message header
 *
 * Returns 0 on success, or -EMSGSIZE when the header or any attribute
 * could not be added (the partially built message is cancelled first).
 *
 * called with rcu_read_lock held
 */
static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
                        struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
                        u32 seq)
{
        struct rtable *rt = skb_rtable(skb);
        struct rtmsg *r;
        struct nlmsghdr *nlh;
        unsigned long expires = 0;
        u32 error;
        u32 metrics[RTAX_MAX];

        nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
        if (!nlh)
                return -EMSGSIZE;

        r = nlmsg_data(nlh);
        r->rtm_family    = AF_INET;
        r->rtm_dst_len  = 32;
        r->rtm_src_len  = 0;
        r->rtm_tos      = fl4->flowi4_tos;
        /* Ids of 256 and above are reported as RT_TABLE_COMPAT in the
         * header; the full id always goes out in RTA_TABLE.
         */
        r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
        if (nla_put_u32(skb, RTA_TABLE, table_id))
                goto nla_put_failure;
        r->rtm_type     = rt->rt_type;
        r->rtm_scope    = RT_SCOPE_UNIVERSE;
        r->rtm_protocol = RTPROT_UNSPEC;
        /* Keep only the bits of rt_flags above the low 16 and mark the
         * entry as cloned.
         */
        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
        if (rt->rt_flags & RTCF_NOTIFY)
                r->rtm_flags |= RTM_F_NOTIFY;
        if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
                r->rtm_flags |= RTCF_DOREDIRECT;

        if (nla_put_in_addr(skb, RTA_DST, dst))
                goto nla_put_failure;
        if (src) {
                r->rtm_src_len = 32;
                if (nla_put_in_addr(skb, RTA_SRC, src))
                        goto nla_put_failure;
        }
        if (rt->dst.dev &&
            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
                goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
        if (rt->dst.tclassid &&
            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
                goto nla_put_failure;
#endif
        /* For output routes, report the flow's source address as the
         * preferred source when it differs from the requested @src.
         */
        if (!rt_is_input_route(rt) &&
            fl4->saddr != src) {
                if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
                        goto nla_put_failure;
        }
        if (rt->rt_uses_gateway &&
            nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
                goto nla_put_failure;

        /* Turn the absolute expiry (jiffies) into the time remaining,
         * or 0 if it has already passed.
         */
        expires = rt->dst.expires;
        if (expires) {
                unsigned long now = jiffies;

                if (time_before(now, expires))
                        expires -= now;
                else
                        expires = 0;
        }

        /* Work on a local copy of the metrics so the overrides below do
         * not touch the dst's own metrics.  The per-route PMTU and MTU
         * lock are only reported while the entry has time left to live.
         */
        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
        if (rt->rt_pmtu && expires)
                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
        if (rt->rt_mtu_locked && expires)
                metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
        if (rtnetlink_put_metrics(skb, metrics) < 0)
                goto nla_put_failure;

        if (fl4->flowi4_mark &&
            nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
                goto nla_put_failure;

        if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
            nla_put_u32(skb, RTA_UID,
                        from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
                goto nla_put_failure;

        error = rt->dst.error;

        if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
                if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
                    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
                        int err = ipmr_get_route(net, skb,
                                                 fl4->saddr, fl4->daddr,
                                                 r, portid);

                        /* A zero result returns success right here,
                         * without the cacheinfo attribute and without
                         * nlmsg_end(); a negative result is a failure.
                         * NOTE(review): presumably ipmr_get_route() has
                         * taken over the message in the zero case -
                         * confirm against ipmr.c.
                         */
                        if (err <= 0) {
                                if (err == 0)
                                        return 0;
                                goto nla_put_failure;
                        }
                } else
#endif
                        if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
                                goto nla_put_failure;
        }

        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
                goto nla_put_failure;

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        /* Drop the partially built message from the skb. */
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}
2685
/*
 * Handler for PF_INET RTM_GETROUTE requests (registered in ip_rt_init()).
 *
 * Parses the request attributes, performs a route lookup on a scratch skb
 * and unicasts the answer back to the requesting netlink port:
 *   - with RTA_IIF set, an input route lookup is done as if a packet had
 *     arrived on that device;
 *   - otherwise an output route lookup is done for the flow.
 * With RTM_F_FIB_MATCH the reply describes the matching FIB entry
 * (fib_dump_info()), otherwise the resolved route (rt_fill_info()).
 *
 * Returns the result of rtnl_unicast() on success, or a negative errno
 * from parsing, allocation, device lookup, route lookup or message
 * construction.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                             struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(in_skb->sk);
        struct rtmsg *rtm;
        struct nlattr *tb[RTA_MAX+1];
        struct fib_result res = {};
        struct rtable *rt = NULL;
        struct flowi4 fl4;
        __be32 dst = 0;
        __be32 src = 0;
        u32 iif;
        int err;
        int mark;
        struct sk_buff *skb;
        u32 table_id = RT_TABLE_MAIN;
        kuid_t uid;

        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
                          extack);
        if (err < 0)
                goto errout;

        rtm = nlmsg_data(nlh);

        /* This skb is both the dummy packet for the lookup and the
         * buffer the reply message is built in.
         */
        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb) {
                err = -ENOBUFS;
                goto errout;
        }

        /* Reserve room for dummy headers, this skb can pass
           through good chunk of routing engine.
         */
        skb_reset_mac_header(skb);
        skb_reset_network_header(skb);

        /* Absent attributes default to 0. */
        src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
        dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
        /* Without an explicit RTA_UID, input lookups use INVALID_UID and
         * output lookups use the caller's uid.
         */
        if (tb[RTA_UID])
                uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
        else
                uid = (iif ? INVALID_UID : current_uid());

        /* Bugfix: need to give ip_route_input enough of an IP header to
         * not gag.
         */
        ip_hdr(skb)->protocol = IPPROTO_UDP;
        ip_hdr(skb)->saddr = src;
        ip_hdr(skb)->daddr = dst;

        /* The dummy header was written where the network header was reset
         * above; now move the data pointer past it so the netlink reply
         * is built behind it.
         */
        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

        memset(&fl4, 0, sizeof(fl4));
        fl4.daddr = dst;
        fl4.saddr = src;
        fl4.flowi4_tos = rtm->rtm_tos;
        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
        fl4.flowi4_mark = mark;
        fl4.flowi4_uid = uid;

        /* Held across the lookup and the message construction. */
        rcu_read_lock();

        if (iif) {
                struct net_device *dev;

                dev = dev_get_by_index_rcu(net, iif);
                if (!dev) {
                        err = -ENODEV;
                        goto errout_free;
                }

                skb->protocol   = htons(ETH_P_IP);
                skb->dev        = dev;
                skb->mark       = mark;
                err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
                                         dev, &res);

                /* A successful lookup may still yield an error route;
                 * report its dst.error, negated, as the result.
                 */
                rt = skb_rtable(skb);
                if (err == 0 && rt->dst.error)
                        err = -rt->dst.error;
        } else {
                fl4.flowi4_iif = LOOPBACK_IFINDEX;
                rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
                err = 0;
                if (IS_ERR(rt))
                        err = PTR_ERR(rt);
                else
                        skb_dst_set(skb, &rt->dst);
        }

        if (err)
                goto errout_free;

        if (rtm->rtm_flags & RTM_F_NOTIFY)
                rt->rt_flags |= RTCF_NOTIFY;

        /* On request, report the table the lookup matched in (0 if the
         * result has none) instead of the RT_TABLE_MAIN default.
         */
        if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
                table_id = res.table ? res.table->tb_id : 0;

        if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
                /* No fib_info to dump: fail with the error configured
                 * for the result type, or -EHOSTUNREACH if that is 0.
                 */
                if (!res.fi) {
                        err = fib_props[res.type].error;
                        if (!err)
                                err = -EHOSTUNREACH;
                        goto errout_free;
                }
                err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
                                    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
                                    rt->rt_type, res.prefix, res.prefixlen,
                                    fl4.flowi4_tos, res.fi, 0);
        } else {
                err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
                                   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
        }
        if (err < 0)
                goto errout_free;

        rcu_read_unlock();

        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
        return err;

errout_free:
        /* Failure after the skb was allocated and the RCU lock taken. */
        rcu_read_unlock();
        kfree_skb(skb);
        goto errout;
}
2817
2818 void ip_rt_multicast_event(struct in_device *in_dev)
2819 {
2820         rt_cache_flush(dev_net(in_dev->dev));
2821 }
2822
2823 #ifdef CONFIG_SYSCTL
/* Backing storage for the "gc_interval" sysctl (handled as jiffies). */
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
/* Shared by the "gc_min_interval" and "gc_min_interval_ms" sysctls. */
static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
/* Backing storage for the "gc_elasticity" sysctl. */
static int ip_rt_gc_elasticity __read_mostly    = 8;
/* Passed as extra1 to proc_dointvec_minmax for the "min_pmtu" sysctl;
 * presumably the lowest value accepted there - confirm against the
 * proc_dointvec_minmax contract.
 */
static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
2828
2829 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2830                                         void __user *buffer,
2831                                         size_t *lenp, loff_t *ppos)
2832 {
2833         struct net *net = (struct net *)__ctl->extra1;
2834
2835         if (write) {
2836                 rt_cache_flush(net);
2837                 fnhe_genid_bump(net);
2838                 return 0;
2839         }
2840
2841         return -EINVAL;
2842 }
2843
2844 static struct ctl_table ipv4_route_table[] = {
2845         {
2846                 .procname       = "gc_thresh",
2847                 .data           = &ipv4_dst_ops.gc_thresh,
2848                 .maxlen         = sizeof(int),
2849                 .mode           = 0644,
2850                 .proc_handler   = proc_dointvec,
2851         },
2852         {
2853                 .procname       = "max_size",
2854                 .data           = &ip_rt_max_size,
2855                 .maxlen         = sizeof(int),
2856                 .mode           = 0644,
2857                 .proc_handler   = proc_dointvec,
2858         },
2859         {
2860                 /*  Deprecated. Use gc_min_interval_ms */
2861
2862                 .procname       = "gc_min_interval",
2863                 .data           = &ip_rt_gc_min_interval,
2864                 .maxlen         = sizeof(int),
2865                 .mode           = 0644,
2866                 .proc_handler   = proc_dointvec_jiffies,
2867         },
2868         {
2869                 .procname       = "gc_min_interval_ms",
2870                 .data           = &ip_rt_gc_min_interval,
2871                 .maxlen         = sizeof(int),
2872                 .mode           = 0644,
2873                 .proc_handler   = proc_dointvec_ms_jiffies,
2874         },
2875         {
2876                 .procname       = "gc_timeout",
2877                 .data           = &ip_rt_gc_timeout,
2878                 .maxlen         = sizeof(int),
2879                 .mode           = 0644,
2880                 .proc_handler   = proc_dointvec_jiffies,
2881         },
2882         {
2883                 .procname       = "gc_interval",
2884                 .data           = &ip_rt_gc_interval,
2885                 .maxlen         = sizeof(int),
2886                 .mode           = 0644,
2887                 .proc_handler   = proc_dointvec_jiffies,
2888         },
2889         {
2890                 .procname       = "redirect_load",
2891                 .data           = &ip_rt_redirect_load,
2892                 .maxlen         = sizeof(int),
2893                 .mode           = 0644,
2894                 .proc_handler   = proc_dointvec,
2895         },
2896         {
2897                 .procname       = "redirect_number",
2898                 .data           = &ip_rt_redirect_number,
2899                 .maxlen         = sizeof(int),
2900                 .mode           = 0644,
2901                 .proc_handler   = proc_dointvec,
2902         },
2903         {
2904                 .procname       = "redirect_silence",
2905                 .data           = &ip_rt_redirect_silence,
2906                 .maxlen         = sizeof(int),
2907                 .mode           = 0644,
2908                 .proc_handler   = proc_dointvec,
2909         },
2910         {
2911                 .procname       = "error_cost",
2912                 .data           = &ip_rt_error_cost,
2913                 .maxlen         = sizeof(int),
2914                 .mode           = 0644,
2915                 .proc_handler   = proc_dointvec,
2916         },
2917         {
2918                 .procname       = "error_burst",
2919                 .data           = &ip_rt_error_burst,
2920                 .maxlen         = sizeof(int),
2921                 .mode           = 0644,
2922                 .proc_handler   = proc_dointvec,
2923         },
2924         {
2925                 .procname       = "gc_elasticity",
2926                 .data           = &ip_rt_gc_elasticity,
2927                 .maxlen         = sizeof(int),
2928                 .mode           = 0644,
2929                 .proc_handler   = proc_dointvec,
2930         },
2931         {
2932                 .procname       = "mtu_expires",
2933                 .data           = &ip_rt_mtu_expires,
2934                 .maxlen         = sizeof(int),
2935                 .mode           = 0644,
2936                 .proc_handler   = proc_dointvec_jiffies,
2937         },
2938         {
2939                 .procname       = "min_pmtu",
2940                 .data           = &ip_rt_min_pmtu,
2941                 .maxlen         = sizeof(int),
2942                 .mode           = 0644,
2943                 .proc_handler   = proc_dointvec_minmax,
2944                 .extra1         = &ip_min_valid_pmtu,
2945         },
2946         {
2947                 .procname       = "min_adv_mss",
2948                 .data           = &ip_rt_min_advmss,
2949                 .maxlen         = sizeof(int),
2950                 .mode           = 0644,
2951                 .proc_handler   = proc_dointvec,
2952         },
2953         { }
2954 };
2955
2956 static struct ctl_table ipv4_route_flush_table[] = {
2957         {
2958                 .procname       = "flush",
2959                 .maxlen         = sizeof(int),
2960                 .mode           = 0200,
2961                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2962         },
2963         { },
2964 };
2965
2966 static __net_init int sysctl_route_net_init(struct net *net)
2967 {
2968         struct ctl_table *tbl;
2969
2970         tbl = ipv4_route_flush_table;
2971         if (!net_eq(net, &init_net)) {
2972                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2973                 if (!tbl)
2974                         goto err_dup;
2975
2976                 /* Don't export sysctls to unprivileged users */
2977                 if (net->user_ns != &init_user_ns)
2978                         tbl[0].procname = NULL;
2979         }
2980         tbl[0].extra1 = net;
2981
2982         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2983         if (!net->ipv4.route_hdr)
2984                 goto err_reg;
2985         return 0;
2986
2987 err_reg:
2988         if (tbl != ipv4_route_flush_table)
2989                 kfree(tbl);
2990 err_dup:
2991         return -ENOMEM;
2992 }
2993
2994 static __net_exit void sysctl_route_net_exit(struct net *net)
2995 {
2996         struct ctl_table *tbl;
2997
2998         tbl = net->ipv4.route_hdr->ctl_table_arg;
2999         unregister_net_sysctl_table(net->ipv4.route_hdr);
3000         BUG_ON(tbl == ipv4_route_flush_table);
3001         kfree(tbl);
3002 }
3003
/* Per-namespace hooks for the "flush" sysctl; registered from ip_rt_init(). */
static __net_initdata struct pernet_operations sysctl_route_ops = {
        .init = sysctl_route_net_init,
        .exit = sysctl_route_net_exit,
};
3008 #endif
3009
/*
 * Per-namespace init of the generation counters: rt_genid and fnhe_genid
 * start at 0, dev_addr_genid starts at a random value.  Always returns 0.
 */
static __net_init int rt_genid_init(struct net *net)
{
        atomic_set(&net->ipv4.rt_genid, 0);
        atomic_set(&net->fnhe_genid, 0);
        atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
        return 0;
}
3017
/* Init-only per-namespace hook (no .exit); registered from ip_rt_init(). */
static __net_initdata struct pernet_operations rt_genid_ops = {
        .init = rt_genid_init,
};
3021
3022 static int __net_init ipv4_inetpeer_init(struct net *net)
3023 {
3024         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3025
3026         if (!bp)
3027                 return -ENOMEM;
3028         inet_peer_base_init(bp);
3029         net->ipv4.peers = bp;
3030         return 0;
3031 }
3032
3033 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3034 {
3035         struct inet_peer_base *bp = net->ipv4.peers;
3036
3037         net->ipv4.peers = NULL;
3038         inetpeer_invalidate_tree(bp);
3039         kfree(bp);
3040 }
3041
/* Per-namespace hooks for the inet_peer base; registered from ip_rt_init(). */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
        .init   =       ipv4_inetpeer_init,
        .exit   =       ipv4_inetpeer_exit,
};
3046
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu accounting area; ip_rt_init() allocates room for 256
 * struct ip_rt_acct entries per cpu.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3050
3051 int __init ip_rt_init(void)
3052 {
3053         int cpu;
3054
3055         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3056         if (!ip_idents)
3057                 panic("IP: failed to allocate ip_idents\n");
3058
3059         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3060
3061         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3062         if (!ip_tstamps)
3063                 panic("IP: failed to allocate ip_tstamps\n");
3064
3065         for_each_possible_cpu(cpu) {
3066                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3067
3068                 INIT_LIST_HEAD(&ul->head);
3069                 spin_lock_init(&ul->lock);
3070         }
3071 #ifdef CONFIG_IP_ROUTE_CLASSID
3072         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3073         if (!ip_rt_acct)
3074                 panic("IP: failed to allocate ip_rt_acct\n");
3075 #endif
3076
3077         ipv4_dst_ops.kmem_cachep =
3078                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3079                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3080
3081         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3082
3083         if (dst_entries_init(&ipv4_dst_ops) < 0)
3084                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3085
3086         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3087                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3088
3089         ipv4_dst_ops.gc_thresh = ~0;
3090         ip_rt_max_size = INT_MAX;
3091
3092         devinet_init();
3093         ip_fib_init();
3094
3095         if (ip_rt_proc_init())
3096                 pr_err("Unable to create route proc files\n");
3097 #ifdef CONFIG_XFRM
3098         xfrm_init();
3099         xfrm4_init();
3100 #endif
3101         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3102                       RTNL_FLAG_DOIT_UNLOCKED);
3103
3104 #ifdef CONFIG_SYSCTL
3105         register_pernet_subsys(&sysctl_route_ops);
3106 #endif
3107         register_pernet_subsys(&rt_genid_ops);
3108         register_pernet_subsys(&ipv4_inetpeer_ops);
3109         return 0;
3110 }
3111
3112 #ifdef CONFIG_SYSCTL
/*
 * Register the global route tunables (ipv4_route_table) for init_net
 * under "net/ipv4/route".  The result of the registration is not checked.
 *
 * This is a separate init hook only because of the current ipv4 init
 * order: once that order is sanitized, this workaround can go away.
 */
void __init ip_static_sysctl_init(void)
{
        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
3121 #endif