net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/skbuff.h>
  83 #include <linux/inetdevice.h>
  84 #include <linux/igmp.h>
  85 #include <linux/pkt_sched.h>
  86 #include <linux/mroute.h>
  87 #include <linux/netfilter_ipv4.h>
  88 #include <linux/random.h>
  89 #include <linux/rcupdate.h>
  90 #include <linux/times.h>
  91 #include <linux/slab.h>
  92 #include <linux/jhash.h>
  93 #include <net/dst.h>
  94 #include <net/dst_metadata.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/lwtunnel.h>
 107 #include <net/netevent.h>
 108 #include <net/rtnetlink.h>
 109 #ifdef CONFIG_SYSCTL
 110 #include <linux/sysctl.h>
 111 #include <linux/kmemleak.h>
 112 #endif
 113 #include <net/secure_seq.h>
 114 #include <net/ip_tunnels.h>
 115 #include <net/l3mdev.h>
 116
 117 #include "fib_lookup.h"
 118
/*
 * Extract the routing-relevant TOS bits of a flow key: flowi4_tos masked
 * with IPTOS_RT_MASK | RTO_ONLINK.
 */
#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* Default value of ip_rt_gc_timeout below. */
#define RT_GC_TIMEOUT (300*HZ)

/*
 * Routing tunables and their defaults.  Values written in terms of HZ are
 * tick counts.
 * NOTE(review): presumably exported through sysctl elsewhere in this file -
 * confirm.
 */
static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
/* Default equals the default ip_rt_redirect_load << (ip_rt_redirect_number + 1). */
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
/* NOTE(review): 512 + 20 + 20 looks like payload + IP header + TCP header - confirm. */
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

/* Also used as the lifetime of redirect exceptions in __ip_do_redirect(). */
static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 135 /*
 136  *      Interface to generic destination cache.
 137  */
 138
/*
 * Forward declarations of the callbacks that are wired into ipv4_dst_ops
 * below.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             ipv4_dst_destroy(struct dst_entry *dst);
 149
 150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 151 {
 152         WARN_ON(1);
 153         return NULL;
 154 }
 155
/* Neighbour-related dst_ops callbacks; defined further down in this file. */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 160
/*
 * The AF_INET dst_ops instance: binds the IPv4 routing callbacks declared
 * above (plus __ip_local_out) to the generic destination cache hooks.
 */
static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};
 176
/* Expands to TC_PRIO_<class>, i.e. the same priority as the plain entry. */
#define ECN_OR_COST(class)      TC_PRIO_##class

/*
 * 16-entry table of TC_PRIO_* packet priorities.  Because of the
 * ECN_OR_COST() expansion above, every odd slot carries the same priority
 * as the even slot before it.
 * NOTE(review): presumably indexed by the IP TOS bits shifted right by
 * one - confirm against the users of this table.
 */
const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
 198
/* Per-CPU routing statistics; reported by rt_cpu_seq_show(). */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
/* Bump one counter of this CPU's rt_cache_stat via raw_cpu_inc(). */
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 201
 202 #ifdef CONFIG_PROC_FS
 203 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 204 {
 205         if (*pos)
 206                 return NULL;
 207         return SEQ_START_TOKEN;
 208 }
 209
 210 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 211 {
 212         ++*pos;
 213         return NULL;
 214 }
 215
/*
 * seq_file ->stop() for the "rt_cache" file.  Intentionally empty:
 * rt_cache_seq_start() acquires nothing that would need releasing.
 */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
 219
 220 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 221 {
 222         if (v == SEQ_START_TOKEN)
 223                 seq_printf(seq, "%-127s\n",
 224                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 225                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 226                            "HHUptod\tSpecDst");
 227         return 0;
 228 }
 229
/* seq_file iterator for the header-only "rt_cache" file. */
static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};
 236
/*
 * ->open() for the "rt_cache" file: attach the rt_cache_seq_ops iterator
 * to @file.  Returns whatever seq_open() returns.
 */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}
 241
/*
 * File operations of the "rt_cache" entry created in net->proc_net by
 * ip_rt_do_proc_init(); reading is delegated to the seq_file helpers.
 */
static const struct file_operations rt_cache_seq_fops = {
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
 248
 249
 250 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 251 {
 252         int cpu;
 253
 254         if (*pos == 0)
 255                 return SEQ_START_TOKEN;
 256
 257         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 258                 if (!cpu_possible(cpu))
 259                         continue;
 260                 *pos = cpu+1;
 261                 return &per_cpu(rt_cache_stat, cpu);
 262         }
 263         return NULL;
 264 }
 265
 266 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 267 {
 268         int cpu;
 269
 270         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 271                 if (!cpu_possible(cpu))
 272                         continue;
 273                 *pos = cpu+1;
 274                 return &per_cpu(rt_cache_stat, cpu);
 275         }
 276         return NULL;
 277
 278 }
 279
/*
 * seq_file ->stop() for the per-CPU statistics file.  Intentionally empty:
 * rt_cpu_seq_start() acquires nothing that would need releasing.
 */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
 284
 285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 286 {
 287         struct rt_cache_stat *st = v;
 288
 289         if (v == SEQ_START_TOKEN) {
 290                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 291                 return 0;
 292         }
 293
 294         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 295                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 296                    dst_entries_get_slow(&ipv4_dst_ops),
 297                    0, /* st->in_hit */
 298                    st->in_slow_tot,
 299                    st->in_slow_mc,
 300                    st->in_no_route,
 301                    st->in_brd,
 302                    st->in_martian_dst,
 303                    st->in_martian_src,
 304
 305                    0, /* st->out_hit */
 306                    st->out_slow_tot,
 307                    st->out_slow_mc,
 308
 309                    0, /* st->gc_total */
 310                    0, /* st->gc_ignored */
 311                    0, /* st->gc_goal_miss */
 312                    0, /* st->gc_dst_overflow */
 313                    0, /* st->in_hlist_search */
 314                    0  /* st->out_hlist_search */
 315                 );
 316         return 0;
 317 }
 318
/* seq_file iterator that walks the per-CPU rt_cache_stat records. */
static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};
 325
 326
/*
 * ->open() for the per-CPU statistics file: attach the rt_cpu_seq_ops
 * iterator to @file.  Returns whatever seq_open() returns.
 */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}
 331
/*
 * File operations of the "rt_cache" entry created in net->proc_net_stat by
 * ip_rt_do_proc_init(); reading is delegated to the seq_file helpers.
 */
static const struct file_operations rt_cpu_seq_fops = {
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
 338
 339 #ifdef CONFIG_IP_ROUTE_CLASSID
 340 static int rt_acct_proc_show(struct seq_file *m, void *v)
 341 {
 342         struct ip_rt_acct *dst, *src;
 343         unsigned int i, j;
 344
 345         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 346         if (!dst)
 347                 return -ENOMEM;
 348
 349         for_each_possible_cpu(i) {
 350                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 351                 for (j = 0; j < 256; j++) {
 352                         dst[j].o_bytes   += src[j].o_bytes;
 353                         dst[j].o_packets += src[j].o_packets;
 354                         dst[j].i_bytes   += src[j].i_bytes;
 355                         dst[j].i_packets += src[j].i_packets;
 356                 }
 357         }
 358
 359         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 360         kfree(dst);
 361         return 0;
 362 }
 363
/*
 * ->open() for the "rt_acct" file: a single-record seq_file rendered by
 * rt_acct_proc_show(), with no private data.
 */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}
 368
/*
 * File operations of the "rt_acct" entry; released with single_release()
 * to pair with the single_open() in rt_acct_proc_open().
 */
static const struct file_operations rt_acct_proc_fops = {
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
 375 #endif
 376
 377 static int __net_init ip_rt_do_proc_init(struct net *net)
 378 {
 379         struct proc_dir_entry *pde;
 380
 381         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 382                           &rt_cache_seq_fops);
 383         if (!pde)
 384                 goto err1;
 385
 386         pde = proc_create("rt_cache", S_IRUGO,
 387                           net->proc_net_stat, &rt_cpu_seq_fops);
 388         if (!pde)
 389                 goto err2;
 390
 391 #ifdef CONFIG_IP_ROUTE_CLASSID
 392         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 393         if (!pde)
 394                 goto err3;
 395 #endif
 396         return 0;
 397
 398 #ifdef CONFIG_IP_ROUTE_CLASSID
 399 err3:
 400         remove_proc_entry("rt_cache", net->proc_net_stat);
 401 #endif
 402 err2:
 403         remove_proc_entry("rt_cache", net->proc_net);
 404 err1:
 405         return -ENOMEM;
 406 }
 407
/*
 * Per-namespace exit: remove every proc entry that ip_rt_do_proc_init()
 * created for @net.
 */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        /* Only created when route classid accounting is configured. */
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}
 416
/* Per-network-namespace hooks that create and remove the proc entries. */
static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};
 421
/*
 * Register ip_rt_proc_ops as a pernet subsystem.  Returns the result of
 * register_pernet_subsys().
 */
static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}
 426
 427 #else
/* Stub used when CONFIG_PROC_FS is disabled: nothing to set up. */
static inline int ip_rt_proc_init(void)
{
        return 0;
}
 432 #endif /* CONFIG_PROC_FS */
 433
 434 static inline bool rt_is_expired(const struct rtable *rth)
 435 {
 436         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 437 }
 438
/*
 * Bump the IPv4 route generation id of @net.  Routes that recorded the
 * previous id are then reported as expired by rt_is_expired().
 */
void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}
 443
 444 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 445                                            struct sk_buff *skb,
 446                                            const void *daddr)
 447 {
 448         struct net_device *dev = dst->dev;
 449         const __be32 *pkey = daddr;
 450         const struct rtable *rt;
 451         struct neighbour *n;
 452
 453         rt = (const struct rtable *) dst;
 454         if (rt->rt_gateway)
 455                 pkey = (const __be32 *) &rt->rt_gateway;
 456         else if (skb)
 457                 pkey = &ip_hdr(skb)->daddr;
 458
 459         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 460         if (n)
 461                 return n;
 462         return neigh_create(&arp_tbl, pkey, dev);
 463 }
 464
 465 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 466 {
 467         struct net_device *dev = dst->dev;
 468         const __be32 *pkey = daddr;
 469         const struct rtable *rt;
 470
 471         rt = (const struct rtable *)dst;
 472         if (rt->rt_gateway)
 473                 pkey = (const __be32 *)&rt->rt_gateway;
 474         else if (!daddr ||
 475                  (rt->rt_flags &
 476                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
 477                 return;
 478
 479         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 480 }
 481
/* Number of IP ID generator slots; ip_idents_reserve() indexes by hash % IP_IDENTS_SZ. */
#define IP_IDENTS_SZ 2048u

/*
 * Per-slot ID counters and per-slot last-use timestamps (jiffies truncated
 * to u32).
 * NOTE(review): presumably both arrays hold IP_IDENTS_SZ elements and are
 * allocated during init elsewhere in this file - confirm.
 */
static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;
 486
 487 /* In order to protect privacy, we add a perturbation to identifiers
 488  * if one generator is seldom used. This makes hard for an attacker
 489  * to infer how many packets were sent between two points in time.
 490  */
 491 u32 ip_idents_reserve(u32 hash, int segs)
 492 {
 493         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 494         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 495         u32 old = READ_ONCE(*p_tstamp);
 496         u32 now = (u32)jiffies;
 497         u32 new, delta = 0;
 498
 499         if (old != now && cmpxchg(p_tstamp, old, now) == old)
 500                 delta = prandom_u32_max(now - old);
 501
 502         /* Do not use atomic_add_return() as it makes UBSAN unhappy */
 503         do {
 504                 old = (u32)atomic_read(p_id);
 505                 new = old + delta + segs;
 506         } while (atomic_cmpxchg(p_id, old, new) != old);
 507
 508         return new - segs;
 509 }
 510 EXPORT_SYMBOL(ip_idents_reserve);
 511
 512 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 513 {
 514         static u32 ip_idents_hashrnd __read_mostly;
 515         u32 hash, id;
 516
 517         net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
 518
 519         hash = jhash_3words((__force u32)iph->daddr,
 520                             (__force u32)iph->saddr,
 521                             iph->protocol ^ net_hash_mix(net),
 522                             ip_idents_hashrnd);
 523         id = ip_idents_reserve(hash, segs);
 524         iph->id = htons(id);
 525 }
 526 EXPORT_SYMBOL(__ip_select_ident);
 527
 528 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 529                              const struct sock *sk,
 530                              const struct iphdr *iph,
 531                              int oif, u8 tos,
 532                              u8 prot, u32 mark, int flow_flags)
 533 {
 534         if (sk) {
 535                 const struct inet_sock *inet = inet_sk(sk);
 536
 537                 oif = sk->sk_bound_dev_if;
 538                 mark = sk->sk_mark;
 539                 tos = RT_CONN_FLAGS(sk);
 540                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 541         }
 542         flowi4_init_output(fl4, oif, mark, tos,
 543                            RT_SCOPE_UNIVERSE, prot,
 544                            flow_flags,
 545                            iph->daddr, iph->saddr, 0, 0,
 546                            sock_net_uid(net, sk));
 547 }
 548
 549 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 550                                const struct sock *sk)
 551 {
 552         const struct net *net = dev_net(skb->dev);
 553         const struct iphdr *iph = ip_hdr(skb);
 554         int oif = skb->dev->ifindex;
 555         u8 tos = RT_TOS(iph->tos);
 556         u8 prot = iph->protocol;
 557         u32 mark = skb->mark;
 558
 559         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 560 }
 561
 562 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 563 {
 564         const struct inet_sock *inet = inet_sk(sk);
 565         const struct ip_options_rcu *inet_opt;
 566         __be32 daddr = inet->inet_daddr;
 567
 568         rcu_read_lock();
 569         inet_opt = rcu_dereference(inet->inet_opt);
 570         if (inet_opt && inet_opt->opt.srr)
 571                 daddr = inet_opt->opt.faddr;
 572         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 573                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 574                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 575                            inet_sk_flowi_flags(sk),
 576                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 577         rcu_read_unlock();
 578 }
 579
 580 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 581                                  const struct sk_buff *skb)
 582 {
 583         if (skb)
 584                 build_skb_flow_key(fl4, skb, sk);
 585         else
 586                 build_sk_flow_key(fl4, sk);
 587 }
 588
/* Held (with BHs disabled) by update_or_create_fnhe() while it modifies nexthop exceptions. */
static DEFINE_SPINLOCK(fnhe_lock);
 590
 591 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 592 {
 593         struct rtable *rt;
 594
 595         rt = rcu_dereference(fnhe->fnhe_rth_input);
 596         if (rt) {
 597                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 598                 dst_dev_put(&rt->dst);
 599                 dst_release(&rt->dst);
 600         }
 601         rt = rcu_dereference(fnhe->fnhe_rth_output);
 602         if (rt) {
 603                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 604                 dst_dev_put(&rt->dst);
 605                 dst_release(&rt->dst);
 606         }
 607 }
 608
 609 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 610 {
 611         struct fib_nh_exception *fnhe, *oldest;
 612
 613         oldest = rcu_dereference(hash->chain);
 614         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 615              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 616                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 617                         oldest = fnhe;
 618         }
 619         fnhe_flush_routes(oldest);
 620         return oldest;
 621 }
 622
 623 static inline u32 fnhe_hashfun(__be32 daddr)
 624 {
 625         static u32 fnhe_hashrnd __read_mostly;
 626         u32 hval;
 627
 628         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 629         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 630         return hash_32(hval, FNHE_HASH_SHIFT);
 631 }
 632
 633 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 634 {
 635         rt->rt_pmtu = fnhe->fnhe_pmtu;
 636         rt->dst.expires = fnhe->fnhe_expires;
 637
 638         if (fnhe->fnhe_gw) {
 639                 rt->rt_flags |= RTCF_REDIRECTED;
 640                 rt->rt_gateway = fnhe->fnhe_gw;
 641                 rt->rt_uses_gateway = 1;
 642         }
 643 }
 644
 645 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 646                                   u32 pmtu, unsigned long expires)
 647 {
 648         struct fnhe_hash_bucket *hash;
 649         struct fib_nh_exception *fnhe;
 650         struct rtable *rt;
 651         u32 genid, hval;
 652         unsigned int i;
 653         int depth;
 654
 655         genid = fnhe_genid(dev_net(nh->nh_dev));
 656         hval = fnhe_hashfun(daddr);
 657
 658         spin_lock_bh(&fnhe_lock);
 659
 660         hash = rcu_dereference(nh->nh_exceptions);
 661         if (!hash) {
 662                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 663                 if (!hash)
 664                         goto out_unlock;
 665                 rcu_assign_pointer(nh->nh_exceptions, hash);
 666         }
 667
 668         hash += hval;
 669
 670         depth = 0;
 671         for (fnhe = rcu_dereference(hash->chain); fnhe;
 672              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 673                 if (fnhe->fnhe_daddr == daddr)
 674                         break;
 675                 depth++;
 676         }
 677
 678         if (fnhe) {
 679                 if (fnhe->fnhe_genid != genid)
 680                         fnhe->fnhe_genid = genid;
 681                 if (gw)
 682                         fnhe->fnhe_gw = gw;
 683                 if (pmtu)
 684                         fnhe->fnhe_pmtu = pmtu;
 685                 fnhe->fnhe_expires = max(1UL, expires);
 686                 /* Update all cached dsts too */
 687                 rt = rcu_dereference(fnhe->fnhe_rth_input);
 688                 if (rt)
 689                         fill_route_from_fnhe(rt, fnhe);
 690                 rt = rcu_dereference(fnhe->fnhe_rth_output);
 691                 if (rt)
 692                         fill_route_from_fnhe(rt, fnhe);
 693         } else {
 694                 if (depth > FNHE_RECLAIM_DEPTH)
 695                         fnhe = fnhe_oldest(hash);
 696                 else {
 697                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 698                         if (!fnhe)
 699                                 goto out_unlock;
 700
 701                         fnhe->fnhe_next = hash->chain;
 702                         rcu_assign_pointer(hash->chain, fnhe);
 703                 }
 704                 fnhe->fnhe_genid = genid;
 705                 fnhe->fnhe_daddr = daddr;
 706                 fnhe->fnhe_gw = gw;
 707                 fnhe->fnhe_pmtu = pmtu;
 708                 fnhe->fnhe_expires = expires;
 709
 710                 /* Exception created; mark the cached routes for the nexthop
 711                  * stale, so anyone caching it rechecks if this exception
 712                  * applies to them.
 713                  */
 714                 rt = rcu_dereference(nh->nh_rth_input);
 715                 if (rt)
 716                         rt->dst.obsolete = DST_OBSOLETE_KILL;
 717
 718                 for_each_possible_cpu(i) {
 719                         struct rtable __rcu **prt;
 720                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 721                         rt = rcu_dereference(*prt);
 722                         if (rt)
 723                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 724                 }
 725         }
 726
 727         fnhe->fnhe_stamp = jiffies;
 728
 729 out_unlock:
 730         spin_unlock_bh(&fnhe_lock);
 731 }
 732
 733 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 734                              bool kill_route)
 735 {
 736         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 737         __be32 old_gw = ip_hdr(skb)->saddr;
 738         struct net_device *dev = skb->dev;
 739         struct in_device *in_dev;
 740         struct fib_result res;
 741         struct neighbour *n;
 742         struct net *net;
 743
 744         switch (icmp_hdr(skb)->code & 7) {
 745         case ICMP_REDIR_NET:
 746         case ICMP_REDIR_NETTOS:
 747         case ICMP_REDIR_HOST:
 748         case ICMP_REDIR_HOSTTOS:
 749                 break;
 750
 751         default:
 752                 return;
 753         }
 754
 755         if (rt->rt_gateway != old_gw)
 756                 return;
 757
 758         in_dev = __in_dev_get_rcu(dev);
 759         if (!in_dev)
 760                 return;
 761
 762         net = dev_net(dev);
 763         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 764             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 765             ipv4_is_zeronet(new_gw))
 766                 goto reject_redirect;
 767
 768         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 769                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 770                         goto reject_redirect;
 771                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 772                         goto reject_redirect;
 773         } else {
 774                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 775                         goto reject_redirect;
 776         }
 777
 778         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 779         if (!n)
 780                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 781         if (!IS_ERR(n)) {
 782                 if (!(n->nud_state & NUD_VALID)) {
 783                         neigh_event_send(n, NULL);
 784                 } else {
 785                         if (fib_lookup(net, fl4, &res, 0) == 0) {
 786                                 struct fib_nh *nh = &FIB_RES_NH(res);
 787
 788                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 789                                                 0, jiffies + ip_rt_gc_timeout);
 790                         }
 791                         if (kill_route)
 792                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 793                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 794                 }
 795                 neigh_release(n);
 796         }
 797         return;
 798
 799 reject_redirect:
 800 #ifdef CONFIG_IP_ROUTE_VERBOSE
 801         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 802                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 803                 __be32 daddr = iph->daddr;
 804                 __be32 saddr = iph->saddr;
 805
 806                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 807                                      "  Advised path = %pI4 -> %pI4\n",
 808                                      &old_gw, dev->name, &new_gw,
 809                                      &saddr, &daddr);
 810         }
 811 #endif
 812         ;
 813 }
 814
 815 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 816 {
 817         struct rtable *rt;
 818         struct flowi4 fl4;
 819         const struct iphdr *iph = (const struct iphdr *) skb->data;
 820         struct net *net = dev_net(skb->dev);
 821         int oif = skb->dev->ifindex;
 822         u8 tos = RT_TOS(iph->tos);
 823         u8 prot = iph->protocol;
 824         u32 mark = skb->mark;
 825
 826         rt = (struct rtable *) dst;
 827
 828         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 829         __ip_do_redirect(rt, skb, &fl4, true);
 830 }
 831
 832 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 833 {
 834         struct rtable *rt = (struct rtable *)dst;
 835         struct dst_entry *ret = dst;
 836
 837         if (rt) {
 838                 if (dst->obsolete > 0) {
 839                         ip_rt_put(rt);
 840                         ret = NULL;
 841                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 842                            rt->dst.expires) {
 843                         ip_rt_put(rt);
 844                         ret = NULL;
 845                 }
 846         }
 847         return ret;
 848 }
 849
 850 /*
 851  * Algorithm:
 852  *      1. The first ip_rt_redirect_number redirects are sent
 853  *         with exponential backoff, then we stop sending them at all,
 854  *         assuming that the host ignores our redirects.
 855  *      2. If we did not see packets requiring redirects
 856  *         during ip_rt_redirect_silence, we assume that the host
 857  *         forgot redirected route and start to send redirects again.
 858  *
 859  * This algorithm is much cheaper and more intelligent than dumb load limiting
 860  * in icmp.c.
 861  *
 862  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 863  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 864  */
 865
 866 void ip_rt_send_redirect(struct sk_buff *skb)
 867 {
 868         struct rtable *rt = skb_rtable(skb);
 869         struct in_device *in_dev;
 870         struct inet_peer *peer;
 871         struct net *net;
 872         int log_martians;
 873         int vif;
 874
 875         rcu_read_lock();
 876         in_dev = __in_dev_get_rcu(rt->dst.dev);
 877         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 878                 rcu_read_unlock();
 879                 return;
 880         }
 881         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 882         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 883         rcu_read_unlock();
 884
 885         net = dev_net(rt->dst.dev);
 886         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 887         if (!peer) {
 888                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 889                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 890                 return;
 891         }
 892
 893         /* No redirected packets during ip_rt_redirect_silence;
 894          * reset the algorithm.
 895          */
 896         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 897                 peer->rate_tokens = 0;
 898
 899         /* Too many ignored redirects; do not send anything
 900          * set dst.rate_last to the last seen redirected packet.
 901          */
 902         if (peer->rate_tokens >= ip_rt_redirect_number) {
 903                 peer->rate_last = jiffies;
 904                 goto out_put_peer;
 905         }
 906
 907         /* Check for load limit; set rate_last to the latest sent
 908          * redirect.
 909          */
 910         if (peer->rate_tokens == 0 ||
 911             time_after(jiffies,
 912                        (peer->rate_last +
 913                         (ip_rt_redirect_load << peer->rate_tokens)))) {
 914                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 915
 916                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 917                 peer->rate_last = jiffies;
 918                 ++peer->rate_tokens;
 919 #ifdef CONFIG_IP_ROUTE_VERBOSE
 920                 if (log_martians &&
 921                     peer->rate_tokens == ip_rt_redirect_number)
 922                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 923                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 924                                              &ip_hdr(skb)->daddr, &gw);
 925 #endif
 926         }
 927 out_put_peer:
 928         inet_putpeer(peer);
 929 }
 930
 931 static int ip_error(struct sk_buff *skb)
 932 {
 933         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 934         struct rtable *rt = skb_rtable(skb);
 935         struct inet_peer *peer;
 936         unsigned long now;
 937         struct net *net;
 938         bool send;
 939         int code;
 940
 941         /* IP on this device is disabled. */
 942         if (!in_dev)
 943                 goto out;
 944
 945         net = dev_net(rt->dst.dev);
 946         if (!IN_DEV_FORWARD(in_dev)) {
 947                 switch (rt->dst.error) {
 948                 case EHOSTUNREACH:
 949                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 950                         break;
 951
 952                 case ENETUNREACH:
 953                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 954                         break;
 955                 }
 956                 goto out;
 957         }
 958
 959         switch (rt->dst.error) {
 960         case EINVAL:
 961         default:
 962                 goto out;
 963         case EHOSTUNREACH:
 964                 code = ICMP_HOST_UNREACH;
 965                 break;
 966         case ENETUNREACH:
 967                 code = ICMP_NET_UNREACH;
 968                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 969                 break;
 970         case EACCES:
 971                 code = ICMP_PKT_FILTERED;
 972                 break;
 973         }
 974
 975         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 976                                l3mdev_master_ifindex(skb->dev), 1);
 977
 978         send = true;
 979         if (peer) {
 980                 now = jiffies;
 981                 peer->rate_tokens += now - peer->rate_last;
 982                 if (peer->rate_tokens > ip_rt_error_burst)
 983                         peer->rate_tokens = ip_rt_error_burst;
 984                 peer->rate_last = now;
 985                 if (peer->rate_tokens >= ip_rt_error_cost)
 986                         peer->rate_tokens -= ip_rt_error_cost;
 987                 else
 988                         send = false;
 989                 inet_putpeer(peer);
 990         }
 991         if (send)
 992                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 993
 994 out:    kfree_skb(skb);
 995         return 0;
 996 }
 997
 998 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 999 {
1000         struct dst_entry *dst = &rt->dst;
1001         struct fib_result res;
1002
1003         if (dst_metric_locked(dst, RTAX_MTU))
1004                 return;
1005
1006         if (ipv4_mtu(dst) < mtu)
1007                 return;
1008
1009         if (mtu < ip_rt_min_pmtu)
1010                 mtu = ip_rt_min_pmtu;
1011
1012         if (rt->rt_pmtu == mtu &&
1013             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1014                 return;
1015
1016         rcu_read_lock();
1017         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1018                 struct fib_nh *nh = &FIB_RES_NH(res);
1019
1020                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
1021                                       jiffies + ip_rt_mtu_expires);
1022         }
1023         rcu_read_unlock();
1024 }
1025
1026 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1027                               struct sk_buff *skb, u32 mtu)
1028 {
1029         struct rtable *rt = (struct rtable *) dst;
1030         struct flowi4 fl4;
1031
1032         ip_rt_build_flow_key(&fl4, sk, skb);
1033         __ip_rt_update_pmtu(rt, &fl4, mtu);
1034 }
1035
1036 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1037                       int oif, u32 mark, u8 protocol, int flow_flags)
1038 {
1039         const struct iphdr *iph = (const struct iphdr *) skb->data;
1040         struct flowi4 fl4;
1041         struct rtable *rt;
1042
1043         if (!mark)
1044                 mark = IP4_REPLY_MARK(net, skb->mark);
1045
1046         __build_flow_key(net, &fl4, NULL, iph, oif,
1047                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1048         rt = __ip_route_output_key(net, &fl4);
1049         if (!IS_ERR(rt)) {
1050                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1051                 ip_rt_put(rt);
1052         }
1053 }
1054 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1055
1056 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1057 {
1058         const struct iphdr *iph = (const struct iphdr *) skb->data;
1059         struct flowi4 fl4;
1060         struct rtable *rt;
1061
1062         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1063
1064         if (!fl4.flowi4_mark)
1065                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1066
1067         rt = __ip_route_output_key(sock_net(sk), &fl4);
1068         if (!IS_ERR(rt)) {
1069                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1070                 ip_rt_put(rt);
1071         }
1072 }
1073
1074 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1075 {
1076         const struct iphdr *iph = (const struct iphdr *) skb->data;
1077         struct flowi4 fl4;
1078         struct rtable *rt;
1079         struct dst_entry *odst = NULL;
1080         bool new = false;
1081         struct net *net = sock_net(sk);
1082
1083         bh_lock_sock(sk);
1084
1085         if (!ip_sk_accept_pmtu(sk))
1086                 goto out;
1087
1088         odst = sk_dst_get(sk);
1089
1090         if (sock_owned_by_user(sk) || !odst) {
1091                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1092                 goto out;
1093         }
1094
1095         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1096
1097         rt = (struct rtable *)odst;
1098         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1099                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1100                 if (IS_ERR(rt))
1101                         goto out;
1102
1103                 new = true;
1104         }
1105
1106         __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1107
1108         if (!dst_check(&rt->dst, 0)) {
1109                 if (new)
1110                         dst_release(&rt->dst);
1111
1112                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1113                 if (IS_ERR(rt))
1114                         goto out;
1115
1116                 new = true;
1117         }
1118
1119         if (new)
1120                 sk_dst_set(sk, &rt->dst);
1121
1122 out:
1123         bh_unlock_sock(sk);
1124         dst_release(odst);
1125 }
1126 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1127
1128 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1129                    int oif, u32 mark, u8 protocol, int flow_flags)
1130 {
1131         const struct iphdr *iph = (const struct iphdr *) skb->data;
1132         struct flowi4 fl4;
1133         struct rtable *rt;
1134
1135         __build_flow_key(net, &fl4, NULL, iph, oif,
1136                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1137         rt = __ip_route_output_key(net, &fl4);
1138         if (!IS_ERR(rt)) {
1139                 __ip_do_redirect(rt, skb, &fl4, false);
1140                 ip_rt_put(rt);
1141         }
1142 }
1143 EXPORT_SYMBOL_GPL(ipv4_redirect);
1144
1145 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1146 {
1147         const struct iphdr *iph = (const struct iphdr *) skb->data;
1148         struct flowi4 fl4;
1149         struct rtable *rt;
1150         struct net *net = sock_net(sk);
1151
1152         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1153         rt = __ip_route_output_key(net, &fl4);
1154         if (!IS_ERR(rt)) {
1155                 __ip_do_redirect(rt, skb, &fl4, false);
1156                 ip_rt_put(rt);
1157         }
1158 }
1159 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1160
1161 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1162 {
1163         struct rtable *rt = (struct rtable *) dst;
1164
1165         /* All IPV4 dsts are created with ->obsolete set to the value
1166          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1167          * into this function always.
1168          *
1169          * When a PMTU/redirect information update invalidates a route,
1170          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1171          * DST_OBSOLETE_DEAD by dst_free().
1172          */
1173         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1174                 return NULL;
1175         return dst;
1176 }
1177
1178 static void ipv4_link_failure(struct sk_buff *skb)
1179 {
1180         struct rtable *rt;
1181
1182         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1183
1184         rt = skb_rtable(skb);
1185         if (rt)
1186                 dst_set_expires(&rt->dst, 0);
1187 }
1188
1189 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1190 {
1191         pr_debug("%s: %pI4 -> %pI4, %s\n",
1192                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1193                  skb->dev ? skb->dev->name : "?");
1194         kfree_skb(skb);
1195         WARN_ON(1);
1196         return 0;
1197 }
1198
1199 /*
1200    We do not cache source address of outgoing interface,
1201    because it is used only by IP RR, TS and SRR options,
1202    so that it out of fast path.
1203
1204    BTW remember: "addr" is allowed to be not aligned
1205    in IP options!
1206  */
1207
1208 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1209 {
1210         __be32 src;
1211
1212         if (rt_is_output_route(rt))
1213                 src = ip_hdr(skb)->saddr;
1214         else {
1215                 struct fib_result res;
1216                 struct flowi4 fl4;
1217                 struct iphdr *iph;
1218
1219                 iph = ip_hdr(skb);
1220
1221                 memset(&fl4, 0, sizeof(fl4));
1222                 fl4.daddr = iph->daddr;
1223                 fl4.saddr = iph->saddr;
1224                 fl4.flowi4_tos = RT_TOS(iph->tos);
1225                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1226                 fl4.flowi4_iif = skb->dev->ifindex;
1227                 fl4.flowi4_mark = skb->mark;
1228
1229                 rcu_read_lock();
1230                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1231                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1232                 else
1233                         src = inet_select_addr(rt->dst.dev,
1234                                                rt_nexthop(rt, iph->daddr),
1235                                                RT_SCOPE_UNIVERSE);
1236                 rcu_read_unlock();
1237         }
1238         memcpy(addr, &src, 4);
1239 }
1240
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid.  The low and high 16-bit halves
 * are handled independently: a half is taken from @tag only while the
 * corresponding half of tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        const u32 low_half = 0xFFFF;
        const u32 high_half = 0xFFFF0000;

        if (!(rt->dst.tclassid & low_half))
                rt->dst.tclassid |= tag & low_half;

        if (!(rt->dst.tclassid & high_half))
                rt->dst.tclassid |= tag & high_half;
}
#endif
1250
1251 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1252 {
1253         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1254         unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1255                                     ip_rt_min_advmss);
1256
1257         return min(advmss, IPV4_MAX_PMTU - header_size);
1258 }
1259
1260 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1261 {
1262         const struct rtable *rt = (const struct rtable *) dst;
1263         unsigned int mtu = rt->rt_pmtu;
1264
1265         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1266                 mtu = dst_metric_raw(dst, RTAX_MTU);
1267
1268         if (mtu)
1269                 return mtu;
1270
1271         mtu = READ_ONCE(dst->dev->mtu);
1272
1273         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1274                 if (rt->rt_uses_gateway && mtu > 576)
1275                         mtu = 576;
1276         }
1277
1278         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1279
1280         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1281 }
1282
1283 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1284 {
1285         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1286         struct fib_nh_exception *fnhe;
1287         u32 hval;
1288
1289         if (!hash)
1290                 return NULL;
1291
1292         hval = fnhe_hashfun(daddr);
1293
1294         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1295              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1296                 if (fnhe->fnhe_daddr == daddr)
1297                         return fnhe;
1298         }
1299         return NULL;
1300 }
1301
1302 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1303                               __be32 daddr, const bool do_cache)
1304 {
1305         bool ret = false;
1306
1307         spin_lock_bh(&fnhe_lock);
1308
1309         if (daddr == fnhe->fnhe_daddr) {
1310                 struct rtable __rcu **porig;
1311                 struct rtable *orig;
1312                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1313
1314                 if (rt_is_input_route(rt))
1315                         porig = &fnhe->fnhe_rth_input;
1316                 else
1317                         porig = &fnhe->fnhe_rth_output;
1318                 orig = rcu_dereference(*porig);
1319
1320                 if (fnhe->fnhe_genid != genid) {
1321                         fnhe->fnhe_genid = genid;
1322                         fnhe->fnhe_gw = 0;
1323                         fnhe->fnhe_pmtu = 0;
1324                         fnhe->fnhe_expires = 0;
1325                         fnhe_flush_routes(fnhe);
1326                         orig = NULL;
1327                 }
1328                 fill_route_from_fnhe(rt, fnhe);
1329                 if (!rt->rt_gateway)
1330                         rt->rt_gateway = daddr;
1331
1332                 if (do_cache) {
1333                         dst_hold(&rt->dst);
1334                         rcu_assign_pointer(*porig, rt);
1335                         if (orig) {
1336                                 dst_dev_put(&orig->dst);
1337                                 dst_release(&orig->dst);
1338                         }
1339                         ret = true;
1340                 }
1341
1342                 fnhe->fnhe_stamp = jiffies;
1343         }
1344         spin_unlock_bh(&fnhe_lock);
1345
1346         return ret;
1347 }
1348
1349 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1350 {
1351         struct rtable *orig, *prev, **p;
1352         bool ret = true;
1353
1354         if (rt_is_input_route(rt)) {
1355                 p = (struct rtable **)&nh->nh_rth_input;
1356         } else {
1357                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1358         }
1359         orig = *p;
1360
1361         /* hold dst before doing cmpxchg() to avoid race condition
1362          * on this dst
1363          */
1364         dst_hold(&rt->dst);
1365         prev = cmpxchg(p, orig, rt);
1366         if (prev == orig) {
1367                 if (orig) {
1368                         dst_dev_put(&orig->dst);
1369                         dst_release(&orig->dst);
1370                 }
1371         } else {
1372                 dst_release(&rt->dst);
1373                 ret = false;
1374         }
1375
1376         return ret;
1377 }
1378
1379 struct uncached_list {
1380         spinlock_t              lock;
1381         struct list_head        head;
1382 };
1383
1384 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1385
1386 static void rt_add_uncached_list(struct rtable *rt)
1387 {
1388         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1389
1390         rt->rt_uncached_list = ul;
1391
1392         spin_lock_bh(&ul->lock);
1393         list_add_tail(&rt->rt_uncached, &ul->head);
1394         spin_unlock_bh(&ul->lock);
1395 }
1396
1397 static void ipv4_dst_destroy(struct dst_entry *dst)
1398 {
1399         struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1400         struct rtable *rt = (struct rtable *) dst;
1401
1402         if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1403                 kfree(p);
1404
1405         if (!list_empty(&rt->rt_uncached)) {
1406                 struct uncached_list *ul = rt->rt_uncached_list;
1407
1408                 spin_lock_bh(&ul->lock);
1409                 list_del(&rt->rt_uncached);
1410                 spin_unlock_bh(&ul->lock);
1411         }
1412 }
1413
1414 void rt_flush_dev(struct net_device *dev)
1415 {
1416         struct net *net = dev_net(dev);
1417         struct rtable *rt;
1418         int cpu;
1419
1420         for_each_possible_cpu(cpu) {
1421                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1422
1423                 spin_lock_bh(&ul->lock);
1424                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1425                         if (rt->dst.dev != dev)
1426                                 continue;
1427                         rt->dst.dev = net->loopback_dev;
1428                         dev_hold(rt->dst.dev);
1429                         dev_put(dev);
1430                 }
1431                 spin_unlock_bh(&ul->lock);
1432         }
1433 }
1434
1435 static bool rt_cache_valid(const struct rtable *rt)
1436 {
1437         return  rt &&
1438                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1439                 !rt_is_expired(rt);
1440 }
1441
1442 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1443                            const struct fib_result *res,
1444                            struct fib_nh_exception *fnhe,
1445                            struct fib_info *fi, u16 type, u32 itag,
1446                            const bool do_cache)
1447 {
1448         bool cached = false;
1449
1450         if (fi) {
1451                 struct fib_nh *nh = &FIB_RES_NH(*res);
1452
1453                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1454                         rt->rt_gateway = nh->nh_gw;
1455                         rt->rt_uses_gateway = 1;
1456                 }
1457                 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1458                 if (fi->fib_metrics != &dst_default_metrics) {
1459                         rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1460                         refcount_inc(&fi->fib_metrics->refcnt);
1461                 }
1462 #ifdef CONFIG_IP_ROUTE_CLASSID
1463                 rt->dst.tclassid = nh->nh_tclassid;
1464 #endif
1465                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1466                 if (unlikely(fnhe))
1467                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1468                 else if (do_cache)
1469                         cached = rt_cache_route(nh, rt);
1470                 if (unlikely(!cached)) {
1471                         /* Routes we intend to cache in nexthop exception or
1472                          * FIB nexthop have the DST_NOCACHE bit clear.
1473                          * However, if we are unsuccessful at storing this
1474                          * route into the cache we really need to set it.
1475                          */
1476                         if (!rt->rt_gateway)
1477                                 rt->rt_gateway = daddr;
1478                         rt_add_uncached_list(rt);
1479                 }
1480         } else
1481                 rt_add_uncached_list(rt);
1482
1483 #ifdef CONFIG_IP_ROUTE_CLASSID
1484 #ifdef CONFIG_IP_MULTIPLE_TABLES
1485         set_class_tag(rt, res->tclassid);
1486 #endif
1487         set_class_tag(rt, itag);
1488 #endif
1489 }
1490
1491 struct rtable *rt_dst_alloc(struct net_device *dev,
1492                             unsigned int flags, u16 type,
1493                             bool nopolicy, bool noxfrm, bool will_cache)
1494 {
1495         struct rtable *rt;
1496
1497         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1498                        (will_cache ? 0 : DST_HOST) |
1499                        (nopolicy ? DST_NOPOLICY : 0) |
1500                        (noxfrm ? DST_NOXFRM : 0));
1501
1502         if (rt) {
1503                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1504                 rt->rt_flags = flags;
1505                 rt->rt_type = type;
1506                 rt->rt_is_input = 0;
1507                 rt->rt_iif = 0;
1508                 rt->rt_pmtu = 0;
1509                 rt->rt_gateway = 0;
1510                 rt->rt_uses_gateway = 0;
1511                 rt->rt_table_id = 0;
1512                 INIT_LIST_HEAD(&rt->rt_uncached);
1513
1514                 rt->dst.output = ip_output;
1515                 if (flags & RTCF_LOCAL)
1516                         rt->dst.input = ip_local_deliver;
1517         }
1518
1519         return rt;
1520 }
1521 EXPORT_SYMBOL(rt_dst_alloc);
1522
1523 /* called in rcu_read_lock() section */
1524 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1525                           u8 tos, struct net_device *dev,
1526                           struct in_device *in_dev, u32 *itag)
1527 {
1528         int err;
1529
1530         /* Primary sanity checks. */
1531         if (!in_dev)
1532                 return -EINVAL;
1533
1534         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1535             skb->protocol != htons(ETH_P_IP))
1536                 return -EINVAL;
1537
1538         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1539                 return -EINVAL;
1540
1541         if (ipv4_is_zeronet(saddr)) {
1542                 if (!ipv4_is_local_multicast(daddr))
1543                         return -EINVAL;
1544         } else {
1545                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1546                                           in_dev, itag);
1547                 if (err < 0)
1548                         return err;
1549         }
1550         return 0;
1551 }
1552
1553 /* called in rcu_read_lock() section */
1554 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1555                              u8 tos, struct net_device *dev, int our)
1556 {
1557         struct in_device *in_dev = __in_dev_get_rcu(dev);
1558         unsigned int flags = RTCF_MULTICAST;
1559         struct rtable *rth;
1560         u32 itag = 0;
1561         int err;
1562
1563         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1564         if (err)
1565                 return err;
1566
1567         if (our)
1568                 flags |= RTCF_LOCAL;
1569
1570         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1571                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1572         if (!rth)
1573                 return -ENOBUFS;
1574
1575 #ifdef CONFIG_IP_ROUTE_CLASSID
1576         rth->dst.tclassid = itag;
1577 #endif
1578         rth->dst.output = ip_rt_bug;
1579         rth->rt_is_input= 1;
1580
1581 #ifdef CONFIG_IP_MROUTE
1582         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1583                 rth->dst.input = ip_mr_input;
1584 #endif
1585         RT_CACHE_STAT_INC(in_slow_mc);
1586
1587         skb_dst_set(skb, &rth->dst);
1588         return 0;
1589 }
1590
1591
1592 static void ip_handle_martian_source(struct net_device *dev,
1593                                      struct in_device *in_dev,
1594                                      struct sk_buff *skb,
1595                                      __be32 daddr,
1596                                      __be32 saddr)
1597 {
1598         RT_CACHE_STAT_INC(in_martian_src);
1599 #ifdef CONFIG_IP_ROUTE_VERBOSE
1600         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1601                 /*
1602                  *      RFC1812 recommendation, if source is martian,
1603                  *      the only hint is MAC header.
1604                  */
1605                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1606                         &daddr, &saddr, dev->name);
1607                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1608                         print_hex_dump(KERN_WARNING, "ll header: ",
1609                                        DUMP_PREFIX_OFFSET, 16, 1,
1610                                        skb_mac_header(skb),
1611                                        dev->hard_header_len, true);
1612                 }
1613         }
1614 #endif
1615 }
1616
1617 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1618 {
1619         struct fnhe_hash_bucket *hash;
1620         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1621         u32 hval = fnhe_hashfun(daddr);
1622
1623         spin_lock_bh(&fnhe_lock);
1624
1625         hash = rcu_dereference_protected(nh->nh_exceptions,
1626                                          lockdep_is_held(&fnhe_lock));
1627         hash += hval;
1628
1629         fnhe_p = &hash->chain;
1630         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1631         while (fnhe) {
1632                 if (fnhe->fnhe_daddr == daddr) {
1633                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1634                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1635                         fnhe_flush_routes(fnhe);
1636                         kfree_rcu(fnhe, rcu);
1637                         break;
1638                 }
1639                 fnhe_p = &fnhe->fnhe_next;
1640                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1641                                                  lockdep_is_held(&fnhe_lock));
1642         }
1643
1644         spin_unlock_bh(&fnhe_lock);
1645 }
1646
1647 static void set_lwt_redirect(struct rtable *rth)
1648 {
1649         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1650                 rth->dst.lwtstate->orig_output = rth->dst.output;
1651                 rth->dst.output = lwtunnel_output;
1652         }
1653
1654         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1655                 rth->dst.lwtstate->orig_input = rth->dst.input;
1656                 rth->dst.input = lwtunnel_input;
1657         }
1658 }
1659
1660 /* Create (or reuse from the nexthop cache) the forwarding route for a
      * received packet and attach it to the skb.
      *
      * Returns 0 on success; -EINVAL, -ENOBUFS or the negative result of
      * fib_validate_source() on failure.
      *
      * called in rcu_read_lock() section
      */
1661 static int __mkroute_input(struct sk_buff *skb,
1662                            const struct fib_result *res,
1663                            struct in_device *in_dev,
1664                            __be32 daddr, __be32 saddr, u32 tos)
1665 {
1666         struct fib_nh_exception *fnhe;
1667         struct rtable *rth;
1668         int err;
1669         struct in_device *out_dev;
1670         bool do_cache;
1671         u32 itag = 0;
1672
1673         /* get a working reference to the output device */
1674         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1675         if (!out_dev) {
1676                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1677                 return -EINVAL;
1678         }
1679
1680         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1681                                   in_dev->dev, in_dev, &itag);
1682         if (err < 0) {
1683                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1684                                          saddr);
1685
1686                 goto cleanup;
1687         }
1688
             /* the nexthop cache is only used when the result has a fib_info
              * and source validation produced no itag
              */
1689         do_cache = res->fi && !itag;
             /* IP packet that would leave through the interface it came in on,
              * with a non-zero result from fib_validate_source(): mark the skb
              * with IPSKB_DOREDIRECT if redirects are enabled on the device and
              * the medium is shared or the gateway is on-link for saddr
              */
1690         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1691             skb->protocol == htons(ETH_P_IP) &&
1692             (IN_DEV_SHARED_MEDIA(out_dev) ||
1693              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1694                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1695
1696         if (skb->protocol != htons(ETH_P_IP)) {
1697                 /* Not IP (i.e. ARP). Do not create route, if it is
1698                  * invalid for proxy arp. DNAT routes are always valid.
1699                  *
1700                  * Proxy arp feature have been extended to allow, ARP
1701                  * replies back to the same interface, to support
1702                  * Private VLAN switch technologies. See arp.c.
1703                  */
1704                 if (out_dev == in_dev &&
1705                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1706                         err = -EINVAL;
1707                         goto cleanup;
1708                 }
1709         }
1710
1711         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1712         if (do_cache) {
1713                 if (fnhe) {
                             /* an exception exists for daddr: drop it if its
                              * input route has expired, otherwise try that
                              * route before the per-nexthop one
                              */
1714                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1715                         if (rth && rth->dst.expires &&
1716                             time_after(jiffies, rth->dst.expires)) {
1717                                 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1718                                 fnhe = NULL;
1719                         } else {
1720                                 goto rt_cache;
1721                         }
1722                 }
1723
1724                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1725
1726 rt_cache:
                     /* a valid cached route is attached without taking a
                      * reference (noref)
                      */
1727                 if (rt_cache_valid(rth)) {
1728                         skb_dst_set_noref(skb, &rth->dst);
1729                         goto out;
1730                 }
1731         }
1732
             /* nothing usable in the cache: allocate a new route */
1733         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1734                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1735                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1736         if (!rth) {
1737                 err = -ENOBUFS;
1738                 goto cleanup;
1739         }
1740
1741         rth->rt_is_input = 1;
1742         if (res->table)
1743                 rth->rt_table_id = res->table->tb_id;
1744         RT_CACHE_STAT_INC(in_slow_tot);
1745
1746         rth->dst.input = ip_forward;
1747
1748         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1749                        do_cache);
1750         set_lwt_redirect(rth);
1751         skb_dst_set(skb, &rth->dst);
1752 out:
1753         err = 0;
1754  cleanup:
1755         return err;
1756 }
1757
1758 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1759 /* Key the multipath hash on the packet's own addresses, except for ICMP
1760  * errors: those are keyed on the IP header they embed, to follow that flow.
1761  */
1762 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1763                                  struct flow_keys *hash_keys)
1764 {
1765         const struct iphdr *outer_iph = ip_hdr(skb);
1766         struct icmphdr _icmph;
1767         struct iphdr _inner_iph;
1768         const struct icmphdr *icmph;
1769         const struct iphdr *inner_iph;
1770         int icmp_off;
1771
1772         hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1773         hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1774         if (likely(outer_iph->protocol != IPPROTO_ICMP) ||
1775             unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1776                 return;
1777
1778         icmp_off = outer_iph->ihl * 4;
1779         icmph = skb_header_pointer(skb, icmp_off, sizeof(_icmph), &_icmph);
1780         if (!icmph)
1781                 return;
1782         switch (icmph->type) {
1783         case ICMP_DEST_UNREACH:
1784         case ICMP_REDIRECT:
1785         case ICMP_TIME_EXCEEDED:
1786         case ICMP_PARAMETERPROB:
1787                 break;
1788         default:
1789                 return;
1790         }
1791         inner_iph = skb_header_pointer(skb, icmp_off + sizeof(_icmph),
1792                                        sizeof(_inner_iph), &_inner_iph);
1793         if (!inner_iph)
1794                 return;
1795         hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1796         hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1797 }
1798
1799 /* Compute the hash used to select one nexthop of a multipath route.
      *
      * net->ipv4.sysctl_fib_multipath_hash_policy selects the keys:
      *   0 - source and destination address only
      *   1 - addresses, ports and IP protocol
      *
      * If skb is set it will be used and fl4 can be NULL.
      *
      * Returns the flow hash shifted right by one bit.
      */
1800 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1801                        const struct sk_buff *skb)
1802 {
1803         struct net *net = fi->fib_net;
1804         struct flow_keys hash_keys;
1805         u32 mhash;
1806
1807         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1808         case 0:
1809                 memset(&hash_keys, 0, sizeof(hash_keys));
1810                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1811                 if (skb) {
1812                         ip_multipath_l3_keys(skb, &hash_keys);
1813                 } else {
1814                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1815                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1816                 }
1817                 break;
1818         case 1:
1819                 /* skb is currently provided only when forwarding */
1820                 if (skb) {
1821                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1822                         struct flow_keys keys;
1823
1824                         /* short-circuit if we already have L4 hash present */
1825                         if (skb->l4_hash)
1826                                 return skb_get_hash_raw(skb) >> 1;
1827                         memset(&hash_keys, 0, sizeof(hash_keys));
                             /* Only selected fields are copied from the dissected
                              * keys, so addr_type has to be set by hand like in
                              * the other branches; without it the address keys
                              * do not take part in flow_hash_from_keys().
                              */
                             hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1828                         skb_flow_dissect_flow_keys(skb, &keys, flag);
1829                         hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1830                         hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1831                         hash_keys.ports.src = keys.ports.src;
1832                         hash_keys.ports.dst = keys.ports.dst;
1833                         hash_keys.basic.ip_proto = keys.basic.ip_proto;
1834                 } else {
1835                         memset(&hash_keys, 0, sizeof(hash_keys));
1836                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1837                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1838                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1839                         hash_keys.ports.src = fl4->fl4_sport;
1840                         hash_keys.ports.dst = fl4->fl4_dport;
1841                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1842                 }
1843                 break;
1844         }
1845         mhash = flow_hash_from_keys(&hash_keys);
1846
1847         return mhash >> 1;
1848 }
1849 EXPORT_SYMBOL_GPL(fib_multipath_hash);
1850 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1851
1852 static int ip_mkroute_input(struct sk_buff *skb,
1853                             struct fib_result *res,
1854                             struct in_device *in_dev,
1855                             __be32 daddr, __be32 saddr, u32 tos)
1856 {
1857 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1858         const struct fib_info *fi = res->fi;
1859
1860         /* several nexthops: let the flow hash of the packet choose one */
1861         if (fi && fi->fib_nhs > 1)
1862                 fib_select_multipath(res, fib_multipath_hash(fi, NULL, skb));
1863 #endif
1864
1865         /* build the input route (or reuse a cached one) for that nexthop */
1866         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1867 }
1868
1869 /*
      *      Slow-path input route resolution for a received packet: rejects
      *      martian source/destination addresses, looks the destination up
      *      in the FIB and attaches a broadcast, local, unreachable or
      *      forwarding route to the skb.  Returns 0 or a negative errno.
      *
1870  *      NOTE. We drop all the packets that has local source
1871  *      addresses, because every properly looped back packet
1872  *      must have correct destination already attached by output routine.
1873  *
1874  *      Such approach solves two big problems:
1875  *      1. Not simplex devices are handled properly.
1876  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1877  *      called with rcu_read_lock()
1878  */
1879
1880 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1881                                u8 tos, struct net_device *dev,
1882                                struct fib_result *res)
1883 {
1884         struct in_device *in_dev = __in_dev_get_rcu(dev);
1885         struct ip_tunnel_info *tun_info;
1886         struct flowi4   fl4;
1887         unsigned int    flags = 0;
1888         u32             itag = 0;
1889         struct rtable   *rth;
1890         int             err = -EINVAL;
1891         struct net    *net = dev_net(dev);
1892         bool do_cache;
1893
1894         /* IP on this device is disabled. */
1895
1896         if (!in_dev)
1897                 goto out;
1898
1899         /* Check for the most weird martians, which can be not detected
1900            by fib_lookup.
1901          */
1902
             /* tunnel metadata not flagged IP_TUNNEL_INFO_TX contributes its
              * tunnel id to the flow key; otherwise the id is zero
              */
1903         tun_info = skb_tunnel_info(skb);
1904         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1905                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1906         else
1907                 fl4.flowi4_tun_key.tun_id = 0;
1908         skb_dst_drop(skb);
1909
1910         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1911                 goto martian_source;
1912
1913         res->fi = NULL;
1914         res->table = NULL;
1915         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1916                 goto brd_input;
1917
1918         /* Accept zero addresses only to limited broadcast;
1919          * I even do not know to fix it or not. Waiting for complains :-)
1920          */
1921         if (ipv4_is_zeronet(saddr))
1922                 goto martian_source;
1923
1924         if (ipv4_is_zeronet(daddr))
1925                 goto martian_destination;
1926
1927         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1928          * and call it once if daddr or/and saddr are loopback addresses
1929          */
1930         if (ipv4_is_loopback(daddr)) {
1931                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1932                         goto martian_destination;
1933         } else if (ipv4_is_loopback(saddr)) {
1934                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1935                         goto martian_source;
1936         }
1937
1938         /*
1939          *      Now we are ready to route packet.
1940          */
1941         fl4.flowi4_oif = 0;
1942         fl4.flowi4_iif = dev->ifindex;
1943         fl4.flowi4_mark = skb->mark;
1944         fl4.flowi4_tos = tos;
1945         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1946         fl4.flowi4_flags = 0;
1947         fl4.daddr = daddr;
1948         fl4.saddr = saddr;
1949         fl4.flowi4_uid = sock_net_uid(net, NULL);
1950         err = fib_lookup(net, &fl4, res, 0);
1951         if (err != 0) {
                     /* with forwarding disabled the lookup error is replaced
                      * by -EHOSTUNREACH
                      */
1952                 if (!IN_DEV_FORWARD(in_dev))
1953                         err = -EHOSTUNREACH;
1954                 goto no_route;
1955         }
1956
1957         if (res->type == RTN_BROADCAST)
1958                 goto brd_input;
1959
1960         if (res->type == RTN_LOCAL) {
1961                 err = fib_validate_source(skb, saddr, daddr, tos,
1962                                           0, dev, in_dev, &itag);
1963                 if (err < 0)
1964                         goto martian_source;
1965                 goto local_input;
1966         }
1967
             /* from here on the packet would have to be forwarded */
1968         if (!IN_DEV_FORWARD(in_dev)) {
1969                 err = -EHOSTUNREACH;
1970                 goto no_route;
1971         }
1972         if (res->type != RTN_UNICAST)
1973                 goto martian_destination;
1974
1975         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1976 out:    return err;
1977
1978 brd_input:
             /* broadcast input is only accepted for ETH_P_IP packets */
1979         if (skb->protocol != htons(ETH_P_IP))
1980                 goto e_inval;
1981
1982         if (!ipv4_is_zeronet(saddr)) {
1983                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1984                                           in_dev, &itag);
1985                 if (err < 0)
1986                         goto martian_source;
1987         }
1988         flags |= RTCF_BROADCAST;
1989         res->type = RTN_BROADCAST;
1990         RT_CACHE_STAT_INC(in_brd);
1991
1992 local_input:
             /* the per-nexthop cached input route is consulted, and a new
              * route cached, only when the result has a fib_info and no itag
              * was produced
              */
1993         do_cache = false;
1994         if (res->fi) {
1995                 if (!itag) {
1996                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1997                         if (rt_cache_valid(rth)) {
1998                                 skb_dst_set_noref(skb, &rth->dst);
1999                                 err = 0;
2000                                 goto out;
2001                         }
2002                         do_cache = true;
2003                 }
2004         }
2005
2006         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2007                            flags | RTCF_LOCAL, res->type,
2008                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2009         if (!rth)
2010                 goto e_nobufs;
2011
2012         rth->dst.output= ip_rt_bug;
2013 #ifdef CONFIG_IP_ROUTE_CLASSID
2014         rth->dst.tclassid = itag;
2015 #endif
2016         rth->rt_is_input = 1;
2017         if (res->table)
2018                 rth->rt_table_id = res->table->tb_id;
2019
2020         RT_CACHE_STAT_INC(in_slow_tot);
             /* reached via no_route: the route reports the (negated) lookup
              * error through ip_error and is not flagged local
              */
2021         if (res->type == RTN_UNREACHABLE) {
2022                 rth->dst.input= ip_error;
2023                 rth->dst.error= -err;
2024                 rth->rt_flags   &= ~RTCF_LOCAL;
2025         }
2026
2027         if (do_cache) {
2028                 struct fib_nh *nh = &FIB_RES_NH(*res);
2029
2030                 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2031                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2032                         WARN_ON(rth->dst.input == lwtunnel_input);
2033                         rth->dst.lwtstate->orig_input = rth->dst.input;
2034                         rth->dst.input = lwtunnel_input;
2035                 }
2036
                     /* if the route could not be stored in the nexthop cache
                      * it is put on the uncached list instead
                      */
2037                 if (unlikely(!rt_cache_route(nh, rth)))
2038                         rt_add_uncached_list(rth);
2039         }
2040         skb_dst_set(skb, &rth->dst);
2041         err = 0;
2042         goto out;
2043
2044 no_route:
             /* continue as local input with an RTN_UNREACHABLE result; err
              * still holds the failure that brought us here
              */
2045         RT_CACHE_STAT_INC(in_no_route);
2046         res->type = RTN_UNREACHABLE;
2047         res->fi = NULL;
2048         res->table = NULL;
2049         goto local_input;
2050
2051         /*
2052          *      Do not cache martian addresses: they should be logged (RFC1812)
2053          */
2054 martian_destination:
2055         RT_CACHE_STAT_INC(in_martian_dst);
2056 #ifdef CONFIG_IP_ROUTE_VERBOSE
2057         if (IN_DEV_LOG_MARTIANS(in_dev))
2058                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2059                                      &daddr, &saddr, dev->name);
2060 #endif
2061
2062 e_inval:
2063         err = -EINVAL;
2064         goto out;
2065
2066 e_nobufs:
2067         err = -ENOBUFS;
2068         goto out;
2069
2070 martian_source:
             /* err is either the initial -EINVAL or the negative value
              * returned by fib_validate_source()
              */
2071         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2072         goto out;
2073 }
2074
2075 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2076                          u8 tos, struct net_device *dev)
2077 {
2078         /* res is scratch space for the lookup; it is not handed back */
2079         struct fib_result res;
2080         u8 rt_tos = tos & IPTOS_RT_MASK;
2081         int ret;
2082
2083         rcu_read_lock();        /* ip_route_input_rcu() must run under RCU */
2084         ret = ip_route_input_rcu(skb, daddr, saddr, rt_tos, dev, &res);
2085         rcu_read_unlock();
2086         return ret;
2087 }
2088 EXPORT_SYMBOL(ip_route_input_noref);
2089
2090 /* Input route lookup: multicast destinations are matched against the
      * multicast filters here and handed to ip_route_input_mc(); every other
      * destination is resolved by ip_route_input_slow().
      *
      * Returns 0 or a negative errno (-EINVAL for multicast that is neither
      * ours nor to be multicast-forwarded).
      *
      * called with rcu_read_lock held
      */
2091 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2092                        u8 tos, struct net_device *dev, struct fib_result *res)
2093 {
2094         /* Multicast recognition logic is moved from route cache to here.
2095            The problem was that too many Ethernet cards have broken/missing
2096            hardware multicast filters :-( As result the host on multicasting
2097            network acquires a lot of useless route cache entries, sort of
2098            SDR messages from all the world. Now we try to get rid of them.
2099            Really, provided software IP multicast filter is organized
2100            reasonably (at least, hashed), it does not result in a slowdown
2101            comparing with route cache reject entries.
2102            Note, that multicast routers are not affected, because
2103            route cache entry is created eventually.
2104          */
2105         if (ipv4_is_multicast(daddr)) {
2106                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2107                 int our = 0;
2108                 int err = -EINVAL;
2109
2110                 if (in_dev)
2111                         our = ip_check_mc_rcu(in_dev, daddr, saddr,
2112                                               ip_hdr(skb)->protocol);
2113
2114                 /* check l3 master if no match yet */
2115                 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2116                         struct in_device *l3_in_dev;
2117
2118                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2119                         if (l3_in_dev)
2120                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2121                                                       ip_hdr(skb)->protocol);
2122                 }
2123
                     /* in_dev may still be NULL at this point, so it has to be
                      * tested before IN_DEV_MFORWARD() dereferences it
                      */
2124                 if (our
2125 #ifdef CONFIG_IP_MROUTE
2126                         ||
2127                     (in_dev && !ipv4_is_local_multicast(daddr) &&
2128                      IN_DEV_MFORWARD(in_dev))
2129 #endif
2130                    ) {
2131                         err = ip_route_input_mc(skb, daddr, saddr,
2132                                                 tos, dev, our);
2133                 }
2134                 return err;
2135         }
2136
2137         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2138 }
2139
2140 /* Build (or fetch from the nexthop cache) the output route for @fl4 leaving
      * through @dev_out.
      *
      * @res:      FIB lookup result the route is based on
      * @orig_oif: stored in the new route as rt_iif
      * @flags:    initial RTCF_* flags, extended here as needed
      *
      * Returns the rtable, or ERR_PTR(-EINVAL) / ERR_PTR(-ENOBUFS).
      *
      * called with rcu_read_lock()
      */
2141 static struct rtable *__mkroute_output(const struct fib_result *res,
2142                                        const struct flowi4 *fl4, int orig_oif,
2143                                        struct net_device *dev_out,
2144                                        unsigned int flags)
2145 {
2146         struct fib_info *fi = res->fi;
2147         struct fib_nh_exception *fnhe;
2148         struct in_device *in_dev;
2149         u16 type = res->type;
2150         struct rtable *rth;
2151         bool do_cache;
2152
2153         in_dev = __in_dev_get_rcu(dev_out);
2154         if (!in_dev)
2155                 return ERR_PTR(-EINVAL);
2156
             /* unless route_localnet is enabled, a loopback source address is
              * only accepted on a loopback or L3 master device
              */
2157         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2158                 if (ipv4_is_loopback(fl4->saddr) &&
2159                     !(dev_out->flags & IFF_LOOPBACK) &&
2160                     !netif_is_l3_master(dev_out))
2161                         return ERR_PTR(-EINVAL);
2162
             /* the destination address overrides the route type from the FIB */
2163         if (ipv4_is_lbcast(fl4->daddr))
2164                 type = RTN_BROADCAST;
2165         else if (ipv4_is_multicast(fl4->daddr))
2166                 type = RTN_MULTICAST;
2167         else if (ipv4_is_zeronet(fl4->daddr))
2168                 return ERR_PTR(-EINVAL);
2169
2170         if (dev_out->flags & IFF_LOOPBACK)
2171                 flags |= RTCF_LOCAL;
2172
2173         do_cache = true;
2174         if (type == RTN_BROADCAST) {
2175                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2176                 fi = NULL;
2177         } else if (type == RTN_MULTICAST) {
2178                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
                     /* RTCF_LOCAL is kept only if ip_check_mc_rcu() matches
                      * on this device; such routes are not cached
                      */
2179                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2180                                      fl4->flowi4_proto))
2181                         flags &= ~RTCF_LOCAL;
2182                 else
2183                         do_cache = false;
2184                 /* If multicast route do not exist use
2185                  * default one, but do not gateway in this case.
2186                  * Yes, it is hack.
2187                  */
2188                 if (fi && res->prefixlen < 4)
2189                         fi = NULL;
2190         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2191                    (orig_oif != dev_out->ifindex)) {
2192                 /* For local routes that require a particular output interface
2193                  * we do not want to cache the result.  Caching the result
2194                  * causes incorrect behaviour when there are multiple source
2195                  * addresses on the interface, the end result being that if the
2196                  * intended recipient is waiting on that interface for the
2197                  * packet he won't receive it because it will be delivered on
2198                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2199                  * be set to the loopback interface as well.
2200                  */
2201                 fi = NULL;
2202         }
2203
2204         fnhe = NULL;
             /* without a fib_info there is no nexthop to cache on */
2205         do_cache &= fi != NULL;
2206         if (do_cache) {
2207                 struct rtable __rcu **prth;
2208                 struct fib_nh *nh = &FIB_RES_NH(*res);
2209
2210                 fnhe = find_exception(nh, fl4->daddr);
2211                 if (fnhe) {
                             /* an exception exists for daddr: drop it if its
                              * output route has expired, otherwise try that
                              * route first
                              */
2212                         prth = &fnhe->fnhe_rth_output;
2213                         rth = rcu_dereference(*prth);
2214                         if (rth && rth->dst.expires &&
2215                             time_after(jiffies, rth->dst.expires)) {
2216                                 ip_del_fnhe(nh, fl4->daddr);
2217                                 fnhe = NULL;
2218                         } else {
2219                                 goto rt_cache;
2220                         }
2221                 }
2222
                     /* FLOWI_FLAG_KNOWN_NH is set but the nexthop is not a
                      * link-scope gateway: skip the cache for this route
                      */
2223                 if (unlikely(fl4->flowi4_flags &
2224                              FLOWI_FLAG_KNOWN_NH &&
2225                              !(nh->nh_gw &&
2226                                nh->nh_scope == RT_SCOPE_LINK))) {
2227                         do_cache = false;
2228                         goto add;
2229                 }
2230                 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2231                 rth = rcu_dereference(*prth);
2232
2233 rt_cache:
                     /* the cached route is returned only if it is valid and a
                      * reference could be taken on it
                      */
2234                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2235                         return rth;
2236         }
2237
2238 add:
2239         rth = rt_dst_alloc(dev_out, flags, type,
2240                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2241                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2242                            do_cache);
2243         if (!rth)
2244                 return ERR_PTR(-ENOBUFS);
2245
2246         rth->rt_iif = orig_oif;
2247         if (res->table)
2248                 rth->rt_table_id = res->table->tb_id;
2249
2250         RT_CACHE_STAT_INC(out_slow_tot);
2251
2252         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                     /* locally relevant broadcast/multicast on a non-loopback
                      * device is sent through ip_mc_output
                      */
2253                 if (flags & RTCF_LOCAL &&
2254                     !(dev_out->flags & IFF_LOOPBACK)) {
2255                         rth->dst.output = ip_mc_output;
2256                         RT_CACHE_STAT_INC(out_slow_mc);
2257                 }
2258 #ifdef CONFIG_IP_MROUTE
2259                 if (type == RTN_MULTICAST) {
2260                         if (IN_DEV_MFORWARD(in_dev) &&
2261                             !ipv4_is_local_multicast(fl4->daddr)) {
2262                                 rth->dst.input = ip_mr_input;
2263                                 rth->dst.output = ip_mc_output;
2264                         }
2265                 }
2266 #endif
2267         }
2268
2269         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2270         set_lwt_redirect(rth);
2271
2272         return rth;
2273 }
2274
2275 /* Major route resolver routine: fix up the iif, tos and scope of the flow
2276  * key, then run the lookup inside an RCU read-side critical section.
2277  */
2278 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2279                                         const struct sk_buff *skb)
2280 {
2281         const __u8 flow_tos = RT_FL_TOS(fl4);
2282         struct fib_result res;
2283         struct rtable *rt;
2284
2285         res.tclassid = 0;
2286         res.fi = NULL;
2287         res.table = NULL;
2288
2289         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2290         fl4->flowi4_tos = flow_tos & IPTOS_RT_MASK;
2291         if (flow_tos & RTO_ONLINK)
2292                 fl4->flowi4_scope = RT_SCOPE_LINK;
2293         else
2294                 fl4->flowi4_scope = RT_SCOPE_UNIVERSE;
2295
2296         rcu_read_lock();
2297         rt = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2298         rcu_read_unlock();
2299         return rt;
2300 }
2301 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2302
2303 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2304                                             struct fib_result *res,
2305                                             const struct sk_buff *skb)
2306 {
             /* Resolve the output route for @fl4: validate the source address,
              * fill in a missing saddr / daddr / oif, look the destination up
              * in the FIB and build the route with __mkroute_output().
              * Returns the rtable or an ERR_PTR().  fl4 and res are updated
              * along the way.
              * NOTE(review): uses *_rcu helpers and the visible caller holds
              * rcu_read_lock(); presumably that is a requirement - confirm.
              */
2307         struct net_device *dev_out = NULL;
2308         int orig_oif = fl4->flowi4_oif;
2309         unsigned int flags = 0;
2310         struct rtable *rth;
2311         int err = -ENETUNREACH;
2312
2313         if (fl4->saddr) {
                     /* rth doubles as the error result for the "goto out"
                      * exits of this block
                      */
2314                 rth = ERR_PTR(-EINVAL);
2315                 if (ipv4_is_multicast(fl4->saddr) ||
2316                     ipv4_is_lbcast(fl4->saddr) ||
2317                     ipv4_is_zeronet(fl4->saddr))
2318                         goto out;
2319
2320                 /* I removed check for oif == dev_out->oif here.
2321                    It was wrong for two reasons:
2322                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2323                       is assigned to multiple interfaces.
2324                    2. Moreover, we are allowed to send packets with saddr
2325                       of another iface. --ANK
2326                  */
2327
2328                 if (fl4->flowi4_oif == 0 &&
2329                     (ipv4_is_multicast(fl4->daddr) ||
2330                      ipv4_is_lbcast(fl4->daddr))) {
2331                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2332                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2333                         if (!dev_out)
2334                                 goto out;
2335
2336                         /* Special hack: user can direct multicasts
2337                            and limited broadcast via necessary interface
2338                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2339                            This hack is not just for fun, it allows
2340                            vic,vat and friends to work.
2341                            They bind socket to loopback, set ttl to zero
2342                            and expect that it will work.
2343                            From the viewpoint of routing cache they are broken,
2344                            because we are not allowed to build multicast path
2345                            with loopback source addr (look, routing cache
2346                            cannot know, that ttl is zero, so that packet
2347                            will not leave this host and route is valid).
2348                            Luckily, this hack is good workaround.
2349                          */
2350
2351                         fl4->flowi4_oif = dev_out->ifindex;
2352                         goto make_route;
2353                 }
2354
2355                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2356                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2357                         if (!__ip_dev_find(net, fl4->saddr, false))
2358                                 goto out;
2359                 }
2360         }
2361
2362
2363         if (fl4->flowi4_oif) {
                     /* an output interface was requested: it must exist, be up
                      * and have an in_device
                      */
2364                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2365                 rth = ERR_PTR(-ENODEV);
2366                 if (!dev_out)
2367                         goto out;
2368
2369                 /* RACE: Check return value of inet_select_addr instead. */
2370                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2371                         rth = ERR_PTR(-ENETUNREACH);
2372                         goto out;
2373                 }
                     /* link-local multicast, limited broadcast and IGMP go
                      * straight out of that interface without a FIB lookup
                      */
2374                 if (ipv4_is_local_multicast(fl4->daddr) ||
2375                     ipv4_is_lbcast(fl4->daddr) ||
2376                     fl4->flowi4_proto == IPPROTO_IGMP) {
2377                         if (!fl4->saddr)
2378                                 fl4->saddr = inet_select_addr(dev_out, 0,
2379                                                               RT_SCOPE_LINK);
2380                         goto make_route;
2381                 }
2382                 if (!fl4->saddr) {
2383                         if (ipv4_is_multicast(fl4->daddr))
2384                                 fl4->saddr = inet_select_addr(dev_out, 0,
2385                                                               fl4->flowi4_scope);
2386                         else if (!fl4->daddr)
2387                                 fl4->saddr = inet_select_addr(dev_out, 0,
2388                                                               RT_SCOPE_HOST);
2389                 }
2390         }
2391
             /* no destination: use the source address as destination, or
              * INADDR_LOOPBACK for both if that is unset too, and route over
              * the loopback device
              */
2392         if (!fl4->daddr) {
2393                 fl4->daddr = fl4->saddr;
2394                 if (!fl4->daddr)
2395                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2396                 dev_out = net->loopback_dev;
2397                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2398                 res->type = RTN_LOCAL;
2399                 flags |= RTCF_LOCAL;
2400                 goto make_route;
2401         }
2402
2403         err = fib_lookup(net, fl4, res, 0);
2404         if (err) {
2405                 res->fi = NULL;
2406                 res->table = NULL;
2407                 if (fl4->flowi4_oif &&
2408                     (ipv4_is_multicast(fl4->daddr) ||
2409                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2410                         /* Apparently, routing tables are wrong. Assume,
2411                            that the destination is on link.
2412
2413                            WHY? DW.
2414                            Because we are allowed to send to iface
2415                            even if it has NO routes and NO assigned
2416                            addresses. When oif is specified, routing
2417                            tables are looked up with only one purpose:
2418                            to catch if destination is gatewayed, rather than
2419                            direct. Moreover, if MSG_DONTROUTE is set,
2420                            we send packet, ignoring both routing tables
2421                            and ifaddr state. --ANK
2422
2423
2424                            We could make it even if oif is unknown,
2425                            likely IPv6, but we do not.
2426                          */
2427
2428                         if (fl4->saddr == 0)
2429                                 fl4->saddr = inet_select_addr(dev_out, 0,
2430                                                               RT_SCOPE_LINK);
2431                         res->type = RTN_UNICAST;
2432                         goto make_route;
2433                 }
2434                 rth = ERR_PTR(err);
2435                 goto out;
2436         }
2437
2438         if (res->type == RTN_LOCAL) {
                     /* local destination: prefer the route's prefsrc as
                      * source, else reuse the destination address
                      */
2439                 if (!fl4->saddr) {
2440                         if (res->fi->fib_prefsrc)
2441                                 fl4->saddr = res->fi->fib_prefsrc;
2442                         else
2443                                 fl4->saddr = fl4->daddr;
2444                 }
2445
2446                 /* L3 master device is the loopback for that domain */
2447                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2448                         net->loopback_dev;
2449
2450                 /* make sure orig_oif points to fib result device even
2451                  * though packet rx/tx happens over loopback or l3mdev
2452                  */
2453                 orig_oif = FIB_RES_OIF(*res);
2454
2455                 fl4->flowi4_oif = dev_out->ifindex;
2456                 flags |= RTCF_LOCAL;
2457                 goto make_route;
2458         }
2459
2460         fib_select_path(net, res, fl4, skb);
2461
2462         dev_out = FIB_RES_DEV(*res);
2463         fl4->flowi4_oif = dev_out->ifindex;
2464
2465
2466 make_route:
2467         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2468
2469 out:
2470         return rth;
2471 }
2472
     /*
      * .check callback of ipv4_dst_blackhole_ops.
      * Unconditionally returns NULL; neither @dst nor @cookie is examined.
      */
2473 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2474 {
2475         return NULL;
2476 }
2477
2478 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2479 {
2480         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2481
2482         return mtu ? : dst->dev->mtu;
2483 }
2484
     /*
      * .update_pmtu callback of ipv4_dst_blackhole_ops.
      * Intentionally empty: the requested @mtu is ignored.
      */
2485 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2486                                           struct sk_buff *skb, u32 mtu)
2487 {
2488 }
2489
     /*
      * .redirect callback of ipv4_dst_blackhole_ops.
      * Intentionally empty: nothing is done with @dst, @sk or @skb.
      */
2490 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2491                                        struct sk_buff *skb)
2492 {
2493 }
2494
     /*
      * .cow_metrics callback of ipv4_dst_blackhole_ops.
      * Always returns NULL; @dst and @old are not used.
      */
2495 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2496                                           unsigned long old)
2497 {
2498         return NULL;
2499 }
2500
     /*
      * dst_ops handed to dst_alloc() by ipv4_blackhole_route().
      * Its kmem_cachep is not set here; ip_rt_init() points it at the
      * same slab cache as ipv4_dst_ops.
      */
2501 static struct dst_ops ipv4_dst_blackhole_ops = {
2502         .family                 =       AF_INET,
2503         .check                  =       ipv4_blackhole_dst_check,
2504         .mtu                    =       ipv4_blackhole_mtu,
2505         .default_advmss         =       ipv4_default_advmss,
2506         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2507         .redirect               =       ipv4_rt_blackhole_redirect,
2508         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2509         .neigh_lookup           =       ipv4_neigh_lookup,
2510 };
2511
2512 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2513 {
2514         struct rtable *ort = (struct rtable *) dst_orig;
2515         struct rtable *rt;
2516
2517         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2518         if (rt) {
2519                 struct dst_entry *new = &rt->dst;
2520
2521                 new->__use = 1;
2522                 new->input = dst_discard;
2523                 new->output = dst_discard_out;
2524
2525                 new->dev = net->loopback_dev;
2526                 if (new->dev)
2527                         dev_hold(new->dev);
2528
2529                 rt->rt_is_input = ort->rt_is_input;
2530                 rt->rt_iif = ort->rt_iif;
2531                 rt->rt_pmtu = ort->rt_pmtu;
2532
2533                 rt->rt_genid = rt_genid_ipv4(net);
2534                 rt->rt_flags = ort->rt_flags;
2535                 rt->rt_type = ort->rt_type;
2536                 rt->rt_gateway = ort->rt_gateway;
2537                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2538
2539                 INIT_LIST_HEAD(&rt->rt_uncached);
2540         }
2541
2542         dst_release(dst_orig);
2543
2544         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2545 }
2546
2547 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2548                                     const struct sock *sk)
2549 {
2550         struct rtable *rt = __ip_route_output_key(net, flp4);
2551
2552         if (IS_ERR(rt))
2553                 return rt;
2554
2555         if (flp4->flowi4_proto)
2556                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2557                                                         flowi4_to_flowi(flp4),
2558                                                         sk, 0);
2559
2560         return rt;
2561 }
2562 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2563
     /*
      * rt_fill_info - append an RTM_NEWROUTE message describing the route
      * attached to @skb (skb_rtable(skb)) to that same @skb.
      *
      * @net:      namespace, used for the multicast-forwarding check
      * @dst:      destination address, emitted as RTA_DST
      * @src:      source address, emitted as RTA_SRC when non-zero
      * @table_id: table id, emitted as RTA_TABLE (and in rtm_table if < 256)
      * @fl4:      flow key supplying tos, saddr, mark and uid
      * @skb:      carries the route and receives the message
      * @portid:   netlink port id for the message header
      * @seq:      netlink sequence number for the message header
      *
      * Returns 0 on success, or -EMSGSIZE when the header or an attribute
      * does not fit (the partially built message is cancelled).
      */
2564 /* called with rcu_read_lock held */
2565 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2566                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2567                         u32 seq)
2568 {
2569         struct rtable *rt = skb_rtable(skb);
2570         struct rtmsg *r;
2571         struct nlmsghdr *nlh;
2572         unsigned long expires = 0;
2573         u32 error;
2574         u32 metrics[RTAX_MAX];
2575
2576         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2577         if (!nlh)
2578                 return -EMSGSIZE;
2579
2580         r = nlmsg_data(nlh);
2581         r->rtm_family    = AF_INET;
2582         r->rtm_dst_len  = 32;
2583         r->rtm_src_len  = 0;
2584         r->rtm_tos      = fl4->flowi4_tos;
             /* Ids of 256 and above are reported as RT_TABLE_COMPAT in the
              * header; the full id always goes out in RTA_TABLE.
              */
2585         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2586         if (nla_put_u32(skb, RTA_TABLE, table_id))
2587                 goto nla_put_failure;
2588         r->rtm_type     = rt->rt_type;
2589         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2590         r->rtm_protocol = RTPROT_UNSPEC;
             /* Low 16 bits of rt_flags are masked off before export. */
2591         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2592         if (rt->rt_flags & RTCF_NOTIFY)
2593                 r->rtm_flags |= RTM_F_NOTIFY;
2594         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2595                 r->rtm_flags |= RTCF_DOREDIRECT;
2596
2597         if (nla_put_in_addr(skb, RTA_DST, dst))
2598                 goto nla_put_failure;
2599         if (src) {
2600                 r->rtm_src_len = 32;
2601                 if (nla_put_in_addr(skb, RTA_SRC, src))
2602                         goto nla_put_failure;
2603         }
2604         if (rt->dst.dev &&
2605             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2606                 goto nla_put_failure;
2607 #ifdef CONFIG_IP_ROUTE_CLASSID
2608         if (rt->dst.tclassid &&
2609             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2610                 goto nla_put_failure;
2611 #endif
             /* For output routes, report the flow's source address as
              * RTA_PREFSRC when it differs from @src.
              */
2612         if (!rt_is_input_route(rt) &&
2613             fl4->saddr != src) {
2614                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2615                         goto nla_put_failure;
2616         }
2617         if (rt->rt_uses_gateway &&
2618             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2619                 goto nla_put_failure;
2620
             /* Turn the absolute expiry into a remaining time in jiffies;
              * an expiry that is not in the future is reported as 0.
              */
2621         expires = rt->dst.expires;
2622         if (expires) {
2623                 unsigned long now = jiffies;
2624
2625                 if (time_before(now, expires))
2626                         expires -= now;
2627                 else
2628                         expires = 0;
2629         }
2630
             /* Work on a local copy of the metrics so the MTU slot can be
              * overridden by rt_pmtu while that value has time left.
              */
2631         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2632         if (rt->rt_pmtu && expires)
2633                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2634         if (rtnetlink_put_metrics(skb, metrics) < 0)
2635                 goto nla_put_failure;
2636
2637         if (fl4->flowi4_mark &&
2638             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2639                 goto nla_put_failure;
2640
2641         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2642             nla_put_u32(skb, RTA_UID,
2643                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2644                 goto nla_put_failure;
2645
2646         error = rt->dst.error;
2647
2648         if (rt_is_input_route(rt)) {
2649 #ifdef CONFIG_IP_MROUTE
                     /* Non-local multicast destination with multicast
                      * forwarding enabled: let ipmr_get_route() fill in the
                      * reply. A zero result returns immediately, skipping the
                      * cacheinfo attribute and nlmsg_end() below; a negative
                      * result is treated as a put failure; a positive result
                      * falls through to the common tail.
                      */
2650                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2651                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2652                         int err = ipmr_get_route(net, skb,
2653                                                  fl4->saddr, fl4->daddr,
2654                                                  r, portid);
2655
2656                         if (err <= 0) {
2657                                 if (err == 0)
2658                                         return 0;
2659                                 goto nla_put_failure;
2660                         }
2661                 } else
2662 #endif
                             /* Without CONFIG_IP_MROUTE this is the only
                              * statement in the enclosing block.
                              */
2663                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2664                                 goto nla_put_failure;
2665         }
2666
2667         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2668                 goto nla_put_failure;
2669
2670         nlmsg_end(skb, nlh);
2671         return 0;
2672
2673 nla_put_failure:
2674         nlmsg_cancel(skb, nlh);
2675         return -EMSGSIZE;
2676 }
2677
     /*
      * inet_rtm_getroute - handler for RTM_GETROUTE requests (registered for
      * PF_INET in ip_rt_init()).
      *
      * @in_skb: request skb; supplies the namespace and the requester portid
      * @nlh:    request header; its rtmsg supplies tos and flags
      * @extack: passed to nlmsg_parse()
      *
      * Parses the request attributes, allocates a reply skb with a dummy IP
      * header, then performs either an input lookup (RTA_IIF given) or an
      * output lookup under rcu_read_lock(). The reply is built by
      * fib_dump_info() when RTM_F_FIB_MATCH is set, by rt_fill_info()
      * otherwise, and unicast back to the requester.
      *
      * Returns the result of rtnl_unicast() on success; on failure the reply
      * skb is freed and the error code from the failing step is returned.
      */
2678 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2679                              struct netlink_ext_ack *extack)
2680 {
2681         struct net *net = sock_net(in_skb->sk);
2682         struct rtmsg *rtm;
2683         struct nlattr *tb[RTA_MAX+1];
2684         struct fib_result res = {};
2685         struct rtable *rt = NULL;
2686         struct flowi4 fl4;
2687         __be32 dst = 0;
2688         __be32 src = 0;
2689         u32 iif;
2690         int err;
2691         int mark;
2692         struct sk_buff *skb;
2693         u32 table_id = RT_TABLE_MAIN;
2694         kuid_t uid;
2695
2696         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2697                           extack);
2698         if (err < 0)
2699                 goto errout;
2700
2701         rtm = nlmsg_data(nlh);
2702
2703         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2704         if (!skb) {
2705                 err = -ENOBUFS;
2706                 goto errout;
2707         }
2708
2709         /* Reserve room for dummy headers, this skb can pass
2710            through good chunk of routing engine.
2711          */
2712         skb_reset_mac_header(skb);
2713         skb_reset_network_header(skb);
2714
             /* Absent attributes default to 0. */
2715         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2716         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2717         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2718         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
             /* Without RTA_UID: input lookups use INVALID_UID, output
              * lookups use the caller's uid.
              */
2719         if (tb[RTA_UID])
2720                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2721         else
2722                 uid = (iif ? INVALID_UID : current_uid());
2723
2724         /* Bugfix: need to give ip_route_input enough of an IP header to
2725          * not gag.
2726          */
2727         ip_hdr(skb)->protocol = IPPROTO_UDP;
2728         ip_hdr(skb)->saddr = src;
2729         ip_hdr(skb)->daddr = dst;
2730
             /* The header fields above are written at the network header
              * offset recorded earlier; only now is the data pointer advanced.
              */
2731         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2732
2733         memset(&fl4, 0, sizeof(fl4));
2734         fl4.daddr = dst;
2735         fl4.saddr = src;
2736         fl4.flowi4_tos = rtm->rtm_tos;
2737         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2738         fl4.flowi4_mark = mark;
2739         fl4.flowi4_uid = uid;
2740
2741         rcu_read_lock();
2742
2743         if (iif) {
                     /* Input path: resolve as if the packet had arrived on
                      * the device with index iif.
                      */
2744                 struct net_device *dev;
2745
2746                 dev = dev_get_by_index_rcu(net, iif);
2747                 if (!dev) {
2748                         err = -ENODEV;
2749                         goto errout_free;
2750                 }
2751
2752                 skb->protocol   = htons(ETH_P_IP);
2753                 skb->dev        = dev;
2754                 skb->mark       = mark;
2755                 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2756                                          dev, &res);
2757
                     /* rt is only dereferenced when the lookup succeeded;
                      * a route carrying dst.error is turned into a failure.
                      */
2758                 rt = skb_rtable(skb);
2759                 if (err == 0 && rt->dst.error)
2760                         err = -rt->dst.error;
2761         } else {
                     /* Output path: on success the route is attached to skb
                      * so rt_fill_info() can find it via skb_rtable().
                      */
2762                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2763                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2764                 err = 0;
2765                 if (IS_ERR(rt))
2766                         err = PTR_ERR(rt);
2767                 else
2768                         skb_dst_set(skb, &rt->dst);
2769         }
2770
2771         if (err)
2772                 goto errout_free;
2773
2774         if (rtm->rtm_flags & RTM_F_NOTIFY)
2775                 rt->rt_flags |= RTCF_NOTIFY;
2776
             /* Default is RT_TABLE_MAIN unless the caller asked for the
              * table recorded on the route.
              */
2777         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2778                 table_id = rt->rt_table_id;
2779
2780         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
                     /* A FIB-match reply needs fib info; without it, fail
                      * with the error for the result type, or -EHOSTUNREACH
                      * when that error is 0.
                      */
2781                 if (!res.fi) {
2782                         err = fib_props[res.type].error;
2783                         if (!err)
2784                                 err = -EHOSTUNREACH;
2785                         goto errout_free;
2786                 }
2787                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2788                                     nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2789                                     rt->rt_type, res.prefix, res.prefixlen,
2790                                     fl4.flowi4_tos, res.fi, 0);
2791         } else {
2792                 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2793                                    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2794         }
2795         if (err < 0)
2796                 goto errout_free;
2797
2798         rcu_read_unlock();
2799
2800         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2801 errout:
2802         return err;
2803
             /* Error exit for paths that hold rcu_read_lock() and own skb. */
2804 errout_free:
2805         rcu_read_unlock();
2806         kfree_skb(skb);
2807         goto errout;
2808 }
2809
2810 void ip_rt_multicast_event(struct in_device *in_dev)
2811 {
2812         rt_cache_flush(dev_net(in_dev->dev));
2813 }
2814
2815 #ifdef CONFIG_SYSCTL
     /*
      * Backing storage for the gc_interval, gc_min_interval(_ms) and
      * gc_elasticity entries of ipv4_route_table below. Time values are
      * kept in jiffies. These are only built when CONFIG_SYSCTL is set.
      */
2816 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2817 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2818 static int ip_rt_gc_elasticity __read_mostly    = 8;
2819
2820 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2821                                         void __user *buffer,
2822                                         size_t *lenp, loff_t *ppos)
2823 {
2824         struct net *net = (struct net *)__ctl->extra1;
2825
2826         if (write) {
2827                 rt_cache_flush(net);
2828                 fnhe_genid_bump(net);
2829                 return 0;
2830         }
2831
2832         return -EINVAL;
2833 }
2834
     /*
      * Static sysctl entries registered under "net/ipv4/route" for init_net
      * by ip_static_sysctl_init(). Every entry is an int with mode 0644;
      * the proc_handler selects how the value is converted: plain integer,
      * seconds <-> jiffies, or milliseconds <-> jiffies.
      */
2835 static struct ctl_table ipv4_route_table[] = {
2836         {
2837                 .procname       = "gc_thresh",
2838                 .data           = &ipv4_dst_ops.gc_thresh,
2839                 .maxlen         = sizeof(int),
2840                 .mode           = 0644,
2841                 .proc_handler   = proc_dointvec,
2842         },
2843         {
2844                 .procname       = "max_size",
2845                 .data           = &ip_rt_max_size,
2846                 .maxlen         = sizeof(int),
2847                 .mode           = 0644,
2848                 .proc_handler   = proc_dointvec,
2849         },
2850         {
2851                 /*  Deprecated. Use gc_min_interval_ms */
2852
2853                 .procname       = "gc_min_interval",
2854                 .data           = &ip_rt_gc_min_interval,
2855                 .maxlen         = sizeof(int),
2856                 .mode           = 0644,
2857                 .proc_handler   = proc_dointvec_jiffies,
2858         },
2859         {
                     /* Same variable as gc_min_interval above, exposed
                      * through the millisecond handler.
                      */
2860                 .procname       = "gc_min_interval_ms",
2861                 .data           = &ip_rt_gc_min_interval,
2862                 .maxlen         = sizeof(int),
2863                 .mode           = 0644,
2864                 .proc_handler   = proc_dointvec_ms_jiffies,
2865         },
2866         {
2867                 .procname       = "gc_timeout",
2868                 .data           = &ip_rt_gc_timeout,
2869                 .maxlen         = sizeof(int),
2870                 .mode           = 0644,
2871                 .proc_handler   = proc_dointvec_jiffies,
2872         },
2873         {
2874                 .procname       = "gc_interval",
2875                 .data           = &ip_rt_gc_interval,
2876                 .maxlen         = sizeof(int),
2877                 .mode           = 0644,
2878                 .proc_handler   = proc_dointvec_jiffies,
2879         },
2880         {
2881                 .procname       = "redirect_load",
2882                 .data           = &ip_rt_redirect_load,
2883                 .maxlen         = sizeof(int),
2884                 .mode           = 0644,
2885                 .proc_handler   = proc_dointvec,
2886         },
2887         {
2888                 .procname       = "redirect_number",
2889                 .data           = &ip_rt_redirect_number,
2890                 .maxlen         = sizeof(int),
2891                 .mode           = 0644,
2892                 .proc_handler   = proc_dointvec,
2893         },
2894         {
2895                 .procname       = "redirect_silence",
2896                 .data           = &ip_rt_redirect_silence,
2897                 .maxlen         = sizeof(int),
2898                 .mode           = 0644,
2899                 .proc_handler   = proc_dointvec,
2900         },
2901         {
2902                 .procname       = "error_cost",
2903                 .data           = &ip_rt_error_cost,
2904                 .maxlen         = sizeof(int),
2905                 .mode           = 0644,
2906                 .proc_handler   = proc_dointvec,
2907         },
2908         {
2909                 .procname       = "error_burst",
2910                 .data           = &ip_rt_error_burst,
2911                 .maxlen         = sizeof(int),
2912                 .mode           = 0644,
2913                 .proc_handler   = proc_dointvec,
2914         },
2915         {
2916                 .procname       = "gc_elasticity",
2917                 .data           = &ip_rt_gc_elasticity,
2918                 .maxlen         = sizeof(int),
2919                 .mode           = 0644,
2920                 .proc_handler   = proc_dointvec,
2921         },
2922         {
2923                 .procname       = "mtu_expires",
2924                 .data           = &ip_rt_mtu_expires,
2925                 .maxlen         = sizeof(int),
2926                 .mode           = 0644,
2927                 .proc_handler   = proc_dointvec_jiffies,
2928         },
2929         {
2930                 .procname       = "min_pmtu",
2931                 .data           = &ip_rt_min_pmtu,
2932                 .maxlen         = sizeof(int),
2933                 .mode           = 0644,
2934                 .proc_handler   = proc_dointvec,
2935         },
2936         {
2937                 .procname       = "min_adv_mss",
2938                 .data           = &ip_rt_min_advmss,
2939                 .maxlen         = sizeof(int),
2940                 .mode           = 0644,
2941                 .proc_handler   = proc_dointvec,
2942         },
             /* Empty terminating entry. */
2943         { }
2944 };
2945
     /*
      * Template for the per-namespace "flush" sysctl (write-only, mode 0200).
      * sysctl_route_net_init() uses this table directly for init_net and a
      * kmemdup() copy for every other namespace, storing the namespace in
      * extra1 of the first entry for ipv4_sysctl_rtcache_flush().
      */
2946 static struct ctl_table ipv4_route_flush_table[] = {
2947         {
2948                 .procname       = "flush",
2949                 .maxlen         = sizeof(int),
2950                 .mode           = 0200,
2951                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2952         },
2953         { },
2954 };
2955
2956 static __net_init int sysctl_route_net_init(struct net *net)
2957 {
2958         struct ctl_table *tbl;
2959
2960         tbl = ipv4_route_flush_table;
2961         if (!net_eq(net, &init_net)) {
2962                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2963                 if (!tbl)
2964                         goto err_dup;
2965
2966                 /* Don't export sysctls to unprivileged users */
2967                 if (net->user_ns != &init_user_ns)
2968                         tbl[0].procname = NULL;
2969         }
2970         tbl[0].extra1 = net;
2971
2972         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2973         if (!net->ipv4.route_hdr)
2974                 goto err_reg;
2975         return 0;
2976
2977 err_reg:
2978         if (tbl != ipv4_route_flush_table)
2979                 kfree(tbl);
2980 err_dup:
2981         return -ENOMEM;
2982 }
2983
2984 static __net_exit void sysctl_route_net_exit(struct net *net)
2985 {
2986         struct ctl_table *tbl;
2987
2988         tbl = net->ipv4.route_hdr->ctl_table_arg;
2989         unregister_net_sysctl_table(net->ipv4.route_hdr);
2990         BUG_ON(tbl == ipv4_route_flush_table);
2991         kfree(tbl);
2992 }
2993
     /*
      * Pernet hooks for the "flush" sysctl; registered from ip_rt_init()
      * when CONFIG_SYSCTL is set.
      */
2994 static __net_initdata struct pernet_operations sysctl_route_ops = {
2995         .init = sysctl_route_net_init,
2996         .exit = sysctl_route_net_exit,
2997 };
2998 #endif
2999
3000 static __net_init int rt_genid_init(struct net *net)
3001 {
3002         atomic_set(&net->ipv4.rt_genid, 0);
3003         atomic_set(&net->fnhe_genid, 0);
3004         atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3005         return 0;
3006 }
3007
     /*
      * Pernet hooks for the generation counters; only an init hook is
      * provided. Registered from ip_rt_init().
      */
3008 static __net_initdata struct pernet_operations rt_genid_ops = {
3009         .init = rt_genid_init,
3010 };
3011
3012 static int __net_init ipv4_inetpeer_init(struct net *net)
3013 {
3014         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3015
3016         if (!bp)
3017                 return -ENOMEM;
3018         inet_peer_base_init(bp);
3019         net->ipv4.peers = bp;
3020         return 0;
3021 }
3022
3023 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3024 {
3025         struct inet_peer_base *bp = net->ipv4.peers;
3026
3027         net->ipv4.peers = NULL;
3028         inetpeer_invalidate_tree(bp);
3029         kfree(bp);
3030 }
3031
     /*
      * Pernet hooks managing net->ipv4.peers; registered from ip_rt_init().
      */
3032 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3033         .init   =       ipv4_inetpeer_init,
3034         .exit   =       ipv4_inetpeer_exit,
3035 };
3036
3037 #ifdef CONFIG_IP_ROUTE_CLASSID
     /*
      * Per-CPU accounting area; ip_rt_init() allocates room for 256
      * struct ip_rt_acct entries per CPU.
      */
3038 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3039 #endif /* CONFIG_IP_ROUTE_CLASSID */
3040
3041 int __init ip_rt_init(void)
3042 {
3043         int cpu;
3044
3045         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3046         if (!ip_idents)
3047                 panic("IP: failed to allocate ip_idents\n");
3048
3049         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3050
3051         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3052         if (!ip_tstamps)
3053                 panic("IP: failed to allocate ip_tstamps\n");
3054
3055         for_each_possible_cpu(cpu) {
3056                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3057
3058                 INIT_LIST_HEAD(&ul->head);
3059                 spin_lock_init(&ul->lock);
3060         }
3061 #ifdef CONFIG_IP_ROUTE_CLASSID
3062         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3063         if (!ip_rt_acct)
3064                 panic("IP: failed to allocate ip_rt_acct\n");
3065 #endif
3066
3067         ipv4_dst_ops.kmem_cachep =
3068                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3069                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3070
3071         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3072
3073         if (dst_entries_init(&ipv4_dst_ops) < 0)
3074                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3075
3076         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3077                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3078
3079         ipv4_dst_ops.gc_thresh = ~0;
3080         ip_rt_max_size = INT_MAX;
3081
3082         devinet_init();
3083         ip_fib_init();
3084
3085         if (ip_rt_proc_init())
3086                 pr_err("Unable to create route proc files\n");
3087 #ifdef CONFIG_XFRM
3088         xfrm_init();
3089         xfrm4_init();
3090 #endif
3091         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3092                       RTNL_FLAG_DOIT_UNLOCKED);
3093
3094 #ifdef CONFIG_SYSCTL
3095         register_pernet_subsys(&sysctl_route_ops);
3096 #endif
3097         register_pernet_subsys(&rt_genid_ops);
3098         register_pernet_subsys(&ipv4_inetpeer_ops);
3099         return 0;
3100 }
3101
3102 #ifdef CONFIG_SYSCTL
3103 /*
3104  * We really need to sanitize the damn ipv4 init order, then all
3105  * this nonsense will go away.
3106  */
3107 void __init ip_static_sysctl_init(void)
3108 {
             /* Registers the static route sysctl table for init_net only;
              * the returned header is not kept or checked.
              */
3109         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3110 }
3111 #endif