net/ipv4/route.c from mirror_ubuntu-bionic-kernel.git (tag Ubuntu-4.15.0-96.97), git.proxmox.com mirror
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
39 *
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <linux/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #include <linux/kmemleak.h>
112 #endif
113 #include <net/secure_seq.h>
114 #include <net/ip_tunnels.h>
115 #include <net/l3mdev.h>
116
117 #include "fib_lookup.h"
118
119 #define RT_FL_TOS(oldflp4) \
120 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
121
122 #define RT_GC_TIMEOUT (300*HZ)
123
124 static int ip_rt_max_size;
125 static int ip_rt_redirect_number __read_mostly = 9;
126 static int ip_rt_redirect_load __read_mostly = HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly = HZ;
129 static int ip_rt_error_burst __read_mostly = 5 * HZ;
130 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
131 static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly = 256;
133
134 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
135
136 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
137
138 /*
139 * Interface to generic destination cache.
140 */
141
142 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
143 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
144 static unsigned int ipv4_mtu(const struct dst_entry *dst);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void ipv4_link_failure(struct sk_buff *skb);
147 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
148 struct sk_buff *skb, u32 mtu,
149 bool confirm_neigh);
150 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
151 struct sk_buff *skb);
152 static void ipv4_dst_destroy(struct dst_entry *dst);
153
154 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
155 {
156 WARN_ON(1);
157 return NULL;
158 }
159
160 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
161 struct sk_buff *skb,
162 const void *daddr);
163 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
164
165 static struct dst_ops ipv4_dst_ops = {
166 .family = AF_INET,
167 .check = ipv4_dst_check,
168 .default_advmss = ipv4_default_advmss,
169 .mtu = ipv4_mtu,
170 .cow_metrics = ipv4_cow_metrics,
171 .destroy = ipv4_dst_destroy,
172 .negative_advice = ipv4_negative_advice,
173 .link_failure = ipv4_link_failure,
174 .update_pmtu = ip_rt_update_pmtu,
175 .redirect = ip_do_redirect,
176 .local_out = __ip_local_out,
177 .neigh_lookup = ipv4_neigh_lookup,
178 .confirm_neigh = ipv4_confirm_neigh,
179 };
180
181 #define ECN_OR_COST(class) TC_PRIO_##class
182
183 const __u8 ip_tos2prio[16] = {
184 TC_PRIO_BESTEFFORT,
185 ECN_OR_COST(BESTEFFORT),
186 TC_PRIO_BESTEFFORT,
187 ECN_OR_COST(BESTEFFORT),
188 TC_PRIO_BULK,
189 ECN_OR_COST(BULK),
190 TC_PRIO_BULK,
191 ECN_OR_COST(BULK),
192 TC_PRIO_INTERACTIVE,
193 ECN_OR_COST(INTERACTIVE),
194 TC_PRIO_INTERACTIVE,
195 ECN_OR_COST(INTERACTIVE),
196 TC_PRIO_INTERACTIVE_BULK,
197 ECN_OR_COST(INTERACTIVE_BULK),
198 TC_PRIO_INTERACTIVE_BULK,
199 ECN_OR_COST(INTERACTIVE_BULK)
200 };
201 EXPORT_SYMBOL(ip_tos2prio);
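
For reference, this table is indexed with the RFC 1349 TOS bits shifted right by one; the rt_tos2priority() helper in include/net/route.h performs the equivalent ip_tos2prio[IPTOS_TOS(tos) >> 1] lookup. A minimal user-space sketch of that mapping, with the TC_PRIO_* values copied out as plain integers purely for illustration:

#include <stdio.h>

#define IPTOS_TOS_MASK	0x1E
#define IPTOS_TOS(tos)	((tos) & IPTOS_TOS_MASK)

/* Assumption: these mirror TC_PRIO_BESTEFFORT=0, TC_PRIO_BULK=2,
 * TC_PRIO_INTERACTIVE=6 and TC_PRIO_INTERACTIVE_BULK=4 from pkt_sched.h.
 */
static const unsigned char tos2prio[16] = {
	0, 0, 0, 0,	/* best effort */
	2, 2, 2, 2,	/* bulk */
	6, 6, 6, 6,	/* interactive */
	4, 4, 4, 4,	/* interactive bulk */
};

int main(void)
{
	unsigned char tos = 0x10;	/* IPTOS_LOWDELAY */

	printf("TOS 0x%02x -> skb priority %u\n",
	       tos, tos2prio[IPTOS_TOS(tos) >> 1]);
	return 0;
}
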
202
203 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
204 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
205
206 #ifdef CONFIG_PROC_FS
207 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
208 {
209 if (*pos)
210 return NULL;
211 return SEQ_START_TOKEN;
212 }
213
214 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
215 {
216 ++*pos;
217 return NULL;
218 }
219
220 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
221 {
222 }
223
224 static int rt_cache_seq_show(struct seq_file *seq, void *v)
225 {
226 if (v == SEQ_START_TOKEN)
227 seq_printf(seq, "%-127s\n",
228 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
229 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
230 "HHUptod\tSpecDst");
231 return 0;
232 }
233
234 static const struct seq_operations rt_cache_seq_ops = {
235 .start = rt_cache_seq_start,
236 .next = rt_cache_seq_next,
237 .stop = rt_cache_seq_stop,
238 .show = rt_cache_seq_show,
239 };
240
241 static int rt_cache_seq_open(struct inode *inode, struct file *file)
242 {
243 return seq_open(file, &rt_cache_seq_ops);
244 }
245
246 static const struct file_operations rt_cache_seq_fops = {
247 .owner = THIS_MODULE,
248 .open = rt_cache_seq_open,
249 .read = seq_read,
250 .llseek = seq_lseek,
251 .release = seq_release,
252 };
253
254
255 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
256 {
257 int cpu;
258
259 if (*pos == 0)
260 return SEQ_START_TOKEN;
261
262 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
263 if (!cpu_possible(cpu))
264 continue;
265 *pos = cpu+1;
266 return &per_cpu(rt_cache_stat, cpu);
267 }
268 return NULL;
269 }
270
271 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
272 {
273 int cpu;
274
275 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
276 if (!cpu_possible(cpu))
277 continue;
278 *pos = cpu+1;
279 return &per_cpu(rt_cache_stat, cpu);
280 }
281 return NULL;
282
283 }
284
285 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
286 {
287
288 }
289
290 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
291 {
292 struct rt_cache_stat *st = v;
293
294 if (v == SEQ_START_TOKEN) {
295 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
296 return 0;
297 }
298
299 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
300 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
301 dst_entries_get_slow(&ipv4_dst_ops),
302 0, /* st->in_hit */
303 st->in_slow_tot,
304 st->in_slow_mc,
305 st->in_no_route,
306 st->in_brd,
307 st->in_martian_dst,
308 st->in_martian_src,
309
310 0, /* st->out_hit */
311 st->out_slow_tot,
312 st->out_slow_mc,
313
314 0, /* st->gc_total */
315 0, /* st->gc_ignored */
316 0, /* st->gc_goal_miss */
317 0, /* st->gc_dst_overflow */
318 0, /* st->in_hlist_search */
319 0 /* st->out_hlist_search */
320 );
321 return 0;
322 }
323
324 static const struct seq_operations rt_cpu_seq_ops = {
325 .start = rt_cpu_seq_start,
326 .next = rt_cpu_seq_next,
327 .stop = rt_cpu_seq_stop,
328 .show = rt_cpu_seq_show,
329 };
330
331
332 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
333 {
334 return seq_open(file, &rt_cpu_seq_ops);
335 }
336
337 static const struct file_operations rt_cpu_seq_fops = {
338 .owner = THIS_MODULE,
339 .open = rt_cpu_seq_open,
340 .read = seq_read,
341 .llseek = seq_lseek,
342 .release = seq_release,
343 };
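
Both seq_file interfaces above are plain read-only proc files: /proc/net/rt_cache now emits only the legacy header line (the per-flow route cache itself is gone), while /proc/net/stat/rt_cache emits one line of hex counters per possible CPU in the column order printed by rt_cpu_seq_show(). A minimal user-space reader, assuming both files exist on the running kernel:

#include <stdio.h>

static void dump(const char *path)
{
	char line[512];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	printf("== %s ==\n", path);
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
}

int main(void)
{
	dump("/proc/net/rt_cache");		/* header line only */
	dump("/proc/net/stat/rt_cache");	/* per-CPU hex counters */
	return 0;
}
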
344
345 #ifdef CONFIG_IP_ROUTE_CLASSID
346 static int rt_acct_proc_show(struct seq_file *m, void *v)
347 {
348 struct ip_rt_acct *dst, *src;
349 unsigned int i, j;
350
351 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
352 if (!dst)
353 return -ENOMEM;
354
355 for_each_possible_cpu(i) {
356 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
357 for (j = 0; j < 256; j++) {
358 dst[j].o_bytes += src[j].o_bytes;
359 dst[j].o_packets += src[j].o_packets;
360 dst[j].i_bytes += src[j].i_bytes;
361 dst[j].i_packets += src[j].i_packets;
362 }
363 }
364
365 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
366 kfree(dst);
367 return 0;
368 }
369
370 static int rt_acct_proc_open(struct inode *inode, struct file *file)
371 {
372 return single_open(file, rt_acct_proc_show, NULL);
373 }
374
375 static const struct file_operations rt_acct_proc_fops = {
376 .owner = THIS_MODULE,
377 .open = rt_acct_proc_open,
378 .read = seq_read,
379 .llseek = seq_lseek,
380 .release = single_release,
381 };
382 #endif
383
384 static int __net_init ip_rt_do_proc_init(struct net *net)
385 {
386 struct proc_dir_entry *pde;
387
388 pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
389 &rt_cache_seq_fops);
390 if (!pde)
391 goto err1;
392
393 pde = proc_create("rt_cache", S_IRUGO,
394 net->proc_net_stat, &rt_cpu_seq_fops);
395 if (!pde)
396 goto err2;
397
398 #ifdef CONFIG_IP_ROUTE_CLASSID
399 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
400 if (!pde)
401 goto err3;
402 #endif
403 return 0;
404
405 #ifdef CONFIG_IP_ROUTE_CLASSID
406 err3:
407 remove_proc_entry("rt_cache", net->proc_net_stat);
408 #endif
409 err2:
410 remove_proc_entry("rt_cache", net->proc_net);
411 err1:
412 return -ENOMEM;
413 }
414
415 static void __net_exit ip_rt_do_proc_exit(struct net *net)
416 {
417 remove_proc_entry("rt_cache", net->proc_net_stat);
418 remove_proc_entry("rt_cache", net->proc_net);
419 #ifdef CONFIG_IP_ROUTE_CLASSID
420 remove_proc_entry("rt_acct", net->proc_net);
421 #endif
422 }
423
424 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
425 .init = ip_rt_do_proc_init,
426 .exit = ip_rt_do_proc_exit,
427 };
428
429 static int __init ip_rt_proc_init(void)
430 {
431 return register_pernet_subsys(&ip_rt_proc_ops);
432 }
433
434 #else
435 static inline int ip_rt_proc_init(void)
436 {
437 return 0;
438 }
439 #endif /* CONFIG_PROC_FS */
440
441 static inline bool rt_is_expired(const struct rtable *rth)
442 {
443 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
444 }
445
446 void rt_cache_flush(struct net *net)
447 {
448 rt_genid_bump_ipv4(net);
449 }
450
451 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
452 struct sk_buff *skb,
453 const void *daddr)
454 {
455 struct net_device *dev = dst->dev;
456 const __be32 *pkey = daddr;
457 const struct rtable *rt;
458 struct neighbour *n;
459
460 rt = (const struct rtable *) dst;
461 if (rt->rt_gateway)
462 pkey = (const __be32 *) &rt->rt_gateway;
463 else if (skb)
464 pkey = &ip_hdr(skb)->daddr;
465
466 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
467 if (n)
468 return n;
469 return neigh_create(&arp_tbl, pkey, dev);
470 }
471
472 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
473 {
474 struct net_device *dev = dst->dev;
475 const __be32 *pkey = daddr;
476 const struct rtable *rt;
477
478 rt = (const struct rtable *)dst;
479 if (rt->rt_gateway)
480 pkey = (const __be32 *)&rt->rt_gateway;
481 else if (!daddr ||
482 (rt->rt_flags &
483 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
484 return;
485
486 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
487 }
488
489 #define IP_IDENTS_SZ 2048u
490
491 static atomic_t *ip_idents __read_mostly;
492 static u32 *ip_tstamps __read_mostly;
493
494 /* In order to protect privacy, we add a perturbation to identifiers
495 * if one generator is seldom used. This makes it hard for an attacker
496 * to infer how many packets were sent between two points in time.
497 */
498 u32 ip_idents_reserve(u32 hash, int segs)
499 {
500 u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
501 atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
502 u32 old = READ_ONCE(*p_tstamp);
503 u32 now = (u32)jiffies;
504 u32 new, delta = 0;
505
506 if (old != now && cmpxchg(p_tstamp, old, now) == old)
507 delta = prandom_u32_max(now - old);
508
509 /* Do not use atomic_add_return() as it makes UBSAN unhappy */
510 do {
511 old = (u32)atomic_read(p_id);
512 new = old + delta + segs;
513 } while (atomic_cmpxchg(p_id, old, new) != old);
514
515 return new - segs;
516 }
517 EXPORT_SYMBOL(ip_idents_reserve);
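
A single-threaded user-space model may make the perturbation easier to follow: whenever a bucket has been idle (its stored timestamp differs from "now"), a random delta bounded by the idle time is folded into the counter, so consecutive IDs no longer reveal how many packets were emitted in between. This is only a sketch; the kernel version works on hashed per-bucket atomics and uses jiffies as the clock.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static uint32_t id_state;	/* models one ip_idents[] bucket */
static uint32_t tstamp;		/* models the matching ip_tstamps[] slot */

static uint32_t reserve_ids(uint32_t now, uint32_t segs)
{
	uint32_t delta = 0;

	if (tstamp != now) {		/* bucket was idle: add a random gap */
		delta = (uint32_t)rand() % (now - tstamp);
		tstamp = now;
	}
	id_state += delta + segs;
	return id_state - segs;		/* first ID of the reserved block */
}

int main(void)
{
	srand((unsigned int)time(NULL));
	printf("burst of 3 IDs starts at %u\n", reserve_ids(100, 3));
	printf("later burst starts at %u\n", reserve_ids(250, 1));
	return 0;
}
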
518
519 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
520 {
521 u32 hash, id;
522
523 /* Note the following code is not safe, but this is okay. */
524 if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
525 get_random_bytes(&net->ipv4.ip_id_key,
526 sizeof(net->ipv4.ip_id_key));
527
528 hash = siphash_3u32((__force u32)iph->daddr,
529 (__force u32)iph->saddr,
530 iph->protocol,
531 &net->ipv4.ip_id_key);
532 id = ip_idents_reserve(hash, segs);
533 iph->id = htons(id);
534 }
535 EXPORT_SYMBOL(__ip_select_ident);
536
537 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
538 const struct sock *sk,
539 const struct iphdr *iph,
540 int oif, u8 tos,
541 u8 prot, u32 mark, int flow_flags)
542 {
543 if (sk) {
544 const struct inet_sock *inet = inet_sk(sk);
545
546 oif = sk->sk_bound_dev_if;
547 mark = sk->sk_mark;
548 tos = RT_CONN_FLAGS(sk);
549 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
550 }
551 flowi4_init_output(fl4, oif, mark, tos,
552 RT_SCOPE_UNIVERSE, prot,
553 flow_flags,
554 iph->daddr, iph->saddr, 0, 0,
555 sock_net_uid(net, sk));
556 }
557
558 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
559 const struct sock *sk)
560 {
561 const struct net *net = dev_net(skb->dev);
562 const struct iphdr *iph = ip_hdr(skb);
563 int oif = skb->dev->ifindex;
564 u8 tos = RT_TOS(iph->tos);
565 u8 prot = iph->protocol;
566 u32 mark = skb->mark;
567
568 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
569 }
570
571 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
572 {
573 const struct inet_sock *inet = inet_sk(sk);
574 const struct ip_options_rcu *inet_opt;
575 __be32 daddr = inet->inet_daddr;
576
577 rcu_read_lock();
578 inet_opt = rcu_dereference(inet->inet_opt);
579 if (inet_opt && inet_opt->opt.srr)
580 daddr = inet_opt->opt.faddr;
581 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
582 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
583 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
584 inet_sk_flowi_flags(sk),
585 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
586 rcu_read_unlock();
587 }
588
589 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
590 const struct sk_buff *skb)
591 {
592 if (skb)
593 build_skb_flow_key(fl4, skb, sk);
594 else
595 build_sk_flow_key(fl4, sk);
596 }
597
598 static DEFINE_SPINLOCK(fnhe_lock);
599
600 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
601 {
602 struct rtable *rt;
603
604 rt = rcu_dereference(fnhe->fnhe_rth_input);
605 if (rt) {
606 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
607 dst_dev_put(&rt->dst);
608 dst_release(&rt->dst);
609 }
610 rt = rcu_dereference(fnhe->fnhe_rth_output);
611 if (rt) {
612 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
613 dst_dev_put(&rt->dst);
614 dst_release(&rt->dst);
615 }
616 }
617
618 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
619 {
620 struct fib_nh_exception *fnhe, *oldest;
621
622 oldest = rcu_dereference(hash->chain);
623 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
624 fnhe = rcu_dereference(fnhe->fnhe_next)) {
625 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
626 oldest = fnhe;
627 }
628 fnhe_flush_routes(oldest);
629 return oldest;
630 }
631
632 static inline u32 fnhe_hashfun(__be32 daddr)
633 {
634 static u32 fnhe_hashrnd __read_mostly;
635 u32 hval;
636
637 net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
638 hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
639 return hash_32(hval, FNHE_HASH_SHIFT);
640 }
641
642 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
643 {
644 rt->rt_pmtu = fnhe->fnhe_pmtu;
645 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
646 rt->dst.expires = fnhe->fnhe_expires;
647
648 if (fnhe->fnhe_gw) {
649 rt->rt_flags |= RTCF_REDIRECTED;
650 rt->rt_gateway = fnhe->fnhe_gw;
651 rt->rt_uses_gateway = 1;
652 }
653 }
654
655 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
656 u32 pmtu, bool lock, unsigned long expires)
657 {
658 struct fnhe_hash_bucket *hash;
659 struct fib_nh_exception *fnhe;
660 struct rtable *rt;
661 u32 genid, hval;
662 unsigned int i;
663 int depth;
664
665 genid = fnhe_genid(dev_net(nh->nh_dev));
666 hval = fnhe_hashfun(daddr);
667
668 spin_lock_bh(&fnhe_lock);
669
670 hash = rcu_dereference(nh->nh_exceptions);
671 if (!hash) {
672 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
673 if (!hash)
674 goto out_unlock;
675 rcu_assign_pointer(nh->nh_exceptions, hash);
676 }
677
678 hash += hval;
679
680 depth = 0;
681 for (fnhe = rcu_dereference(hash->chain); fnhe;
682 fnhe = rcu_dereference(fnhe->fnhe_next)) {
683 if (fnhe->fnhe_daddr == daddr)
684 break;
685 depth++;
686 }
687
688 if (fnhe) {
689 if (fnhe->fnhe_genid != genid)
690 fnhe->fnhe_genid = genid;
691 if (gw)
692 fnhe->fnhe_gw = gw;
693 if (pmtu) {
694 fnhe->fnhe_pmtu = pmtu;
695 fnhe->fnhe_mtu_locked = lock;
696 }
697 fnhe->fnhe_expires = max(1UL, expires);
698 /* Update all cached dsts too */
699 rt = rcu_dereference(fnhe->fnhe_rth_input);
700 if (rt)
701 fill_route_from_fnhe(rt, fnhe);
702 rt = rcu_dereference(fnhe->fnhe_rth_output);
703 if (rt)
704 fill_route_from_fnhe(rt, fnhe);
705 } else {
706 if (depth > FNHE_RECLAIM_DEPTH)
707 fnhe = fnhe_oldest(hash);
708 else {
709 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
710 if (!fnhe)
711 goto out_unlock;
712
713 fnhe->fnhe_next = hash->chain;
714 rcu_assign_pointer(hash->chain, fnhe);
715 }
716 fnhe->fnhe_genid = genid;
717 fnhe->fnhe_daddr = daddr;
718 fnhe->fnhe_gw = gw;
719 fnhe->fnhe_pmtu = pmtu;
720 fnhe->fnhe_mtu_locked = lock;
721 fnhe->fnhe_expires = max(1UL, expires);
722
723 /* Exception created; mark the cached routes for the nexthop
724 * stale, so that anyone holding them rechecks whether this exception
725 * applies.
726 */
727 rt = rcu_dereference(nh->nh_rth_input);
728 if (rt)
729 rt->dst.obsolete = DST_OBSOLETE_KILL;
730
731 for_each_possible_cpu(i) {
732 struct rtable __rcu **prt;
733 prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
734 rt = rcu_dereference(*prt);
735 if (rt)
736 rt->dst.obsolete = DST_OBSOLETE_KILL;
737 }
738 }
739
740 fnhe->fnhe_stamp = jiffies;
741
742 out_unlock:
743 spin_unlock_bh(&fnhe_lock);
744 }
745
746 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
747 bool kill_route)
748 {
749 __be32 new_gw = icmp_hdr(skb)->un.gateway;
750 __be32 old_gw = ip_hdr(skb)->saddr;
751 struct net_device *dev = skb->dev;
752 struct in_device *in_dev;
753 struct fib_result res;
754 struct neighbour *n;
755 struct net *net;
756
757 switch (icmp_hdr(skb)->code & 7) {
758 case ICMP_REDIR_NET:
759 case ICMP_REDIR_NETTOS:
760 case ICMP_REDIR_HOST:
761 case ICMP_REDIR_HOSTTOS:
762 break;
763
764 default:
765 return;
766 }
767
768 if (rt->rt_gateway != old_gw)
769 return;
770
771 in_dev = __in_dev_get_rcu(dev);
772 if (!in_dev)
773 return;
774
775 net = dev_net(dev);
776 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
777 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
778 ipv4_is_zeronet(new_gw))
779 goto reject_redirect;
780
781 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
782 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
783 goto reject_redirect;
784 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
785 goto reject_redirect;
786 } else {
787 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
788 goto reject_redirect;
789 }
790
791 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
792 if (!n)
793 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
794 if (!IS_ERR(n)) {
795 if (!(n->nud_state & NUD_VALID)) {
796 neigh_event_send(n, NULL);
797 } else {
798 if (fib_lookup(net, fl4, &res, 0) == 0) {
799 struct fib_nh *nh = &FIB_RES_NH(res);
800
801 update_or_create_fnhe(nh, fl4->daddr, new_gw,
802 0, false,
803 jiffies + ip_rt_gc_timeout);
804 }
805 if (kill_route)
806 rt->dst.obsolete = DST_OBSOLETE_KILL;
807 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
808 }
809 neigh_release(n);
810 }
811 return;
812
813 reject_redirect:
814 #ifdef CONFIG_IP_ROUTE_VERBOSE
815 if (IN_DEV_LOG_MARTIANS(in_dev)) {
816 const struct iphdr *iph = (const struct iphdr *) skb->data;
817 __be32 daddr = iph->daddr;
818 __be32 saddr = iph->saddr;
819
820 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
821 " Advised path = %pI4 -> %pI4\n",
822 &old_gw, dev->name, &new_gw,
823 &saddr, &daddr);
824 }
825 #endif
826 ;
827 }
828
829 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
830 {
831 struct rtable *rt;
832 struct flowi4 fl4;
833 const struct iphdr *iph = (const struct iphdr *) skb->data;
834 struct net *net = dev_net(skb->dev);
835 int oif = skb->dev->ifindex;
836 u8 tos = RT_TOS(iph->tos);
837 u8 prot = iph->protocol;
838 u32 mark = skb->mark;
839
840 rt = (struct rtable *) dst;
841
842 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
843 __ip_do_redirect(rt, skb, &fl4, true);
844 }
845
846 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
847 {
848 struct rtable *rt = (struct rtable *)dst;
849 struct dst_entry *ret = dst;
850
851 if (rt) {
852 if (dst->obsolete > 0) {
853 ip_rt_put(rt);
854 ret = NULL;
855 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
856 rt->dst.expires) {
857 ip_rt_put(rt);
858 ret = NULL;
859 }
860 }
861 return ret;
862 }
863
864 /*
865 * Algorithm:
866 * 1. The first ip_rt_redirect_number redirects are sent
867 * with exponential backoff, then we stop sending them at all,
868 * assuming that the host ignores our redirects.
869 * 2. If we did not see packets requiring redirects
870 * during ip_rt_redirect_silence, we assume that the host
871 * forgot the redirected route and start sending redirects again.
872 *
873 * This algorithm is much cheaper and more intelligent than dumb load limiting
874 * in icmp.c.
875 *
876 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
877 * and "frag. need" (breaks PMTU discovery) in icmp.c.
878 */
879
880 void ip_rt_send_redirect(struct sk_buff *skb)
881 {
882 struct rtable *rt = skb_rtable(skb);
883 struct in_device *in_dev;
884 struct inet_peer *peer;
885 struct net *net;
886 int log_martians;
887 int vif;
888
889 rcu_read_lock();
890 in_dev = __in_dev_get_rcu(rt->dst.dev);
891 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
892 rcu_read_unlock();
893 return;
894 }
895 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
896 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
897 rcu_read_unlock();
898
899 net = dev_net(rt->dst.dev);
900 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
901 if (!peer) {
902 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
903 rt_nexthop(rt, ip_hdr(skb)->daddr));
904 return;
905 }
906
907 /* No redirected packets during ip_rt_redirect_silence;
908 * reset the algorithm.
909 */
910 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
911 peer->rate_tokens = 0;
912 peer->n_redirects = 0;
913 }
914
915 /* Too many ignored redirects; do not send anything and just
916 * set peer->rate_last to the time of the last seen redirected packet.
917 */
918 if (peer->n_redirects >= ip_rt_redirect_number) {
919 peer->rate_last = jiffies;
920 goto out_put_peer;
921 }
922
923 /* Check for load limit; set rate_last to the latest sent
924 * redirect.
925 */
926 if (peer->rate_tokens == 0 ||
927 time_after(jiffies,
928 (peer->rate_last +
929 (ip_rt_redirect_load << peer->n_redirects)))) {
930 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
931
932 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
933 peer->rate_last = jiffies;
934 ++peer->n_redirects;
935 #ifdef CONFIG_IP_ROUTE_VERBOSE
936 if (log_martians &&
937 peer->n_redirects == ip_rt_redirect_number)
938 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
939 &ip_hdr(skb)->saddr, inet_iif(skb),
940 &ip_hdr(skb)->daddr, &gw);
941 #endif
942 }
943 out_put_peer:
944 inet_putpeer(peer);
945 }
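
The load-limit term in the test above, ip_rt_redirect_load << n_redirects, doubles with every redirect already sent, so a peer that keeps ignoring redirects is contacted less and less often until ip_rt_redirect_number is reached. A small user-space sketch of that schedule; the HZ value is an assumption (CONFIG_HZ=250), not something this file fixes:

#include <stdio.h>

int main(void)
{
	const unsigned int hz = 250;			/* assumed CONFIG_HZ */
	const unsigned int redirect_load = hz / 50;	/* jiffies, ~20 ms */
	const unsigned int redirect_number = 9;
	unsigned int sent;

	/* After `sent` redirects, the next one is load-limited until
	 * rate_last + (redirect_load << sent) jiffies have elapsed.
	 */
	for (sent = 1; sent < redirect_number; sent++)
		printf("gap before redirect %u: >= %u ms\n",
		       sent + 1, (redirect_load << sent) * 1000 / hz);
	return 0;
}
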
946
947 static int ip_error(struct sk_buff *skb)
948 {
949 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
950 struct rtable *rt = skb_rtable(skb);
951 struct inet_peer *peer;
952 unsigned long now;
953 struct net *net;
954 bool send;
955 int code;
956
957 /* IP on this device is disabled. */
958 if (!in_dev)
959 goto out;
960
961 net = dev_net(rt->dst.dev);
962 if (!IN_DEV_FORWARD(in_dev)) {
963 switch (rt->dst.error) {
964 case EHOSTUNREACH:
965 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
966 break;
967
968 case ENETUNREACH:
969 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
970 break;
971 }
972 goto out;
973 }
974
975 switch (rt->dst.error) {
976 case EINVAL:
977 default:
978 goto out;
979 case EHOSTUNREACH:
980 code = ICMP_HOST_UNREACH;
981 break;
982 case ENETUNREACH:
983 code = ICMP_NET_UNREACH;
984 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
985 break;
986 case EACCES:
987 code = ICMP_PKT_FILTERED;
988 break;
989 }
990
991 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
992 l3mdev_master_ifindex(skb->dev), 1);
993
994 send = true;
995 if (peer) {
996 now = jiffies;
997 peer->rate_tokens += now - peer->rate_last;
998 if (peer->rate_tokens > ip_rt_error_burst)
999 peer->rate_tokens = ip_rt_error_burst;
1000 peer->rate_last = now;
1001 if (peer->rate_tokens >= ip_rt_error_cost)
1002 peer->rate_tokens -= ip_rt_error_cost;
1003 else
1004 send = false;
1005 inet_putpeer(peer);
1006 }
1007 if (send)
1008 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1009
1010 out: kfree_skb(skb);
1011 return 0;
1012 }
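
The peer rate limiting above is a plain token bucket: tokens accrue one per elapsed jiffy, are capped at ip_rt_error_burst (5 * HZ), and each ICMP error sent costs ip_rt_error_cost (HZ), so at most five errors go out back-to-back before the rate settles to roughly one per second. A user-space model of the same bookkeeping, again assuming HZ=250:

#include <stdbool.h>
#include <stdio.h>

#define HZ		250		/* assumed CONFIG_HZ */
#define ERROR_BURST	(5 * HZ)	/* ip_rt_error_burst */
#define ERROR_COST	HZ		/* ip_rt_error_cost */

static unsigned long rate_tokens;
static unsigned long rate_last;

static bool may_send_error(unsigned long now)
{
	rate_tokens += now - rate_last;
	if (rate_tokens > ERROR_BURST)
		rate_tokens = ERROR_BURST;
	rate_last = now;
	if (rate_tokens >= ERROR_COST) {
		rate_tokens -= ERROR_COST;
		return true;
	}
	return false;
}

int main(void)
{
	int i;

	/* Eight errors within the same jiffy: only the stored burst
	 * allowance (five costs' worth of tokens) gets through.
	 */
	for (i = 0; i < 8; i++)
		printf("error %d: %s\n", i,
		       may_send_error(5 * HZ) ? "sent" : "suppressed");
	return 0;
}
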
1013
1014 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1015 {
1016 struct dst_entry *dst = &rt->dst;
1017 u32 old_mtu = ipv4_mtu(dst);
1018 struct fib_result res;
1019 bool lock = false;
1020
1021 if (ip_mtu_locked(dst))
1022 return;
1023
1024 if (old_mtu < mtu)
1025 return;
1026
1027 if (mtu < ip_rt_min_pmtu) {
1028 lock = true;
1029 mtu = min(old_mtu, ip_rt_min_pmtu);
1030 }
1031
1032 if (rt->rt_pmtu == mtu && !lock &&
1033 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1034 return;
1035
1036 rcu_read_lock();
1037 if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1038 struct fib_nh *nh = &FIB_RES_NH(res);
1039
1040 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1041 jiffies + ip_rt_mtu_expires);
1042 }
1043 rcu_read_unlock();
1044 }
1045
1046 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1047 struct sk_buff *skb, u32 mtu,
1048 bool confirm_neigh)
1049 {
1050 struct rtable *rt = (struct rtable *) dst;
1051 struct flowi4 fl4;
1052
1053 ip_rt_build_flow_key(&fl4, sk, skb);
1054 __ip_rt_update_pmtu(rt, &fl4, mtu);
1055 }
1056
1057 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1058 int oif, u32 mark, u8 protocol, int flow_flags)
1059 {
1060 const struct iphdr *iph = (const struct iphdr *) skb->data;
1061 struct flowi4 fl4;
1062 struct rtable *rt;
1063
1064 if (!mark)
1065 mark = IP4_REPLY_MARK(net, skb->mark);
1066
1067 __build_flow_key(net, &fl4, NULL, iph, oif,
1068 RT_TOS(iph->tos), protocol, mark, flow_flags);
1069 rt = __ip_route_output_key(net, &fl4);
1070 if (!IS_ERR(rt)) {
1071 __ip_rt_update_pmtu(rt, &fl4, mtu);
1072 ip_rt_put(rt);
1073 }
1074 }
1075 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1076
1077 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1078 {
1079 const struct iphdr *iph = (const struct iphdr *) skb->data;
1080 struct flowi4 fl4;
1081 struct rtable *rt;
1082
1083 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1084
1085 if (!fl4.flowi4_mark)
1086 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1087
1088 rt = __ip_route_output_key(sock_net(sk), &fl4);
1089 if (!IS_ERR(rt)) {
1090 __ip_rt_update_pmtu(rt, &fl4, mtu);
1091 ip_rt_put(rt);
1092 }
1093 }
1094
1095 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1096 {
1097 const struct iphdr *iph = (const struct iphdr *) skb->data;
1098 struct flowi4 fl4;
1099 struct rtable *rt;
1100 struct dst_entry *odst = NULL;
1101 bool new = false;
1102 struct net *net = sock_net(sk);
1103
1104 bh_lock_sock(sk);
1105
1106 if (!ip_sk_accept_pmtu(sk))
1107 goto out;
1108
1109 odst = sk_dst_get(sk);
1110
1111 if (sock_owned_by_user(sk) || !odst) {
1112 __ipv4_sk_update_pmtu(skb, sk, mtu);
1113 goto out;
1114 }
1115
1116 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1117
1118 rt = (struct rtable *)odst;
1119 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1120 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1121 if (IS_ERR(rt))
1122 goto out;
1123
1124 new = true;
1125 }
1126
1127 __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1128
1129 if (!dst_check(&rt->dst, 0)) {
1130 if (new)
1131 dst_release(&rt->dst);
1132
1133 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1134 if (IS_ERR(rt))
1135 goto out;
1136
1137 new = true;
1138 }
1139
1140 if (new)
1141 sk_dst_set(sk, &rt->dst);
1142
1143 out:
1144 bh_unlock_sock(sk);
1145 dst_release(odst);
1146 }
1147 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1148
1149 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1150 int oif, u32 mark, u8 protocol, int flow_flags)
1151 {
1152 const struct iphdr *iph = (const struct iphdr *) skb->data;
1153 struct flowi4 fl4;
1154 struct rtable *rt;
1155
1156 __build_flow_key(net, &fl4, NULL, iph, oif,
1157 RT_TOS(iph->tos), protocol, mark, flow_flags);
1158 rt = __ip_route_output_key(net, &fl4);
1159 if (!IS_ERR(rt)) {
1160 __ip_do_redirect(rt, skb, &fl4, false);
1161 ip_rt_put(rt);
1162 }
1163 }
1164 EXPORT_SYMBOL_GPL(ipv4_redirect);
1165
1166 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1167 {
1168 const struct iphdr *iph = (const struct iphdr *) skb->data;
1169 struct flowi4 fl4;
1170 struct rtable *rt;
1171 struct net *net = sock_net(sk);
1172
1173 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1174 rt = __ip_route_output_key(net, &fl4);
1175 if (!IS_ERR(rt)) {
1176 __ip_do_redirect(rt, skb, &fl4, false);
1177 ip_rt_put(rt);
1178 }
1179 }
1180 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1181
1182 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1183 {
1184 struct rtable *rt = (struct rtable *) dst;
1185
1186 /* All IPV4 dsts are created with ->obsolete set to the value
1187 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1188 * into this function always.
1189 *
1190 * When a PMTU/redirect information update invalidates a route,
1191 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1192 * DST_OBSOLETE_DEAD by dst_free().
1193 */
1194 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1195 return NULL;
1196 return dst;
1197 }
1198
1199 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1200 {
1201 struct ip_options opt;
1202 int res;
1203
1204 /* Recompile ip options since IPCB may not be valid anymore.
1205 * Also check we have a reasonable ipv4 header.
1206 */
1207 if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1208 ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1209 return;
1210
1211 memset(&opt, 0, sizeof(opt));
1212 if (ip_hdr(skb)->ihl > 5) {
1213 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1214 return;
1215 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1216
1217 rcu_read_lock();
1218 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1219 rcu_read_unlock();
1220
1221 if (res)
1222 return;
1223 }
1224 __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1225 }
1226
1227 static void ipv4_link_failure(struct sk_buff *skb)
1228 {
1229 struct rtable *rt;
1230
1231 ipv4_send_dest_unreach(skb);
1232
1233 rt = skb_rtable(skb);
1234 if (rt)
1235 dst_set_expires(&rt->dst, 0);
1236 }
1237
1238 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1239 {
1240 pr_debug("%s: %pI4 -> %pI4, %s\n",
1241 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1242 skb->dev ? skb->dev->name : "?");
1243 kfree_skb(skb);
1244 WARN_ON(1);
1245 return 0;
1246 }
1247
1248 /*
1249 We do not cache the source address of the outgoing interface,
1250 because it is used only by the IP RR, TS and SRR options,
1251 so it stays out of the fast path.
1252
1253 BTW remember: "addr" is allowed to be unaligned
1254 in IP options!
1255 */
1256
1257 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1258 {
1259 __be32 src;
1260
1261 if (rt_is_output_route(rt))
1262 src = ip_hdr(skb)->saddr;
1263 else {
1264 struct fib_result res;
1265 struct flowi4 fl4;
1266 struct iphdr *iph;
1267
1268 iph = ip_hdr(skb);
1269
1270 memset(&fl4, 0, sizeof(fl4));
1271 fl4.daddr = iph->daddr;
1272 fl4.saddr = iph->saddr;
1273 fl4.flowi4_tos = RT_TOS(iph->tos);
1274 fl4.flowi4_oif = rt->dst.dev->ifindex;
1275 fl4.flowi4_iif = skb->dev->ifindex;
1276 fl4.flowi4_mark = skb->mark;
1277
1278 rcu_read_lock();
1279 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1280 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1281 else
1282 src = inet_select_addr(rt->dst.dev,
1283 rt_nexthop(rt, iph->daddr),
1284 RT_SCOPE_UNIVERSE);
1285 rcu_read_unlock();
1286 }
1287 memcpy(addr, &src, 4);
1288 }
1289
1290 #ifdef CONFIG_IP_ROUTE_CLASSID
1291 static void set_class_tag(struct rtable *rt, u32 tag)
1292 {
1293 if (!(rt->dst.tclassid & 0xFFFF))
1294 rt->dst.tclassid |= tag & 0xFFFF;
1295 if (!(rt->dst.tclassid & 0xFFFF0000))
1296 rt->dst.tclassid |= tag & 0xFFFF0000;
1297 }
1298 #endif
1299
1300 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1301 {
1302 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1303 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1304 ip_rt_min_advmss);
1305
1306 return min(advmss, IPV4_MAX_PMTU - header_size);
1307 }
1308
1309 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1310 {
1311 const struct rtable *rt = (const struct rtable *) dst;
1312 unsigned int mtu = rt->rt_pmtu;
1313
1314 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1315 mtu = dst_metric_raw(dst, RTAX_MTU);
1316
1317 if (mtu)
1318 return mtu;
1319
1320 mtu = READ_ONCE(dst->dev->mtu);
1321
1322 if (unlikely(ip_mtu_locked(dst))) {
1323 if (rt->rt_uses_gateway && mtu > 576)
1324 mtu = 576;
1325 }
1326
1327 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1328
1329 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1330 }
1331
1332 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1333 {
1334 struct fnhe_hash_bucket *hash;
1335 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1336 u32 hval = fnhe_hashfun(daddr);
1337
1338 spin_lock_bh(&fnhe_lock);
1339
1340 hash = rcu_dereference_protected(nh->nh_exceptions,
1341 lockdep_is_held(&fnhe_lock));
1342 hash += hval;
1343
1344 fnhe_p = &hash->chain;
1345 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1346 while (fnhe) {
1347 if (fnhe->fnhe_daddr == daddr) {
1348 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1349 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1350 /* set fnhe_daddr to 0 to ensure it won't bind with
1351 * new dsts in rt_bind_exception().
1352 */
1353 fnhe->fnhe_daddr = 0;
1354 fnhe_flush_routes(fnhe);
1355 kfree_rcu(fnhe, rcu);
1356 break;
1357 }
1358 fnhe_p = &fnhe->fnhe_next;
1359 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1360 lockdep_is_held(&fnhe_lock));
1361 }
1362
1363 spin_unlock_bh(&fnhe_lock);
1364 }
1365
1366 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1367 {
1368 struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1369 struct fib_nh_exception *fnhe;
1370 u32 hval;
1371
1372 if (!hash)
1373 return NULL;
1374
1375 hval = fnhe_hashfun(daddr);
1376
1377 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1378 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1379 if (fnhe->fnhe_daddr == daddr) {
1380 if (fnhe->fnhe_expires &&
1381 time_after(jiffies, fnhe->fnhe_expires)) {
1382 ip_del_fnhe(nh, daddr);
1383 break;
1384 }
1385 return fnhe;
1386 }
1387 }
1388 return NULL;
1389 }
1390
1391 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1392 __be32 daddr, const bool do_cache)
1393 {
1394 bool ret = false;
1395
1396 spin_lock_bh(&fnhe_lock);
1397
1398 if (daddr == fnhe->fnhe_daddr) {
1399 struct rtable __rcu **porig;
1400 struct rtable *orig;
1401 int genid = fnhe_genid(dev_net(rt->dst.dev));
1402
1403 if (rt_is_input_route(rt))
1404 porig = &fnhe->fnhe_rth_input;
1405 else
1406 porig = &fnhe->fnhe_rth_output;
1407 orig = rcu_dereference(*porig);
1408
1409 if (fnhe->fnhe_genid != genid) {
1410 fnhe->fnhe_genid = genid;
1411 fnhe->fnhe_gw = 0;
1412 fnhe->fnhe_pmtu = 0;
1413 fnhe->fnhe_expires = 0;
1414 fnhe_flush_routes(fnhe);
1415 orig = NULL;
1416 }
1417 fill_route_from_fnhe(rt, fnhe);
1418 if (!rt->rt_gateway)
1419 rt->rt_gateway = daddr;
1420
1421 if (do_cache) {
1422 dst_hold(&rt->dst);
1423 rcu_assign_pointer(*porig, rt);
1424 if (orig) {
1425 dst_dev_put(&orig->dst);
1426 dst_release(&orig->dst);
1427 }
1428 ret = true;
1429 }
1430
1431 fnhe->fnhe_stamp = jiffies;
1432 }
1433 spin_unlock_bh(&fnhe_lock);
1434
1435 return ret;
1436 }
1437
1438 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1439 {
1440 struct rtable *orig, *prev, **p;
1441 bool ret = true;
1442
1443 if (rt_is_input_route(rt)) {
1444 p = (struct rtable **)&nh->nh_rth_input;
1445 } else {
1446 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1447 }
1448 orig = *p;
1449
1450 /* hold dst before doing cmpxchg() to avoid race condition
1451 * on this dst
1452 */
1453 dst_hold(&rt->dst);
1454 prev = cmpxchg(p, orig, rt);
1455 if (prev == orig) {
1456 if (orig) {
1457 rt_add_uncached_list(orig);
1458 dst_release(&orig->dst);
1459 }
1460 } else {
1461 dst_release(&rt->dst);
1462 ret = false;
1463 }
1464
1465 return ret;
1466 }
1467
1468 struct uncached_list {
1469 spinlock_t lock;
1470 struct list_head head;
1471 };
1472
1473 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1474
1475 void rt_add_uncached_list(struct rtable *rt)
1476 {
1477 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1478
1479 rt->rt_uncached_list = ul;
1480
1481 spin_lock_bh(&ul->lock);
1482 list_add_tail(&rt->rt_uncached, &ul->head);
1483 spin_unlock_bh(&ul->lock);
1484 }
1485
1486 void rt_del_uncached_list(struct rtable *rt)
1487 {
1488 if (!list_empty(&rt->rt_uncached)) {
1489 struct uncached_list *ul = rt->rt_uncached_list;
1490
1491 spin_lock_bh(&ul->lock);
1492 list_del(&rt->rt_uncached);
1493 spin_unlock_bh(&ul->lock);
1494 }
1495 }
1496
1497 static void ipv4_dst_destroy(struct dst_entry *dst)
1498 {
1499 struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1500 struct rtable *rt = (struct rtable *)dst;
1501
1502 if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1503 kfree(p);
1504
1505 rt_del_uncached_list(rt);
1506 }
1507
1508 void rt_flush_dev(struct net_device *dev)
1509 {
1510 struct net *net = dev_net(dev);
1511 struct rtable *rt;
1512 int cpu;
1513
1514 for_each_possible_cpu(cpu) {
1515 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1516
1517 spin_lock_bh(&ul->lock);
1518 list_for_each_entry(rt, &ul->head, rt_uncached) {
1519 if (rt->dst.dev != dev)
1520 continue;
1521 rt->dst.dev = net->loopback_dev;
1522 dev_hold(rt->dst.dev);
1523 dev_put(dev);
1524 }
1525 spin_unlock_bh(&ul->lock);
1526 }
1527 }
1528
1529 static bool rt_cache_valid(const struct rtable *rt)
1530 {
1531 return rt &&
1532 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1533 !rt_is_expired(rt);
1534 }
1535
1536 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1537 const struct fib_result *res,
1538 struct fib_nh_exception *fnhe,
1539 struct fib_info *fi, u16 type, u32 itag,
1540 const bool do_cache)
1541 {
1542 bool cached = false;
1543
1544 if (fi) {
1545 struct fib_nh *nh = &FIB_RES_NH(*res);
1546
1547 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1548 rt->rt_gateway = nh->nh_gw;
1549 rt->rt_uses_gateway = 1;
1550 }
1551 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1552 if (fi->fib_metrics != &dst_default_metrics) {
1553 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1554 refcount_inc(&fi->fib_metrics->refcnt);
1555 }
1556 #ifdef CONFIG_IP_ROUTE_CLASSID
1557 rt->dst.tclassid = nh->nh_tclassid;
1558 #endif
1559 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1560 if (unlikely(fnhe))
1561 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1562 else if (do_cache)
1563 cached = rt_cache_route(nh, rt);
1564 if (unlikely(!cached)) {
1565 /* Routes we intend to cache in nexthop exception or
1566 * FIB nexthop have the DST_NOCACHE bit clear.
1567 * However, if we are unsuccessful at storing this
1568 * route into the cache we really need to set it.
1569 */
1570 if (!rt->rt_gateway)
1571 rt->rt_gateway = daddr;
1572 rt_add_uncached_list(rt);
1573 }
1574 } else
1575 rt_add_uncached_list(rt);
1576
1577 #ifdef CONFIG_IP_ROUTE_CLASSID
1578 #ifdef CONFIG_IP_MULTIPLE_TABLES
1579 set_class_tag(rt, res->tclassid);
1580 #endif
1581 set_class_tag(rt, itag);
1582 #endif
1583 }
1584
1585 struct rtable *rt_dst_alloc(struct net_device *dev,
1586 unsigned int flags, u16 type,
1587 bool nopolicy, bool noxfrm, bool will_cache)
1588 {
1589 struct rtable *rt;
1590
1591 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1592 (will_cache ? 0 : DST_HOST) |
1593 (nopolicy ? DST_NOPOLICY : 0) |
1594 (noxfrm ? DST_NOXFRM : 0));
1595
1596 if (rt) {
1597 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1598 rt->rt_flags = flags;
1599 rt->rt_type = type;
1600 rt->rt_is_input = 0;
1601 rt->rt_iif = 0;
1602 rt->rt_pmtu = 0;
1603 rt->rt_mtu_locked = 0;
1604 rt->rt_gateway = 0;
1605 rt->rt_uses_gateway = 0;
1606 rt->rt_table_id = 0;
1607 INIT_LIST_HEAD(&rt->rt_uncached);
1608
1609 rt->dst.output = ip_output;
1610 if (flags & RTCF_LOCAL)
1611 rt->dst.input = ip_local_deliver;
1612 }
1613
1614 return rt;
1615 }
1616 EXPORT_SYMBOL(rt_dst_alloc);
1617
1618 /* called in rcu_read_lock() section */
1619 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1620 u8 tos, struct net_device *dev,
1621 struct in_device *in_dev, u32 *itag)
1622 {
1623 int err;
1624
1625 /* Primary sanity checks. */
1626 if (!in_dev)
1627 return -EINVAL;
1628
1629 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1630 skb->protocol != htons(ETH_P_IP))
1631 return -EINVAL;
1632
1633 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1634 return -EINVAL;
1635
1636 if (ipv4_is_zeronet(saddr)) {
1637 if (!ipv4_is_local_multicast(daddr))
1638 return -EINVAL;
1639 } else {
1640 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1641 in_dev, itag);
1642 if (err < 0)
1643 return err;
1644 }
1645 return 0;
1646 }
1647
1648 /* called in rcu_read_lock() section */
1649 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1650 u8 tos, struct net_device *dev, int our)
1651 {
1652 struct in_device *in_dev = __in_dev_get_rcu(dev);
1653 unsigned int flags = RTCF_MULTICAST;
1654 struct rtable *rth;
1655 u32 itag = 0;
1656 int err;
1657
1658 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1659 if (err)
1660 return err;
1661
1662 if (our)
1663 flags |= RTCF_LOCAL;
1664
1665 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1666 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1667 if (!rth)
1668 return -ENOBUFS;
1669
1670 #ifdef CONFIG_IP_ROUTE_CLASSID
1671 rth->dst.tclassid = itag;
1672 #endif
1673 rth->dst.output = ip_rt_bug;
1674 rth->rt_is_input= 1;
1675
1676 #ifdef CONFIG_IP_MROUTE
1677 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1678 rth->dst.input = ip_mr_input;
1679 #endif
1680 RT_CACHE_STAT_INC(in_slow_mc);
1681
1682 skb_dst_set(skb, &rth->dst);
1683 return 0;
1684 }
1685
1686
1687 static void ip_handle_martian_source(struct net_device *dev,
1688 struct in_device *in_dev,
1689 struct sk_buff *skb,
1690 __be32 daddr,
1691 __be32 saddr)
1692 {
1693 RT_CACHE_STAT_INC(in_martian_src);
1694 #ifdef CONFIG_IP_ROUTE_VERBOSE
1695 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1696 /*
1697 * RFC 1812 recommendation: if the source is martian,
1698 * the only hint is the MAC header.
1699 */
1700 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1701 &daddr, &saddr, dev->name);
1702 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1703 print_hex_dump(KERN_WARNING, "ll header: ",
1704 DUMP_PREFIX_OFFSET, 16, 1,
1705 skb_mac_header(skb),
1706 dev->hard_header_len, true);
1707 }
1708 }
1709 #endif
1710 }
1711
1712 static void set_lwt_redirect(struct rtable *rth)
1713 {
1714 if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1715 rth->dst.lwtstate->orig_output = rth->dst.output;
1716 rth->dst.output = lwtunnel_output;
1717 }
1718
1719 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1720 rth->dst.lwtstate->orig_input = rth->dst.input;
1721 rth->dst.input = lwtunnel_input;
1722 }
1723 }
1724
1725 /* called in rcu_read_lock() section */
1726 static int __mkroute_input(struct sk_buff *skb,
1727 const struct fib_result *res,
1728 struct in_device *in_dev,
1729 __be32 daddr, __be32 saddr, u32 tos)
1730 {
1731 struct fib_nh_exception *fnhe;
1732 struct rtable *rth;
1733 int err;
1734 struct in_device *out_dev;
1735 bool do_cache;
1736 u32 itag = 0;
1737
1738 /* get a working reference to the output device */
1739 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1740 if (!out_dev) {
1741 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1742 return -EINVAL;
1743 }
1744
1745 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1746 in_dev->dev, in_dev, &itag);
1747 if (err < 0) {
1748 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1749 saddr);
1750
1751 goto cleanup;
1752 }
1753
1754 do_cache = res->fi && !itag;
1755 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1756 skb->protocol == htons(ETH_P_IP) &&
1757 (IN_DEV_SHARED_MEDIA(out_dev) ||
1758 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1759 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1760
1761 if (skb->protocol != htons(ETH_P_IP)) {
1762 /* Not IP (i.e. ARP). Do not create a route if it is
1763 * invalid for proxy ARP. DNAT routes are always valid.
1764 *
1765 * The proxy ARP feature has been extended to allow ARP
1766 * replies back on the same interface, to support
1767 * Private VLAN switch technologies. See arp.c.
1768 */
1769 if (out_dev == in_dev &&
1770 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1771 err = -EINVAL;
1772 goto cleanup;
1773 }
1774 }
1775
1776 fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1777 if (do_cache) {
1778 if (fnhe)
1779 rth = rcu_dereference(fnhe->fnhe_rth_input);
1780 else
1781 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1782 if (rt_cache_valid(rth)) {
1783 skb_dst_set_noref(skb, &rth->dst);
1784 goto out;
1785 }
1786 }
1787
1788 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1789 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1790 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1791 if (!rth) {
1792 err = -ENOBUFS;
1793 goto cleanup;
1794 }
1795
1796 rth->rt_is_input = 1;
1797 if (res->table)
1798 rth->rt_table_id = res->table->tb_id;
1799 RT_CACHE_STAT_INC(in_slow_tot);
1800
1801 rth->dst.input = ip_forward;
1802
1803 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1804 do_cache);
1805 set_lwt_redirect(rth);
1806 skb_dst_set(skb, &rth->dst);
1807 out:
1808 err = 0;
1809 cleanup:
1810 return err;
1811 }
1812
1813 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1814 /* To make ICMP packets follow the right flow, the multipath hash is
1815 * calculated from the inner IP addresses.
1816 */
1817 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1818 struct flow_keys *hash_keys)
1819 {
1820 const struct iphdr *outer_iph = ip_hdr(skb);
1821 const struct iphdr *inner_iph;
1822 const struct icmphdr *icmph;
1823 struct iphdr _inner_iph;
1824 struct icmphdr _icmph;
1825
1826 hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1827 hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1828 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1829 return;
1830
1831 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1832 return;
1833
1834 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1835 &_icmph);
1836 if (!icmph)
1837 return;
1838
1839 if (icmph->type != ICMP_DEST_UNREACH &&
1840 icmph->type != ICMP_REDIRECT &&
1841 icmph->type != ICMP_TIME_EXCEEDED &&
1842 icmph->type != ICMP_PARAMETERPROB)
1843 return;
1844
1845 inner_iph = skb_header_pointer(skb,
1846 outer_iph->ihl * 4 + sizeof(_icmph),
1847 sizeof(_inner_iph), &_inner_iph);
1848 if (!inner_iph)
1849 return;
1850 hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1851 hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1852 }
1853
1854 /* if skb is set it will be used and fl4 can be NULL */
1855 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1856 const struct sk_buff *skb)
1857 {
1858 struct net *net = fi->fib_net;
1859 struct flow_keys hash_keys;
1860 u32 mhash;
1861
1862 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1863 case 0:
1864 memset(&hash_keys, 0, sizeof(hash_keys));
1865 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1866 if (skb) {
1867 ip_multipath_l3_keys(skb, &hash_keys);
1868 } else {
1869 hash_keys.addrs.v4addrs.src = fl4->saddr;
1870 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1871 }
1872 break;
1873 case 1:
1874 /* skb is currently provided only when forwarding */
1875 if (skb) {
1876 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1877 struct flow_keys keys;
1878
1879 /* short-circuit if we already have L4 hash present */
1880 if (skb->l4_hash)
1881 return skb_get_hash_raw(skb) >> 1;
1882 memset(&hash_keys, 0, sizeof(hash_keys));
1883 skb_flow_dissect_flow_keys(skb, &keys, flag);
1884
1885 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1886 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1887 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1888 hash_keys.ports.src = keys.ports.src;
1889 hash_keys.ports.dst = keys.ports.dst;
1890 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1891 } else {
1892 memset(&hash_keys, 0, sizeof(hash_keys));
1893 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1894 hash_keys.addrs.v4addrs.src = fl4->saddr;
1895 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1896 hash_keys.ports.src = fl4->fl4_sport;
1897 hash_keys.ports.dst = fl4->fl4_dport;
1898 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1899 }
1900 break;
1901 }
1902 mhash = flow_hash_from_keys(&hash_keys);
1903
1904 return mhash >> 1;
1905 }
1906 EXPORT_SYMBOL_GPL(fib_multipath_hash);
1907 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
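
Which branch of the switch above is taken is controlled per network namespace by the net.ipv4.fib_multipath_hash_policy sysctl (0 selects the L3 source/destination hash, 1 the L4 five-tuple hash). A minimal sketch that flips it through procfs; it assumes a kernel built with CONFIG_IP_ROUTE_MULTIPATH and needs root:

#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/net/ipv4/fib_multipath_hash_policy";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fputs("1\n", f);	/* 1 = hash on the L4 five-tuple */
	fclose(f);
	return 0;
}
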
1908
1909 static int ip_mkroute_input(struct sk_buff *skb,
1910 struct fib_result *res,
1911 struct in_device *in_dev,
1912 __be32 daddr, __be32 saddr, u32 tos)
1913 {
1914 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1915 if (res->fi && res->fi->fib_nhs > 1) {
1916 int h = fib_multipath_hash(res->fi, NULL, skb);
1917
1918 fib_select_multipath(res, h);
1919 }
1920 #endif
1921
1922 /* create a routing cache entry */
1923 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1924 }
1925
1926 /*
1927 * NOTE. We drop all packets that have local source
1928 * addresses, because every properly looped-back packet
1929 * must already have the correct destination attached by the output routine.
1930 *
1931 * This approach solves two big problems:
1932 * 1. Non-simplex devices are handled properly.
1933 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1934 * Called with rcu_read_lock().
1935 */
1936
1937 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1938 u8 tos, struct net_device *dev,
1939 struct fib_result *res)
1940 {
1941 struct in_device *in_dev = __in_dev_get_rcu(dev);
1942 struct ip_tunnel_info *tun_info;
1943 struct flowi4 fl4;
1944 unsigned int flags = 0;
1945 u32 itag = 0;
1946 struct rtable *rth;
1947 int err = -EINVAL;
1948 struct net *net = dev_net(dev);
1949 bool do_cache;
1950
1951 /* IP on this device is disabled. */
1952
1953 if (!in_dev)
1954 goto out;
1955
1956 /* Check for the most weird martians, which cannot be detected
1957 by fib_lookup.
1958 */
1959
1960 tun_info = skb_tunnel_info(skb);
1961 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1962 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1963 else
1964 fl4.flowi4_tun_key.tun_id = 0;
1965 skb_dst_drop(skb);
1966
1967 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1968 goto martian_source;
1969
1970 res->fi = NULL;
1971 res->table = NULL;
1972 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1973 goto brd_input;
1974
1975 /* Accept zero addresses only toward the limited broadcast address;
1976 * I do not even know whether to fix it or not. Waiting for complaints :-)
1977 */
1978 if (ipv4_is_zeronet(saddr))
1979 goto martian_source;
1980
1981 if (ipv4_is_zeronet(daddr))
1982 goto martian_destination;
1983
1984 /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1985 * and calls it at most once when daddr and/or saddr is a loopback address.
1986 */
1987 if (ipv4_is_loopback(daddr)) {
1988 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1989 goto martian_destination;
1990 } else if (ipv4_is_loopback(saddr)) {
1991 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1992 goto martian_source;
1993 }
1994
1995 /*
1996 * Now we are ready to route packet.
1997 */
1998 fl4.flowi4_oif = 0;
1999 fl4.flowi4_iif = dev->ifindex;
2000 fl4.flowi4_mark = skb->mark;
2001 fl4.flowi4_tos = tos;
2002 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2003 fl4.flowi4_flags = 0;
2004 fl4.daddr = daddr;
2005 fl4.saddr = saddr;
2006 fl4.flowi4_uid = sock_net_uid(net, NULL);
2007 err = fib_lookup(net, &fl4, res, 0);
2008 if (err != 0) {
2009 if (!IN_DEV_FORWARD(in_dev))
2010 err = -EHOSTUNREACH;
2011 goto no_route;
2012 }
2013
2014 if (res->type == RTN_BROADCAST)
2015 goto brd_input;
2016
2017 if (res->type == RTN_LOCAL) {
2018 err = fib_validate_source(skb, saddr, daddr, tos,
2019 0, dev, in_dev, &itag);
2020 if (err < 0)
2021 goto martian_source;
2022 goto local_input;
2023 }
2024
2025 if (!IN_DEV_FORWARD(in_dev)) {
2026 err = -EHOSTUNREACH;
2027 goto no_route;
2028 }
2029 if (res->type != RTN_UNICAST)
2030 goto martian_destination;
2031
2032 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2033 out: return err;
2034
2035 brd_input:
2036 if (skb->protocol != htons(ETH_P_IP))
2037 goto e_inval;
2038
2039 if (!ipv4_is_zeronet(saddr)) {
2040 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2041 in_dev, &itag);
2042 if (err < 0)
2043 goto martian_source;
2044 }
2045 flags |= RTCF_BROADCAST;
2046 res->type = RTN_BROADCAST;
2047 RT_CACHE_STAT_INC(in_brd);
2048
2049 local_input:
2050 do_cache = false;
2051 if (res->fi) {
2052 if (!itag) {
2053 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2054 if (rt_cache_valid(rth)) {
2055 skb_dst_set_noref(skb, &rth->dst);
2056 err = 0;
2057 goto out;
2058 }
2059 do_cache = true;
2060 }
2061 }
2062
2063 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2064 flags | RTCF_LOCAL, res->type,
2065 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2066 if (!rth)
2067 goto e_nobufs;
2068
2069 rth->dst.output = ip_rt_bug;
2070 #ifdef CONFIG_IP_ROUTE_CLASSID
2071 rth->dst.tclassid = itag;
2072 #endif
2073 rth->rt_is_input = 1;
2074 if (res->table)
2075 rth->rt_table_id = res->table->tb_id;
2076
2077 RT_CACHE_STAT_INC(in_slow_tot);
2078 if (res->type == RTN_UNREACHABLE) {
2079 rth->dst.input = ip_error;
2080 rth->dst.error = -err;
2081 rth->rt_flags &= ~RTCF_LOCAL;
2082 }
2083
2084 if (do_cache) {
2085 struct fib_nh *nh = &FIB_RES_NH(*res);
2086
2087 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2088 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2089 WARN_ON(rth->dst.input == lwtunnel_input);
2090 rth->dst.lwtstate->orig_input = rth->dst.input;
2091 rth->dst.input = lwtunnel_input;
2092 }
2093
2094 if (unlikely(!rt_cache_route(nh, rth)))
2095 rt_add_uncached_list(rth);
2096 }
2097 skb_dst_set(skb, &rth->dst);
2098 err = 0;
2099 goto out;
2100
2101 no_route:
2102 RT_CACHE_STAT_INC(in_no_route);
2103 res->type = RTN_UNREACHABLE;
2104 res->fi = NULL;
2105 res->table = NULL;
2106 goto local_input;
2107
2108 /*
2109 * Do not cache martian addresses: they should be logged (RFC1812)
2110 */
2111 martian_destination:
2112 RT_CACHE_STAT_INC(in_martian_dst);
2113 #ifdef CONFIG_IP_ROUTE_VERBOSE
2114 if (IN_DEV_LOG_MARTIANS(in_dev))
2115 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2116 &daddr, &saddr, dev->name);
2117 #endif
2118
2119 e_inval:
2120 err = -EINVAL;
2121 goto out;
2122
2123 e_nobufs:
2124 err = -ENOBUFS;
2125 goto out;
2126
2127 martian_source:
2128 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2129 goto out;
2130 }
2131
2132 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2133 u8 tos, struct net_device *dev)
2134 {
2135 struct fib_result res;
2136 int err;
2137
2138 tos &= IPTOS_RT_MASK;
2139 rcu_read_lock();
2140 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2141 rcu_read_unlock();
2142
2143 return err;
2144 }
2145 EXPORT_SYMBOL(ip_route_input_noref);
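/*
 * Editor's note: an illustrative sketch, not part of the original route.c.
 * This is the shape of a typical ip_route_input_noref() caller (the IPv4
 * receive path does essentially this): resolve an input route for a freshly
 * received skb, then hand the skb to whatever dst.input handler the lookup
 * attached (ip_local_deliver, ip_forward, ip_error, ...). The function name
 * is an assumption for the example.
 */
static int example_resolve_input_route(struct sk_buff *skb,
				       struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err;

	err = ip_route_input_noref(skb, iph->daddr, iph->saddr, iph->tos, dev);
	if (err)
		return err;		/* e.g. -EHOSTUNREACH or -EINVAL */

	return dst_input(skb);		/* invokes skb_dst(skb)->input() */
}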
2146
2147 /* called with rcu_read_lock held */
2148 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2149 u8 tos, struct net_device *dev, struct fib_result *res)
2150 {
2151 /* Multicast recognition logic was moved from the route cache to here.
2152 The problem was that too many Ethernet cards have broken/missing
2153 hardware multicast filters :-( As a result, a host on a multicast
2154 network acquires a lot of useless route cache entries, e.g. from
2155 SDR messages from all over the world. Now we try to get rid of them.
2156 Really, provided the software IP multicast filter is organized
2157 reasonably (at least, hashed), it does not result in a slowdown
2158 compared with route cache reject entries.
2159 Note that multicast routers are not affected, because a
2160 route cache entry is created eventually.
2161 */
2162 if (ipv4_is_multicast(daddr)) {
2163 struct in_device *in_dev = __in_dev_get_rcu(dev);
2164 int our = 0;
2165 int err = -EINVAL;
2166
2167 if (!in_dev)
2168 return err;
2169 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2170 ip_hdr(skb)->protocol);
2171
2172 /* check l3 master if no match yet */
2173 if (!our && netif_is_l3_slave(dev)) {
2174 struct in_device *l3_in_dev;
2175
2176 l3_in_dev = __in_dev_get_rcu(skb->dev);
2177 if (l3_in_dev)
2178 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2179 ip_hdr(skb)->protocol);
2180 }
2181
2182 if (our
2183 #ifdef CONFIG_IP_MROUTE
2184 ||
2185 (!ipv4_is_local_multicast(daddr) &&
2186 IN_DEV_MFORWARD(in_dev))
2187 #endif
2188 ) {
2189 err = ip_route_input_mc(skb, daddr, saddr,
2190 tos, dev, our);
2191 }
2192 return err;
2193 }
2194
2195 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2196 }
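/*
 * Editor's note: an illustrative sketch, not part of the original route.c.
 * The multicast branch above boils down to one predicate: deliver the packet
 * locally when the group is joined on the receiving device (or its l3
 * master), or, with CONFIG_IP_MROUTE, hand it to the multicast forwarding
 * engine when mc_forwarding is enabled and the group is not link-local
 * (224.0.0.0/24). The helper name below is an assumption for the example.
 */
static inline bool toy_accept_multicast(bool joined_on_dev,
					bool mforward_enabled,
					bool link_local_group)
{
	return joined_on_dev || (mforward_enabled && !link_local_group);
}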
2197
2198 /* called with rcu_read_lock() */
2199 static struct rtable *__mkroute_output(const struct fib_result *res,
2200 const struct flowi4 *fl4, int orig_oif,
2201 struct net_device *dev_out,
2202 unsigned int flags)
2203 {
2204 struct fib_info *fi = res->fi;
2205 struct fib_nh_exception *fnhe;
2206 struct in_device *in_dev;
2207 u16 type = res->type;
2208 struct rtable *rth;
2209 bool do_cache;
2210
2211 in_dev = __in_dev_get_rcu(dev_out);
2212 if (!in_dev)
2213 return ERR_PTR(-EINVAL);
2214
2215 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2216 if (ipv4_is_loopback(fl4->saddr) &&
2217 !(dev_out->flags & IFF_LOOPBACK) &&
2218 !netif_is_l3_master(dev_out))
2219 return ERR_PTR(-EINVAL);
2220
2221 if (ipv4_is_lbcast(fl4->daddr))
2222 type = RTN_BROADCAST;
2223 else if (ipv4_is_multicast(fl4->daddr))
2224 type = RTN_MULTICAST;
2225 else if (ipv4_is_zeronet(fl4->daddr))
2226 return ERR_PTR(-EINVAL);
2227
2228 if (dev_out->flags & IFF_LOOPBACK)
2229 flags |= RTCF_LOCAL;
2230
2231 do_cache = true;
2232 if (type == RTN_BROADCAST) {
2233 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2234 fi = NULL;
2235 } else if (type == RTN_MULTICAST) {
2236 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2237 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2238 fl4->flowi4_proto))
2239 flags &= ~RTCF_LOCAL;
2240 else
2241 do_cache = false;
2242 /* If no multicast route exists, use the
2243 * default one, but do not use a gateway in this case.
2244 * Yes, it is a hack.
2245 */
2246 if (fi && res->prefixlen < 4)
2247 fi = NULL;
2248 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2249 (orig_oif != dev_out->ifindex)) {
2250 /* For local routes that require a particular output interface
2251 * we do not want to cache the result. Caching the result
2252 * causes incorrect behaviour when there are multiple source
2253 * addresses on the interface: if the intended recipient is
2254 * waiting on that interface for the packet, it will not receive
2255 * it, because the packet will be delivered on the loopback
2256 * interface and the IP_PKTINFO ipi_ifindex will be set to
2257 * the loopback interface as well.
2258 */
2259 do_cache = false;
2260 }
2261
2262 fnhe = NULL;
2263 do_cache &= fi != NULL;
2264 if (fi) {
2265 struct rtable __rcu **prth;
2266 struct fib_nh *nh = &FIB_RES_NH(*res);
2267
2268 fnhe = find_exception(nh, fl4->daddr);
2269 if (!do_cache)
2270 goto add;
2271 if (fnhe) {
2272 prth = &fnhe->fnhe_rth_output;
2273 } else {
2274 if (unlikely(fl4->flowi4_flags &
2275 FLOWI_FLAG_KNOWN_NH &&
2276 !(nh->nh_gw &&
2277 nh->nh_scope == RT_SCOPE_LINK))) {
2278 do_cache = false;
2279 goto add;
2280 }
2281 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2282 }
2283 rth = rcu_dereference(*prth);
2284 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2285 return rth;
2286 }
2287
2288 add:
2289 rth = rt_dst_alloc(dev_out, flags, type,
2290 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2291 IN_DEV_CONF_GET(in_dev, NOXFRM),
2292 do_cache);
2293 if (!rth)
2294 return ERR_PTR(-ENOBUFS);
2295
2296 rth->rt_iif = orig_oif;
2297 if (res->table)
2298 rth->rt_table_id = res->table->tb_id;
2299
2300 RT_CACHE_STAT_INC(out_slow_tot);
2301
2302 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2303 if (flags & RTCF_LOCAL &&
2304 !(dev_out->flags & IFF_LOOPBACK)) {
2305 rth->dst.output = ip_mc_output;
2306 RT_CACHE_STAT_INC(out_slow_mc);
2307 }
2308 #ifdef CONFIG_IP_MROUTE
2309 if (type == RTN_MULTICAST) {
2310 if (IN_DEV_MFORWARD(in_dev) &&
2311 !ipv4_is_local_multicast(fl4->daddr)) {
2312 rth->dst.input = ip_mr_input;
2313 rth->dst.output = ip_mc_output;
2314 }
2315 }
2316 #endif
2317 }
2318
2319 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2320 set_lwt_redirect(rth);
2321
2322 return rth;
2323 }
2324
2325 /*
2326 * Major route resolver routine.
2327 */
2328
2329 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2330 const struct sk_buff *skb)
2331 {
2332 __u8 tos = RT_FL_TOS(fl4);
2333 struct fib_result res = {
2334 .type = RTN_UNSPEC,
2335 .fi = NULL,
2336 .table = NULL,
2337 .tclassid = 0,
2338 };
2339 struct rtable *rth;
2340
2341 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2342 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2343 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2344 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2345
2346 rcu_read_lock();
2347 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2348 rcu_read_unlock();
2349
2350 return rth;
2351 }
2352 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2353
2354 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2355 struct fib_result *res,
2356 const struct sk_buff *skb)
2357 {
2358 struct net_device *dev_out = NULL;
2359 int orig_oif = fl4->flowi4_oif;
2360 unsigned int flags = 0;
2361 struct rtable *rth;
2362 int err;
2363
2364 if (fl4->saddr) {
2365 if (ipv4_is_multicast(fl4->saddr) ||
2366 ipv4_is_lbcast(fl4->saddr) ||
2367 ipv4_is_zeronet(fl4->saddr)) {
2368 rth = ERR_PTR(-EINVAL);
2369 goto out;
2370 }
2371
2372 rth = ERR_PTR(-ENETUNREACH);
2373
2374 /* I removed the check for oif == dev_out->oif here.
2375 It was wrong for two reasons:
2376 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2377 is assigned to multiple interfaces.
2378 2. Moreover, we are allowed to send packets with the saddr
2379 of another iface. --ANK
2380 */
2381
2382 if (fl4->flowi4_oif == 0 &&
2383 (ipv4_is_multicast(fl4->daddr) ||
2384 ipv4_is_lbcast(fl4->daddr))) {
2385 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2386 dev_out = __ip_dev_find(net, fl4->saddr, false);
2387 if (!dev_out)
2388 goto out;
2389
2390 /* Special hack: the user can direct multicasts
2391 and limited broadcast via the necessary interface
2392 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2393 This hack is not just for fun, it allows
2394 vic, vat and friends to work.
2395 They bind the socket to loopback, set the ttl to zero
2396 and expect that it will work.
2397 From the viewpoint of the routing cache they are broken,
2398 because we are not allowed to build a multicast path
2399 with a loopback source addr (the routing cache
2400 cannot know that the ttl is zero, so that the packet
2401 will not leave this host and the route is valid).
2402 Luckily, this hack is a good workaround.
2403 */
2404
2405 fl4->flowi4_oif = dev_out->ifindex;
2406 goto make_route;
2407 }
2408
2409 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2410 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2411 if (!__ip_dev_find(net, fl4->saddr, false))
2412 goto out;
2413 }
2414 }
2415
2416
2417 if (fl4->flowi4_oif) {
2418 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2419 rth = ERR_PTR(-ENODEV);
2420 if (!dev_out)
2421 goto out;
2422
2423 /* RACE: Check return value of inet_select_addr instead. */
2424 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2425 rth = ERR_PTR(-ENETUNREACH);
2426 goto out;
2427 }
2428 if (ipv4_is_local_multicast(fl4->daddr) ||
2429 ipv4_is_lbcast(fl4->daddr) ||
2430 fl4->flowi4_proto == IPPROTO_IGMP) {
2431 if (!fl4->saddr)
2432 fl4->saddr = inet_select_addr(dev_out, 0,
2433 RT_SCOPE_LINK);
2434 goto make_route;
2435 }
2436 if (!fl4->saddr) {
2437 if (ipv4_is_multicast(fl4->daddr))
2438 fl4->saddr = inet_select_addr(dev_out, 0,
2439 fl4->flowi4_scope);
2440 else if (!fl4->daddr)
2441 fl4->saddr = inet_select_addr(dev_out, 0,
2442 RT_SCOPE_HOST);
2443 }
2444 }
2445
2446 if (!fl4->daddr) {
2447 fl4->daddr = fl4->saddr;
2448 if (!fl4->daddr)
2449 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2450 dev_out = net->loopback_dev;
2451 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2452 res->type = RTN_LOCAL;
2453 flags |= RTCF_LOCAL;
2454 goto make_route;
2455 }
2456
2457 err = fib_lookup(net, fl4, res, 0);
2458 if (err) {
2459 res->fi = NULL;
2460 res->table = NULL;
2461 if (fl4->flowi4_oif &&
2462 (ipv4_is_multicast(fl4->daddr) ||
2463 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2464 /* Apparently, the routing tables are wrong. Assume
2465 that the destination is on-link.
2466
2467 WHY? DW.
2468 Because we are allowed to send to an iface
2469 even if it has NO routes and NO assigned
2470 addresses. When oif is specified, the routing
2471 tables are looked up with only one purpose:
2472 to check whether the destination is gatewayed, rather than
2473 direct. Moreover, if MSG_DONTROUTE is set,
2474 we send the packet, ignoring both the routing tables
2475 and the ifaddr state. --ANK
2476
2477
2478 We could do this even when oif is unknown,
2479 as IPv6 likely does, but we do not.
2480 */
2481
2482 if (fl4->saddr == 0)
2483 fl4->saddr = inet_select_addr(dev_out, 0,
2484 RT_SCOPE_LINK);
2485 res->type = RTN_UNICAST;
2486 goto make_route;
2487 }
2488 rth = ERR_PTR(err);
2489 goto out;
2490 }
2491
2492 if (res->type == RTN_LOCAL) {
2493 if (!fl4->saddr) {
2494 if (res->fi->fib_prefsrc)
2495 fl4->saddr = res->fi->fib_prefsrc;
2496 else
2497 fl4->saddr = fl4->daddr;
2498 }
2499
2500 /* L3 master device is the loopback for that domain */
2501 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2502 net->loopback_dev;
2503
2504 /* make sure orig_oif points to fib result device even
2505 * though packet rx/tx happens over loopback or l3mdev
2506 */
2507 orig_oif = FIB_RES_OIF(*res);
2508
2509 fl4->flowi4_oif = dev_out->ifindex;
2510 flags |= RTCF_LOCAL;
2511 goto make_route;
2512 }
2513
2514 fib_select_path(net, res, fl4, skb);
2515
2516 dev_out = FIB_RES_DEV(*res);
2517 fl4->flowi4_oif = dev_out->ifindex;
2518
2519
2520 make_route:
2521 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2522
2523 out:
2524 return rth;
2525 }
2526
2527 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2528 {
2529 return NULL;
2530 }
2531
2532 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2533 {
2534 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2535
2536 return mtu ? : dst->dev->mtu;
2537 }
2538
2539 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2540 struct sk_buff *skb, u32 mtu,
2541 bool confirm_neigh)
2542 {
2543 }
2544
2545 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2546 struct sk_buff *skb)
2547 {
2548 }
2549
2550 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2551 unsigned long old)
2552 {
2553 return NULL;
2554 }
2555
2556 static struct dst_ops ipv4_dst_blackhole_ops = {
2557 .family = AF_INET,
2558 .check = ipv4_blackhole_dst_check,
2559 .mtu = ipv4_blackhole_mtu,
2560 .default_advmss = ipv4_default_advmss,
2561 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2562 .redirect = ipv4_rt_blackhole_redirect,
2563 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2564 .neigh_lookup = ipv4_neigh_lookup,
2565 };
2566
2567 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2568 {
2569 struct rtable *ort = (struct rtable *) dst_orig;
2570 struct rtable *rt;
2571
2572 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2573 if (rt) {
2574 struct dst_entry *new = &rt->dst;
2575
2576 new->__use = 1;
2577 new->input = dst_discard;
2578 new->output = dst_discard_out;
2579
2580 new->dev = net->loopback_dev;
2581 if (new->dev)
2582 dev_hold(new->dev);
2583
2584 rt->rt_is_input = ort->rt_is_input;
2585 rt->rt_iif = ort->rt_iif;
2586 rt->rt_pmtu = ort->rt_pmtu;
2587 rt->rt_mtu_locked = ort->rt_mtu_locked;
2588
2589 rt->rt_genid = rt_genid_ipv4(net);
2590 rt->rt_flags = ort->rt_flags;
2591 rt->rt_type = ort->rt_type;
2592 rt->rt_gateway = ort->rt_gateway;
2593 rt->rt_uses_gateway = ort->rt_uses_gateway;
2594
2595 INIT_LIST_HEAD(&rt->rt_uncached);
2596 }
2597
2598 dst_release(dst_orig);
2599
2600 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2601 }
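/*
 * Editor's note (descriptive comment, not part of the original route.c):
 * ipv4_blackhole_route() clones the routing parameters of dst_orig into a
 * dst whose input/output handlers are dst_discard/dst_discard_out and whose
 * dst_ops are all no-ops, so the caller keeps a valid-looking route that
 * silently drops traffic; the xfrm code, for instance, uses such blackhole
 * routes while a flow must be held back until its state is resolved.
 */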
2602
2603 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2604 const struct sock *sk)
2605 {
2606 struct rtable *rt = __ip_route_output_key(net, flp4);
2607
2608 if (IS_ERR(rt))
2609 return rt;
2610
2611 if (flp4->flowi4_proto)
2612 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2613 flowi4_to_flowi(flp4),
2614 sk, 0);
2615
2616 return rt;
2617 }
2618 EXPORT_SYMBOL_GPL(ip_route_output_flow);
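/*
 * Editor's note: an illustrative sketch, not part of the original route.c.
 * A typical output lookup fills a struct flowi4 (much as inet_rtm_getroute()
 * below does for RTM_GETROUTE requests), resolves it with
 * ip_route_output_flow(), and drops the reference with ip_rt_put() when done.
 * The function name is an assumption for the example.
 */
static int example_output_lookup(struct net *net, __be32 daddr, __be32 saddr)
{
	struct rtable *rt;
	struct flowi4 fl4;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	fl4.flowi4_proto = IPPROTO_UDP;

	rt = ip_route_output_flow(net, &fl4, NULL);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	/* ... transmit via rt->dst ... */

	ip_rt_put(rt);
	return 0;
}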
2619
2620 /* called with rcu_read_lock held */
2621 static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
2622 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2623 u32 seq)
2624 {
2625 struct rtable *rt = skb_rtable(skb);
2626 struct rtmsg *r;
2627 struct nlmsghdr *nlh;
2628 unsigned long expires = 0;
2629 u32 error;
2630 u32 metrics[RTAX_MAX];
2631
2632 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2633 if (!nlh)
2634 return -EMSGSIZE;
2635
2636 r = nlmsg_data(nlh);
2637 r->rtm_family = AF_INET;
2638 r->rtm_dst_len = 32;
2639 r->rtm_src_len = 0;
2640 r->rtm_tos = fl4->flowi4_tos;
2641 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2642 if (nla_put_u32(skb, RTA_TABLE, table_id))
2643 goto nla_put_failure;
2644 r->rtm_type = rt->rt_type;
2645 r->rtm_scope = RT_SCOPE_UNIVERSE;
2646 r->rtm_protocol = RTPROT_UNSPEC;
2647 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2648 if (rt->rt_flags & RTCF_NOTIFY)
2649 r->rtm_flags |= RTM_F_NOTIFY;
2650 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2651 r->rtm_flags |= RTCF_DOREDIRECT;
2652
2653 if (nla_put_in_addr(skb, RTA_DST, dst))
2654 goto nla_put_failure;
2655 if (src) {
2656 r->rtm_src_len = 32;
2657 if (nla_put_in_addr(skb, RTA_SRC, src))
2658 goto nla_put_failure;
2659 }
2660 if (rt->dst.dev &&
2661 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2662 goto nla_put_failure;
2663 #ifdef CONFIG_IP_ROUTE_CLASSID
2664 if (rt->dst.tclassid &&
2665 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2666 goto nla_put_failure;
2667 #endif
2668 if (!rt_is_input_route(rt) &&
2669 fl4->saddr != src) {
2670 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2671 goto nla_put_failure;
2672 }
2673 if (rt->rt_uses_gateway &&
2674 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2675 goto nla_put_failure;
2676
2677 expires = rt->dst.expires;
2678 if (expires) {
2679 unsigned long now = jiffies;
2680
2681 if (time_before(now, expires))
2682 expires -= now;
2683 else
2684 expires = 0;
2685 }
2686
2687 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2688 if (rt->rt_pmtu && expires)
2689 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2690 if (rt->rt_mtu_locked && expires)
2691 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2692 if (rtnetlink_put_metrics(skb, metrics) < 0)
2693 goto nla_put_failure;
2694
2695 if (fl4->flowi4_mark &&
2696 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2697 goto nla_put_failure;
2698
2699 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2700 nla_put_u32(skb, RTA_UID,
2701 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2702 goto nla_put_failure;
2703
2704 error = rt->dst.error;
2705
2706 if (rt_is_input_route(rt)) {
2707 #ifdef CONFIG_IP_MROUTE
2708 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2709 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2710 int err = ipmr_get_route(net, skb,
2711 fl4->saddr, fl4->daddr,
2712 r, portid);
2713
2714 if (err <= 0) {
2715 if (err == 0)
2716 return 0;
2717 goto nla_put_failure;
2718 }
2719 } else
2720 #endif
2721 if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2722 goto nla_put_failure;
2723 }
2724
2725 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2726 goto nla_put_failure;
2727
2728 nlmsg_end(skb, nlh);
2729 return 0;
2730
2731 nla_put_failure:
2732 nlmsg_cancel(skb, nlh);
2733 return -EMSGSIZE;
2734 }
2735
2736 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2737 struct netlink_ext_ack *extack)
2738 {
2739 struct net *net = sock_net(in_skb->sk);
2740 struct rtmsg *rtm;
2741 struct nlattr *tb[RTA_MAX+1];
2742 struct fib_result res = {};
2743 struct rtable *rt = NULL;
2744 struct flowi4 fl4;
2745 __be32 dst = 0;
2746 __be32 src = 0;
2747 u32 iif;
2748 int err;
2749 int mark;
2750 struct sk_buff *skb;
2751 u32 table_id = RT_TABLE_MAIN;
2752 kuid_t uid;
2753
2754 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2755 extack);
2756 if (err < 0)
2757 goto errout;
2758
2759 rtm = nlmsg_data(nlh);
2760
2761 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2762 if (!skb) {
2763 err = -ENOBUFS;
2764 goto errout;
2765 }
2766
2767 /* Reserve room for dummy headers; this skb can pass
2768 through a good chunk of the routing engine.
2769 */
2770 skb_reset_mac_header(skb);
2771 skb_reset_network_header(skb);
2772
2773 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2774 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2775 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2776 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2777 if (tb[RTA_UID])
2778 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2779 else
2780 uid = (iif ? INVALID_UID : current_uid());
2781
2782 /* Bugfix: need to give ip_route_input enough of an IP header to
2783 * not gag.
2784 */
2785 ip_hdr(skb)->protocol = IPPROTO_UDP;
2786 ip_hdr(skb)->saddr = src;
2787 ip_hdr(skb)->daddr = dst;
2788
2789 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2790
2791 memset(&fl4, 0, sizeof(fl4));
2792 fl4.daddr = dst;
2793 fl4.saddr = src;
2794 fl4.flowi4_tos = rtm->rtm_tos;
2795 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2796 fl4.flowi4_mark = mark;
2797 fl4.flowi4_uid = uid;
2798
2799 rcu_read_lock();
2800
2801 if (iif) {
2802 struct net_device *dev;
2803
2804 dev = dev_get_by_index_rcu(net, iif);
2805 if (!dev) {
2806 err = -ENODEV;
2807 goto errout_free;
2808 }
2809
2810 skb->protocol = htons(ETH_P_IP);
2811 skb->dev = dev;
2812 skb->mark = mark;
2813 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2814 dev, &res);
2815
2816 rt = skb_rtable(skb);
2817 if (err == 0 && rt->dst.error)
2818 err = -rt->dst.error;
2819 } else {
2820 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2821 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2822 err = 0;
2823 if (IS_ERR(rt))
2824 err = PTR_ERR(rt);
2825 else
2826 skb_dst_set(skb, &rt->dst);
2827 }
2828
2829 if (err)
2830 goto errout_free;
2831
2832 if (rtm->rtm_flags & RTM_F_NOTIFY)
2833 rt->rt_flags |= RTCF_NOTIFY;
2834
2835 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2836 table_id = rt->rt_table_id;
2837
2838 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2839 if (!res.fi) {
2840 err = fib_props[res.type].error;
2841 if (!err)
2842 err = -EHOSTUNREACH;
2843 goto errout_free;
2844 }
2845 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2846 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2847 rt->rt_type, res.prefix, res.prefixlen,
2848 fl4.flowi4_tos, res.fi, 0);
2849 } else {
2850 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2851 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2852 }
2853 if (err < 0)
2854 goto errout_free;
2855
2856 rcu_read_unlock();
2857
2858 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2859 errout:
2860 return err;
2861
2862 errout_free:
2863 rcu_read_unlock();
2864 kfree_skb(skb);
2865 goto errout;
2866 }
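/*
 * Editor's note: an illustrative user-space sketch, not part of the original
 * route.c. inet_rtm_getroute() above is what answers an RTM_GETROUTE request
 * (this is how "ip route get" works). A minimal query for 8.8.8.8 might look
 * like the program below; error handling is omitted and all names are
 * assumptions for the example. It is wrapped in #if 0 because it is
 * user-space code, not kernel code.
 */
#if 0
#include <arpa/inet.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg rtm;
		char attrs[64];
	} req;
	struct rtattr *rta;
	struct in_addr dst;
	char reply[4096];
	int fd;

	inet_pton(AF_INET, "8.8.8.8", &dst);

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family = AF_INET;
	req.rtm.rtm_dst_len = 32;

	/* append an RTA_DST attribute carrying the destination address */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len = RTA_LENGTH(sizeof(dst));
	memcpy(RTA_DATA(rta), &dst, sizeof(dst));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_LENGTH(sizeof(dst));

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	send(fd, &req, req.nlh.nlmsg_len, 0);
	/* the RTM_NEWROUTE reply is the message built by rt_fill_info() */
	recv(fd, reply, sizeof(reply), 0);
	close(fd);
	return 0;
}
#endif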
2867
2868 void ip_rt_multicast_event(struct in_device *in_dev)
2869 {
2870 rt_cache_flush(dev_net(in_dev->dev));
2871 }
2872
2873 #ifdef CONFIG_SYSCTL
2874 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
2875 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
2876 static int ip_rt_gc_elasticity __read_mostly = 8;
2877
2878 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2879 void __user *buffer,
2880 size_t *lenp, loff_t *ppos)
2881 {
2882 struct net *net = (struct net *)__ctl->extra1;
2883
2884 if (write) {
2885 rt_cache_flush(net);
2886 fnhe_genid_bump(net);
2887 return 0;
2888 }
2889
2890 return -EINVAL;
2891 }
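/*
 * Editor's note: an illustrative user-space sketch, not part of the original
 * route.c. The handler above serves the write-only (mode 0200) "flush" entry
 * registered further below, i.e. /proc/sys/net/ipv4/route/flush: any write
 * flushes the route cache and bumps the fnhe genid. Wrapped in #if 0 because
 * it is user-space code.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

static int flush_ipv4_route_cache(void)
{
	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, "1", 1) != 1) {	/* the value written is ignored */
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}
#endif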
2892
2893 static struct ctl_table ipv4_route_table[] = {
2894 {
2895 .procname = "gc_thresh",
2896 .data = &ipv4_dst_ops.gc_thresh,
2897 .maxlen = sizeof(int),
2898 .mode = 0644,
2899 .proc_handler = proc_dointvec,
2900 },
2901 {
2902 .procname = "max_size",
2903 .data = &ip_rt_max_size,
2904 .maxlen = sizeof(int),
2905 .mode = 0644,
2906 .proc_handler = proc_dointvec,
2907 },
2908 {
2909 /* Deprecated. Use gc_min_interval_ms */
2910
2911 .procname = "gc_min_interval",
2912 .data = &ip_rt_gc_min_interval,
2913 .maxlen = sizeof(int),
2914 .mode = 0644,
2915 .proc_handler = proc_dointvec_jiffies,
2916 },
2917 {
2918 .procname = "gc_min_interval_ms",
2919 .data = &ip_rt_gc_min_interval,
2920 .maxlen = sizeof(int),
2921 .mode = 0644,
2922 .proc_handler = proc_dointvec_ms_jiffies,
2923 },
2924 {
2925 .procname = "gc_timeout",
2926 .data = &ip_rt_gc_timeout,
2927 .maxlen = sizeof(int),
2928 .mode = 0644,
2929 .proc_handler = proc_dointvec_jiffies,
2930 },
2931 {
2932 .procname = "gc_interval",
2933 .data = &ip_rt_gc_interval,
2934 .maxlen = sizeof(int),
2935 .mode = 0644,
2936 .proc_handler = proc_dointvec_jiffies,
2937 },
2938 {
2939 .procname = "redirect_load",
2940 .data = &ip_rt_redirect_load,
2941 .maxlen = sizeof(int),
2942 .mode = 0644,
2943 .proc_handler = proc_dointvec,
2944 },
2945 {
2946 .procname = "redirect_number",
2947 .data = &ip_rt_redirect_number,
2948 .maxlen = sizeof(int),
2949 .mode = 0644,
2950 .proc_handler = proc_dointvec,
2951 },
2952 {
2953 .procname = "redirect_silence",
2954 .data = &ip_rt_redirect_silence,
2955 .maxlen = sizeof(int),
2956 .mode = 0644,
2957 .proc_handler = proc_dointvec,
2958 },
2959 {
2960 .procname = "error_cost",
2961 .data = &ip_rt_error_cost,
2962 .maxlen = sizeof(int),
2963 .mode = 0644,
2964 .proc_handler = proc_dointvec,
2965 },
2966 {
2967 .procname = "error_burst",
2968 .data = &ip_rt_error_burst,
2969 .maxlen = sizeof(int),
2970 .mode = 0644,
2971 .proc_handler = proc_dointvec,
2972 },
2973 {
2974 .procname = "gc_elasticity",
2975 .data = &ip_rt_gc_elasticity,
2976 .maxlen = sizeof(int),
2977 .mode = 0644,
2978 .proc_handler = proc_dointvec,
2979 },
2980 {
2981 .procname = "mtu_expires",
2982 .data = &ip_rt_mtu_expires,
2983 .maxlen = sizeof(int),
2984 .mode = 0644,
2985 .proc_handler = proc_dointvec_jiffies,
2986 },
2987 {
2988 .procname = "min_pmtu",
2989 .data = &ip_rt_min_pmtu,
2990 .maxlen = sizeof(int),
2991 .mode = 0644,
2992 .proc_handler = proc_dointvec_minmax,
2993 .extra1 = &ip_min_valid_pmtu,
2994 },
2995 {
2996 .procname = "min_adv_mss",
2997 .data = &ip_rt_min_advmss,
2998 .maxlen = sizeof(int),
2999 .mode = 0644,
3000 .proc_handler = proc_dointvec,
3001 },
3002 { }
3003 };
3004
3005 static struct ctl_table ipv4_route_flush_table[] = {
3006 {
3007 .procname = "flush",
3008 .maxlen = sizeof(int),
3009 .mode = 0200,
3010 .proc_handler = ipv4_sysctl_rtcache_flush,
3011 },
3012 { },
3013 };
3014
3015 static __net_init int sysctl_route_net_init(struct net *net)
3016 {
3017 struct ctl_table *tbl;
3018
3019 tbl = ipv4_route_flush_table;
3020 if (!net_eq(net, &init_net)) {
3021 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3022 if (!tbl)
3023 goto err_dup;
3024
3025 /* Don't export sysctls to unprivileged users */
3026 if (net->user_ns != &init_user_ns)
3027 tbl[0].procname = NULL;
3028 }
3029 tbl[0].extra1 = net;
3030
3031 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3032 if (!net->ipv4.route_hdr)
3033 goto err_reg;
3034 return 0;
3035
3036 err_reg:
3037 if (tbl != ipv4_route_flush_table)
3038 kfree(tbl);
3039 err_dup:
3040 return -ENOMEM;
3041 }
3042
3043 static __net_exit void sysctl_route_net_exit(struct net *net)
3044 {
3045 struct ctl_table *tbl;
3046
3047 tbl = net->ipv4.route_hdr->ctl_table_arg;
3048 unregister_net_sysctl_table(net->ipv4.route_hdr);
3049 BUG_ON(tbl == ipv4_route_flush_table);
3050 kfree(tbl);
3051 }
3052
3053 static __net_initdata struct pernet_operations sysctl_route_ops = {
3054 .init = sysctl_route_net_init,
3055 .exit = sysctl_route_net_exit,
3056 };
3057 #endif
3058
3059 static __net_init int rt_genid_init(struct net *net)
3060 {
3061 atomic_set(&net->ipv4.rt_genid, 0);
3062 atomic_set(&net->fnhe_genid, 0);
3063 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3064 return 0;
3065 }
3066
3067 static __net_initdata struct pernet_operations rt_genid_ops = {
3068 .init = rt_genid_init,
3069 };
3070
3071 static int __net_init ipv4_inetpeer_init(struct net *net)
3072 {
3073 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3074
3075 if (!bp)
3076 return -ENOMEM;
3077 inet_peer_base_init(bp);
3078 net->ipv4.peers = bp;
3079 return 0;
3080 }
3081
3082 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3083 {
3084 struct inet_peer_base *bp = net->ipv4.peers;
3085
3086 net->ipv4.peers = NULL;
3087 inetpeer_invalidate_tree(bp);
3088 kfree(bp);
3089 }
3090
3091 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3092 .init = ipv4_inetpeer_init,
3093 .exit = ipv4_inetpeer_exit,
3094 };
3095
3096 #ifdef CONFIG_IP_ROUTE_CLASSID
3097 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3098 #endif /* CONFIG_IP_ROUTE_CLASSID */
3099
3100 int __init ip_rt_init(void)
3101 {
3102 int cpu;
3103
3104 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3105 if (!ip_idents)
3106 panic("IP: failed to allocate ip_idents\n");
3107
3108 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3109
3110 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3111 if (!ip_tstamps)
3112 panic("IP: failed to allocate ip_tstamps\n");
3113
3114 for_each_possible_cpu(cpu) {
3115 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3116
3117 INIT_LIST_HEAD(&ul->head);
3118 spin_lock_init(&ul->lock);
3119 }
3120 #ifdef CONFIG_IP_ROUTE_CLASSID
3121 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3122 if (!ip_rt_acct)
3123 panic("IP: failed to allocate ip_rt_acct\n");
3124 #endif
3125
3126 ipv4_dst_ops.kmem_cachep =
3127 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3128 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3129
3130 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3131
3132 if (dst_entries_init(&ipv4_dst_ops) < 0)
3133 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3134
3135 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3136 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3137
3138 ipv4_dst_ops.gc_thresh = ~0;
3139 ip_rt_max_size = INT_MAX;
3140
3141 devinet_init();
3142 ip_fib_init();
3143
3144 if (ip_rt_proc_init())
3145 pr_err("Unable to create route proc files\n");
3146 #ifdef CONFIG_XFRM
3147 xfrm_init();
3148 xfrm4_init();
3149 #endif
3150 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3151 RTNL_FLAG_DOIT_UNLOCKED);
3152
3153 #ifdef CONFIG_SYSCTL
3154 register_pernet_subsys(&sysctl_route_ops);
3155 #endif
3156 register_pernet_subsys(&rt_genid_ops);
3157 register_pernet_subsys(&ipv4_inetpeer_ops);
3158 return 0;
3159 }
3160
3161 #ifdef CONFIG_SYSCTL
3162 /*
3163 * We really need to sanitize the damn ipv4 init order, then all
3164 * this nonsense will go away.
3165 */
3166 void __init ip_static_sysctl_init(void)
3167 {
3168 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3169 }
3170 #endif