net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/skbuff.h>
  83 #include <linux/inetdevice.h>
  84 #include <linux/igmp.h>
  85 #include <linux/pkt_sched.h>
  86 #include <linux/mroute.h>
  87 #include <linux/netfilter_ipv4.h>
  88 #include <linux/random.h>
  89 #include <linux/rcupdate.h>
  90 #include <linux/times.h>
  91 #include <linux/slab.h>
  92 #include <linux/jhash.h>
  93 #include <net/dst.h>
  94 #include <net/dst_metadata.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/lwtunnel.h>
 107 #include <net/netevent.h>
 108 #include <net/rtnetlink.h>
 109 #ifdef CONFIG_SYSCTL
 110 #include <linux/sysctl.h>
 111 #include <linux/kmemleak.h>
 112 #endif
 113 #include <net/secure_seq.h>
 114 #include <net/ip_tunnels.h>
 115 #include <net/l3mdev.h>
 116
 117 #define RT_FL_TOS(oldflp4) \
 118         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 119
 120 #define RT_GC_TIMEOUT (300*HZ)
 121
 122 static int ip_rt_max_size;
 123 static int ip_rt_redirect_number __read_mostly  = 9;
 124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 126 static int ip_rt_error_cost __read_mostly       = HZ;
 127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 128 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 129 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 130 static int ip_rt_min_advmss __read_mostly       = 256;
 131
 132 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 133 /*
 134  *      Interface to generic destination cache.
 135  */
 136
 137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 138 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 139 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 141 static void              ipv4_link_failure(struct sk_buff *skb);
 142 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 143                                            struct sk_buff *skb, u32 mtu);
 144 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 145                                         struct sk_buff *skb);
 146 static void             ipv4_dst_destroy(struct dst_entry *dst);
 147
 148 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 149 {
 150         WARN_ON(1);
 151         return NULL;
 152 }
 153
 154 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 155                                            struct sk_buff *skb,
 156                                            const void *daddr);
 157 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 158
 159 static struct dst_ops ipv4_dst_ops = {
 160         .family =               AF_INET,
 161         .check =                ipv4_dst_check,
 162         .default_advmss =       ipv4_default_advmss,
 163         .mtu =                  ipv4_mtu,
 164         .cow_metrics =          ipv4_cow_metrics,
 165         .destroy =              ipv4_dst_destroy,
 166         .negative_advice =      ipv4_negative_advice,
 167         .link_failure =         ipv4_link_failure,
 168         .update_pmtu =          ip_rt_update_pmtu,
 169         .redirect =             ip_do_redirect,
 170         .local_out =            __ip_local_out,
 171         .neigh_lookup =         ipv4_neigh_lookup,
 172         .confirm_neigh =        ipv4_confirm_neigh,
 173 };
 174
 175 #define ECN_OR_COST(class)      TC_PRIO_##class
 176
 177 const __u8 ip_tos2prio[16] = {
 178         TC_PRIO_BESTEFFORT,
 179         ECN_OR_COST(BESTEFFORT),
 180         TC_PRIO_BESTEFFORT,
 181         ECN_OR_COST(BESTEFFORT),
 182         TC_PRIO_BULK,
 183         ECN_OR_COST(BULK),
 184         TC_PRIO_BULK,
 185         ECN_OR_COST(BULK),
 186         TC_PRIO_INTERACTIVE,
 187         ECN_OR_COST(INTERACTIVE),
 188         TC_PRIO_INTERACTIVE,
 189         ECN_OR_COST(INTERACTIVE),
 190         TC_PRIO_INTERACTIVE_BULK,
 191         ECN_OR_COST(INTERACTIVE_BULK),
 192         TC_PRIO_INTERACTIVE_BULK,
 193         ECN_OR_COST(INTERACTIVE_BULK)
 194 };
 195 EXPORT_SYMBOL(ip_tos2prio);
 196
 197 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 198 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 199
 200 #ifdef CONFIG_PROC_FS
 201 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 202 {
 203         if (*pos)
 204                 return NULL;
 205         return SEQ_START_TOKEN;
 206 }
 207
 208 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 209 {
 210         ++*pos;
 211         return NULL;
 212 }
 213
 214 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 215 {
 216 }
 217
 218 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 219 {
 220         if (v == SEQ_START_TOKEN)
 221                 seq_printf(seq, "%-127s\n",
 222                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 223                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 224                            "HHUptod\tSpecDst");
 225         return 0;
 226 }
 227
 228 static const struct seq_operations rt_cache_seq_ops = {
 229         .start  = rt_cache_seq_start,
 230         .next   = rt_cache_seq_next,
 231         .stop   = rt_cache_seq_stop,
 232         .show   = rt_cache_seq_show,
 233 };
 234
 235 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 236 {
 237         return seq_open(file, &rt_cache_seq_ops);
 238 }
 239
 240 static const struct file_operations rt_cache_seq_fops = {
 241         .owner   = THIS_MODULE,
 242         .open    = rt_cache_seq_open,
 243         .read    = seq_read,
 244         .llseek  = seq_lseek,
 245         .release = seq_release,
 246 };
 247
 248
 249 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 250 {
 251         int cpu;
 252
 253         if (*pos == 0)
 254                 return SEQ_START_TOKEN;
 255
 256         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 257                 if (!cpu_possible(cpu))
 258                         continue;
 259                 *pos = cpu+1;
 260                 return &per_cpu(rt_cache_stat, cpu);
 261         }
 262         return NULL;
 263 }
 264
 265 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 266 {
 267         int cpu;
 268
 269         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 270                 if (!cpu_possible(cpu))
 271                         continue;
 272                 *pos = cpu+1;
 273                 return &per_cpu(rt_cache_stat, cpu);
 274         }
 275         return NULL;
 276
 277 }
 278
 279 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 280 {
 281
 282 }
 283
 284 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 285 {
 286         struct rt_cache_stat *st = v;
 287
 288         if (v == SEQ_START_TOKEN) {
 289                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 290                 return 0;
 291         }
 292
 293         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 294                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 295                    dst_entries_get_slow(&ipv4_dst_ops),
 296                    0, /* st->in_hit */
 297                    st->in_slow_tot,
 298                    st->in_slow_mc,
 299                    st->in_no_route,
 300                    st->in_brd,
 301                    st->in_martian_dst,
 302                    st->in_martian_src,
 303
 304                    0, /* st->out_hit */
 305                    st->out_slow_tot,
 306                    st->out_slow_mc,
 307
 308                    0, /* st->gc_total */
 309                    0, /* st->gc_ignored */
 310                    0, /* st->gc_goal_miss */
 311                    0, /* st->gc_dst_overflow */
 312                    0, /* st->in_hlist_search */
 313                    0  /* st->out_hlist_search */
 314                 );
 315         return 0;
 316 }
 317
 318 static const struct seq_operations rt_cpu_seq_ops = {
 319         .start  = rt_cpu_seq_start,
 320         .next   = rt_cpu_seq_next,
 321         .stop   = rt_cpu_seq_stop,
 322         .show   = rt_cpu_seq_show,
 323 };
 324
 325
 326 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 327 {
 328         return seq_open(file, &rt_cpu_seq_ops);
 329 }
 330
 331 static const struct file_operations rt_cpu_seq_fops = {
 332         .owner   = THIS_MODULE,
 333         .open    = rt_cpu_seq_open,
 334         .read    = seq_read,
 335         .llseek  = seq_lseek,
 336         .release = seq_release,
 337 };
 338
 339 #ifdef CONFIG_IP_ROUTE_CLASSID
 340 static int rt_acct_proc_show(struct seq_file *m, void *v)
 341 {
 342         struct ip_rt_acct *dst, *src;
 343         unsigned int i, j;
 344
 345         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 346         if (!dst)
 347                 return -ENOMEM;
 348
 349         for_each_possible_cpu(i) {
 350                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 351                 for (j = 0; j < 256; j++) {
 352                         dst[j].o_bytes   += src[j].o_bytes;
 353                         dst[j].o_packets += src[j].o_packets;
 354                         dst[j].i_bytes   += src[j].i_bytes;
 355                         dst[j].i_packets += src[j].i_packets;
 356                 }
 357         }
 358
 359         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 360         kfree(dst);
 361         return 0;
 362 }
 363
 364 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 365 {
 366         return single_open(file, rt_acct_proc_show, NULL);
 367 }
 368
 369 static const struct file_operations rt_acct_proc_fops = {
 370         .owner          = THIS_MODULE,
 371         .open           = rt_acct_proc_open,
 372         .read           = seq_read,
 373         .llseek         = seq_lseek,
 374         .release        = single_release,
 375 };
 376 #endif
 377
 378 static int __net_init ip_rt_do_proc_init(struct net *net)
 379 {
 380         struct proc_dir_entry *pde;
 381
 382         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 383                           &rt_cache_seq_fops);
 384         if (!pde)
 385                 goto err1;
 386
 387         pde = proc_create("rt_cache", S_IRUGO,
 388                           net->proc_net_stat, &rt_cpu_seq_fops);
 389         if (!pde)
 390                 goto err2;
 391
 392 #ifdef CONFIG_IP_ROUTE_CLASSID
 393         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 394         if (!pde)
 395                 goto err3;
 396 #endif
 397         return 0;
 398
 399 #ifdef CONFIG_IP_ROUTE_CLASSID
 400 err3:
 401         remove_proc_entry("rt_cache", net->proc_net_stat);
 402 #endif
 403 err2:
 404         remove_proc_entry("rt_cache", net->proc_net);
 405 err1:
 406         return -ENOMEM;
 407 }
 408
 409 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 410 {
 411         remove_proc_entry("rt_cache", net->proc_net_stat);
 412         remove_proc_entry("rt_cache", net->proc_net);
 413 #ifdef CONFIG_IP_ROUTE_CLASSID
 414         remove_proc_entry("rt_acct", net->proc_net);
 415 #endif
 416 }
 417
 418 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 419         .init = ip_rt_do_proc_init,
 420         .exit = ip_rt_do_proc_exit,
 421 };
 422
 423 static int __init ip_rt_proc_init(void)
 424 {
 425         return register_pernet_subsys(&ip_rt_proc_ops);
 426 }
 427
 428 #else
 429 static inline int ip_rt_proc_init(void)
 430 {
 431         return 0;
 432 }
 433 #endif /* CONFIG_PROC_FS */
 434
 435 static inline bool rt_is_expired(const struct rtable *rth)
 436 {
 437         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 438 }
 439
 440 void rt_cache_flush(struct net *net)
 441 {
 442         rt_genid_bump_ipv4(net);
 443 }
 444
 445 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 446                                            struct sk_buff *skb,
 447                                            const void *daddr)
 448 {
 449         struct net_device *dev = dst->dev;
 450         const __be32 *pkey = daddr;
 451         const struct rtable *rt;
 452         struct neighbour *n;
 453
 454         rt = (const struct rtable *) dst;
 455         if (rt->rt_gateway)
 456                 pkey = (const __be32 *) &rt->rt_gateway;
 457         else if (skb)
 458                 pkey = &ip_hdr(skb)->daddr;
 459
 460         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 461         if (n)
 462                 return n;
 463         return neigh_create(&arp_tbl, pkey, dev);
 464 }
 465
 466 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 467 {
 468         struct net_device *dev = dst->dev;
 469         const __be32 *pkey = daddr;
 470         const struct rtable *rt;
 471
 472         rt = (const struct rtable *)dst;
 473         if (rt->rt_gateway)
 474                 pkey = (const __be32 *)&rt->rt_gateway;
 475         else if (!daddr ||
 476                  (rt->rt_flags &
 477                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
 478                 return;
 479
 480         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 481 }
 482
 483 #define IP_IDENTS_SZ 2048u
 484
 485 static atomic_t *ip_idents __read_mostly;
 486 static u32 *ip_tstamps __read_mostly;
 487
 488 /* In order to protect privacy, we add a perturbation to identifiers
 489  * if one generator is seldom used. This makes hard for an attacker
 490  * to infer how many packets were sent between two points in time.
 491  */
 492 u32 ip_idents_reserve(u32 hash, int segs)
 493 {
 494         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 495         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 496         u32 old = ACCESS_ONCE(*p_tstamp);
 497         u32 now = (u32)jiffies;
 498         u32 new, delta = 0;
 499
 500         if (old != now && cmpxchg(p_tstamp, old, now) == old)
 501                 delta = prandom_u32_max(now - old);
 502
 503         /* Do not use atomic_add_return() as it makes UBSAN unhappy */
 504         do {
 505                 old = (u32)atomic_read(p_id);
 506                 new = old + delta + segs;
 507         } while (atomic_cmpxchg(p_id, old, new) != old);
 508
 509         return new - segs;
 510 }
 511 EXPORT_SYMBOL(ip_idents_reserve);
 512
 513 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 514 {
 515         static u32 ip_idents_hashrnd __read_mostly;
 516         u32 hash, id;
 517
 518         net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
 519
 520         hash = jhash_3words((__force u32)iph->daddr,
 521                             (__force u32)iph->saddr,
 522                             iph->protocol ^ net_hash_mix(net),
 523                             ip_idents_hashrnd);
 524         id = ip_idents_reserve(hash, segs);
 525         iph->id = htons(id);
 526 }
 527 EXPORT_SYMBOL(__ip_select_ident);
 528
 529 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 530                              const struct sock *sk,
 531                              const struct iphdr *iph,
 532                              int oif, u8 tos,
 533                              u8 prot, u32 mark, int flow_flags)
 534 {
 535         if (sk) {
 536                 const struct inet_sock *inet = inet_sk(sk);
 537
 538                 oif = sk->sk_bound_dev_if;
 539                 mark = sk->sk_mark;
 540                 tos = RT_CONN_FLAGS(sk);
 541                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 542         }
 543         flowi4_init_output(fl4, oif, mark, tos,
 544                            RT_SCOPE_UNIVERSE, prot,
 545                            flow_flags,
 546                            iph->daddr, iph->saddr, 0, 0,
 547                            sock_net_uid(net, sk));
 548 }
 549
 550 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 551                                const struct sock *sk)
 552 {
 553         const struct net *net = dev_net(skb->dev);
 554         const struct iphdr *iph = ip_hdr(skb);
 555         int oif = skb->dev->ifindex;
 556         u8 tos = RT_TOS(iph->tos);
 557         u8 prot = iph->protocol;
 558         u32 mark = skb->mark;
 559
 560         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 561 }
 562
 563 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 564 {
 565         const struct inet_sock *inet = inet_sk(sk);
 566         const struct ip_options_rcu *inet_opt;
 567         __be32 daddr = inet->inet_daddr;
 568
 569         rcu_read_lock();
 570         inet_opt = rcu_dereference(inet->inet_opt);
 571         if (inet_opt && inet_opt->opt.srr)
 572                 daddr = inet_opt->opt.faddr;
 573         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 574                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 575                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 576                            inet_sk_flowi_flags(sk),
 577                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 578         rcu_read_unlock();
 579 }
 580
 581 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 582                                  const struct sk_buff *skb)
 583 {
 584         if (skb)
 585                 build_skb_flow_key(fl4, skb, sk);
 586         else
 587                 build_sk_flow_key(fl4, sk);
 588 }
 589
 590 static inline void rt_free(struct rtable *rt)
 591 {
 592         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
 593 }
 594
 595 static DEFINE_SPINLOCK(fnhe_lock);
 596
 597 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 598 {
 599         struct rtable *rt;
 600
 601         rt = rcu_dereference(fnhe->fnhe_rth_input);
 602         if (rt) {
 603                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 604                 rt_free(rt);
 605         }
 606         rt = rcu_dereference(fnhe->fnhe_rth_output);
 607         if (rt) {
 608                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 609                 rt_free(rt);
 610         }
 611 }
 612
 613 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 614 {
 615         struct fib_nh_exception *fnhe, *oldest;
 616
 617         oldest = rcu_dereference(hash->chain);
 618         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 619              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 620                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 621                         oldest = fnhe;
 622         }
 623         fnhe_flush_routes(oldest);
 624         return oldest;
 625 }
 626
 627 static inline u32 fnhe_hashfun(__be32 daddr)
 628 {
 629         static u32 fnhe_hashrnd __read_mostly;
 630         u32 hval;
 631
 632         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 633         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 634         return hash_32(hval, FNHE_HASH_SHIFT);
 635 }
 636
 637 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 638 {
 639         rt->rt_pmtu = fnhe->fnhe_pmtu;
 640         rt->dst.expires = fnhe->fnhe_expires;
 641
 642         if (fnhe->fnhe_gw) {
 643                 rt->rt_flags |= RTCF_REDIRECTED;
 644                 rt->rt_gateway = fnhe->fnhe_gw;
 645                 rt->rt_uses_gateway = 1;
 646         }
 647 }
 648
 649 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 650                                   u32 pmtu, unsigned long expires)
 651 {
 652         struct fnhe_hash_bucket *hash;
 653         struct fib_nh_exception *fnhe;
 654         struct rtable *rt;
 655         unsigned int i;
 656         int depth;
 657         u32 hval = fnhe_hashfun(daddr);
 658
 659         spin_lock_bh(&fnhe_lock);
 660
 661         hash = rcu_dereference(nh->nh_exceptions);
 662         if (!hash) {
 663                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 664                 if (!hash)
 665                         goto out_unlock;
 666                 rcu_assign_pointer(nh->nh_exceptions, hash);
 667         }
 668
 669         hash += hval;
 670
 671         depth = 0;
 672         for (fnhe = rcu_dereference(hash->chain); fnhe;
 673              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 674                 if (fnhe->fnhe_daddr == daddr)
 675                         break;
 676                 depth++;
 677         }
 678
 679         if (fnhe) {
 680                 if (gw)
 681                         fnhe->fnhe_gw = gw;
 682                 if (pmtu) {
 683                         fnhe->fnhe_pmtu = pmtu;
 684                         fnhe->fnhe_expires = max(1UL, expires);
 685                 }
 686                 /* Update all cached dsts too */
 687                 rt = rcu_dereference(fnhe->fnhe_rth_input);
 688                 if (rt)
 689                         fill_route_from_fnhe(rt, fnhe);
 690                 rt = rcu_dereference(fnhe->fnhe_rth_output);
 691                 if (rt)
 692                         fill_route_from_fnhe(rt, fnhe);
 693         } else {
 694                 if (depth > FNHE_RECLAIM_DEPTH)
 695                         fnhe = fnhe_oldest(hash);
 696                 else {
 697                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 698                         if (!fnhe)
 699                                 goto out_unlock;
 700
 701                         fnhe->fnhe_next = hash->chain;
 702                         rcu_assign_pointer(hash->chain, fnhe);
 703                 }
 704                 fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
 705                 fnhe->fnhe_daddr = daddr;
 706                 fnhe->fnhe_gw = gw;
 707                 fnhe->fnhe_pmtu = pmtu;
 708                 fnhe->fnhe_expires = expires;
 709
 710                 /* Exception created; mark the cached routes for the nexthop
 711                  * stale, so anyone caching it rechecks if this exception
 712                  * applies to them.
 713                  */
 714                 rt = rcu_dereference(nh->nh_rth_input);
 715                 if (rt)
 716                         rt->dst.obsolete = DST_OBSOLETE_KILL;
 717
 718                 for_each_possible_cpu(i) {
 719                         struct rtable __rcu **prt;
 720                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 721                         rt = rcu_dereference(*prt);
 722                         if (rt)
 723                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 724                 }
 725         }
 726
 727         fnhe->fnhe_stamp = jiffies;
 728
 729 out_unlock:
 730         spin_unlock_bh(&fnhe_lock);
 731 }
 732
 733 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 734                              bool kill_route)
 735 {
 736         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 737         __be32 old_gw = ip_hdr(skb)->saddr;
 738         struct net_device *dev = skb->dev;
 739         struct in_device *in_dev;
 740         struct fib_result res;
 741         struct neighbour *n;
 742         struct net *net;
 743
 744         switch (icmp_hdr(skb)->code & 7) {
 745         case ICMP_REDIR_NET:
 746         case ICMP_REDIR_NETTOS:
 747         case ICMP_REDIR_HOST:
 748         case ICMP_REDIR_HOSTTOS:
 749                 break;
 750
 751         default:
 752                 return;
 753         }
 754
 755         if (rt->rt_gateway != old_gw)
 756                 return;
 757
 758         in_dev = __in_dev_get_rcu(dev);
 759         if (!in_dev)
 760                 return;
 761
 762         net = dev_net(dev);
 763         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 764             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 765             ipv4_is_zeronet(new_gw))
 766                 goto reject_redirect;
 767
 768         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 769                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 770                         goto reject_redirect;
 771                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 772                         goto reject_redirect;
 773         } else {
 774                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 775                         goto reject_redirect;
 776         }
 777
 778         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 779         if (!n)
 780                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 781         if (!IS_ERR(n)) {
 782                 if (!(n->nud_state & NUD_VALID)) {
 783                         neigh_event_send(n, NULL);
 784                 } else {
 785                         if (fib_lookup(net, fl4, &res, 0) == 0) {
 786                                 struct fib_nh *nh = &FIB_RES_NH(res);
 787
 788                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 789                                                 0, jiffies + ip_rt_gc_timeout);
 790                         }
 791                         if (kill_route)
 792                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 793                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 794                 }
 795                 neigh_release(n);
 796         }
 797         return;
 798
 799 reject_redirect:
 800 #ifdef CONFIG_IP_ROUTE_VERBOSE
 801         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 802                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 803                 __be32 daddr = iph->daddr;
 804                 __be32 saddr = iph->saddr;
 805
 806                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 807                                      "  Advised path = %pI4 -> %pI4\n",
 808                                      &old_gw, dev->name, &new_gw,
 809                                      &saddr, &daddr);
 810         }
 811 #endif
 812         ;
 813 }
 814
 815 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 816 {
 817         struct rtable *rt;
 818         struct flowi4 fl4;
 819         const struct iphdr *iph = (const struct iphdr *) skb->data;
 820         struct net *net = dev_net(skb->dev);
 821         int oif = skb->dev->ifindex;
 822         u8 tos = RT_TOS(iph->tos);
 823         u8 prot = iph->protocol;
 824         u32 mark = skb->mark;
 825
 826         rt = (struct rtable *) dst;
 827
 828         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 829         __ip_do_redirect(rt, skb, &fl4, true);
 830 }
 831
 832 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 833 {
 834         struct rtable *rt = (struct rtable *)dst;
 835         struct dst_entry *ret = dst;
 836
 837         if (rt) {
 838                 if (dst->obsolete > 0) {
 839                         ip_rt_put(rt);
 840                         ret = NULL;
 841                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 842                            rt->dst.expires) {
 843                         ip_rt_put(rt);
 844                         ret = NULL;
 845                 }
 846         }
 847         return ret;
 848 }
 849
 850 /*
 851  * Algorithm:
 852  *      1. The first ip_rt_redirect_number redirects are sent
 853  *         with exponential backoff, then we stop sending them at all,
 854  *         assuming that the host ignores our redirects.
 855  *      2. If we did not see packets requiring redirects
 856  *         during ip_rt_redirect_silence, we assume that the host
 857  *         forgot redirected route and start to send redirects again.
 858  *
 859  * This algorithm is much cheaper and more intelligent than dumb load limiting
 860  * in icmp.c.
 861  *
 862  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 863  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 864  */
 865
 866 void ip_rt_send_redirect(struct sk_buff *skb)
 867 {
 868         struct rtable *rt = skb_rtable(skb);
 869         struct in_device *in_dev;
 870         struct inet_peer *peer;
 871         struct net *net;
 872         int log_martians;
 873         int vif;
 874
 875         rcu_read_lock();
 876         in_dev = __in_dev_get_rcu(rt->dst.dev);
 877         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 878                 rcu_read_unlock();
 879                 return;
 880         }
 881         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 882         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 883         rcu_read_unlock();
 884
 885         net = dev_net(rt->dst.dev);
 886         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 887         if (!peer) {
 888                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 889                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 890                 return;
 891         }
 892
 893         /* No redirected packets during ip_rt_redirect_silence;
 894          * reset the algorithm.
 895          */
 896         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 897                 peer->rate_tokens = 0;
 898
 899         /* Too many ignored redirects; do not send anything
 900          * set dst.rate_last to the last seen redirected packet.
 901          */
 902         if (peer->rate_tokens >= ip_rt_redirect_number) {
 903                 peer->rate_last = jiffies;
 904                 goto out_put_peer;
 905         }
 906
 907         /* Check for load limit; set rate_last to the latest sent
 908          * redirect.
 909          */
 910         if (peer->rate_tokens == 0 ||
 911             time_after(jiffies,
 912                        (peer->rate_last +
 913                         (ip_rt_redirect_load << peer->rate_tokens)))) {
 914                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 915
 916                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 917                 peer->rate_last = jiffies;
 918                 ++peer->rate_tokens;
 919 #ifdef CONFIG_IP_ROUTE_VERBOSE
 920                 if (log_martians &&
 921                     peer->rate_tokens == ip_rt_redirect_number)
 922                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 923                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 924                                              &ip_hdr(skb)->daddr, &gw);
 925 #endif
 926         }
 927 out_put_peer:
 928         inet_putpeer(peer);
 929 }
 930
 931 static int ip_error(struct sk_buff *skb)
 932 {
 933         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 934         struct rtable *rt = skb_rtable(skb);
 935         struct inet_peer *peer;
 936         unsigned long now;
 937         struct net *net;
 938         bool send;
 939         int code;
 940
 941         /* IP on this device is disabled. */
 942         if (!in_dev)
 943                 goto out;
 944
 945         net = dev_net(rt->dst.dev);
 946         if (!IN_DEV_FORWARD(in_dev)) {
 947                 switch (rt->dst.error) {
 948                 case EHOSTUNREACH:
 949                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 950                         break;
 951
 952                 case ENETUNREACH:
 953                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 954                         break;
 955                 }
 956                 goto out;
 957         }
 958
 959         switch (rt->dst.error) {
 960         case EINVAL:
 961         default:
 962                 goto out;
 963         case EHOSTUNREACH:
 964                 code = ICMP_HOST_UNREACH;
 965                 break;
 966         case ENETUNREACH:
 967                 code = ICMP_NET_UNREACH;
 968                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 969                 break;
 970         case EACCES:
 971                 code = ICMP_PKT_FILTERED;
 972                 break;
 973         }
 974
 975         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 976                                l3mdev_master_ifindex(skb->dev), 1);
 977
 978         send = true;
 979         if (peer) {
 980                 now = jiffies;
 981                 peer->rate_tokens += now - peer->rate_last;
 982                 if (peer->rate_tokens > ip_rt_error_burst)
 983                         peer->rate_tokens = ip_rt_error_burst;
 984                 peer->rate_last = now;
 985                 if (peer->rate_tokens >= ip_rt_error_cost)
 986                         peer->rate_tokens -= ip_rt_error_cost;
 987                 else
 988                         send = false;
 989                 inet_putpeer(peer);
 990         }
 991         if (send)
 992                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 993
 994 out:    kfree_skb(skb);
 995         return 0;
 996 }
 997
 998 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 999 {
1000         struct dst_entry *dst = &rt->dst;
1001         struct fib_result res;
1002
1003         if (dst_metric_locked(dst, RTAX_MTU))
1004                 return;
1005
1006         if (ipv4_mtu(dst) < mtu)
1007                 return;
1008
1009         if (mtu < ip_rt_min_pmtu)
1010                 mtu = ip_rt_min_pmtu;
1011
1012         if (rt->rt_pmtu == mtu &&
1013             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1014                 return;
1015
1016         rcu_read_lock();
1017         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1018                 struct fib_nh *nh = &FIB_RES_NH(res);
1019
1020                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
1021                                       jiffies + ip_rt_mtu_expires);
1022         }
1023         rcu_read_unlock();
1024 }
1025
1026 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1027                               struct sk_buff *skb, u32 mtu)
1028 {
1029         struct rtable *rt = (struct rtable *) dst;
1030         struct flowi4 fl4;
1031
1032         ip_rt_build_flow_key(&fl4, sk, skb);
1033         __ip_rt_update_pmtu(rt, &fl4, mtu);
1034 }
1035
1036 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1037                       int oif, u32 mark, u8 protocol, int flow_flags)
1038 {
1039         const struct iphdr *iph = (const struct iphdr *) skb->data;
1040         struct flowi4 fl4;
1041         struct rtable *rt;
1042
1043         if (!mark)
1044                 mark = IP4_REPLY_MARK(net, skb->mark);
1045
1046         __build_flow_key(net, &fl4, NULL, iph, oif,
1047                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1048         rt = __ip_route_output_key(net, &fl4);
1049         if (!IS_ERR(rt)) {
1050                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1051                 ip_rt_put(rt);
1052         }
1053 }
1054 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1055
1056 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1057 {
1058         const struct iphdr *iph = (const struct iphdr *) skb->data;
1059         struct flowi4 fl4;
1060         struct rtable *rt;
1061
1062         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1063
1064         if (!fl4.flowi4_mark)
1065                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1066
1067         rt = __ip_route_output_key(sock_net(sk), &fl4);
1068         if (!IS_ERR(rt)) {
1069                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1070                 ip_rt_put(rt);
1071         }
1072 }
1073
1074 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1075 {
1076         const struct iphdr *iph = (const struct iphdr *) skb->data;
1077         struct flowi4 fl4;
1078         struct rtable *rt;
1079         struct dst_entry *odst = NULL;
1080         bool new = false;
1081         struct net *net = sock_net(sk);
1082
1083         bh_lock_sock(sk);
1084
1085         if (!ip_sk_accept_pmtu(sk))
1086                 goto out;
1087
1088         odst = sk_dst_get(sk);
1089
1090         if (sock_owned_by_user(sk) || !odst) {
1091                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1092                 goto out;
1093         }
1094
1095         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1096
1097         rt = (struct rtable *)odst;
1098         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1099                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1100                 if (IS_ERR(rt))
1101                         goto out;
1102
1103                 new = true;
1104         }
1105
1106         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1107
1108         if (!dst_check(&rt->dst, 0)) {
1109                 if (new)
1110                         dst_release(&rt->dst);
1111
1112                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1113                 if (IS_ERR(rt))
1114                         goto out;
1115
1116                 new = true;
1117         }
1118
1119         if (new)
1120                 sk_dst_set(sk, &rt->dst);
1121
1122 out:
1123         bh_unlock_sock(sk);
1124         dst_release(odst);
1125 }
1126 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1127
1128 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1129                    int oif, u32 mark, u8 protocol, int flow_flags)
1130 {
1131         const struct iphdr *iph = (const struct iphdr *) skb->data;
1132         struct flowi4 fl4;
1133         struct rtable *rt;
1134
1135         __build_flow_key(net, &fl4, NULL, iph, oif,
1136                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1137         rt = __ip_route_output_key(net, &fl4);
1138         if (!IS_ERR(rt)) {
1139                 __ip_do_redirect(rt, skb, &fl4, false);
1140                 ip_rt_put(rt);
1141         }
1142 }
1143 EXPORT_SYMBOL_GPL(ipv4_redirect);
1144
1145 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1146 {
1147         const struct iphdr *iph = (const struct iphdr *) skb->data;
1148         struct flowi4 fl4;
1149         struct rtable *rt;
1150         struct net *net = sock_net(sk);
1151
1152         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1153         rt = __ip_route_output_key(net, &fl4);
1154         if (!IS_ERR(rt)) {
1155                 __ip_do_redirect(rt, skb, &fl4, false);
1156                 ip_rt_put(rt);
1157         }
1158 }
1159 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1160
1161 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1162 {
1163         struct rtable *rt = (struct rtable *) dst;
1164
1165         /* All IPV4 dsts are created with ->obsolete set to the value
1166          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1167          * into this function always.
1168          *
1169          * When a PMTU/redirect information update invalidates a route,
1170          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1171          * DST_OBSOLETE_DEAD by dst_free().
1172          */
1173         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1174                 return NULL;
1175         return dst;
1176 }
1177
1178 static void ipv4_link_failure(struct sk_buff *skb)
1179 {
1180         struct rtable *rt;
1181
1182         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1183
1184         rt = skb_rtable(skb);
1185         if (rt)
1186                 dst_set_expires(&rt->dst, 0);
1187 }
1188
1189 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1190 {
1191         pr_debug("%s: %pI4 -> %pI4, %s\n",
1192                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1193                  skb->dev ? skb->dev->name : "?");
1194         kfree_skb(skb);
1195         WARN_ON(1);
1196         return 0;
1197 }
1198
1199 /*
1200    We do not cache source address of outgoing interface,
1201    because it is used only by IP RR, TS and SRR options,
1202    so that it out of fast path.
1203
1204    BTW remember: "addr" is allowed to be not aligned
1205    in IP options!
1206  */
1207
1208 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1209 {
1210         __be32 src;
1211
1212         if (rt_is_output_route(rt))
1213                 src = ip_hdr(skb)->saddr;
1214         else {
1215                 struct fib_result res;
1216                 struct flowi4 fl4;
1217                 struct iphdr *iph;
1218
1219                 iph = ip_hdr(skb);
1220
1221                 memset(&fl4, 0, sizeof(fl4));
1222                 fl4.daddr = iph->daddr;
1223                 fl4.saddr = iph->saddr;
1224                 fl4.flowi4_tos = RT_TOS(iph->tos);
1225                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1226                 fl4.flowi4_iif = skb->dev->ifindex;
1227                 fl4.flowi4_mark = skb->mark;
1228
1229                 rcu_read_lock();
1230                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1231                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1232                 else
1233                         src = inet_select_addr(rt->dst.dev,
1234                                                rt_nexthop(rt, iph->daddr),
1235                                                RT_SCOPE_UNIVERSE);
1236                 rcu_read_unlock();
1237         }
1238         memcpy(addr, &src, 4);
1239 }
1240
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into rt->dst.tclassid one 16-bit half at a time: a half of the
 * class id is taken from @tag only if that half is still zero in the route.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->dst.tclassid & low_half) == 0)
		rt->dst.tclassid |= tag & low_half;
	if ((rt->dst.tclassid & high_half) == 0)
		rt->dst.tclassid |= tag & high_half;
}
#endif
1250
1251 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1252 {
1253         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1254
1255         if (advmss == 0) {
1256                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1257                                ip_rt_min_advmss);
1258                 if (advmss > 65535 - 40)
1259                         advmss = 65535 - 40;
1260         }
1261         return advmss;
1262 }
1263
1264 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1265 {
1266         const struct rtable *rt = (const struct rtable *) dst;
1267         unsigned int mtu = rt->rt_pmtu;
1268
1269         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1270                 mtu = dst_metric_raw(dst, RTAX_MTU);
1271
1272         if (mtu)
1273                 return mtu;
1274
1275         mtu = dst->dev->mtu;
1276
1277         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1278                 if (rt->rt_uses_gateway && mtu > 576)
1279                         mtu = 576;
1280         }
1281
1282         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1283
1284         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1285 }
1286
1287 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1288 {
1289         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1290         struct fib_nh_exception *fnhe;
1291         u32 hval;
1292
1293         if (!hash)
1294                 return NULL;
1295
1296         hval = fnhe_hashfun(daddr);
1297
1298         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1299              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1300                 if (fnhe->fnhe_daddr == daddr)
1301                         return fnhe;
1302         }
1303         return NULL;
1304 }
1305
1306 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1307                               __be32 daddr)
1308 {
1309         bool ret = false;
1310
1311         spin_lock_bh(&fnhe_lock);
1312
1313         if (daddr == fnhe->fnhe_daddr) {
1314                 struct rtable __rcu **porig;
1315                 struct rtable *orig;
1316                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1317
1318                 if (rt_is_input_route(rt))
1319                         porig = &fnhe->fnhe_rth_input;
1320                 else
1321                         porig = &fnhe->fnhe_rth_output;
1322                 orig = rcu_dereference(*porig);
1323
1324                 if (fnhe->fnhe_genid != genid) {
1325                         fnhe->fnhe_genid = genid;
1326                         fnhe->fnhe_gw = 0;
1327                         fnhe->fnhe_pmtu = 0;
1328                         fnhe->fnhe_expires = 0;
1329                         fnhe_flush_routes(fnhe);
1330                         orig = NULL;
1331                 }
1332                 fill_route_from_fnhe(rt, fnhe);
1333                 if (!rt->rt_gateway)
1334                         rt->rt_gateway = daddr;
1335
1336                 if (!(rt->dst.flags & DST_NOCACHE)) {
1337                         rcu_assign_pointer(*porig, rt);
1338                         if (orig)
1339                                 rt_free(orig);
1340                         ret = true;
1341                 }
1342
1343                 fnhe->fnhe_stamp = jiffies;
1344         }
1345         spin_unlock_bh(&fnhe_lock);
1346
1347         return ret;
1348 }
1349
1350 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1351 {
1352         struct rtable *orig, *prev, **p;
1353         bool ret = true;
1354
1355         if (rt_is_input_route(rt)) {
1356                 p = (struct rtable **)&nh->nh_rth_input;
1357         } else {
1358                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1359         }
1360         orig = *p;
1361
1362         prev = cmpxchg(p, orig, rt);
1363         if (prev == orig) {
1364                 if (orig)
1365                         rt_free(orig);
1366         } else
1367                 ret = false;
1368
1369         return ret;
1370 }
1371
1372 struct uncached_list {
1373         spinlock_t              lock;
1374         struct list_head        head;
1375 };
1376
1377 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1378
1379 static void rt_add_uncached_list(struct rtable *rt)
1380 {
1381         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1382
1383         rt->rt_uncached_list = ul;
1384
1385         spin_lock_bh(&ul->lock);
1386         list_add_tail(&rt->rt_uncached, &ul->head);
1387         spin_unlock_bh(&ul->lock);
1388 }
1389
1390 static void ipv4_dst_destroy(struct dst_entry *dst)
1391 {
1392         struct rtable *rt = (struct rtable *) dst;
1393
1394         if (!list_empty(&rt->rt_uncached)) {
1395                 struct uncached_list *ul = rt->rt_uncached_list;
1396
1397                 spin_lock_bh(&ul->lock);
1398                 list_del(&rt->rt_uncached);
1399                 spin_unlock_bh(&ul->lock);
1400         }
1401 }
1402
1403 void rt_flush_dev(struct net_device *dev)
1404 {
1405         struct net *net = dev_net(dev);
1406         struct rtable *rt;
1407         int cpu;
1408
1409         for_each_possible_cpu(cpu) {
1410                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1411
1412                 spin_lock_bh(&ul->lock);
1413                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1414                         if (rt->dst.dev != dev)
1415                                 continue;
1416                         rt->dst.dev = net->loopback_dev;
1417                         dev_hold(rt->dst.dev);
1418                         dev_put(dev);
1419                 }
1420                 spin_unlock_bh(&ul->lock);
1421         }
1422 }
1423
1424 static bool rt_cache_valid(const struct rtable *rt)
1425 {
1426         return  rt &&
1427                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1428                 !rt_is_expired(rt);
1429 }
1430
1431 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1432                            const struct fib_result *res,
1433                            struct fib_nh_exception *fnhe,
1434                            struct fib_info *fi, u16 type, u32 itag)
1435 {
1436         bool cached = false;
1437
1438         if (fi) {
1439                 struct fib_nh *nh = &FIB_RES_NH(*res);
1440
1441                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1442                         rt->rt_gateway = nh->nh_gw;
1443                         rt->rt_uses_gateway = 1;
1444                 }
1445                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1446 #ifdef CONFIG_IP_ROUTE_CLASSID
1447                 rt->dst.tclassid = nh->nh_tclassid;
1448 #endif
1449                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1450                 if (unlikely(fnhe))
1451                         cached = rt_bind_exception(rt, fnhe, daddr);
1452                 else if (!(rt->dst.flags & DST_NOCACHE))
1453                         cached = rt_cache_route(nh, rt);
1454                 if (unlikely(!cached)) {
1455                         /* Routes we intend to cache in nexthop exception or
1456                          * FIB nexthop have the DST_NOCACHE bit clear.
1457                          * However, if we are unsuccessful at storing this
1458                          * route into the cache we really need to set it.
1459                          */
1460                         rt->dst.flags |= DST_NOCACHE;
1461                         if (!rt->rt_gateway)
1462                                 rt->rt_gateway = daddr;
1463                         rt_add_uncached_list(rt);
1464                 }
1465         } else
1466                 rt_add_uncached_list(rt);
1467
1468 #ifdef CONFIG_IP_ROUTE_CLASSID
1469 #ifdef CONFIG_IP_MULTIPLE_TABLES
1470         set_class_tag(rt, res->tclassid);
1471 #endif
1472         set_class_tag(rt, itag);
1473 #endif
1474 }
1475
1476 struct rtable *rt_dst_alloc(struct net_device *dev,
1477                             unsigned int flags, u16 type,
1478                             bool nopolicy, bool noxfrm, bool will_cache)
1479 {
1480         struct rtable *rt;
1481
1482         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1483                        (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1484                        (nopolicy ? DST_NOPOLICY : 0) |
1485                        (noxfrm ? DST_NOXFRM : 0));
1486
1487         if (rt) {
1488                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1489                 rt->rt_flags = flags;
1490                 rt->rt_type = type;
1491                 rt->rt_is_input = 0;
1492                 rt->rt_iif = 0;
1493                 rt->rt_pmtu = 0;
1494                 rt->rt_gateway = 0;
1495                 rt->rt_uses_gateway = 0;
1496                 rt->rt_table_id = 0;
1497                 INIT_LIST_HEAD(&rt->rt_uncached);
1498
1499                 rt->dst.output = ip_output;
1500                 if (flags & RTCF_LOCAL)
1501                         rt->dst.input = ip_local_deliver;
1502         }
1503
1504         return rt;
1505 }
1506 EXPORT_SYMBOL(rt_dst_alloc);
1507
1508 /* called in rcu_read_lock() section */
1509 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1510                                 u8 tos, struct net_device *dev, int our)
1511 {
1512         struct rtable *rth;
1513         struct in_device *in_dev = __in_dev_get_rcu(dev);
1514         unsigned int flags = RTCF_MULTICAST;
1515         u32 itag = 0;
1516         int err;
1517
1518         /* Primary sanity checks. */
1519
1520         if (!in_dev)
1521                 return -EINVAL;
1522
1523         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1524             skb->protocol != htons(ETH_P_IP))
1525                 goto e_inval;
1526
1527         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1528                 goto e_inval;
1529
1530         if (ipv4_is_zeronet(saddr)) {
1531                 if (!ipv4_is_local_multicast(daddr))
1532                         goto e_inval;
1533         } else {
1534                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1535                                           in_dev, &itag);
1536                 if (err < 0)
1537                         goto e_err;
1538         }
1539         if (our)
1540                 flags |= RTCF_LOCAL;
1541
1542         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1543                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1544         if (!rth)
1545                 goto e_nobufs;
1546
1547 #ifdef CONFIG_IP_ROUTE_CLASSID
1548         rth->dst.tclassid = itag;
1549 #endif
1550         rth->dst.output = ip_rt_bug;
1551         rth->rt_is_input= 1;
1552
1553 #ifdef CONFIG_IP_MROUTE
1554         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1555                 rth->dst.input = ip_mr_input;
1556 #endif
1557         RT_CACHE_STAT_INC(in_slow_mc);
1558
1559         skb_dst_set(skb, &rth->dst);
1560         return 0;
1561
1562 e_nobufs:
1563         return -ENOBUFS;
1564 e_inval:
1565         return -EINVAL;
1566 e_err:
1567         return err;
1568 }
1569
1570
1571 static void ip_handle_martian_source(struct net_device *dev,
1572                                      struct in_device *in_dev,
1573                                      struct sk_buff *skb,
1574                                      __be32 daddr,
1575                                      __be32 saddr)
1576 {
1577         RT_CACHE_STAT_INC(in_martian_src);
1578 #ifdef CONFIG_IP_ROUTE_VERBOSE
1579         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1580                 /*
1581                  *      RFC1812 recommendation, if source is martian,
1582                  *      the only hint is MAC header.
1583                  */
1584                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1585                         &daddr, &saddr, dev->name);
1586                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1587                         print_hex_dump(KERN_WARNING, "ll header: ",
1588                                        DUMP_PREFIX_OFFSET, 16, 1,
1589                                        skb_mac_header(skb),
1590                                        dev->hard_header_len, true);
1591                 }
1592         }
1593 #endif
1594 }
1595
1596 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1597 {
1598         struct fnhe_hash_bucket *hash;
1599         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1600         u32 hval = fnhe_hashfun(daddr);
1601
1602         spin_lock_bh(&fnhe_lock);
1603
1604         hash = rcu_dereference_protected(nh->nh_exceptions,
1605                                          lockdep_is_held(&fnhe_lock));
1606         hash += hval;
1607
1608         fnhe_p = &hash->chain;
1609         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1610         while (fnhe) {
1611                 if (fnhe->fnhe_daddr == daddr) {
1612                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1613                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1614                         fnhe_flush_routes(fnhe);
1615                         kfree_rcu(fnhe, rcu);
1616                         break;
1617                 }
1618                 fnhe_p = &fnhe->fnhe_next;
1619                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1620                                                  lockdep_is_held(&fnhe_lock));
1621         }
1622
1623         spin_unlock_bh(&fnhe_lock);
1624 }
1625
1626 static void set_lwt_redirect(struct rtable *rth)
1627 {
1628         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1629                 rth->dst.lwtstate->orig_output = rth->dst.output;
1630                 rth->dst.output = lwtunnel_output;
1631         }
1632
1633         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1634                 rth->dst.lwtstate->orig_input = rth->dst.input;
1635                 rth->dst.input = lwtunnel_input;
1636         }
1637 }
1638
1639 /* called in rcu_read_lock() section */
1640 static int __mkroute_input(struct sk_buff *skb,
1641                            const struct fib_result *res,
1642                            struct in_device *in_dev,
1643                            __be32 daddr, __be32 saddr, u32 tos)
1644 {
1645         struct fib_nh_exception *fnhe;
1646         struct rtable *rth;
1647         int err;
1648         struct in_device *out_dev;
1649         bool do_cache;
1650         u32 itag = 0;
1651
1652         /* get a working reference to the output device */
1653         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1654         if (!out_dev) {
1655                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1656                 return -EINVAL;
1657         }
1658
1659         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1660                                   in_dev->dev, in_dev, &itag);
1661         if (err < 0) {
1662                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1663                                          saddr);
1664
1665                 goto cleanup;
1666         }
1667
1668         do_cache = res->fi && !itag;
1669         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1670             skb->protocol == htons(ETH_P_IP) &&
1671             (IN_DEV_SHARED_MEDIA(out_dev) ||
1672              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1673                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1674
1675         if (skb->protocol != htons(ETH_P_IP)) {
1676                 /* Not IP (i.e. ARP). Do not create route, if it is
1677                  * invalid for proxy arp. DNAT routes are always valid.
1678                  *
1679                  * Proxy arp feature have been extended to allow, ARP
1680                  * replies back to the same interface, to support
1681                  * Private VLAN switch technologies. See arp.c.
1682                  */
1683                 if (out_dev == in_dev &&
1684                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1685                         err = -EINVAL;
1686                         goto cleanup;
1687                 }
1688         }
1689
1690         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1691         if (do_cache) {
1692                 if (fnhe) {
1693                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1694                         if (rth && rth->dst.expires &&
1695                             time_after(jiffies, rth->dst.expires)) {
1696                                 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1697                                 fnhe = NULL;
1698                         } else {
1699                                 goto rt_cache;
1700                         }
1701                 }
1702
1703                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1704
1705 rt_cache:
1706                 if (rt_cache_valid(rth)) {
1707                         skb_dst_set_noref(skb, &rth->dst);
1708                         goto out;
1709                 }
1710         }
1711
1712         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1713                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1714                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1715         if (!rth) {
1716                 err = -ENOBUFS;
1717                 goto cleanup;
1718         }
1719
1720         rth->rt_is_input = 1;
1721         if (res->table)
1722                 rth->rt_table_id = res->table->tb_id;
1723         RT_CACHE_STAT_INC(in_slow_tot);
1724
1725         rth->dst.input = ip_forward;
1726
1727         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1728         set_lwt_redirect(rth);
1729         skb_dst_set(skb, &rth->dst);
1730 out:
1731         err = 0;
1732  cleanup:
1733         return err;
1734 }
1735
1736 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1737
1738 /* To make ICMP packets follow the right flow, the multipath hash is
1739  * calculated from the inner IP addresses in reverse order.
1740  */
1741 static int ip_multipath_icmp_hash(struct sk_buff *skb)
1742 {
1743         const struct iphdr *outer_iph = ip_hdr(skb);
1744         struct icmphdr _icmph;
1745         const struct icmphdr *icmph;
1746         struct iphdr _inner_iph;
1747         const struct iphdr *inner_iph;
1748
1749         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1750                 goto standard_hash;
1751
1752         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1753                                    &_icmph);
1754         if (!icmph)
1755                 goto standard_hash;
1756
1757         if (icmph->type != ICMP_DEST_UNREACH &&
1758             icmph->type != ICMP_REDIRECT &&
1759             icmph->type != ICMP_TIME_EXCEEDED &&
1760             icmph->type != ICMP_PARAMETERPROB) {
1761                 goto standard_hash;
1762         }
1763
1764         inner_iph = skb_header_pointer(skb,
1765                                        outer_iph->ihl * 4 + sizeof(_icmph),
1766                                        sizeof(_inner_iph), &_inner_iph);
1767         if (!inner_iph)
1768                 goto standard_hash;
1769
1770         return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
1771
1772 standard_hash:
1773         return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
1774 }
1775
1776 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1777
1778 static int ip_mkroute_input(struct sk_buff *skb,
1779                             struct fib_result *res,
1780                             struct in_device *in_dev,
1781                             __be32 daddr, __be32 saddr, u32 tos)
1782 {
1783 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1784         if (res->fi && res->fi->fib_nhs > 1) {
1785                 int h;
1786
1787                 if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
1788                         h = ip_multipath_icmp_hash(skb);
1789                 else
1790                         h = fib_multipath_hash(saddr, daddr);
1791                 fib_select_multipath(res, h);
1792         }
1793 #endif
1794
1795         /* create a routing cache entry */
1796         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1797 }
1798
1799 /*
1800  *      NOTE. We drop all the packets that have a local source
1801  *      address, because every properly looped back packet
1802  *      must have the correct destination already attached by the output routine.
1803  *
1804  *      Such an approach solves two big problems:
1805  *      1. Non-simplex devices are handled properly.
1806  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1807  *      called with rcu_read_lock()
      *
      *      Slow path of input route resolution for @skb received on @dev
      *      with destination @daddr, source @saddr and type of service @tos.
      *      Any dst already attached to the skb is dropped first.  On
      *      success a route (taken from the nexthop cache or freshly
      *      allocated) is attached to the skb and 0 is returned.
      *
      *      Failure returns visible here: -EINVAL (no in_device, martian
      *      destination, broadcast for a non-IP protocol, or a martian
      *      source detected before any validation call), -ENOBUFS (route
      *      allocation failed), a negative value from
      *      fib_validate_source(), or whatever ip_mkroute_input() returns.
      *      A failed FIB lookup is not returned as an error: it produces
      *      an RTN_UNREACHABLE route whose dst.error holds the error code.
1808  */
1809
1810 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1811                                u8 tos, struct net_device *dev)
1812 {
1813         struct fib_result res;
1814         struct in_device *in_dev = __in_dev_get_rcu(dev);
1815         struct ip_tunnel_info *tun_info;
1816         struct flowi4   fl4;
1817         unsigned int    flags = 0;
1818         u32             itag = 0;
1819         struct rtable   *rth;
1820         int             err = -EINVAL;
1821         struct net    *net = dev_net(dev);
1822         bool do_cache;
1823
1824         /* IP on this device is disabled. */
1825
1826         if (!in_dev)
1827                 goto out;
1828
1829         /* Check for the most weird martians, which can be not detected
1830            by fib_lookup.
1831          */
1832
             /* Copy the tunnel id of received (non-TX) tunnel metadata into
              * the flow key before the skb's current dst is dropped.
              */
1833         tun_info = skb_tunnel_info(skb);
1834         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1835                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1836         else
1837                 fl4.flowi4_tun_key.tun_id = 0;
1838         skb_dst_drop(skb);
1839
1840         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1841                 goto martian_source;
1842
             /* brd_input/local_input read res.fi and res.table, so they must
              * be defined before the first jump that bypasses fib_lookup().
              */
1843         res.fi = NULL;
1844         res.table = NULL;
1845         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1846                 goto brd_input;
1847
1848         /* Accept zero addresses only to limited broadcast;
1849          * I even do not know to fix it or not. Waiting for complains :-)
1850          */
1851         if (ipv4_is_zeronet(saddr))
1852                 goto martian_source;
1853
1854         if (ipv4_is_zeronet(daddr))
1855                 goto martian_destination;
1856
1857         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1858          * and call it once if daddr or/and saddr are loopback addresses
1859          */
1860         if (ipv4_is_loopback(daddr)) {
1861                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1862                         goto martian_destination;
1863         } else if (ipv4_is_loopback(saddr)) {
1864                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1865                         goto martian_source;
1866         }
1867
1868         /*
1869          *      Now we are ready to route packet.
1870          */
1871         fl4.flowi4_oif = 0;
1872         fl4.flowi4_iif = dev->ifindex;
1873         fl4.flowi4_mark = skb->mark;
1874         fl4.flowi4_tos = tos;
1875         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1876         fl4.flowi4_flags = 0;
1877         fl4.daddr = daddr;
1878         fl4.saddr = saddr;
1879         fl4.flowi4_uid = sock_net_uid(net, NULL);
1880         err = fib_lookup(net, &fl4, &res, 0);
1881         if (err != 0) {
                     /* With forwarding disabled on the input device the
                      * lookup error is replaced by -EHOSTUNREACH.
                      */
1882                 if (!IN_DEV_FORWARD(in_dev))
1883                         err = -EHOSTUNREACH;
1884                 goto no_route;
1885         }
1886
1887         if (res.type == RTN_BROADCAST)
1888                 goto brd_input;
1889
1890         if (res.type == RTN_LOCAL) {
1891                 err = fib_validate_source(skb, saddr, daddr, tos,
1892                                           0, dev, in_dev, &itag);
1893                 if (err < 0)
1894                         goto martian_source;
1895                 goto local_input;
1896         }
1897
             /* Everything below would forward the packet. */
1898         if (!IN_DEV_FORWARD(in_dev)) {
1899                 err = -EHOSTUNREACH;
1900                 goto no_route;
1901         }
1902         if (res.type != RTN_UNICAST)
1903                 goto martian_destination;
1904
1905         err = ip_mkroute_input(skb, &res, in_dev, daddr, saddr, tos);
1906 out:    return err;
1907
     /* Broadcast delivery: refused for anything but ETH_P_IP; the source is
      * validated unless it lies in the zero network.
      */
1908 brd_input:
1909         if (skb->protocol != htons(ETH_P_IP))
1910                 goto e_inval;
1911
1912         if (!ipv4_is_zeronet(saddr)) {
1913                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1914                                           in_dev, &itag);
1915                 if (err < 0)
1916                         goto martian_source;
1917         }
1918         flags |= RTCF_BROADCAST;
1919         res.type = RTN_BROADCAST;
1920         RT_CACHE_STAT_INC(in_brd);
1921
     /* Local delivery (also reached from brd_input and no_route).  The
      * nexthop's cached input route is reused only when the FIB result has
      * a fib_info and fib_validate_source() produced no itag.
      */
1922 local_input:
1923         do_cache = false;
1924         if (res.fi) {
1925                 if (!itag) {
1926                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1927                         if (rt_cache_valid(rth)) {
1928                                 skb_dst_set_noref(skb, &rth->dst);
1929                                 err = 0;
1930                                 goto out;
1931                         }
1932                         do_cache = true;
1933                 }
1934         }
1935
             /* The route's device is the L3 master of @dev when there is
              * one, otherwise the namespace's loopback device.
              */
1936         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
1937                            flags | RTCF_LOCAL, res.type,
1938                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1939         if (!rth)
1940                 goto e_nobufs;
1941
1942         rth->dst.output= ip_rt_bug;
1943 #ifdef CONFIG_IP_ROUTE_CLASSID
1944         rth->dst.tclassid = itag;
1945 #endif
1946         rth->rt_is_input = 1;
1947         if (res.table)
1948                 rth->rt_table_id = res.table->tb_id;
1949
1950         RT_CACHE_STAT_INC(in_slow_tot);
             /* Set up by no_route: err still holds the (negative) lookup
              * error, stored positively in dst.error for ip_error().
              */
1951         if (res.type == RTN_UNREACHABLE) {
1952                 rth->dst.input= ip_error;
1953                 rth->dst.error= -err;
1954                 rth->rt_flags   &= ~RTCF_LOCAL;
1955         }
1956
1957         if (do_cache) {
1958                 struct fib_nh *nh = &FIB_RES_NH(res);
1959
1960                 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1961                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1962                         WARN_ON(rth->dst.input == lwtunnel_input);
1963                         rth->dst.lwtstate->orig_input = rth->dst.input;
1964                         rth->dst.input = lwtunnel_input;
1965                 }
1966
                     /* If the route could not be stored in the nexthop
                      * cache, mark it DST_NOCACHE and put it on the
                      * uncached list instead.
                      */
1967                 if (unlikely(!rt_cache_route(nh, rth))) {
1968                         rth->dst.flags |= DST_NOCACHE;
1969                         rt_add_uncached_list(rth);
1970                 }
1971         }
1972         skb_dst_set(skb, &rth->dst);
1973         err = 0;
1974         goto out;
1975
     /* No usable route: fall through local_input to build an
      * RTN_UNREACHABLE route that is never cached (res.fi is cleared).
      */
1976 no_route:
1977         RT_CACHE_STAT_INC(in_no_route);
1978         res.type = RTN_UNREACHABLE;
1979         res.fi = NULL;
1980         res.table = NULL;
1981         goto local_input;
1982
1983         /*
1984          *      Do not cache martian addresses: they should be logged (RFC1812)
1985          */
1986 martian_destination:
1987         RT_CACHE_STAT_INC(in_martian_dst);
1988 #ifdef CONFIG_IP_ROUTE_VERBOSE
1989         if (IN_DEV_LOG_MARTIANS(in_dev))
1990                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1991                                      &daddr, &saddr, dev->name);
1992 #endif
1993
1994 e_inval:
1995         err = -EINVAL;
1996         goto out;
1997
1998 e_nobufs:
1999         err = -ENOBUFS;
2000         goto out;
2001
     /* err is left at whatever value it had when the jump was taken. */
2002 martian_source:
2003         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2004         goto out;
2005 }
2006
2007 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2008                          u8 tos, struct net_device *dev)
2009 {
2010         int res;
2011
2012         tos &= IPTOS_RT_MASK;
2013         rcu_read_lock();
2014
2015         /* Multicast recognition logic is moved from route cache to here.
2016            The problem was that too many Ethernet cards have broken/missing
2017            hardware multicast filters :-( As result the host on multicasting
2018            network acquires a lot of useless route cache entries, sort of
2019            SDR messages from all the world. Now we try to get rid of them.
2020            Really, provided software IP multicast filter is organized
2021            reasonably (at least, hashed), it does not result in a slowdown
2022            comparing with route cache reject entries.
2023            Note, that multicast routers are not affected, because
2024            route cache entry is created eventually.
2025          */
2026         if (ipv4_is_multicast(daddr)) {
2027                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2028                 int our = 0;
2029
2030                 if (in_dev)
2031                         our = ip_check_mc_rcu(in_dev, daddr, saddr,
2032                                               ip_hdr(skb)->protocol);
2033
2034                 /* check l3 master if no match yet */
2035                 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2036                         struct in_device *l3_in_dev;
2037
2038                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2039                         if (l3_in_dev)
2040                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2041                                                       ip_hdr(skb)->protocol);
2042                 }
2043
2044                 res = -EINVAL;
2045                 if (our
2046 #ifdef CONFIG_IP_MROUTE
2047                         ||
2048                     (!ipv4_is_local_multicast(daddr) &&
2049                      IN_DEV_MFORWARD(in_dev))
2050 #endif
2051                    ) {
2052                         res = ip_route_input_mc(skb, daddr, saddr,
2053                                                 tos, dev, our);
2054                 }
2055                 rcu_read_unlock();
2056                 return res;
2057         }
2058         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2059         rcu_read_unlock();
2060         return res;
2061 }
2062 EXPORT_SYMBOL(ip_route_input_noref);
2063
2064 /* called with rcu_read_lock() */
     /* Produce the output route for an already resolved FIB result.
      *
      * @res:      FIB lookup result (res->fi may be NULL)
      * @fl4:      flow being routed; only read here
      * @orig_oif: output interface index the caller originally asked for
      * @dev_out:  device the route will send through
      * @flags:    initial RTCF_* flags; more are added depending on the
      *            destination type
      *
      * Returns a cached route with an extra reference taken (dst_hold) on a
      * cache hit, a newly allocated route otherwise, or ERR_PTR(-EINVAL) /
      * ERR_PTR(-ENOBUFS) on failure.
      */
2065 static struct rtable *__mkroute_output(const struct fib_result *res,
2066                                        const struct flowi4 *fl4, int orig_oif,
2067                                        struct net_device *dev_out,
2068                                        unsigned int flags)
2069 {
2070         struct fib_info *fi = res->fi;
2071         struct fib_nh_exception *fnhe;
2072         struct in_device *in_dev;
2073         u16 type = res->type;
2074         struct rtable *rth;
2075         bool do_cache;
2076
2077         in_dev = __in_dev_get_rcu(dev_out);
2078         if (!in_dev)
2079                 return ERR_PTR(-EINVAL);
2080
             /* Unless IN_DEV_ROUTE_LOCALNET() is set for the device, a
              * loopback source address is only accepted when the output
              * device is a loopback or an L3 master device.
              */
2081         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2082                 if (ipv4_is_loopback(fl4->saddr) &&
2083                     !(dev_out->flags & IFF_LOOPBACK) &&
2084                     !netif_is_l3_master(dev_out))
2085                         return ERR_PTR(-EINVAL);
2086
             /* The class of the destination address overrides the route
              * type from the FIB result.
              */
2087         if (ipv4_is_lbcast(fl4->daddr))
2088                 type = RTN_BROADCAST;
2089         else if (ipv4_is_multicast(fl4->daddr))
2090                 type = RTN_MULTICAST;
2091         else if (ipv4_is_zeronet(fl4->daddr))
2092                 return ERR_PTR(-EINVAL);
2093
2094         if (dev_out->flags & IFF_LOOPBACK)
2095                 flags |= RTCF_LOCAL;
2096
2097         do_cache = true;
2098         if (type == RTN_BROADCAST) {
2099                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2100                 fi = NULL;
2101         } else if (type == RTN_MULTICAST) {
2102                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
                     /* RTCF_LOCAL stays set only when ip_check_mc_rcu()
                      * reports a match; such routes are not cached.
                      */
2103                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2104                                      fl4->flowi4_proto))
2105                         flags &= ~RTCF_LOCAL;
2106                 else
2107                         do_cache = false;
2108                 /* If multicast route do not exist use
2109                  * default one, but do not gateway in this case.
2110                  * Yes, it is hack.
2111                  */
2112                 if (fi && res->prefixlen < 4)
2113                         fi = NULL;
2114         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2115                    (orig_oif != dev_out->ifindex)) {
2116                 /* For local routes that require a particular output interface
2117                  * we do not want to cache the result.  Caching the result
2118                  * causes incorrect behaviour when there are multiple source
2119                  * addresses on the interface, the end result being that if the
2120                  * intended recipient is waiting on that interface for the
2121                  * packet he won't receive it because it will be delivered on
2122                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2123                  * be set to the loopback interface as well.
2124                  */
2125                 fi = NULL;
2126         }
2127
2128         fnhe = NULL;
             /* Caching is only attempted while a fib_info is still attached. */
2129         do_cache &= fi != NULL;
2130         if (do_cache) {
2131                 struct rtable __rcu **prth;
2132                 struct fib_nh *nh = &FIB_RES_NH(*res);
2133
                     /* A nexthop exception for this destination takes
                      * precedence.  If its cached route has expired the
                      * exception is deleted and the per-cpu nexthop cache
                      * is consulted instead.
                      */
2134                 fnhe = find_exception(nh, fl4->daddr);
2135                 if (fnhe) {
2136                         prth = &fnhe->fnhe_rth_output;
2137                         rth = rcu_dereference(*prth);
2138                         if (rth && rth->dst.expires &&
2139                             time_after(jiffies, rth->dst.expires)) {
2140                                 ip_del_fnhe(nh, fl4->daddr);
2141                                 fnhe = NULL;
2142                         } else {
2143                                 goto rt_cache;
2144                         }
2145                 }
2146
                     /* With FLOWI_FLAG_KNOWN_NH the cache is bypassed
                      * unless the nexthop has a gateway with link scope.
                      */
2147                 if (unlikely(fl4->flowi4_flags &
2148                              FLOWI_FLAG_KNOWN_NH &&
2149                              !(nh->nh_gw &&
2150                                nh->nh_scope == RT_SCOPE_LINK))) {
2151                         do_cache = false;
2152                         goto add;
2153                 }
2154                 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2155                 rth = rcu_dereference(*prth);
2156
2157 rt_cache:
                     /* Cache hit: hand out the cached route with an extra
                      * reference.
                      */
2158                 if (rt_cache_valid(rth)) {
2159                         dst_hold(&rth->dst);
2160                         return rth;
2161                 }
2162         }
2163
2164 add:
2165         rth = rt_dst_alloc(dev_out, flags, type,
2166                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2167                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2168                            do_cache);
2169         if (!rth)
2170                 return ERR_PTR(-ENOBUFS);
2171
             /* "orig_oif ? : 0" evaluates to orig_oif in both cases. */
2172         rth->rt_iif     = orig_oif ? : 0;
2173         if (res->table)
2174                 rth->rt_table_id = res->table->tb_id;
2175
2176         RT_CACHE_STAT_INC(out_slow_tot);
2177
2178         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2179                 if (flags & RTCF_LOCAL &&
2180                     !(dev_out->flags & IFF_LOOPBACK)) {
2181                         rth->dst.output = ip_mc_output;
2182                         RT_CACHE_STAT_INC(out_slow_mc);
2183                 }
2184 #ifdef CONFIG_IP_MROUTE
2185                 if (type == RTN_MULTICAST) {
2186                         if (IN_DEV_MFORWARD(in_dev) &&
2187                             !ipv4_is_local_multicast(fl4->daddr)) {
2188                                 rth->dst.input = ip_mr_input;
2189                                 rth->dst.output = ip_mc_output;
2190                         }
2191                 }
2192 #endif
2193         }
2194
2195         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2196         set_lwt_redirect(rth);
2197
2198         return rth;
2199 }
2200
2201 /*
2202  * Major route resolver routine.
      *
      * Resolves the output route for flow @fl4 in namespace @net; @mp_hash
      * is handed on to fib_select_path().  The lookup runs under
      * rcu_read_lock().
      *
      * @fl4 is updated in place: flowi4_iif, flowi4_tos and flowi4_scope
      * are always rewritten, and saddr, daddr and flowi4_oif may be filled
      * in along the way.
      *
      * Returns the route produced by __mkroute_output(), or an ERR_PTR():
      * -EINVAL for an unacceptable source address, -ENODEV when flowi4_oif
      * names no device, -ENETUNREACH when that device is down or has no
      * in_device, or the fib_lookup() error when no fallback applies.
2203  */
2204
2205 struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2206                                           int mp_hash)
2207 {
2208         struct net_device *dev_out = NULL;
2209         __u8 tos = RT_FL_TOS(fl4);
2210         unsigned int flags = 0;
2211         struct fib_result res;
2212         struct rtable *rth;
2213         int orig_oif;
2214         int err = -ENETUNREACH;
2215
2216         res.tclassid    = 0;
2217         res.fi          = NULL;
2218         res.table       = NULL;
2219
             /* Remember the caller's oif; fl4->flowi4_oif is overwritten below. */
2220         orig_oif = fl4->flowi4_oif;
2221
2222         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2223         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2224         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2225                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2226
2227         rcu_read_lock();
2228         if (fl4->saddr) {
                     /* Every "goto out" in this branch returns -EINVAL. */
2229                 rth = ERR_PTR(-EINVAL);
2230                 if (ipv4_is_multicast(fl4->saddr) ||
2231                     ipv4_is_lbcast(fl4->saddr) ||
2232                     ipv4_is_zeronet(fl4->saddr))
2233                         goto out;
2234
2235                 /* I removed check for oif == dev_out->oif here.
2236                    It was wrong for two reasons:
2237                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2238                       is assigned to multiple interfaces.
2239                    2. Moreover, we are allowed to send packets with saddr
2240                       of another iface. --ANK
2241                  */
2242
2243                 if (fl4->flowi4_oif == 0 &&
2244                     (ipv4_is_multicast(fl4->daddr) ||
2245                      ipv4_is_lbcast(fl4->daddr))) {
2246                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2247                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2248                         if (!dev_out)
2249                                 goto out;
2250
2251                         /* Special hack: user can direct multicasts
2252                            and limited broadcast via necessary interface
2253                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2254                            This hack is not just for fun, it allows
2255                            vic,vat and friends to work.
2256                            They bind socket to loopback, set ttl to zero
2257                            and expect that it will work.
2258                            From the viewpoint of routing cache they are broken,
2259                            because we are not allowed to build multicast path
2260                            with loopback source addr (look, routing cache
2261                            cannot know, that ttl is zero, so that packet
2262                            will not leave this host and route is valid).
2263                            Luckily, this hack is good workaround.
2264                          */
2265
2266                         fl4->flowi4_oif = dev_out->ifindex;
2267                         goto make_route;
2268                 }
2269
2270                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2271                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2272                         if (!__ip_dev_find(net, fl4->saddr, false))
2273                                 goto out;
2274                 }
2275         }
2276
2277
             /* An output interface was requested (or chosen above). */
2278         if (fl4->flowi4_oif) {
2279                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2280                 rth = ERR_PTR(-ENODEV);
2281                 if (!dev_out)
2282                         goto out;
2283
2284                 /* RACE: Check return value of inet_select_addr instead. */
2285                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2286                         rth = ERR_PTR(-ENETUNREACH);
2287                         goto out;
2288                 }
                     /* Link-local multicast, limited broadcast and IGMP skip
                      * the FIB lookup and go out through the given device.
                      */
2289                 if (ipv4_is_local_multicast(fl4->daddr) ||
2290                     ipv4_is_lbcast(fl4->daddr) ||
2291                     fl4->flowi4_proto == IPPROTO_IGMP) {
2292                         if (!fl4->saddr)
2293                                 fl4->saddr = inet_select_addr(dev_out, 0,
2294                                                               RT_SCOPE_LINK);
2295                         goto make_route;
2296                 }
2297                 if (!fl4->saddr) {
2298                         if (ipv4_is_multicast(fl4->daddr))
2299                                 fl4->saddr = inet_select_addr(dev_out, 0,
2300                                                               fl4->flowi4_scope);
2301                         else if (!fl4->daddr)
2302                                 fl4->saddr = inet_select_addr(dev_out, 0,
2303                                                               RT_SCOPE_HOST);
2304                 }
2305         }
2306
             /* No destination: route to the source address, or to
              * INADDR_LOOPBACK when there is no source either, through the
              * loopback device.
              */
2307         if (!fl4->daddr) {
2308                 fl4->daddr = fl4->saddr;
2309                 if (!fl4->daddr)
2310                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2311                 dev_out = net->loopback_dev;
2312                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2313                 res.type = RTN_LOCAL;
2314                 flags |= RTCF_LOCAL;
2315                 goto make_route;
2316         }
2317
2318         err = fib_lookup(net, fl4, &res, 0);
2319         if (err) {
2320                 res.fi = NULL;
2321                 res.table = NULL;
2322                 if (fl4->flowi4_oif &&
2323                     (ipv4_is_multicast(fl4->daddr) ||
2324                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2325                         /* Apparently, routing tables are wrong. Assume,
2326                            that the destination is on link.
2327
2328                            WHY? DW.
2329                            Because we are allowed to send to iface
2330                            even if it has NO routes and NO assigned
2331                            addresses. When oif is specified, routing
2332                            tables are looked up with only one purpose:
2333                            to catch if destination is gatewayed, rather than
2334                            direct. Moreover, if MSG_DONTROUTE is set,
2335                            we send packet, ignoring both routing tables
2336                            and ifaddr state. --ANK
2337
2338
2339                            We could make it even if oif is unknown,
2340                            likely IPv6, but we do not.
2341                          */
2342
2343                         if (fl4->saddr == 0)
2344                                 fl4->saddr = inet_select_addr(dev_out, 0,
2345                                                               RT_SCOPE_LINK);
2346                         res.type = RTN_UNICAST;
2347                         goto make_route;
2348                 }
2349                 rth = ERR_PTR(err);
2350                 goto out;
2351         }
2352
2353         if (res.type == RTN_LOCAL) {
                     /* Default source: the route's preferred source if it
                      * has one, else the destination itself.
                      */
2354                 if (!fl4->saddr) {
2355                         if (res.fi->fib_prefsrc)
2356                                 fl4->saddr = res.fi->fib_prefsrc;
2357                         else
2358                                 fl4->saddr = fl4->daddr;
2359                 }
2360
2361                 /* L3 master device is the loopback for that domain */
2362                 dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev;
2363                 fl4->flowi4_oif = dev_out->ifindex;
2364                 flags |= RTCF_LOCAL;
2365                 goto make_route;
2366         }
2367
2368         fib_select_path(net, &res, fl4, mp_hash);
2369
2370         dev_out = FIB_RES_DEV(res);
2371         fl4->flowi4_oif = dev_out->ifindex;
2372
2373
2374 make_route:
2375         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2376
2377 out:
2378         rcu_read_unlock();
2379         return rth;
2380 }
2381 EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
2382
2383 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2384 {
2385         return NULL;
2386 }
2387
2388 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2389 {
2390         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2391
2392         return mtu ? : dst->dev->mtu;
2393 }
2394
2395 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2396                                           struct sk_buff *skb, u32 mtu)
2397 {
2398 }
2399
/* dst_ops.redirect callback for blackhole routes: intentionally a
 * no-op, none of the arguments is used.
 */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
                                       struct sk_buff *skb)
{
}
2404
2405 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2406                                           unsigned long old)
2407 {
2408         return NULL;
2409 }
2410
2411 static struct dst_ops ipv4_dst_blackhole_ops = {
2412         .family                 =       AF_INET,
2413         .check                  =       ipv4_blackhole_dst_check,
2414         .mtu                    =       ipv4_blackhole_mtu,
2415         .default_advmss         =       ipv4_default_advmss,
2416         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2417         .redirect               =       ipv4_rt_blackhole_redirect,
2418         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2419         .neigh_lookup           =       ipv4_neigh_lookup,
2420 };
2421
2422 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2423 {
2424         struct rtable *ort = (struct rtable *) dst_orig;
2425         struct rtable *rt;
2426
2427         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2428         if (rt) {
2429                 struct dst_entry *new = &rt->dst;
2430
2431                 new->__use = 1;
2432                 new->input = dst_discard;
2433                 new->output = dst_discard_out;
2434
2435                 new->dev = ort->dst.dev;
2436                 if (new->dev)
2437                         dev_hold(new->dev);
2438
2439                 rt->rt_is_input = ort->rt_is_input;
2440                 rt->rt_iif = ort->rt_iif;
2441                 rt->rt_pmtu = ort->rt_pmtu;
2442
2443                 rt->rt_genid = rt_genid_ipv4(net);
2444                 rt->rt_flags = ort->rt_flags;
2445                 rt->rt_type = ort->rt_type;
2446                 rt->rt_gateway = ort->rt_gateway;
2447                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2448
2449                 INIT_LIST_HEAD(&rt->rt_uncached);
2450                 dst_free(new);
2451         }
2452
2453         dst_release(dst_orig);
2454
2455         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2456 }
2457
2458 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2459                                     const struct sock *sk)
2460 {
2461         struct rtable *rt = __ip_route_output_key(net, flp4);
2462
2463         if (IS_ERR(rt))
2464                 return rt;
2465
2466         if (flp4->flowi4_proto)
2467                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2468                                                         flowi4_to_flowi(flp4),
2469                                                         sk, 0);
2470
2471         return rt;
2472 }
2473 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2474
2475 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2476                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2477                         u32 seq, int event)
2478 {
2479         struct rtable *rt = skb_rtable(skb);
2480         struct rtmsg *r;
2481         struct nlmsghdr *nlh;
2482         unsigned long expires = 0;
2483         u32 error;
2484         u32 metrics[RTAX_MAX];
2485
2486         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), 0);
2487         if (!nlh)
2488                 return -EMSGSIZE;
2489
2490         r = nlmsg_data(nlh);
2491         r->rtm_family    = AF_INET;
2492         r->rtm_dst_len  = 32;
2493         r->rtm_src_len  = 0;
2494         r->rtm_tos      = fl4->flowi4_tos;
2495         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2496         if (nla_put_u32(skb, RTA_TABLE, table_id))
2497                 goto nla_put_failure;
2498         r->rtm_type     = rt->rt_type;
2499         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2500         r->rtm_protocol = RTPROT_UNSPEC;
2501         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2502         if (rt->rt_flags & RTCF_NOTIFY)
2503                 r->rtm_flags |= RTM_F_NOTIFY;
2504         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2505                 r->rtm_flags |= RTCF_DOREDIRECT;
2506
2507         if (nla_put_in_addr(skb, RTA_DST, dst))
2508                 goto nla_put_failure;
2509         if (src) {
2510                 r->rtm_src_len = 32;
2511                 if (nla_put_in_addr(skb, RTA_SRC, src))
2512                         goto nla_put_failure;
2513         }
2514         if (rt->dst.dev &&
2515             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2516                 goto nla_put_failure;
2517 #ifdef CONFIG_IP_ROUTE_CLASSID
2518         if (rt->dst.tclassid &&
2519             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2520                 goto nla_put_failure;
2521 #endif
2522         if (!rt_is_input_route(rt) &&
2523             fl4->saddr != src) {
2524                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2525                         goto nla_put_failure;
2526         }
2527         if (rt->rt_uses_gateway &&
2528             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2529                 goto nla_put_failure;
2530
2531         expires = rt->dst.expires;
2532         if (expires) {
2533                 unsigned long now = jiffies;
2534
2535                 if (time_before(now, expires))
2536                         expires -= now;
2537                 else
2538                         expires = 0;
2539         }
2540
2541         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2542         if (rt->rt_pmtu && expires)
2543                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2544         if (rtnetlink_put_metrics(skb, metrics) < 0)
2545                 goto nla_put_failure;
2546
2547         if (fl4->flowi4_mark &&
2548             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2549                 goto nla_put_failure;
2550
2551         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2552             nla_put_u32(skb, RTA_UID,
2553                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2554                 goto nla_put_failure;
2555
2556         error = rt->dst.error;
2557
2558         if (rt_is_input_route(rt)) {
2559 #ifdef CONFIG_IP_MROUTE
2560                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2561                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2562                         int err = ipmr_get_route(net, skb,
2563                                                  fl4->saddr, fl4->daddr,
2564                                                  r, portid);
2565
2566                         if (err <= 0) {
2567                                 if (err == 0)
2568                                         return 0;
2569                                 goto nla_put_failure;
2570                         }
2571                 } else
2572 #endif
2573                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2574                                 goto nla_put_failure;
2575         }
2576
2577         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2578                 goto nla_put_failure;
2579
2580         nlmsg_end(skb, nlh);
2581         return 0;
2582
2583 nla_put_failure:
2584         nlmsg_cancel(skb, nlh);
2585         return -EMSGSIZE;
2586 }
2587
2588 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2589 {
2590         struct net *net = sock_net(in_skb->sk);
2591         struct rtmsg *rtm;
2592         struct nlattr *tb[RTA_MAX+1];
2593         struct rtable *rt = NULL;
2594         struct flowi4 fl4;
2595         __be32 dst = 0;
2596         __be32 src = 0;
2597         u32 iif;
2598         int err;
2599         int mark;
2600         struct sk_buff *skb;
2601         u32 table_id = RT_TABLE_MAIN;
2602         kuid_t uid;
2603
2604         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2605         if (err < 0)
2606                 goto errout;
2607
2608         rtm = nlmsg_data(nlh);
2609
2610         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2611         if (!skb) {
2612                 err = -ENOBUFS;
2613                 goto errout;
2614         }
2615
2616         /* Reserve room for dummy headers, this skb can pass
2617            through good chunk of routing engine.
2618          */
2619         skb_reset_mac_header(skb);
2620         skb_reset_network_header(skb);
2621
2622         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2623         ip_hdr(skb)->protocol = IPPROTO_UDP;
2624         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2625
2626         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2627         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2628         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2629         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2630         if (tb[RTA_UID])
2631                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2632         else
2633                 uid = (iif ? INVALID_UID : current_uid());
2634
2635         memset(&fl4, 0, sizeof(fl4));
2636         fl4.daddr = dst;
2637         fl4.saddr = src;
2638         fl4.flowi4_tos = rtm->rtm_tos;
2639         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2640         fl4.flowi4_mark = mark;
2641         fl4.flowi4_uid = uid;
2642
2643         if (iif) {
2644                 struct net_device *dev;
2645
2646                 dev = __dev_get_by_index(net, iif);
2647                 if (!dev) {
2648                         err = -ENODEV;
2649                         goto errout_free;
2650                 }
2651
2652                 skb->protocol   = htons(ETH_P_IP);
2653                 skb->dev        = dev;
2654                 skb->mark       = mark;
2655                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2656
2657                 rt = skb_rtable(skb);
2658                 if (err == 0 && rt->dst.error)
2659                         err = -rt->dst.error;
2660         } else {
2661                 rt = ip_route_output_key(net, &fl4);
2662
2663                 err = 0;
2664                 if (IS_ERR(rt))
2665                         err = PTR_ERR(rt);
2666         }
2667
2668         if (err)
2669                 goto errout_free;
2670
2671         skb_dst_set(skb, &rt->dst);
2672         if (rtm->rtm_flags & RTM_F_NOTIFY)
2673                 rt->rt_flags |= RTCF_NOTIFY;
2674
2675         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2676                 table_id = rt->rt_table_id;
2677
2678         err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2679                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2680                            RTM_NEWROUTE);
2681         if (err < 0)
2682                 goto errout_free;
2683
2684         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2685 errout:
2686         return err;
2687
2688 errout_free:
2689         kfree_skb(skb);
2690         goto errout;
2691 }
2692
2693 void ip_rt_multicast_event(struct in_device *in_dev)
2694 {
2695         rt_cache_flush(dev_net(in_dev->dev));
2696 }
2697
2698 #ifdef CONFIG_SYSCTL
2699 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2700 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2701 static int ip_rt_gc_elasticity __read_mostly    = 8;
2702
2703 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2704                                         void __user *buffer,
2705                                         size_t *lenp, loff_t *ppos)
2706 {
2707         struct net *net = (struct net *)__ctl->extra1;
2708
2709         if (write) {
2710                 rt_cache_flush(net);
2711                 fnhe_genid_bump(net);
2712                 return 0;
2713         }
2714
2715         return -EINVAL;
2716 }
2717
2718 static struct ctl_table ipv4_route_table[] = {
2719         {
2720                 .procname       = "gc_thresh",
2721                 .data           = &ipv4_dst_ops.gc_thresh,
2722                 .maxlen         = sizeof(int),
2723                 .mode           = 0644,
2724                 .proc_handler   = proc_dointvec,
2725         },
2726         {
2727                 .procname       = "max_size",
2728                 .data           = &ip_rt_max_size,
2729                 .maxlen         = sizeof(int),
2730                 .mode           = 0644,
2731                 .proc_handler   = proc_dointvec,
2732         },
2733         {
2734                 /*  Deprecated. Use gc_min_interval_ms */
2735
2736                 .procname       = "gc_min_interval",
2737                 .data           = &ip_rt_gc_min_interval,
2738                 .maxlen         = sizeof(int),
2739                 .mode           = 0644,
2740                 .proc_handler   = proc_dointvec_jiffies,
2741         },
2742         {
2743                 .procname       = "gc_min_interval_ms",
2744                 .data           = &ip_rt_gc_min_interval,
2745                 .maxlen         = sizeof(int),
2746                 .mode           = 0644,
2747                 .proc_handler   = proc_dointvec_ms_jiffies,
2748         },
2749         {
2750                 .procname       = "gc_timeout",
2751                 .data           = &ip_rt_gc_timeout,
2752                 .maxlen         = sizeof(int),
2753                 .mode           = 0644,
2754                 .proc_handler   = proc_dointvec_jiffies,
2755         },
2756         {
2757                 .procname       = "gc_interval",
2758                 .data           = &ip_rt_gc_interval,
2759                 .maxlen         = sizeof(int),
2760                 .mode           = 0644,
2761                 .proc_handler   = proc_dointvec_jiffies,
2762         },
2763         {
2764                 .procname       = "redirect_load",
2765                 .data           = &ip_rt_redirect_load,
2766                 .maxlen         = sizeof(int),
2767                 .mode           = 0644,
2768                 .proc_handler   = proc_dointvec,
2769         },
2770         {
2771                 .procname       = "redirect_number",
2772                 .data           = &ip_rt_redirect_number,
2773                 .maxlen         = sizeof(int),
2774                 .mode           = 0644,
2775                 .proc_handler   = proc_dointvec,
2776         },
2777         {
2778                 .procname       = "redirect_silence",
2779                 .data           = &ip_rt_redirect_silence,
2780                 .maxlen         = sizeof(int),
2781                 .mode           = 0644,
2782                 .proc_handler   = proc_dointvec,
2783         },
2784         {
2785                 .procname       = "error_cost",
2786                 .data           = &ip_rt_error_cost,
2787                 .maxlen         = sizeof(int),
2788                 .mode           = 0644,
2789                 .proc_handler   = proc_dointvec,
2790         },
2791         {
2792                 .procname       = "error_burst",
2793                 .data           = &ip_rt_error_burst,
2794                 .maxlen         = sizeof(int),
2795                 .mode           = 0644,
2796                 .proc_handler   = proc_dointvec,
2797         },
2798         {
2799                 .procname       = "gc_elasticity",
2800                 .data           = &ip_rt_gc_elasticity,
2801                 .maxlen         = sizeof(int),
2802                 .mode           = 0644,
2803                 .proc_handler   = proc_dointvec,
2804         },
2805         {
2806                 .procname       = "mtu_expires",
2807                 .data           = &ip_rt_mtu_expires,
2808                 .maxlen         = sizeof(int),
2809                 .mode           = 0644,
2810                 .proc_handler   = proc_dointvec_jiffies,
2811         },
2812         {
2813                 .procname       = "min_pmtu",
2814                 .data           = &ip_rt_min_pmtu,
2815                 .maxlen         = sizeof(int),
2816                 .mode           = 0644,
2817                 .proc_handler   = proc_dointvec,
2818         },
2819         {
2820                 .procname       = "min_adv_mss",
2821                 .data           = &ip_rt_min_advmss,
2822                 .maxlen         = sizeof(int),
2823                 .mode           = 0644,
2824                 .proc_handler   = proc_dointvec,
2825         },
2826         { }
2827 };
2828
2829 static struct ctl_table ipv4_route_flush_table[] = {
2830         {
2831                 .procname       = "flush",
2832                 .maxlen         = sizeof(int),
2833                 .mode           = 0200,
2834                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2835         },
2836         { },
2837 };
2838
2839 static __net_init int sysctl_route_net_init(struct net *net)
2840 {
2841         struct ctl_table *tbl;
2842
2843         tbl = ipv4_route_flush_table;
2844         if (!net_eq(net, &init_net)) {
2845                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2846                 if (!tbl)
2847                         goto err_dup;
2848
2849                 /* Don't export sysctls to unprivileged users */
2850                 if (net->user_ns != &init_user_ns)
2851                         tbl[0].procname = NULL;
2852         }
2853         tbl[0].extra1 = net;
2854
2855         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2856         if (!net->ipv4.route_hdr)
2857                 goto err_reg;
2858         return 0;
2859
2860 err_reg:
2861         if (tbl != ipv4_route_flush_table)
2862                 kfree(tbl);
2863 err_dup:
2864         return -ENOMEM;
2865 }
2866
2867 static __net_exit void sysctl_route_net_exit(struct net *net)
2868 {
2869         struct ctl_table *tbl;
2870
2871         tbl = net->ipv4.route_hdr->ctl_table_arg;
2872         unregister_net_sysctl_table(net->ipv4.route_hdr);
2873         BUG_ON(tbl == ipv4_route_flush_table);
2874         kfree(tbl);
2875 }
2876
2877 static __net_initdata struct pernet_operations sysctl_route_ops = {
2878         .init = sysctl_route_net_init,
2879         .exit = sysctl_route_net_exit,
2880 };
2881 #endif
2882
2883 static __net_init int rt_genid_init(struct net *net)
2884 {
2885         atomic_set(&net->ipv4.rt_genid, 0);
2886         atomic_set(&net->fnhe_genid, 0);
2887         get_random_bytes(&net->ipv4.dev_addr_genid,
2888                          sizeof(net->ipv4.dev_addr_genid));
2889         return 0;
2890 }
2891
2892 static __net_initdata struct pernet_operations rt_genid_ops = {
2893         .init = rt_genid_init,
2894 };
2895
2896 static int __net_init ipv4_inetpeer_init(struct net *net)
2897 {
2898         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2899
2900         if (!bp)
2901                 return -ENOMEM;
2902         inet_peer_base_init(bp);
2903         net->ipv4.peers = bp;
2904         return 0;
2905 }
2906
2907 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2908 {
2909         struct inet_peer_base *bp = net->ipv4.peers;
2910
2911         net->ipv4.peers = NULL;
2912         inetpeer_invalidate_tree(bp);
2913         kfree(bp);
2914 }
2915
2916 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2917         .init   =       ipv4_inetpeer_init,
2918         .exit   =       ipv4_inetpeer_exit,
2919 };
2920
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU accounting array; ip_rt_init() allocates 256 entries per CPU. */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2924
2925 int __init ip_rt_init(void)
2926 {
2927         int rc = 0;
2928         int cpu;
2929
2930         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2931         if (!ip_idents)
2932                 panic("IP: failed to allocate ip_idents\n");
2933
2934         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2935
2936         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
2937         if (!ip_tstamps)
2938                 panic("IP: failed to allocate ip_tstamps\n");
2939
2940         for_each_possible_cpu(cpu) {
2941                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2942
2943                 INIT_LIST_HEAD(&ul->head);
2944                 spin_lock_init(&ul->lock);
2945         }
2946 #ifdef CONFIG_IP_ROUTE_CLASSID
2947         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2948         if (!ip_rt_acct)
2949                 panic("IP: failed to allocate ip_rt_acct\n");
2950 #endif
2951
2952         ipv4_dst_ops.kmem_cachep =
2953                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2954                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2955
2956         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2957
2958         if (dst_entries_init(&ipv4_dst_ops) < 0)
2959                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2960
2961         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2962                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2963
2964         ipv4_dst_ops.gc_thresh = ~0;
2965         ip_rt_max_size = INT_MAX;
2966
2967         devinet_init();
2968         ip_fib_init();
2969
2970         if (ip_rt_proc_init())
2971                 pr_err("Unable to create route proc files\n");
2972 #ifdef CONFIG_XFRM
2973         xfrm_init();
2974         xfrm4_init();
2975 #endif
2976         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2977
2978 #ifdef CONFIG_SYSCTL
2979         register_pernet_subsys(&sysctl_route_ops);
2980 #endif
2981         register_pernet_subsys(&rt_genid_ops);
2982         register_pernet_subsys(&ipv4_inetpeer_ops);
2983         return rc;
2984 }
2985
2986 #ifdef CONFIG_SYSCTL
2987 /*
2988  * We really need to sanitize the damn ipv4 init order, then all
2989  * this nonsense will go away.
2990  */
2991 void __init ip_static_sysctl_init(void)
2992 {
2993         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2994 }
2995 #endif