git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blobdiff - net/ipv4/route.c
inet: switch IP ID generator to siphash
[mirror_ubuntu-bionic-kernel.git] / net / ipv4 / route.c
index 4e153b23bceca1b8f6071febf4bb0df8cca7f12f..956290896808e2134ceb85b5ff13ba883063e25f 100644 (file)
@@ -128,10 +128,13 @@ static int ip_rt_redirect_silence __read_mostly   = ((HZ / 50) << (9 + 1));
 static int ip_rt_error_cost __read_mostly      = HZ;
 static int ip_rt_error_burst __read_mostly     = 5 * HZ;
 static int ip_rt_mtu_expires __read_mostly     = 10 * 60 * HZ;
-static int ip_rt_min_pmtu __read_mostly                = 512 + 20 + 20;
+static u32 ip_rt_min_pmtu __read_mostly                = 512 + 20 + 20;
 static int ip_rt_min_advmss __read_mostly      = 256;
 
 static int ip_rt_gc_timeout __read_mostly      = RT_GC_TIMEOUT;
+
+static int ip_min_valid_pmtu __read_mostly     = IPV4_MIN_MTU;
+
 /*
  *     Interface to generic destination cache.
  */
@@ -514,15 +517,17 @@ EXPORT_SYMBOL(ip_idents_reserve);
 
 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 {
-       static u32 ip_idents_hashrnd __read_mostly;
        u32 hash, id;
 
-       net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
+       /* Note: this check-then-fill of the key takes no lock, so it is racy, but this is okay. */
+       if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
+               get_random_bytes(&net->ipv4.ip_id_key,
+                                sizeof(net->ipv4.ip_id_key));
 
-       hash = jhash_3words((__force u32)iph->daddr,
+       hash = siphash_3u32((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
-                           iph->protocol ^ net_hash_mix(net),
-                           ip_idents_hashrnd);
+                           iph->protocol,
+                           &net->ipv4.ip_id_key);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
 }
@@ -636,6 +641,7 @@ static inline u32 fnhe_hashfun(__be32 daddr)
 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 {
        rt->rt_pmtu = fnhe->fnhe_pmtu;
+       rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;
 
        if (fnhe->fnhe_gw) {
@@ -646,7 +652,7 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh
 }
 
 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
-                                 u32 pmtu, unsigned long expires)
+                                 u32 pmtu, bool lock, unsigned long expires)
 {
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
@@ -683,8 +689,10 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
-               if (pmtu)
+               if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
+                       fnhe->fnhe_mtu_locked = lock;
+               }
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
@@ -708,7 +716,8 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
-               fnhe->fnhe_expires = expires;
+               fnhe->fnhe_mtu_locked = lock;
+               fnhe->fnhe_expires = max(1UL, expires);
 
                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
@@ -789,7 +798,8 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
                                struct fib_nh *nh = &FIB_RES_NH(res);
 
                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
-                                               0, jiffies + ip_rt_gc_timeout);
+                                               0, false,
+                                               jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
@@ -896,13 +906,15 @@ void ip_rt_send_redirect(struct sk_buff *skb)
        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
-       if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
+       if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
                peer->rate_tokens = 0;
+               peer->n_redirects = 0;
+       }
 
        /* Too many ignored redirects; do not send anything
         * set dst.rate_last to the last seen redirected packet.
         */
-       if (peer->rate_tokens >= ip_rt_redirect_number) {
+       if (peer->n_redirects >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }
@@ -919,6 +931,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
+               ++peer->n_redirects;
 #ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
@@ -1001,18 +1014,22 @@ out:    kfree_skb(skb);
 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 {
        struct dst_entry *dst = &rt->dst;
+       u32 old_mtu = ipv4_mtu(dst);
        struct fib_result res;
+       bool lock = false;
 
-       if (dst_metric_locked(dst, RTAX_MTU))
+       if (ip_mtu_locked(dst))
                return;
 
-       if (ipv4_mtu(dst) < mtu)
+       if (old_mtu < mtu)
                return;
 
-       if (mtu < ip_rt_min_pmtu)
-               mtu = ip_rt_min_pmtu;
+       if (mtu < ip_rt_min_pmtu) {
+               lock = true;
+               mtu = min(old_mtu, ip_rt_min_pmtu);
+       }
 
-       if (rt->rt_pmtu == mtu &&
+       if (rt->rt_pmtu == mtu && !lock &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;
 
@@ -1020,7 +1037,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);
 
-               update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
+               update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
@@ -1178,11 +1195,39 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
        return dst;
 }
 
+static void ipv4_send_dest_unreach(struct sk_buff *skb)
+{
+       struct ip_options opt;
+       int res;
+
+       /* Recompile ip options since IPCB may not be valid anymore.
+        * Also check we have a reasonable ipv4 header.
+        */
+       if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
+           ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
+               return;
+
+       memset(&opt, 0, sizeof(opt));
+       if (ip_hdr(skb)->ihl > 5) {
+               if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
+                       return;
+               opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
+
+               rcu_read_lock();
+               res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
+               rcu_read_unlock();
+
+               if (res)
+                       return;
+       }
+       __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
+}
+
 static void ipv4_link_failure(struct sk_buff *skb)
 {
        struct rtable *rt;
 
-       icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
+       ipv4_send_dest_unreach(skb);
 
        rt = skb_rtable(skb);
        if (rt)
@@ -1273,7 +1318,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 
        mtu = READ_ONCE(dst->dev->mtu);
 
-       if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
+       if (unlikely(ip_mtu_locked(dst))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }
@@ -1283,6 +1328,40 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
 }
 
+static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
+{
+       struct fnhe_hash_bucket *hash;
+       struct fib_nh_exception *fnhe, __rcu **fnhe_p;
+       u32 hval = fnhe_hashfun(daddr);
+
+       spin_lock_bh(&fnhe_lock);
+
+       hash = rcu_dereference_protected(nh->nh_exceptions,
+                                        lockdep_is_held(&fnhe_lock));
+       hash += hval;
+
+       fnhe_p = &hash->chain;
+       fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
+       while (fnhe) {
+               if (fnhe->fnhe_daddr == daddr) {
+                       rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
+                               fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
+                       /* set fnhe_daddr to 0 to ensure it won't bind with
+                        * new dsts in rt_bind_exception().
+                        */
+                       fnhe->fnhe_daddr = 0;
+                       fnhe_flush_routes(fnhe);
+                       kfree_rcu(fnhe, rcu);
+                       break;
+               }
+               fnhe_p = &fnhe->fnhe_next;
+               fnhe = rcu_dereference_protected(fnhe->fnhe_next,
+                                                lockdep_is_held(&fnhe_lock));
+       }
+
+       spin_unlock_bh(&fnhe_lock);
+}
+
 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
 {
        struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
@@ -1296,8 +1375,14 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
 
        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
-               if (fnhe->fnhe_daddr == daddr)
+               if (fnhe->fnhe_daddr == daddr) {
+                       if (fnhe->fnhe_expires &&
+                           time_after(jiffies, fnhe->fnhe_expires)) {
+                               ip_del_fnhe(nh, daddr);
+                               break;
+                       }
                        return fnhe;
+               }
        }
        return NULL;
 }
@@ -1386,7 +1471,7 @@ struct uncached_list {
 
 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
 
-static void rt_add_uncached_list(struct rtable *rt)
+void rt_add_uncached_list(struct rtable *rt)
 {
        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
 
@@ -1397,14 +1482,8 @@ static void rt_add_uncached_list(struct rtable *rt)
        spin_unlock_bh(&ul->lock);
 }
 
-static void ipv4_dst_destroy(struct dst_entry *dst)
+void rt_del_uncached_list(struct rtable *rt)
 {
-       struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
-       struct rtable *rt = (struct rtable *) dst;
-
-       if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
-               kfree(p);
-
        if (!list_empty(&rt->rt_uncached)) {
                struct uncached_list *ul = rt->rt_uncached_list;
 
@@ -1414,6 +1493,17 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
        }
 }
 
+static void ipv4_dst_destroy(struct dst_entry *dst)
+{
+       struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
+       struct rtable *rt = (struct rtable *)dst;
+
+       if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
+               kfree(p);
+
+       rt_del_uncached_list(rt);
+}
+
 void rt_flush_dev(struct net_device *dev)
 {
        struct net *net = dev_net(dev);
@@ -1509,6 +1599,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
                rt->rt_is_input = 0;
                rt->rt_iif = 0;
                rt->rt_pmtu = 0;
+               rt->rt_mtu_locked = 0;
                rt->rt_gateway = 0;
                rt->rt_uses_gateway = 0;
                rt->rt_table_id = 0;
@@ -1617,36 +1708,6 @@ static void ip_handle_martian_source(struct net_device *dev,
 #endif
 }
 
-static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
-{
-       struct fnhe_hash_bucket *hash;
-       struct fib_nh_exception *fnhe, __rcu **fnhe_p;
-       u32 hval = fnhe_hashfun(daddr);
-
-       spin_lock_bh(&fnhe_lock);
-
-       hash = rcu_dereference_protected(nh->nh_exceptions,
-                                        lockdep_is_held(&fnhe_lock));
-       hash += hval;
-
-       fnhe_p = &hash->chain;
-       fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
-       while (fnhe) {
-               if (fnhe->fnhe_daddr == daddr) {
-                       rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
-                               fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
-                       fnhe_flush_routes(fnhe);
-                       kfree_rcu(fnhe, rcu);
-                       break;
-               }
-               fnhe_p = &fnhe->fnhe_next;
-               fnhe = rcu_dereference_protected(fnhe->fnhe_next,
-                                                lockdep_is_held(&fnhe_lock));
-       }
-
-       spin_unlock_bh(&fnhe_lock);
-}
-
 static void set_lwt_redirect(struct rtable *rth)
 {
        if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
@@ -1713,20 +1774,10 @@ static int __mkroute_input(struct sk_buff *skb,
 
        fnhe = find_exception(&FIB_RES_NH(*res), daddr);
        if (do_cache) {
-               if (fnhe) {
+               if (fnhe)
                        rth = rcu_dereference(fnhe->fnhe_rth_input);
-                       if (rth && rth->dst.expires &&
-                           time_after(jiffies, rth->dst.expires)) {
-                               ip_del_fnhe(&FIB_RES_NH(*res), daddr);
-                               fnhe = NULL;
-                       } else {
-                               goto rt_cache;
-                       }
-               }
-
-               rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
-
-rt_cache:
+               else
+                       rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
                if (rt_cache_valid(rth)) {
                        skb_dst_set_noref(skb, &rth->dst);
                        goto out;
@@ -1829,6 +1880,8 @@ int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
                                return skb_get_hash_raw(skb) >> 1;
                        memset(&hash_keys, 0, sizeof(hash_keys));
                        skb_flow_dissect_flow_keys(skb, &keys, flag);
+
+                       hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                        hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
                        hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
                        hash_keys.ports.src = keys.ports.src;
@@ -2110,12 +2163,13 @@ int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                int our = 0;
                int err = -EINVAL;
 
-               if (in_dev)
-                       our = ip_check_mc_rcu(in_dev, daddr, saddr,
-                                             ip_hdr(skb)->protocol);
+               if (!in_dev)
+                       return err;
+               our = ip_check_mc_rcu(in_dev, daddr, saddr,
+                                     ip_hdr(skb)->protocol);
 
                /* check l3 master if no match yet */
-               if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
+               if (!our && netif_is_l3_slave(dev)) {
                        struct in_device *l3_in_dev;
 
                        l3_in_dev = __in_dev_get_rcu(skb->dev);
@@ -2201,39 +2255,31 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
                 * the loopback interface and the IP_PKTINFO ipi_ifindex will
                 * be set to the loopback interface as well.
                 */
-               fi = NULL;
+               do_cache = false;
        }
 
        fnhe = NULL;
        do_cache &= fi != NULL;
-       if (do_cache) {
+       if (fi) {
                struct rtable __rcu **prth;
                struct fib_nh *nh = &FIB_RES_NH(*res);
 
                fnhe = find_exception(nh, fl4->daddr);
+               if (!do_cache)
+                       goto add;
                if (fnhe) {
                        prth = &fnhe->fnhe_rth_output;
-                       rth = rcu_dereference(*prth);
-                       if (rth && rth->dst.expires &&
-                           time_after(jiffies, rth->dst.expires)) {
-                               ip_del_fnhe(nh, fl4->daddr);
-                               fnhe = NULL;
-                       } else {
-                               goto rt_cache;
+               } else {
+                       if (unlikely(fl4->flowi4_flags &
+                                    FLOWI_FLAG_KNOWN_NH &&
+                                    !(nh->nh_gw &&
+                                      nh->nh_scope == RT_SCOPE_LINK))) {
+                               do_cache = false;
+                               goto add;
                        }
+                       prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
                }
-
-               if (unlikely(fl4->flowi4_flags &
-                            FLOWI_FLAG_KNOWN_NH &&
-                            !(nh->nh_gw &&
-                              nh->nh_scope == RT_SCOPE_LINK))) {
-                       do_cache = false;
-                       goto add;
-               }
-               prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
                rth = rcu_dereference(*prth);
-
-rt_cache:
                if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
                        return rth;
        }
@@ -2283,13 +2329,14 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
                                        const struct sk_buff *skb)
 {
        __u8 tos = RT_FL_TOS(fl4);
-       struct fib_result res;
+       struct fib_result res = {
+               .type           = RTN_UNSPEC,
+               .fi             = NULL,
+               .table          = NULL,
+               .tclassid       = 0,
+       };
        struct rtable *rth;
 
-       res.tclassid    = 0;
-       res.fi          = NULL;
-       res.table       = NULL;
-
        fl4->flowi4_iif = LOOPBACK_IFINDEX;
        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
@@ -2532,6 +2579,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
                rt->rt_is_input = ort->rt_is_input;
                rt->rt_iif = ort->rt_iif;
                rt->rt_pmtu = ort->rt_pmtu;
+               rt->rt_mtu_locked = ort->rt_mtu_locked;
 
                rt->rt_genid = rt_genid_ipv4(net);
                rt->rt_flags = ort->rt_flags;
@@ -2634,6 +2682,8 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
        if (rt->rt_pmtu && expires)
                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
+       if (rt->rt_mtu_locked && expires)
+               metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
        if (rtnetlink_put_metrics(skb, metrics) < 0)
                goto nla_put_failure;
 
@@ -2934,7 +2984,8 @@ static struct ctl_table ipv4_route_table[] = {
                .data           = &ip_rt_min_pmtu,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-               .proc_handler   = proc_dointvec,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &ip_min_valid_pmtu,
        },
        {
                .procname       = "min_adv_mss",