git.proxmox.com Git - mirror_ovs.git/commitdiff
compat: Simplify inet_fragment backports.
author Joe Stringer <joe@ovn.org>
Tue, 12 Jul 2016 22:26:19 +0000 (15:26 -0700)
committer Joe Stringer <joe@ovn.org>
Mon, 1 Aug 2016 21:16:10 +0000 (14:16 -0700)
The core fragmentation handling logic is exported on all supported
kernels, so it's not necessary to backport the latest version of it.
This greatly simplifies the code, which previously had to reconcile
inconsistencies between the old per-lookup garbage collection and the
newer workqueue-based garbage collection.

As a result of simplifying and removing unnecessary backport code, a
few corner-case bugs are fixed, such as fragments remaining in the
fragment cache when the openvswitch module is unloaded.

Due to this, some backported ip functions need a little more logic
than what is in the latest code; for instance, on kernels <3.17
(sketched below):
* Call inet_frag_evictor() before defrag
* Limit hashsize in ip{,6}_fragment logic
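
A condensed sketch of both adjustments, mirroring the ip_fragment.c
hunks further down (nf_conntrack_reasm.c applies the same pattern on
the ipv6 side):

    /* 1) In rpl_ip_defrag(): trim the cache before each defrag, since
     *    pre-3.17 kernels have no eviction workqueue. */
    #ifdef HAVE_INET_FRAG_EVICTOR
            ip_evictor(net);
    #endif

    /* 2) Mask the hash at the call site; fb3cfe6e75b9 ("inet: frag:
     *    remove hash size assumptions from callers") moved this into
     *    inet_fragment for v3.17 and later. */
    #if LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0)
    #define ipqhashfn(a, b, c, d) (ipqhashfn(a, b, c, d) & (INETFRAGS_HASHSZ - 1))
    #endif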

The pernet init/exit logic also differs a little from upstream. Upstream
ipv[46]_defrag logic initializes the various pernet fragment parameters
and its own global fragments cache. In the OVS backport, the pernet
parameters are shared while the fragments cache is separate. The
backport relies upon upstream pernet initialization to perform the
shared setup, and performs no pernet initialization of its own. When it
comes to pernet exit however, the backport must ensure that all
OVS-specific fragment state is cleared, while the shared state remains
untouched so that the regular ipv[46] logic may do its own cleanup. In
practice this means that OVS must have its own divergent implementation
of inet_frags_exit_net().
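
In outline, the divergent helper looks like this (condensed from the
inet_fragment.c hunk below; the seqlock retry loop that re-runs the
eviction until the memory counter drains is omitted here for brevity):

    void rpl_inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
    {
            int thresh = nf->low_thresh;
            int i;

            /* Zero the threshold so every queue qualifies for eviction,
             * then flush all of OVS's own fragment queues. */
            nf->low_thresh = 0;
            local_bh_disable();
            for (i = 0; i < INETFRAGS_HASHSZ; i++)
                    inet_evict_bucket(f, &f->hash[i]);
            local_bh_enable();

            /* Restore the shared pernet parameter untouched for the
             * regular ipv[46] fragment logic. */
            nf->low_thresh = thresh;
    }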

Fixes the following crash:

Call Trace:
 <IRQ>
 [<ffffffff810744f6>] ? call_timer_fn+0x36/0x100
 [<ffffffff8107548f>] run_timer_softirq+0x1ef/0x2f0
 [<ffffffff8106cccc>] __do_softirq+0xec/0x2c0
 [<ffffffff8106d215>] irq_exit+0x105/0x110
 [<ffffffff81737095>] smp_apic_timer_interrupt+0x45/0x60
 [<ffffffff81735a1d>] apic_timer_interrupt+0x6d/0x80
 <EOI>
 [<ffffffff8104f596>] ? native_safe_halt+0x6/0x10
 [<ffffffff8101cb2f>] default_idle+0x1f/0xc0
 [<ffffffff8101d406>] arch_cpu_idle+0x26/0x30
 [<ffffffff810bf3a5>] cpu_startup_entry+0xc5/0x290
 [<ffffffff810415ed>] start_secondary+0x21d/0x2d0
Code:  Bad RIP value.
RIP  [<ffffffffa0177480>] 0xffffffffa0177480
 RSP <ffff88003f703e78>
CR2: ffffffffa0177480
---[ end trace eb98ca80ba07bd9c ]---
Kernel panic - not syncing: Fatal exception in interrupt

Signed-off-by: Joe Stringer <joe@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
acinclude.m4
datapath/linux/compat/include/net/inet_frag.h
datapath/linux/compat/inet_fragment.c
datapath/linux/compat/ip_fragment.c
datapath/linux/compat/nf_conntrack_reasm.c

acinclude.m4
index faf79eb9d47c434b6ddbfd81e231eea294c35482..7a714b2cb624e929f4b14ba1febccbd480d4d35e 100644 (file)
@@ -428,6 +428,7 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [
   OVS_GREP_IFELSE([$KSRC/include/net/inet_frag.h], [last_in],
                   [OVS_DEFINE([HAVE_INET_FRAGS_LAST_IN])])
   OVS_GREP_IFELSE([$KSRC/include/net/inet_frag.h], [inet_frag_evicting])
+  OVS_GREP_IFELSE([$KSRC/include/net/inet_frag.h], [inet_frag_evictor])
   OVS_FIND_FIELD_IFELSE([$KSRC/include/net/inet_frag.h], [inet_frags],
                         [frags_work])
   OVS_FIND_FIELD_IFELSE([$KSRC/include/net/inet_frag.h], [inet_frags],
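
With no explicit action supplied, OVS_GREP_IFELSE defines HAVE_<REGEX>
on a match, so this new check yields HAVE_INET_FRAG_EVICTOR on kernels
(earlier than v3.17) whose headers still declare the old evictor. The
compat sources below then guard the legacy caller-driven eviction with
that define, e.g.:

    #ifdef HAVE_INET_FRAG_EVICTOR
            /* Old-style eviction, driven from the lookup path; later
             * kernels rely on the eviction workqueue instead. */
            inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
    #endif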
datapath/linux/compat/include/net/inet_frag.h
index aa9a019c6fcc276ed7ba438241cc77e9814342c2..49c1bceb695a9cd024bde92a40f6951a51b20e97 100644 (file)
 #define qp_flags(qp) (qp->q.flags)
 #endif
 
-#ifndef HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR
-/**
- * struct ovs_inet_frag_queue - fragment queue
- *
- * Wrap the system inet_frag_queue to provide a list evictor.
- *
- * @list_evictor: list of queues to forcefully evict (e.g. due to low memory)
- */
-struct ovs_inet_frag_queue {
-       struct inet_frag_queue  fq;
-       struct hlist_node       list_evictor;
-};
-
-static inline bool rpl_inet_frag_evicting(struct inet_frag_queue *q)
-{
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-       struct ovs_inet_frag_queue *ofq = (struct ovs_inet_frag_queue *)q;
-       return !hlist_unhashed(&ofq->list_evictor);
-#else
-       return (q_flags(q) & INET_FRAG_FIRST_IN) && q->fragments != NULL;
-#endif
-}
-#define inet_frag_evicting rpl_inet_frag_evicting
-#else /* HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR */
 #ifndef HAVE_INET_FRAG_EVICTING
-static inline bool rpl_inet_frag_evicting(struct inet_frag_queue *q)
+static inline bool inet_frag_evicting(struct inet_frag_queue *q)
 {
+#ifdef HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR
        return !hlist_unhashed(&q->list_evictor);
+#else
+       return (q_flags(q) & INET_FRAG_FIRST_IN) && q->fragments != NULL;
+#endif /* HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR */
 }
-#define inet_frag_evicting rpl_inet_frag_evicting
-#endif
-#endif
+#endif /* HAVE_INET_FRAG_EVICTING */
 
 #ifndef HAVE_CORRECT_MRU_HANDLING
-static unsigned int rpl_frag_percpu_counter_batch = 130000;
-#define frag_percpu_counter_batch rpl_frag_percpu_counter_batch
-
 static inline void rpl_sub_frag_mem_limit(struct netns_frags *nf, int i)
 {
        __percpu_counter_add(&nf->mem, -i, frag_percpu_counter_batch);
@@ -70,14 +45,29 @@ static inline void rpl_add_frag_mem_limit(struct netns_frags *nf, int i)
 }
 #define add_frag_mem_limit rpl_add_frag_mem_limit
 
-int rpl_inet_frags_init(struct inet_frags *f);
+static inline int rpl_inet_frags_init(struct inet_frags *frags)
+{
+       inet_frags_init(frags);
+       return 0;
+}
 #define inet_frags_init rpl_inet_frags_init
 
+/* We reuse the upstream inet_fragment.c common code for managing fragment
+ * stores. However, we actually store the fragments within our own 'inet_frags'
+ * structures (in {ip_fragment,nf_conntrack_reasm}.c). When unloading the OVS
+ * kernel module, we need to flush all of the remaining fragments from these
+ * caches, or else we will panic with the following sequence of events:
+ *
+ * 1) A fragment for a packet arrives and is cached in inet_frags. This
+ *    starts a timer to ensure the fragment does not hang around forever.
+ * 2) openvswitch module is unloaded.
+ * 3) The timer for the fragment fires, calling into backported OVS code
+ *    to free the fragment.
+ * 4) BUG: unable to handle kernel paging request at ffffffffc03c01e0
+ */
 void rpl_inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
 #define inet_frags_exit_net rpl_inet_frags_exit_net
 
-void rpl_inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f);
-#define inet_frag_destroy(q, f, work) rpl_inet_frag_destroy(q, f)
 #endif /* !HAVE_CORRECT_MRU_HANDLING */
 
 #endif /* inet_frag.h */
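
One consequence of the inline rpl_inet_frags_init() wrapper above is
that callers keep a uniform error-checking shape even though the
wrapped upstream initializer returns void on these kernels; a minimal
sketch of a call site (hypothetical function name):

    static int __init example_frag_cache_init(void)
    {
            int err;

            /* Resolves to rpl_inet_frags_init(), which forwards to the
             * void upstream inet_frags_init() and always returns 0. */
            err = inet_frags_init(&ip4_frags);
            if (err)
                    return err;

            return 0;
    }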
datapath/linux/compat/inet_fragment.c
index 4479450c37377244ccbabd8860f69cf516e088c3..f05e6177bfb3effa95bd438ea9ee14e3f7cb1490 100644 (file)
@@ -11,8 +11,6 @@
  *                             ipv6/reassembly. and ipv6 nf conntrack reassembly
  */
 
-#include <linux/version.h>
-
 #ifndef HAVE_CORRECT_MRU_HANDLING
 
 #include <linux/list.h>
 #include <net/inet_frag.h>
 #include <net/inet_ecn.h>
 
-#define INETFRAGS_EVICT_BUCKETS   128
-#define INETFRAGS_EVICT_MAX      512
-
-/* don't rebuild inetfrag table with new secret more often than this */
-#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ)
-
-/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
- * Value : 0xff if frame should be dropped.
- *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
- */
-const u8 ip_frag_ecn_table[16] = {
-       /* at least one fragment had CE, and others ECT_0 or ECT_1 */
-       [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]                      = INET_ECN_CE,
-       [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]                      = INET_ECN_CE,
-       [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]   = INET_ECN_CE,
-
-       /* invalid combinations : drop frame */
-       [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
-       [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
-       [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
-       [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
-       [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
-       [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
-       [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
-};
-
-static unsigned int
-inet_frag_hashfn(const struct inet_frags *f, struct inet_frag_queue *q)
-{
-       return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
-}
-
 #ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-static bool inet_frag_may_rebuild(struct inet_frags *f)
-{
-       return time_after(jiffies,
-              f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL);
-}
-
-static void inet_frag_secret_rebuild(struct inet_frags *f)
-{
-       int i;
-
-       write_seqlock_bh(&f->rnd_seqlock);
-
-       if (!inet_frag_may_rebuild(f))
-               goto out;
-
-       get_random_bytes(&f->rnd, sizeof(u32));
-
-       for (i = 0; i < INETFRAGS_HASHSZ; i++) {
-               struct inet_frag_bucket *hb;
-               struct inet_frag_queue *q;
-               struct hlist_node *n;
-
-               hb = &f->hash[i];
-               spin_lock(&hb->chain_lock);
-
-               hlist_for_each_entry_safe(q, n, &hb->chain, list) {
-                       unsigned int hval = inet_frag_hashfn(f, q);
-
-                       if (hval != i) {
-                               struct inet_frag_bucket *hb_dest;
-
-                               hlist_del(&q->list);
-
-                               /* Relink to new hash chain. */
-                               hb_dest = &f->hash[hval];
-
-                               /* This is the only place where we take
-                                * another chain_lock while already holding
-                                * one.  As this will not run concurrently,
-                                * we cannot deadlock on hb_dest lock below, if its
-                                * already locked it will be released soon since
-                                * other caller cannot be waiting for hb lock
-                                * that we've taken above.
-                                */
-                               spin_lock_nested(&hb_dest->chain_lock,
-                                                SINGLE_DEPTH_NESTING);
-                               hlist_add_head(&q->list, &hb_dest->chain);
-                               spin_unlock(&hb_dest->chain_lock);
-                       }
-               }
-               spin_unlock(&hb->chain_lock);
-       }
-
-       f->rebuild = false;
-       f->last_rebuild_jiffies = jiffies;
-out:
-       write_sequnlock_bh(&f->rnd_seqlock);
-}
-
 static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
 {
        return q->net->low_thresh == 0 ||
@@ -130,9 +37,6 @@ static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
 static unsigned int
 inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
 {
-#ifndef HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR
-       struct ovs_inet_frag_queue *ofq;
-#endif
        struct inet_frag_queue *fq;
        struct hlist_node *n;
        unsigned int evicted = 0;
@@ -150,8 +54,8 @@ inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
 #ifdef HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR
                hlist_add_head(&fq->list_evictor, &expired);
 #else
-               ofq = (struct ovs_inet_frag_queue *)fq;
-               hlist_add_head(&ofq->list_evictor, &expired);
+               hlist_del(&fq->list);
+               hlist_add_head(&fq->list, &expired);
 #endif
                ++evicted;
        }
@@ -160,99 +64,28 @@ inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
 
 #ifdef HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR
        hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
-               f->frag_expire((unsigned long) fq);
 #else
-       hlist_for_each_entry_safe(ofq, n, &expired, list_evictor)
-               f->frag_expire((unsigned long) &ofq->fq);
+       hlist_for_each_entry_safe(fq, n, &expired, list)
 #endif
+               f->frag_expire((unsigned long) fq);
 
        return evicted;
 }
 
-static void inet_frag_worker(struct work_struct *work)
-{
-       unsigned int budget = INETFRAGS_EVICT_BUCKETS;
-       unsigned int i, evicted = 0;
-       struct inet_frags *f;
-
-       f = container_of(work, struct inet_frags, frags_work);
-
-       BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);
-
-       local_bh_disable();
-
-       for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
-               evicted += inet_evict_bucket(f, &f->hash[i]);
-               i = (i + 1) & (INETFRAGS_HASHSZ - 1);
-               if (evicted > INETFRAGS_EVICT_MAX)
-                       break;
-       }
-
-       f->next_bucket = i;
-
-       local_bh_enable();
-
-       if (f->rebuild && inet_frag_may_rebuild(f))
-               inet_frag_secret_rebuild(f);
-}
-
-static void inet_frag_schedule_worker(struct inet_frags *f)
-{
-       if (unlikely(!work_pending(&f->frags_work)))
-               schedule_work(&f->frags_work);
-}
-#endif /* >= 3.17 */
-
-int inet_frags_init(struct inet_frags *f)
-{
-       int i;
-
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-       INIT_WORK(&f->frags_work, inet_frag_worker);
-#endif
-
-       for (i = 0; i < INETFRAGS_HASHSZ; i++) {
-               struct inet_frag_bucket *hb = &f->hash[i];
-
-               spin_lock_init(&hb->chain_lock);
-               INIT_HLIST_HEAD(&hb->chain);
-       }
-
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-       seqlock_init(&f->rnd_seqlock);
-       f->last_rebuild_jiffies = 0;
-       f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
-                                           NULL);
-       if (!f->frags_cachep)
-               return -ENOMEM;
-#else
-       rwlock_init(&f->lock);
-       f->secret_timer.expires = jiffies + f->secret_interval;
-#endif
-
-       return 0;
-}
-
-void inet_frags_fini(struct inet_frags *f)
-{
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-       cancel_work_sync(&f->frags_work);
-       kmem_cache_destroy(f->frags_cachep);
-#endif
-}
-
-int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force);
-
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
 void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
 {
+       int thresh = nf->low_thresh;
        unsigned int seq;
+       int i;
+
+       nf->low_thresh = 0;
 
 evict_again:
        local_bh_disable();
        seq = read_seqbegin(&f->rnd_seqlock);
 
-       inet_frag_evictor(nf, f, true);
+       for (i = 0; i < INETFRAGS_HASHSZ ; i++)
+               inet_evict_bucket(f, &f->hash[i]);
 
        local_bh_enable();
        cond_resched();
@@ -260,301 +93,22 @@ evict_again:
        if (read_seqretry(&f->rnd_seqlock, seq) ||
            percpu_counter_sum(&nf->mem))
                goto evict_again;
-}
-#else
-void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
-{
-       read_lock_bh(&f->lock);
-       inet_frag_evictor(nf, f, true);
-       read_unlock_bh(&f->lock);
-}
-#endif
-
-static struct inet_frag_bucket *
-get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f)
-#ifdef HAVE_INET_FRAGS_WITH_RWLOCK
-__acquires(f->lock)
-#endif
-__acquires(hb->chain_lock)
-{
-       struct inet_frag_bucket *hb;
-       unsigned int hash;
-
-#ifdef HAVE_INET_FRAGS_WITH_RWLOCK
-       read_lock(&f->lock);
-#else
-       unsigned int seq;
- restart:
-       seq = read_seqbegin(&f->rnd_seqlock);
-#endif
-
-       hash = inet_frag_hashfn(f, fq);
-       hb = &f->hash[hash];
-
-       spin_lock(&hb->chain_lock);
-
-#ifndef HAVE_INET_FRAGS_WITH_RWLOCK
-       if (read_seqretry(&f->rnd_seqlock, seq)) {
-               spin_unlock(&hb->chain_lock);
-               goto restart;
-       }
-#endif
 
-       return hb;
+       nf->low_thresh = thresh;
 }
-
-static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
-#ifdef HAVE_INET_FRAGS_WITH_RWLOCK
-__releases(f->lock)
-#endif
-__releases(hb->chain_lock)
-{
-       struct inet_frag_bucket *hb;
-
-       hb = get_frag_bucket_locked(fq, f);
-       hlist_del(&fq->list);
-       q_flags(fq) |= INET_FRAG_COMPLETE;
-       spin_unlock(&hb->chain_lock);
-
-#ifdef HAVE_INET_FRAGS_WITH_RWLOCK
-       read_unlock(&f->lock);
-#endif
-}
-
-void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
-{
-       if (del_timer(&fq->timer))
-               atomic_dec(&fq->refcnt);
-
-       if (!(q_flags(fq) & INET_FRAG_COMPLETE)) {
-               fq_unlink(fq, f);
-               atomic_dec(&fq->refcnt);
-       }
-}
-
-static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
-                                 struct sk_buff *skb)
-{
-       if (f->skb_free)
-               f->skb_free(skb);
-       kfree_skb(skb);
-}
-
-void rpl_inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
-{
-       struct sk_buff *fp;
-       struct netns_frags *nf;
-       unsigned int sum, sum_truesize = 0;
-
-       WARN_ON(!(q_flags(q) & INET_FRAG_COMPLETE));
-       WARN_ON(del_timer(&q->timer) != 0);
-
-       /* Release all fragment data. */
-       fp = q->fragments;
-       nf = q->net;
-       while (fp) {
-               struct sk_buff *xp = fp->next;
-
-               sum_truesize += fp->truesize;
-               frag_kfree_skb(nf, f, fp);
-               fp = xp;
-       }
-       sum = sum_truesize + f->qsize;
-
-       if (f->destructor)
-               f->destructor(q);
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-       kmem_cache_free(f->frags_cachep, q);
-#else
-       kfree(q);
-#endif
-
-       sub_frag_mem_limit(nf, sum);
-}
-
-int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
-{
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-       int i;
-
-       for (i = 0; i < INETFRAGS_HASHSZ ; i++)
-               inet_evict_bucket(f, &f->hash[i]);
-
-       return 0;
-#else
-       struct inet_frag_queue *q;
-       int work, evicted = 0;
-
-       work = frag_mem_limit(nf) - nf->low_thresh;
-       while (work > 0 || force) {
-               spin_lock(&nf->lru_lock);
-
-               if (list_empty(&nf->lru_list)) {
-                       spin_unlock(&nf->lru_lock);
-                       break;
-               }
-
-               q = list_first_entry(&nf->lru_list,
-                                    struct inet_frag_queue, lru_list);
-               atomic_inc(&q->refcnt);
-               /* Remove q from list to avoid several CPUs grabbing it */
-               list_del_init(&q->lru_list);
-
-               spin_unlock(&nf->lru_lock);
-
-               spin_lock(&q->lock);
-               if (!(q->last_in & INET_FRAG_COMPLETE))
-                       inet_frag_kill(q, f);
-               spin_unlock(&q->lock);
-
-               if (atomic_dec_and_test(&q->refcnt))
-                       inet_frag_destroy(q, f, &work);
-               evicted++;
-       }
-
-       return evicted;
-#endif
-}
-
-static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
-                                               struct inet_frag_queue *qp_in,
-                                               struct inet_frags *f,
-                                               void *arg)
-{
-       struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
-       struct inet_frag_queue *qp;
-
-#ifdef CONFIG_SMP
-       /* With SMP race we have to recheck hash table, because
-        * such entry could have been created on other cpu before
-        * we acquired hash bucket lock.
-        */
-       hlist_for_each_entry(qp, &hb->chain, list) {
-               if (qp->net == nf && f->match(qp, arg)) {
-                       atomic_inc(&qp->refcnt);
-                       spin_unlock(&hb->chain_lock);
-#ifdef HAVE_INET_FRAGS_WITH_RWLOCK
-                       read_unlock(&f->lock);
-#endif
-                       q_flags(qp_in) |= INET_FRAG_COMPLETE;
-                       inet_frag_put(qp_in, f);
-                       return qp;
-               }
-       }
-#endif /* CONFIG_SMP */
-       qp = qp_in;
-       if (!mod_timer(&qp->timer, jiffies + nf->timeout))
-               atomic_inc(&qp->refcnt);
-
-       atomic_inc(&qp->refcnt);
-       hlist_add_head(&qp->list, &hb->chain);
-
-       spin_unlock(&hb->chain_lock);
-#ifdef HAVE_INET_FRAGS_WITH_RWLOCK
-       read_unlock(&f->lock);
-#endif
-
-       return qp;
-}
-
-static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
-                                              struct inet_frags *f,
-                                              void *arg)
-{
-       struct inet_frag_queue *q;
-
-       if (frag_mem_limit(nf) > nf->high_thresh) {
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-               inet_frag_schedule_worker(f);
-#endif
-               return NULL;
-       }
-
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-       q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
-#else
-       q = kzalloc(f->qsize, GFP_ATOMIC);
-#endif
-       if (!q)
-               return NULL;
-
-       q->net = nf;
-       f->constructor(q, arg);
-       add_frag_mem_limit(nf, f->qsize);
-
-       setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
-       spin_lock_init(&q->lock);
-       atomic_set(&q->refcnt, 1);
-
-       return q;
-}
-
-static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
-                                               struct inet_frags *f,
-                                               void *arg)
-{
-       struct inet_frag_queue *q;
-
-       q = inet_frag_alloc(nf, f, arg);
-       if (!q)
-               return NULL;
-
-       return inet_frag_intern(nf, q, f, arg);
-}
-
-struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
-                                      struct inet_frags *f, void *key,
-                                      unsigned int hash)
+#else /* HAVE_INET_FRAGS_WITH_FRAGS_WORK */
+void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
 {
-       struct inet_frag_bucket *hb;
-       struct inet_frag_queue *q;
-       int depth = 0;
-
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-       if (frag_mem_limit(nf) > nf->low_thresh)
-               inet_frag_schedule_worker(f);
-#else
-       if (frag_mem_limit(nf) > nf->high_thresh)
-               inet_frag_evictor(nf, f, false);
-#endif
-
-       hash &= (INETFRAGS_HASHSZ - 1);
-       hb = &f->hash[hash];
-
-       spin_lock(&hb->chain_lock);
-       hlist_for_each_entry(q, &hb->chain, list) {
-               if (q->net == nf && f->match(q, key)) {
-                       atomic_inc(&q->refcnt);
-                       spin_unlock(&hb->chain_lock);
-                       return q;
-               }
-               depth++;
-       }
-       spin_unlock(&hb->chain_lock);
-
-       if (depth <= INETFRAGS_MAXDEPTH)
-               return inet_frag_create(nf, f, key);
-
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-       if (inet_frag_may_rebuild(f)) {
-               if (!f->rebuild)
-                       f->rebuild = true;
-               inet_frag_schedule_worker(f);
-       }
-#endif
+       int thresh = nf->low_thresh;
 
-       return ERR_PTR(-ENOBUFS);
-}
+       nf->low_thresh = 0;
 
-void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
-                                  const char *prefix)
-{
-       static const char msg[] = "inet_frag_find: Fragment hash bucket"
-               " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
-               ". Dropping fragment.\n";
+       local_bh_disable();
+       inet_frag_evictor(nf, f, true);
+       local_bh_enable();
 
-       if (PTR_ERR(q) == -ENOBUFS)
-               net_dbg_ratelimited("%s%s", prefix, msg);
+       nf->low_thresh = thresh;
 }
+#endif /* HAVE_INET_FRAGS_WITH_FRAGS_WORK */
 
 #endif /* !HAVE_CORRECT_MRU_HANDLING */
datapath/linux/compat/ip_fragment.c
index 8d01088abc0a78f196f4f5ef345a08a7df48fe9f..4e9c5034913b7b6e7db3f3b0aac99eded1f09343 100644 (file)
@@ -76,12 +76,7 @@ struct ipfrag_skb_cb
 
 /* Describe an entry in the "incomplete datagrams" queue. */
 struct ipq {
-       union {
-               struct inet_frag_queue q;
-#ifndef HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR
-               struct ovs_inet_frag_queue oq;
-#endif
-       };
+       struct inet_frag_queue q;
 
        u32             user;
        __be32          saddr;
@@ -119,6 +114,12 @@ static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
                            (__force u32)saddr, (__force u32)daddr,
                            ip4_frags.rnd);
 }
+/* fb3cfe6e75b9 ("inet: frag: remove hash size assumptions from callers")
+ * shifted this logic into inet_fragment, but prior kernels still need this.
+ */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0)
+#define ipqhashfn(a, b, c, d) (ipqhashfn(a, b, c, d) & (INETFRAGS_HASHSZ - 1))
+#endif
 
 #ifdef HAVE_INET_FRAGS_CONST
 static unsigned int ip4_hashfn(const struct inet_frag_queue *q)
@@ -267,6 +268,23 @@ out:
        ipq_put(qp);
 }
 
+#ifdef HAVE_INET_FRAG_EVICTOR
+/* Memory limiting on fragments.  Evictor trashes the oldest
+ * fragment queue until we are back under the threshold.
+ *
+ * Necessary for kernels earlier than v3.17. Replaced in commit
+ * b13d3cbfb8e8 ("inet: frag: move eviction of queues to work queue").
+ */
+static void ip_evictor(struct net *net)
+{
+       int evicted;
+
+       evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
+       if (evicted)
+               IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
+}
+#endif
+
 /* Find the correct entry in the "incomplete datagrams" queue for
  * this IP datagram, and create new one, if nothing is found.
  */
@@ -281,6 +299,9 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
        arg.user = user;
        arg.vif = vif;
 
+#ifdef HAVE_INET_FRAGS_WITH_RWLOCK
+       read_lock(&ip4_frags.lock);
+#endif
        hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
 
        q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
@@ -683,6 +704,11 @@ int rpl_ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
        IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
        skb_orphan(skb);
 
+#ifdef HAVE_INET_FRAG_EVICTOR
+       /* Start by cleaning up the memory. */
+       ip_evictor(net);
+#endif
+
        /* Lookup (or create) queue header */
        qp = ip_find(net, ip_hdr(skb), user, vif);
        if (qp) {
@@ -701,7 +727,6 @@ int rpl_ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
        kfree_skb(skb);
        return -ENOMEM;
 }
-EXPORT_SYMBOL_GPL(rpl_ip_defrag);
 
 static int __net_init ipv4_frags_init_net(struct net *net)
 {
datapath/linux/compat/nf_conntrack_reasm.c
index ca19a9ff9d56047827849909e8bb41c40ca3b6ff..2024f1f590a09e48688c140bad02bf84b8cb25e9 100644 (file)
@@ -80,6 +80,12 @@ static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr,
        return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
                            (__force u32)id, nf_frags.rnd);
 }
+/* fb3cfe6e75b9 ("inet: frag: remove hash size assumptions from callers")
+ * shifted this logic into inet_fragment, but prior kernels still need this.
+ */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0)
+#define nf_hash_frag(a, b, c) (nf_hash_frag(a, b, c) & (INETFRAGS_HASHSZ - 1))
+#endif
 
 #ifdef HAVE_INET_FRAGS_CONST
 static unsigned int nf_hashfn(const struct inet_frag_queue *q)
@@ -119,7 +125,11 @@ static inline struct frag_queue *fq_find(struct net *net, __be32 id,
        arg.dst = dst;
        arg.ecn = ecn;
 
+#ifdef HAVE_INET_FRAGS_WITH_RWLOCK
+       read_lock_bh(&nf_frags.lock);
+#else
        local_bh_disable();
+#endif
        hash = nf_hash_frag(id, src, dst);
 
        q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash);
@@ -512,6 +522,13 @@ int rpl_nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
        hdr = ipv6_hdr(skb);
        fhdr = (struct frag_hdr *)skb_transport_header(skb);
 
+/* See ip_evictor(). */
+#ifdef HAVE_INET_FRAG_EVICTOR
+       local_bh_disable();
+       inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false);
+       local_bh_enable();
+#endif
+
        fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
                     ip6_frag_ecn(hdr));
        if (fq == NULL)