kmem_cache hardening and performance improvements

author Brian Behlendorf <behlendorf1@llnl.gov>

Sat, 31 Jan 2009 04:54:49 +0000 (20:54 -0800)

committer Brian Behlendorf <behlendorf1@llnl.gov>

Sat, 31 Jan 2009 04:54:49 +0000 (20:54 -0800)
author Brian Behlendorf <behlendorf1@llnl.gov>
Sat, 31 Jan 2009 04:54:49 +0000 (20:54 -0800)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Sat, 31 Jan 2009 04:54:49 +0000 (20:54 -0800)
diff --git a/include/sys/kmem.h b/include/sys/kmem.h

index ef587631263b81e1b8d2a42b6f214939d7085024..4f939e0fc11d137ccff991c13ca9c5bbb9ba953a 100644 (file)
--- a/include/sys/kmem.h
+++ b/include/sys/kmem.h
@@ -45,6 +45,7 @@ extern "C" {
  #include <asm/atomic_compat.h>
  #include <sys/types.h>
  #include <sys/debug.h>
+#include <sys/workqueue.h>
  
  /*
   * Memory allocation interfaces
@@ -161,17 +162,32 @@ kmem_alloc_tryhard(size_t size, size_t *alloc_size, int kmflags)
  /*
   * Slab allocation interfaces
   */
-#define KMC_NOTOUCH                     0x00000001
-#define KMC_NODEBUG                     0x00000002 /* Default behavior */
-#define KMC_NOMAGAZINE                  0x00000004 /* XXX: No disable support available */
-#define KMC_NOHASH                      0x00000008 /* XXX: No hash available */
-#define KMC_QCACHE                      0x00000010 /* XXX: Unsupported */
-#define KMC_KMEM                       0x00000100 /* Use kmem cache */
-#define KMC_VMEM                       0x00000200 /* Use vmem cache */
-#define KMC_OFFSLAB                    0x00000400 /* Objects not on slab */
-
-#define KMC_REAP_CHUNK                  256
-#define KMC_DEFAULT_SEEKS               DEFAULT_SEEKS
+enum {
+       KMC_BIT_NOTOUCH         = 0,    /* Don't update ages */
+       KMC_BIT_NODEBUG         = 1,    /* Default behavior */
+       KMC_BIT_NOMAGAZINE      = 2,    /* XXX: Unsupported */
+       KMC_BIT_NOHASH          = 3,    /* XXX: Unsupported */
+       KMC_BIT_QCACHE          = 4,    /* XXX: Unsupported */
+       KMC_BIT_KMEM            = 5,    /* Use kmem cache */
+       KMC_BIT_VMEM            = 6,    /* Use vmem cache */
+       KMC_BIT_OFFSLAB         = 7,    /* Objects not on slab */
+       KMC_BIT_REAPING         = 16,   /* Reaping in progress */
+       KMC_BIT_DESTROY         = 17,   /* Destroy in progress */
+};
+
+#define KMC_NOTOUCH            (1 << KMC_BIT_NOTOUCH)
+#define KMC_NODEBUG            (1 << KMC_BIT_NODEBUG)
+#define KMC_NOMAGAZINE         (1 << KMC_BIT_NOMAGAZINE)
+#define KMC_NOHASH             (1 << KMC_BIT_NOHASH)
+#define KMC_QCACHE             (1 << KMC_BIT_QCACHE)
+#define KMC_KMEM               (1 << KMC_BIT_KMEM)
+#define KMC_VMEM               (1 << KMC_BIT_VMEM)
+#define KMC_OFFSLAB            (1 << KMC_BIT_OFFSLAB)
+#define KMC_REAPING            (1 << KMC_BIT_REAPING)
+#define KMC_DESTROY            (1 << KMC_BIT_DESTROY)
+
+#define KMC_REAP_CHUNK                 INT_MAX
+#define KMC_DEFAULT_SEEKS              1
  
  #ifdef DEBUG_KMEM_UNIMPLEMENTED
  static __inline__ void kmem_init(void) {
@@ -223,9 +239,10 @@ extern struct rw_semaphore spl_kmem_cache_sem;
  #define SKS_MAGIC                      0x22222222
  #define SKC_MAGIC                      0x2c2c2c2c
  
-#define SPL_KMEM_CACHE_DELAY           5
-#define SPL_KMEM_CACHE_OBJ_PER_SLAB    32
-#define SPL_KMEM_CACHE_ALIGN           8
+#define SPL_KMEM_CACHE_DELAY           5       /* Minimum slab release age */
+#define SPL_KMEM_CACHE_OBJ_PER_SLAB    32      /* Target objects per slab */
+#define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN        8       /* Minimum objects per slab */
+#define SPL_KMEM_CACHE_ALIGN           8       /* Default object alignment */
  
  typedef int (*spl_kmem_ctor_t)(void *, void *, int);
  typedef void (*spl_kmem_dtor_t)(void *, void *);
@@ -258,24 +275,28 @@ typedef struct spl_kmem_slab {
  } spl_kmem_slab_t;
  
  typedef struct spl_kmem_cache {
-        uint32_t               skc_magic;      /* Sanity magic */
-        uint32_t               skc_name_size;  /* Name length */
-        char                   *skc_name;      /* Name string */
+       uint32_t                skc_magic;      /* Sanity magic */
+       uint32_t                skc_name_size;  /* Name length */
+       char                    *skc_name;      /* Name string */
         spl_kmem_magazine_t     *skc_mag[NR_CPUS]; /* Per-CPU warm cache */
         uint32_t                skc_mag_size;   /* Magazine size */
         uint32_t                skc_mag_refill; /* Magazine refill count */
-        spl_kmem_ctor_t                skc_ctor;       /* Constructor */
-        spl_kmem_dtor_t                skc_dtor;       /* Destructor */
-        spl_kmem_reclaim_t      skc_reclaim;   /* Reclaimator */
-        void                   *skc_private;   /* Private data */
-        void                   *skc_vmp;       /* Unused */
+       spl_kmem_ctor_t         skc_ctor;       /* Constructor */
+       spl_kmem_dtor_t         skc_dtor;       /* Destructor */
+       spl_kmem_reclaim_t      skc_reclaim;    /* Reclaimator */
+       void                    *skc_private;   /* Private data */
+       void                    *skc_vmp;       /* Unused */
         uint32_t                skc_flags;      /* Flags */
         uint32_t                skc_obj_size;   /* Object size */
         uint32_t                skc_obj_align;  /* Object alignment */
         uint32_t                skc_slab_objs;  /* Objects per slab */
-       uint32_t                skc_slab_size;  /* Slab size */
-       uint32_t                skc_delay;      /* slab reclaim interval */
-        struct list_head       skc_list;       /* List of caches linkage */
+       uint32_t                skc_slab_size;  /* Slab size */
+       uint32_t                skc_delay;      /* Slab reclaim interval */
+       atomic_t                skc_ref;        /* Ref count callers */
+       struct delayed_work     skc_work;       /* Slab reclaim work */
+        struct work_struct work;
+        struct timer_list timer;
+       struct list_head        skc_list;       /* List of caches linkage */
         struct list_head        skc_complete_list;/* Completely alloc'ed */
         struct list_head        skc_partial_list; /* Partially alloc'ed */
         spinlock_t              skc_lock;       /* Cache lock */
@@ -283,7 +304,7 @@ typedef struct spl_kmem_cache {
         uint64_t                skc_slab_create;/* Slab creates */
         uint64_t                skc_slab_destroy;/* Slab destroys */
         uint64_t                skc_slab_total; /* Slab total current */
-       uint64_t                skc_slab_alloc; /* Slab alloc current */
+       uint64_t                skc_slab_alloc; /* Slab alloc current */
         uint64_t                skc_slab_max;   /* Slab max historic  */
         uint64_t                skc_obj_total;  /* Obj total current */
         uint64_t                skc_obj_alloc;  /* Obj alloc current */
diff --git a/include/sys/sysmacros.h b/include/sys/sysmacros.h

index 94ff3f84ede6d4c44e8c341f3a80a5b5575c4c4f..b8281238574cbb1cb6140c40d40ed8c414f62404 100644 (file)
--- a/include/sys/sysmacros.h
+++ b/include/sys/sysmacros.h
@@ -203,18 +203,6 @@ extern int ddi_strtoul(const char *str, char **nptr,
  #define offsetof(s, m)  ((size_t)(&(((s *)0)->m)))
  #endif
  
-#ifdef HAVE_3ARGS_INIT_WORK
-
-#define spl_init_work(wq,cb,d) INIT_WORK((wq), (void *)(cb), (void *)(d))
-#define spl_get_work_data(type,field,data)     (data)
-
-#else
-
-#define spl_init_work(wq,cb,d) INIT_WORK((wq), (void *)(cb));
-#define spl_get_work_data(type,field,data)     container_of(data,type,field)
-
-#endif
-
  #ifdef  __cplusplus
  }
  #endif
diff --git a/include/sys/vmsystm.h b/include/sys/vmsystm.h

index e92c17bddb2bc7fba39e03372142a8c7fa046578..1cb716f13c7a537fa77fe42732cf2ef8aa27bc00 100644 (file)
--- a/include/sys/vmsystm.h
+++ b/include/sys/vmsystm.h
@@ -35,8 +35,7 @@
  extern vmem_t *zio_alloc_arena;                /* arena for zio caches */
  
  #define physmem                                num_physpages
-#define freemem                                nr_free_pages() // Expensive on linux,
-                                                       // cheap on solaris
+#define freemem                                nr_free_pages()
  #define minfree                                0
  #define needfree                       0       /* # of needed pages */
  #define ptob(pages)                    (pages * PAGE_SIZE)
diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c

index a68f8efe9d6eb928fa0fbc451e291308ea9fc2cd..83eefe29361d18f0f34a744fef149ad2e206b829 100644 (file)
--- a/module/spl/spl-kmem.c
+++ b/module/spl/spl-kmem.c
@@ -132,10 +132,6 @@ EXPORT_SYMBOL(kmem_set_warning);
   * small virtual address space on 32bit arches.  This will seriously
   * constrain the size of the slab caches and their performance.
   *
- * XXX: Implement work requests to keep an eye on each cache and
- *      shrink them via spl_slab_reclaim() when they are wasting lots
- *      of space.  Currently this process is driven by the reapers.
- *
   * XXX: Improve the partial slab list by carefully maintaining a
   *      strict ordering of fullest to emptiest slabs based on
   *      the slab reference count.  This gaurentees the when freeing
@@ -571,7 +567,8 @@ kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
         }
  }
  
-/* It's important that we pack the spl_kmem_obj_t structure and the
+/*
+ * It's important that we pack the spl_kmem_obj_t structure and the
   * actual objects in to one large address space to minimize the number
   * of calls to the allocator.  It is far better to do a few large
   * allocations and then subdivide it ourselves.  Now which allocator
@@ -662,14 +659,17 @@ out:
         RETURN(sks);
  }
  
-/* Removes slab from complete or partial list, so it must
- * be called with the 'skc->skc_lock' held.
+/*
+ * Remove a slab from complete or partial list, it must be called with
+ * the 'skc->skc_lock' held but the actual free must be performed
+ * outside the lock to prevent deadlocking on vmem addresses.
   */
  static void
-spl_slab_free(spl_kmem_slab_t *sks) {
+spl_slab_free(spl_kmem_slab_t *sks,
+             struct list_head *sks_list, struct list_head *sko_list)
+{
         spl_kmem_cache_t *skc;
         spl_kmem_obj_t *sko, *n;
-       int size;
         ENTRY;
  
         ASSERT(sks->sks_magic == SKS_MAGIC);
@@ -682,114 +682,190 @@ spl_slab_free(spl_kmem_slab_t *sks) {
         skc->skc_obj_total -= sks->sks_objs;
         skc->skc_slab_total--;
         list_del(&sks->sks_list);
-       size = P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align) +
-              P2ROUNDUP(sizeof(spl_kmem_obj_t), skc->skc_obj_align);
  
         /* Run destructors slab is being released */
         list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) {
                 ASSERT(sko->sko_magic == SKO_MAGIC);
+               list_del(&sko->sko_list);
  
                 if (skc->skc_dtor)
                         skc->skc_dtor(sko->sko_addr, skc->skc_private);
  
                 if (skc->skc_flags & KMC_OFFSLAB)
-                       kv_free(skc, sko->sko_addr, size);
+                       list_add(&sko->sko_list, sko_list);
         }
  
-       kv_free(skc, sks, skc->skc_slab_size);
+       list_add(&sks->sks_list, sks_list);
         EXIT;
  }
  
-static int
-__spl_slab_reclaim(spl_kmem_cache_t *skc)
+/*
+ * Traverses all the partial slabs attached to a cache and free those
+ * which are currently empty, and have not been touched for
+ * skc_delay seconds.  This is to avoid thrashing.
+ */
+static void
+spl_slab_reclaim(spl_kmem_cache_t *skc, int flag)
  {
         spl_kmem_slab_t *sks, *m;
-       int rc = 0;
+       spl_kmem_obj_t *sko, *n;
+       LIST_HEAD(sks_list);
+       LIST_HEAD(sko_list);
+       int size;
         ENTRY;
  
-       ASSERT(spin_is_locked(&skc->skc_lock));
         /*
-        * Free empty slabs which have not been touched in skc_delay
-        * seconds.  This delay time is important to avoid thrashing.
-        * Empty slabs will be at the end of the skc_partial_list.
+        * Move empty slabs and objects which have not been touched in
+        * skc_delay seconds on to private lists to be freed outside
+        * the spin lock.  This delay time is important to avoid
+        * thrashing however when flag is set the delay will not be
+        * used.  Empty slabs will be at the end of the skc_partial_list.
          */
+       spin_lock(&skc->skc_lock);
          list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list,
                                          sks_list) {
                 if (sks->sks_ref > 0)
                        break;
  
-               if (time_after(jiffies, sks->sks_age + skc->skc_delay * HZ)) {
-                       spl_slab_free(sks);
-                       rc++;
-               }
+               if (flag || time_after(jiffies,sks->sks_age+skc->skc_delay*HZ))
+                       spl_slab_free(sks, &sks_list, &sko_list);
         }
+       spin_unlock(&skc->skc_lock);
  
-       /* Returns number of slabs reclaimed */
-       RETURN(rc);
+       /*
+        * We only have list of spl_kmem_obj_t's if they are located off
+        * the slab, otherwise they get freed with the spl_kmem_slab_t.
+        */
+       if (!list_empty(&sko_list)) {
+               ASSERT(skc->skc_flags & KMC_OFFSLAB);
+
+               size = P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align) +
+                      P2ROUNDUP(sizeof(spl_kmem_obj_t), skc->skc_obj_align);
+
+               list_for_each_entry_safe(sko, n, &sko_list, sko_list)
+                       kv_free(skc, sko->sko_addr, size);
+       }
+
+       list_for_each_entry_safe(sks, m, &sks_list, sks_list)
+               kv_free(skc, sks, skc->skc_slab_size);
+
+       EXIT;
  }
  
-static int
-spl_slab_reclaim(spl_kmem_cache_t *skc)
+/*
+ * Called regularly on all caches to age objects out of the magazines
+ * which have not been accessed in skc->skc_delay seconds.  This prevents
+ * idle magazines from holding memory which might be better used by
+ * other caches or parts of the system.  The delay is present to
+ * prevent thrashing the magazine.
+ */
+static void
+spl_magazine_age(void *data)
  {
-       int rc;
-       ENTRY;
+       spl_kmem_cache_t *skc = data;
+       spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
  
-       spin_lock(&skc->skc_lock);
-       rc = __spl_slab_reclaim(skc);
-       spin_unlock(&skc->skc_lock);
+       if (skm->skm_avail > 0 &&
+           time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
+               (void)spl_cache_flush(skc, skm, skm->skm_refill);
+}
  
-       RETURN(rc);
+/*
+ * Called regularly to keep a downward pressure on the size of idle
+ * magazines and to release free slabs from the cache.  This function
+ * never calls the registered reclaim function, that only occurs
+ * under memory pressure or with a direct call to spl_kmem_reap().
+ */
+static void
+spl_cache_age(void *data)
+{
+        spl_kmem_cache_t *skc =
+               spl_get_work_data(data, spl_kmem_cache_t, skc_work.work);
+
+       ASSERT(skc->skc_magic == SKC_MAGIC);
+       on_each_cpu(spl_magazine_age, skc, 0, 1);
+       spl_slab_reclaim(skc, 0);
+
+       if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags))
+               schedule_delayed_work(&skc->skc_work, 2 * skc->skc_delay * HZ);
  }
  
-/* Size slabs properly to ensure they are not too large */
+/*
+ * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
+ * When on-slab we want to target SPL_KMEM_CACHE_OBJ_PER_SLAB.  However,
+ * for very small objects we may end up with more than this so as not
+ * to waste space in the minimal allocation of a single page.  Also for
+ * very large objects we may use as few as SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN,
+ * lower than this and we will fail.
+ */
  static int
  spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
  {
-       int max = ((uint64_t)1 << (MAX_ORDER - 1)) * PAGE_SIZE;
-       int align = skc->skc_obj_align;
-
-       *objs = SPL_KMEM_CACHE_OBJ_PER_SLAB;
+       int sks_size, obj_size, max_size, align;
  
         if (skc->skc_flags & KMC_OFFSLAB) {
+               *objs = SPL_KMEM_CACHE_OBJ_PER_SLAB;
                 *size = sizeof(spl_kmem_slab_t);
         } else {
-resize:
-               *size = P2ROUNDUP(sizeof(spl_kmem_slab_t), align) +
-                       *objs * (P2ROUNDUP(skc->skc_obj_size, align) +
-                       P2ROUNDUP(sizeof(spl_kmem_obj_t), align));
+               align = skc->skc_obj_align;
+               sks_size = P2ROUNDUP(sizeof(spl_kmem_slab_t), align);
+               obj_size = P2ROUNDUP(skc->skc_obj_size, align) +
+                           P2ROUNDUP(sizeof(spl_kmem_obj_t), align);
+
+               if (skc->skc_flags & KMC_KMEM)
+                       max_size = ((uint64_t)1 << (MAX_ORDER-1)) * PAGE_SIZE;
+               else
+                       max_size = (32 * 1024 * 1024);
  
-               if (*size > max)
-                       GOTO(resize, *objs = *objs - 1);
+               for (*size = PAGE_SIZE; *size <= max_size; *size += PAGE_SIZE) {
+                       *objs = (*size - sks_size) / obj_size;
+                       if (*objs >= SPL_KMEM_CACHE_OBJ_PER_SLAB)
+                               RETURN(0);
+               }
  
-               ASSERT(*objs > 0);
+               /*
+                * Unable to satisfy target objects per slab, fall back to
+                * allocating a maximally sized slab and, assuming it can
+                * contain the minimum object count, use it.  If not, fail.
+                */
+               *size = max_size;
+               *objs = (*size - sks_size) / obj_size;
+               if (*objs >= SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN)
+                       RETURN(0);
         }
  
-       ASSERTF(*size <= max, "%d < %d\n", *size, max);
-       RETURN(0);
+       RETURN(-ENOSPC);
  }
  
+/*
+ * Make a guess at reasonable per-cpu magazine size based on the size of
+ * each object and the cost of caching N of them in each magazine.  Long
+ * term this should really adapt based on an observed usage heuristic.
+ */
  static int
  spl_magazine_size(spl_kmem_cache_t *skc)
  {
         int size, align = skc->skc_obj_align;
         ENTRY;
  
-       /* Guesses for reasonable magazine sizes, they
-        * should really adapt based on observed usage. */
+       /* Per-magazine sizes below assume a 4Kib page size */
         if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE * 256))
-               size = 4;
+               size = 4;  /* Minimum 4Mib per-magazine */
         else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE * 32))
-               size = 16;
+               size = 16; /* Minimum 2Mib per-magazine */
         else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE))
-               size = 64;
+               size = 64; /* Minimum 256Kib per-magazine */
         else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE / 4))
-               size = 128;
+               size = 128; /* Minimum 128Kib per-magazine */
         else
-               size = 512;
+               size = 256;
  
         RETURN(size);
  }
  
+/*
+ * Allocate a per-cpu magazine to associate with a specific core.
+ */
  static spl_kmem_magazine_t *
  spl_magazine_alloc(spl_kmem_cache_t *skc, int node)
  {
@@ -798,19 +874,21 @@ spl_magazine_alloc(spl_kmem_cache_t *skc, int node)
                    sizeof(void *) * skc->skc_mag_size;
         ENTRY;
  
-       skm = kmem_alloc_node(size, GFP_KERNEL, node);
+       skm = kmem_alloc_node(size, GFP_KERNEL | __GFP_NOFAIL, node);
         if (skm) {
                 skm->skm_magic = SKM_MAGIC;
                 skm->skm_avail = 0;
                 skm->skm_size = skc->skc_mag_size;
                 skm->skm_refill = skc->skc_mag_refill;
-               if (!(skc->skc_flags & KMC_NOTOUCH))
-                       skm->skm_age = jiffies;
+               skm->skm_age = jiffies;
         }
  
         RETURN(skm);
  }
  
+/*
+ * Free a per-cpu magazine associated with a specific core.
+ */
  static void
  spl_magazine_free(spl_kmem_magazine_t *skm)
  {
@@ -825,44 +903,72 @@ spl_magazine_free(spl_kmem_magazine_t *skm)
         EXIT;
  }
  
+static void
+__spl_magazine_create(void *data)
+{
+        spl_kmem_cache_t *skc = data;
+       int id = smp_processor_id();
+
+       skc->skc_mag[id] = spl_magazine_alloc(skc, cpu_to_node(id));
+       ASSERT(skc->skc_mag[id]);
+}
+
+/*
+ * Create all per-cpu magazines of reasonable sizes.
+ */
  static int
  spl_magazine_create(spl_kmem_cache_t *skc)
  {
-       int i;
         ENTRY;
  
         skc->skc_mag_size = spl_magazine_size(skc);
-       skc->skc_mag_refill = (skc->skc_mag_size + 1)  / 2;
+       skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
+       on_each_cpu(__spl_magazine_create, skc, 0, 1);
  
-       for_each_online_cpu(i) {
-               skc->skc_mag[i] = spl_magazine_alloc(skc, cpu_to_node(i));
-               if (!skc->skc_mag[i]) {
-                       for (i--; i >= 0; i--)
-                               spl_magazine_free(skc->skc_mag[i]);
+       RETURN(0);
+}
  
-                       RETURN(-ENOMEM);
-               }
-       }
+static void
+__spl_magazine_destroy(void *data)
+{
+        spl_kmem_cache_t *skc = data;
+       spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
  
-       RETURN(0);
+       (void)spl_cache_flush(skc, skm, skm->skm_avail);
+       spl_magazine_free(skm);
  }
  
+/*
+ * Destroy all per-cpu magazines.
+ */
  static void
  spl_magazine_destroy(spl_kmem_cache_t *skc)
  {
-        spl_kmem_magazine_t *skm;
-       int i;
         ENTRY;
-
-       for_each_online_cpu(i) {
-               skm = skc->skc_mag[i];
-               (void)spl_cache_flush(skc, skm, skm->skm_avail);
-               spl_magazine_free(skm);
-       }
-
+       on_each_cpu(__spl_magazine_destroy, skc, 0, 1);
         EXIT;
  }
  
+/*
+ * Create an object cache based on the following arguments:
+ * name                cache name
+ * size                cache object size
+ * align       cache object alignment
+ * ctor                cache object constructor
+ * dtor                cache object destructor
+ * reclaim     cache object reclaim
+ * priv                cache private data for ctor/dtor/reclaim
+ * vmp         unused must be NULL
+ * flags
+ *     KMC_NOTOUCH     Disable cache object aging (unsupported)
+ *     KMC_NODEBUG     Disable debugging (unsupported)
+ *     KMC_NOMAGAZINE  Disable magazine (unsupported)
+ *     KMC_NOHASH      Disable hashing (unsupported)
+ *     KMC_QCACHE      Disable qcache (unsupported)
+ *     KMC_KMEM        Force kmem backed cache
+ *     KMC_VMEM        Force vmem backed cache
+ *     KMC_OFFSLAB     Locate objects off the slab
+ */
  spl_kmem_cache_t *
  spl_kmem_cache_create(char *name, size_t size, size_t align,
                        spl_kmem_ctor_t ctor,
@@ -908,6 +1014,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
         skc->skc_obj_size = size;
         skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
         skc->skc_delay = SPL_KMEM_CACHE_DELAY;
+       atomic_set(&skc->skc_ref, 0);
  
         INIT_LIST_HEAD(&skc->skc_list);
         INIT_LIST_HEAD(&skc->skc_complete_list);
@@ -947,6 +1054,9 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
         if (rc)
                 GOTO(out, rc);
  
+       spl_init_delayed_work(&skc->skc_work, spl_cache_age, skc);
+       schedule_delayed_work(&skc->skc_work, 2 * skc->skc_delay * HZ);
+
         down_write(&spl_kmem_cache_sem);
         list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
         up_write(&spl_kmem_cache_sem);
@@ -959,10 +1069,13 @@ out:
  }
  EXPORT_SYMBOL(spl_kmem_cache_create);
  
+/*
+ * Destroy a cache and all objects associated with the cache.
+ */
  void
  spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
  {
-        spl_kmem_slab_t *sks, *m;
+       DECLARE_WAIT_QUEUE_HEAD(wq);
         ENTRY;
  
         ASSERT(skc->skc_magic == SKC_MAGIC);
@@ -971,20 +1084,27 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
         list_del_init(&skc->skc_list);
         up_write(&spl_kmem_cache_sem);
  
+       /* Cancel and wait for any pending delayed work */
+       ASSERT(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+       cancel_delayed_work(&skc->skc_work);
+       flush_scheduled_work();
+
+       /* Wait until all current callers complete, this is mainly
+        * to catch the case where a low memory situation triggers a
+        * cache reaping action which races with this destroy. */
+       wait_event(wq, atomic_read(&skc->skc_ref) == 0);
+
         spl_magazine_destroy(skc);
+       spl_slab_reclaim(skc, 1);
         spin_lock(&skc->skc_lock);
  
         /* Validate there are no objects in use and free all the
          * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
+       ASSERT3U(skc->skc_slab_alloc, ==, 0);
+       ASSERT3U(skc->skc_obj_alloc, ==, 0);
+       ASSERT3U(skc->skc_slab_total, ==, 0);
+       ASSERT3U(skc->skc_obj_total, ==, 0);
         ASSERT(list_empty(&skc->skc_complete_list));
-       ASSERT(skc->skc_slab_alloc == 0);
-       ASSERT(skc->skc_obj_alloc == 0);
-
-       list_for_each_entry_safe(sks, m, &skc->skc_partial_list, sks_list)
-               spl_slab_free(sks);
-
-       ASSERT(skc->skc_slab_total == 0);
-       ASSERT(skc->skc_obj_total == 0);
  
         kmem_free(skc->skc_name, skc->skc_name_size);
         spin_unlock(&skc->skc_lock);
@@ -995,6 +1115,10 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
  }
  EXPORT_SYMBOL(spl_kmem_cache_destroy);
  
+/*
+ * Allocate an object from a slab attached to the cache.  This is used to
+ * repopulate the per-cpu magazine caches in batches when they run low.
+ */
  static void *
  spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
  {
@@ -1030,10 +1154,11 @@ spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
         return sko->sko_addr;
  }
  
-/* No available objects create a new slab.  Since this is an
- * expensive operation we do it without holding the spinlock
- * and only briefly aquire it when we link in the fully
- * allocated and constructed slab.
+/*
+ * No available objects on any slabs, create a new slab.  Since this
+ * is an expensive operation we do it without holding the spinlock and
+ * only briefly acquire it when we link in the fully allocated and
+ * constructed slab.
   */
  static spl_kmem_slab_t *
  spl_cache_grow(spl_kmem_cache_t *skc, int flags)
@@ -1042,34 +1167,42 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags)
         ENTRY;
  
         ASSERT(skc->skc_magic == SKC_MAGIC);
+       local_irq_enable();
+       might_sleep();
  
-       if (flags & __GFP_WAIT) {
-               flags |= __GFP_NOFAIL;
-               local_irq_enable();
-               might_sleep();
-       }
-
-       sks = spl_slab_alloc(skc, flags);
-       if (sks == NULL) {
-               if (flags & __GFP_WAIT)
-                       local_irq_disable();
-
-               RETURN(NULL);
+       /*
+        * Before allocating a new slab check if the slab is being reaped.
+        * If it is there is a good chance we can wait until it finishes
+        * and then use one of the newly freed but not aged-out slabs.
+        */
+       if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
+               schedule();
+               GOTO(out, sks= NULL);
         }
  
-       if (flags & __GFP_WAIT)
-               local_irq_disable();
+       /* Allocate a new slab for the cache */
+       sks = spl_slab_alloc(skc, flags | __GFP_NORETRY | __GFP_NOWARN);
+       if (sks == NULL)
+               GOTO(out, sks = NULL);
  
-       /* Link the new empty slab in to the end of skc_partial_list */
+       /* Link the new empty slab in to the end of skc_partial_list. */
         spin_lock(&skc->skc_lock);
         skc->skc_slab_total++;
         skc->skc_obj_total += sks->sks_objs;
         list_add_tail(&sks->sks_list, &skc->skc_partial_list);
         spin_unlock(&skc->skc_lock);
+out:
+       local_irq_disable();
  
         RETURN(sks);
  }
  
+/*
+ * Refill a per-cpu magazine with objects from the slabs for this
+ * cache.  Ideally the magazine can be repopulated using existing
+ * objects which have been released, however if we are unable to
+ * locate enough free objects new slabs of objects will be created.
+ */
  static int
  spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
  {
@@ -1080,13 +1213,11 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
         ASSERT(skc->skc_magic == SKC_MAGIC);
         ASSERT(skm->skm_magic == SKM_MAGIC);
  
-       /* XXX: Check for refill bouncing by age perhaps */
         refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
-
         spin_lock(&skc->skc_lock);
  
         while (refill > 0) {
-               /* No slabs available we must grow the cache */
+               /* No slabs available we may need to grow the cache */
                 if (list_empty(&skc->skc_partial_list)) {
                         spin_unlock(&skc->skc_lock);
  
@@ -1135,6 +1266,9 @@ out:
         RETURN(rc);
  }
  
+/*
+ * Release an object back to the slab from which it came.
+ */
  static void
  spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
  {
@@ -1176,6 +1310,13 @@ spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
         EXIT;
  }
  
+/*
+ * Release a batch of objects from a per-cpu magazine back to their
+ * respective slabs.  This occurs when we exceed the magazine size,
+ * are under memory pressure, when the cache is idle, or during
+ * cache cleanup.  The flush argument contains the number of entries
+ * to remove from the magazine.
+ */
  static int
  spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
  {
@@ -1185,12 +1326,17 @@ spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
         ASSERT(skc->skc_magic == SKC_MAGIC);
         ASSERT(skm->skm_magic == SKM_MAGIC);
  
+       /*
+        * XXX: Currently we simply return objects from the magazine to
+        * the slabs in fifo order.  The ideal thing to do from a memory
+        * fragmentation standpoint is to cheaply determine the set of
+        * objects in the magazine which will result in the largest
+        * number of free slabs if released from the magazine.
+        */
         spin_lock(&skc->skc_lock);
-
         for (i = 0; i < count; i++)
                 spl_cache_shrink(skc, skm->skm_objs[i]);
  
-//     __spl_slab_reclaim(skc);
         skm->skm_avail -= count;
         memmove(skm->skm_objs, &(skm->skm_objs[count]),
                 sizeof(void *) * skm->skm_avail);
@@ -1200,6 +1346,10 @@ spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
         RETURN(count);
  }
  
+/*
+ * Allocate an object from the per-cpu magazine, or if the magazine
+ * is empty directly allocate from a slab and repopulate the magazine.
+ */
  void *
  spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
  {
@@ -1209,7 +1359,9 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
         ENTRY;
  
         ASSERT(skc->skc_magic == SKC_MAGIC);
-       ASSERT(flags & KM_SLEEP); /* XXX: KM_NOSLEEP not yet supported */
+       ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+       ASSERT(flags & KM_SLEEP);
+       atomic_inc(&skc->skc_ref);
         local_irq_save(irq_flags);
  
  restart:
@@ -1225,8 +1377,7 @@ restart:
         if (likely(skm->skm_avail)) {
                 /* Object available in CPU cache, use it */
                 obj = skm->skm_objs[--skm->skm_avail];
-               if (!(skc->skc_flags & KMC_NOTOUCH))
-                       skm->skm_age = jiffies;
+               skm->skm_age = jiffies;
         } else {
                 /* Per-CPU cache empty, directly allocate from
                  * the slab and refill the per-CPU cache. */
@@ -1240,11 +1391,18 @@ restart:
  
         /* Pre-emptively migrate object to CPU L1 cache */
         prefetchw(obj);
+       atomic_dec(&skc->skc_ref);
  
         RETURN(obj);
  }
  EXPORT_SYMBOL(spl_kmem_cache_alloc);
  
+/*
+ * Free an object back to the local per-cpu magazine, there is no
+ * guarantee that this is the same magazine the object was originally
+ * allocated from.  We may need to flush entries from the magazine
+ * back to the slabs to make space.
+ */
  void
  spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
  {
@@ -1253,6 +1411,8 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
         ENTRY;
  
         ASSERT(skc->skc_magic == SKC_MAGIC);
+       ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+       atomic_inc(&skc->skc_ref);
         local_irq_save(flags);
  
         /* Safe to update per-cpu structure without lock, but
@@ -1270,62 +1430,87 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
         skm->skm_objs[skm->skm_avail++] = obj;
  
         local_irq_restore(flags);
+       atomic_dec(&skc->skc_ref);
  
         EXIT;
  }
  EXPORT_SYMBOL(spl_kmem_cache_free);
  
+/*
+ * The generic shrinker function for all caches.  Under linux a shrinker
+ * may not be tightly coupled with a slab cache.  In fact linux always
+ * systematically tries calling all registered shrinker callbacks which
+ * report that they contain unused objects.  Because of this we only
+ * register one shrinker function in the shim layer for all slab caches.
+ * We always attempt to shrink all caches when this generic shrinker
+ * is called.  The shrinker should return the number of free objects
+ * in the cache when called with nr_to_scan == 0 but not attempt to
+ * free any objects.  When nr_to_scan > 0 it is a request that nr_to_scan
+ * objects should be freed.  Because Solaris semantics are to free
+ * all available objects, we may free more objects than requested.
+ */
  static int
  spl_kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask)
  {
         spl_kmem_cache_t *skc;
+       int unused = 0;
  
-       /* Under linux a shrinker is not tightly coupled with a slab
-        * cache.  In fact linux always systematically trys calling all
-        * registered shrinker callbacks until its target reclamation level
-        * is reached.  Because of this we only register one shrinker
-        * function in the shim layer for all slab caches.  And we always
-        * attempt to shrink all caches when this generic shrinker is called.
-        */
         down_read(&spl_kmem_cache_sem);
-
-       list_for_each_entry(skc, &spl_kmem_cache_list, skc_list)
-               spl_kmem_cache_reap_now(skc);
-
+       list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
+               if (nr_to_scan)
+                       spl_kmem_cache_reap_now(skc);
+
+               /*
+                * Presume everything alloc'ed is reclaimable, this ensures
+                * we are called again with nr_to_scan > 0 so we can try to
+                * reclaim.  The exact number is not important either so
+                * we forgo taking this already highly contended lock.
+                */
+               unused += skc->skc_obj_alloc;
+       }
         up_read(&spl_kmem_cache_sem);
  
-       /* XXX: Under linux we should return the remaining number of
-        * entries in the cache.  We should do this as well.
-        */
-       return 1;
+       return (unused * sysctl_vfs_cache_pressure) / 100;
  }
  
+/*
+ * Call the registered reclaim function for a cache.  Depending on how
+ * many and which objects are released it may simply repopulate the
+ * local magazine which will then need to age-out.  Objects which cannot
+ * fit in the magazine will be released back to their slabs which will
+ * also need to age out before being released.  This is all just best
+ * effort and we do not want to thrash creating and destroying slabs.
+ */
  void
  spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
  {
-       spl_kmem_magazine_t *skm;
-       int i;
         ENTRY;
  
         ASSERT(skc->skc_magic == SKC_MAGIC);
+       ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
  
-       if (skc->skc_reclaim)
-               skc->skc_reclaim(skc->skc_private);
+       /* Prevent concurrent cache reaping when contended */
+       if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
+               EXIT;
+               return;
+       }
  
-       /* Ensure per-CPU caches which are idle gradually flush */
-       for_each_online_cpu(i) {
-               skm = skc->skc_mag[i];
+       atomic_inc(&skc->skc_ref);
  
-               if (time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
-                       (void)spl_cache_flush(skc, skm, skm->skm_refill);
-       }
+       if (skc->skc_reclaim)
+               skc->skc_reclaim(skc->skc_private);
  
-       spl_slab_reclaim(skc);
+       spl_slab_reclaim(skc, 0);
+       clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
+       atomic_dec(&skc->skc_ref);
  
         EXIT;
  }
  EXPORT_SYMBOL(spl_kmem_cache_reap_now);
  
+/*
+ * Reap all free slabs from all registered caches.
+ */
  void
  spl_kmem_reap(void)
  {
diff --git a/module/splat/splat-internal.h b/module/splat/splat-internal.h

index 87c47b173373719e24f8e9549294b8e5a134e1f9..0fa177c0293b91066a0849521b6f9c939a543c2d 100644 (file)
--- a/module/splat/splat-internal.h
+++ b/module/splat/splat-internal.h
@@ -40,6 +40,7 @@
  #include <linux/module.h>
  #include <linux/device.h>
  #include <linux/list.h>
+#include <linux/swap.h>
  
  #include <asm/ioctls.h>
  #include <asm/uaccess.h>
diff --git a/module/splat/splat-kmem.c b/module/splat/splat-kmem.c

index 9b96fce90e2ae0d8cf783ca92f7abfa7622e9676..c592e983cba1b05ffa53e96f04309a5eda123128 100644 (file)
--- a/module/splat/splat-kmem.c
+++ b/module/splat/splat-kmem.c
@@ -4,9 +4,9 @@
   *  Copyright (c) 2008 Lawrence Livermore National Security, LLC.
   *  Produced at Lawrence Livermore National Laboratory
   *  Written by:
- *          Brian Behlendorf <behlendorf1@llnl.gov>,
- *          Herb Wartens <wartens2@llnl.gov>,
- *          Jim Garlick <garlick@llnl.gov>
+ *       Brian Behlendorf <behlendorf1@llnl.gov>,
+ *       Herb Wartens <wartens2@llnl.gov>,
+ *       Jim Garlick <garlick@llnl.gov>
   *  UCRL-CODE-235197
   *
   *  This is free software; you can redistribute it and/or modify it
@@ -47,30 +47,37 @@
  #define SPLAT_KMEM_TEST4_DESC          "Memory allocation test (vmem_zalloc)"
  
  #define SPLAT_KMEM_TEST5_ID            0x0105
-#define SPLAT_KMEM_TEST5_NAME          "kmem_small"
+#define SPLAT_KMEM_TEST5_NAME          "slab_small"
  #define SPLAT_KMEM_TEST5_DESC          "Slab ctor/dtor test (small)"
  
  #define SPLAT_KMEM_TEST6_ID            0x0106
-#define SPLAT_KMEM_TEST6_NAME          "kmem_large"
+#define SPLAT_KMEM_TEST6_NAME          "slab_large"
  #define SPLAT_KMEM_TEST6_DESC          "Slab ctor/dtor test (large)"
  
  #define SPLAT_KMEM_TEST7_ID            0x0107
-#define SPLAT_KMEM_TEST7_NAME          "kmem_reap"
-#define SPLAT_KMEM_TEST7_DESC          "Slab reaping test"
+#define SPLAT_KMEM_TEST7_NAME          "slab_align"
+#define SPLAT_KMEM_TEST7_DESC          "Slab alignment test"
  
  #define SPLAT_KMEM_TEST8_ID            0x0108
-#define SPLAT_KMEM_TEST8_NAME          "kmem_lock"
-#define SPLAT_KMEM_TEST8_DESC          "Slab locking test"
+#define SPLAT_KMEM_TEST8_NAME          "slab_reap"
+#define SPLAT_KMEM_TEST8_DESC          "Slab reaping test"
  
  #define SPLAT_KMEM_TEST9_ID            0x0109
-#define SPLAT_KMEM_TEST9_NAME          "kmem_align"
-#define SPLAT_KMEM_TEST9_DESC          "Slab alignment test"
+#define SPLAT_KMEM_TEST9_NAME          "slab_age"
+#define SPLAT_KMEM_TEST9_DESC          "Slab aging test"
+
+#define SPLAT_KMEM_TEST10_ID           0x010a
+#define SPLAT_KMEM_TEST10_NAME         "slab_lock"
+#define SPLAT_KMEM_TEST10_DESC         "Slab locking test"
+
+#define SPLAT_KMEM_TEST11_ID           0x010b
+#define SPLAT_KMEM_TEST11_NAME         "slab_overcommit"
+#define SPLAT_KMEM_TEST11_DESC         "Slab memory overcommit test"
  
  #define SPLAT_KMEM_ALLOC_COUNT         10
  #define SPLAT_VMEM_ALLOC_COUNT         10
  
  
-/* XXX - This test may fail under tight memory conditions */
  static int
  splat_kmem_test1(struct file *file, void *arg)
  {
@@ -96,8 +103,8 @@ splat_kmem_test1(struct file *file, void *arg)
                                 kmem_free(ptr[i], size);
  
                 splat_vprint(file, SPLAT_KMEM_TEST1_NAME,
-                          "%d byte allocations, %d/%d successful\n",
-                          size, count, SPLAT_KMEM_ALLOC_COUNT);
+                          "%d byte allocations, %d/%d successful\n",
+                          size, count, SPLAT_KMEM_ALLOC_COUNT);
                 if (count != SPLAT_KMEM_ALLOC_COUNT)
                         rc = -ENOMEM;
  
@@ -134,8 +141,8 @@ splat_kmem_test2(struct file *file, void *arg)
                         for (j = 0; j < size; j++) {
                                 if (((char *)ptr[i])[j] != '\0') {
                                         splat_vprint(file, SPLAT_KMEM_TEST2_NAME,
-                                                 "%d-byte allocation was "
-                                                 "not zeroed\n", size);
+                                                 "%d-byte allocation was "
+                                                 "not zeroed\n", size);
                                         rc = -EFAULT;
                                 }
                         }
@@ -146,8 +153,8 @@ splat_kmem_test2(struct file *file, void *arg)
                                 kmem_free(ptr[i], size);
  
                 splat_vprint(file, SPLAT_KMEM_TEST2_NAME,
-                          "%d byte allocations, %d/%d successful\n",
-                          size, count, SPLAT_KMEM_ALLOC_COUNT);
+                          "%d byte allocations, %d/%d successful\n",
+                          size, count, SPLAT_KMEM_ALLOC_COUNT);
                 if (count != SPLAT_KMEM_ALLOC_COUNT)
                         rc = -ENOMEM;
  
@@ -180,8 +187,8 @@ splat_kmem_test3(struct file *file, void *arg)
                                 vmem_free(ptr[i], size);
  
                 splat_vprint(file, SPLAT_KMEM_TEST3_NAME,
-                          "%d byte allocations, %d/%d successful\n",
-                          size, count, SPLAT_VMEM_ALLOC_COUNT);
+                          "%d byte allocations, %d/%d successful\n",
+                          size, count, SPLAT_VMEM_ALLOC_COUNT);
                 if (count != SPLAT_VMEM_ALLOC_COUNT)
                         rc = -ENOMEM;
  
@@ -212,8 +219,8 @@ splat_kmem_test4(struct file *file, void *arg)
                         for (j = 0; j < size; j++) {
                                 if (((char *)ptr[i])[j] != '\0') {
                                         splat_vprint(file, SPLAT_KMEM_TEST4_NAME,
-                                                 "%d-byte allocation was "
-                                                 "not zeroed\n", size);
+                                                 "%d-byte allocation was "
+                                                 "not zeroed\n", size);
                                         rc = -EFAULT;
                                 }
                         }
@@ -224,8 +231,8 @@ splat_kmem_test4(struct file *file, void *arg)
                                 vmem_free(ptr[i], size);
  
                 splat_vprint(file, SPLAT_KMEM_TEST4_NAME,
-                          "%d byte allocations, %d/%d successful\n",
-                          size, count, SPLAT_VMEM_ALLOC_COUNT);
+                          "%d byte allocations, %d/%d successful\n",
+                          size, count, SPLAT_VMEM_ALLOC_COUNT);
                 if (count != SPLAT_VMEM_ALLOC_COUNT)
                         rc = -ENOMEM;
  
@@ -237,8 +244,11 @@ splat_kmem_test4(struct file *file, void *arg)
  
  #define SPLAT_KMEM_TEST_MAGIC          0x004488CCUL
  #define SPLAT_KMEM_CACHE_NAME          "kmem_test"
-#define SPLAT_KMEM_OBJ_COUNT           128
-#define SPLAT_KMEM_OBJ_RECLAIM         16
+#define SPLAT_KMEM_OBJ_COUNT           1024
+#define SPLAT_KMEM_OBJ_RECLAIM         20 /* percent */
+#define SPLAT_KMEM_THREADS             32
+
+#define KCP_FLAG_READY                 0x01
  
  typedef struct kmem_cache_data {
         unsigned long kcd_magic;
@@ -246,21 +256,95 @@ typedef struct kmem_cache_data {
         char kcd_buf[0];
  } kmem_cache_data_t;
  
+typedef struct kmem_cache_thread {
+       kmem_cache_t *kct_cache;
+       spinlock_t kct_lock;
+       int kct_id;
+       int kct_kcd_count;
+       kmem_cache_data_t *kct_kcd[0];
+} kmem_cache_thread_t;
+
  typedef struct kmem_cache_priv {
         unsigned long kcp_magic;
         struct file *kcp_file;
         kmem_cache_t *kcp_cache;
-       kmem_cache_data_t *kcp_kcd[SPLAT_KMEM_OBJ_COUNT];
         spinlock_t kcp_lock;
-       wait_queue_head_t kcp_waitq;
+       wait_queue_head_t kcp_ctl_waitq;
+       wait_queue_head_t kcp_thr_waitq;
+       int kcp_flags;
+       int kcp_kct_count;
+       kmem_cache_thread_t *kcp_kct[SPLAT_KMEM_THREADS];
         int kcp_size;
         int kcp_align;
         int kcp_count;
-       int kcp_threads;
         int kcp_alloc;
         int kcp_rc;
+       int kcp_kcd_count;
+       kmem_cache_data_t *kcp_kcd[0];
  } kmem_cache_priv_t;
  
+static kmem_cache_priv_t *
+splat_kmem_cache_test_kcp_alloc(struct file *file, char *name,
+                               int size, int align, int alloc, int count)
+{
+       kmem_cache_priv_t *kcp;
+
+       kcp = vmem_zalloc(sizeof(kmem_cache_priv_t) +
+                         count * sizeof(kmem_cache_data_t *), KM_SLEEP);
+       if (!kcp)
+               return NULL;
+
+       kcp->kcp_magic = SPLAT_KMEM_TEST_MAGIC;
+       kcp->kcp_file = file;
+       kcp->kcp_cache = NULL;
+       spin_lock_init(&kcp->kcp_lock);
+       init_waitqueue_head(&kcp->kcp_ctl_waitq);
+       init_waitqueue_head(&kcp->kcp_thr_waitq);
+       kcp->kcp_flags = 0;
+       kcp->kcp_kct_count = -1;
+       kcp->kcp_size = size;
+       kcp->kcp_align = align;
+       kcp->kcp_count = 0;
+       kcp->kcp_alloc = alloc;
+       kcp->kcp_rc = 0;
+       kcp->kcp_kcd_count = count;
+
+       return kcp;
+}
+
+static void
+splat_kmem_cache_test_kcp_free(kmem_cache_priv_t *kcp)
+{
+       vmem_free(kcp, sizeof(kmem_cache_priv_t) +
+                 kcp->kcp_kcd_count * sizeof(kmem_cache_data_t *));
+}
+
+static kmem_cache_thread_t *
+splat_kmem_cache_test_kct_alloc(int id, int count)
+{
+       kmem_cache_thread_t *kct;
+
+       ASSERTF(id < SPLAT_KMEM_THREADS, "id=%d\n", id);
+       kct = vmem_zalloc(sizeof(kmem_cache_thread_t) +
+                         count * sizeof(kmem_cache_data_t *), KM_SLEEP);
+       if (!kct)
+               return NULL;
+
+       spin_lock_init(&kct->kct_lock);
+       kct->kct_cache = NULL;
+       kct->kct_id = id;
+       kct->kct_kcd_count = count;
+
+       return kct;
+}
+
+static void
+splat_kmem_cache_test_kct_free(kmem_cache_thread_t *kct)
+{
+       vmem_free(kct, sizeof(kmem_cache_thread_t) +
+                 kct->kct_kcd_count * sizeof(kmem_cache_data_t *));
+}
+
  static int
  splat_kmem_cache_test_constructor(void *ptr, void *priv, int flags)
  {
@@ -293,83 +377,340 @@ splat_kmem_cache_test_destructor(void *ptr, void *priv)
         return;
  }
  
+/*
+ * Generic reclaim function which assumes that all objects may
+ * be reclaimed at any time.  We free a small percentage of the
+ * objects linked off the kcp or kct[] every time we are called.
+ */
+static void
+splat_kmem_cache_test_reclaim(void *priv)
+{
+       kmem_cache_priv_t *kcp = (kmem_cache_priv_t *)priv;
+       kmem_cache_thread_t *kct;
+       int i, j, count;
+
+       ASSERT(kcp->kcp_magic == SPLAT_KMEM_TEST_MAGIC);
+       count = kcp->kcp_kcd_count * SPLAT_KMEM_OBJ_RECLAIM / 100;
+
+       /* Objects directly attached to the kcp */
+       spin_lock(&kcp->kcp_lock);
+       for (i = 0; i < kcp->kcp_kcd_count; i++) {
+               if (kcp->kcp_kcd[i]) {
+                       kmem_cache_free(kcp->kcp_cache, kcp->kcp_kcd[i]);
+                       kcp->kcp_kcd[i] = NULL;
+
+                       if ((--count) == 0)
+                               break;
+               }
+       }
+       spin_unlock(&kcp->kcp_lock);
+
+       /* No threads containing objects to consider */
+       if (kcp->kcp_kct_count == -1)
+               return;
+
+       /* Objects attached to a kct thread */
+       for (i = 0; i < kcp->kcp_kct_count; i++) {
+               spin_lock(&kcp->kcp_lock);
+               kct = kcp->kcp_kct[i];
+               spin_unlock(&kcp->kcp_lock);
+               if (!kct)
+                       continue;
+
+               spin_lock(&kct->kct_lock);
+               count = kct->kct_kcd_count * SPLAT_KMEM_OBJ_RECLAIM / 100;
+
+               for (j = 0; j < kct->kct_kcd_count; j++) {
+                       if (kct->kct_kcd[j]) {
+                               kmem_cache_free(kcp->kcp_cache,kct->kct_kcd[j]);
+                               kct->kct_kcd[j] = NULL;
+
+                               if ((--count) == 0)
+                                       break;
+                       }
+               }
+               spin_unlock(&kct->kct_lock);
+       }
+
+       return;
+}
+
+static int
+splat_kmem_cache_test_threads(kmem_cache_priv_t *kcp, int threads)
+{
+       int rc;
+
+       spin_lock(&kcp->kcp_lock);
+       rc = (kcp->kcp_kct_count == threads);
+       spin_unlock(&kcp->kcp_lock);
+
+       return rc;
+}
+
+static int
+splat_kmem_cache_test_flags(kmem_cache_priv_t *kcp, int flags)
+{
+       int rc;
+
+       spin_lock(&kcp->kcp_lock);
+       rc = (kcp->kcp_flags & flags);
+       spin_unlock(&kcp->kcp_lock);
+
+       return rc;
+}
+
+static void
+splat_kmem_cache_test_thread(void *arg)
+{
+       kmem_cache_priv_t *kcp = (kmem_cache_priv_t *)arg;
+       kmem_cache_thread_t *kct;
+       int rc = 0, id, i;
+       void *obj;
+
+       ASSERT(kcp->kcp_magic == SPLAT_KMEM_TEST_MAGIC);
+
+       /* Assign thread ids */
+       spin_lock(&kcp->kcp_lock);
+       if (kcp->kcp_kct_count == -1)
+               kcp->kcp_kct_count = 0;
+
+       id = kcp->kcp_kct_count;
+       kcp->kcp_kct_count++;
+       spin_unlock(&kcp->kcp_lock);
+
+       kct = splat_kmem_cache_test_kct_alloc(id, kcp->kcp_alloc);
+       if (!kct) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       spin_lock(&kcp->kcp_lock);
+       kcp->kcp_kct[id] = kct;
+       spin_unlock(&kcp->kcp_lock);
+
+       /* Wait for all threads to have started and report they are ready */
+       if (kcp->kcp_kct_count == SPLAT_KMEM_THREADS)
+               wake_up(&kcp->kcp_ctl_waitq);
+
+       wait_event(kcp->kcp_thr_waitq,
+               splat_kmem_cache_test_flags(kcp, KCP_FLAG_READY));
+
+       /*
+        * Updates to kct->kct_kcd[] are performed under a spin_lock so
+        * they may safely run concurrent with the reclaim function.  If
+        * we are not in a low memory situation we have one lock per-
+        * thread so they are not expected to be contended.
+        */
+       for (i = 0; i < kct->kct_kcd_count; i++) {
+               obj = kmem_cache_alloc(kcp->kcp_cache, KM_SLEEP);
+               spin_lock(&kct->kct_lock);
+               kct->kct_kcd[i] = obj;
+               spin_unlock(&kct->kct_lock);
+       }
+
+       for (i = 0; i < kct->kct_kcd_count; i++) {
+               spin_lock(&kct->kct_lock);
+               if (kct->kct_kcd[i]) {
+                       kmem_cache_free(kcp->kcp_cache, kct->kct_kcd[i]);
+                       kct->kct_kcd[i] = NULL;
+               }
+               spin_unlock(&kct->kct_lock);
+       }
+out:
+       spin_lock(&kcp->kcp_lock);
+       if (kct) {
+               splat_kmem_cache_test_kct_free(kct);
+               kcp->kcp_kct[id] = kct = NULL;
+       }
+
+       if (!kcp->kcp_rc)
+               kcp->kcp_rc = rc;
+
+       if ((--kcp->kcp_kct_count) == 0)
+               wake_up(&kcp->kcp_ctl_waitq);
+
+       spin_unlock(&kcp->kcp_lock);
+
+       thread_exit();
+}
+
  static int
  splat_kmem_cache_test(struct file *file, void *arg, char *name,
-                          int size, int align, int flags)
+                     int size, int align, int flags)
  {
-       kmem_cache_t *cache = NULL;
-       kmem_cache_data_t *kcd = NULL;
-       kmem_cache_priv_t kcp;
+       kmem_cache_priv_t *kcp;
+       kmem_cache_data_t *kcd;
         int rc = 0, max;
  
-       kcp.kcp_magic = SPLAT_KMEM_TEST_MAGIC;
-       kcp.kcp_file = file;
-       kcp.kcp_size = size;
-       kcp.kcp_align = align;
-       kcp.kcp_count = 0;
-       kcp.kcp_rc = 0;
-
-       cache = kmem_cache_create(SPLAT_KMEM_CACHE_NAME,
-                                 kcp.kcp_size, kcp.kcp_align,
-                                 splat_kmem_cache_test_constructor,
-                                 splat_kmem_cache_test_destructor,
-                                 NULL, &kcp, NULL, flags);
-       if (!cache) {
+       kcp = splat_kmem_cache_test_kcp_alloc(file, name, size, align, 0, 1);
+       if (!kcp) {
+               splat_vprint(file, name, "Unable to create '%s'\n", "kcp");
+               return -ENOMEM;
+       }
+
+       kcp->kcp_cache =
+               kmem_cache_create(SPLAT_KMEM_CACHE_NAME,
+                                 kcp->kcp_size, kcp->kcp_align,
+                                 splat_kmem_cache_test_constructor,
+                                 splat_kmem_cache_test_destructor,
+                                 NULL, kcp, NULL, flags);
+       if (!kcp->kcp_cache) {
                 splat_vprint(file, name,
-                            "Unable to create '%s'\n",
+                            "Unable to create '%s'\n",
                              SPLAT_KMEM_CACHE_NAME);
-               return -ENOMEM;
+               rc = -ENOMEM;
+               goto out_free;
         }
  
-       kcd = kmem_cache_alloc(cache, KM_SLEEP);
+       kcd = kmem_cache_alloc(kcp->kcp_cache, KM_SLEEP);
         if (!kcd) {
                 splat_vprint(file, name,
-                            "Unable to allocate from '%s'\n",
-                            SPLAT_KMEM_CACHE_NAME);
+                            "Unable to allocate from '%s'\n",
+                            SPLAT_KMEM_CACHE_NAME);
                 rc = -EINVAL;
                 goto out_free;
         }
+       spin_lock(&kcp->kcp_lock);
+       kcp->kcp_kcd[0] = kcd;
+       spin_unlock(&kcp->kcp_lock);
  
-       if (!kcd->kcd_flag) {
+       if (!kcp->kcp_kcd[0]->kcd_flag) {
                 splat_vprint(file, name,
-                            "Failed to run contructor for '%s'\n",
-                            SPLAT_KMEM_CACHE_NAME);
+                            "Failed to run contructor for '%s'\n",
+                            SPLAT_KMEM_CACHE_NAME);
                 rc = -EINVAL;
                 goto out_free;
         }
  
-       if (kcd->kcd_magic != kcp.kcp_magic) {
+       if (kcp->kcp_kcd[0]->kcd_magic != kcp->kcp_magic) {
                 splat_vprint(file, name,
-                            "Failed to pass private data to constructor "
-                            "for '%s'\n", SPLAT_KMEM_CACHE_NAME);
+                            "Failed to pass private data to constructor "
+                            "for '%s'\n", SPLAT_KMEM_CACHE_NAME);
                 rc = -EINVAL;
                 goto out_free;
         }
  
-       max = kcp.kcp_count;
-       kmem_cache_free(cache, kcd);
+       max = kcp->kcp_count;
+       spin_lock(&kcp->kcp_lock);
+       kmem_cache_free(kcp->kcp_cache, kcp->kcp_kcd[0]);
+       kcp->kcp_kcd[0] = NULL;
+       spin_unlock(&kcp->kcp_lock);
  
         /* Destroy the entire cache which will force destructors to
          * run and we can verify one was called for every object */
-       kmem_cache_destroy(cache);
-       if (kcp.kcp_count) {
+       kmem_cache_destroy(kcp->kcp_cache);
+       if (kcp->kcp_count) {
                 splat_vprint(file, name,
-                            "Failed to run destructor on all slab objects "
-                            "for '%s'\n", SPLAT_KMEM_CACHE_NAME);
+                            "Failed to run destructor on all slab objects "
+                            "for '%s'\n", SPLAT_KMEM_CACHE_NAME);
                 rc = -EINVAL;
         }
  
         splat_vprint(file, name,
-                    "Successfully ran ctors/dtors for %d elements in '%s'\n",
-                    max, SPLAT_KMEM_CACHE_NAME);
+                    "Successfully ran ctors/dtors for %d elements in '%s'\n",
+                    max, SPLAT_KMEM_CACHE_NAME);
  
         return rc;
  
  out_free:
-       if (kcd)
-               kmem_cache_free(cache, kcd);
+       if (kcp->kcp_kcd[0]) {
+               spin_lock(&kcp->kcp_lock);
+               kmem_cache_free(kcp->kcp_cache, kcp->kcp_kcd[0]);
+               kcp->kcp_kcd[0] = NULL;
+               spin_unlock(&kcp->kcp_lock);
+       }
+
+       if (kcp->kcp_cache)
+               kmem_cache_destroy(kcp->kcp_cache);
+
+       splat_kmem_cache_test_kcp_free(kcp);
+
+       return rc;
+}
+
+static int
+splat_kmem_cache_thread_test(struct file *file, void *arg, char *name,
+                            int size, int alloc)
+{
+       kmem_cache_priv_t *kcp;
+       kthread_t *thr;
+       struct timespec start, stop, delta;
+       char cache_name[32];
+       int i, rc = 0;
+
+       kcp = splat_kmem_cache_test_kcp_alloc(file, name, size, 0, alloc, 0);
+       if (!kcp) {
+               splat_vprint(file, name, "Unable to create '%s'\n", "kcp");
+               return -ENOMEM;
+       }
+
+       (void)snprintf(cache_name, 32, "%s-%d-%d",
+                      SPLAT_KMEM_CACHE_NAME, size, alloc);
+       kcp->kcp_cache =
+               kmem_cache_create(cache_name, kcp->kcp_size, 0,
+                                 splat_kmem_cache_test_constructor,
+                                 splat_kmem_cache_test_destructor,
+                                 splat_kmem_cache_test_reclaim,
+                                 kcp, NULL, KMC_VMEM);
+       if (!kcp->kcp_cache) {
+               splat_vprint(file, name, "Unable to create '%s'\n", cache_name);
+               rc = -ENOMEM;
+               goto out_kcp;
+       }
+
+       start = current_kernel_time();
+
+       for (i = 0; i < SPLAT_KMEM_THREADS; i++) {
+               thr = thread_create(NULL, 0,
+                                   splat_kmem_cache_test_thread,
+                                   kcp, 0, &p0, TS_RUN, minclsyspri);
+               if (thr == NULL) {
+                       rc = -ESRCH;
+                       goto out_cache;
+               }
+       }
+
+       /* Sleep until all threads have started, then set the ready
+        * flag and wake them all up for maximum concurrency. */
+       wait_event(kcp->kcp_ctl_waitq,
+                  splat_kmem_cache_test_threads(kcp, SPLAT_KMEM_THREADS));
+
+       spin_lock(&kcp->kcp_lock);
+       kcp->kcp_flags |= KCP_FLAG_READY;
+       spin_unlock(&kcp->kcp_lock);
+       wake_up_all(&kcp->kcp_thr_waitq);
+
+       /* Sleep until all threads have finished */
+       wait_event(kcp->kcp_ctl_waitq, splat_kmem_cache_test_threads(kcp, 0));
+
+       stop = current_kernel_time();
+       delta = timespec_sub(stop, start);
  
-       kmem_cache_destroy(cache);
+       splat_vprint(file, name,
+                    "%-22s %2ld.%09ld\t"
+                    "%lu/%lu/%lu\t%lu/%lu/%lu\n",
+                    kcp->kcp_cache->skc_name,
+                    delta.tv_sec, delta.tv_nsec,
+                    (unsigned long)kcp->kcp_cache->skc_slab_total,
+                    (unsigned long)kcp->kcp_cache->skc_slab_max,
+                    (unsigned long)(kcp->kcp_alloc *
+                                   SPLAT_KMEM_THREADS /
+                                   SPL_KMEM_CACHE_OBJ_PER_SLAB),
+                    (unsigned long)kcp->kcp_cache->skc_obj_total,
+                    (unsigned long)kcp->kcp_cache->skc_obj_max,
+                    (unsigned long)(kcp->kcp_alloc *
+                                    SPLAT_KMEM_THREADS));
+
+       if (delta.tv_sec >= 5)
+               rc = -ETIME;
+
+       if (!rc && kcp->kcp_rc)
+               rc = kcp->kcp_rc;
+
+out_cache:
+       kmem_cache_destroy(kcp->kcp_cache);
+out_kcp:
+       splat_kmem_cache_test_kcp_free(kcp);
         return rc;
  }
  
@@ -409,291 +750,279 @@ splat_kmem_test6(struct file *file, void *arg)
         return splat_kmem_cache_test(file, arg, name, 128*1028, 0, KMC_VMEM);
  }
  
-static void
-splat_kmem_cache_test_reclaim(void *priv)
+/* Validate object alignment behavior for caches */
+static int
+splat_kmem_test7(struct file *file, void *arg)
  {
-       kmem_cache_priv_t *kcp = (kmem_cache_priv_t *)priv;
-       int i, count;
-
-       count = min(SPLAT_KMEM_OBJ_RECLAIM, kcp->kcp_count);
-       splat_vprint(kcp->kcp_file, SPLAT_KMEM_TEST7_NAME,
-                     "Reaping %d objects from '%s'\n", count,
-                    SPLAT_KMEM_CACHE_NAME);
-
-       for (i = 0; i < SPLAT_KMEM_OBJ_COUNT; i++) {
-               if (kcp->kcp_kcd[i]) {
-                       kmem_cache_free(kcp->kcp_cache, kcp->kcp_kcd[i]);
-                       kcp->kcp_kcd[i] = NULL;
+       char *name = SPLAT_KMEM_TEST7_NAME;
+       int i, rc;
  
-                       if (--count == 0)
-                               break;
-               }
+       for (i = 8; i <= PAGE_SIZE; i *= 2) {
+               rc = splat_kmem_cache_test(file, arg, name, 157, i, 0);
+               if (rc)
+                       return rc;
         }
  
-       return;
+       return rc;
  }
  
  static int
-splat_kmem_test7(struct file *file, void *arg)
+splat_kmem_test8(struct file *file, void *arg)
  {
-       kmem_cache_t *cache;
-       kmem_cache_priv_t kcp;
-       int i, rc = 0;
-
-       kcp.kcp_magic = SPLAT_KMEM_TEST_MAGIC;
-       kcp.kcp_file = file;
-       kcp.kcp_size = 256;
-       kcp.kcp_count = 0;
-       kcp.kcp_rc = 0;
-
-       cache = kmem_cache_create(SPLAT_KMEM_CACHE_NAME, kcp.kcp_size, 0,
-                                 splat_kmem_cache_test_constructor,
-                                 splat_kmem_cache_test_destructor,
-                                 splat_kmem_cache_test_reclaim,
-                                 &kcp, NULL, 0);
-       if (!cache) {
-               splat_vprint(file, SPLAT_KMEM_TEST7_NAME,
-                          "Unable to create '%s'\n", SPLAT_KMEM_CACHE_NAME);
+       kmem_cache_priv_t *kcp;
+       kmem_cache_data_t *kcd;
+       int i, j, rc = 0;
+
+       kcp = splat_kmem_cache_test_kcp_alloc(file, SPLAT_KMEM_TEST8_NAME,
+                                             256, 0, 0, SPLAT_KMEM_OBJ_COUNT);
+       if (!kcp) {
+               splat_vprint(file, SPLAT_KMEM_TEST8_NAME,
+                            "Unable to create '%s'\n", "kcp");
                 return -ENOMEM;
         }
  
-       kcp.kcp_cache = cache;
+       kcp->kcp_cache =
+               kmem_cache_create(SPLAT_KMEM_CACHE_NAME, kcp->kcp_size, 0,
+                                 splat_kmem_cache_test_constructor,
+                                 splat_kmem_cache_test_destructor,
+                                 splat_kmem_cache_test_reclaim,
+                                 kcp, NULL, 0);
+       if (!kcp->kcp_cache) {
+               splat_kmem_cache_test_kcp_free(kcp);
+               splat_vprint(file, SPLAT_KMEM_TEST8_NAME,
+                          "Unable to create '%s'\n", SPLAT_KMEM_CACHE_NAME);
+               return -ENOMEM;
+       }
  
         for (i = 0; i < SPLAT_KMEM_OBJ_COUNT; i++) {
-               /* All allocations need not succeed */
-               kcp.kcp_kcd[i] = kmem_cache_alloc(cache, KM_SLEEP);
-               if (!kcp.kcp_kcd[i]) {
-                       splat_vprint(file, SPLAT_KMEM_TEST7_NAME,
-                                  "Unable to allocate from '%s'\n",
-                                  SPLAT_KMEM_CACHE_NAME);
+               kcd = kmem_cache_alloc(kcp->kcp_cache, KM_SLEEP);
+               spin_lock(&kcp->kcp_lock);
+               kcp->kcp_kcd[i] = kcd;
+               spin_unlock(&kcp->kcp_lock);
+               if (!kcd) {
+                       splat_vprint(file, SPLAT_KMEM_TEST8_NAME,
+                                  "Unable to allocate from '%s'\n",
+                                  SPLAT_KMEM_CACHE_NAME);
                 }
         }
  
-       ASSERT(kcp.kcp_count > 0);
-
         /* Request the slab cache free any objects it can.  For a few reasons
          * this may not immediately result in more free memory even if objects
          * are freed.  First off, due to fragmentation we may not be able to
          * reclaim any slabs.  Secondly, even if we do we fully clear some
          * slabs we will not want to immedately reclaim all of them because
          * we may contend with cache allocs and thrash.  What we want to see
-        * is slab size decrease more gradually as it becomes clear they
+        * is the slab size decrease more gradually as it becomes clear they
          * will not be needed.  This should be acheivable in less than minute
          * if it takes longer than this something has gone wrong.
          */
         for (i = 0; i < 60; i++) {
-               kmem_cache_reap_now(cache);
-               splat_vprint(file, SPLAT_KMEM_TEST7_NAME,
-                             "%s cache objects %d, slabs %u/%u objs %u/%u\n",
-                            SPLAT_KMEM_CACHE_NAME, kcp.kcp_count,
-                           (unsigned)cache->skc_slab_alloc,
-                           (unsigned)cache->skc_slab_total,
-                           (unsigned)cache->skc_obj_alloc,
-                           (unsigned)cache->skc_obj_total);
-
-               if (cache->skc_obj_total == 0)
+               kmem_cache_reap_now(kcp->kcp_cache);
+               splat_vprint(file, SPLAT_KMEM_TEST8_NAME,
+                            "%s cache objects %d, slabs %u/%u objs %u/%u mags ",
+                            SPLAT_KMEM_CACHE_NAME, kcp->kcp_count,
+                           (unsigned)kcp->kcp_cache->skc_slab_alloc,
+                           (unsigned)kcp->kcp_cache->skc_slab_total,
+                           (unsigned)kcp->kcp_cache->skc_obj_alloc,
+                           (unsigned)kcp->kcp_cache->skc_obj_total);
+
+               for_each_online_cpu(j)
+                       splat_print(file, "%u/%u ",
+                                    kcp->kcp_cache->skc_mag[j]->skm_avail,
+                                    kcp->kcp_cache->skc_mag[j]->skm_size);
+
+               splat_print(file, "%s\n", "");
+
+               if (kcp->kcp_cache->skc_obj_total == 0)
                         break;
  
                 set_current_state(TASK_INTERRUPTIBLE);
                 schedule_timeout(HZ);
         }
  
-       if (cache->skc_obj_total == 0) {
-               splat_vprint(file, SPLAT_KMEM_TEST7_NAME,
+       if (kcp->kcp_cache->skc_obj_total == 0) {
+               splat_vprint(file, SPLAT_KMEM_TEST8_NAME,
                         "Successfully created %d objects "
                         "in cache %s and reclaimed them\n",
-                       SPLAT_KMEM_OBJ_COUNT, SPLAT_KMEM_CACHE_NAME);
+                       SPLAT_KMEM_OBJ_COUNT, SPLAT_KMEM_CACHE_NAME);
         } else {
-               splat_vprint(file, SPLAT_KMEM_TEST7_NAME,
+               splat_vprint(file, SPLAT_KMEM_TEST8_NAME,
                         "Failed to reclaim %u/%d objects from cache %s\n",
-                       (unsigned)cache->skc_obj_total, SPLAT_KMEM_OBJ_COUNT,
-                       SPLAT_KMEM_CACHE_NAME);
+                       (unsigned)kcp->kcp_cache->skc_obj_total,
+                       SPLAT_KMEM_OBJ_COUNT, SPLAT_KMEM_CACHE_NAME);
                 rc = -ENOMEM;
         }
  
         /* Cleanup our mess (for failure case of time expiring) */
+       spin_lock(&kcp->kcp_lock);
         for (i = 0; i < SPLAT_KMEM_OBJ_COUNT; i++)
-               if (kcp.kcp_kcd[i])
-                       kmem_cache_free(cache, kcp.kcp_kcd[i]);
+               if (kcp->kcp_kcd[i])
+                       kmem_cache_free(kcp->kcp_cache, kcp->kcp_kcd[i]);
+       spin_unlock(&kcp->kcp_lock);
  
-       kmem_cache_destroy(cache);
+       kmem_cache_destroy(kcp->kcp_cache);
+       splat_kmem_cache_test_kcp_free(kcp);
  
         return rc;
  }
  
-static void
-splat_kmem_test8_thread(void *arg)
+static int
+splat_kmem_test9(struct file *file, void *arg)
  {
-       kmem_cache_priv_t *kcp = (kmem_cache_priv_t *)arg;
-       int count = kcp->kcp_alloc, rc = 0, i;
-       void **objs;
-
-       ASSERT(kcp->kcp_magic == SPLAT_KMEM_TEST_MAGIC);
+       kmem_cache_priv_t *kcp;
+       kmem_cache_data_t *kcd;
+       int i, j, rc = 0, count = SPLAT_KMEM_OBJ_COUNT * 128;
+
+       kcp = splat_kmem_cache_test_kcp_alloc(file, SPLAT_KMEM_TEST9_NAME,
+                                             256, 0, 0, count);
+       if (!kcp) {
+               splat_vprint(file, SPLAT_KMEM_TEST9_NAME,
+                            "Unable to create '%s'\n", "kcp");
+               return -ENOMEM;
+       }
  
-       objs = vmem_zalloc(count * sizeof(void *), KM_SLEEP);
-       if (!objs) {
-               splat_vprint(kcp->kcp_file, SPLAT_KMEM_TEST8_NAME,
-                            "Unable to alloc objp array for cache '%s'\n",
-                            kcp->kcp_cache->skc_name);
-               rc = -ENOMEM;
-               goto out;
+       kcp->kcp_cache =
+               kmem_cache_create(SPLAT_KMEM_CACHE_NAME, kcp->kcp_size, 0,
+                                 splat_kmem_cache_test_constructor,
+                                 splat_kmem_cache_test_destructor,
+                                 NULL, kcp, NULL, 0);
+       if (!kcp->kcp_cache) {
+               splat_kmem_cache_test_kcp_free(kcp);
+               splat_vprint(file, SPLAT_KMEM_TEST9_NAME,
+                          "Unable to create '%s'\n", SPLAT_KMEM_CACHE_NAME);
+               return -ENOMEM;
         }
  
         for (i = 0; i < count; i++) {
-               objs[i] = kmem_cache_alloc(kcp->kcp_cache, KM_SLEEP);
-               if (!objs[i]) {
-                       splat_vprint(kcp->kcp_file, SPLAT_KMEM_TEST8_NAME,
-                                    "Unable to allocate from cache '%s'\n",
-                                    kcp->kcp_cache->skc_name);
-                       rc = -ENOMEM;
-                       break;
+               kcd = kmem_cache_alloc(kcp->kcp_cache, KM_SLEEP);
+               spin_lock(&kcp->kcp_lock);
+               kcp->kcp_kcd[i] = kcd;
+               spin_unlock(&kcp->kcp_lock);
+               if (!kcd) {
+                       splat_vprint(file, SPLAT_KMEM_TEST9_NAME,
+                                  "Unable to allocate from '%s'\n",
+                                  SPLAT_KMEM_CACHE_NAME);
                 }
         }
  
-       for (i = 0; i < count; i++)
-               if (objs[i])
-                       kmem_cache_free(kcp->kcp_cache, objs[i]);
-
-       vmem_free(objs, count * sizeof(void *));
-out:
         spin_lock(&kcp->kcp_lock);
-       if (!kcp->kcp_rc)
-               kcp->kcp_rc = rc;
-
-       if (--kcp->kcp_threads == 0)
-               wake_up(&kcp->kcp_waitq);
-
+       for (i = 0; i < count; i++)
+               if (kcp->kcp_kcd[i])
+                       kmem_cache_free(kcp->kcp_cache, kcp->kcp_kcd[i]);
         spin_unlock(&kcp->kcp_lock);
  
-        thread_exit();
-}
+       /* We have allocated a large number of objects thus creating a
+        * large number of slabs and then free'd them all.  However since
+        * there should be little memory pressure at the moment those
+        * slabs have not been freed.  What we want to see is the slab
+        * size decrease gradually as it becomes clear they will not be
+        * needed.  This should be achievable in less than a minute;
+        * if it takes longer than this something has gone wrong.
+        */
+       for (i = 0; i < 60; i++) {
+               splat_vprint(file, SPLAT_KMEM_TEST9_NAME,
+                            "%s cache objects %d, slabs %u/%u objs %u/%u mags ",
+                            SPLAT_KMEM_CACHE_NAME, kcp->kcp_count,
+                           (unsigned)kcp->kcp_cache->skc_slab_alloc,
+                           (unsigned)kcp->kcp_cache->skc_slab_total,
+                           (unsigned)kcp->kcp_cache->skc_obj_alloc,
+                           (unsigned)kcp->kcp_cache->skc_obj_total);
+
+               for_each_online_cpu(j)
+                       splat_print(file, "%u/%u ",
+                                    kcp->kcp_cache->skc_mag[j]->skm_avail,
+                                    kcp->kcp_cache->skc_mag[j]->skm_size);
+
+               splat_print(file, "%s\n", "");
+
+               if (kcp->kcp_cache->skc_obj_total == 0)
+                       break;
  
-static int
-splat_kmem_test8_count(kmem_cache_priv_t *kcp, int threads)
-{
-       int ret;
+               set_current_state(TASK_INTERRUPTIBLE);
+               schedule_timeout(HZ);
+       }
  
-       spin_lock(&kcp->kcp_lock);
-       ret = (kcp->kcp_threads == threads);
-       spin_unlock(&kcp->kcp_lock);
+       if (kcp->kcp_cache->skc_obj_total == 0) {
+               splat_vprint(file, SPLAT_KMEM_TEST9_NAME,
+                       "Successfully created %d objects "
+                       "in cache %s and reclaimed them\n",
+                       count, SPLAT_KMEM_CACHE_NAME);
+       } else {
+               splat_vprint(file, SPLAT_KMEM_TEST9_NAME,
+                       "Failed to reclaim %u/%d objects from cache %s\n",
+                       (unsigned)kcp->kcp_cache->skc_obj_total, count,
+                       SPLAT_KMEM_CACHE_NAME);
+               rc = -ENOMEM;
+       }
+
+       kmem_cache_destroy(kcp->kcp_cache);
+       splat_kmem_cache_test_kcp_free(kcp);
  
-       return ret;
+       return rc;
  }
  
-/* This test will always pass and is simply here so I can easily
- * eyeball the slab cache locking overhead to ensure it is reasonable.
+/*
+ * This test creates N threads with a shared kmem cache.  They then all
+ * concurrently allocate and free from the cache to stress the locking and
+ * concurrent cache performance.  If any one test takes longer than 5
+ * seconds to complete it is treated as a failure and may indicate a
+ * performance regression.  On my test system no one test takes more
+ * than 1 second to complete so a 5x slowdown is likely a problem.
   */
  static int
-splat_kmem_test8_sc(struct file *file, void *arg, int size, int count)
+splat_kmem_test10(struct file *file, void *arg)
  {
-       kmem_cache_priv_t kcp;
-       kthread_t *thr;
-       struct timespec start, stop, delta;
-       char cache_name[32];
-       int i, j, rc = 0, threads = 32;
-
-       kcp.kcp_magic = SPLAT_KMEM_TEST_MAGIC;
-       kcp.kcp_file = file;
-
-        splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%-22s  %s", "name",
-                    "time (sec)\tslabs       \tobjs        \thash\n");
-        splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%-22s  %s", "",
-                    "          \ttot/max/calc\ttot/max/calc\n");
-
-       for (i = 1; i <= count; i *= 2) {
-               kcp.kcp_size = size;
-               kcp.kcp_count = 0;
-               kcp.kcp_threads = 0;
-               kcp.kcp_alloc = i;
-               kcp.kcp_rc = 0;
-               spin_lock_init(&kcp.kcp_lock);
-               init_waitqueue_head(&kcp.kcp_waitq);
-
-               (void)snprintf(cache_name, 32, "%s-%d-%d",
-                              SPLAT_KMEM_CACHE_NAME, size, i);
-               kcp.kcp_cache = kmem_cache_create(cache_name, kcp.kcp_size, 0,
-                                         splat_kmem_cache_test_constructor,
-                                         splat_kmem_cache_test_destructor,
-                                         NULL, &kcp, NULL, 0);
-               if (!kcp.kcp_cache) {
-                       splat_vprint(file, SPLAT_KMEM_TEST8_NAME,
-                                    "Unable to create '%s' cache\n",
-                                    SPLAT_KMEM_CACHE_NAME);
-                       rc = -ENOMEM;
-                       break;
-               }
-
-               start = current_kernel_time();
-
-               for (j = 0; j < threads; j++) {
-                       thr = thread_create(NULL, 0, splat_kmem_test8_thread,
-                                           &kcp, 0, &p0, TS_RUN, minclsyspri);
-                       if (thr == NULL) {
-                               rc = -ESRCH;
-                               break;
-                       }
-                       spin_lock(&kcp.kcp_lock);
-                       kcp.kcp_threads++;
-                       spin_unlock(&kcp.kcp_lock);
-               }
+       uint64_t size, alloc, free_mem, rc = 0;
  
-               /* Sleep until the thread sets kcp.kcp_threads == 0 */
-               wait_event(kcp.kcp_waitq, splat_kmem_test8_count(&kcp, 0));
-               stop = current_kernel_time();
-               delta = timespec_sub(stop, start);
+       free_mem = nr_free_pages() * PAGE_SIZE;
+       for (size = 16; size <= 1024*1024; size *= 2) {
  
-               splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%-22s %2ld.%09ld\t"
-                            "%lu/%lu/%lu\t%lu/%lu/%lu\n",
-                            kcp.kcp_cache->skc_name,
-                            delta.tv_sec, delta.tv_nsec,
-                            (unsigned long)kcp.kcp_cache->skc_slab_total,
-                            (unsigned long)kcp.kcp_cache->skc_slab_max,
-                            (unsigned long)(kcp.kcp_alloc * threads /
-                                           SPL_KMEM_CACHE_OBJ_PER_SLAB),
-                            (unsigned long)kcp.kcp_cache->skc_obj_total,
-                            (unsigned long)kcp.kcp_cache->skc_obj_max,
-                            (unsigned long)(kcp.kcp_alloc * threads));
+               splat_vprint(file, SPLAT_KMEM_TEST10_NAME, "%-22s  %s", "name",
+                            "time (sec)\tslabs       \tobjs    \thash\n");
+               splat_vprint(file, SPLAT_KMEM_TEST10_NAME, "%-22s  %s", "",
+                            "    \ttot/max/calc\ttot/max/calc\n");
  
-               kmem_cache_destroy(kcp.kcp_cache);
+               for (alloc = 1; alloc <= 1024; alloc *= 2) {
  
-               if (!rc && kcp.kcp_rc)
-                       rc = kcp.kcp_rc;
+                       /* Skip tests which exceed free memory */
+                       if (size * alloc * SPLAT_KMEM_THREADS > free_mem / 2)
+                               continue;
  
-               if (rc)
-                       break;
+                       rc = splat_kmem_cache_thread_test(file, arg,
+                               SPLAT_KMEM_TEST10_NAME, size, alloc);
+                       if (rc)
+                               break;
+               }
         }
  
         return rc;
  }
  
+/*
+ * This test creates N threads with a shared kmem cache which overcommits
+ * memory by 4x.  This makes it impossible for the slab to satisfy the
+ * thread requirements without having its reclaim hook run which will
+ * free objects back for use.  This behavior is triggered by the Linux VM
+ * detecting a low memory condition on the node and invoking the shrinkers.
+ * This should allow all the threads to complete while avoiding deadlock
+ * and for the most part out of memory events.  This is very tough on the
+ * system so it is possible the test app may get oom'ed.
+ */
  static int
-splat_kmem_test8(struct file *file, void *arg)
+splat_kmem_test11(struct file *file, void *arg)
  {
-       int i, rc = 0;
+       uint64_t size, alloc, rc;
  
-       /* Run through slab cache with objects size from
-        * 16-1Mb in 4x multiples with 1024 objects each */
-       for (i = 16; i <= 1024*1024; i *= 4) {
-               rc = splat_kmem_test8_sc(file, arg, i, 256);
-               if (rc)
-                       break;
-       }
-
-       return rc;
-}
+       size = 1024*1024;
+       alloc = ((4 * num_physpages * PAGE_SIZE) / size) / SPLAT_KMEM_THREADS;
  
-/* Validate object alignment cache behavior for caches */
-static int
-splat_kmem_test9(struct file *file, void *arg)
-{
-       char *name = SPLAT_KMEM_TEST9_NAME;
-       int i, rc;
+       splat_vprint(file, SPLAT_KMEM_TEST10_NAME, "%-22s  %s", "name",
+                    "time (sec)\tslabs       \tobjs    \thash\n");
+       splat_vprint(file, SPLAT_KMEM_TEST10_NAME, "%-22s  %s", "",
+                    "    \ttot/max/calc\ttot/max/calc\n");
  
-       for (i = 8; i <= PAGE_SIZE; i *= 2) {
-               rc = splat_kmem_cache_test(file, arg, name, 157, i, 0);
-               if (rc)
-                       return rc;
-       }
+       rc = splat_kmem_cache_thread_test(file, arg,
+               SPLAT_KMEM_TEST11_NAME, size, alloc);
  
         return rc;
  }
@@ -701,60 +1030,66 @@ splat_kmem_test9(struct file *file, void *arg)
  splat_subsystem_t *
  splat_kmem_init(void)
  {
-        splat_subsystem_t *sub;
+       splat_subsystem_t *sub;
  
-        sub = kmalloc(sizeof(*sub), GFP_KERNEL);
-        if (sub == NULL)
-                return NULL;
+       sub = kmalloc(sizeof(*sub), GFP_KERNEL);
+       if (sub == NULL)
+               return NULL;
  
-        memset(sub, 0, sizeof(*sub));
-        strncpy(sub->desc.name, SPLAT_KMEM_NAME, SPLAT_NAME_SIZE);
+       memset(sub, 0, sizeof(*sub));
+       strncpy(sub->desc.name, SPLAT_KMEM_NAME, SPLAT_NAME_SIZE);
         strncpy(sub->desc.desc, SPLAT_KMEM_DESC, SPLAT_DESC_SIZE);
-        INIT_LIST_HEAD(&sub->subsystem_list);
+       INIT_LIST_HEAD(&sub->subsystem_list);
         INIT_LIST_HEAD(&sub->test_list);
-        spin_lock_init(&sub->test_lock);
-        sub->desc.id = SPLAT_SUBSYSTEM_KMEM;
-
-        SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST1_NAME, SPLAT_KMEM_TEST1_DESC,
-                     SPLAT_KMEM_TEST1_ID, splat_kmem_test1);
-        SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST2_NAME, SPLAT_KMEM_TEST2_DESC,
-                     SPLAT_KMEM_TEST2_ID, splat_kmem_test2);
-        SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST3_NAME, SPLAT_KMEM_TEST3_DESC,
-                     SPLAT_KMEM_TEST3_ID, splat_kmem_test3);
-        SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST4_NAME, SPLAT_KMEM_TEST4_DESC,
-                     SPLAT_KMEM_TEST4_ID, splat_kmem_test4);
-        SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST5_NAME, SPLAT_KMEM_TEST5_DESC,
-                     SPLAT_KMEM_TEST5_ID, splat_kmem_test5);
-        SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST6_NAME, SPLAT_KMEM_TEST6_DESC,
-                     SPLAT_KMEM_TEST6_ID, splat_kmem_test6);
-        SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST7_NAME, SPLAT_KMEM_TEST7_DESC,
-                     SPLAT_KMEM_TEST7_ID, splat_kmem_test7);
-        SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST8_NAME, SPLAT_KMEM_TEST8_DESC,
-                     SPLAT_KMEM_TEST8_ID, splat_kmem_test8);
-        SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST9_NAME, SPLAT_KMEM_TEST9_DESC,
-                     SPLAT_KMEM_TEST9_ID, splat_kmem_test9);
-
-        return sub;
+       spin_lock_init(&sub->test_lock);
+       sub->desc.id = SPLAT_SUBSYSTEM_KMEM;
+
+       SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST1_NAME, SPLAT_KMEM_TEST1_DESC,
+                       SPLAT_KMEM_TEST1_ID, splat_kmem_test1);
+       SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST2_NAME, SPLAT_KMEM_TEST2_DESC,
+                       SPLAT_KMEM_TEST2_ID, splat_kmem_test2);
+       SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST3_NAME, SPLAT_KMEM_TEST3_DESC,
+                       SPLAT_KMEM_TEST3_ID, splat_kmem_test3);
+       SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST4_NAME, SPLAT_KMEM_TEST4_DESC,
+                       SPLAT_KMEM_TEST4_ID, splat_kmem_test4);
+       SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST5_NAME, SPLAT_KMEM_TEST5_DESC,
+                       SPLAT_KMEM_TEST5_ID, splat_kmem_test5);
+       SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST6_NAME, SPLAT_KMEM_TEST6_DESC,
+                       SPLAT_KMEM_TEST6_ID, splat_kmem_test6);
+       SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST7_NAME, SPLAT_KMEM_TEST7_DESC,
+                       SPLAT_KMEM_TEST7_ID, splat_kmem_test7);
+       SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST8_NAME, SPLAT_KMEM_TEST8_DESC,
+                       SPLAT_KMEM_TEST8_ID, splat_kmem_test8);
+       SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST9_NAME, SPLAT_KMEM_TEST9_DESC,
+                       SPLAT_KMEM_TEST9_ID, splat_kmem_test9);
+       SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST10_NAME, SPLAT_KMEM_TEST10_DESC,
+                       SPLAT_KMEM_TEST10_ID, splat_kmem_test10);
+       SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST11_NAME, SPLAT_KMEM_TEST11_DESC,
+                       SPLAT_KMEM_TEST11_ID, splat_kmem_test11);
+
+       return sub;
  }
  
  void
  splat_kmem_fini(splat_subsystem_t *sub)
  {
-        ASSERT(sub);
-        SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST9_ID);
-        SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST8_ID);
-        SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST7_ID);
-        SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST6_ID);
-        SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST5_ID);
-        SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST4_ID);
-        SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST3_ID);
-        SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST2_ID);
-        SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST1_ID);
-
-        kfree(sub);
+       ASSERT(sub);
+       SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST11_ID);
+       SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST10_ID);
+       SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST9_ID);
+       SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST8_ID);
+       SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST7_ID);
+       SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST6_ID);
+       SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST5_ID);
+       SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST4_ID);
+       SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST3_ID);
+       SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST2_ID);
+       SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST1_ID);
+
+       kfree(sub);
  }
  
  int
  splat_kmem_id(void) {
-        return SPLAT_SUBSYSTEM_KMEM;
+       return SPLAT_SUBSYSTEM_KMEM;
  }
author	Brian Behlendorf <behlendorf1@llnl.gov>
	Sat, 31 Jan 2009 04:54:49 +0000 (20:54 -0800)
committer	Brian Behlendorf <behlendorf1@llnl.gov>
	Sat, 31 Jan 2009 04:54:49 +0000 (20:54 -0800)
include/sys/kmem.h		patch \| blob \| blame \| history
include/sys/sysmacros.h		patch \| blob \| blame \| history
include/sys/vmsystm.h		patch \| blob \| blame \| history
module/spl/spl-kmem.c		patch \| blob \| blame \| history
module/splat/splat-internal.h		patch \| blob \| blame \| history
module/splat/splat-kmem.c		patch \| blob \| blame \| history