Emergency slab objects

author Brian Behlendorf <behlendorf1@llnl.gov>

Tue, 7 Aug 2012 23:59:50 +0000 (16:59 -0700)

committer Brian Behlendorf <behlendorf1@llnl.gov>

Mon, 27 Aug 2012 19:00:42 +0000 (12:00 -0700)
author Brian Behlendorf <behlendorf1@llnl.gov>
Tue, 7 Aug 2012 23:59:50 +0000 (16:59 -0700)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Mon, 27 Aug 2012 19:00:42 +0000 (12:00 -0700)
diff --git a/include/sys/kmem.h b/include/sys/kmem.h

index 344e2716b1588227093e84f431858dad3704016c..aaff6d046d36903474b43d72c127b22ab35f073d 100644 (file)
--- a/include/sys/kmem.h
+++ b/include/sys/kmem.h
@@ -291,6 +291,7 @@ enum {
         KMC_BIT_KMEM            = 5,    /* Use kmem cache */
         KMC_BIT_VMEM            = 6,    /* Use vmem cache */
         KMC_BIT_OFFSLAB         = 7,    /* Objects not on slab */
+       KMC_BIT_GROWING         = 15,   /* Growing in progress */
         KMC_BIT_REAPING         = 16,   /* Reaping in progress */
         KMC_BIT_DESTROY         = 17,   /* Destroy in progress */
         KMC_BIT_TOTAL           = 18,   /* Proc handler helper bit */
@@ -315,6 +316,7 @@ typedef enum kmem_cbrc {
  #define KMC_KMEM               (1 << KMC_BIT_KMEM)
  #define KMC_VMEM               (1 << KMC_BIT_VMEM)
  #define KMC_OFFSLAB            (1 << KMC_BIT_OFFSLAB)
+#define KMC_GROWING            (1 << KMC_BIT_GROWING)
  #define KMC_REAPING            (1 << KMC_BIT_REAPING)
  #define KMC_DESTROY            (1 << KMC_BIT_DESTROY)
  #define KMC_TOTAL              (1 << KMC_BIT_TOTAL)
@@ -374,6 +376,17 @@ typedef struct spl_kmem_slab {
         uint32_t                sks_ref;        /* Ref count used objects */
  } spl_kmem_slab_t;
  
+typedef struct spl_kmem_alloc {
+       struct spl_kmem_cache   *ska_cache;     /* Owned by cache */
+       int                     ska_flags;      /* Allocation flags */
+       struct delayed_work     ska_work;       /* Allocation work */
+} spl_kmem_alloc_t;
+
+typedef struct spl_kmem_emergency {
+       void                    *ske_obj;       /* Buffer address */
+       struct list_head        ske_list;       /* Emergency list linkage */
+} spl_kmem_emergency_t;
+
  typedef struct spl_kmem_cache {
         uint32_t                skc_magic;      /* Sanity magic */
         uint32_t                skc_name_size;  /* Name length */
@@ -398,7 +411,9 @@ typedef struct spl_kmem_cache {
         struct list_head        skc_list;       /* List of caches linkage */
         struct list_head        skc_complete_list;/* Completely alloc'ed */
         struct list_head        skc_partial_list; /* Partially alloc'ed */
+       struct list_head        skc_emergency_list; /* Min sized objects */
         spinlock_t              skc_lock;       /* Cache lock */
+       wait_queue_head_t       skc_waitq;      /* Allocation waiters */
         uint64_t                skc_slab_fail;  /* Slab alloc failures */
         uint64_t                skc_slab_create;/* Slab creates */
         uint64_t                skc_slab_destroy;/* Slab destroys */
@@ -408,6 +423,8 @@ typedef struct spl_kmem_cache {
         uint64_t                skc_obj_total;  /* Obj total current */
         uint64_t                skc_obj_alloc;  /* Obj alloc current */
         uint64_t                skc_obj_max;    /* Obj max historic */
+       uint64_t                skc_obj_emergency; /* Obj emergency current */
+       uint64_t                skc_obj_emergency_max; /* Obj emergency max */
  } spl_kmem_cache_t;
  #define kmem_cache_t           spl_kmem_cache_t
  
diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c

index 258d61478cc725e219b75734b86bce8304cf5a2f..4cf3b26ad0be4b0e5863dc1d547b63ffa89339a2 100644 (file)
--- a/module/spl/spl-kmem.c
+++ b/module/spl/spl-kmem.c
@@ -1143,6 +1143,86 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
         SEXIT;
  }
  
+/*
+ * Allocate a single emergency object for use by the caller.
+ */
+static int
+spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
+{
+       spl_kmem_emergency_t *ske;
+       int empty;
+       SENTRY;
+
+       /* Last chance: use a partial slab if one now exists */
+       spin_lock(&skc->skc_lock);
+       empty = list_empty(&skc->skc_partial_list);
+       spin_unlock(&skc->skc_lock);
+       if (!empty)
+               SRETURN(-EEXIST);
+
+       ske = kmalloc(sizeof(*ske), flags);
+       if (ske == NULL)
+               SRETURN(-ENOMEM);
+
+       ske->ske_obj = kmalloc(skc->skc_obj_size, flags);
+       if (ske->ske_obj == NULL) {
+               kfree(ske);
+               SRETURN(-ENOMEM);
+       }
+
+       if (skc->skc_ctor)
+               skc->skc_ctor(ske->ske_obj, skc->skc_private, flags);
+
+       spin_lock(&skc->skc_lock);
+       skc->skc_obj_total++;
+       skc->skc_obj_emergency++;
+       if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
+               skc->skc_obj_emergency_max = skc->skc_obj_emergency;
+
+       list_add(&ske->ske_list, &skc->skc_emergency_list);
+       spin_unlock(&skc->skc_lock);
+
+       *obj = ske->ske_obj;
+
+       SRETURN(0);
+}
+
+/*
+ * Free the passed object if it is an emergency object, otherwise return
+ * -ENOENT so the caller frees it as a normal slab object.  Currently this is
+ * done by walking what should be a short list of emergency objects.  If this
+ * proves to be too inefficient we can replace the simple list with a hash.
+ */
+static int
+spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
+{
+       spl_kmem_emergency_t *m, *n, *ske = NULL;
+       SENTRY;
+
+       spin_lock(&skc->skc_lock);
+       list_for_each_entry_safe(m, n, &skc->skc_emergency_list, ske_list) {
+               if (m->ske_obj == obj) {
+                       list_del(&m->ske_list);
+                       skc->skc_obj_emergency--;
+                       skc->skc_obj_total--;
+                       ske = m;
+                       break;
+               }
+       }
+       spin_unlock(&skc->skc_lock);
+
+       if (ske == NULL)
+               SRETURN(-ENOENT);
+
+       if (skc->skc_dtor)
+               skc->skc_dtor(ske->ske_obj, skc->skc_private);
+
+       kfree(ske->ske_obj);
+       kfree(ske);
+
+       SRETURN(0);
+}
+
  /*
   * Called regularly on all caches to age objects out of the magazines
   * which have not been access in skc->skc_delay seconds.  This prevents
@@ -1430,7 +1510,9 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
         INIT_LIST_HEAD(&skc->skc_list);
         INIT_LIST_HEAD(&skc->skc_complete_list);
         INIT_LIST_HEAD(&skc->skc_partial_list);
+       INIT_LIST_HEAD(&skc->skc_emergency_list);
         spin_lock_init(&skc->skc_lock);
+       init_waitqueue_head(&skc->skc_waitq);
         skc->skc_slab_fail = 0;
         skc->skc_slab_create = 0;
         skc->skc_slab_destroy = 0;
@@ -1440,6 +1522,8 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
         skc->skc_obj_total = 0;
         skc->skc_obj_alloc = 0;
         skc->skc_obj_max = 0;
+       skc->skc_obj_emergency = 0;
+       skc->skc_obj_emergency_max = 0;
  
         if (align) {
                 VERIFY(ISP2(align));
@@ -1530,7 +1614,9 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
         ASSERT3U(skc->skc_obj_alloc, ==, 0);
         ASSERT3U(skc->skc_slab_total, ==, 0);
         ASSERT3U(skc->skc_obj_total, ==, 0);
+       ASSERT3U(skc->skc_obj_emergency, ==, 0);
         ASSERT(list_empty(&skc->skc_complete_list));
+       ASSERT(list_empty(&skc->skc_emergency_list));
  
         kmem_free(skc->skc_name, skc->skc_name_size);
         spin_unlock(&skc->skc_lock);
@@ -1581,59 +1667,112 @@ spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
  }
  
  /*
- * No available objects on any slabs, create a new slab.  Since this
- * is an expensive operation we do it without holding the spin lock and
- * only briefly acquire it when we link in the fully allocated and
- * constructed slab.
+ * Generic slab allocation function to be run by the global work queues.
+ * It is responsible for allocating a new slab, linking it in to the list
+ * of partial slabs, and then waking any waiters.
   */
-static spl_kmem_slab_t *
-spl_cache_grow(spl_kmem_cache_t *skc, int flags)
+static void
+spl_cache_grow_work(void *data)
  {
+       spl_kmem_alloc_t *ska =
+               spl_get_work_data(data, spl_kmem_alloc_t, ska_work.work);
+       spl_kmem_cache_t *skc = ska->ska_cache;
         spl_kmem_slab_t *sks;
+
+       sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG);
+       spin_lock(&skc->skc_lock);
+       if (sks) {
+               skc->skc_slab_total++;
+               skc->skc_obj_total += sks->sks_objs;
+               list_add_tail(&sks->sks_list, &skc->skc_partial_list);
+       }
+
+       atomic_dec(&skc->skc_ref);
+       clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
+       wake_up_all(&skc->skc_waitq);
+       spin_unlock(&skc->skc_lock);
+
+       kfree(ska);
+}
+
+/*
+ * Returns non-zero when a new slab should be available.
+ */
+static int
+spl_cache_grow_wait(spl_kmem_cache_t *skc)
+{
+       return !test_bit(KMC_BIT_GROWING, &skc->skc_flags);
+}
+
+/*
+ * No available objects on any slabs, create a new slab.
+ */
+static int
+spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
+{
+       int remaining, rc = 0;
         SENTRY;
  
         ASSERT(skc->skc_magic == SKC_MAGIC);
-       local_irq_enable();
         might_sleep();
+       *obj = NULL;
  
         /*
          * Before allocating a new slab check if the slab is being reaped.
          * If it is there is a good chance we can wait until it finishes
          * and then use one of the newly freed but not aged-out slabs.
          */
-       if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
-               schedule();
-               SGOTO(out, sks= NULL);
-       }
+       if (test_bit(KMC_BIT_REAPING, &skc->skc_flags))
+               SRETURN(-EAGAIN);
  
-       /* Allocate a new slab for the cache */
-       sks = spl_slab_alloc(skc, flags | __GFP_NORETRY | KM_NODEBUG);
-       if (sks == NULL)
-               SGOTO(out, sks = NULL);
+       /*
+        * This is handled by dispatching a work request to the global work
+        * queue.  This allows us to asynchronously allocate a new slab while
+        * retaining the ability to safely fall back to smaller synchronous
+        * allocations to ensure forward progress is always maintained.
+        */
+       if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
+               spl_kmem_alloc_t *ska;
  
-       /* Link the new empty slab in to the end of skc_partial_list. */
-       spin_lock(&skc->skc_lock);
-       skc->skc_slab_total++;
-       skc->skc_obj_total += sks->sks_objs;
-       list_add_tail(&sks->sks_list, &skc->skc_partial_list);
-       spin_unlock(&skc->skc_lock);
-out:
-       local_irq_disable();
+               ska = kmalloc(sizeof(*ska), flags);
+               if (ska == NULL) {
+                       clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
+                       wake_up_all(&skc->skc_waitq);
+                       SRETURN(-ENOMEM);
+               }
  
-       SRETURN(sks);
+               atomic_inc(&skc->skc_ref);
+               ska->ska_cache = skc;
+               ska->ska_flags = flags;
+               spl_init_delayed_work(&ska->ska_work, spl_cache_grow_work, ska);
+               schedule_delayed_work(&ska->ska_work, 0);
+       }
+
+       /*
+        * Allow a single timer tick before falling back to synchronously
+        * allocating the minimum amount of memory required by the caller.
+        */
+       remaining = wait_event_timeout(skc->skc_waitq,
+                                      spl_cache_grow_wait(skc), 1);
+       if (remaining == 0)
+               rc = spl_emergency_alloc(skc, flags, obj);
+
+       SRETURN(rc);
  }
  
  /*
- * Refill a per-cpu magazine with objects from the slabs for this
- * cache.  Ideally the magazine can be repopulated using existing
- * objects which have been released, however if we are unable to
- * locate enough free objects new slabs of objects will be created.
+ * Refill a per-cpu magazine with objects from the slabs for this cache.
+ * Ideally the magazine can be repopulated using existing objects which have
+ * been released, however if we are unable to locate enough free objects new
+ * slabs of objects will be created.  On success NULL is returned, otherwise
+ * the address of a single emergency object is returned for use by the caller.
   */
-static int
+static void *
  spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
  {
         spl_kmem_slab_t *sks;
-       int rc = 0, refill;
+       int count = 0, rc, refill;
+       void *obj = NULL;
         SENTRY;
  
         ASSERT(skc->skc_magic == SKC_MAGIC);
@@ -1647,8 +1786,15 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
                 if (list_empty(&skc->skc_partial_list)) {
                         spin_unlock(&skc->skc_lock);
  
-                       sks = spl_cache_grow(skc, flags);
-                       if (!sks)
+                       local_irq_enable();
+                       rc = spl_cache_grow(skc, flags, &obj);
+                       local_irq_disable();
+
+                       /* Emergency object for immediate use by caller */
+                       if (rc == 0 && obj != NULL)
+                               SRETURN(obj);
+
+                       if (rc)
                                 SGOTO(out, rc);
  
                         /* Rescheduled to different CPU skm is not local */
@@ -1673,9 +1819,9 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
  
                 /* Consume as many objects as needed to refill the requested
                  * cache.  We must also be careful not to overfill it. */
-               while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++rc) {
+               while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) {
                         ASSERT(skm->skm_avail < skm->skm_size);
-                       ASSERT(rc < skm->skm_size);
+                       ASSERT(count < skm->skm_size);
                         skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks);
                 }
  
@@ -1688,8 +1834,7 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
  
         spin_unlock(&skc->skc_lock);
  out:
-       /* Returns the number of entries added to cache */
-       SRETURN(rc);
+       SRETURN(NULL);
  }
  
  /*
@@ -1804,10 +1949,9 @@ restart:
                 obj = skm->skm_objs[--skm->skm_avail];
                 skm->skm_age = jiffies;
         } else {
-               /* Per-CPU cache empty, directly allocate from
-                * the slab and refill the per-CPU cache. */
-               (void)spl_cache_refill(skc, skm, flags);
-               SGOTO(restart, obj = NULL);
+               obj = spl_cache_refill(skc, skm, flags);
+               if (obj == NULL)
+                       SGOTO(restart, obj = NULL);
         }
  
         local_irq_restore(irq_flags);
@@ -1838,6 +1982,14 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
         ASSERT(skc->skc_magic == SKC_MAGIC);
         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
         atomic_inc(&skc->skc_ref);
+
+       /*
+        * Emergency objects are never part of the virtual address space
+        * so if we get a virtual address we can optimize this check out.
+        */
+       if (!kmem_virt(obj) && !spl_emergency_free(skc, obj))
+               SGOTO(out, 0);
+
         local_irq_save(flags);
  
         /* Safe to update per-cpu structure without lock, but
@@ -1855,6 +2007,7 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
         skm->skm_objs[skm->skm_avail++] = obj;
  
         local_irq_restore(flags);
+out:
         atomic_dec(&skc->skc_ref);
  
         SEXIT;
diff --git a/module/spl/spl-proc.c b/module/spl/spl-proc.c

index 8149143ae247e02ffb9dc84eb3f375caac3cdfb0..11a2d1068e0e335e30bbfbbbd398590833ca34ad 100644 (file)
--- a/module/spl/spl-proc.c
+++ b/module/spl/spl-proc.c
@@ -625,12 +625,12 @@ slab_seq_show_headers(struct seq_file *f)
              "--------------------- cache ----------"
              "---------------------------------------------  "
              "----- slab ------  "
-            "---- object -----\n");
+            "---- object -----------------\n");
          seq_printf(f,
              "name                                  "
              "  flags      size     alloc slabsize  objsize  "
              "total alloc   max  "
-            "total alloc   max\n");
+            "total alloc   max emerg   max\n");
  }
  
  static int
@@ -643,7 +643,7 @@ slab_seq_show(struct seq_file *f, void *p)
          spin_lock(&skc->skc_lock);
          seq_printf(f, "%-36s  ", skc->skc_name);
          seq_printf(f, "0x%05lx %9lu %9lu %8u %8u  "
-            "%5lu %5lu %5lu  %5lu %5lu %5lu\n",
+            "%5lu %5lu %5lu  %5lu %5lu %5lu %5lu %5lu\n",
              (long unsigned)skc->skc_flags,
              (long unsigned)(skc->skc_slab_size * skc->skc_slab_total),
              (long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc),
@@ -654,7 +654,9 @@ slab_seq_show(struct seq_file *f, void *p)
              (long unsigned)skc->skc_slab_max,
              (long unsigned)skc->skc_obj_total,
              (long unsigned)skc->skc_obj_alloc,
-            (long unsigned)skc->skc_obj_max);
+            (long unsigned)skc->skc_obj_max,
+            (long unsigned)skc->skc_obj_emergency,
+            (long unsigned)skc->skc_obj_emergency_max);
  
          spin_unlock(&skc->skc_lock);
author	Brian Behlendorf <behlendorf1@llnl.gov>
	Tue, 7 Aug 2012 23:59:50 +0000 (16:59 -0700)
committer	Brian Behlendorf <behlendorf1@llnl.gov>
	Mon, 27 Aug 2012 19:00:42 +0000 (12:00 -0700)
include/sys/kmem.h		patch \| blob \| blame \| history
module/spl/spl-kmem.c		patch \| blob \| blame \| history
module/spl/spl-proc.c		patch \| blob \| blame \| history