Add support for recent kmem_cache_create_usercopy
diff --git a/module/spl/spl-kmem-cache.c b/module/spl/spl-kmem-cache.c
index 809ac5cc51174c9c93a7ada10b7f68a64398eafe..45576b9761e72863495bd6c5e8259b5d85bcec36 100644
--- a/module/spl/spl-kmem-cache.c
+++ b/module/spl/spl-kmem-cache.c
@@ -31,6 +31,7 @@
 #include <linux/swap.h>
 #include <linux/mm_compat.h>
 #include <linux/wait_compat.h>
+#include <linux/prefetch.h>
 
 /*
  * Within the scope of spl-kmem.c file the kmem_cache_* definitions
@@ -87,7 +88,7 @@ MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)");
 unsigned int spl_kmem_cache_magazine_size = 0;
 module_param(spl_kmem_cache_magazine_size, uint, 0444);
 MODULE_PARM_DESC(spl_kmem_cache_magazine_size,
-       "Default magazine size (2-256), set automatically (0)\n");
+       "Default magazine size (2-256), set automatically (0)");
 
 /*
  * The default behavior is to report the number of objects remaining in the
@@ -109,7 +110,7 @@ module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644);
 MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min,
        "Minimal number of objects per slab");
 
-unsigned int spl_kmem_cache_max_size = 32;
+unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE;
 module_param(spl_kmem_cache_max_size, uint, 0644);
 MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
 
@@ -128,11 +129,26 @@ module_param(spl_kmem_cache_slab_limit, uint, 0644);
 MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
        "Objects less than N bytes use the Linux slab");
 
-unsigned int spl_kmem_cache_kmem_limit = (PAGE_SIZE / 4);
+/*
+ * This value defaults to a threshold designed to avoid allocations which
+ * have been deemed costly by the kernel.
+ */
+unsigned int spl_kmem_cache_kmem_limit =
+    ((1 << (PAGE_ALLOC_COSTLY_ORDER - 1)) * PAGE_SIZE) /
+    SPL_KMEM_CACHE_OBJ_PER_SLAB;
 module_param(spl_kmem_cache_kmem_limit, uint, 0644);
 MODULE_PARM_DESC(spl_kmem_cache_kmem_limit,
        "Objects less than N bytes use the kmalloc");
 
+/*
+ * The number of threads available to allocate new slabs for caches.  This
+ * should not need to be tuned but it is available for performance analysis.
+ */
+unsigned int spl_kmem_cache_kmem_threads = 4;
+module_param(spl_kmem_cache_kmem_threads, uint, 0444);
+MODULE_PARM_DESC(spl_kmem_cache_kmem_threads,
+       "Number of spl_kmem_cache threads");
+
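For a sense of scale, the new default works out as follows; the constants below (PAGE_ALLOC_COSTLY_ORDER = 3, PAGE_SIZE = 4096, SPL_KMEM_CACHE_OBJ_PER_SLAB = 8) are illustrative assumptions for a typical x86_64 build, not values taken from this patch:

        /*
         * Illustrative only, assuming PAGE_ALLOC_COSTLY_ORDER = 3,
         * PAGE_SIZE = 4096 and SPL_KMEM_CACHE_OBJ_PER_SLAB = 8:
         *
         *   spl_kmem_cache_kmem_limit = ((1 << (3 - 1)) * 4096) / 8 = 2048
         *
         * A full slab of 8 such objects then needs at most 16 KiB, an
         * order-2 allocation, which stays below PAGE_ALLOC_COSTLY_ORDER
         * and so avoids the page allocator's "costly" reclaim behavior.
         */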
 /*
  * Slab allocation interfaces
  *
@@ -181,12 +197,12 @@ kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
        gfp_t lflags = kmem_flags_convert(flags);
        void *ptr;
 
-       ASSERT(ISP2(size));
-
-       if (skc->skc_flags & KMC_KMEM)
+       if (skc->skc_flags & KMC_KMEM) {
+               ASSERT(ISP2(size));
                ptr = (void *)__get_free_pages(lflags, get_order(size));
-       else
-               ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL);
+       } else {
+               ptr = __vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL);
+       }
 
        /* Resulting allocated memory will be page aligned */
        ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
@@ -198,7 +214,6 @@ static void
 kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
 {
        ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
-       ASSERT(ISP2(size));
 
        /*
         * The Linux direct reclaim path uses this out of band value to
@@ -210,10 +225,12 @@ kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
        if (current->reclaim_state)
                current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
 
-       if (skc->skc_flags & KMC_KMEM)
+       if (skc->skc_flags & KMC_KMEM) {
+               ASSERT(ISP2(size));
                free_pages((unsigned long)ptr, get_order(size));
-       else
+       } else {
                vfree(ptr);
+       }
 }
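The relocated ISP2() assertions reflect how the two backends size their buffers; a minimal illustration of the get_order() rounding that only matters on the KMC_KMEM path (4 KiB pages assumed):

        /*
         * Illustrative only, with 4 KiB pages:
         *
         *   get_order(12 * 1024) == 2, so __get_free_pages() hands back
         *   4 pages (16 KiB) for a 12 KiB request and 4 KiB is wasted
         *   unless the requested size is a power of two.
         *
         * __vmalloc()/vfree() buffers are assembled from individual pages
         * and have no such constraint, which is why the ISP2(size) check
         * is now applied only when KMC_KMEM is set.
         */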
 
 /*
@@ -443,9 +460,9 @@ spl_emergency_search(struct rb_root *root, void *obj)
        while (node) {
                ske = container_of(node, spl_kmem_emergency_t, ske_node);
 
-               if (address < (unsigned long)ske->ske_obj)
+               if (address < ske->ske_obj)
                        node = node->rb_left;
-               else if (address > (unsigned long)ske->ske_obj)
+               else if (address > ske->ske_obj)
                        node = node->rb_right;
                else
                        return (ske);
@@ -459,15 +476,15 @@ spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
 {
        struct rb_node **new = &(root->rb_node), *parent = NULL;
        spl_kmem_emergency_t *ske_tmp;
-       unsigned long address = (unsigned long)ske->ske_obj;
+       unsigned long address = ske->ske_obj;
 
        while (*new) {
                ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
 
                parent = *new;
-               if (address < (unsigned long)ske_tmp->ske_obj)
+               if (address < ske_tmp->ske_obj)
                        new = &((*new)->rb_left);
-               else if (address > (unsigned long)ske_tmp->ske_obj)
+               else if (address > ske_tmp->ske_obj)
                        new = &((*new)->rb_right);
                else
                        return (0);
@@ -487,6 +504,7 @@ spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
 {
        gfp_t lflags = kmem_flags_convert(flags);
        spl_kmem_emergency_t *ske;
+       int order = get_order(skc->skc_obj_size);
        int empty;
 
        /* Last chance use a partial slab if one now exists */
@@ -500,8 +518,8 @@ spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
        if (ske == NULL)
                return (-ENOMEM);
 
-       ske->ske_obj = kmalloc(skc->skc_obj_size, lflags);
-       if (ske->ske_obj == NULL) {
+       ske->ske_obj = __get_free_pages(lflags, order);
+       if (ske->ske_obj == 0) {
                kfree(ske);
                return (-ENOMEM);
        }
@@ -517,12 +535,12 @@ spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
        spin_unlock(&skc->skc_lock);
 
        if (unlikely(!empty)) {
-               kfree(ske->ske_obj);
+               free_pages(ske->ske_obj, order);
                kfree(ske);
                return (-EINVAL);
        }
 
-       *obj = ske->ske_obj;
+       *obj = (void *)ske->ske_obj;
 
        return (0);
 }
@@ -534,20 +552,21 @@ static int
 spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
 {
        spl_kmem_emergency_t *ske;
+       int order = get_order(skc->skc_obj_size);
 
        spin_lock(&skc->skc_lock);
        ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
-       if (likely(ske)) {
+       if (ske) {
                rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
                skc->skc_obj_emergency--;
                skc->skc_obj_total--;
        }
        spin_unlock(&skc->skc_lock);
 
-       if (unlikely(ske == NULL))
+       if (ske == NULL)
                return (-ENOENT);
 
-       kfree(ske->ske_obj);
+       free_pages(ske->ske_obj, order);
        kfree(ske);
 
        return (0);
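The cast-free assignment in spl_emergency_insert() and the switch to __get_free_pages()/free_pages() indicate that ske_obj is now stored as an unsigned long rather than a pointer. A minimal sketch of the implied structure (the field comments are mine, not from the patch):

        typedef struct spl_kmem_emergency {
                struct rb_node  ske_node;       /* RB-tree linkage, keyed by address */
                unsigned long   ske_obj;        /* page-backed buffer from __get_free_pages() */
        } spl_kmem_emergency_t;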
@@ -668,40 +687,48 @@ spl_cache_age(void *data)
 static int
 spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
 {
-       uint32_t sks_size, obj_size, max_size;
+       uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs;
 
        if (skc->skc_flags & KMC_OFFSLAB) {
-               *objs = spl_kmem_cache_obj_per_slab;
-               *size = P2ROUNDUP(sizeof (spl_kmem_slab_t), PAGE_SIZE);
-               return (0);
+               tgt_objs = spl_kmem_cache_obj_per_slab;
+               tgt_size = P2ROUNDUP(sizeof (spl_kmem_slab_t), PAGE_SIZE);
+
+               if ((skc->skc_flags & KMC_KMEM) &&
+                   (spl_obj_size(skc) > (SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE)))
+                       return (-ENOSPC);
        } else {
                sks_size = spl_sks_size(skc);
                obj_size = spl_obj_size(skc);
-
-               if (skc->skc_flags & KMC_KMEM)
-                       max_size = ((uint32_t)1 << (MAX_ORDER-3)) * PAGE_SIZE;
-               else
-                       max_size = (spl_kmem_cache_max_size * 1024 * 1024);
-
-               /* Power of two sized slab */
-               for (*size = PAGE_SIZE; *size <= max_size; *size *= 2) {
-                       *objs = (*size - sks_size) / obj_size;
-                       if (*objs >= spl_kmem_cache_obj_per_slab)
-                               return (0);
-               }
+               max_size = (spl_kmem_cache_max_size * 1024 * 1024);
+               tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size);
 
                /*
-                * Unable to satisfy target objects per slab, fall back to
-                * allocating a maximally sized slab and assuming it can
-                * contain the minimum objects count use it.  If not fail.
+                * KMC_KMEM slabs are allocated by __get_free_pages() which
+                * rounds up to the nearest order.  Knowing this, the size
+                * should be rounded up to the next power of two with a hard
+                * maximum defined by the maximum allowed allocation order.
                 */
-               *size = max_size;
-               *objs = (*size - sks_size) / obj_size;
-               if (*objs >= (spl_kmem_cache_obj_per_slab_min))
-                       return (0);
+               if (skc->skc_flags & KMC_KMEM) {
+                       max_size = SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE;
+                       tgt_size = MIN(max_size,
+                           PAGE_SIZE * (1 << MAX(get_order(tgt_size) - 1, 1)));
+               }
+
+               if (tgt_size <= max_size) {
+                       tgt_objs = (tgt_size - sks_size) / obj_size;
+               } else {
+                       tgt_objs = (max_size - sks_size) / obj_size;
+                       tgt_size = (tgt_objs * obj_size) + sks_size;
+               }
        }
 
-       return (-ENOSPC);
+       if (tgt_objs == 0)
+               return (-ENOSPC);
+
+       *objs = tgt_objs;
+       *size = tgt_size;
+
+       return (0);
 }
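To make the new sizing logic concrete, here is a worked example with invented inputs: a KMC_KMEM cache where spl_obj_size() returns 1024 bytes, spl_sks_size() returns 256 bytes, spl_kmem_cache_obj_per_slab is 8, and pages are 4 KiB.

        /*
         * Worked example (hypothetical inputs):
         *
         *   tgt_size = 8 * 1024 + 256         = 8448 bytes
         *   get_order(8448)                   = 2    (8448 bytes needs 3 pages)
         *   PAGE_SIZE * (1 << MAX(2 - 1, 1))  = 8192 bytes
         *   tgt_objs = (8192 - 256) / 1024    = 7
         *
         * Instead of a 16 KiB order-2 backing allocation for 8 objects,
         * the slab is trimmed to a single order-1 allocation holding 7
         * objects, trading one object per slab for a cheaper allocation.
         */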
 
 /*
@@ -779,15 +806,18 @@ spl_magazine_create(spl_kmem_cache_t *skc)
        if (skc->skc_flags & KMC_NOMAGAZINE)
                return (0);
 
+       skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) *
+           num_possible_cpus(), kmem_flags_convert(KM_SLEEP));
        skc->skc_mag_size = spl_magazine_size(skc);
        skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
 
-       for_each_online_cpu(i) {
+       for_each_possible_cpu(i) {
                skc->skc_mag[i] = spl_magazine_alloc(skc, i);
                if (!skc->skc_mag[i]) {
                        for (i--; i >= 0; i--)
                                spl_magazine_free(skc->skc_mag[i]);
 
+                       kfree(skc->skc_mag);
                        return (-ENOMEM);
                }
        }
@@ -807,11 +837,13 @@ spl_magazine_destroy(spl_kmem_cache_t *skc)
        if (skc->skc_flags & KMC_NOMAGAZINE)
                return;
 
-       for_each_online_cpu(i) {
+       for_each_possible_cpu(i) {
                skm = skc->skc_mag[i];
                spl_cache_flush(skc, skm, skm->skm_avail);
                spl_magazine_free(skm);
        }
+
+       kfree(skc->skc_mag);
 }
 
 /*
@@ -854,12 +886,6 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
 
        might_sleep();
 
-       /*
-        * Allocate memory for a new cache and initialize it.  Unfortunately,
-        * this usually ends up being a large allocation of ~32k because
-        * we need to allocate enough memory for the worst case number of
-        * cpus in the magazine, skc_mag[NR_CPUS].
-        */
        skc = kzalloc(sizeof (*skc), lflags);
        if (skc == NULL)
                return (NULL);
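Together with the kzalloc() of skc_mag in spl_magazine_create() above, dropping this comment reflects that the per-CPU magazine array is no longer embedded in the cache structure. Rough numbers, assuming 8-byte pointers and the CONFIG_NR_CPUS=4096 build implied by the old "~32k" figure:

        /*
         * Illustrative only:
         *
         *   old: skc_mag[NR_CPUS] embedded     -> 4096 * 8 bytes = 32 KiB per cache
         *   new: kzalloc()'d pointer array     -> num_possible_cpus() * 8 bytes,
         *                                         e.g. 8 * 8 = 64 bytes on an
         *                                         8-CPU machine
         */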
@@ -960,8 +986,32 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
                if (rc)
                        goto out;
        } else {
-               skc->skc_linux_cache = kmem_cache_create(
-                   skc->skc_name, size, align, 0, NULL);
+               unsigned long slabflags = 0;
+
+               if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) {
+                       rc = EINVAL;
+                       goto out;
+               }
+
+#if defined(SLAB_USERCOPY)
+               /*
+                * Required for PAX-enabled kernels if the slab is to be
+                * used for copying between user and kernel space.
+                */
+               slabflags |= SLAB_USERCOPY;
+#endif
+
+#if defined(HAVE_KMEM_CACHE_CREATE_USERCOPY)
+               /*
+                * Newer grsec patchsets use kmem_cache_create_usercopy()
+                * instead of the SLAB_USERCOPY flag.
+                */
+               skc->skc_linux_cache = kmem_cache_create_usercopy(
+                   skc->skc_name, size, align, slabflags, 0, size, NULL);
+#else
+               skc->skc_linux_cache = kmem_cache_create(
+                   skc->skc_name, size, align, slabflags, NULL);
+#endif
                if (skc->skc_linux_cache == NULL) {
                        rc = ENOMEM;
                        goto out;
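The call above whitelists the entire object (useroffset 0, usersize equal to the object size) for copies to and from user space. For contrast, a hedged sketch of how the mainline kmem_cache_create_usercopy() API (merged for Linux 4.16) can restrict the whitelist to part of an object; the structure and cache name here are hypothetical and not part of SPL:

        struct example_obj {
                u64     private_state;          /* never crosses the user/kernel boundary */
                char    payload[128];           /* may be copied to/from user space */
        };

        struct kmem_cache *cache = kmem_cache_create_usercopy("example_cache",
            sizeof (struct example_obj), 0, 0,
            offsetof(struct example_obj, payload),              /* useroffset */
            sizeof (((struct example_obj *)0)->payload),        /* usersize */
            NULL);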
@@ -1108,36 +1158,43 @@ spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
  * It is responsible for allocating a new slab, linking it in to the list
  * of partial slabs, and then waking any waiters.
  */
-static void
-spl_cache_grow_work(void *data)
+static int
+__spl_cache_grow(spl_kmem_cache_t *skc, int flags)
 {
-       spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
-       spl_kmem_cache_t *skc = ska->ska_cache;
        spl_kmem_slab_t *sks;
 
-#if defined(PF_MEMALLOC_NOIO)
-       unsigned noio_flag = memalloc_noio_save();
-       sks = spl_slab_alloc(skc, ska->ska_flags);
-       memalloc_noio_restore(noio_flag);
-#else
        fstrans_cookie_t cookie = spl_fstrans_mark();
-       sks = spl_slab_alloc(skc, ska->ska_flags);
+       sks = spl_slab_alloc(skc, flags);
        spl_fstrans_unmark(cookie);
-#endif
+
        spin_lock(&skc->skc_lock);
        if (sks) {
                skc->skc_slab_total++;
                skc->skc_obj_total += sks->sks_objs;
                list_add_tail(&sks->sks_list, &skc->skc_partial_list);
+
+               smp_mb__before_atomic();
+               clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
+               smp_mb__after_atomic();
+               wake_up_all(&skc->skc_waitq);
        }
+       spin_unlock(&skc->skc_lock);
+
+       return (sks == NULL ? -ENOMEM : 0);
+}
+
+static void
+spl_cache_grow_work(void *data)
+{
+       spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
+       spl_kmem_cache_t *skc = ska->ska_cache;
+
+       (void)__spl_cache_grow(skc, ska->ska_flags);
 
        atomic_dec(&skc->skc_ref);
        smp_mb__before_atomic();
        clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
-       clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
        smp_mb__after_atomic();
-       wake_up_all(&skc->skc_waitq);
-       spin_unlock(&skc->skc_lock);
 
        kfree(ska);
 }
@@ -1177,6 +1234,21 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
                return (rc ? rc : -EAGAIN);
        }
 
+       /*
+        * To reduce context switch overhead and improve NUMA locality, first
+        * try to allocate a new slab in the current process context with the
+        * KM_NOSLEEP flag.  If that fails, dispatch the allocation to the
+        * taskq instead.
+        *
+        * However, this can't be applied to KMC_VMEM caches due to a bug
+        * where __vmalloc() doesn't honor gfp flags in page table allocations.
+        */
+       if (!(skc->skc_flags & KMC_VMEM)) {
+               rc = __spl_cache_grow(skc, flags | KM_NOSLEEP);
+               if (rc == 0)
+                       return (0);
+       }
+
        /*
         * This is handled by dispatching a work request to the global work
         * queue.  This allows us to asynchronously allocate a new slab while
@@ -1217,7 +1289,7 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
                remaining = wait_event_timeout(skc->skc_waitq,
                    spl_cache_grow_wait(skc), HZ / 10);
 
-               if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) {
+               if (!remaining) {
                        spin_lock(&skc->skc_lock);
                        if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
                                set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
@@ -1372,8 +1444,6 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
        ASSERT(skc->skc_magic == SKC_MAGIC);
        ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
 
-       atomic_inc(&skc->skc_ref);
-
        /*
         * Allocate directly from a Linux slab.  All optimizations are left
         * to the underlying cache we only need to guarantee that KM_SLEEP
@@ -1406,8 +1476,11 @@ restart:
                skm->skm_age = jiffies;
        } else {
                obj = spl_cache_refill(skc, skm, flags);
-               if (obj == NULL)
+               if ((obj == NULL) && !(flags & KM_NOSLEEP))
                        goto restart;
+
+               local_irq_enable();
+               goto ret;
        }
 
        local_irq_enable();
@@ -1423,11 +1496,8 @@ ret:
                        prefetchw(obj);
        }
 
-       atomic_dec(&skc->skc_ref);
-
        return (obj);
 }
-
 EXPORT_SYMBOL(spl_kmem_cache_alloc);
 
 /*
@@ -1442,10 +1512,10 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
        spl_kmem_magazine_t *skm;
        unsigned long flags;
        int do_reclaim = 0;
+       int do_emergency = 0;
 
        ASSERT(skc->skc_magic == SKC_MAGIC);
        ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
-       atomic_inc(&skc->skc_ref);
 
        /*
         * Run the destructor
@@ -1458,17 +1528,22 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
         */
        if (skc->skc_flags & KMC_SLAB) {
                kmem_cache_free(skc->skc_linux_cache, obj);
-               goto out;
+               return;
        }
 
        /*
-        * Only virtual slabs may have emergency objects and these objects
-        * are guaranteed to have physical addresses.  They must be removed
-        * from the tree of emergency objects and the freed.
+        * While a cache has outstanding emergency objects, all freed objects
+        * must be checked.  However, since emergency objects will never use
+        * a virtual address, objects at vmalloc() addresses can be safely
+        * excluded as an optimization.
         */
-       if ((skc->skc_flags & KMC_VMEM) && !is_vmalloc_addr(obj)) {
-               spl_emergency_free(skc, obj);
-               goto out;
+       if (!is_vmalloc_addr(obj)) {
+               spin_lock(&skc->skc_lock);
+               do_emergency = (skc->skc_obj_emergency > 0);
+               spin_unlock(&skc->skc_lock);
+
+               if (do_emergency && (spl_emergency_free(skc, obj) == 0))
+                       return;
        }
 
        local_irq_save(flags);
@@ -1499,8 +1574,6 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
 
        if (do_reclaim)
                spl_slab_reclaim(skc);
-out:
-       atomic_dec(&skc->skc_ref);
 }
 EXPORT_SYMBOL(spl_kmem_cache_free);
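The exclusion above is safe because is_vmalloc_addr() simply tests whether an address lies in the kernel's vmalloc region, and emergency objects are now page-backed. A minimal fragment restating the check (illustrative only, not code from the patch):

        /*
         * Emergency objects come from __get_free_pages() and therefore
         * live in the linear mapping; a vmalloc()-backed object can never
         * match the emergency tree, so its lookup can be skipped.
         */
        if (is_vmalloc_addr(obj))
                return;         /* cannot be an emergency object */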
 
@@ -1535,6 +1608,12 @@ __spl_kmem_cache_generic_shrinker(struct shrinker *shrink,
        spl_kmem_cache_t *skc;
        int alloc = 0;
 
+       /*
+        * No shrinking in a transaction context.  Can cause deadlocks.
+        */
+       if (sc->nr_to_scan && spl_fstrans_check())
+               return (SHRINK_STOP);
+
        down_read(&spl_kmem_cache_sem);
        list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
                if (sc->nr_to_scan) {
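SHRINK_STOP only helps if allocating code marks its transaction context; the same markers already appear in __spl_cache_grow() earlier in this diff. A minimal sketch of the pattern, where the kmalloc() call is just a stand-in for any allocation that may enter direct reclaim:

        /*
         * Illustrative only: while the cookie is held, direct reclaim
         * that reaches the shrinker above returns SHRINK_STOP instead
         * of recursing back into the filesystem's transaction code.
         */
        fstrans_cookie_t cookie = spl_fstrans_mark();
        void *buf = kmalloc(4096, GFP_KERNEL); /* may trigger direct reclaim */
        spl_fstrans_unmark(cookie);
        kfree(buf);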
@@ -1587,16 +1666,11 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
        atomic_inc(&skc->skc_ref);
 
        /*
-        * Execute the registered reclaim callback if it exists.  The
-        * per-cpu caches will be drained when is set KMC_EXPIRE_MEM.
+        * Execute the registered reclaim callback if it exists.
         */
        if (skc->skc_flags & KMC_SLAB) {
                if (skc->skc_reclaim)
                        skc->skc_reclaim(skc->skc_private);
-
-               if (spl_kmem_cache_expire & KMC_EXPIRE_MEM)
-                       kmem_cache_shrink(skc->skc_linux_cache);
-
                goto out;
        }
 
@@ -1680,7 +1754,9 @@ spl_kmem_cache_init(void)
        init_rwsem(&spl_kmem_cache_sem);
        INIT_LIST_HEAD(&spl_kmem_cache_list);
        spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
-           1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE);
+           spl_kmem_cache_kmem_threads, maxclsyspri,
+           spl_kmem_cache_kmem_threads * 8, INT_MAX,
+           TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
        spl_register_shrinker(&spl_kmem_cache_shrinker);
 
        return (0);
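For readers not used to the SPL taskq interface, the arguments above follow the usual illumos-style ordering (name, nthreads, pri, minalloc, maxalloc, flags); with the default module parameter the call is roughly equivalent to the annotated form below:

        spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
            4,                  /* nthreads: spl_kmem_cache_kmem_threads */
            maxclsyspri,        /* worker thread priority */
            4 * 8,              /* minalloc: entries pre-allocated by TASKQ_PREPOPULATE */
            INT_MAX,            /* maxalloc: effectively unbounded queue depth */
            TASKQ_PREPOPULATE | TASKQ_DYNAMIC); /* spawn threads on demand */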