#include <linux/swap.h>
#include <linux/mm_compat.h>
#include <linux/wait_compat.h>
+#include <linux/prefetch.h>
/*
* Within the scope of spl-kmem.c file the kmem_cache_* definitions
unsigned int spl_kmem_cache_magazine_size = 0;
module_param(spl_kmem_cache_magazine_size, uint, 0444);
MODULE_PARM_DESC(spl_kmem_cache_magazine_size,
- "Default magazine size (2-256), set automatically (0)\n");
+ "Default magazine size (2-256), set automatically (0)");
/*
* The default behavior is to report the number of objects remaining in the
MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min,
"Minimal number of objects per slab");
-unsigned int spl_kmem_cache_max_size = 32;
+unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE;
module_param(spl_kmem_cache_max_size, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
"Objects less than N bytes use the Linux slab");
-unsigned int spl_kmem_cache_kmem_limit = (PAGE_SIZE / 4);
+/*
+ * This value defaults to a threshold designed to avoid allocations which
+ * have been deemed costly by the kernel.
+ */
+unsigned int spl_kmem_cache_kmem_limit =
+ ((1 << (PAGE_ALLOC_COSTLY_ORDER - 1)) * PAGE_SIZE) /
+ SPL_KMEM_CACHE_OBJ_PER_SLAB;
module_param(spl_kmem_cache_kmem_limit, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_kmem_limit,
"Objects less than N bytes use the kmalloc");
+/*
+ * The number of threads available to allocate new slabs for caches. This
+ * should not need to be tuned but it is available for performance analysis.
+ */
+unsigned int spl_kmem_cache_kmem_threads = 4;
+module_param(spl_kmem_cache_kmem_threads, uint, 0444);
+MODULE_PARM_DESC(spl_kmem_cache_kmem_threads,
+ "Number of spl_kmem_cache threads");
+
/*
* Slab allocation interfaces
*
gfp_t lflags = kmem_flags_convert(flags);
void *ptr;
- ASSERT(ISP2(size));
-
- if (skc->skc_flags & KMC_KMEM)
+ if (skc->skc_flags & KMC_KMEM) {
+ ASSERT(ISP2(size));
ptr = (void *)__get_free_pages(lflags, get_order(size));
- else
- ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL);
+ } else {
+ ptr = __vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL);
+ }
/* Resulting allocated memory will be page aligned */
ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
{
ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
- ASSERT(ISP2(size));
/*
* The Linux direct reclaim path uses this out of band value to
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
- if (skc->skc_flags & KMC_KMEM)
+ if (skc->skc_flags & KMC_KMEM) {
+ ASSERT(ISP2(size));
free_pages((unsigned long)ptr, get_order(size));
- else
+ } else {
vfree(ptr);
+ }
}
/*
while (node) {
ske = container_of(node, spl_kmem_emergency_t, ske_node);
- if (address < (unsigned long)ske->ske_obj)
+ if (address < ske->ske_obj)
node = node->rb_left;
- else if (address > (unsigned long)ske->ske_obj)
+ else if (address > ske->ske_obj)
node = node->rb_right;
else
return (ske);
{
struct rb_node **new = &(root->rb_node), *parent = NULL;
spl_kmem_emergency_t *ske_tmp;
- unsigned long address = (unsigned long)ske->ske_obj;
+ unsigned long address = ske->ske_obj;
while (*new) {
ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
parent = *new;
- if (address < (unsigned long)ske_tmp->ske_obj)
+ if (address < ske_tmp->ske_obj)
new = &((*new)->rb_left);
- else if (address > (unsigned long)ske_tmp->ske_obj)
+ else if (address > ske_tmp->ske_obj)
new = &((*new)->rb_right);
else
return (0);
{
gfp_t lflags = kmem_flags_convert(flags);
spl_kmem_emergency_t *ske;
+ int order = get_order(skc->skc_obj_size);
int empty;
/* Last chance use a partial slab if one now exists */
if (ske == NULL)
return (-ENOMEM);
- ske->ske_obj = kmalloc(skc->skc_obj_size, lflags);
- if (ske->ske_obj == NULL) {
+ ske->ske_obj = __get_free_pages(lflags, order);
+ if (ske->ske_obj == 0) {
kfree(ske);
return (-ENOMEM);
}
spin_unlock(&skc->skc_lock);
if (unlikely(!empty)) {
- kfree(ske->ske_obj);
+ free_pages(ske->ske_obj, order);
kfree(ske);
return (-EINVAL);
}
- *obj = ske->ske_obj;
+ *obj = (void *)ske->ske_obj;
return (0);
}
spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
{
spl_kmem_emergency_t *ske;
+ int order = get_order(skc->skc_obj_size);
spin_lock(&skc->skc_lock);
ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
- if (likely(ske)) {
+ if (ske) {
rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
skc->skc_obj_emergency--;
skc->skc_obj_total--;
}
spin_unlock(&skc->skc_lock);
- if (unlikely(ske == NULL))
+ if (ske == NULL)
return (-ENOENT);
- kfree(ske->ske_obj);
+ free_pages(ske->ske_obj, order);
kfree(ske);
return (0);
static int
spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
{
- uint32_t sks_size, obj_size, max_size;
+ uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs;
if (skc->skc_flags & KMC_OFFSLAB) {
- *objs = spl_kmem_cache_obj_per_slab;
- *size = P2ROUNDUP(sizeof (spl_kmem_slab_t), PAGE_SIZE);
- return (0);
+ tgt_objs = spl_kmem_cache_obj_per_slab;
+ tgt_size = P2ROUNDUP(sizeof (spl_kmem_slab_t), PAGE_SIZE);
+
+ if ((skc->skc_flags & KMC_KMEM) &&
+ (spl_obj_size(skc) > (SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE)))
+ return (-ENOSPC);
} else {
sks_size = spl_sks_size(skc);
obj_size = spl_obj_size(skc);
-
- if (skc->skc_flags & KMC_KMEM)
- max_size = ((uint32_t)1 << (MAX_ORDER-3)) * PAGE_SIZE;
- else
- max_size = (spl_kmem_cache_max_size * 1024 * 1024);
-
- /* Power of two sized slab */
- for (*size = PAGE_SIZE; *size <= max_size; *size *= 2) {
- *objs = (*size - sks_size) / obj_size;
- if (*objs >= spl_kmem_cache_obj_per_slab)
- return (0);
- }
+ max_size = (spl_kmem_cache_max_size * 1024 * 1024);
+ tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size);
/*
- * Unable to satisfy target objects per slab, fall back to
- * allocating a maximally sized slab and assuming it can
- * contain the minimum objects count use it. If not fail.
+ * KMC_KMEM slabs are allocated by __get_free_pages() which
+ * rounds up to the nearest order. Knowing this the size
+ * should be rounded up to the next power of two with a hard
+ * maximum defined by the maximum allowed allocation order.
*/
- *size = max_size;
- *objs = (*size - sks_size) / obj_size;
- if (*objs >= (spl_kmem_cache_obj_per_slab_min))
- return (0);
+ if (skc->skc_flags & KMC_KMEM) {
+ max_size = SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE;
+ tgt_size = MIN(max_size,
+ PAGE_SIZE * (1 << MAX(get_order(tgt_size) - 1, 1)));
+ }
+
+ if (tgt_size <= max_size) {
+ tgt_objs = (tgt_size - sks_size) / obj_size;
+ } else {
+ tgt_objs = (max_size - sks_size) / obj_size;
+ tgt_size = (tgt_objs * obj_size) + sks_size;
+ }
}
- return (-ENOSPC);
+ if (tgt_objs == 0)
+ return (-ENOSPC);
+
+ *objs = tgt_objs;
+ *size = tgt_size;
+
+ return (0);
}
/*
if (skc->skc_flags & KMC_NOMAGAZINE)
return (0);
+ skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) *
+ num_possible_cpus(), kmem_flags_convert(KM_SLEEP));
skc->skc_mag_size = spl_magazine_size(skc);
skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
- for_each_online_cpu(i) {
+ for_each_possible_cpu(i) {
skc->skc_mag[i] = spl_magazine_alloc(skc, i);
if (!skc->skc_mag[i]) {
for (i--; i >= 0; i--)
spl_magazine_free(skc->skc_mag[i]);
+ kfree(skc->skc_mag);
return (-ENOMEM);
}
}
if (skc->skc_flags & KMC_NOMAGAZINE)
return;
- for_each_online_cpu(i) {
+ for_each_possible_cpu(i) {
skm = skc->skc_mag[i];
spl_cache_flush(skc, skm, skm->skm_avail);
spl_magazine_free(skm);
}
+
+ kfree(skc->skc_mag);
}
/*
might_sleep();
- /*
- * Allocate memory for a new cache and initialize it. Unfortunately,
- * this usually ends up being a large allocation of ~32k because
- * we need to allocate enough memory for the worst case number of
- * cpus in the magazine, skc_mag[NR_CPUS].
- */
skc = kzalloc(sizeof (*skc), lflags);
if (skc == NULL)
return (NULL);
if (rc)
goto out;
} else {
- skc->skc_linux_cache = kmem_cache_create(
- skc->skc_name, size, align, 0, NULL);
+ unsigned long slabflags = 0;
+
+ if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) {
+ rc = EINVAL;
+ goto out;
+ }
+
+#if defined(SLAB_USERCOPY)
+ /*
+ * Required for PAX-enabled kernels if the slab is to be
+ * used for coping between user and kernel space.
+ */
+ slabflags |= SLAB_USERCOPY;
+#endif
+
+#if defined(HAVE_KMEM_CACHE_CREATE_USERCOPY)
+ /*
+ * Newer grsec patchset uses kmem_cache_create_usercopy()
+ * instead of SLAB_USERCOPY flag
+ */
+ skc->skc_linux_cache = kmem_cache_create_usercopy(
+ skc->skc_name, size, align, slabflags, 0, size, NULL);
+#else
+ skc->skc_linux_cache = kmem_cache_create(
+ skc->skc_name, size, align, slabflags, NULL);
+#endif
if (skc->skc_linux_cache == NULL) {
rc = ENOMEM;
goto out;
* It is responsible for allocating a new slab, linking it in to the list
* of partial slabs, and then waking any waiters.
*/
-static void
-spl_cache_grow_work(void *data)
+static int
+__spl_cache_grow(spl_kmem_cache_t *skc, int flags)
{
- spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
- spl_kmem_cache_t *skc = ska->ska_cache;
spl_kmem_slab_t *sks;
-#if defined(PF_MEMALLOC_NOIO)
- unsigned noio_flag = memalloc_noio_save();
- sks = spl_slab_alloc(skc, ska->ska_flags);
- memalloc_noio_restore(noio_flag);
-#else
fstrans_cookie_t cookie = spl_fstrans_mark();
- sks = spl_slab_alloc(skc, ska->ska_flags);
+ sks = spl_slab_alloc(skc, flags);
spl_fstrans_unmark(cookie);
-#endif
+
spin_lock(&skc->skc_lock);
if (sks) {
skc->skc_slab_total++;
skc->skc_obj_total += sks->sks_objs;
list_add_tail(&sks->sks_list, &skc->skc_partial_list);
+
+ smp_mb__before_atomic();
+ clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
+ smp_mb__after_atomic();
+ wake_up_all(&skc->skc_waitq);
}
+ spin_unlock(&skc->skc_lock);
+
+ return (sks == NULL ? -ENOMEM : 0);
+}
+
+static void
+spl_cache_grow_work(void *data)
+{
+ spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
+ spl_kmem_cache_t *skc = ska->ska_cache;
+
+ (void)__spl_cache_grow(skc, ska->ska_flags);
atomic_dec(&skc->skc_ref);
smp_mb__before_atomic();
clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
- clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
smp_mb__after_atomic();
- wake_up_all(&skc->skc_waitq);
- spin_unlock(&skc->skc_lock);
kfree(ska);
}
return (rc ? rc : -EAGAIN);
}
+ /*
+ * To reduce the overhead of context switch and improve NUMA locality,
+ * it tries to allocate a new slab in the current process context with
+ * KM_NOSLEEP flag. If it fails, it will launch a new taskq to do the
+ * allocation.
+ *
+ * However, this can't be applied to KVM_VMEM due to a bug that
+ * __vmalloc() doesn't honor gfp flags in page table allocation.
+ */
+ if (!(skc->skc_flags & KMC_VMEM)) {
+ rc = __spl_cache_grow(skc, flags | KM_NOSLEEP);
+ if (rc == 0)
+ return (0);
+ }
+
/*
* This is handled by dispatching a work request to the global work
* queue. This allows us to asynchronously allocate a new slab while
remaining = wait_event_timeout(skc->skc_waitq,
spl_cache_grow_wait(skc), HZ / 10);
- if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) {
+ if (!remaining) {
spin_lock(&skc->skc_lock);
if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
ASSERT(skc->skc_magic == SKC_MAGIC);
ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
- atomic_inc(&skc->skc_ref);
-
/*
* Allocate directly from a Linux slab. All optimizations are left
* to the underlying cache we only need to guarantee that KM_SLEEP
skm->skm_age = jiffies;
} else {
obj = spl_cache_refill(skc, skm, flags);
- if (obj == NULL)
+ if ((obj == NULL) && !(flags & KM_NOSLEEP))
goto restart;
+
+ local_irq_enable();
+ goto ret;
}
local_irq_enable();
prefetchw(obj);
}
- atomic_dec(&skc->skc_ref);
-
return (obj);
}
-
EXPORT_SYMBOL(spl_kmem_cache_alloc);
/*
spl_kmem_magazine_t *skm;
unsigned long flags;
int do_reclaim = 0;
+ int do_emergency = 0;
ASSERT(skc->skc_magic == SKC_MAGIC);
ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
- atomic_inc(&skc->skc_ref);
/*
* Run the destructor
*/
if (skc->skc_flags & KMC_SLAB) {
kmem_cache_free(skc->skc_linux_cache, obj);
- goto out;
+ return;
}
/*
- * Only virtual slabs may have emergency objects and these objects
- * are guaranteed to have physical addresses. They must be removed
- * from the tree of emergency objects and the freed.
+ * While a cache has outstanding emergency objects all freed objects
+ * must be checked. However, since emergency objects will never use
+ * a virtual address these objects can be safely excluded as an
+ * optimization.
*/
- if ((skc->skc_flags & KMC_VMEM) && !is_vmalloc_addr(obj)) {
- spl_emergency_free(skc, obj);
- goto out;
+ if (!is_vmalloc_addr(obj)) {
+ spin_lock(&skc->skc_lock);
+ do_emergency = (skc->skc_obj_emergency > 0);
+ spin_unlock(&skc->skc_lock);
+
+ if (do_emergency && (spl_emergency_free(skc, obj) == 0))
+ return;
}
local_irq_save(flags);
if (do_reclaim)
spl_slab_reclaim(skc);
-out:
- atomic_dec(&skc->skc_ref);
}
EXPORT_SYMBOL(spl_kmem_cache_free);
spl_kmem_cache_t *skc;
int alloc = 0;
+ /*
+ * No shrinking in a transaction context. Can cause deadlocks.
+ */
+ if (sc->nr_to_scan && spl_fstrans_check())
+ return (SHRINK_STOP);
+
down_read(&spl_kmem_cache_sem);
list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
if (sc->nr_to_scan) {
atomic_inc(&skc->skc_ref);
/*
- * Execute the registered reclaim callback if it exists. The
- * per-cpu caches will be drained when is set KMC_EXPIRE_MEM.
+ * Execute the registered reclaim callback if it exists.
*/
if (skc->skc_flags & KMC_SLAB) {
if (skc->skc_reclaim)
skc->skc_reclaim(skc->skc_private);
-
- if (spl_kmem_cache_expire & KMC_EXPIRE_MEM)
- kmem_cache_shrink(skc->skc_linux_cache);
-
goto out;
}
init_rwsem(&spl_kmem_cache_sem);
INIT_LIST_HEAD(&spl_kmem_cache_list);
spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
- 1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE);
+ spl_kmem_cache_kmem_threads, maxclsyspri,
+ spl_kmem_cache_kmem_threads * 8, INT_MAX,
+ TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
spl_register_shrinker(&spl_kmem_cache_shrinker);
return (0);