-/*****************************************************************************\
+/*
* Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
* Copyright (C) 2007 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
*
* You should have received a copy of the GNU General Public License along
* with the SPL. If not, see <http://www.gnu.org/licenses/>.
- *****************************************************************************
- * Solaris Porting Layer (SPL) Kmem Implementation.
-\*****************************************************************************/
+ */
#include <sys/kmem.h>
#include <sys/kmem_cache.h>
#include <linux/swap.h>
#include <linux/mm_compat.h>
#include <linux/wait_compat.h>
+#include <linux/prefetch.h>
/*
* Within the scope of the spl-kmem.c file the kmem_cache_* definitions
#undef kmem_cache_free
+/*
+ * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}()
+ * with smp_mb__{before,after}_atomic() because they were redundant. This is
+ * only used inside our SLAB allocator, so we implement an internal wrapper
+ * here to give us smp_mb__{before,after}_atomic() on older kernels.
+ */
+#ifndef smp_mb__before_atomic
+#define smp_mb__before_atomic(x) smp_mb__before_clear_bit(x)
+#endif
+
+#ifndef smp_mb__after_atomic
+#define smp_mb__after_atomic(x) smp_mb__after_clear_bit(x)
+#endif
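/*
 * Illustrative usage (a sketch mirroring callers later in this file):
 * the wrappers pair a full memory barrier with a bit operation so a
 * flag update is visible before any dependent action, e.g.:
 *
 *	clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
 *	smp_mb__after_atomic();
 *	wake_up_all(&skc->skc_waitq);
 */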
+
/*
* Cache expiration was implemented because it was part of the default Solaris
* kmem_cache behavior. The idea is that per-cpu objects which haven't been
module_param(spl_kmem_cache_expire, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)");
+/*
+ * Cache magazines are an optimization designed to minimize the cost of
+ * allocating memory. They do this by keeping a per-cpu cache of recently
+ * freed objects, which can then be reallocated without taking a lock. This
+ * can improve performance on highly contended caches. However, because
+ * objects in magazines will prevent otherwise empty slabs from being
+ * immediately released, this may not be ideal for low-memory machines.
+ *
+ * For this reason spl_kmem_cache_magazine_size can be used to set a maximum
+ * magazine size. When this value is set to 0 the magazine size will be
+ * automatically determined based on the object size. Otherwise magazines
+ * will be limited to 2-256 objects per magazine (i.e. per CPU). Magazines
+ * may never be entirely disabled in this implementation.
+ */
+unsigned int spl_kmem_cache_magazine_size = 0;
+module_param(spl_kmem_cache_magazine_size, uint, 0444);
+MODULE_PARM_DESC(spl_kmem_cache_magazine_size,
+ "Default magazine size (2-256), set automatically (0)");
+
/*
* The default behavior is to report the number of objects remaining in the
* cache. This allows the Linux VM to repeatedly reclaim objects from the
unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN;
module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min,
- "Minimal number of objects per slab");
+ "Minimal number of objects per slab");
-unsigned int spl_kmem_cache_max_size = 32;
+unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE;
module_param(spl_kmem_cache_max_size, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
#endif
module_param(spl_kmem_cache_slab_limit, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
- "Objects less than N bytes use the Linux slab");
+ "Objects less than N bytes use the Linux slab");
-unsigned int spl_kmem_cache_kmem_limit = (PAGE_SIZE / 4);
+/*
+ * This value defaults to a threshold designed to avoid allocations which
+ * have been deemed costly by the kernel.
+ */
+unsigned int spl_kmem_cache_kmem_limit =
+ ((1 << (PAGE_ALLOC_COSTLY_ORDER - 1)) * PAGE_SIZE) /
+ SPL_KMEM_CACHE_OBJ_PER_SLAB;
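/*
 * Worked example (a sketch assuming PAGE_ALLOC_COSTLY_ORDER == 3,
 * 4 KiB pages, and SPL_KMEM_CACHE_OBJ_PER_SLAB == 8):
 *
 *	((1 << 2) * 4096) / 8 = 2048
 *
 * so objects up to 2 KiB would be serviced directly by kmalloc().
 */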
module_param(spl_kmem_cache_kmem_limit, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_kmem_limit,
- "Objects less than N bytes use the kmalloc");
+ "Objects less than N bytes use the kmalloc");
+
+/*
+ * The number of threads available to allocate new slabs for caches. This
+ * should not need to be tuned, but it is available for performance analysis.
+ */
+unsigned int spl_kmem_cache_kmem_threads = 4;
+module_param(spl_kmem_cache_kmem_threads, uint, 0444);
+MODULE_PARM_DESC(spl_kmem_cache_kmem_threads,
+ "Number of spl_kmem_cache threads");
/*
* Slab allocation interfaces
* breaker for the SPL which contains particularly expensive
* initializers for mutexes, condition variables, etc. We also
* require a minimal level of cleanup for these data types unlike
- * many Linux data type which do need to be explicitly destroyed.
+ * many Linux data types which do not need to be explicitly destroyed.
*
* 2) Virtual address space backed slab. Callers of the Solaris slab
* expect it to work well for both small and very large allocations.
* One serious concern I do have about this method is the relatively
* small virtual address space on 32-bit arches. This will seriously
* constrain the size of the slab caches and their performance.
- *
- * XXX: Improve the partial slab list by carefully maintaining a
- * strict ordering of fullest to emptiest slabs based on
- * the slab reference count. This guarantees the when freeing
- * slabs back to the system we need only linearly traverse the
- * last N slabs in the list to discover all the freeable slabs.
- *
- * XXX: NUMA awareness for optionally allocating memory close to a
- * particular core. This can be advantageous if you know the slab
- * object will be short lived and primarily accessed from one core.
- *
- * XXX: Slab coloring may also yield performance improvements and would
- * be desirable to implement.
*/
struct list_head spl_kmem_cache_list; /* List of caches */
struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
-taskq_t *spl_kmem_cache_taskq; /* Task queue for ageing / reclaim */
+taskq_t *spl_kmem_cache_taskq; /* Task queue for ageing / reclaim */
static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);
static void *
kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
{
+ gfp_t lflags = kmem_flags_convert(flags);
void *ptr;
- ASSERT(ISP2(size));
-
- if (skc->skc_flags & KMC_KMEM)
- ptr = (void *)__get_free_pages(flags | __GFP_COMP,
- get_order(size));
- else
- ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
+ if (skc->skc_flags & KMC_KMEM) {
+ ASSERT(ISP2(size));
+ ptr = (void *)__get_free_pages(lflags, get_order(size));
+ } else {
+ ptr = __vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL);
+ }
/* Resulting allocated memory will be page aligned */
ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
- return ptr;
+ return (ptr);
}
static void
kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
{
ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
- ASSERT(ISP2(size));
/*
* The Linux direct reclaim path uses this out-of-band value to
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
- if (skc->skc_flags & KMC_KMEM)
+ if (skc->skc_flags & KMC_KMEM) {
+ ASSERT(ISP2(size));
free_pages((unsigned long)ptr, get_order(size));
- else
+ } else {
vfree(ptr);
+ }
}
/*
static inline uint32_t
spl_sks_size(spl_kmem_cache_t *skc)
{
- return P2ROUNDUP_TYPED(sizeof(spl_kmem_slab_t),
- skc->skc_obj_align, uint32_t);
+ return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t),
+ skc->skc_obj_align, uint32_t));
}
/*
{
uint32_t align = skc->skc_obj_align;
- return P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
- P2ROUNDUP_TYPED(sizeof(spl_kmem_obj_t), align, uint32_t);
+ return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
+ P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t));
}
/*
static inline spl_kmem_obj_t *
spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
{
- return obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
- skc->skc_obj_align, uint32_t);
+ return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
+ skc->skc_obj_align, uint32_t));
}
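/*
 * A sketch of the per-object layout implied by the helpers above: each
 * object is immediately followed by its spl_kmem_obj_t, with both
 * components rounded up to skc_obj_align:
 *
 *	+------------------------+-----+----------------+-----+
 *	| object (skc_obj_size)  | pad | spl_kmem_obj_t | pad |
 *	+------------------------+-----+----------------+-----+
 */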
/*
static inline uint32_t
spl_offslab_size(spl_kmem_cache_t *skc)
{
- return 1UL << (fls64(spl_obj_size(skc)) + 1);
+ return (1UL << (fls64(spl_obj_size(skc)) + 1));
}
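/*
 * For example, an effective object size of 3000 bytes gives
 * fls64(3000) == 12, so each off-slab object would be backed by a
 * 1UL << 13 == 8192 byte allocation.
 */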
/*
out:
if (rc) {
if (skc->skc_flags & KMC_OFFSLAB)
- list_for_each_entry_safe(sko, n, &sks->sks_free_list,
- sko_list)
+ list_for_each_entry_safe(sko,
+ n, &sks->sks_free_list, sko_list)
kv_free(skc, sko->sko_addr, offslab_size);
kv_free(skc, base, skc->skc_slab_size);
*/
static void
spl_slab_free(spl_kmem_slab_t *sks,
- struct list_head *sks_list, struct list_head *sko_list)
+ struct list_head *sks_list, struct list_head *sko_list)
{
spl_kmem_cache_t *skc;
}
/*
- * Traverses all the partial slabs attached to a cache and free those
- * which which are currently empty, and have not been touched for
- * skc_delay seconds to avoid thrashing. The count argument is
- * passed to optionally cap the number of slabs reclaimed, a count
- * of zero means try and reclaim everything. When flag is set we
- * always free an available slab regardless of age.
+ * Reclaim empty slabs at the end of the partial list.
*/
static void
-spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
+spl_slab_reclaim(spl_kmem_cache_t *skc)
{
spl_kmem_slab_t *sks, *m;
spl_kmem_obj_t *sko, *n;
LIST_HEAD(sks_list);
LIST_HEAD(sko_list);
uint32_t size = 0;
- int i = 0;
/*
- * Move empty slabs and objects which have not been touched in
- * skc_delay seconds on to private lists to be freed outside
- * the spin lock. This delay time is important to avoid thrashing
- * however when flag is set the delay will not be used.
+ * Empty slabs and objects must be moved to a private list so they
+ * can be safely freed outside the spin lock. All empty slabs are
+ * at the end of skc->skc_partial_list; therefore once a non-empty
+ * slab is found we can stop scanning.
*/
spin_lock(&skc->skc_lock);
- list_for_each_entry_safe_reverse(sks,m,&skc->skc_partial_list,sks_list){
- /*
- * All empty slabs are at the end of skc->skc_partial_list,
- * therefore once a non-empty slab is found we can stop
- * scanning. Additionally, stop when reaching the target
- * reclaim 'count' if a non-zero threshold is given.
- */
- if ((sks->sks_ref > 0) || (count && i >= count))
+ list_for_each_entry_safe_reverse(sks, m,
+ &skc->skc_partial_list, sks_list) {
+
+ if (sks->sks_ref > 0)
break;
- if (time_after(jiffies,sks->sks_age+skc->skc_delay*HZ)||flag) {
- spl_slab_free(sks, &sks_list, &sko_list);
- i++;
- }
+ spl_slab_free(sks, &sks_list, &sko_list);
}
spin_unlock(&skc->skc_lock);
while (node) {
ske = container_of(node, spl_kmem_emergency_t, ske_node);
- if (address < (unsigned long)ske->ske_obj)
+ if (address < ske->ske_obj)
node = node->rb_left;
- else if (address > (unsigned long)ske->ske_obj)
+ else if (address > ske->ske_obj)
node = node->rb_right;
else
- return ske;
+ return (ske);
}
- return NULL;
+ return (NULL);
}
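/*
 * The tree above is keyed on the object address, which lets
 * spl_emergency_free() recover the spl_kmem_emergency_t for an
 * outstanding emergency object in O(log n) given only the pointer
 * handed back to the caller.
 */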
static int
{
struct rb_node **new = &(root->rb_node), *parent = NULL;
spl_kmem_emergency_t *ske_tmp;
- unsigned long address = (unsigned long)ske->ske_obj;
+ unsigned long address = ske->ske_obj;
while (*new) {
ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
parent = *new;
- if (address < (unsigned long)ske_tmp->ske_obj)
+ if (address < ske_tmp->ske_obj)
new = &((*new)->rb_left);
- else if (address > (unsigned long)ske_tmp->ske_obj)
+ else if (address > ske_tmp->ske_obj)
new = &((*new)->rb_right);
else
- return 0;
+ return (0);
}
rb_link_node(&ske->ske_node, parent, new);
rb_insert_color(&ske->ske_node, root);
- return 1;
+ return (1);
}
/*
static int
spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
{
+ gfp_t lflags = kmem_flags_convert(flags);
spl_kmem_emergency_t *ske;
+ int order = get_order(skc->skc_obj_size);
int empty;
/* Last chance: use a partial slab if one now exists */
if (!empty)
return (-EEXIST);
- ske = kmalloc(sizeof(*ske), flags);
+ ske = kmalloc(sizeof (*ske), lflags);
if (ske == NULL)
return (-ENOMEM);
- ske->ske_obj = kmalloc(skc->skc_obj_size, flags);
- if (ske->ske_obj == NULL) {
+ ske->ske_obj = __get_free_pages(lflags, order);
+ if (ske->ske_obj == 0) {
kfree(ske);
return (-ENOMEM);
}
spin_unlock(&skc->skc_lock);
if (unlikely(!empty)) {
- kfree(ske->ske_obj);
+ free_pages(ske->ske_obj, order);
kfree(ske);
return (-EINVAL);
}
- *obj = ske->ske_obj;
+ *obj = (void *)ske->ske_obj;
return (0);
}
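/*
 * Note on the allocation above: emergency objects are satisfied with
 * __get_free_pages() rather than kmalloc(), so they always carry a
 * physical (non-vmalloc) address. spl_kmem_cache_free() relies on this
 * to skip the emergency tree lookup for vmalloc'd objects.
 */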
spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
{
spl_kmem_emergency_t *ske;
+ int order = get_order(skc->skc_obj_size);
spin_lock(&skc->skc_lock);
ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
- if (likely(ske)) {
+ if (ske) {
rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
skc->skc_obj_emergency--;
skc->skc_obj_total--;
}
spin_unlock(&skc->skc_lock);
- if (unlikely(ske == NULL))
+ if (ske == NULL)
return (-ENOENT);
- kfree(ske->ske_obj);
+ free_pages(ske->ske_obj, order);
kfree(ske);
return (0);
skm->skm_avail -= count;
memmove(skm->skm_objs, &(skm->skm_objs[count]),
- sizeof(void *) * skm->skm_avail);
+ sizeof (void *) * skm->skm_avail);
}
static void
if (!(skc->skc_flags & KMC_NOMAGAZINE))
on_each_cpu(spl_magazine_age, skc, 1);
- spl_slab_reclaim(skc, skc->skc_reap, 0);
+ spl_slab_reclaim(skc);
while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) {
id = taskq_dispatch_delay(
static int
spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
{
- uint32_t sks_size, obj_size, max_size;
+ uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs;
if (skc->skc_flags & KMC_OFFSLAB) {
- *objs = spl_kmem_cache_obj_per_slab;
- *size = P2ROUNDUP(sizeof(spl_kmem_slab_t), PAGE_SIZE);
- return (0);
+ tgt_objs = spl_kmem_cache_obj_per_slab;
+ tgt_size = P2ROUNDUP(sizeof (spl_kmem_slab_t), PAGE_SIZE);
+
+ if ((skc->skc_flags & KMC_KMEM) &&
+ (spl_obj_size(skc) > (SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE)))
+ return (-ENOSPC);
} else {
sks_size = spl_sks_size(skc);
obj_size = spl_obj_size(skc);
-
- if (skc->skc_flags & KMC_KMEM)
- max_size = ((uint32_t)1 << (MAX_ORDER-3)) * PAGE_SIZE;
- else
- max_size = (spl_kmem_cache_max_size * 1024 * 1024);
-
- /* Power of two sized slab */
- for (*size = PAGE_SIZE; *size <= max_size; *size *= 2) {
- *objs = (*size - sks_size) / obj_size;
- if (*objs >= spl_kmem_cache_obj_per_slab)
- return (0);
- }
+ max_size = (spl_kmem_cache_max_size * 1024 * 1024);
+ tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size);
/*
- * Unable to satisfy target objects per slab, fall back to
- * allocating a maximally sized slab and assuming it can
- * contain the minimum objects count use it. If not fail.
+ * KMC_KMEM slabs are allocated by __get_free_pages() which
+ * rounds up to the nearest order. Knowing this, the size
+ * should be rounded up to the next power of two with a hard
+ * maximum defined by the maximum allowed allocation order.
*/
- *size = max_size;
- *objs = (*size - sks_size) / obj_size;
- if (*objs >= (spl_kmem_cache_obj_per_slab_min))
- return (0);
+ if (skc->skc_flags & KMC_KMEM) {
+ max_size = SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE;
+ tgt_size = MIN(max_size,
+ PAGE_SIZE * (1 << MAX(get_order(tgt_size) - 1, 1)));
+ }
+
+ if (tgt_size <= max_size) {
+ tgt_objs = (tgt_size - sks_size) / obj_size;
+ } else {
+ tgt_objs = (max_size - sks_size) / obj_size;
+ tgt_size = (tgt_objs * obj_size) + sks_size;
+ }
}
- return (-ENOSPC);
+ if (tgt_objs == 0)
+ return (-ENOSPC);
+
+ *objs = tgt_objs;
+ *size = tgt_size;
+
+ return (0);
}
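/*
 * Worked example (a sketch assuming 4 KiB pages and 8 target objects
 * per slab): an effective object size of 512 bytes gives a target slab
 * size of 8 * 512 plus the spl_kmem_slab_t header, just over one page.
 * A KMC_KMEM cache rounds this up to an order-1 (8 KiB) allocation and
 * then packs in as many whole objects as fit after the header.
 */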
/*
uint32_t obj_size = spl_obj_size(skc);
int size;
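/*
 * An administrator-supplied spl_kmem_cache_magazine_size overrides the
 * heuristic below; the value is clamped to the documented 2-256 range,
 * e.g. 1 is raised to 2 and 10000 is capped at 256.
 */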
+ if (spl_kmem_cache_magazine_size > 0)
+ return (MAX(MIN(spl_kmem_cache_magazine_size, 256), 2));
+
/* Per-magazine sizes below assume a 4 KiB page size */
if (obj_size > (PAGE_SIZE * 256))
size = 4; /* Minimum 4 MiB per-magazine */
spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
{
spl_kmem_magazine_t *skm;
- int size = sizeof(spl_kmem_magazine_t) +
- sizeof(void *) * skc->skc_mag_size;
+ int size = sizeof (spl_kmem_magazine_t) +
+ sizeof (void *) * skc->skc_mag_size;
- skm = kmem_alloc_node(size, KM_SLEEP, cpu_to_node(cpu));
+ skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
if (skm) {
skm->skm_magic = SKM_MAGIC;
skm->skm_avail = 0;
static void
spl_magazine_free(spl_kmem_magazine_t *skm)
{
- int size = sizeof(spl_kmem_magazine_t) +
- sizeof(void *) * skm->skm_size;
-
ASSERT(skm->skm_magic == SKM_MAGIC);
ASSERT(skm->skm_avail == 0);
-
- kmem_free(skm, size);
+ kfree(skm);
}
/*
if (skc->skc_flags & KMC_NOMAGAZINE)
return (0);
+ skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) *
+ num_possible_cpus(), kmem_flags_convert(KM_SLEEP));
skc->skc_mag_size = spl_magazine_size(skc);
skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
- for_each_online_cpu(i) {
+ for_each_possible_cpu(i) {
skc->skc_mag[i] = spl_magazine_alloc(skc, i);
if (!skc->skc_mag[i]) {
for (i--; i >= 0; i--)
spl_magazine_free(skc->skc_mag[i]);
+ kfree(skc->skc_mag);
return (-ENOMEM);
}
}
if (skc->skc_flags & KMC_NOMAGAZINE)
return;
- for_each_online_cpu(i) {
+ for_each_possible_cpu(i) {
skm = skc->skc_mag[i];
spl_cache_flush(skc, skm, skm->skm_avail);
spl_magazine_free(skm);
- }
+ }
+
+ kfree(skc->skc_mag);
}
/*
*/
spl_kmem_cache_t *
spl_kmem_cache_create(char *name, size_t size, size_t align,
- spl_kmem_ctor_t ctor,
- spl_kmem_dtor_t dtor,
- spl_kmem_reclaim_t reclaim,
- void *priv, void *vmp, int flags)
+ spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim,
+ void *priv, void *vmp, int flags)
{
- spl_kmem_cache_t *skc;
+ gfp_t lflags = kmem_flags_convert(KM_SLEEP);
+ spl_kmem_cache_t *skc;
int rc;
/*
might_sleep();
- /*
- * Allocate memory for a new cache an initialize it. Unfortunately,
- * this usually ends up being a large allocation of ~32k because
- * we need to allocate enough memory for the worst case number of
- * cpus in the magazine, skc_mag[NR_CPUS]. Because of this we
- * explicitly pass KM_NODEBUG to suppress the kmem warning
- */
- skc = kmem_zalloc(sizeof(*skc), KM_SLEEP| KM_NODEBUG);
+ skc = kzalloc(sizeof (*skc), lflags);
if (skc == NULL)
return (NULL);
skc->skc_magic = SKC_MAGIC;
skc->skc_name_size = strlen(name) + 1;
- skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, KM_SLEEP);
+ skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags);
if (skc->skc_name == NULL) {
- kmem_free(skc, sizeof(*skc));
+ kfree(skc);
return (NULL);
}
strncpy(skc->skc_name, name, skc->skc_name_size);
* Objects smaller than spl_kmem_cache_slab_limit can
* use the Linux slab for better space-efficiency. By
* default this functionality is disabled until its
- * performance characters are fully understood.
+ * performance characteristics are fully understood.
*/
if (spl_kmem_cache_slab_limit &&
size <= (size_t)spl_kmem_cache_slab_limit)
if (rc)
goto out;
} else {
- skc->skc_linux_cache = kmem_cache_create(
- skc->skc_name, size, align, 0, NULL);
+ unsigned long slabflags = 0;
+
+ if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) {
+ rc = EINVAL;
+ goto out;
+ }
+
+#if defined(SLAB_USERCOPY)
+ /*
+ * Required for PAX-enabled kernels if the slab is to be
+ * used for copying between user and kernel space.
+ */
+ slabflags |= SLAB_USERCOPY;
+#endif
+
+#if defined(HAVE_KMEM_CACHE_CREATE_USERCOPY)
+ /*
+ * Newer grsec patchset uses kmem_cache_create_usercopy()
+ * instead of the SLAB_USERCOPY flag.
+ */
+ skc->skc_linux_cache = kmem_cache_create_usercopy(
+ skc->skc_name, size, align, slabflags, 0, size, NULL);
+#else
+ skc->skc_linux_cache = kmem_cache_create(
+ skc->skc_name, size, align, slabflags, NULL);
+#endif
if (skc->skc_linux_cache == NULL) {
rc = ENOMEM;
goto out;
}
- kmem_cache_set_allocflags(skc, __GFP_COMP);
+#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS)
+ skc->skc_linux_cache->allocflags |= __GFP_COMP;
+#elif defined(HAVE_KMEM_CACHE_GFPFLAGS)
+ skc->skc_linux_cache->gfpflags |= __GFP_COMP;
+#endif
skc->skc_flags |= KMC_NOMAGAZINE;
}
return (skc);
out:
- kmem_free(skc->skc_name, skc->skc_name_size);
- kmem_free(skc, sizeof(*skc));
+ kfree(skc->skc_name);
+ kfree(skc);
return (NULL);
}
EXPORT_SYMBOL(spl_kmem_cache_create);
/*
- * Register a move callback to for cache defragmentation.
+ * Register a move callback for cache defragmentation.
* XXX: Unimplemented but harmless to stub out for now.
*/
void
spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
kmem_cbrc_t (move)(void *, void *, size_t, void *))
{
- ASSERT(move != NULL);
+ ASSERT(move != NULL);
}
EXPORT_SYMBOL(spl_kmem_cache_set_move);
taskq_cancel_id(spl_kmem_cache_taskq, id);
- /* Wait until all current callers complete, this is mainly
+ /*
+ * Wait until all current callers complete; this is mainly
* to catch the case where a low memory situation triggers a
- * cache reaping action which races with this destroy. */
+ * cache reaping action which races with this destroy.
+ */
wait_event(wq, atomic_read(&skc->skc_ref) == 0);
if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
spl_magazine_destroy(skc);
- spl_slab_reclaim(skc, 0, 1);
+ spl_slab_reclaim(skc);
} else {
ASSERT(skc->skc_flags & KMC_SLAB);
kmem_cache_destroy(skc->skc_linux_cache);
spin_lock(&skc->skc_lock);
- /* Validate there are no objects in use and free all the
- * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
+ /*
+ * Validate there are no objects in use and free all the
+ * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers.
+ */
ASSERT3U(skc->skc_slab_alloc, ==, 0);
ASSERT3U(skc->skc_obj_alloc, ==, 0);
ASSERT3U(skc->skc_slab_total, ==, 0);
ASSERT3U(skc->skc_obj_emergency, ==, 0);
ASSERT(list_empty(&skc->skc_complete_list));
- kmem_free(skc->skc_name, skc->skc_name_size);
spin_unlock(&skc->skc_lock);
- kmem_free(skc, sizeof(*skc));
+ kfree(skc->skc_name);
+ kfree(skc);
}
EXPORT_SYMBOL(spl_kmem_cache_destroy);
skc->skc_slab_max = skc->skc_slab_alloc;
}
- return sko->sko_addr;
+ return (sko->sko_addr);
}
/*
* It is responsible for allocating a new slab, linking it in to the list
* of partial slabs, and then waking any waiters.
*/
-static void
-spl_cache_grow_work(void *data)
+static int
+__spl_cache_grow(spl_kmem_cache_t *skc, int flags)
{
- spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
- spl_kmem_cache_t *skc = ska->ska_cache;
spl_kmem_slab_t *sks;
- sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG);
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+ sks = spl_slab_alloc(skc, flags);
+ spl_fstrans_unmark(cookie);
+
spin_lock(&skc->skc_lock);
if (sks) {
skc->skc_slab_total++;
skc->skc_obj_total += sks->sks_objs;
list_add_tail(&sks->sks_list, &skc->skc_partial_list);
+
+ smp_mb__before_atomic();
+ clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
+ smp_mb__after_atomic();
+ wake_up_all(&skc->skc_waitq);
}
+ spin_unlock(&skc->skc_lock);
+
+ return (sks == NULL ? -ENOMEM : 0);
+}
+
+static void
+spl_cache_grow_work(void *data)
+{
+ spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
+ spl_kmem_cache_t *skc = ska->ska_cache;
+
+ (void)__spl_cache_grow(skc, ska->ska_flags);
atomic_dec(&skc->skc_ref);
+ smp_mb__before_atomic();
clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
- clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
- wake_up_all(&skc->skc_waitq);
- spin_unlock(&skc->skc_lock);
+ smp_mb__after_atomic();
kfree(ska);
}
static int
spl_cache_grow_wait(spl_kmem_cache_t *skc)
{
- return !test_bit(KMC_BIT_GROWING, &skc->skc_flags);
+ return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags));
}
/*
static int
spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
{
- int remaining, rc;
+ int remaining, rc = 0;
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
ASSERT(skc->skc_magic == SKC_MAGIC);
ASSERT((skc->skc_flags & KMC_SLAB) == 0);
might_sleep();
return (rc ? rc : -EAGAIN);
}
+ /*
+ * To reduce the overhead of a context switch and improve NUMA locality,
+ * we first try to allocate a new slab in the current process context
+ * with the KM_NOSLEEP flag. If that fails, the allocation is dispatched
+ * to a taskq worker.
+ *
+ * However, this can't be applied to KMC_VMEM caches due to a bug where
+ * __vmalloc() doesn't honor gfp flags in page table allocation.
+ */
+ if (!(skc->skc_flags & KMC_VMEM)) {
+ rc = __spl_cache_grow(skc, flags | KM_NOSLEEP);
+ if (rc == 0)
+ return (0);
+ }
+
/*
* This is handled by dispatching a work request to the global work
* queue. This allows us to asynchronously allocate a new slab while
if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
spl_kmem_alloc_t *ska;
- ska = kmalloc(sizeof(*ska), flags);
+ ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags));
if (ska == NULL) {
- clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
+ clear_bit_unlock(KMC_BIT_GROWING, &skc->skc_flags);
+ smp_mb__after_atomic();
wake_up_all(&skc->skc_waitq);
return (-ENOMEM);
}
atomic_inc(&skc->skc_ref);
ska->ska_cache = skc;
- ska->ska_flags = flags & ~__GFP_FS;
+ ska->ska_flags = flags;
taskq_init_ent(&ska->ska_tqe);
taskq_dispatch_ent(spl_kmem_cache_taskq,
spl_cache_grow_work, ska, 0, &ska->ska_tqe);
rc = spl_emergency_alloc(skc, flags, obj);
} else {
remaining = wait_event_timeout(skc->skc_waitq,
- spl_cache_grow_wait(skc), HZ);
+ spl_cache_grow_wait(skc), HZ / 10);
- if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) {
+ if (!remaining) {
spin_lock(&skc->skc_lock);
if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
if (skm != skc->skc_mag[smp_processor_id()])
goto out;
- /* Potentially rescheduled to the same CPU but
+ /*
+ * Potentially rescheduled to the same CPU but
* allocations may have occurred from this CPU while
- * we were sleeping so recalculate max refill. */
+ * we were sleeping, so recalculate the max refill.
+ */
refill = MIN(refill, skm->skm_size - skm->skm_avail);
spin_lock(&skc->skc_lock);
/* Grab the next available slab */
sks = list_entry((&skc->skc_partial_list)->next,
- spl_kmem_slab_t, sks_list);
+ spl_kmem_slab_t, sks_list);
ASSERT(sks->sks_magic == SKS_MAGIC);
ASSERT(sks->sks_ref < sks->sks_objs);
ASSERT(!list_empty(&sks->sks_free_list));
- /* Consume as many objects as needed to refill the requested
- * cache. We must also be careful not to overfill it. */
- while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) {
+ /*
+ * Consume as many objects as needed to refill the requested
+ * cache. We must also be careful not to overfill it.
+ */
+ while (sks->sks_ref < sks->sks_objs && refill-- > 0 &&
+ ++count) {
ASSERT(skm->skm_avail < skm->skm_size);
ASSERT(count < skm->skm_size);
- skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks);
+ skm->skm_objs[skm->skm_avail++] =
+ spl_cache_obj(skc, sks);
}
/* Move slab to skc_complete_list when full */
sks->sks_ref--;
skc->skc_obj_alloc--;
- /* Move slab to skc_partial_list when no longer full. Slabs
+ /*
+ * Move slab to skc_partial_list when no longer full. Slabs
* are added to the head to keep the partial list in quasi-full
- * sorted order. Fuller at the head, emptier at the tail. */
+ * sorted order. Fuller at the head, emptier at the tail.
+ */
if (sks->sks_ref == (sks->sks_objs - 1)) {
list_del(&sks->sks_list);
list_add(&sks->sks_list, &skc->skc_partial_list);
}
- /* Move empty slabs to the end of the partial list so
- * they can be easily found and freed during reclamation. */
+ /*
+ * Move empty slabs to the end of the partial list so
+ * they can be easily found and freed during reclamation.
+ */
if (sks->sks_ref == 0) {
list_del(&sks->sks_list);
list_add_tail(&sks->sks_list, &skc->skc_partial_list);
spl_kmem_magazine_t *skm;
void *obj = NULL;
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
ASSERT(skc->skc_magic == SKC_MAGIC);
ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
- ASSERT(flags & KM_SLEEP);
-
- atomic_inc(&skc->skc_ref);
/*
* Allocate directly from a Linux slab. All optimizations are left
*/
if (skc->skc_flags & KMC_SLAB) {
struct kmem_cache *slc = skc->skc_linux_cache;
-
do {
- obj = kmem_cache_alloc(slc, flags | __GFP_COMP);
+ obj = kmem_cache_alloc(slc, kmem_flags_convert(flags));
} while ((obj == NULL) && !(flags & KM_NOSLEEP));
goto ret;
local_irq_disable();
restart:
- /* Safe to update per-cpu structure without lock, but
+ /*
+ * Safe to update per-cpu structure without lock, but
* in the restart case we must be careful to reacquire
* the local magazine since this may have changed
- * when we need to grow the cache. */
+ * when we need to grow the cache.
+ */
skm = skc->skc_mag[smp_processor_id()];
ASSERT(skm->skm_magic == SKM_MAGIC);
skm->skm_age = jiffies;
} else {
obj = spl_cache_refill(skc, skm, flags);
- if (obj == NULL)
+ if ((obj == NULL) && !(flags & KM_NOSLEEP))
goto restart;
+
+ local_irq_enable();
+ goto ret;
}
local_irq_enable();
prefetchw(obj);
}
- atomic_dec(&skc->skc_ref);
-
return (obj);
}
-
EXPORT_SYMBOL(spl_kmem_cache_alloc);
/*
{
spl_kmem_magazine_t *skm;
unsigned long flags;
+ int do_reclaim = 0;
+ int do_emergency = 0;
ASSERT(skc->skc_magic == SKC_MAGIC);
ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
- atomic_inc(&skc->skc_ref);
/*
* Run the destructor
*/
if (skc->skc_flags & KMC_SLAB) {
kmem_cache_free(skc->skc_linux_cache, obj);
- goto out;
+ return;
}
/*
- * Only virtual slabs may have emergency objects and these objects
- * are guaranteed to have physical addresses. They must be removed
- * from the tree of emergency objects and the freed.
+ * While a cache has outstanding emergency objects, all freed objects
+ * must be checked. However, since emergency objects will never use
+ * a virtual address, these objects can be safely excluded as an
+ * optimization.
*/
- if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj)) {
- spl_emergency_free(skc, obj);
- goto out;
+ if (!is_vmalloc_addr(obj)) {
+ spin_lock(&skc->skc_lock);
+ do_emergency = (skc->skc_obj_emergency > 0);
+ spin_unlock(&skc->skc_lock);
+
+ if (do_emergency && (spl_emergency_free(skc, obj) == 0))
+ return;
}
local_irq_save(flags);
- /* Safe to update per-cpu structure without lock, but
+ /*
+ * Safe to update per-cpu structure without lock, but
* no remote memory allocation tracking is being performed, so
* it is entirely possible to allocate an object from one
- * CPU cache and return it to another. */
+ * CPU cache and return it to another.
+ */
skm = skc->skc_mag[smp_processor_id()];
ASSERT(skm->skm_magic == SKM_MAGIC);
- /* Per-CPU cache full, flush it to make space */
- if (unlikely(skm->skm_avail >= skm->skm_size))
+ /*
+ * Per-CPU cache full, flush it to make space for this object;
+ * this may result in an empty slab which can be reclaimed once
+ * interrupts are re-enabled.
+ */
+ if (unlikely(skm->skm_avail >= skm->skm_size)) {
spl_cache_flush(skc, skm, skm->skm_refill);
+ do_reclaim = 1;
+ }
/* Available space in cache, use it */
skm->skm_objs[skm->skm_avail++] = obj;
local_irq_restore(flags);
-out:
- atomic_dec(&skc->skc_ref);
+
+ if (do_reclaim)
+ spl_slab_reclaim(skc);
}
EXPORT_SYMBOL(spl_kmem_cache_free);
spl_kmem_cache_t *skc;
int alloc = 0;
+ /*
+ * No shrinking in a transaction context. Can cause deadlocks.
+ */
+ if (sc->nr_to_scan && spl_fstrans_check())
+ return (SHRINK_STOP);
+
down_read(&spl_kmem_cache_sem);
list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
if (sc->nr_to_scan) {
#ifdef HAVE_SPLIT_SHRINKER_CALLBACK
uint64_t oldalloc = skc->skc_obj_alloc;
spl_kmem_cache_reap_now(skc,
- MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1));
+ MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1));
if (oldalloc > skc->skc_obj_alloc)
alloc += oldalloc - skc->skc_obj_alloc;
#else
spl_kmem_cache_reap_now(skc,
- MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1));
+ MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1));
alloc += skc->skc_obj_alloc;
#endif /* HAVE_SPLIT_SHRINKER_CALLBACK */
} else {
atomic_inc(&skc->skc_ref);
/*
- * Execute the registered reclaim callback if it exists. The
- * per-cpu caches will be drained when is set KMC_EXPIRE_MEM.
+ * Execute the registered reclaim callback if it exists.
*/
if (skc->skc_flags & KMC_SLAB) {
if (skc->skc_reclaim)
skc->skc_reclaim(skc->skc_private);
-
- if (spl_kmem_cache_expire & KMC_EXPIRE_MEM)
- kmem_cache_shrink(skc->skc_linux_cache);
-
goto out;
}
spin_lock(&skc->skc_lock);
do_reclaim =
(skc->skc_slab_total > 0) &&
- ((skc->skc_slab_total - skc->skc_slab_alloc) == 0) &&
+ ((skc->skc_slab_total-skc->skc_slab_alloc) == 0) &&
(skc->skc_obj_alloc < objects);
objects = skc->skc_obj_alloc;
} while (do_reclaim);
}
- /* Reclaim from the magazine then the slabs ignoring age and delay. */
+ /* Reclaim from the magazine and free all now-empty slabs. */
if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) {
spl_kmem_magazine_t *skm;
unsigned long irq_flags;
local_irq_restore(irq_flags);
}
- spl_slab_reclaim(skc, count, 1);
- clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
- smp_wmb();
+ spl_slab_reclaim(skc);
+ clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags);
+ smp_mb__after_atomic();
wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
out:
atomic_dec(&skc->skc_ref);
init_rwsem(&spl_kmem_cache_sem);
INIT_LIST_HEAD(&spl_kmem_cache_list);
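/*
 * A sketch of the taskq_create() arguments below (per the SPL taskq
 * API): thread count, thread priority, minalloc/maxalloc entry bounds,
 * and flags. TASKQ_DYNAMIC lets the queue spawn and retire worker
 * threads on demand instead of keeping all
 * spl_kmem_cache_kmem_threads running.
 */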
spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
- 1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE);
+ spl_kmem_cache_kmem_threads, maxclsyspri,
+ spl_kmem_cache_kmem_threads * 8, INT_MAX,
+ TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
spl_register_shrinker(&spl_kmem_cache_shrinker);
return (0);