/*
* The minimum amount of memory measured in pages to be free at all
* times on the system. This is similar to Linux's zone->pages_min
- * multipled by the number of zones and is sized based on that.
+ * multiplied by the number of zones and is sized based on that.
*/
pgcnt_t minfree = 0;
EXPORT_SYMBOL(minfree);
/*
* The desired amount of memory measured in pages to be free at all
* times on the system. This is similar to Linux's zone->pages_low
- * multipled by the number of zones and is sized based on that.
+ * multiplied by the number of zones and is sized based on that.
* Assuming all zones are being used roughly equally, when we drop
- * below this threshold async page reclamation is triggered.
+ * below this threshold asynchronous page reclamation is triggered.
*/
pgcnt_t desfree = 0;
EXPORT_SYMBOL(desfree);
/*
* When above this amount of memory measured in pages the system is
* determined to have enough free memory. This is similar to Linux's
- * zone->pages_high multipled by the number of zones and is sized based
+ * zone->pages_high multiplied by the number of zones and is sized based
* on that. Assuming all zones are being used roughly equally, when
- * async page reclamation reaches this threshold it stops.
+ * asynchronous page reclamation reaches this threshold it stops.
*/
pgcnt_t lotsfree = 0;
EXPORT_SYMBOL(lotsfree);
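+/*
+ * Not part of this change, a minimal sketch of how these three
+ * watermarks could be sized from the per-zone values referenced
+ * above. The zone->pages_min/pages_low/pages_high fields are the
+ * older names; newer kernels expose the same values through the
+ * min_wmark_pages()/low_wmark_pages()/high_wmark_pages() accessors
+ * used below, iterating with for_each_zone()/populated_zone().
+ *
+ *     struct zone *zone;
+ *
+ *     for_each_zone(zone) {
+ *             if (!populated_zone(zone))
+ *                     continue;
+ *
+ *             minfree  += min_wmark_pages(zone);
+ *             desfree  += low_wmark_pages(zone);
+ *             lotsfree += high_wmark_pages(zone);
+ *     }
+ */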
#endif /* NEED_GET_ZONE_COUNTS */
EXPORT_SYMBOL(spl_global_page_state);
-#ifndef HAVE_INVALIDATE_INODES
+#if !defined(HAVE_INVALIDATE_INODES) && !defined(HAVE_INVALIDATE_INODES_CHECK)
invalidate_inodes_t invalidate_inodes_fn = SYMBOL_POISON;
EXPORT_SYMBOL(invalidate_inodes_fn);
-#endif /* HAVE_INVALIDATE_INODES */
+#endif /* !HAVE_INVALIDATE_INODES && !HAVE_INVALIDATE_INODES_CHECK */
#ifndef HAVE_SHRINK_DCACHE_MEMORY
shrink_dcache_memory_t shrink_dcache_memory_fn = SYMBOL_POISON;
EXPORT_SYMBOL(vmem_list);
static kmem_debug_t *
-kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, void *addr)
+kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, const void *addr)
{
struct hlist_head *head;
struct hlist_node *node;
EXPORT_SYMBOL(kmem_alloc_track);
void
-kmem_free_track(void *ptr, size_t size)
+kmem_free_track(const void *ptr, size_t size)
{
kmem_debug_t *dptr;
SENTRY;
EXPORT_SYMBOL(vmem_alloc_track);
void
-vmem_free_track(void *ptr, size_t size)
+vmem_free_track(const void *ptr, size_t size)
{
kmem_debug_t *dptr;
SENTRY;
"large kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
(unsigned long long) size, flags, func, line,
kmem_alloc_used_read(), kmem_alloc_max);
- spl_debug_dumpstack(NULL);
+ dump_stack();
}
/* Use the correct allocator */
EXPORT_SYMBOL(kmem_alloc_debug);
void
-kmem_free_debug(void *ptr, size_t size)
+kmem_free_debug(const void *ptr, size_t size)
{
SENTRY;
EXPORT_SYMBOL(vmem_alloc_debug);
void
-vmem_free_debug(void *ptr, size_t size)
+vmem_free_debug(const void *ptr, size_t size)
{
SENTRY;
* Slab allocation interfaces
*
* While the Linux slab implementation was inspired by the Solaris
- * implemenation I cannot use it to emulate the Solaris APIs. I
+ * implementation, I cannot use it to emulate the Solaris APIs. I
* require two features which are not provided by the Linux slab.
*
* 1) Constructors AND destructors. Recent versions of the Linux
* Because of memory fragmentation the Linux slab which is backed
* by kmalloc'ed memory performs very badly when confronted with
* large numbers of large allocations. Basing the slab on the
- * virtual address space removes the need for contigeous pages
+ * virtual address space removes the need for contiguous pages
* and greatly improves performance for large allocations.
*
* For these reasons, the SPL has its own slab implementation with
*
* XXX: Improve the partial slab list by carefully maintaining a
* strict ordering of fullest to emptiest slabs based on
- * the slab reference count. This gaurentees the when freeing
+ * the slab reference count. This guarantees that when freeing
* slabs back to the system we need only linearly traverse the
* last N slabs in the list to discover all the freeable slabs.
*
* XXX: NUMA awareness for optionally allocating memory close to a
- * particular core. This can be adventageous if you know the slab
+ * particular core. This can be advantageous if you know the slab
* object will be short lived and primarily accessed from one core.
*
* XXX: Slab coloring may also yield performance improvements and would
* been filed at kernel.org to track the issue.
*
* https://bugzilla.kernel.org/show_bug.cgi?id=30702
+ *
+ * NOTE: Only set PF_MEMALLOC if it's not already set, and
+ * then only clear it when we were the one who set it.
*/
- if (!(flags & __GFP_FS))
+ if (!(flags & __GFP_FS) && !(current->flags & PF_MEMALLOC)) {
current->flags |= PF_MEMALLOC;
-
- ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
-
- if (!(flags & __GFP_FS))
+ ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
current->flags &= ~PF_MEMALLOC;
+ } else {
+ ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
+ }
}
/* Resulting allocated memory will be page aligned */
ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
ASSERT(ISP2(size));
+ /*
+ * The Linux direct reclaim path uses this out of band value to
+ * determine if forward progress is being made. Normally this is
+ * incremented by kmem_freepages() which is part of the various
+ * Linux slab implementations. However, since we are using none
+ * of that infrastructure we are responsible for incrementing it.
+ */
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
+
if (skc->skc_flags & KMC_KMEM)
free_pages((unsigned long)ptr, get_order(size));
else
* For small objects we use kmem_alloc() because as long as you are
* only requesting a small number of pages (ideally just one) it's cheap.
* However, when you start requesting multiple pages with kmem_alloc()
- * it gets increasingly expensive since it requires contigeous pages.
+ * it gets increasingly expensive since it requires contiguous pages.
* For this reason we shift to vmem_alloc() for slabs of large objects
- * which removes the need for contigeous pages. We do not use
+ * which removes the need for contiguous pages. We do not use
* vmem_alloc() in all cases because there is significant locking
* overhead in __get_vm_area_node(). This function takes a single
- * global lock when aquiring an available virtual address range which
+ * global lock when acquiring an available virtual address range which
* serializes all vmem_alloc()'s for all slab caches. Using slightly
* different allocation functions for small and large objects should
* give us the best of both worlds.
* All empty slabs are at the end of skc->skc_partial_list,
* therefore once a non-empty slab is found we can stop
* scanning. Additionally, stop when reaching the target
- * reclaim 'count' if a non-zero threshhold is given.
+ * reclaim 'count' if a non-zero threshold is given.
*/
- if ((sks->sks_ref > 0) || (count && i > count))
+ if ((sks->sks_ref > 0) || (count && i >= count))
break;
if (time_after(jiffies,sks->sks_age+skc->skc_delay*HZ)||flag) {
/*
* Called regularly to keep a downward pressure on the size of idle
* magazines and to release free slabs from the cache. This function
- * never calls the registered reclaim function, that only occures
+ * never calls the registered reclaim function, that only occurs
* under memory pressure or with a direct call to spl_kmem_reap().
*/
static void
}
/*
- * Allocate a per-cpu magazine to assoicate with a specific core.
+ * Allocate a per-cpu magazine to associate with a specific core.
*/
static spl_kmem_magazine_t *
spl_magazine_alloc(spl_kmem_cache_t *skc, int node)
}
/*
- * Free a per-cpu magazine assoicated with a specific core.
+ * Free a per-cpu magazine associated with a specific core.
*/
static void
spl_magazine_free(spl_kmem_magazine_t *skm)
if (current_thread_info()->preempt_count || irqs_disabled())
kmem_flags = KM_NOSLEEP;
- /* Allocate memry for a new cache an initialize it. Unfortunately,
+ /* Allocate memory for a new cache and initialize it. Unfortunately,
* this usually ends up being a large allocation of ~32k because
* we need to allocate enough memory for the worst case number of
* cpus in the magazine, skc_mag[NR_CPUS]. Because of this we
EXPORT_SYMBOL(spl_kmem_cache_set_move);
/*
- * Destroy a cache and all objects assoicated with the cache.
+ * Destroy a cache and all objects associated with the cache.
*/
void
spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
}
/*
- * No available objects on any slabsi, create a new slab. Since this
- * is an expensive operation we do it without holding the spinlock and
- * only briefly aquire it when we link in the fully allocated and
+ * No available objects on any slabs, create a new slab. Since this
+ * is an expensive operation we do it without holding the spin lock and
+ * only briefly acquire it when we link in the fully allocated and
* constructed slab.
*/
static spl_kmem_slab_t *
SGOTO(out, rc);
/* Potentially rescheduled to the same CPU but
- * allocations may have occured from this CPU while
+ * allocations may have occurred from this CPU while
* we were sleeping so recalculate max refill. */
refill = MIN(refill, skm->skm_size - skm->skm_avail);
list_add(&sks->sks_list, &skc->skc_partial_list);
}
- /* Move emply slabs to the end of the partial list so
+ /* Move empty slabs to the end of the partial list so
* they can be easily found and freed during reclamation. */
if (sks->sks_ref == 0) {
list_del(&sks->sks_list);
restart:
/* Safe to update per-cpu structure without lock, but
- * in the restart case we must be careful to reaquire
+ * in the restart case we must be careful to reacquire
* the local magazine since this may have changed
* when we need to grow the cache. */
skm = skc->skc_mag[smp_processor_id()];
EXPORT_SYMBOL(spl_kmem_cache_free);
/*
- * The generic shrinker function for all caches. Under linux a shrinker
- * may not be tightly coupled with a slab cache. In fact linux always
- * systematically trys calling all registered shrinker callbacks which
+ * The generic shrinker function for all caches. Under Linux a shrinker
+ * may not be tightly coupled with a slab cache. In fact Linux always
+ * systematically tries calling all registered shrinker callbacks which
* report that they contain unused objects. Because of this we only
* register one shrinker function in the shim layer for all slab caches.
* We always attempt to shrink all caches when this generic shrinker
* is called. The shrinker should return the number of free objects
* in the cache when called with nr_to_scan == 0 but not attempt to
* free any objects. When nr_to_scan > 0 it is a request that nr_to_scan
- * objects should be freed, because Solaris semantics are to free
- * all available objects we may free more objects than requested.
+ * objects should be freed, which differs from Solaris semantics.
+ * Solaris semantics are to free all available objects which may (and
+ * probably will) be more objects than the requested nr_to_scan.
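+ *
+ * Illustrative only, not part of this change: with the kernel's
+ * register_shrinker() and a struct shrinker from this era, the single
+ * shim-layer shrinker might be declared and registered roughly as
+ *
+ *     static struct shrinker spl_kmem_cache_shrinker = {
+ *             .shrink = __spl_kmem_cache_generic_shrinker,
+ *             .seeks = DEFAULT_SEEKS,
+ *     };
+ *
+ *     register_shrinker(&spl_kmem_cache_shrinker);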
*/
static int
__spl_kmem_cache_generic_shrinker(struct shrinker *shrink,
down_read(&spl_kmem_cache_sem);
list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
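+ /*
+ * nr_to_scan is expressed in objects while
+ * spl_kmem_cache_reap_now() takes a slab count, so the shift
+ * by fls64(skc->skc_slab_objs) converts the request into an
+ * approximate number of slabs and we always reap at least one.
+ */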
if (sc->nr_to_scan)
- spl_kmem_cache_reap_now(skc);
+ spl_kmem_cache_reap_now(skc,
+ MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1));
/*
* Presume everything alloc'ed is reclaimable, this ensures
* effort and we do not want to thrash creating and destroying slabs.
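+ *
+ * The 'count' argument is the target number of slabs, not objects, to
+ * reclaim in this pass; it is passed through to spl_slab_reclaim().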
*/
void
-spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
+spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
{
SENTRY;
atomic_inc(&skc->skc_ref);
- if (skc->skc_reclaim)
- skc->skc_reclaim(skc->skc_private);
+ /*
+ * When a reclaim function is available it may be invoked repeatedly
+ * until at least a single slab can be freed. This ensures that we
+ * do free memory back to the system. This helps minimize the chance
+ * of an OOM event when the bulk of memory is used by the slab.
+ *
+ * When free slabs are already available the reclaim callback will be
+ * skipped. Additionally, if no forward progress is detected despite
+ * a reclaim function the cache will be skipped to avoid deadlock.
+ *
+ * Longer term this would be the correct place to add the code which
+ * repacks the slabs in order to minimize fragmentation.
+ */
+ if (skc->skc_reclaim) {
+ uint64_t objects = UINT64_MAX;
+ int do_reclaim;
- spl_slab_reclaim(skc, skc->skc_reap, 0);
+ do {
+ spin_lock(&skc->skc_lock);
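+ /*
+ * Invoke the callback again only while no completely empty
+ * slab exists yet (every slab still holds allocated objects)
+ * and the previous pass made forward progress, that is,
+ * skc_obj_alloc dropped below the last recorded value.
+ */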
+ do_reclaim =
+ (skc->skc_slab_total > 0) &&
+ ((skc->skc_slab_total - skc->skc_slab_alloc) == 0) &&
+ (skc->skc_obj_alloc < objects);
+
+ objects = skc->skc_obj_alloc;
+ spin_unlock(&skc->skc_lock);
+
+ if (do_reclaim)
+ skc->skc_reclaim(skc->skc_private);
+
+ } while (do_reclaim);
+ }
+
+ /* Reclaim from the cache, ignoring its age and delay. */
+ spl_slab_reclaim(skc, count, 1);
clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
atomic_dec(&skc->skc_ref);
*/
spl_kmem_init_globals();
-#ifndef HAVE_INVALIDATE_INODES
+#if !defined(HAVE_INVALIDATE_INODES) && !defined(HAVE_INVALIDATE_INODES_CHECK)
invalidate_inodes_fn = (invalidate_inodes_t)
spl_kallsyms_lookup_name("invalidate_inodes");
if (!invalidate_inodes_fn) {
printk(KERN_ERR "Error: Unknown symbol invalidate_inodes\n");
return -EFAULT;
}
-#endif /* HAVE_INVALIDATE_INODES */
+#endif /* !HAVE_INVALIDATE_INODES && !HAVE_INVALIDATE_INODES_CHECK */
#ifndef HAVE_SHRINK_DCACHE_MEMORY
+ /* When shrink_dcache_memory_fn == NULL support is disabled */
shrink_dcache_memory_fn = (shrink_dcache_memory_t)
- spl_kallsyms_lookup_name("shrink_dcache_memory");
- if (!shrink_dcache_memory_fn) {
- printk(KERN_ERR "Error: Unknown symbol shrink_dcache_memory\n");
- return -EFAULT;
- }
+ spl_kallsyms_lookup_name("shrink_dcache_memory");
#endif /* HAVE_SHRINK_DCACHE_MEMORY */
#ifndef HAVE_SHRINK_ICACHE_MEMORY
+ /* When shrink_icache_memory_fn == NULL support is disabled */
shrink_icache_memory_fn = (shrink_icache_memory_t)
- spl_kallsyms_lookup_name("shrink_icache_memory");
- if (!shrink_icache_memory_fn) {
- printk(KERN_ERR "Error: Unknown symbol shrink_icache_memory\n");
- return -EFAULT;
- }
+ spl_kallsyms_lookup_name("shrink_icache_memory");
#endif /* HAVE_SHRINK_ICACHE_MEMORY */
return 0;