Constify memory management functions

[mirror_spl.git] / module / spl / spl-kmem.c
diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c

index 112b0e31866d46a44a49a079af2cc7df03d95401..a6d09f9a25752681c8f5fc1c4b2b62b3c655bebb 100644 (file)
--- a/module/spl/spl-kmem.c
+++ b/module/spl/spl-kmem.c
@@ -36,7 +36,7 @@
  /*
   * The minimum amount of memory measured in pages to be free at all
   * times on the system.  This is similar to Linux's zone->pages_min
- * multipled by the number of zones and is sized based on that.
+ * multiplied by the number of zones and is sized based on that.
   */
  pgcnt_t minfree = 0;
  EXPORT_SYMBOL(minfree);
@@ -44,9 +44,9 @@ EXPORT_SYMBOL(minfree);
  /*
   * The desired amount of memory measured in pages to be free at all
   * times on the system.  This is similar to Linux's zone->pages_low
- * multipled by the number of zones and is sized based on that.
+ * multiplied by the number of zones and is sized based on that.
   * Assuming all zones are being used roughly equally, when we drop
- * below this threshold async page reclamation is triggered.
+ * below this threshold asynchronous page reclamation is triggered.
   */
  pgcnt_t desfree = 0;
  EXPORT_SYMBOL(desfree);
@@ -54,9 +54,9 @@ EXPORT_SYMBOL(desfree);
  /*
   * When above this amount of memory measures in pages the system is
   * determined to have enough free memory.  This is similar to Linux's
- * zone->pages_high multipled by the number of zones and is sized based
+ * zone->pages_high multiplied by the number of zones and is sized based
   * on that.  Assuming all zones are being used roughly equally, when
- * async page reclamation reaches this threshold it stops.
+ * asynchronous page reclamation reaches this threshold it stops.
   */
  pgcnt_t lotsfree = 0;
  EXPORT_SYMBOL(lotsfree);
@@ -180,10 +180,10 @@ spl_global_page_state(spl_zone_stat_item_t item)
  #endif /* NEED_GET_ZONE_COUNTS */
  EXPORT_SYMBOL(spl_global_page_state);
  
-#ifndef HAVE_INVALIDATE_INODES
+#if !defined(HAVE_INVALIDATE_INODES) && !defined(HAVE_INVALIDATE_INODES_CHECK)
  invalidate_inodes_t invalidate_inodes_fn = SYMBOL_POISON;
  EXPORT_SYMBOL(invalidate_inodes_fn);
-#endif /* HAVE_INVALIDATE_INODES */
+#endif /* !HAVE_INVALIDATE_INODES && !HAVE_INVALIDATE_INODES_CHECK */
  
  #ifndef HAVE_SHRINK_DCACHE_MEMORY
  shrink_dcache_memory_t shrink_dcache_memory_fn = SYMBOL_POISON;
@@ -385,7 +385,7 @@ EXPORT_SYMBOL(vmem_table);
  EXPORT_SYMBOL(vmem_list);
  
  static kmem_debug_t *
-kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, void *addr)
+kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, const void *addr)
  {
         struct hlist_head *head;
         struct hlist_node *node;
@@ -504,7 +504,7 @@ out:
  EXPORT_SYMBOL(kmem_alloc_track);
  
  void
-kmem_free_track(void *ptr, size_t size)
+kmem_free_track(const void *ptr, size_t size)
  {
         kmem_debug_t *dptr;
         SENTRY;
@@ -619,7 +619,7 @@ out:
  EXPORT_SYMBOL(vmem_alloc_track);
  
  void
-vmem_free_track(void *ptr, size_t size)
+vmem_free_track(const void *ptr, size_t size)
  {
         kmem_debug_t *dptr;
         SENTRY;
@@ -672,7 +672,7 @@ kmem_alloc_debug(size_t size, int flags, const char *func, int line,
                     "large kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
                     (unsigned long long) size, flags, func, line,
                     kmem_alloc_used_read(), kmem_alloc_max);
-               spl_debug_dumpstack(NULL);
+               dump_stack();
         }
  
         /* Use the correct allocator */
@@ -706,7 +706,7 @@ kmem_alloc_debug(size_t size, int flags, const char *func, int line,
  EXPORT_SYMBOL(kmem_alloc_debug);
  
  void
-kmem_free_debug(void *ptr, size_t size)
+kmem_free_debug(const void *ptr, size_t size)
  {
         SENTRY;
  
@@ -758,7 +758,7 @@ vmem_alloc_debug(size_t size, int flags, const char *func, int line)
  EXPORT_SYMBOL(vmem_alloc_debug);
  
  void
-vmem_free_debug(void *ptr, size_t size)
+vmem_free_debug(const void *ptr, size_t size)
  {
         SENTRY;
  
@@ -782,7 +782,7 @@ EXPORT_SYMBOL(vmem_free_debug);
   * Slab allocation interfaces
   *
   * While the Linux slab implementation was inspired by the Solaris
- * implemenation I cannot use it to emulate the Solaris APIs.  I
+ * implementation I cannot use it to emulate the Solaris APIs.  I
   * require two features which are not provided by the Linux slab.
   *
   * 1) Constructors AND destructors.  Recent versions of the Linux
@@ -797,7 +797,7 @@ EXPORT_SYMBOL(vmem_free_debug);
   *    Because of memory fragmentation the Linux slab which is backed
   *    by kmalloc'ed memory performs very badly when confronted with
   *    large numbers of large allocations.  Basing the slab on the
- *    virtual address space removes the need for contigeous pages
+ *    virtual address space removes the need for contiguous pages
   *    and greatly improve performance for large allocations.
   *
   * For these reasons, the SPL has its own slab implementation with
@@ -811,12 +811,12 @@ EXPORT_SYMBOL(vmem_free_debug);
   *
   * XXX: Improve the partial slab list by carefully maintaining a
   *      strict ordering of fullest to emptiest slabs based on
- *      the slab reference count.  This gaurentees the when freeing
+ *      the slab reference count.  This guarantees that when freeing
   *      slabs back to the system we need only linearly traverse the
   *      last N slabs in the list to discover all the freeable slabs.
   *
   * XXX: NUMA awareness for optionally allocating memory close to a
- *      particular core.  This can be adventageous if you know the slab
+ *      particular core.  This can be advantageous if you know the slab
   *      object will be short lived and primarily accessed from one core.
   *
   * XXX: Slab coloring may also yield performance improvements and would
@@ -855,14 +855,17 @@ kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
                  * been filed at kernel.org to track the issue.
                  *
                  * https://bugzilla.kernel.org/show_bug.cgi?id=30702
+                *
+                * NOTE: Only set PF_MEMALLOC if it's not already set, and
+                * then only clear it when we were the one who set it.
                  */
-               if (!(flags & __GFP_FS))
+               if (!(flags & __GFP_FS) && !(current->flags & PF_MEMALLOC)) {
                         current->flags |= PF_MEMALLOC;
-
-               ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
-
-               if (!(flags & __GFP_FS))
+                       ptr = __vmalloc(size, flags|__GFP_HIGHMEM, PAGE_KERNEL);
                         current->flags &= ~PF_MEMALLOC;
+               } else {
+                       ptr = __vmalloc(size, flags|__GFP_HIGHMEM, PAGE_KERNEL);
+               }
         }
  
         /* Resulting allocated memory will be page aligned */
@@ -877,6 +880,16 @@ kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
         ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
         ASSERT(ISP2(size));
  
+       /*
+        * The Linux direct reclaim path uses this out of band value to
+        * determine if forward progress is being made.  Normally this is
+        * incremented by kmem_freepages() which is part of the various
+        * Linux slab implementations.  However, since we are using none
+        * of that infrastructure we are responsible for incrementing it.
+        */
+       if (current->reclaim_state)
+               current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
+
         if (skc->skc_flags & KMC_KMEM)
                 free_pages((unsigned long)ptr, get_order(size));
         else
@@ -935,12 +948,12 @@ spl_offslab_size(spl_kmem_cache_t *skc)
   * For small objects we use kmem_alloc() because as long as you are
   * only requesting a small number of pages (ideally just one) its cheap.
   * However, when you start requesting multiple pages with kmem_alloc()
- * it gets increasingly expensive since it requires contigeous pages.
+ * it gets increasingly expensive since it requires contiguous pages.
   * For this reason we shift to vmem_alloc() for slabs of large objects
- * which removes the need for contigeous pages.  We do not use
+ * which removes the need for contiguous pages.  We do not use
   * vmem_alloc() in all cases because there is significant locking
   * overhead in __get_vm_area_node().  This function takes a single
- * global lock when aquiring an available virtual address range which
+ * global lock when acquiring an available virtual address range which
   * serializes all vmem_alloc()'s for all slab caches.  Using slightly
   * different allocation functions for small and large objects should
   * give us the best of both worlds.
@@ -1082,9 +1095,9 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
                  * All empty slabs are at the end of skc->skc_partial_list,
                  * therefore once a non-empty slab is found we can stop
                  * scanning.  Additionally, stop when reaching the target
-                * reclaim 'count' if a non-zero threshhold is given.
+                * reclaim 'count' if a non-zero threshold is given.
                  */
-               if ((sks->sks_ref > 0) || (count && i > count))
+               if ((sks->sks_ref > 0) || (count && i >= count))
                         break;
  
                 if (time_after(jiffies,sks->sks_age+skc->skc_delay*HZ)||flag) {
@@ -1157,7 +1170,7 @@ spl_magazine_age(void *data)
  /*
   * Called regularly to keep a downward pressure on the size of idle
   * magazines and to release free slabs from the cache.  This function
- * never calls the registered reclaim function, that only occures
+ * never calls the registered reclaim function, that only occurs
   * under memory pressure or with a direct call to spl_kmem_reap().
   */
  static void
@@ -1247,7 +1260,7 @@ spl_magazine_size(spl_kmem_cache_t *skc)
  }
  
  /*
- * Allocate a per-cpu magazine to assoicate with a specific core.
+ * Allocate a per-cpu magazine to associate with a specific core.
   */
  static spl_kmem_magazine_t *
  spl_magazine_alloc(spl_kmem_cache_t *skc, int node)
@@ -1272,7 +1285,7 @@ spl_magazine_alloc(spl_kmem_cache_t *skc, int node)
  }
  
  /*
- * Free a per-cpu magazine assoicated with a specific core.
+ * Free a per-cpu magazine associated with a specific core.
   */
  static void
  spl_magazine_free(spl_kmem_magazine_t *skm)
@@ -1379,7 +1392,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
         if (current_thread_info()->preempt_count || irqs_disabled())
                 kmem_flags = KM_NOSLEEP;
  
-       /* Allocate memry for a new cache an initialize it.  Unfortunately,
+       /* Allocate memory for a new cache and initialize it.  Unfortunately,
          * this usually ends up being a large allocation of ~32k because
          * we need to allocate enough memory for the worst case number of
          * cpus in the magazine, skc_mag[NR_CPUS].  Because of this we
@@ -1475,7 +1488,7 @@ spl_kmem_cache_set_move(kmem_cache_t *skc,
  EXPORT_SYMBOL(spl_kmem_cache_set_move);
  
  /*
- * Destroy a cache and all objects assoicated with the cache.
+ * Destroy a cache and all objects associated with the cache.
   */
  void
  spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
@@ -1564,9 +1577,9 @@ spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
  }
  
  /*
- * No available objects on any slabsi, create a new slab.  Since this
- * is an expensive operation we do it without holding the spinlock and
- * only briefly aquire it when we link in the fully allocated and
+ * No available objects on any slabs, create a new slab.  Since this
+ * is an expensive operation we do it without holding the spin lock and
+ * only briefly acquire it when we link in the fully allocated and
   * constructed slab.
   */
  static spl_kmem_slab_t *
@@ -1639,7 +1652,7 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
                                 SGOTO(out, rc);
  
                         /* Potentially rescheduled to the same CPU but
-                        * allocations may have occured from this CPU while
+                        * allocations may have occurred from this CPU while
                          * we were sleeping so recalculate max refill. */
                         refill = MIN(refill, skm->skm_size - skm->skm_avail);
  
@@ -1707,7 +1720,7 @@ spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
                 list_add(&sks->sks_list, &skc->skc_partial_list);
         }
  
-       /* Move emply slabs to the end of the partial list so
+       /* Move empty slabs to the end of the partial list so
          * they can be easily found and freed during reclamation. */
         if (sks->sks_ref == 0) {
                 list_del(&sks->sks_list);
@@ -1774,7 +1787,7 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
  
  restart:
         /* Safe to update per-cpu structure without lock, but
-        * in the restart case we must be careful to reaquire
+        * in the restart case we must be careful to reacquire
          * the local magazine since this may have changed
          * when we need to grow the cache. */
         skm = skc->skc_mag[smp_processor_id()];
@@ -1845,17 +1858,18 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
  EXPORT_SYMBOL(spl_kmem_cache_free);
  
  /*
- * The generic shrinker function for all caches.  Under linux a shrinker
- * may not be tightly coupled with a slab cache.  In fact linux always
- * systematically trys calling all registered shrinker callbacks which
+ * The generic shrinker function for all caches.  Under Linux a shrinker
+ * may not be tightly coupled with a slab cache.  In fact Linux always
+ * systematically tries calling all registered shrinker callbacks which
   * report that they contain unused objects.  Because of this we only
   * register one shrinker function in the shim layer for all slab caches.
   * We always attempt to shrink all caches when this generic shrinker
   * is called.  The shrinker should return the number of free objects
   * in the cache when called with nr_to_scan == 0 but not attempt to
   * free any objects.  When nr_to_scan > 0 it is a request that nr_to_scan
- * objects should be freed, because Solaris semantics are to free
- * all available objects we may free more objects than requested.
+ * objects should be freed, which differs from Solaris semantics.
+ * Solaris semantics are to free all available objects which may (and
+ * probably will) be more objects than the requested nr_to_scan.
   */
  static int
  __spl_kmem_cache_generic_shrinker(struct shrinker *shrink,
@@ -1867,7 +1881,8 @@ __spl_kmem_cache_generic_shrinker(struct shrinker *shrink,
         down_read(&spl_kmem_cache_sem);
         list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
                 if (sc->nr_to_scan)
-                       spl_kmem_cache_reap_now(skc);
+                       spl_kmem_cache_reap_now(skc,
+                          MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1));
  
                 /*
                  * Presume everything alloc'ed in reclaimable, this ensures
@@ -1893,7 +1908,7 @@ SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker);
   * effort and we do not want to thrash creating and destroying slabs.
   */
  void
-spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
+spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
  {
         SENTRY;
  
@@ -1908,10 +1923,41 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
  
         atomic_inc(&skc->skc_ref);
  
-       if (skc->skc_reclaim)
-               skc->skc_reclaim(skc->skc_private);
+       /*
+        * When a reclaim function is available it may be invoked repeatedly
+        * until at least a single slab can be freed.  This ensures that we
+        * do free memory back to the system.  This helps minimize the chance
+        * of an OOM event when the bulk of memory is used by the slab.
+        *
+        * When free slabs are already available the reclaim callback will be
+        * skipped.  Additionally, if no forward progress is detected despite
+        * a reclaim function the cache will be skipped to avoid deadlock.
+        *
+        * Longer term this would be the correct place to add the code which
+        * repacks the slabs in order to minimize fragmentation.
+        */
+       if (skc->skc_reclaim) {
+               uint64_t objects = UINT64_MAX;
+               int do_reclaim;
  
-       spl_slab_reclaim(skc, skc->skc_reap, 0);
+               do {
+                       spin_lock(&skc->skc_lock);
+                       do_reclaim =
+                           (skc->skc_slab_total > 0) &&
+                           ((skc->skc_slab_total - skc->skc_slab_alloc) == 0) &&
+                           (skc->skc_obj_alloc < objects);
+
+                       objects = skc->skc_obj_alloc;
+                       spin_unlock(&skc->skc_lock);
+
+                       if (do_reclaim)
+                               skc->skc_reclaim(skc->skc_private);
+
+               } while (do_reclaim);
+       }
+
+       /* Reclaim from the cache, ignoring its age and delay. */
+       spl_slab_reclaim(skc, count, 1);
         clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
         atomic_dec(&skc->skc_ref);
  
@@ -2110,31 +2156,25 @@ spl_kmem_init_kallsyms_lookup(void)
          */
         spl_kmem_init_globals();
  
-#ifndef HAVE_INVALIDATE_INODES
+#if !defined(HAVE_INVALIDATE_INODES) && !defined(HAVE_INVALIDATE_INODES_CHECK)
         invalidate_inodes_fn = (invalidate_inodes_t)
                 spl_kallsyms_lookup_name("invalidate_inodes");
         if (!invalidate_inodes_fn) {
                 printk(KERN_ERR "Error: Unknown symbol invalidate_inodes\n");
                 return -EFAULT;
         }
-#endif /* HAVE_INVALIDATE_INODES */
+#endif /* !HAVE_INVALIDATE_INODES && !HAVE_INVALIDATE_INODES_CHECK */
  
  #ifndef HAVE_SHRINK_DCACHE_MEMORY
+       /* When shrink_dcache_memory_fn == NULL support is disabled */
         shrink_dcache_memory_fn = (shrink_dcache_memory_t)
-       spl_kallsyms_lookup_name("shrink_dcache_memory");
-       if (!shrink_dcache_memory_fn) {
-               printk(KERN_ERR "Error: Unknown symbol shrink_dcache_memory\n");
-               return -EFAULT;
-       }
+               spl_kallsyms_lookup_name("shrink_dcache_memory");
  #endif /* HAVE_SHRINK_DCACHE_MEMORY */
  
  #ifndef HAVE_SHRINK_ICACHE_MEMORY
+       /* When shrink_icache_memory_fn == NULL support is disabled */
         shrink_icache_memory_fn = (shrink_icache_memory_t)
-       spl_kallsyms_lookup_name("shrink_icache_memory");
-       if (!shrink_icache_memory_fn) {
-               printk(KERN_ERR "Error: Unknown symbol shrink_icache_memory\n");
-               return -EFAULT;
-       }
+               spl_kallsyms_lookup_name("shrink_icache_memory");
  #endif /* HAVE_SHRINK_ICACHE_MEMORY */
  
         return 0;