Ensure kmem_alloc() and vmem_alloc() never fail

author Brian Behlendorf <behlendorf1@llnl.gov>

Mon, 26 Jul 2010 22:47:55 +0000 (15:47 -0700)

committer Brian Behlendorf <behlendorf1@llnl.gov>

Mon, 26 Jul 2010 22:47:55 +0000 (15:47 -0700)
author Brian Behlendorf <behlendorf1@llnl.gov>
Mon, 26 Jul 2010 22:47:55 +0000 (15:47 -0700)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Mon, 26 Jul 2010 22:47:55 +0000 (15:47 -0700)
diff --git a/include/sys/kmem.h b/include/sys/kmem.h

index 17b3a22761420a7c7c20b601537acec23b68ddd2..e90c6b8ceb2ce01686c74767d5c0b302b6fc210a 100644 (file)
--- a/include/sys/kmem.h
+++ b/include/sys/kmem.h
@@ -87,10 +87,10 @@ kzalloc_nofail(size_t size, gfp_t flags)
         return ptr;
  }
  
-#ifdef HAVE_KMALLOC_NODE
  static inline void *
  kmalloc_node_nofail(size_t size, gfp_t flags, int node)
  {
+#ifdef HAVE_KMALLOC_NODE
         void *ptr;
  
         do {
@@ -98,16 +98,63 @@ kmalloc_node_nofail(size_t size, gfp_t flags, int node)
         } while (ptr == NULL && (flags & __GFP_WAIT));
  
         return ptr;
-}
+#else
+       return kmalloc_nofail(size, flags);
  #endif /* HAVE_KMALLOC_NODE */
+}
+
+static inline void *
+vmalloc_nofail(size_t size, gfp_t flags)
+{
+       void *ptr;
+
+       /*
+        * Retry failed __vmalloc() allocations once every second.  The
+        * rationale for the delay is that the likely failure modes are:
+        *
+        * 1) The system has completely exhausted memory, in which case
+        *    delaying 1 second for the memory reclaim to run is reasonable
+        *    to avoid thrashing the system.
+        * 2) The system has memory but has exhausted the small virtual
+        *    address space available on 32-bit systems.  Retrying the
+        *    allocation immediately will only result in spinning on the
+        *    virtual address space lock.  It is better to delay a second and
+        *    hope that another process will free some of the address space.
+        *    But the bottom line is there is not much we can actually do
+        *    since we can never safely return a failure and honor the
+        *    Solaris semantics.
+        */
+       while (1) {
+               ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
+               if (unlikely((ptr == NULL) && (flags & __GFP_WAIT))) {
+                       set_current_state(TASK_INTERRUPTIBLE);
+                       schedule_timeout(HZ);
+               } else {
+                       break;
+               }
+       }
+
+       return ptr;
+}
+
+static inline void *
+vzalloc_nofail(size_t size, gfp_t flags)
+{
+       void *ptr;
+
+       ptr = vmalloc_nofail(size, flags);
+       if (ptr)
+               memset(ptr, 0, (size));
+
+       return ptr;
+}
  
  #ifdef DEBUG_KMEM
-# ifdef HAVE_ATOMIC64_T
  
-extern atomic64_t kmem_alloc_used;
-extern unsigned long long kmem_alloc_max;
-extern atomic64_t vmem_alloc_used;
-extern unsigned long long vmem_alloc_max;
+/*
+ * Memory accounting functions to be used only when DEBUG_KMEM is set.
+ */
+# ifdef HAVE_ATOMIC64_T
  
  # define kmem_alloc_used_add(size)      atomic64_add(size, &kmem_alloc_used)
  # define kmem_alloc_used_sub(size)      atomic64_sub(size, &kmem_alloc_used)
@@ -118,13 +165,13 @@ extern unsigned long long vmem_alloc_max;
  # define vmem_alloc_used_read()         atomic64_read(&vmem_alloc_used)
  # define vmem_alloc_used_set(size)      atomic64_set(&vmem_alloc_used, size)
  
-# else
-
-extern atomic_t kmem_alloc_used;
+extern atomic64_t kmem_alloc_used;
  extern unsigned long long kmem_alloc_max;
-extern atomic_t vmem_alloc_used;
+extern atomic64_t vmem_alloc_used;
  extern unsigned long long vmem_alloc_max;
  
+# else  /* HAVE_ATOMIC64_T */
+
  # define kmem_alloc_used_add(size)      atomic_add(size, &kmem_alloc_used)
  # define kmem_alloc_used_sub(size)      atomic_sub(size, &kmem_alloc_used)
  # define kmem_alloc_used_read()         atomic_read(&kmem_alloc_used)
@@ -134,90 +181,107 @@ extern unsigned long long vmem_alloc_max;
  # define vmem_alloc_used_read()         atomic_read(&vmem_alloc_used)
  # define vmem_alloc_used_set(size)      atomic_set(&vmem_alloc_used, size)
  
-# endif /* _LP64 */
-
-# define kmem_alloc(size, flags)             __kmem_alloc((size), (flags), 0, 0)
-# define kmem_zalloc(size, flags)            __kmem_alloc((size), ((flags) |  \
-                                                 __GFP_ZERO), 0, 0)
-
-/* The node alloc functions are only used by the SPL code itself */
-# ifdef HAVE_KMALLOC_NODE
-#  define kmem_alloc_node(size, flags, node) __kmem_alloc((size), (flags), 1, \
-                                                 node)
-# else
-#  define kmem_alloc_node(size, flags, node) __kmem_alloc((size), (flags), 0, 0)
-# endif
+extern atomic_t kmem_alloc_used;
+extern unsigned long long kmem_alloc_max;
+extern atomic_t vmem_alloc_used;
+extern unsigned long long vmem_alloc_max;
  
-# define vmem_zalloc(size, flags)            vmem_alloc((size), ((flags) |    \
-                                                 __GFP_ZERO))
+# endif /* HAVE_ATOMIC64_T */
  
  # ifdef DEBUG_KMEM_TRACKING
-
-extern void *kmem_alloc_track(size_t size, int flags, const char *func,
-    int line, int node_alloc, int node);
-extern void kmem_free_track(void *ptr, size_t size);
-extern void *vmem_alloc_track(size_t size, int flags, const char *func,
-    int line);
-extern void vmem_free_track(void *ptr, size_t size);
-
-#  define __kmem_alloc(size, flags, na, node) kmem_alloc_track((size),        \
-                                                  (flags), __FUNCTION__,      \
-                                                  __LINE__, (na), (node))
-#  define kmem_free(ptr, size)                kmem_free_track((ptr), (size))
-#  define vmem_alloc(size, flags)             vmem_alloc_track((size),        \
-                                                  (flags),__FUNCTION__,       \
-                                                  __LINE__)
-#  define vmem_free(ptr, size)                vmem_free_track((ptr), (size))
+/*
+ * DEBUG_KMEM && DEBUG_KMEM_TRACKING
+ *
+ * The maximum level of memory debugging.  All memory will be accounted
+ * for and each allocation will be explicitly tracked.  Any allocation
+ * which is leaked will be reported on module unload and the exact location
+ * where that memory was allocated will be reported.  This level of memory
+ * tracking will have a significant impact on performance and should only
+ * be enabled for debugging.  This feature may be enabled by passing
+ * --enable-debug-kmem-tracking to configure.
+ */
+#  define kmem_alloc(sz, fl)            kmem_alloc_track((sz), (fl),           \
+                                             __FUNCTION__, __LINE__, 0, 0)
+#  define kmem_zalloc(sz, fl)           kmem_alloc_track((sz), (fl)|__GFP_ZERO,\
+                                             __FUNCTION__, __LINE__, 0, 0)
+#  define kmem_alloc_node(sz, fl, nd)   kmem_alloc_track((sz), (fl),           \
+                                             __FUNCTION__, __LINE__, 1, nd)
+#  define kmem_free(ptr, sz)            kmem_free_track((ptr), (sz))
+
+#  define vmem_alloc(sz, fl)            vmem_alloc_track((sz), (fl),           \
+                                             __FUNCTION__, __LINE__)
+#  define vmem_zalloc(sz, fl)           vmem_alloc_track((sz), (fl)|__GFP_ZERO,\
+                                             __FUNCTION__, __LINE__)
+#  define vmem_free(ptr, sz)            vmem_free_track((ptr), (sz))
+
+extern void *kmem_alloc_track(size_t, int, const char *, int, int, int);
+extern void kmem_free_track(void *, size_t);
+extern void *vmem_alloc_track(size_t, int, const char *, int);
+extern void vmem_free_track(void *, size_t);
  
  # else /* DEBUG_KMEM_TRACKING */
-
-extern void *kmem_alloc_debug(size_t size, int flags, const char *func,
-    int line, int node_alloc, int node);
-extern void kmem_free_debug(void *ptr, size_t size);
-extern void *vmem_alloc_debug(size_t size, int flags, const char *func,
-    int line);
-extern void vmem_free_debug(void *ptr, size_t size);
-
-#  define __kmem_alloc(size, flags, na, node) kmem_alloc_debug((size),        \
-                                                  (flags), __FUNCTION__,      \
-                                                  __LINE__, (na), (node))
-#  define kmem_free(ptr, size)                kmem_free_debug((ptr), (size))
-#  define vmem_alloc(size, flags)             vmem_alloc_debug((size),        \
-                                                  (flags), __FUNCTION__,      \
-                                                  __LINE__)
-#  define vmem_free(ptr, size)                vmem_free_debug((ptr), (size))
+/*
+ * DEBUG_KMEM && !DEBUG_KMEM_TRACKING
+ *
+ * The default build will set DEBUG_KMEM.  This provides basic memory
+ * accounting with little to no impact on performance.  When the module
+ * is unloaded if any memory was leaked the total number of leaked bytes
+ * will be reported on the console.  To disable this basic accounting
+ * pass the --disable-debug-kmem option to configure.
+ */
+#  define kmem_alloc(sz, fl)            kmem_alloc_debug((sz), (fl),           \
+                                             __FUNCTION__, __LINE__, 0, 0)
+#  define kmem_zalloc(sz, fl)           kmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
+                                             __FUNCTION__, __LINE__, 0, 0)
+#  define kmem_alloc_node(sz, fl, nd)   kmem_alloc_debug((sz), (fl),           \
+                                             __FUNCTION__, __LINE__, 1, nd)
+#  define kmem_free(ptr, sz)            kmem_free_debug((ptr), (sz))
+
+#  define vmem_alloc(sz, fl)            vmem_alloc_debug((sz), (fl),           \
+                                             __FUNCTION__, __LINE__)
+#  define vmem_zalloc(sz, fl)           vmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
+                                             __FUNCTION__, __LINE__)
+#  define vmem_free(ptr, sz)            vmem_free_debug((ptr), (sz))
+
+extern void *kmem_alloc_debug(size_t, int, const char *, int, int, int);
+extern void kmem_free_debug(void *, size_t);
+extern void *vmem_alloc_debug(size_t, int, const char *, int);
+extern void vmem_free_debug(void *, size_t);
  
  # endif /* DEBUG_KMEM_TRACKING */
-
  #else /* DEBUG_KMEM */
+/*
+ * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING
+ *
+ * All debugging is disabled.  There will be no overhead even for
+ * minimal memory accounting.  To enable basic accounting pass the
+ * --enable-debug-kmem option to configure.
+ */
+# define kmem_alloc(sz, fl)             kmalloc_nofail((sz), (fl))
+# define kmem_zalloc(sz, fl)            kzalloc_nofail((sz), (fl))
+# define kmem_alloc_node(sz, fl, nd)    kmalloc_node_nofail((sz), (fl), (nd))
+# define kmem_free(ptr, sz)             ((void)(sz), kfree(ptr))
  
-# define kmem_alloc(size, flags)              kmalloc_nofail((size), (flags))
-# define kmem_zalloc(size, flags)             kzalloc_nofail((size), (flags))
-# define kmem_free(ptr, size)                 ((void)(size), kfree(ptr))
-
-# ifdef HAVE_KMALLOC_NODE
-#  define kmem_alloc_node(size, flags, node)                                  \
-          kmalloc_node_nofail((size), (flags), (node))
-# else
-#  define kmem_alloc_node(size, flags, node)                                  \
-          kmalloc_nofail((size), (flags))
-# endif
-
-# define vmem_alloc(size, flags)              __vmalloc((size), ((flags) |    \
-                                                  __GFP_HIGHMEM), PAGE_KERNEL)
-# define vmem_zalloc(size, flags)                                             \
-({                                                                            \
-        void *_ptr_ = __vmalloc((size),((flags)|__GFP_HIGHMEM),PAGE_KERNEL);  \
-        if (_ptr_)                                                            \
-                memset(_ptr_, 0, (size));                                     \
-        _ptr_;                                                                \
-})
-# define vmem_free(ptr, size)           ((void)(size), vfree(ptr))
+# define vmem_alloc(sz, fl)             vmalloc_nofail((sz), (fl))
+# define vmem_zalloc(sz, fl)            vzalloc_nofail((sz), (fl))
+# define vmem_free(ptr, sz)             ((void)(sz), vfree(ptr))
  
  #endif /* DEBUG_KMEM */
  
+extern int kmem_debugging(void);
+extern char *kmem_vasprintf(const char *fmt, va_list ap);
+extern char *kmem_asprintf(const char *fmt, ...);
+extern char *strdup(const char *str);
+extern void strfree(char *str);
+
+
  /*
- * Slab allocation interfaces
+ * Slab allocation interfaces.  The SPL slab differs from the standard
+ * Linux SLAB or SLUB primarily in that each cache may be backed by slabs
+ * allocated from the physical or virtual memory address space.  The virtual
+ * slabs allow for good behavior when allocating large objects of identical
+ * size.  This slab implementation also supports both constructors and
+ * destructors which the Linux slab does not.
   */
  enum {
         KMC_BIT_NOTOUCH         = 0,    /* Don't update ages */
@@ -246,12 +310,6 @@ enum {
  #define KMC_REAP_CHUNK                 INT_MAX
  #define KMC_DEFAULT_SEEKS              1
  
-extern int kmem_debugging(void);
-extern char *kmem_vasprintf(const char *fmt, va_list ap);
-extern char *kmem_asprintf(const char *fmt, ...);
-#define strfree(str)           kfree(str)
-#define strdup(str)            kstrdup(str, GFP_KERNEL)
-
  extern struct list_head spl_kmem_cache_list;
  extern struct rw_semaphore spl_kmem_cache_sem;
  
diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c

index e575b1ee97df79da1c7d8cb157019951898026c9..ec1ccb4ce45f00e089d5eacd445cfa6fff7e112a 100644 (file)
--- a/module/spl/spl-kmem.c
+++ b/module/spl/spl-kmem.c
@@ -271,6 +271,34 @@ kmem_asprintf(const char *fmt, ...)
  }
  EXPORT_SYMBOL(kmem_asprintf);
  
+static char *
+__strdup(const char *str, int flags)
+{
+       char *ptr;
+       int n;
+
+       n = strlen(str);
+       ptr = kmalloc_nofail(n + 1, flags);
+       if (ptr)
+               memcpy(ptr, str, n + 1);
+
+       return ptr;
+}
+
+char *
+strdup(const char *str)
+{
+       return __strdup(str, KM_SLEEP);
+}
+EXPORT_SYMBOL(strdup);
+
+void
+strfree(char *str)
+{
+       kmem_free(str, strlen(str) + 1);
+}
+EXPORT_SYMBOL(strfree);
+
  /*
   * Memory allocation interfaces and debugging for basic kmem_*
   * and vmem_* style memory allocation.  When DEBUG_KMEM is enabled
@@ -285,12 +313,12 @@ atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
  unsigned long long kmem_alloc_max = 0;
  atomic64_t vmem_alloc_used = ATOMIC64_INIT(0);
  unsigned long long vmem_alloc_max = 0;
-# else
+# else  /* HAVE_ATOMIC64_T */
  atomic_t kmem_alloc_used = ATOMIC_INIT(0);
  unsigned long long kmem_alloc_max = 0;
  atomic_t vmem_alloc_used = ATOMIC_INIT(0);
  unsigned long long vmem_alloc_max = 0;
-# endif /* _LP64 */
+# endif /* HAVE_ATOMIC64_T */
  
  EXPORT_SYMBOL(kmem_alloc_used);
  EXPORT_SYMBOL(kmem_alloc_max);
@@ -340,77 +368,9 @@ EXPORT_SYMBOL(kmem_list);
  EXPORT_SYMBOL(vmem_lock);
  EXPORT_SYMBOL(vmem_table);
  EXPORT_SYMBOL(vmem_list);
-# endif
-#endif
-
-/*
- * Slab allocation interfaces
- *
- * While the Linux slab implementation was inspired by the Solaris
- * implemenation I cannot use it to emulate the Solaris APIs.  I
- * require two features which are not provided by the Linux slab.
- *
- * 1) Constructors AND destructors.  Recent versions of the Linux
- *    kernel have removed support for destructors.  This is a deal
- *    breaker for the SPL which contains particularly expensive
- *    initializers for mutex's, condition variables, etc.  We also
- *    require a minimal level of cleanup for these data types unlike
- *    many Linux data type which do need to be explicitly destroyed.
- *
- * 2) Virtual address space backed slab.  Callers of the Solaris slab
- *    expect it to work well for both small are very large allocations.
- *    Because of memory fragmentation the Linux slab which is backed
- *    by kmalloc'ed memory performs very badly when confronted with
- *    large numbers of large allocations.  Basing the slab on the
- *    virtual address space removes the need for contigeous pages
- *    and greatly improve performance for large allocations.
- *
- * For these reasons, the SPL has its own slab implementation with
- * the needed features.  It is not as highly optimized as either the
- * Solaris or Linux slabs, but it should get me most of what is
- * needed until it can be optimized or obsoleted by another approach.
- *
- * One serious concern I do have about this method is the relatively
- * small virtual address space on 32bit arches.  This will seriously
- * constrain the size of the slab caches and their performance.
- *
- * XXX: Improve the partial slab list by carefully maintaining a
- *      strict ordering of fullest to emptiest slabs based on
- *      the slab reference count.  This gaurentees the when freeing
- *      slabs back to the system we need only linearly traverse the
- *      last N slabs in the list to discover all the freeable slabs.
- *
- * XXX: NUMA awareness for optionally allocating memory close to a
- *      particular core.  This can be adventageous if you know the slab
- *      object will be short lived and primarily accessed from one core.
- *
- * XXX: Slab coloring may also yield performance improvements and would
- *      be desirable to implement.
- */
-
-struct list_head spl_kmem_cache_list;   /* List of caches */
-struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
-
-static int spl_cache_flush(spl_kmem_cache_t *skc,
-                           spl_kmem_magazine_t *skm, int flush);
-
-#ifdef HAVE_SET_SHRINKER
-static struct shrinker *spl_kmem_cache_shrinker;
-#else
-static int spl_kmem_cache_generic_shrinker(int nr_to_scan,
-                                           unsigned int gfp_mask);
-static struct shrinker spl_kmem_cache_shrinker = {
-       .shrink = spl_kmem_cache_generic_shrinker,
-       .seeks = KMC_DEFAULT_SEEKS,
-};
-#endif
-
-#ifdef DEBUG_KMEM
-# ifdef DEBUG_KMEM_TRACKING
  
  static kmem_debug_t *
-kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits,
-                void *addr)
+kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, void *addr)
  {
         struct hlist_head *head;
         struct hlist_node *node;
@@ -444,17 +404,20 @@ kmem_alloc_track(size_t size, int flags, const char *func, int line,
         unsigned long irq_flags;
         SENTRY;
  
+       /* Function may be called with KM_NOSLEEP so failure is possible */
         dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
             flags & ~__GFP_ZERO);
  
-       if (dptr == NULL) {
+       if (unlikely(dptr == NULL)) {
                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "debug "
                     "kmem_alloc(%ld, 0x%x) at %s:%d failed (%lld/%llu)\n",
                     sizeof(kmem_debug_t), flags, func, line,
                     kmem_alloc_used_read(), kmem_alloc_max);
         } else {
-               /* Marked unlikely because we should never be doing this,
-                * we tolerate to up 2 pages but a single page is best.   */
+               /*
+                * Marked unlikely because we should never be doing this,
+                * we tolerate up to 2 pages but a single page is best.
+                */
                 if (unlikely((size > PAGE_SIZE*2) && !(flags & KM_NODEBUG))) {
                         SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "large "
                             "kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
@@ -463,14 +426,17 @@ kmem_alloc_track(size_t size, int flags, const char *func, int line,
                         spl_debug_dumpstack(NULL);
                 }
  
-               /* We use kstrdup() below because the string pointed to by
+               /*
+                *  We use __strdup() below because the string pointed to by
                  * __FUNCTION__ might not be available by the time we want
-                * to print it since the module might have been unloaded. */
-               dptr->kd_func = kstrdup(func, flags & ~__GFP_ZERO);
+                * to print it since the module might have been unloaded.
+                * This can only fail in the KM_NOSLEEP case.
+                */
+               dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
                 if (unlikely(dptr->kd_func == NULL)) {
                         kfree(dptr);
                         SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
-                           "debug kstrdup() at %s:%d failed (%lld/%llu)\n",
+                           "debug __strdup() at %s:%d failed (%lld/%llu)\n",
                             func, line, kmem_alloc_used_read(), kmem_alloc_max);
                         goto out;
                 }
@@ -533,7 +499,8 @@ kmem_free_track(void *ptr, size_t size)
  
         dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
  
-       ASSERT(dptr); /* Must exist in hash due to kmem_alloc() */
+       /* Must exist in hash due to kmem_alloc() */
+       ASSERT(dptr);
  
         /* Size must match */
         ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
@@ -567,28 +534,37 @@ vmem_alloc_track(size_t size, int flags, const char *func, int line)
  
         ASSERT(flags & KM_SLEEP);
  
+       /* KM_SLEEP is asserted above, so failure is not expected here */
         dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
             flags & ~__GFP_ZERO);
-       if (dptr == NULL) {
+       if (unlikely(dptr == NULL)) {
                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "debug "
                     "vmem_alloc(%ld, 0x%x) at %s:%d failed (%lld/%llu)\n",
                     sizeof(kmem_debug_t), flags, func, line,
                     vmem_alloc_used_read(), vmem_alloc_max);
         } else {
-               /* We use kstrdup() below because the string pointed to by
+               /*
+                * We use __strdup() below because the string pointed to by
                  * __FUNCTION__ might not be available by the time we want
-                * to print it, since the module might have been unloaded. */
-               dptr->kd_func = kstrdup(func, flags & ~__GFP_ZERO);
+                * to print it, since the module might have been unloaded.
+                * This can never fail because we have already asserted
+                * that flags is KM_SLEEP.
+                */
+               dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
                 if (unlikely(dptr->kd_func == NULL)) {
                         kfree(dptr);
                         SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
-                           "debug kstrdup() at %s:%d failed (%lld/%llu)\n",
+                           "debug __strdup() at %s:%d failed (%lld/%llu)\n",
                             func, line, vmem_alloc_used_read(), vmem_alloc_max);
                         goto out;
                 }
  
-               ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO,
-                   PAGE_KERNEL);
+               /* Use the correct allocator */
+               if (flags & __GFP_ZERO) {
+                       ptr = vzalloc_nofail(size, flags & ~__GFP_ZERO);
+               } else {
+                       ptr = vmalloc_nofail(size, flags);
+               }
  
                 if (unlikely(ptr == NULL)) {
                         kfree(dptr->kd_func);
@@ -600,9 +576,6 @@ vmem_alloc_track(size_t size, int flags, const char *func, int line)
                         goto out;
                 }
  
-               if (flags & __GFP_ZERO)
-                       memset(ptr, 0, size);
-
                 vmem_alloc_used_add(size);
                 if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
                         vmem_alloc_max = vmem_alloc_used_read();
@@ -640,7 +613,9 @@ vmem_free_track(void *ptr, size_t size)
             (unsigned long long) size);
  
         dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr);
-       ASSERT(dptr); /* Must exist in hash due to vmem_alloc() */
+
+       /* Must exist in hash due to vmem_alloc() */
+       ASSERT(dptr);
  
         /* Size must match */
         ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
@@ -673,11 +648,13 @@ kmem_alloc_debug(size_t size, int flags, const char *func, int line,
         void *ptr;
         SENTRY;
  
-       /* Marked unlikely because we should never be doing this,
-        * we tolerate to up 2 pages but a single page is best.   */
+       /*
+        * Marked unlikely because we should never be doing this,
+        * we tolerate up to 2 pages but a single page is best.
+        */
         if (unlikely((size > PAGE_SIZE * 2) && !(flags & KM_NODEBUG))) {
                 SDEBUG(SD_CONSOLE | SD_WARNING,
-                   "Large kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
+                   "large kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
                     (unsigned long long) size, flags, func, line,
                     kmem_alloc_used_read(), kmem_alloc_max);
                 spl_debug_dumpstack(NULL);
@@ -693,7 +670,7 @@ kmem_alloc_debug(size_t size, int flags, const char *func, int line,
                 ptr = kmalloc_nofail(size, flags);
         }
  
-       if (ptr == NULL) {
+       if (unlikely(ptr == NULL)) {
                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
                     "kmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
                     (unsigned long long) size, flags, func, line,
@@ -706,8 +683,9 @@ kmem_alloc_debug(size_t size, int flags, const char *func, int line,
                 SDEBUG_LIMIT(SD_INFO,
                     "kmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n",
                     (unsigned long long) size, flags, func, line, ptr,
-                      kmem_alloc_used_read(), kmem_alloc_max);
+                   kmem_alloc_used_read(), kmem_alloc_max);
         }
+
         SRETURN(ptr);
  }
  EXPORT_SYMBOL(kmem_alloc_debug);
@@ -724,8 +702,6 @@ kmem_free_debug(void *ptr, size_t size)
         SDEBUG_LIMIT(SD_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
             (unsigned long long) size, kmem_alloc_used_read(),
             kmem_alloc_max);
-
-       memset(ptr, 0x5a, size);
         kfree(ptr);
  
         SEXIT;
@@ -740,17 +716,19 @@ vmem_alloc_debug(size_t size, int flags, const char *func, int line)
  
         ASSERT(flags & KM_SLEEP);
  
-       ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO,
-           PAGE_KERNEL);
-       if (ptr == NULL) {
+       /* Use the correct allocator */
+       if (flags & __GFP_ZERO) {
+               ptr = vzalloc_nofail(size, flags & (~__GFP_ZERO));
+       } else {
+               ptr = vmalloc_nofail(size, flags);
+       }
+
+       if (unlikely(ptr == NULL)) {
                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
                     "vmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
                     (unsigned long long) size, flags, func, line,
                     vmem_alloc_used_read(), vmem_alloc_max);
         } else {
-               if (flags & __GFP_ZERO)
-                       memset(ptr, 0, size);
-
                 vmem_alloc_used_add(size);
                 if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
                         vmem_alloc_max = vmem_alloc_used_read();
@@ -776,8 +754,6 @@ vmem_free_debug(void *ptr, size_t size)
         SDEBUG_LIMIT(SD_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
             (unsigned long long) size, vmem_alloc_used_read(),
             vmem_alloc_max);
-
-       memset(ptr, 0x5a, size);
         vfree(ptr);
  
         SEXIT;
@@ -787,6 +763,68 @@ EXPORT_SYMBOL(vmem_free_debug);
  # endif /* DEBUG_KMEM_TRACKING */
  #endif /* DEBUG_KMEM */
  
+/*
+ * Slab allocation interfaces
+ *
+ * While the Linux slab implementation was inspired by the Solaris
+ * implementation I cannot use it to emulate the Solaris APIs.  I
+ * require two features which are not provided by the Linux slab.
+ *
+ * 1) Constructors AND destructors.  Recent versions of the Linux
+ *    kernel have removed support for destructors.  This is a deal
+ *    breaker for the SPL which contains particularly expensive
+ *    initializers for mutex's, condition variables, etc.  We also
+ *    require a minimal level of cleanup for these data types unlike
+ *    many Linux data types which do not need to be explicitly destroyed.
+ *
+ * 2) Virtual address space backed slab.  Callers of the Solaris slab
+ *    expect it to work well for both small and very large allocations.
+ *    Because of memory fragmentation the Linux slab which is backed
+ *    by kmalloc'ed memory performs very badly when confronted with
+ *    large numbers of large allocations.  Basing the slab on the
+ *    virtual address space removes the need for contiguous pages
+ *    and greatly improves performance for large allocations.
+ *
+ * For these reasons, the SPL has its own slab implementation with
+ * the needed features.  It is not as highly optimized as either the
+ * Solaris or Linux slabs, but it should get me most of what is
+ * needed until it can be optimized or obsoleted by another approach.
+ *
+ * One serious concern I do have about this method is the relatively
+ * small virtual address space on 32bit arches.  This will seriously
+ * constrain the size of the slab caches and their performance.
+ *
+ * XXX: Improve the partial slab list by carefully maintaining a
+ *      strict ordering of fullest to emptiest slabs based on
+ *      the slab reference count.  This guarantees that when freeing
+ *      slabs back to the system we need only linearly traverse the
+ *      last N slabs in the list to discover all the freeable slabs.
+ *
+ * XXX: NUMA awareness for optionally allocating memory close to a
+ *      particular core.  This can be advantageous if you know the slab
+ *      object will be short lived and primarily accessed from one core.
+ *
+ * XXX: Slab coloring may also yield performance improvements and would
+ *      be desirable to implement.
+ */
+
+struct list_head spl_kmem_cache_list;   /* List of caches */
+struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
+
+static int spl_cache_flush(spl_kmem_cache_t *skc,
+                           spl_kmem_magazine_t *skm, int flush);
+
+#ifdef HAVE_SET_SHRINKER
+static struct shrinker *spl_kmem_cache_shrinker;
+#else
+static int spl_kmem_cache_generic_shrinker(int nr_to_scan,
+                                           unsigned int gfp_mask);
+static struct shrinker spl_kmem_cache_shrinker = {
+       .shrink = spl_kmem_cache_generic_shrinker,
+       .seeks = KMC_DEFAULT_SEEKS,
+};
+#endif
+
  static void *
  kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
  {
author	Brian Behlendorf <behlendorf1@llnl.gov>
	Mon, 26 Jul 2010 22:47:55 +0000 (15:47 -0700)
committer	Brian Behlendorf <behlendorf1@llnl.gov>
	Mon, 26 Jul 2010 22:47:55 +0000 (15:47 -0700)
include/sys/kmem.h		patch \| blob \| blame \| history
module/spl/spl-kmem.c		patch \| blob \| blame \| history