module/spl/spl-kmem.c

   1 /*****************************************************************************\
   2  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
   3  *  Copyright (C) 2007 The Regents of the University of California.
   4  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
   5  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
   6  *  UCRL-CODE-235197
   7  *
   8  *  This file is part of the SPL, Solaris Porting Layer.
   9  *  For details, see <http://zfsonlinux.org/>.
  10  *
  11  *  The SPL is free software; you can redistribute it and/or modify it
  12  *  under the terms of the GNU General Public License as published by the
  13  *  Free Software Foundation; either version 2 of the License, or (at your
  14  *  option) any later version.
  15  *
  16  *  The SPL is distributed in the hope that it will be useful, but WITHOUT
  17  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  18  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  19  *  for more details.
  20  *
  21  *  You should have received a copy of the GNU General Public License along
  22  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  23  *****************************************************************************
  24  *  Solaris Porting Layer (SPL) Kmem Implementation.
  25 \*****************************************************************************/
  26
  27 #include <sys/kmem.h>
  28 #include <spl-debug.h>
  29
  30 #ifdef SS_DEBUG_SUBSYS
  31 #undef SS_DEBUG_SUBSYS
  32 #endif
  33
  34 #define SS_DEBUG_SUBSYS SS_KMEM
  35
  36 /*
  37  * Within the scope of spl-kmem.c file the kmem_cache_* definitions
  38  * are removed to allow access to the real Linux slab allocator.
  39  */
  40 #undef kmem_cache_destroy
  41 #undef kmem_cache_create
  42 #undef kmem_cache_alloc
  43 #undef kmem_cache_free
  44
  45
  46 /*
  47  * Cache expiration was implemented because it was part of the default Solaris
  48  * kmem_cache behavior.  The idea is that per-cpu objects which haven't been
  49  * accessed in several seconds should be returned to the cache.  On the other
  50  * hand Linux slabs never move objects back to the slabs unless there is
  51  * memory pressure on the system.  By default the Linux method is enabled
  52  * because it has been shown to improve responsiveness on low memory systems.
  53  * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM.
  54  */
  55 unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM;
  56 EXPORT_SYMBOL(spl_kmem_cache_expire);
  57 module_param(spl_kmem_cache_expire, uint, 0644);
  58 MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)");
  59
  60 /*
  61  * The default behavior is to report the number of objects remaining in the
  62  * cache.  This allows the Linux VM to repeatedly reclaim objects from the
  63  * cache when memory is low to satisfy other memory allocations.  Alternately,
  64  * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache
  65  * is reclaimed.  This may increase the likelihood of out of memory events.
  66  */
  67 unsigned int spl_kmem_cache_reclaim = 0;
  68 module_param(spl_kmem_cache_reclaim, uint, 0644);
  69 MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)");
  70
  71 unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
  72 module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
  73 MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");
  74
  75 unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN;
  76 module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644);
  77 MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min,
  78     "Minimal number of objects per slab");
  79
  80 unsigned int spl_kmem_cache_max_size = 32;
  81 module_param(spl_kmem_cache_max_size, uint, 0644);
  82 MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
  83
  84 /*
  85  * For small objects the Linux slab allocator should be used to make the most
  86  * efficient use of the memory.  However, large objects are not supported by
  87  * the Linux slab and therefore the SPL implementation is preferred.  A cutoff
  88  * of 16K was determined to be optimal for architectures using 4K pages.
  89  */
  90 #if PAGE_SIZE == 4096
  91 unsigned int spl_kmem_cache_slab_limit = 16384;
  92 #else
  93 unsigned int spl_kmem_cache_slab_limit = 0;
  94 #endif
  95 module_param(spl_kmem_cache_slab_limit, uint, 0644);
  96 MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
  97     "Objects less than N bytes use the Linux slab");
  98
  99 unsigned int spl_kmem_cache_kmem_limit = (PAGE_SIZE / 4);
 100 module_param(spl_kmem_cache_kmem_limit, uint, 0644);
 101 MODULE_PARM_DESC(spl_kmem_cache_kmem_limit,
 102     "Objects less than N bytes use the kmalloc");
 103
 104 /*
 105  * The minimum amount of memory measured in pages to be free at all
 106  * times on the system.  This is similar to Linux's zone->pages_min
 107  * multiplied by the number of zones and is sized based on that.
 108  */
 109 pgcnt_t minfree = 0;
 110 EXPORT_SYMBOL(minfree);
 111
 112 /*
 113  * The desired amount of memory measured in pages to be free at all
 114  * times on the system.  This is similar to Linux's zone->pages_low
 115  * multiplied by the number of zones and is sized based on that.
 116  * Assuming all zones are being used roughly equally, when we drop
 117  * below this threshold asynchronous page reclamation is triggered.
 118  */
 119 pgcnt_t desfree = 0;
 120 EXPORT_SYMBOL(desfree);
 121
 122 /*
 123  * When above this amount of memory measured in pages the system is
 124  * determined to have enough free memory.  This is similar to Linux's
 125  * zone->pages_high multiplied by the number of zones and is sized based
 126  * on that.  Assuming all zones are being used roughly equally, when
 127  * asynchronous page reclamation reaches this threshold it stops.
 128  */
 129 pgcnt_t lotsfree = 0;
 130 EXPORT_SYMBOL(lotsfree);
 131
 132 /* Unused always 0 in this implementation */
 133 pgcnt_t needfree = 0;
 134 EXPORT_SYMBOL(needfree);
 135
 136 pgcnt_t swapfs_minfree = 0;
 137 EXPORT_SYMBOL(swapfs_minfree);
 138
 139 pgcnt_t swapfs_reserve = 0;
 140 EXPORT_SYMBOL(swapfs_reserve);
 141
 142 vmem_t *heap_arena = NULL;
 143 EXPORT_SYMBOL(heap_arena);
 144
 145 vmem_t *zio_alloc_arena = NULL;
 146 EXPORT_SYMBOL(zio_alloc_arena);
 147
 148 vmem_t *zio_arena = NULL;
 149 EXPORT_SYMBOL(zio_arena);
 150
 151 #ifndef HAVE_GET_VMALLOC_INFO
 152 get_vmalloc_info_t get_vmalloc_info_fn = SYMBOL_POISON;
 153 EXPORT_SYMBOL(get_vmalloc_info_fn);
 154 #endif /* HAVE_GET_VMALLOC_INFO */
 155
 156 #ifdef HAVE_PGDAT_HELPERS
 157 # ifndef HAVE_FIRST_ONLINE_PGDAT
 158 first_online_pgdat_t first_online_pgdat_fn = SYMBOL_POISON;
 159 EXPORT_SYMBOL(first_online_pgdat_fn);
 160 # endif /* HAVE_FIRST_ONLINE_PGDAT */
 161
 162 # ifndef HAVE_NEXT_ONLINE_PGDAT
 163 next_online_pgdat_t next_online_pgdat_fn = SYMBOL_POISON;
 164 EXPORT_SYMBOL(next_online_pgdat_fn);
 165 # endif /* HAVE_NEXT_ONLINE_PGDAT */
 166
 167 # ifndef HAVE_NEXT_ZONE
 168 next_zone_t next_zone_fn = SYMBOL_POISON;
 169 EXPORT_SYMBOL(next_zone_fn);
 170 # endif /* HAVE_NEXT_ZONE */
 171
 172 #else /* HAVE_PGDAT_HELPERS */
 173
 174 # ifndef HAVE_PGDAT_LIST
 175 struct pglist_data *pgdat_list_addr = SYMBOL_POISON;
 176 EXPORT_SYMBOL(pgdat_list_addr);
 177 # endif /* HAVE_PGDAT_LIST */
 178
 179 #endif /* HAVE_PGDAT_HELPERS */
 180
 181 #ifdef NEED_GET_ZONE_COUNTS
 182 # ifndef HAVE_GET_ZONE_COUNTS
 183 get_zone_counts_t get_zone_counts_fn = SYMBOL_POISON;
 184 EXPORT_SYMBOL(get_zone_counts_fn);
 185 # endif /* HAVE_GET_ZONE_COUNTS */
 186
 187 unsigned long
 188 spl_global_page_state(spl_zone_stat_item_t item)
 189 {
 190         unsigned long active;
 191         unsigned long inactive;
 192         unsigned long free;
 193
 194         get_zone_counts(&active, &inactive, &free);
 195         switch (item) {
 196         case SPL_NR_FREE_PAGES: return free;
 197         case SPL_NR_INACTIVE:   return inactive;
 198         case SPL_NR_ACTIVE:     return active;
 199         default:                ASSERT(0); /* Unsupported */
 200         }
 201
 202         return 0;
 203 }
 204 #else
 205 # ifdef HAVE_GLOBAL_PAGE_STATE
 206 unsigned long
 207 spl_global_page_state(spl_zone_stat_item_t item)
 208 {
 209         unsigned long pages = 0;
 210
 211         switch (item) {
 212         case SPL_NR_FREE_PAGES:
 213 #  ifdef HAVE_ZONE_STAT_ITEM_NR_FREE_PAGES
 214                 pages += global_page_state(NR_FREE_PAGES);
 215 #  endif
 216                 break;
 217         case SPL_NR_INACTIVE:
 218 #  ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE
 219                 pages += global_page_state(NR_INACTIVE);
 220 #  endif
 221 #  ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE_ANON
 222                 pages += global_page_state(NR_INACTIVE_ANON);
 223 #  endif
 224 #  ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE_FILE
 225                 pages += global_page_state(NR_INACTIVE_FILE);
 226 #  endif
 227                 break;
 228         case SPL_NR_ACTIVE:
 229 #  ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE
 230                 pages += global_page_state(NR_ACTIVE);
 231 #  endif
 232 #  ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE_ANON
 233                 pages += global_page_state(NR_ACTIVE_ANON);
 234 #  endif
 235 #  ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE_FILE
 236                 pages += global_page_state(NR_ACTIVE_FILE);
 237 #  endif
 238                 break;
 239         default:
 240                 ASSERT(0); /* Unsupported */
 241         }
 242
 243         return pages;
 244 }
 245 # else
 246 #  error "Both global_page_state() and get_zone_counts() unavailable"
 247 # endif /* HAVE_GLOBAL_PAGE_STATE */
 248 #endif /* NEED_GET_ZONE_COUNTS */
 249 EXPORT_SYMBOL(spl_global_page_state);
 250
 251 #ifndef HAVE_SHRINK_DCACHE_MEMORY
 252 shrink_dcache_memory_t shrink_dcache_memory_fn = SYMBOL_POISON;
 253 EXPORT_SYMBOL(shrink_dcache_memory_fn);
 254 #endif /* HAVE_SHRINK_DCACHE_MEMORY */
 255
 256 #ifndef HAVE_SHRINK_ICACHE_MEMORY
 257 shrink_icache_memory_t shrink_icache_memory_fn = SYMBOL_POISON;
 258 EXPORT_SYMBOL(shrink_icache_memory_fn);
 259 #endif /* HAVE_SHRINK_ICACHE_MEMORY */
 260
 261 pgcnt_t
 262 spl_kmem_availrmem(void)
 263 {
 264         /* The amount of easily available memory */
 265         return (spl_global_page_state(SPL_NR_FREE_PAGES) +
 266                 spl_global_page_state(SPL_NR_INACTIVE));
 267 }
 268 EXPORT_SYMBOL(spl_kmem_availrmem);
 269
 270 size_t
 271 vmem_size(vmem_t *vmp, int typemask)
 272 {
 273         struct vmalloc_info vmi;
 274         size_t size = 0;
 275
 276         ASSERT(vmp == NULL);
 277         ASSERT(typemask & (VMEM_ALLOC | VMEM_FREE));
 278
 279         get_vmalloc_info(&vmi);
 280         if (typemask & VMEM_ALLOC)
 281                 size += (size_t)vmi.used;
 282
 283         if (typemask & VMEM_FREE)
 284                 size += (size_t)(VMALLOC_TOTAL - vmi.used);
 285
 286         return size;
 287 }
 288 EXPORT_SYMBOL(vmem_size);
 289
 290 int
 291 kmem_debugging(void)
 292 {
 293         return 0;
 294 }
 295 EXPORT_SYMBOL(kmem_debugging);
 296
 297 #ifndef HAVE_KVASPRINTF
 298 /* Simplified asprintf. */
 299 char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
 300 {
 301         unsigned int len;
 302         char *p;
 303         va_list aq;
 304
 305         va_copy(aq, ap);
 306         len = vsnprintf(NULL, 0, fmt, aq);
 307         va_end(aq);
 308
 309         p = kmalloc(len+1, gfp);
 310         if (!p)
 311                 return NULL;
 312
 313         vsnprintf(p, len+1, fmt, ap);
 314
 315         return p;
 316 }
 317 EXPORT_SYMBOL(kvasprintf);
 318 #endif /* HAVE_KVASPRINTF */
 319
 320 char *
 321 kmem_vasprintf(const char *fmt, va_list ap)
 322 {
 323         va_list aq;
 324         char *ptr;
 325
 326         do {
 327                 va_copy(aq, ap);
 328                 ptr = kvasprintf(GFP_KERNEL, fmt, aq);
 329                 va_end(aq);
 330         } while (ptr == NULL);
 331
 332         return ptr;
 333 }
 334 EXPORT_SYMBOL(kmem_vasprintf);
 335
 336 char *
 337 kmem_asprintf(const char *fmt, ...)
 338 {
 339         va_list ap;
 340         char *ptr;
 341
 342         do {
 343                 va_start(ap, fmt);
 344                 ptr = kvasprintf(GFP_KERNEL, fmt, ap);
 345                 va_end(ap);
 346         } while (ptr == NULL);
 347
 348         return ptr;
 349 }
 350 EXPORT_SYMBOL(kmem_asprintf);
 351
 352 static char *
 353 __strdup(const char *str, int flags)
 354 {
 355         char *ptr;
 356         int n;
 357
 358         n = strlen(str);
 359         ptr = kmalloc_nofail(n + 1, flags);
 360         if (ptr)
 361                 memcpy(ptr, str, n + 1);
 362
 363         return ptr;
 364 }
 365
 366 char *
 367 strdup(const char *str)
 368 {
 369         return __strdup(str, KM_SLEEP);
 370 }
 371 EXPORT_SYMBOL(strdup);
 372
 373 void
 374 strfree(char *str)
 375 {
 376         kfree(str);
 377 }
 378 EXPORT_SYMBOL(strfree);
 379
 380 /*
 381  * Memory allocation interfaces and debugging for basic kmem_*
 382  * and vmem_* style memory allocation.  When DEBUG_KMEM is enabled
 383  * the SPL will keep track of the total memory allocated, and
 384  * report any memory leaked when the module is unloaded.
 385  */
 386 #ifdef DEBUG_KMEM
 387
 388 /* Shim layer memory accounting */
 389 # ifdef HAVE_ATOMIC64_T
 390 atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
 391 unsigned long long kmem_alloc_max = 0;
 392 atomic64_t vmem_alloc_used = ATOMIC64_INIT(0);
 393 unsigned long long vmem_alloc_max = 0;
 394 # else  /* HAVE_ATOMIC64_T */
 395 atomic_t kmem_alloc_used = ATOMIC_INIT(0);
 396 unsigned long long kmem_alloc_max = 0;
 397 atomic_t vmem_alloc_used = ATOMIC_INIT(0);
 398 unsigned long long vmem_alloc_max = 0;
 399 # endif /* HAVE_ATOMIC64_T */
 400
 401 EXPORT_SYMBOL(kmem_alloc_used);
 402 EXPORT_SYMBOL(kmem_alloc_max);
 403 EXPORT_SYMBOL(vmem_alloc_used);
 404 EXPORT_SYMBOL(vmem_alloc_max);
 405
 406 /* When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
 407  * but also the location of every alloc and free.  When the SPL module is
 408  * unloaded a list of all leaked addresses and where they were allocated
 409  * will be dumped to the console.  Enabling this feature has a significant
 410  * impact on performance but it makes finding memory leaks straight forward.
 411  *
 412  * Not surprisingly with debugging enabled the xmem_locks are very highly
 413  * contended particularly on xfree().  If we want to run with this detailed
 414  * debugging enabled for anything other than debugging  we need to minimize
 415  * the contention by moving to a lock per xmem_table entry model.
 416  */
 417 # ifdef DEBUG_KMEM_TRACKING
 418
 419 #  define KMEM_HASH_BITS          10
 420 #  define KMEM_TABLE_SIZE         (1 << KMEM_HASH_BITS)
 421
 422 #  define VMEM_HASH_BITS          10
 423 #  define VMEM_TABLE_SIZE         (1 << VMEM_HASH_BITS)
 424
 425 typedef struct kmem_debug {
 426         struct hlist_node kd_hlist;     /* Hash node linkage */
 427         struct list_head kd_list;       /* List of all allocations */
 428         void *kd_addr;                  /* Allocation pointer */
 429         size_t kd_size;                 /* Allocation size */
 430         const char *kd_func;            /* Allocation function */
 431         int kd_line;                    /* Allocation line */
 432 } kmem_debug_t;
 433
 434 spinlock_t kmem_lock;
 435 struct hlist_head kmem_table[KMEM_TABLE_SIZE];
 436 struct list_head kmem_list;
 437
 438 spinlock_t vmem_lock;
 439 struct hlist_head vmem_table[VMEM_TABLE_SIZE];
 440 struct list_head vmem_list;
 441
 442 EXPORT_SYMBOL(kmem_lock);
 443 EXPORT_SYMBOL(kmem_table);
 444 EXPORT_SYMBOL(kmem_list);
 445
 446 EXPORT_SYMBOL(vmem_lock);
 447 EXPORT_SYMBOL(vmem_table);
 448 EXPORT_SYMBOL(vmem_list);
 449
 450 static kmem_debug_t *
 451 kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, const void *addr)
 452 {
 453         struct hlist_head *head;
 454         struct hlist_node *node;
 455         struct kmem_debug *p;
 456         unsigned long flags;
 457         SENTRY;
 458
 459         spin_lock_irqsave(lock, flags);
 460
 461         head = &table[hash_ptr((void *)addr, bits)];
 462         hlist_for_each(node, head) {
 463                 p = list_entry(node, struct kmem_debug, kd_hlist);
 464                 if (p->kd_addr == addr) {
 465                         hlist_del_init(&p->kd_hlist);
 466                         list_del_init(&p->kd_list);
 467                         spin_unlock_irqrestore(lock, flags);
 468                         return p;
 469                 }
 470         }
 471
 472         spin_unlock_irqrestore(lock, flags);
 473
 474         SRETURN(NULL);
 475 }
 476
 477 void *
 478 kmem_alloc_track(size_t size, int flags, const char *func, int line,
 479     int node_alloc, int node)
 480 {
 481         void *ptr = NULL;
 482         kmem_debug_t *dptr;
 483         unsigned long irq_flags;
 484         SENTRY;
 485
 486         /* Function may be called with KM_NOSLEEP so failure is possible */
 487         dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
 488             flags & ~__GFP_ZERO);
 489
 490         if (unlikely(dptr == NULL)) {
 491                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "debug "
 492                     "kmem_alloc(%ld, 0x%x) at %s:%d failed (%lld/%llu)\n",
 493                     sizeof(kmem_debug_t), flags, func, line,
 494                     kmem_alloc_used_read(), kmem_alloc_max);
 495         } else {
 496                 /*
 497                  * Marked unlikely because we should never be doing this,
 498                  * we tolerate to up 2 pages but a single page is best.
 499                  */
 500                 if (unlikely((size > PAGE_SIZE*2) && !(flags & KM_NODEBUG))) {
 501                         SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "large "
 502                             "kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
 503                             (unsigned long long) size, flags, func, line,
 504                             kmem_alloc_used_read(), kmem_alloc_max);
 505                         spl_debug_dumpstack(NULL);
 506                 }
 507
 508                 /*
 509                  *  We use __strdup() below because the string pointed to by
 510                  * __FUNCTION__ might not be available by the time we want
 511                  * to print it since the module might have been unloaded.
 512                  * This can only fail in the KM_NOSLEEP case.
 513                  */
 514                 dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
 515                 if (unlikely(dptr->kd_func == NULL)) {
 516                         kfree(dptr);
 517                         SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
 518                             "debug __strdup() at %s:%d failed (%lld/%llu)\n",
 519                             func, line, kmem_alloc_used_read(), kmem_alloc_max);
 520                         goto out;
 521                 }
 522
 523                 /* Use the correct allocator */
 524                 if (node_alloc) {
 525                         ASSERT(!(flags & __GFP_ZERO));
 526                         ptr = kmalloc_node_nofail(size, flags, node);
 527                 } else if (flags & __GFP_ZERO) {
 528                         ptr = kzalloc_nofail(size, flags & ~__GFP_ZERO);
 529                 } else {
 530                         ptr = kmalloc_nofail(size, flags);
 531                 }
 532
 533                 if (unlikely(ptr == NULL)) {
 534                         kfree(dptr->kd_func);
 535                         kfree(dptr);
 536                         SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "kmem_alloc"
 537                             "(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
 538                             (unsigned long long) size, flags, func, line,
 539                             kmem_alloc_used_read(), kmem_alloc_max);
 540                         goto out;
 541                 }
 542
 543                 kmem_alloc_used_add(size);
 544                 if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
 545                         kmem_alloc_max = kmem_alloc_used_read();
 546
 547                 INIT_HLIST_NODE(&dptr->kd_hlist);
 548                 INIT_LIST_HEAD(&dptr->kd_list);
 549
 550                 dptr->kd_addr = ptr;
 551                 dptr->kd_size = size;
 552                 dptr->kd_line = line;
 553
 554                 spin_lock_irqsave(&kmem_lock, irq_flags);
 555                 hlist_add_head(&dptr->kd_hlist,
 556                     &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
 557                 list_add_tail(&dptr->kd_list, &kmem_list);
 558                 spin_unlock_irqrestore(&kmem_lock, irq_flags);
 559
 560                 SDEBUG_LIMIT(SD_INFO,
 561                     "kmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n",
 562                     (unsigned long long) size, flags, func, line, ptr,
 563                     kmem_alloc_used_read(), kmem_alloc_max);
 564         }
 565 out:
 566         SRETURN(ptr);
 567 }
 568 EXPORT_SYMBOL(kmem_alloc_track);
 569
 570 void
 571 kmem_free_track(const void *ptr, size_t size)
 572 {
 573         kmem_debug_t *dptr;
 574         SENTRY;
 575
 576         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 577             (unsigned long long) size);
 578
 579         dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
 580
 581         /* Must exist in hash due to kmem_alloc() */
 582         ASSERT(dptr);
 583
 584         /* Size must match */
 585         ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
 586             "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
 587             (unsigned long long) size, dptr->kd_func, dptr->kd_line);
 588
 589         kmem_alloc_used_sub(size);
 590         SDEBUG_LIMIT(SD_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 591             (unsigned long long) size, kmem_alloc_used_read(),
 592             kmem_alloc_max);
 593
 594         kfree(dptr->kd_func);
 595
 596         memset((void *)dptr, 0x5a, sizeof(kmem_debug_t));
 597         kfree(dptr);
 598
 599         memset((void *)ptr, 0x5a, size);
 600         kfree(ptr);
 601
 602         SEXIT;
 603 }
 604 EXPORT_SYMBOL(kmem_free_track);
 605
 606 void *
 607 vmem_alloc_track(size_t size, int flags, const char *func, int line)
 608 {
 609         void *ptr = NULL;
 610         kmem_debug_t *dptr;
 611         unsigned long irq_flags;
 612         SENTRY;
 613
 614         ASSERT(flags & KM_SLEEP);
 615
 616         /* Function may be called with KM_NOSLEEP so failure is possible */
 617         dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
 618             flags & ~__GFP_ZERO);
 619         if (unlikely(dptr == NULL)) {
 620                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "debug "
 621                     "vmem_alloc(%ld, 0x%x) at %s:%d failed (%lld/%llu)\n",
 622                     sizeof(kmem_debug_t), flags, func, line,
 623                     vmem_alloc_used_read(), vmem_alloc_max);
 624         } else {
 625                 /*
 626                  * We use __strdup() below because the string pointed to by
 627                  * __FUNCTION__ might not be available by the time we want
 628                  * to print it, since the module might have been unloaded.
 629                  * This can never fail because we have already asserted
 630                  * that flags is KM_SLEEP.
 631                  */
 632                 dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
 633                 if (unlikely(dptr->kd_func == NULL)) {
 634                         kfree(dptr);
 635                         SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
 636                             "debug __strdup() at %s:%d failed (%lld/%llu)\n",
 637                             func, line, vmem_alloc_used_read(), vmem_alloc_max);
 638                         goto out;
 639                 }
 640
 641                 /* Use the correct allocator */
 642                 if (flags & __GFP_ZERO) {
 643                         ptr = vzalloc_nofail(size, flags & ~__GFP_ZERO);
 644                 } else {
 645                         ptr = vmalloc_nofail(size, flags);
 646                 }
 647
 648                 if (unlikely(ptr == NULL)) {
 649                         kfree(dptr->kd_func);
 650                         kfree(dptr);
 651                         SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "vmem_alloc"
 652                             "(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
 653                             (unsigned long long) size, flags, func, line,
 654                             vmem_alloc_used_read(), vmem_alloc_max);
 655                         goto out;
 656                 }
 657
 658                 vmem_alloc_used_add(size);
 659                 if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
 660                         vmem_alloc_max = vmem_alloc_used_read();
 661
 662                 INIT_HLIST_NODE(&dptr->kd_hlist);
 663                 INIT_LIST_HEAD(&dptr->kd_list);
 664
 665                 dptr->kd_addr = ptr;
 666                 dptr->kd_size = size;
 667                 dptr->kd_line = line;
 668
 669                 spin_lock_irqsave(&vmem_lock, irq_flags);
 670                 hlist_add_head(&dptr->kd_hlist,
 671                     &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]);
 672                 list_add_tail(&dptr->kd_list, &vmem_list);
 673                 spin_unlock_irqrestore(&vmem_lock, irq_flags);
 674
 675                 SDEBUG_LIMIT(SD_INFO,
 676                     "vmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n",
 677                     (unsigned long long) size, flags, func, line,
 678                     ptr, vmem_alloc_used_read(), vmem_alloc_max);
 679         }
 680 out:
 681         SRETURN(ptr);
 682 }
 683 EXPORT_SYMBOL(vmem_alloc_track);
 684
 685 void
 686 vmem_free_track(const void *ptr, size_t size)
 687 {
 688         kmem_debug_t *dptr;
 689         SENTRY;
 690
 691         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 692             (unsigned long long) size);
 693
 694         dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr);
 695
 696         /* Must exist in hash due to vmem_alloc() */
 697         ASSERT(dptr);
 698
 699         /* Size must match */
 700         ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
 701             "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
 702             (unsigned long long) size, dptr->kd_func, dptr->kd_line);
 703
 704         vmem_alloc_used_sub(size);
 705         SDEBUG_LIMIT(SD_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 706             (unsigned long long) size, vmem_alloc_used_read(),
 707             vmem_alloc_max);
 708
 709         kfree(dptr->kd_func);
 710
 711         memset((void *)dptr, 0x5a, sizeof(kmem_debug_t));
 712         kfree(dptr);
 713
 714         memset((void *)ptr, 0x5a, size);
 715         vfree(ptr);
 716
 717         SEXIT;
 718 }
 719 EXPORT_SYMBOL(vmem_free_track);
 720
 721 # else /* DEBUG_KMEM_TRACKING */
 722
 723 void *
 724 kmem_alloc_debug(size_t size, int flags, const char *func, int line,
 725     int node_alloc, int node)
 726 {
 727         void *ptr;
 728         SENTRY;
 729
 730         /*
 731          * Marked unlikely because we should never be doing this,
 732          * we tolerate to up 2 pages but a single page is best.
 733          */
 734         if (unlikely((size > PAGE_SIZE * 2) && !(flags & KM_NODEBUG))) {
 735                 SDEBUG(SD_CONSOLE | SD_WARNING,
 736                     "large kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
 737                     (unsigned long long) size, flags, func, line,
 738                     kmem_alloc_used_read(), kmem_alloc_max);
 739                 spl_debug_dumpstack(NULL);
 740         }
 741
 742         /* Use the correct allocator */
 743         if (node_alloc) {
 744                 ASSERT(!(flags & __GFP_ZERO));
 745                 ptr = kmalloc_node_nofail(size, flags, node);
 746         } else if (flags & __GFP_ZERO) {
 747                 ptr = kzalloc_nofail(size, flags & (~__GFP_ZERO));
 748         } else {
 749                 ptr = kmalloc_nofail(size, flags);
 750         }
 751
 752         if (unlikely(ptr == NULL)) {
 753                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
 754                     "kmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
 755                     (unsigned long long) size, flags, func, line,
 756                     kmem_alloc_used_read(), kmem_alloc_max);
 757         } else {
 758                 kmem_alloc_used_add(size);
 759                 if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
 760                         kmem_alloc_max = kmem_alloc_used_read();
 761
 762                 SDEBUG_LIMIT(SD_INFO,
 763                     "kmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n",
 764                     (unsigned long long) size, flags, func, line, ptr,
 765                     kmem_alloc_used_read(), kmem_alloc_max);
 766         }
 767
 768         SRETURN(ptr);
 769 }
 770 EXPORT_SYMBOL(kmem_alloc_debug);
 771
 772 void
 773 kmem_free_debug(const void *ptr, size_t size)
 774 {
 775         SENTRY;
 776
 777         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 778             (unsigned long long) size);
 779
 780         kmem_alloc_used_sub(size);
 781         SDEBUG_LIMIT(SD_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 782             (unsigned long long) size, kmem_alloc_used_read(),
 783             kmem_alloc_max);
 784         kfree(ptr);
 785
 786         SEXIT;
 787 }
 788 EXPORT_SYMBOL(kmem_free_debug);
 789
 790 void *
 791 vmem_alloc_debug(size_t size, int flags, const char *func, int line)
 792 {
 793         void *ptr;
 794         SENTRY;
 795
 796         ASSERT(flags & KM_SLEEP);
 797
 798         /* Use the correct allocator */
 799         if (flags & __GFP_ZERO) {
 800                 ptr = vzalloc_nofail(size, flags & (~__GFP_ZERO));
 801         } else {
 802                 ptr = vmalloc_nofail(size, flags);
 803         }
 804
 805         if (unlikely(ptr == NULL)) {
 806                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
 807                     "vmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
 808                     (unsigned long long) size, flags, func, line,
 809                     vmem_alloc_used_read(), vmem_alloc_max);
 810         } else {
 811                 vmem_alloc_used_add(size);
 812                 if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
 813                         vmem_alloc_max = vmem_alloc_used_read();
 814
 815                 SDEBUG_LIMIT(SD_INFO, "vmem_alloc(%llu, 0x%x) = %p "
 816                     "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
 817                     vmem_alloc_used_read(), vmem_alloc_max);
 818         }
 819
 820         SRETURN(ptr);
 821 }
 822 EXPORT_SYMBOL(vmem_alloc_debug);
 823
 824 void
 825 vmem_free_debug(const void *ptr, size_t size)
 826 {
 827         SENTRY;
 828
 829         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 830             (unsigned long long) size);
 831
 832         vmem_alloc_used_sub(size);
 833         SDEBUG_LIMIT(SD_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 834             (unsigned long long) size, vmem_alloc_used_read(),
 835             vmem_alloc_max);
 836         vfree(ptr);
 837
 838         SEXIT;
 839 }
 840 EXPORT_SYMBOL(vmem_free_debug);
 841
 842 # endif /* DEBUG_KMEM_TRACKING */
 843 #endif /* DEBUG_KMEM */
 844
 845 /*
 846  * Slab allocation interfaces
 847  *
 848  * While the Linux slab implementation was inspired by the Solaris
 849  * implementation I cannot use it to emulate the Solaris APIs.  I
 850  * require two features which are not provided by the Linux slab.
 851  *
 852  * 1) Constructors AND destructors.  Recent versions of the Linux
 853  *    kernel have removed support for destructors.  This is a deal
 854  *    breaker for the SPL which contains particularly expensive
 855  *    initializers for mutex's, condition variables, etc.  We also
 856  *    require a minimal level of cleanup for these data types unlike
 *    many Linux data types which do not need to be explicitly destroyed.
 858  *
 859  * 2) Virtual address space backed slab.  Callers of the Solaris slab
 *    expect it to work well for both small and very large allocations.
 861  *    Because of memory fragmentation the Linux slab which is backed
 862  *    by kmalloc'ed memory performs very badly when confronted with
 863  *    large numbers of large allocations.  Basing the slab on the
 864  *    virtual address space removes the need for contiguous pages
 *    and greatly improves performance for large allocations.
 866  *
 867  * For these reasons, the SPL has its own slab implementation with
 868  * the needed features.  It is not as highly optimized as either the
 869  * Solaris or Linux slabs, but it should get me most of what is
 870  * needed until it can be optimized or obsoleted by another approach.
 871  *
 872  * One serious concern I do have about this method is the relatively
 873  * small virtual address space on 32bit arches.  This will seriously
 874  * constrain the size of the slab caches and their performance.
 875  *
 876  * XXX: Improve the partial slab list by carefully maintaining a
 877  *      strict ordering of fullest to emptiest slabs based on
 *      the slab reference count.  This guarantees that when freeing
 879  *      slabs back to the system we need only linearly traverse the
 880  *      last N slabs in the list to discover all the freeable slabs.
 881  *
 882  * XXX: NUMA awareness for optionally allocating memory close to a
 883  *      particular core.  This can be advantageous if you know the slab
 884  *      object will be short lived and primarily accessed from one core.
 885  *
 886  * XXX: Slab coloring may also yield performance improvements and would
 887  *      be desirable to implement.
 888  */
 889
 890 struct list_head spl_kmem_cache_list;   /* List of caches */
 891 struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
 892 taskq_t *spl_kmem_cache_taskq;          /* Task queue for ageing / reclaim */
 893
 894 static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);
 895
 896 SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker);
 897 SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker,
 898         spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS);
 899
 900 static void *
 901 kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
 902 {
 903         void *ptr;
 904
 905         ASSERT(ISP2(size));
 906
 907         if (skc->skc_flags & KMC_KMEM)
 908                 ptr = (void *)__get_free_pages(flags | __GFP_COMP,
 909                     get_order(size));
 910         else
 911                 ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
 912
 913         /* Resulting allocated memory will be page aligned */
 914         ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
 915
 916         return ptr;
 917 }
 918
 919 static void
 920 kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
 921 {
 922         ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
 923         ASSERT(ISP2(size));
 924
 925         /*
 926          * The Linux direct reclaim path uses this out of band value to
 927          * determine if forward progress is being made.  Normally this is
 928          * incremented by kmem_freepages() which is part of the various
 929          * Linux slab implementations.  However, since we are using none
 930          * of that infrastructure we are responsible for incrementing it.
 931          */
 932         if (current->reclaim_state)
 933                 current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
 934
 935         if (skc->skc_flags & KMC_KMEM)
 936                 free_pages((unsigned long)ptr, get_order(size));
 937         else
 938                 vfree(ptr);
 939 }
 940
 941 /*
 942  * Required space for each aligned sks.
 943  */
 944 static inline uint32_t
 945 spl_sks_size(spl_kmem_cache_t *skc)
 946 {
 947         return P2ROUNDUP_TYPED(sizeof(spl_kmem_slab_t),
 948                skc->skc_obj_align, uint32_t);
 949 }
 950
 951 /*
 952  * Required space for each aligned object.
 953  */
 954 static inline uint32_t
 955 spl_obj_size(spl_kmem_cache_t *skc)
 956 {
 957         uint32_t align = skc->skc_obj_align;
 958
 959         return P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
 960                P2ROUNDUP_TYPED(sizeof(spl_kmem_obj_t), align, uint32_t);
 961 }
 962
 963 /*
 964  * Lookup the spl_kmem_object_t for an object given that object.
 965  */
 966 static inline spl_kmem_obj_t *
 967 spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
 968 {
 969         return obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
 970                skc->skc_obj_align, uint32_t);
 971 }
 972
 973 /*
 974  * Required space for each offslab object taking in to account alignment
 975  * restrictions and the power-of-two requirement of kv_alloc().
 976  */
 977 static inline uint32_t
 978 spl_offslab_size(spl_kmem_cache_t *skc)
 979 {
 980         return 1UL << (highbit(spl_obj_size(skc)) + 1);
 981 }
 982
 983 /*
 984  * It's important that we pack the spl_kmem_obj_t structure and the
 985  * actual objects in to one large address space to minimize the number
 986  * of calls to the allocator.  It is far better to do a few large
 987  * allocations and then subdivide it ourselves.  Now which allocator
 988  * we use requires balancing a few trade offs.
 989  *
 990  * For small objects we use kmem_alloc() because as long as you are
 991  * only requesting a small number of pages (ideally just one) its cheap.
 992  * However, when you start requesting multiple pages with kmem_alloc()
 993  * it gets increasingly expensive since it requires contiguous pages.
 994  * For this reason we shift to vmem_alloc() for slabs of large objects
 995  * which removes the need for contiguous pages.  We do not use
 996  * vmem_alloc() in all cases because there is significant locking
 997  * overhead in __get_vm_area_node().  This function takes a single
 998  * global lock when acquiring an available virtual address range which
 999  * serializes all vmem_alloc()'s for all slab caches.  Using slightly
1000  * different allocation functions for small and large objects should
1001  * give us the best of both worlds.
1002  *
1003  * KMC_ONSLAB                       KMC_OFFSLAB
1004  *
1005  * +------------------------+       +-----------------+
1006  * | spl_kmem_slab_t --+-+  |       | spl_kmem_slab_t |---+-+
1007  * | skc_obj_size    <-+ |  |       +-----------------+   | |
1008  * | spl_kmem_obj_t      |  |                             | |
1009  * | skc_obj_size    <---+  |       +-----------------+   | |
1010  * | spl_kmem_obj_t      |  |       | skc_obj_size    | <-+ |
1011  * | ...                 v  |       | spl_kmem_obj_t  |     |
1012  * +------------------------+       +-----------------+     v
1013  */
1014 static spl_kmem_slab_t *
1015 spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
1016 {
1017         spl_kmem_slab_t *sks;
1018         spl_kmem_obj_t *sko, *n;
1019         void *base, *obj;
1020         uint32_t obj_size, offslab_size = 0;
1021         int i,  rc = 0;
1022
1023         base = kv_alloc(skc, skc->skc_slab_size, flags);
1024         if (base == NULL)
1025                 SRETURN(NULL);
1026
1027         sks = (spl_kmem_slab_t *)base;
1028         sks->sks_magic = SKS_MAGIC;
1029         sks->sks_objs = skc->skc_slab_objs;
1030         sks->sks_age = jiffies;
1031         sks->sks_cache = skc;
1032         INIT_LIST_HEAD(&sks->sks_list);
1033         INIT_LIST_HEAD(&sks->sks_free_list);
1034         sks->sks_ref = 0;
1035         obj_size = spl_obj_size(skc);
1036
1037         if (skc->skc_flags & KMC_OFFSLAB)
1038                 offslab_size = spl_offslab_size(skc);
1039
1040         for (i = 0; i < sks->sks_objs; i++) {
1041                 if (skc->skc_flags & KMC_OFFSLAB) {
1042                         obj = kv_alloc(skc, offslab_size, flags);
1043                         if (!obj)
1044                                 SGOTO(out, rc = -ENOMEM);
1045                 } else {
1046                         obj = base + spl_sks_size(skc) + (i * obj_size);
1047                 }
1048
1049                 ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
1050                 sko = spl_sko_from_obj(skc, obj);
1051                 sko->sko_addr = obj;
1052                 sko->sko_magic = SKO_MAGIC;
1053                 sko->sko_slab = sks;
1054                 INIT_LIST_HEAD(&sko->sko_list);
1055                 list_add_tail(&sko->sko_list, &sks->sks_free_list);
1056         }
1057
1058         list_for_each_entry(sko, &sks->sks_free_list, sko_list)
1059                 if (skc->skc_ctor)
1060                         skc->skc_ctor(sko->sko_addr, skc->skc_private, flags);
1061 out:
1062         if (rc) {
1063                 if (skc->skc_flags & KMC_OFFSLAB)
1064                         list_for_each_entry_safe(sko, n, &sks->sks_free_list,
1065                                                  sko_list)
1066                                 kv_free(skc, sko->sko_addr, offslab_size);
1067
1068                 kv_free(skc, base, skc->skc_slab_size);
1069                 sks = NULL;
1070         }
1071
1072         SRETURN(sks);
1073 }
1074
1075 /*
1076  * Remove a slab from complete or partial list, it must be called with
1077  * the 'skc->skc_lock' held but the actual free must be performed
1078  * outside the lock to prevent deadlocking on vmem addresses.
1079  */
1080 static void
1081 spl_slab_free(spl_kmem_slab_t *sks,
1082               struct list_head *sks_list, struct list_head *sko_list)
1083 {
1084         spl_kmem_cache_t *skc;
1085         SENTRY;
1086
1087         ASSERT(sks->sks_magic == SKS_MAGIC);
1088         ASSERT(sks->sks_ref == 0);
1089
1090         skc = sks->sks_cache;
1091         ASSERT(skc->skc_magic == SKC_MAGIC);
1092         ASSERT(spin_is_locked(&skc->skc_lock));
1093
1094         /*
1095          * Update slab/objects counters in the cache, then remove the
1096          * slab from the skc->skc_partial_list.  Finally add the slab
1097          * and all its objects in to the private work lists where the
1098          * destructors will be called and the memory freed to the system.
1099          */
1100         skc->skc_obj_total -= sks->sks_objs;
1101         skc->skc_slab_total--;
1102         list_del(&sks->sks_list);
1103         list_add(&sks->sks_list, sks_list);
1104         list_splice_init(&sks->sks_free_list, sko_list);
1105
1106         SEXIT;
1107 }
1108
1109 /*
1110  * Traverses all the partial slabs attached to a cache and free those
1111  * which which are currently empty, and have not been touched for
1112  * skc_delay seconds to  avoid thrashing.  The count argument is
1113  * passed to optionally cap the number of slabs reclaimed, a count
1114  * of zero means try and reclaim everything.  When flag is set we
1115  * always free an available slab regardless of age.
1116  */
1117 static void
1118 spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
1119 {
1120         spl_kmem_slab_t *sks, *m;
1121         spl_kmem_obj_t *sko, *n;
1122         LIST_HEAD(sks_list);
1123         LIST_HEAD(sko_list);
1124         uint32_t size = 0;
1125         int i = 0;
1126         SENTRY;
1127
1128         /*
1129          * Move empty slabs and objects which have not been touched in
1130          * skc_delay seconds on to private lists to be freed outside
1131          * the spin lock.  This delay time is important to avoid thrashing
1132          * however when flag is set the delay will not be used.
1133          */
1134         spin_lock(&skc->skc_lock);
1135         list_for_each_entry_safe_reverse(sks,m,&skc->skc_partial_list,sks_list){
1136                 /*
1137                  * All empty slabs are at the end of skc->skc_partial_list,
1138                  * therefore once a non-empty slab is found we can stop
1139                  * scanning.  Additionally, stop when reaching the target
1140                  * reclaim 'count' if a non-zero threshold is given.
1141                  */
1142                 if ((sks->sks_ref > 0) || (count && i >= count))
1143                         break;
1144
1145                 if (time_after(jiffies,sks->sks_age+skc->skc_delay*HZ)||flag) {
1146                         spl_slab_free(sks, &sks_list, &sko_list);
1147                         i++;
1148                 }
1149         }
1150         spin_unlock(&skc->skc_lock);
1151
1152         /*
1153          * The following two loops ensure all the object destructors are
1154          * run, any offslab objects are freed, and the slabs themselves
1155          * are freed.  This is all done outside the skc->skc_lock since
1156          * this allows the destructor to sleep, and allows us to perform
1157          * a conditional reschedule when a freeing a large number of
1158          * objects and slabs back to the system.
1159          */
1160         if (skc->skc_flags & KMC_OFFSLAB)
1161                 size = spl_offslab_size(skc);
1162
1163         list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
1164                 ASSERT(sko->sko_magic == SKO_MAGIC);
1165
1166                 if (skc->skc_dtor)
1167                         skc->skc_dtor(sko->sko_addr, skc->skc_private);
1168
1169                 if (skc->skc_flags & KMC_OFFSLAB)
1170                         kv_free(skc, sko->sko_addr, size);
1171         }
1172
1173         list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
1174                 ASSERT(sks->sks_magic == SKS_MAGIC);
1175                 kv_free(skc, sks, skc->skc_slab_size);
1176         }
1177
1178         SEXIT;
1179 }
1180
1181 static spl_kmem_emergency_t *
1182 spl_emergency_search(struct rb_root *root, void *obj)
1183 {
1184         struct rb_node *node = root->rb_node;
1185         spl_kmem_emergency_t *ske;
1186         unsigned long address = (unsigned long)obj;
1187
1188         while (node) {
1189                 ske = container_of(node, spl_kmem_emergency_t, ske_node);
1190
1191                 if (address < (unsigned long)ske->ske_obj)
1192                         node = node->rb_left;
1193                 else if (address > (unsigned long)ske->ske_obj)
1194                         node = node->rb_right;
1195                 else
1196                         return ske;
1197         }
1198
1199         return NULL;
1200 }
1201
1202 static int
1203 spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
1204 {
1205         struct rb_node **new = &(root->rb_node), *parent = NULL;
1206         spl_kmem_emergency_t *ske_tmp;
1207         unsigned long address = (unsigned long)ske->ske_obj;
1208
1209         while (*new) {
1210                 ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
1211
1212                 parent = *new;
1213                 if (address < (unsigned long)ske_tmp->ske_obj)
1214                         new = &((*new)->rb_left);
1215                 else if (address > (unsigned long)ske_tmp->ske_obj)
1216                         new = &((*new)->rb_right);
1217                 else
1218                         return 0;
1219         }
1220
1221         rb_link_node(&ske->ske_node, parent, new);
1222         rb_insert_color(&ske->ske_node, root);
1223
1224         return 1;
1225 }
1226
1227 /*
1228  * Allocate a single emergency object and track it in a red black tree.
1229  */
1230 static int
1231 spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
1232 {
1233         spl_kmem_emergency_t *ske;
1234         int empty;
1235         SENTRY;
1236
1237         /* Last chance use a partial slab if one now exists */
1238         spin_lock(&skc->skc_lock);
1239         empty = list_empty(&skc->skc_partial_list);
1240         spin_unlock(&skc->skc_lock);
1241         if (!empty)
1242                 SRETURN(-EEXIST);
1243
1244         ske = kmalloc(sizeof(*ske), flags);
1245         if (ske == NULL)
1246                 SRETURN(-ENOMEM);
1247
1248         ske->ske_obj = kmalloc(skc->skc_obj_size, flags);
1249         if (ske->ske_obj == NULL) {
1250                 kfree(ske);
1251                 SRETURN(-ENOMEM);
1252         }
1253
1254         spin_lock(&skc->skc_lock);
1255         empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
1256         if (likely(empty)) {
1257                 skc->skc_obj_total++;
1258                 skc->skc_obj_emergency++;
1259                 if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
1260                         skc->skc_obj_emergency_max = skc->skc_obj_emergency;
1261         }
1262         spin_unlock(&skc->skc_lock);
1263
1264         if (unlikely(!empty)) {
1265                 kfree(ske->ske_obj);
1266                 kfree(ske);
1267                 SRETURN(-EINVAL);
1268         }
1269
1270         if (skc->skc_ctor)
1271                 skc->skc_ctor(ske->ske_obj, skc->skc_private, flags);
1272
1273         *obj = ske->ske_obj;
1274
1275         SRETURN(0);
1276 }
1277
1278 /*
1279  * Locate the passed object in the red black tree and free it.
1280  */
1281 static int
1282 spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
1283 {
1284         spl_kmem_emergency_t *ske;
1285         SENTRY;
1286
1287         spin_lock(&skc->skc_lock);
1288         ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
1289         if (likely(ske)) {
1290                 rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
1291                 skc->skc_obj_emergency--;
1292                 skc->skc_obj_total--;
1293         }
1294         spin_unlock(&skc->skc_lock);
1295
1296         if (unlikely(ske == NULL))
1297                 SRETURN(-ENOENT);
1298
1299         if (skc->skc_dtor)
1300                 skc->skc_dtor(ske->ske_obj, skc->skc_private);
1301
1302         kfree(ske->ske_obj);
1303         kfree(ske);
1304
1305         SRETURN(0);
1306 }
1307
1308 /*
1309  * Release objects from the per-cpu magazine back to their slab.  The flush
1310  * argument contains the max number of entries to remove from the magazine.
1311  */
1312 static void
1313 __spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
1314 {
1315         int i, count = MIN(flush, skm->skm_avail);
1316         SENTRY;
1317
1318         ASSERT(skc->skc_magic == SKC_MAGIC);
1319         ASSERT(skm->skm_magic == SKM_MAGIC);
1320         ASSERT(spin_is_locked(&skc->skc_lock));
1321
1322         for (i = 0; i < count; i++)
1323                 spl_cache_shrink(skc, skm->skm_objs[i]);
1324
1325         skm->skm_avail -= count;
1326         memmove(skm->skm_objs, &(skm->skm_objs[count]),
1327                 sizeof(void *) * skm->skm_avail);
1328
1329         SEXIT;
1330 }
1331
1332 static void
1333 spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
1334 {
1335         spin_lock(&skc->skc_lock);
1336         __spl_cache_flush(skc, skm, flush);
1337         spin_unlock(&skc->skc_lock);
1338 }
1339
1340 static void
1341 spl_magazine_age(void *data)
1342 {
1343         spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
1344         spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
1345
1346         ASSERT(skm->skm_magic == SKM_MAGIC);
1347         ASSERT(skm->skm_cpu == smp_processor_id());
1348         ASSERT(irqs_disabled());
1349
1350         /* There are no available objects or they are too young to age out */
1351         if ((skm->skm_avail == 0) ||
1352             time_before(jiffies, skm->skm_age + skc->skc_delay * HZ))
1353                 return;
1354
1355         /*
1356          * Because we're executing in interrupt context we may have
1357          * interrupted the holder of this lock.  To avoid a potential
1358          * deadlock return if the lock is contended.
1359          */
1360         if (!spin_trylock(&skc->skc_lock))
1361                 return;
1362
1363         __spl_cache_flush(skc, skm, skm->skm_refill);
1364         spin_unlock(&skc->skc_lock);
1365 }
1366
1367 /*
1368  * Called regularly to keep a downward pressure on the cache.
1369  *
1370  * Objects older than skc->skc_delay seconds in the per-cpu magazines will
1371  * be returned to the caches.  This is done to prevent idle magazines from
1372  * holding memory which could be better used elsewhere.  The delay is
1373  * present to prevent thrashing the magazine.
1374  *
1375  * The newly released objects may result in empty partial slabs.  Those
1376  * slabs should be released to the system.  Otherwise moving the objects
1377  * out of the magazines is just wasted work.
1378  */
1379 static void
1380 spl_cache_age(void *data)
1381 {
1382         spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
1383         taskqid_t id = 0;
1384
1385         ASSERT(skc->skc_magic == SKC_MAGIC);
1386
1387         /* Dynamically disabled at run time */
1388         if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE))
1389                 return;
1390
1391         atomic_inc(&skc->skc_ref);
1392
1393         if (!(skc->skc_flags & KMC_NOMAGAZINE))
1394                 spl_on_each_cpu(spl_magazine_age, skc, 1);
1395
1396         spl_slab_reclaim(skc, skc->skc_reap, 0);
1397
1398         while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) {
1399                 id = taskq_dispatch_delay(
1400                     spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP,
1401                     ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
1402
1403                 /* Destroy issued after dispatch immediately cancel it */
1404                 if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id)
1405                         taskq_cancel_id(spl_kmem_cache_taskq, id);
1406         }
1407
1408         spin_lock(&skc->skc_lock);
1409         skc->skc_taskqid = id;
1410         spin_unlock(&skc->skc_lock);
1411
1412         atomic_dec(&skc->skc_ref);
1413 }
1414
1415 /*
1416  * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
1417  * When on-slab we want to target spl_kmem_cache_obj_per_slab.  However,
1418  * for very small objects we may end up with more than this so as not
1419  * to waste space in the minimal allocation of a single page.  Also for
1420  * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min,
1421  * lower than this and we will fail.
1422  */
1423 static int
1424 spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
1425 {
1426         uint32_t sks_size, obj_size, max_size;
1427
1428         if (skc->skc_flags & KMC_OFFSLAB) {
1429                 *objs = spl_kmem_cache_obj_per_slab;
1430                 *size = P2ROUNDUP(sizeof(spl_kmem_slab_t), PAGE_SIZE);
1431                 SRETURN(0);
1432         } else {
1433                 sks_size = spl_sks_size(skc);
1434                 obj_size = spl_obj_size(skc);
1435
1436                 if (skc->skc_flags & KMC_KMEM)
1437                         max_size = ((uint32_t)1 << (MAX_ORDER-3)) * PAGE_SIZE;
1438                 else
1439                         max_size = (spl_kmem_cache_max_size * 1024 * 1024);
1440
1441                 /* Power of two sized slab */
1442                 for (*size = PAGE_SIZE; *size <= max_size; *size *= 2) {
1443                         *objs = (*size - sks_size) / obj_size;
1444                         if (*objs >= spl_kmem_cache_obj_per_slab)
1445                                 SRETURN(0);
1446                 }
1447
1448                 /*
1449                  * Unable to satisfy target objects per slab, fall back to
1450                  * allocating a maximally sized slab and assuming it can
1451                  * contain the minimum objects count use it.  If not fail.
1452                  */
1453                 *size = max_size;
1454                 *objs = (*size - sks_size) / obj_size;
1455                 if (*objs >= (spl_kmem_cache_obj_per_slab_min))
1456                         SRETURN(0);
1457         }
1458
1459         SRETURN(-ENOSPC);
1460 }
1461
1462 /*
1463  * Make a guess at reasonable per-cpu magazine size based on the size of
1464  * each object and the cost of caching N of them in each magazine.  Long
1465  * term this should really adapt based on an observed usage heuristic.
1466  */
1467 static int
1468 spl_magazine_size(spl_kmem_cache_t *skc)
1469 {
1470         uint32_t obj_size = spl_obj_size(skc);
1471         int size;
1472         SENTRY;
1473
1474         /* Per-magazine sizes below assume a 4Kib page size */
1475         if (obj_size > (PAGE_SIZE * 256))
1476                 size = 4;  /* Minimum 4Mib per-magazine */
1477         else if (obj_size > (PAGE_SIZE * 32))
1478                 size = 16; /* Minimum 2Mib per-magazine */
1479         else if (obj_size > (PAGE_SIZE))
1480                 size = 64; /* Minimum 256Kib per-magazine */
1481         else if (obj_size > (PAGE_SIZE / 4))
1482                 size = 128; /* Minimum 128Kib per-magazine */
1483         else
1484                 size = 256;
1485
1486         SRETURN(size);
1487 }
1488
1489 /*
1490  * Allocate a per-cpu magazine to associate with a specific core.
1491  */
1492 static spl_kmem_magazine_t *
1493 spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
1494 {
1495         spl_kmem_magazine_t *skm;
1496         int size = sizeof(spl_kmem_magazine_t) +
1497                    sizeof(void *) * skc->skc_mag_size;
1498         SENTRY;
1499
1500         skm = kmem_alloc_node(size, KM_SLEEP, cpu_to_node(cpu));
1501         if (skm) {
1502                 skm->skm_magic = SKM_MAGIC;
1503                 skm->skm_avail = 0;
1504                 skm->skm_size = skc->skc_mag_size;
1505                 skm->skm_refill = skc->skc_mag_refill;
1506                 skm->skm_cache = skc;
1507                 skm->skm_age = jiffies;
1508                 skm->skm_cpu = cpu;
1509         }
1510
1511         SRETURN(skm);
1512 }
1513
1514 /*
1515  * Free a per-cpu magazine associated with a specific core.
1516  */
1517 static void
1518 spl_magazine_free(spl_kmem_magazine_t *skm)
1519 {
1520         int size = sizeof(spl_kmem_magazine_t) +
1521                    sizeof(void *) * skm->skm_size;
1522
1523         SENTRY;
1524         ASSERT(skm->skm_magic == SKM_MAGIC);
1525         ASSERT(skm->skm_avail == 0);
1526
1527         kmem_free(skm, size);
1528         SEXIT;
1529 }
1530
1531 /*
1532  * Create all pre-cpu magazines of reasonable sizes.
1533  */
1534 static int
1535 spl_magazine_create(spl_kmem_cache_t *skc)
1536 {
1537         int i;
1538         SENTRY;
1539
1540         if (skc->skc_flags & KMC_NOMAGAZINE)
1541                 SRETURN(0);
1542
1543         skc->skc_mag_size = spl_magazine_size(skc);
1544         skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
1545
1546         for_each_online_cpu(i) {
1547                 skc->skc_mag[i] = spl_magazine_alloc(skc, i);
1548                 if (!skc->skc_mag[i]) {
1549                         for (i--; i >= 0; i--)
1550                                 spl_magazine_free(skc->skc_mag[i]);
1551
1552                         SRETURN(-ENOMEM);
1553                 }
1554         }
1555
1556         SRETURN(0);
1557 }
1558
1559 /*
1560  * Destroy all pre-cpu magazines.
1561  */
1562 static void
1563 spl_magazine_destroy(spl_kmem_cache_t *skc)
1564 {
1565         spl_kmem_magazine_t *skm;
1566         int i;
1567         SENTRY;
1568
1569         if (skc->skc_flags & KMC_NOMAGAZINE) {
1570                 SEXIT;
1571                 return;
1572         }
1573
1574         for_each_online_cpu(i) {
1575                 skm = skc->skc_mag[i];
1576                 spl_cache_flush(skc, skm, skm->skm_avail);
1577                 spl_magazine_free(skm);
1578         }
1579
1580         SEXIT;
1581 }
1582
1583 /*
1584  * Create a object cache based on the following arguments:
1585  * name         cache name
1586  * size         cache object size
1587  * align        cache object alignment
1588  * ctor         cache object constructor
1589  * dtor         cache object destructor
1590  * reclaim      cache object reclaim
1591  * priv         cache private data for ctor/dtor/reclaim
1592  * vmp          unused must be NULL
1593  * flags
1594  *      KMC_NOTOUCH     Disable cache object aging (unsupported)
1595  *      KMC_NODEBUG     Disable debugging (unsupported)
1596  *      KMC_NOHASH      Disable hashing (unsupported)
1597  *      KMC_QCACHE      Disable qcache (unsupported)
1598  *      KMC_NOMAGAZINE  Enabled for kmem/vmem, Disabled for Linux slab
1599  *      KMC_KMEM        Force kmem backed cache
1600  *      KMC_VMEM        Force vmem backed cache
1601  *      KMC_SLAB        Force Linux slab backed cache
1602  *      KMC_OFFSLAB     Locate objects off the slab
1603  */
1604 spl_kmem_cache_t *
1605 spl_kmem_cache_create(char *name, size_t size, size_t align,
1606                       spl_kmem_ctor_t ctor,
1607                       spl_kmem_dtor_t dtor,
1608                       spl_kmem_reclaim_t reclaim,
1609                       void *priv, void *vmp, int flags)
1610 {
1611         spl_kmem_cache_t *skc;
1612         int rc;
1613         SENTRY;
1614
1615         ASSERTF(!(flags & KMC_NOMAGAZINE), "Bad KMC_NOMAGAZINE (%x)\n", flags);
1616         ASSERTF(!(flags & KMC_NOHASH), "Bad KMC_NOHASH (%x)\n", flags);
1617         ASSERTF(!(flags & KMC_QCACHE), "Bad KMC_QCACHE (%x)\n", flags);
1618         ASSERT(vmp == NULL);
1619
1620         might_sleep();
1621
1622         /*
1623          * Allocate memory for a new cache an initialize it.  Unfortunately,
1624          * this usually ends up being a large allocation of ~32k because
1625          * we need to allocate enough memory for the worst case number of
1626          * cpus in the magazine, skc_mag[NR_CPUS].  Because of this we
1627          * explicitly pass KM_NODEBUG to suppress the kmem warning
1628          */
1629         skc = kmem_zalloc(sizeof(*skc), KM_SLEEP| KM_NODEBUG);
1630         if (skc == NULL)
1631                 SRETURN(NULL);
1632
1633         skc->skc_magic = SKC_MAGIC;
1634         skc->skc_name_size = strlen(name) + 1;
1635         skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, KM_SLEEP);
1636         if (skc->skc_name == NULL) {
1637                 kmem_free(skc, sizeof(*skc));
1638                 SRETURN(NULL);
1639         }
1640         strncpy(skc->skc_name, name, skc->skc_name_size);
1641
1642         skc->skc_ctor = ctor;
1643         skc->skc_dtor = dtor;
1644         skc->skc_reclaim = reclaim;
1645         skc->skc_private = priv;
1646         skc->skc_vmp = vmp;
1647         skc->skc_linux_cache = NULL;
1648         skc->skc_flags = flags;
1649         skc->skc_obj_size = size;
1650         skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
1651         skc->skc_delay = SPL_KMEM_CACHE_DELAY;
1652         skc->skc_reap = SPL_KMEM_CACHE_REAP;
1653         atomic_set(&skc->skc_ref, 0);
1654
1655         INIT_LIST_HEAD(&skc->skc_list);
1656         INIT_LIST_HEAD(&skc->skc_complete_list);
1657         INIT_LIST_HEAD(&skc->skc_partial_list);
1658         skc->skc_emergency_tree = RB_ROOT;
1659         spin_lock_init(&skc->skc_lock);
1660         init_waitqueue_head(&skc->skc_waitq);
1661         skc->skc_slab_fail = 0;
1662         skc->skc_slab_create = 0;
1663         skc->skc_slab_destroy = 0;
1664         skc->skc_slab_total = 0;
1665         skc->skc_slab_alloc = 0;
1666         skc->skc_slab_max = 0;
1667         skc->skc_obj_total = 0;
1668         skc->skc_obj_alloc = 0;
1669         skc->skc_obj_max = 0;
1670         skc->skc_obj_deadlock = 0;
1671         skc->skc_obj_emergency = 0;
1672         skc->skc_obj_emergency_max = 0;
1673
1674         /*
1675          * Verify the requested alignment restriction is sane.
1676          */
1677         if (align) {
1678                 VERIFY(ISP2(align));
1679                 VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN);
1680                 VERIFY3U(align, <=, PAGE_SIZE);
1681                 skc->skc_obj_align = align;
1682         }
1683
1684         /*
1685          * When no specific type of slab is requested (kmem, vmem, or
1686          * linuxslab) then select a cache type based on the object size
1687          * and default tunables.
1688          */
1689         if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) {
1690
1691                 /*
1692                  * Objects smaller than spl_kmem_cache_slab_limit can
1693                  * use the Linux slab for better space-efficiency.  By
1694                  * default this functionality is disabled until its
1695                  * performance characters are fully understood.
1696                  */
1697                 if (spl_kmem_cache_slab_limit &&
1698                     size <= (size_t)spl_kmem_cache_slab_limit)
1699                         skc->skc_flags |= KMC_SLAB;
1700
1701                 /*
1702                  * Small objects, less than spl_kmem_cache_kmem_limit per
1703                  * object should use kmem because their slabs are small.
1704                  */
1705                 else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit)
1706                         skc->skc_flags |= KMC_KMEM;
1707
1708                 /*
1709                  * All other objects are considered large and are placed
1710                  * on vmem backed slabs.
1711                  */
1712                 else
1713                         skc->skc_flags |= KMC_VMEM;
1714         }
1715
1716         /*
1717          * Given the type of slab allocate the required resources.
1718          */
1719         if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
1720                 rc = spl_slab_size(skc,
1721                     &skc->skc_slab_objs, &skc->skc_slab_size);
1722                 if (rc)
1723                         SGOTO(out, rc);
1724
1725                 rc = spl_magazine_create(skc);
1726                 if (rc)
1727                         SGOTO(out, rc);
1728         } else {
1729                 skc->skc_linux_cache = kmem_cache_create(
1730                     skc->skc_name, size, align, 0, NULL);
1731                 if (skc->skc_linux_cache == NULL)
1732                         SGOTO(out, rc = ENOMEM);
1733
1734                 kmem_cache_set_allocflags(skc, __GFP_COMP);
1735                 skc->skc_flags |= KMC_NOMAGAZINE;
1736         }
1737
1738         if (spl_kmem_cache_expire & KMC_EXPIRE_AGE)
1739                 skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq,
1740                     spl_cache_age, skc, TQ_SLEEP,
1741                     ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
1742
1743         down_write(&spl_kmem_cache_sem);
1744         list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
1745         up_write(&spl_kmem_cache_sem);
1746
1747         SRETURN(skc);
1748 out:
1749         kmem_free(skc->skc_name, skc->skc_name_size);
1750         kmem_free(skc, sizeof(*skc));
1751         SRETURN(NULL);
1752 }
1753 EXPORT_SYMBOL(spl_kmem_cache_create);
1754
1755 /*
1756  * Register a move callback to for cache defragmentation.
1757  * XXX: Unimplemented but harmless to stub out for now.
1758  */
1759 void
1760 spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
1761     kmem_cbrc_t (move)(void *, void *, size_t, void *))
1762 {
1763         ASSERT(move != NULL);
1764 }
1765 EXPORT_SYMBOL(spl_kmem_cache_set_move);
1766
1767 /*
1768  * Destroy a cache and all objects associated with the cache.
1769  */
1770 void
1771 spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
1772 {
1773         DECLARE_WAIT_QUEUE_HEAD(wq);
1774         taskqid_t id;
1775         SENTRY;
1776
1777         ASSERT(skc->skc_magic == SKC_MAGIC);
1778         ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB));
1779
1780         down_write(&spl_kmem_cache_sem);
1781         list_del_init(&skc->skc_list);
1782         up_write(&spl_kmem_cache_sem);
1783
1784         /* Cancel any and wait for any pending delayed tasks */
1785         VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1786
1787         spin_lock(&skc->skc_lock);
1788         id = skc->skc_taskqid;
1789         spin_unlock(&skc->skc_lock);
1790
1791         taskq_cancel_id(spl_kmem_cache_taskq, id);
1792
1793         /* Wait until all current callers complete, this is mainly
1794          * to catch the case where a low memory situation triggers a
1795          * cache reaping action which races with this destroy. */
1796         wait_event(wq, atomic_read(&skc->skc_ref) == 0);
1797
1798         if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
1799                 spl_magazine_destroy(skc);
1800                 spl_slab_reclaim(skc, 0, 1);
1801         } else {
1802                 ASSERT(skc->skc_flags & KMC_SLAB);
1803                 kmem_cache_destroy(skc->skc_linux_cache);
1804         }
1805
1806         spin_lock(&skc->skc_lock);
1807
1808         /* Validate there are no objects in use and free all the
1809          * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
1810         ASSERT3U(skc->skc_slab_alloc, ==, 0);
1811         ASSERT3U(skc->skc_obj_alloc, ==, 0);
1812         ASSERT3U(skc->skc_slab_total, ==, 0);
1813         ASSERT3U(skc->skc_obj_total, ==, 0);
1814         ASSERT3U(skc->skc_obj_emergency, ==, 0);
1815         ASSERT(list_empty(&skc->skc_complete_list));
1816
1817         kmem_free(skc->skc_name, skc->skc_name_size);
1818         spin_unlock(&skc->skc_lock);
1819
1820         kmem_free(skc, sizeof(*skc));
1821
1822         SEXIT;
1823 }
1824 EXPORT_SYMBOL(spl_kmem_cache_destroy);
1825
1826 /*
1827  * Allocate an object from a slab attached to the cache.  This is used to
1828  * repopulate the per-cpu magazine caches in batches when they run low.
1829  */
1830 static void *
1831 spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
1832 {
1833         spl_kmem_obj_t *sko;
1834
1835         ASSERT(skc->skc_magic == SKC_MAGIC);
1836         ASSERT(sks->sks_magic == SKS_MAGIC);
1837         ASSERT(spin_is_locked(&skc->skc_lock));
1838
1839         sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
1840         ASSERT(sko->sko_magic == SKO_MAGIC);
1841         ASSERT(sko->sko_addr != NULL);
1842
1843         /* Remove from sks_free_list */
1844         list_del_init(&sko->sko_list);
1845
1846         sks->sks_age = jiffies;
1847         sks->sks_ref++;
1848         skc->skc_obj_alloc++;
1849
1850         /* Track max obj usage statistics */
1851         if (skc->skc_obj_alloc > skc->skc_obj_max)
1852                 skc->skc_obj_max = skc->skc_obj_alloc;
1853
1854         /* Track max slab usage statistics */
1855         if (sks->sks_ref == 1) {
1856                 skc->skc_slab_alloc++;
1857
1858                 if (skc->skc_slab_alloc > skc->skc_slab_max)
1859                         skc->skc_slab_max = skc->skc_slab_alloc;
1860         }
1861
1862         return sko->sko_addr;
1863 }
1864
1865 /*
1866  * Generic slab allocation function to run by the global work queues.
1867  * It is responsible for allocating a new slab, linking it in to the list
1868  * of partial slabs, and then waking any waiters.
1869  */
1870 static void
1871 spl_cache_grow_work(void *data)
1872 {
1873         spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
1874         spl_kmem_cache_t *skc = ska->ska_cache;
1875         spl_kmem_slab_t *sks;
1876
1877         sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG);
1878         spin_lock(&skc->skc_lock);
1879         if (sks) {
1880                 skc->skc_slab_total++;
1881                 skc->skc_obj_total += sks->sks_objs;
1882                 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1883         }
1884
1885         atomic_dec(&skc->skc_ref);
1886         clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
1887         clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
1888         wake_up_all(&skc->skc_waitq);
1889         spin_unlock(&skc->skc_lock);
1890
1891         kfree(ska);
1892 }
1893
1894 /*
1895  * Returns non-zero when a new slab should be available.
1896  */
1897 static int
1898 spl_cache_grow_wait(spl_kmem_cache_t *skc)
1899 {
1900         return !test_bit(KMC_BIT_GROWING, &skc->skc_flags);
1901 }
1902
1903 /*
1904  * No available objects on any slabs, create a new slab.  Note that this
1905  * functionality is disabled for KMC_SLAB caches which are backed by the
1906  * Linux slab.
1907  */
1908 static int
1909 spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
1910 {
1911         int remaining, rc;
1912         SENTRY;
1913
1914         ASSERT(skc->skc_magic == SKC_MAGIC);
1915         ASSERT((skc->skc_flags & KMC_SLAB) == 0);
1916         might_sleep();
1917         *obj = NULL;
1918
1919         /*
1920          * Before allocating a new slab wait for any reaping to complete and
1921          * then return so the local magazine can be rechecked for new objects.
1922          */
1923         if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
1924                 rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
1925                     TASK_UNINTERRUPTIBLE);
1926                 SRETURN(rc ? rc : -EAGAIN);
1927         }
1928
1929         /*
1930          * This is handled by dispatching a work request to the global work
1931          * queue.  This allows us to asynchronously allocate a new slab while
1932          * retaining the ability to safely fall back to a smaller synchronous
1933          * allocations to ensure forward progress is always maintained.
1934          */
1935         if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
1936                 spl_kmem_alloc_t *ska;
1937
1938                 ska = kmalloc(sizeof(*ska), flags);
1939                 if (ska == NULL) {
1940                         clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
1941                         wake_up_all(&skc->skc_waitq);
1942                         SRETURN(-ENOMEM);
1943                 }
1944
1945                 atomic_inc(&skc->skc_ref);
1946                 ska->ska_cache = skc;
1947                 ska->ska_flags = flags & ~__GFP_FS;
1948                 taskq_init_ent(&ska->ska_tqe);
1949                 taskq_dispatch_ent(spl_kmem_cache_taskq,
1950                     spl_cache_grow_work, ska, 0, &ska->ska_tqe);
1951         }
1952
1953         /*
1954          * The goal here is to only detect the rare case where a virtual slab
1955          * allocation has deadlocked.  We must be careful to minimize the use
1956          * of emergency objects which are more expensive to track.  Therefore,
1957          * we set a very long timeout for the asynchronous allocation and if
1958          * the timeout is reached the cache is flagged as deadlocked.  From
1959          * this point only new emergency objects will be allocated until the
1960          * asynchronous allocation completes and clears the deadlocked flag.
1961          */
1962         if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
1963                 rc = spl_emergency_alloc(skc, flags, obj);
1964         } else {
1965                 remaining = wait_event_timeout(skc->skc_waitq,
1966                                                spl_cache_grow_wait(skc), HZ);
1967
1968                 if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) {
1969                         spin_lock(&skc->skc_lock);
1970                         if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
1971                                 set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
1972                                 skc->skc_obj_deadlock++;
1973                         }
1974                         spin_unlock(&skc->skc_lock);
1975                 }
1976
1977                 rc = -ENOMEM;
1978         }
1979
1980         SRETURN(rc);
1981 }
1982
1983 /*
1984  * Refill a per-cpu magazine with objects from the slabs for this cache.
1985  * Ideally the magazine can be repopulated using existing objects which have
1986  * been released, however if we are unable to locate enough free objects new
1987  * slabs of objects will be created.  On success NULL is returned, otherwise
1988  * the address of a single emergency object is returned for use by the caller.
1989  */
1990 static void *
1991 spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
1992 {
1993         spl_kmem_slab_t *sks;
1994         int count = 0, rc, refill;
1995         void *obj = NULL;
1996         SENTRY;
1997
1998         ASSERT(skc->skc_magic == SKC_MAGIC);
1999         ASSERT(skm->skm_magic == SKM_MAGIC);
2000
2001         refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
2002         spin_lock(&skc->skc_lock);
2003
2004         while (refill > 0) {
2005                 /* No slabs available we may need to grow the cache */
2006                 if (list_empty(&skc->skc_partial_list)) {
2007                         spin_unlock(&skc->skc_lock);
2008
2009                         local_irq_enable();
2010                         rc = spl_cache_grow(skc, flags, &obj);
2011                         local_irq_disable();
2012
2013                         /* Emergency object for immediate use by caller */
2014                         if (rc == 0 && obj != NULL)
2015                                 SRETURN(obj);
2016
2017                         if (rc)
2018                                 SGOTO(out, rc);
2019
2020                         /* Rescheduled to different CPU skm is not local */
2021                         if (skm != skc->skc_mag[smp_processor_id()])
2022                                 SGOTO(out, rc);
2023
2024                         /* Potentially rescheduled to the same CPU but
2025                          * allocations may have occurred from this CPU while
2026                          * we were sleeping so recalculate max refill. */
2027                         refill = MIN(refill, skm->skm_size - skm->skm_avail);
2028
2029                         spin_lock(&skc->skc_lock);
2030                         continue;
2031                 }
2032
2033                 /* Grab the next available slab */
2034                 sks = list_entry((&skc->skc_partial_list)->next,
2035                                  spl_kmem_slab_t, sks_list);
2036                 ASSERT(sks->sks_magic == SKS_MAGIC);
2037                 ASSERT(sks->sks_ref < sks->sks_objs);
2038                 ASSERT(!list_empty(&sks->sks_free_list));
2039
2040                 /* Consume as many objects as needed to refill the requested
2041                  * cache.  We must also be careful not to overfill it. */
2042                 while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) {
2043                         ASSERT(skm->skm_avail < skm->skm_size);
2044                         ASSERT(count < skm->skm_size);
2045                         skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks);
2046                 }
2047
2048                 /* Move slab to skc_complete_list when full */
2049                 if (sks->sks_ref == sks->sks_objs) {
2050                         list_del(&sks->sks_list);
2051                         list_add(&sks->sks_list, &skc->skc_complete_list);
2052                 }
2053         }
2054
2055         spin_unlock(&skc->skc_lock);
2056 out:
2057         SRETURN(NULL);
2058 }
2059
2060 /*
2061  * Release an object back to the slab from which it came.
2062  */
2063 static void
2064 spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
2065 {
2066         spl_kmem_slab_t *sks = NULL;
2067         spl_kmem_obj_t *sko = NULL;
2068         SENTRY;
2069
2070         ASSERT(skc->skc_magic == SKC_MAGIC);
2071         ASSERT(spin_is_locked(&skc->skc_lock));
2072
2073         sko = spl_sko_from_obj(skc, obj);
2074         ASSERT(sko->sko_magic == SKO_MAGIC);
2075         sks = sko->sko_slab;
2076         ASSERT(sks->sks_magic == SKS_MAGIC);
2077         ASSERT(sks->sks_cache == skc);
2078         list_add(&sko->sko_list, &sks->sks_free_list);
2079
2080         sks->sks_age = jiffies;
2081         sks->sks_ref--;
2082         skc->skc_obj_alloc--;
2083
2084         /* Move slab to skc_partial_list when no longer full.  Slabs
2085          * are added to the head to keep the partial list is quasi-full
2086          * sorted order.  Fuller at the head, emptier at the tail. */
2087         if (sks->sks_ref == (sks->sks_objs - 1)) {
2088                 list_del(&sks->sks_list);
2089                 list_add(&sks->sks_list, &skc->skc_partial_list);
2090         }
2091
2092         /* Move empty slabs to the end of the partial list so
2093          * they can be easily found and freed during reclamation. */
2094         if (sks->sks_ref == 0) {
2095                 list_del(&sks->sks_list);
2096                 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
2097                 skc->skc_slab_alloc--;
2098         }
2099
2100         SEXIT;
2101 }
2102
2103 /*
2104  * Allocate an object from the per-cpu magazine, or if the magazine
2105  * is empty directly allocate from a slab and repopulate the magazine.
2106  */
2107 void *
2108 spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
2109 {
2110         spl_kmem_magazine_t *skm;
2111         void *obj = NULL;
2112         SENTRY;
2113
2114         ASSERT(skc->skc_magic == SKC_MAGIC);
2115         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
2116         ASSERT(flags & KM_SLEEP);
2117
2118         atomic_inc(&skc->skc_ref);
2119
2120         /*
2121          * Allocate directly from a Linux slab.  All optimizations are left
2122          * to the underlying cache we only need to guarantee that KM_SLEEP
2123          * callers will never fail.
2124          */
2125         if (skc->skc_flags & KMC_SLAB) {
2126                 struct kmem_cache *slc = skc->skc_linux_cache;
2127
2128                 do {
2129                         obj = kmem_cache_alloc(slc, flags | __GFP_COMP);
2130                         if (obj && skc->skc_ctor)
2131                                 skc->skc_ctor(obj, skc->skc_private, flags);
2132
2133                 } while ((obj == NULL) && !(flags & KM_NOSLEEP));
2134
2135                 atomic_dec(&skc->skc_ref);
2136                 SRETURN(obj);
2137         }
2138
2139         local_irq_disable();
2140
2141 restart:
2142         /* Safe to update per-cpu structure without lock, but
2143          * in the restart case we must be careful to reacquire
2144          * the local magazine since this may have changed
2145          * when we need to grow the cache. */
2146         skm = skc->skc_mag[smp_processor_id()];
2147         ASSERTF(skm->skm_magic == SKM_MAGIC, "%x != %x: %s/%p/%p %x/%x/%x\n",
2148                 skm->skm_magic, SKM_MAGIC, skc->skc_name, skc, skm,
2149                 skm->skm_size, skm->skm_refill, skm->skm_avail);
2150
2151         if (likely(skm->skm_avail)) {
2152                 /* Object available in CPU cache, use it */
2153                 obj = skm->skm_objs[--skm->skm_avail];
2154                 skm->skm_age = jiffies;
2155         } else {
2156                 obj = spl_cache_refill(skc, skm, flags);
2157                 if (obj == NULL)
2158                         SGOTO(restart, obj = NULL);
2159         }
2160
2161         local_irq_enable();
2162         ASSERT(obj);
2163         ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
2164
2165         /* Pre-emptively migrate object to CPU L1 cache */
2166         prefetchw(obj);
2167         atomic_dec(&skc->skc_ref);
2168
2169         SRETURN(obj);
2170 }
2171 EXPORT_SYMBOL(spl_kmem_cache_alloc);
2172
2173 /*
2174  * Free an object back to the local per-cpu magazine, there is no
2175  * guarantee that this is the same magazine the object was originally
2176  * allocated from.  We may need to flush entire from the magazine
2177  * back to the slabs to make space.
2178  */
2179 void
2180 spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
2181 {
2182         spl_kmem_magazine_t *skm;
2183         unsigned long flags;
2184         SENTRY;
2185
2186         ASSERT(skc->skc_magic == SKC_MAGIC);
2187         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
2188         atomic_inc(&skc->skc_ref);
2189
2190         /*
2191          * Free the object from the Linux underlying Linux slab.
2192          */
2193         if (skc->skc_flags & KMC_SLAB) {
2194                 if (skc->skc_dtor)
2195                         skc->skc_dtor(obj, skc->skc_private);
2196
2197                 kmem_cache_free(skc->skc_linux_cache, obj);
2198                 goto out;
2199         }
2200
2201         /*
2202          * Only virtual slabs may have emergency objects and these objects
2203          * are guaranteed to have physical addresses.  They must be removed
2204          * from the tree of emergency objects and the freed.
2205          */
2206         if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj))
2207                 SGOTO(out, spl_emergency_free(skc, obj));
2208
2209         local_irq_save(flags);
2210
2211         /* Safe to update per-cpu structure without lock, but
2212          * no remote memory allocation tracking is being performed
2213          * it is entirely possible to allocate an object from one
2214          * CPU cache and return it to another. */
2215         skm = skc->skc_mag[smp_processor_id()];
2216         ASSERT(skm->skm_magic == SKM_MAGIC);
2217
2218         /* Per-CPU cache full, flush it to make space */
2219         if (unlikely(skm->skm_avail >= skm->skm_size))
2220                 spl_cache_flush(skc, skm, skm->skm_refill);
2221
2222         /* Available space in cache, use it */
2223         skm->skm_objs[skm->skm_avail++] = obj;
2224
2225         local_irq_restore(flags);
2226 out:
2227         atomic_dec(&skc->skc_ref);
2228
2229         SEXIT;
2230 }
2231 EXPORT_SYMBOL(spl_kmem_cache_free);
2232
2233 /*
2234  * The generic shrinker function for all caches.  Under Linux a shrinker
2235  * may not be tightly coupled with a slab cache.  In fact Linux always
2236  * systematically tries calling all registered shrinker callbacks which
2237  * report that they contain unused objects.  Because of this we only
2238  * register one shrinker function in the shim layer for all slab caches.
2239  * We always attempt to shrink all caches when this generic shrinker
2240  * is called.  The shrinker should return the number of free objects
2241  * in the cache when called with nr_to_scan == 0 but not attempt to
2242  * free any objects.  When nr_to_scan > 0 it is a request that nr_to_scan
2243  * objects should be freed, which differs from Solaris semantics.
2244  * Solaris semantics are to free all available objects which may (and
2245  * probably will) be more objects than the requested nr_to_scan.
2246  */
2247 static int
2248 __spl_kmem_cache_generic_shrinker(struct shrinker *shrink,
2249     struct shrink_control *sc)
2250 {
2251         spl_kmem_cache_t *skc;
2252         int alloc = 0;
2253
2254         down_read(&spl_kmem_cache_sem);
2255         list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
2256                 if (sc->nr_to_scan)
2257                         spl_kmem_cache_reap_now(skc,
2258                            MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1));
2259
2260                 /*
2261                  * Presume everything alloc'ed is reclaimable, this ensures
2262                  * we are called again with nr_to_scan > 0 so can try and
2263                  * reclaim.  The exact number is not important either so
2264                  * we forgo taking this already highly contented lock.
2265                  */
2266                 alloc += skc->skc_obj_alloc;
2267         }
2268         up_read(&spl_kmem_cache_sem);
2269
2270         /*
2271          * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass.
2272          * This functionality only exists to work around a rare issue where
2273          * shrink_slabs() is repeatedly invoked by many cores causing the
2274          * system to thrash.
2275          */
2276         if ((spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) && sc->nr_to_scan)
2277                 return (-1);
2278
2279         return MAX((alloc * sysctl_vfs_cache_pressure) / 100, 0);
2280 }
2281
2282 SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker);
2283
2284 /*
2285  * Call the registered reclaim function for a cache.  Depending on how
2286  * many and which objects are released it may simply repopulate the
2287  * local magazine which will then need to age-out.  Objects which cannot
2288  * fit in the magazine we will be released back to their slabs which will
2289  * also need to age out before being release.  This is all just best
2290  * effort and we do not want to thrash creating and destroying slabs.
2291  */
2292 void
2293 spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
2294 {
2295         SENTRY;
2296
2297         ASSERT(skc->skc_magic == SKC_MAGIC);
2298         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
2299
2300         atomic_inc(&skc->skc_ref);
2301
2302         /*
2303          * Execute the registered reclaim callback if it exists.  The
2304          * per-cpu caches will be drained when is set KMC_EXPIRE_MEM.
2305          */
2306         if (skc->skc_flags & KMC_SLAB) {
2307                 if (skc->skc_reclaim)
2308                         skc->skc_reclaim(skc->skc_private);
2309
2310                 if (spl_kmem_cache_expire & KMC_EXPIRE_MEM)
2311                         kmem_cache_shrink(skc->skc_linux_cache);
2312
2313                 SGOTO(out, 0);
2314         }
2315
2316         /*
2317          * Prevent concurrent cache reaping when contended.
2318          */
2319         if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags))
2320                 SGOTO(out, 0);
2321
2322         /*
2323          * When a reclaim function is available it may be invoked repeatedly
2324          * until at least a single slab can be freed.  This ensures that we
2325          * do free memory back to the system.  This helps minimize the chance
2326          * of an OOM event when the bulk of memory is used by the slab.
2327          *
2328          * When free slabs are already available the reclaim callback will be
2329          * skipped.  Additionally, if no forward progress is detected despite
2330          * a reclaim function the cache will be skipped to avoid deadlock.
2331          *
2332          * Longer term this would be the correct place to add the code which
2333          * repacks the slabs in order minimize fragmentation.
2334          */
2335         if (skc->skc_reclaim) {
2336                 uint64_t objects = UINT64_MAX;
2337                 int do_reclaim;
2338
2339                 do {
2340                         spin_lock(&skc->skc_lock);
2341                         do_reclaim =
2342                             (skc->skc_slab_total > 0) &&
2343                             ((skc->skc_slab_total - skc->skc_slab_alloc) == 0) &&
2344                             (skc->skc_obj_alloc < objects);
2345
2346                         objects = skc->skc_obj_alloc;
2347                         spin_unlock(&skc->skc_lock);
2348
2349                         if (do_reclaim)
2350                                 skc->skc_reclaim(skc->skc_private);
2351
2352                 } while (do_reclaim);
2353         }
2354
2355         /* Reclaim from the magazine then the slabs ignoring age and delay. */
2356         if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) {
2357                 spl_kmem_magazine_t *skm;
2358                 unsigned long irq_flags;
2359
2360                 local_irq_save(irq_flags);
2361                 skm = skc->skc_mag[smp_processor_id()];
2362                 spl_cache_flush(skc, skm, skm->skm_avail);
2363                 local_irq_restore(irq_flags);
2364         }
2365
2366         spl_slab_reclaim(skc, count, 1);
2367         clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
2368         smp_mb__after_clear_bit();
2369         wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
2370 out:
2371         atomic_dec(&skc->skc_ref);
2372
2373         SEXIT;
2374 }
2375 EXPORT_SYMBOL(spl_kmem_cache_reap_now);
2376
2377 /*
2378  * Reap all free slabs from all registered caches.
2379  */
2380 void
2381 spl_kmem_reap(void)
2382 {
2383         struct shrink_control sc;
2384
2385         sc.nr_to_scan = KMC_REAP_CHUNK;
2386         sc.gfp_mask = GFP_KERNEL;
2387
2388         __spl_kmem_cache_generic_shrinker(NULL, &sc);
2389 }
2390 EXPORT_SYMBOL(spl_kmem_reap);
2391
2392 #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
2393 static char *
2394 spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
2395 {
2396         int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
2397         int i, flag = 1;
2398
2399         ASSERT(str != NULL && len >= 17);
2400         memset(str, 0, len);
2401
2402         /* Check for a fully printable string, and while we are at
2403          * it place the printable characters in the passed buffer. */
2404         for (i = 0; i < size; i++) {
2405                 str[i] = ((char *)(kd->kd_addr))[i];
2406                 if (isprint(str[i])) {
2407                         continue;
2408                 } else {
2409                         /* Minimum number of printable characters found
2410                          * to make it worthwhile to print this as ascii. */
2411                         if (i > min)
2412                                 break;
2413
2414                         flag = 0;
2415                         break;
2416                 }
2417         }
2418
2419         if (!flag) {
2420                 sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
2421                         *((uint8_t *)kd->kd_addr),
2422                         *((uint8_t *)kd->kd_addr + 2),
2423                         *((uint8_t *)kd->kd_addr + 4),
2424                         *((uint8_t *)kd->kd_addr + 6),
2425                         *((uint8_t *)kd->kd_addr + 8),
2426                         *((uint8_t *)kd->kd_addr + 10),
2427                         *((uint8_t *)kd->kd_addr + 12),
2428                         *((uint8_t *)kd->kd_addr + 14));
2429         }
2430
2431         return str;
2432 }
2433
2434 static int
2435 spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
2436 {
2437         int i;
2438         SENTRY;
2439
2440         spin_lock_init(lock);
2441         INIT_LIST_HEAD(list);
2442
2443         for (i = 0; i < size; i++)
2444                 INIT_HLIST_HEAD(&kmem_table[i]);
2445
2446         SRETURN(0);
2447 }
2448
2449 static void
2450 spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
2451 {
2452         unsigned long flags;
2453         kmem_debug_t *kd;
2454         char str[17];
2455         SENTRY;
2456
2457         spin_lock_irqsave(lock, flags);
2458         if (!list_empty(list))
2459                 printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
2460                        "size", "data", "func", "line");
2461
2462         list_for_each_entry(kd, list, kd_list)
2463                 printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
2464                        (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
2465                        kd->kd_func, kd->kd_line);
2466
2467         spin_unlock_irqrestore(lock, flags);
2468         SEXIT;
2469 }
2470 #else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
/* Tracking is compiled out in this branch: both hooks expand to nothing,
 * so their arguments are never evaluated and a call site reduces to an
 * empty statement. */
#define spl_kmem_init_tracking(list, lock, size)
#define spl_kmem_fini_tracking(list, lock)
2473 #endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
2474
2475 static void
2476 spl_kmem_init_globals(void)
2477 {
2478         struct zone *zone;
2479
2480         /* For now all zones are includes, it may be wise to restrict
2481          * this to normal and highmem zones if we see problems. */
2482         for_each_zone(zone) {
2483
2484                 if (!populated_zone(zone))
2485                         continue;
2486
2487                 minfree += min_wmark_pages(zone);
2488                 desfree += low_wmark_pages(zone);
2489                 lotsfree += high_wmark_pages(zone);
2490         }
2491
2492         /* Solaris default values */
2493         swapfs_minfree = MAX(2*1024*1024 >> PAGE_SHIFT, physmem >> 3);
2494         swapfs_reserve = MIN(4*1024*1024 >> PAGE_SHIFT, physmem >> 4);
2495 }
2496
2497 /*
2498  * Called at module init when it is safe to use spl_kallsyms_lookup_name()
2499  */
2500 int
2501 spl_kmem_init_kallsyms_lookup(void)
2502 {
2503 #ifndef HAVE_GET_VMALLOC_INFO
2504         get_vmalloc_info_fn = (get_vmalloc_info_t)
2505                 spl_kallsyms_lookup_name("get_vmalloc_info");
2506         if (!get_vmalloc_info_fn) {
2507                 printk(KERN_ERR "Error: Unknown symbol get_vmalloc_info\n");
2508                 return -EFAULT;
2509         }
2510 #endif /* HAVE_GET_VMALLOC_INFO */
2511
2512 #ifdef HAVE_PGDAT_HELPERS
2513 # ifndef HAVE_FIRST_ONLINE_PGDAT
2514         first_online_pgdat_fn = (first_online_pgdat_t)
2515                 spl_kallsyms_lookup_name("first_online_pgdat");
2516         if (!first_online_pgdat_fn) {
2517                 printk(KERN_ERR "Error: Unknown symbol first_online_pgdat\n");
2518                 return -EFAULT;
2519         }
2520 # endif /* HAVE_FIRST_ONLINE_PGDAT */
2521
2522 # ifndef HAVE_NEXT_ONLINE_PGDAT
2523         next_online_pgdat_fn = (next_online_pgdat_t)
2524                 spl_kallsyms_lookup_name("next_online_pgdat");
2525         if (!next_online_pgdat_fn) {
2526                 printk(KERN_ERR "Error: Unknown symbol next_online_pgdat\n");
2527                 return -EFAULT;
2528         }
2529 # endif /* HAVE_NEXT_ONLINE_PGDAT */
2530
2531 # ifndef HAVE_NEXT_ZONE
2532         next_zone_fn = (next_zone_t)
2533                 spl_kallsyms_lookup_name("next_zone");
2534         if (!next_zone_fn) {
2535                 printk(KERN_ERR "Error: Unknown symbol next_zone\n");
2536                 return -EFAULT;
2537         }
2538 # endif /* HAVE_NEXT_ZONE */
2539
2540 #else /* HAVE_PGDAT_HELPERS */
2541
2542 # ifndef HAVE_PGDAT_LIST
2543         pgdat_list_addr = *(struct pglist_data **)
2544                 spl_kallsyms_lookup_name("pgdat_list");
2545         if (!pgdat_list_addr) {
2546                 printk(KERN_ERR "Error: Unknown symbol pgdat_list\n");
2547                 return -EFAULT;
2548         }
2549 # endif /* HAVE_PGDAT_LIST */
2550 #endif /* HAVE_PGDAT_HELPERS */
2551
2552 #if defined(NEED_GET_ZONE_COUNTS) && !defined(HAVE_GET_ZONE_COUNTS)
2553         get_zone_counts_fn = (get_zone_counts_t)
2554                 spl_kallsyms_lookup_name("get_zone_counts");
2555         if (!get_zone_counts_fn) {
2556                 printk(KERN_ERR "Error: Unknown symbol get_zone_counts\n");
2557                 return -EFAULT;
2558         }
2559 #endif  /* NEED_GET_ZONE_COUNTS && !HAVE_GET_ZONE_COUNTS */
2560
2561         /*
2562          * It is now safe to initialize the global tunings which rely on
2563          * the use of the for_each_zone() macro.  This macro in turns
2564          * depends on the *_pgdat symbols which are now available.
2565          */
2566         spl_kmem_init_globals();
2567
2568 #ifndef HAVE_SHRINK_DCACHE_MEMORY
2569         /* When shrink_dcache_memory_fn == NULL support is disabled */
2570         shrink_dcache_memory_fn = (shrink_dcache_memory_t)
2571                 spl_kallsyms_lookup_name("shrink_dcache_memory");
2572 #endif /* HAVE_SHRINK_DCACHE_MEMORY */
2573
2574 #ifndef HAVE_SHRINK_ICACHE_MEMORY
2575         /* When shrink_icache_memory_fn == NULL support is disabled */
2576         shrink_icache_memory_fn = (shrink_icache_memory_t)
2577                 spl_kallsyms_lookup_name("shrink_icache_memory");
2578 #endif /* HAVE_SHRINK_ICACHE_MEMORY */
2579
2580         return 0;
2581 }
2582
2583 int
2584 spl_kmem_init(void)
2585 {
2586         int rc = 0;
2587         SENTRY;
2588
2589 #ifdef DEBUG_KMEM
2590         kmem_alloc_used_set(0);
2591         vmem_alloc_used_set(0);
2592
2593         spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
2594         spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE);
2595 #endif
2596
2597         init_rwsem(&spl_kmem_cache_sem);
2598         INIT_LIST_HEAD(&spl_kmem_cache_list);
2599         spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
2600             1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE);
2601
2602         spl_register_shrinker(&spl_kmem_cache_shrinker);
2603
2604         SRETURN(rc);
2605 }
2606
2607 void
2608 spl_kmem_fini(void)
2609 {
2610         SENTRY;
2611
2612         spl_unregister_shrinker(&spl_kmem_cache_shrinker);
2613         taskq_destroy(spl_kmem_cache_taskq);
2614
2615 #ifdef DEBUG_KMEM
2616         /* Display all unreclaimed memory addresses, including the
2617          * allocation size and the first few bytes of what's located
2618          * at that address to aid in debugging.  Performance is not
2619          * a serious concern here since it is module unload time. */
2620         if (kmem_alloc_used_read() != 0)
2621                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
2622                     "kmem leaked %ld/%ld bytes\n",
2623                     kmem_alloc_used_read(), kmem_alloc_max);
2624
2625
2626         if (vmem_alloc_used_read() != 0)
2627                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
2628                     "vmem leaked %ld/%ld bytes\n",
2629                     vmem_alloc_used_read(), vmem_alloc_max);
2630
2631         spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
2632         spl_kmem_fini_tracking(&vmem_list, &vmem_lock);
2633 #endif /* DEBUG_KMEM */
2634
2635         SEXIT;
2636 }