module/spl/spl-kmem.c

   1 /*****************************************************************************\
   2  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
   3  *  Copyright (C) 2007 The Regents of the University of California.
   4  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
   5  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
   6  *  UCRL-CODE-235197
   7  *
   8  *  This file is part of the SPL, Solaris Porting Layer.
   9  *  For details, see <http://zfsonlinux.org/>.
  10  *
  11  *  The SPL is free software; you can redistribute it and/or modify it
  12  *  under the terms of the GNU General Public License as published by the
  13  *  Free Software Foundation; either version 2 of the License, or (at your
  14  *  option) any later version.
  15  *
  16  *  The SPL is distributed in the hope that it will be useful, but WITHOUT
  17  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  18  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  19  *  for more details.
  20  *
  21  *  You should have received a copy of the GNU General Public License along
  22  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  23  *****************************************************************************
  24  *  Solaris Porting Layer (SPL) Kmem Implementation.
  25 \*****************************************************************************/
  26
  27 #include <sys/kmem.h>
  28 #include <spl-debug.h>
  29
  30 #ifdef SS_DEBUG_SUBSYS
  31 #undef SS_DEBUG_SUBSYS
  32 #endif
  33
  34 #define SS_DEBUG_SUBSYS SS_KMEM
  35
  36 /*
  37  * Within the scope of spl-kmem.c file the kmem_cache_* definitions
  38  * are removed to allow access to the real Linux slab allocator.
  39  */
  40 #undef kmem_cache_destroy
  41 #undef kmem_cache_create
  42 #undef kmem_cache_alloc
  43 #undef kmem_cache_free
  44
  45
  46 /*
  47  * Cache expiration was implemented because it was part of the default Solaris
  48  * kmem_cache behavior.  The idea is that per-cpu objects which haven't been
  49  * accessed in several seconds should be returned to the cache.  On the other
  50  * hand Linux slabs never move objects back to the slabs unless there is
  51  * memory pressure on the system.  By default the Linux method is enabled
  52  * because it has been shown to improve responsiveness on low memory systems.
  53  * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM.
  54  */
  55 unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM;
  56 EXPORT_SYMBOL(spl_kmem_cache_expire);
  57 module_param(spl_kmem_cache_expire, uint, 0644);
  58 MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)");
  59
  60 unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
  61 module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
  62 MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");
  63
  64 unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN;
  65 module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644);
  66 MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min,
  67     "Minimal number of objects per slab");
  68
  69 unsigned int spl_kmem_cache_max_size = 32;
  70 module_param(spl_kmem_cache_max_size, uint, 0644);
  71 MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
  72
  73 unsigned int spl_kmem_cache_slab_limit = 0;
  74 module_param(spl_kmem_cache_slab_limit, uint, 0644);
  75 MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
  76     "Objects less than N bytes use the Linux slab");
  77
  78 unsigned int spl_kmem_cache_kmem_limit = (PAGE_SIZE / 4);
  79 module_param(spl_kmem_cache_kmem_limit, uint, 0644);
  80 MODULE_PARM_DESC(spl_kmem_cache_kmem_limit,
  81     "Objects less than N bytes use the kmalloc");
  82
  83 /*
  84  * The minimum amount of memory measured in pages to be free at all
  85  * times on the system.  This is similar to Linux's zone->pages_min
  86  * multiplied by the number of zones and is sized based on that.
  87  */
  88 pgcnt_t minfree = 0;
  89 EXPORT_SYMBOL(minfree);
  90
  91 /*
  92  * The desired amount of memory measured in pages to be free at all
  93  * times on the system.  This is similar to Linux's zone->pages_low
  94  * multiplied by the number of zones and is sized based on that.
  95  * Assuming all zones are being used roughly equally, when we drop
  96  * below this threshold asynchronous page reclamation is triggered.
  97  */
  98 pgcnt_t desfree = 0;
  99 EXPORT_SYMBOL(desfree);
 100
 101 /*
 102  * When above this amount of memory measures in pages the system is
 103  * determined to have enough free memory.  This is similar to Linux's
 104  * zone->pages_high multiplied by the number of zones and is sized based
 105  * on that.  Assuming all zones are being used roughly equally, when
 106  * asynchronous page reclamation reaches this threshold it stops.
 107  */
 108 pgcnt_t lotsfree = 0;
 109 EXPORT_SYMBOL(lotsfree);
 110
 111 /* Unused always 0 in this implementation */
 112 pgcnt_t needfree = 0;
 113 EXPORT_SYMBOL(needfree);
 114
 115 pgcnt_t swapfs_minfree = 0;
 116 EXPORT_SYMBOL(swapfs_minfree);
 117
 118 pgcnt_t swapfs_reserve = 0;
 119 EXPORT_SYMBOL(swapfs_reserve);
 120
 121 vmem_t *heap_arena = NULL;
 122 EXPORT_SYMBOL(heap_arena);
 123
 124 vmem_t *zio_alloc_arena = NULL;
 125 EXPORT_SYMBOL(zio_alloc_arena);
 126
 127 vmem_t *zio_arena = NULL;
 128 EXPORT_SYMBOL(zio_arena);
 129
 130 #ifndef HAVE_GET_VMALLOC_INFO
 131 get_vmalloc_info_t get_vmalloc_info_fn = SYMBOL_POISON;
 132 EXPORT_SYMBOL(get_vmalloc_info_fn);
 133 #endif /* HAVE_GET_VMALLOC_INFO */
 134
 135 #ifdef HAVE_PGDAT_HELPERS
 136 # ifndef HAVE_FIRST_ONLINE_PGDAT
 137 first_online_pgdat_t first_online_pgdat_fn = SYMBOL_POISON;
 138 EXPORT_SYMBOL(first_online_pgdat_fn);
 139 # endif /* HAVE_FIRST_ONLINE_PGDAT */
 140
 141 # ifndef HAVE_NEXT_ONLINE_PGDAT
 142 next_online_pgdat_t next_online_pgdat_fn = SYMBOL_POISON;
 143 EXPORT_SYMBOL(next_online_pgdat_fn);
 144 # endif /* HAVE_NEXT_ONLINE_PGDAT */
 145
 146 # ifndef HAVE_NEXT_ZONE
 147 next_zone_t next_zone_fn = SYMBOL_POISON;
 148 EXPORT_SYMBOL(next_zone_fn);
 149 # endif /* HAVE_NEXT_ZONE */
 150
 151 #else /* HAVE_PGDAT_HELPERS */
 152
 153 # ifndef HAVE_PGDAT_LIST
 154 struct pglist_data *pgdat_list_addr = SYMBOL_POISON;
 155 EXPORT_SYMBOL(pgdat_list_addr);
 156 # endif /* HAVE_PGDAT_LIST */
 157
 158 #endif /* HAVE_PGDAT_HELPERS */
 159
 160 #ifdef NEED_GET_ZONE_COUNTS
 161 # ifndef HAVE_GET_ZONE_COUNTS
 162 get_zone_counts_t get_zone_counts_fn = SYMBOL_POISON;
 163 EXPORT_SYMBOL(get_zone_counts_fn);
 164 # endif /* HAVE_GET_ZONE_COUNTS */
 165
 166 unsigned long
 167 spl_global_page_state(spl_zone_stat_item_t item)
 168 {
 169         unsigned long active;
 170         unsigned long inactive;
 171         unsigned long free;
 172
 173         get_zone_counts(&active, &inactive, &free);
 174         switch (item) {
 175         case SPL_NR_FREE_PAGES: return free;
 176         case SPL_NR_INACTIVE:   return inactive;
 177         case SPL_NR_ACTIVE:     return active;
 178         default:                ASSERT(0); /* Unsupported */
 179         }
 180
 181         return 0;
 182 }
 183 #else
 184 # ifdef HAVE_GLOBAL_PAGE_STATE
 185 unsigned long
 186 spl_global_page_state(spl_zone_stat_item_t item)
 187 {
 188         unsigned long pages = 0;
 189
 190         switch (item) {
 191         case SPL_NR_FREE_PAGES:
 192 #  ifdef HAVE_ZONE_STAT_ITEM_NR_FREE_PAGES
 193                 pages += global_page_state(NR_FREE_PAGES);
 194 #  endif
 195                 break;
 196         case SPL_NR_INACTIVE:
 197 #  ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE
 198                 pages += global_page_state(NR_INACTIVE);
 199 #  endif
 200 #  ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE_ANON
 201                 pages += global_page_state(NR_INACTIVE_ANON);
 202 #  endif
 203 #  ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE_FILE
 204                 pages += global_page_state(NR_INACTIVE_FILE);
 205 #  endif
 206                 break;
 207         case SPL_NR_ACTIVE:
 208 #  ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE
 209                 pages += global_page_state(NR_ACTIVE);
 210 #  endif
 211 #  ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE_ANON
 212                 pages += global_page_state(NR_ACTIVE_ANON);
 213 #  endif
 214 #  ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE_FILE
 215                 pages += global_page_state(NR_ACTIVE_FILE);
 216 #  endif
 217                 break;
 218         default:
 219                 ASSERT(0); /* Unsupported */
 220         }
 221
 222         return pages;
 223 }
 224 # else
 225 #  error "Both global_page_state() and get_zone_counts() unavailable"
 226 # endif /* HAVE_GLOBAL_PAGE_STATE */
 227 #endif /* NEED_GET_ZONE_COUNTS */
 228 EXPORT_SYMBOL(spl_global_page_state);
 229
 230 #ifndef HAVE_SHRINK_DCACHE_MEMORY
 231 shrink_dcache_memory_t shrink_dcache_memory_fn = SYMBOL_POISON;
 232 EXPORT_SYMBOL(shrink_dcache_memory_fn);
 233 #endif /* HAVE_SHRINK_DCACHE_MEMORY */
 234
 235 #ifndef HAVE_SHRINK_ICACHE_MEMORY
 236 shrink_icache_memory_t shrink_icache_memory_fn = SYMBOL_POISON;
 237 EXPORT_SYMBOL(shrink_icache_memory_fn);
 238 #endif /* HAVE_SHRINK_ICACHE_MEMORY */
 239
 240 pgcnt_t
 241 spl_kmem_availrmem(void)
 242 {
 243         /* The amount of easily available memory */
 244         return (spl_global_page_state(SPL_NR_FREE_PAGES) +
 245                 spl_global_page_state(SPL_NR_INACTIVE));
 246 }
 247 EXPORT_SYMBOL(spl_kmem_availrmem);
 248
 249 size_t
 250 vmem_size(vmem_t *vmp, int typemask)
 251 {
 252         struct vmalloc_info vmi;
 253         size_t size = 0;
 254
 255         ASSERT(vmp == NULL);
 256         ASSERT(typemask & (VMEM_ALLOC | VMEM_FREE));
 257
 258         get_vmalloc_info(&vmi);
 259         if (typemask & VMEM_ALLOC)
 260                 size += (size_t)vmi.used;
 261
 262         if (typemask & VMEM_FREE)
 263                 size += (size_t)(VMALLOC_TOTAL - vmi.used);
 264
 265         return size;
 266 }
 267 EXPORT_SYMBOL(vmem_size);
 268
 269 int
 270 kmem_debugging(void)
 271 {
 272         return 0;
 273 }
 274 EXPORT_SYMBOL(kmem_debugging);
 275
 276 #ifndef HAVE_KVASPRINTF
 277 /* Simplified asprintf. */
 278 char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
 279 {
 280         unsigned int len;
 281         char *p;
 282         va_list aq;
 283
 284         va_copy(aq, ap);
 285         len = vsnprintf(NULL, 0, fmt, aq);
 286         va_end(aq);
 287
 288         p = kmalloc(len+1, gfp);
 289         if (!p)
 290                 return NULL;
 291
 292         vsnprintf(p, len+1, fmt, ap);
 293
 294         return p;
 295 }
 296 EXPORT_SYMBOL(kvasprintf);
 297 #endif /* HAVE_KVASPRINTF */
 298
 299 char *
 300 kmem_vasprintf(const char *fmt, va_list ap)
 301 {
 302         va_list aq;
 303         char *ptr;
 304
 305         do {
 306                 va_copy(aq, ap);
 307                 ptr = kvasprintf(GFP_KERNEL, fmt, aq);
 308                 va_end(aq);
 309         } while (ptr == NULL);
 310
 311         return ptr;
 312 }
 313 EXPORT_SYMBOL(kmem_vasprintf);
 314
 315 char *
 316 kmem_asprintf(const char *fmt, ...)
 317 {
 318         va_list ap;
 319         char *ptr;
 320
 321         do {
 322                 va_start(ap, fmt);
 323                 ptr = kvasprintf(GFP_KERNEL, fmt, ap);
 324                 va_end(ap);
 325         } while (ptr == NULL);
 326
 327         return ptr;
 328 }
 329 EXPORT_SYMBOL(kmem_asprintf);
 330
 331 static char *
 332 __strdup(const char *str, int flags)
 333 {
 334         char *ptr;
 335         int n;
 336
 337         n = strlen(str);
 338         ptr = kmalloc_nofail(n + 1, flags);
 339         if (ptr)
 340                 memcpy(ptr, str, n + 1);
 341
 342         return ptr;
 343 }
 344
 345 char *
 346 strdup(const char *str)
 347 {
 348         return __strdup(str, KM_SLEEP);
 349 }
 350 EXPORT_SYMBOL(strdup);
 351
 352 void
 353 strfree(char *str)
 354 {
 355         kfree(str);
 356 }
 357 EXPORT_SYMBOL(strfree);
 358
 359 /*
 360  * Memory allocation interfaces and debugging for basic kmem_*
 361  * and vmem_* style memory allocation.  When DEBUG_KMEM is enabled
 362  * the SPL will keep track of the total memory allocated, and
 363  * report any memory leaked when the module is unloaded.
 364  */
 365 #ifdef DEBUG_KMEM
 366
 367 /* Shim layer memory accounting */
 368 # ifdef HAVE_ATOMIC64_T
 369 atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
 370 unsigned long long kmem_alloc_max = 0;
 371 atomic64_t vmem_alloc_used = ATOMIC64_INIT(0);
 372 unsigned long long vmem_alloc_max = 0;
 373 # else  /* HAVE_ATOMIC64_T */
 374 atomic_t kmem_alloc_used = ATOMIC_INIT(0);
 375 unsigned long long kmem_alloc_max = 0;
 376 atomic_t vmem_alloc_used = ATOMIC_INIT(0);
 377 unsigned long long vmem_alloc_max = 0;
 378 # endif /* HAVE_ATOMIC64_T */
 379
 380 EXPORT_SYMBOL(kmem_alloc_used);
 381 EXPORT_SYMBOL(kmem_alloc_max);
 382 EXPORT_SYMBOL(vmem_alloc_used);
 383 EXPORT_SYMBOL(vmem_alloc_max);
 384
 385 /* When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
 386  * but also the location of every alloc and free.  When the SPL module is
 387  * unloaded a list of all leaked addresses and where they were allocated
 388  * will be dumped to the console.  Enabling this feature has a significant
 389  * impact on performance but it makes finding memory leaks straight forward.
 390  *
 391  * Not surprisingly with debugging enabled the xmem_locks are very highly
 392  * contended particularly on xfree().  If we want to run with this detailed
 393  * debugging enabled for anything other than debugging  we need to minimize
 394  * the contention by moving to a lock per xmem_table entry model.
 395  */
 396 # ifdef DEBUG_KMEM_TRACKING
 397
 398 #  define KMEM_HASH_BITS          10
 399 #  define KMEM_TABLE_SIZE         (1 << KMEM_HASH_BITS)
 400
 401 #  define VMEM_HASH_BITS          10
 402 #  define VMEM_TABLE_SIZE         (1 << VMEM_HASH_BITS)
 403
 404 typedef struct kmem_debug {
 405         struct hlist_node kd_hlist;     /* Hash node linkage */
 406         struct list_head kd_list;       /* List of all allocations */
 407         void *kd_addr;                  /* Allocation pointer */
 408         size_t kd_size;                 /* Allocation size */
 409         const char *kd_func;            /* Allocation function */
 410         int kd_line;                    /* Allocation line */
 411 } kmem_debug_t;
 412
 413 spinlock_t kmem_lock;
 414 struct hlist_head kmem_table[KMEM_TABLE_SIZE];
 415 struct list_head kmem_list;
 416
 417 spinlock_t vmem_lock;
 418 struct hlist_head vmem_table[VMEM_TABLE_SIZE];
 419 struct list_head vmem_list;
 420
 421 EXPORT_SYMBOL(kmem_lock);
 422 EXPORT_SYMBOL(kmem_table);
 423 EXPORT_SYMBOL(kmem_list);
 424
 425 EXPORT_SYMBOL(vmem_lock);
 426 EXPORT_SYMBOL(vmem_table);
 427 EXPORT_SYMBOL(vmem_list);
 428
 429 static kmem_debug_t *
 430 kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, const void *addr)
 431 {
 432         struct hlist_head *head;
 433         struct hlist_node *node;
 434         struct kmem_debug *p;
 435         unsigned long flags;
 436         SENTRY;
 437
 438         spin_lock_irqsave(lock, flags);
 439
 440         head = &table[hash_ptr((void *)addr, bits)];
 441         hlist_for_each(node, head) {
 442                 p = list_entry(node, struct kmem_debug, kd_hlist);
 443                 if (p->kd_addr == addr) {
 444                         hlist_del_init(&p->kd_hlist);
 445                         list_del_init(&p->kd_list);
 446                         spin_unlock_irqrestore(lock, flags);
 447                         return p;
 448                 }
 449         }
 450
 451         spin_unlock_irqrestore(lock, flags);
 452
 453         SRETURN(NULL);
 454 }
 455
 456 void *
 457 kmem_alloc_track(size_t size, int flags, const char *func, int line,
 458     int node_alloc, int node)
 459 {
 460         void *ptr = NULL;
 461         kmem_debug_t *dptr;
 462         unsigned long irq_flags;
 463         SENTRY;
 464
 465         /* Function may be called with KM_NOSLEEP so failure is possible */
 466         dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
 467             flags & ~__GFP_ZERO);
 468
 469         if (unlikely(dptr == NULL)) {
 470                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "debug "
 471                     "kmem_alloc(%ld, 0x%x) at %s:%d failed (%lld/%llu)\n",
 472                     sizeof(kmem_debug_t), flags, func, line,
 473                     kmem_alloc_used_read(), kmem_alloc_max);
 474         } else {
 475                 /*
 476                  * Marked unlikely because we should never be doing this,
 477                  * we tolerate to up 2 pages but a single page is best.
 478                  */
 479                 if (unlikely((size > PAGE_SIZE*2) && !(flags & KM_NODEBUG))) {
 480                         SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "large "
 481                             "kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
 482                             (unsigned long long) size, flags, func, line,
 483                             kmem_alloc_used_read(), kmem_alloc_max);
 484                         spl_debug_dumpstack(NULL);
 485                 }
 486
 487                 /*
 488                  *  We use __strdup() below because the string pointed to by
 489                  * __FUNCTION__ might not be available by the time we want
 490                  * to print it since the module might have been unloaded.
 491                  * This can only fail in the KM_NOSLEEP case.
 492                  */
 493                 dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
 494                 if (unlikely(dptr->kd_func == NULL)) {
 495                         kfree(dptr);
 496                         SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
 497                             "debug __strdup() at %s:%d failed (%lld/%llu)\n",
 498                             func, line, kmem_alloc_used_read(), kmem_alloc_max);
 499                         goto out;
 500                 }
 501
 502                 /* Use the correct allocator */
 503                 if (node_alloc) {
 504                         ASSERT(!(flags & __GFP_ZERO));
 505                         ptr = kmalloc_node_nofail(size, flags, node);
 506                 } else if (flags & __GFP_ZERO) {
 507                         ptr = kzalloc_nofail(size, flags & ~__GFP_ZERO);
 508                 } else {
 509                         ptr = kmalloc_nofail(size, flags);
 510                 }
 511
 512                 if (unlikely(ptr == NULL)) {
 513                         kfree(dptr->kd_func);
 514                         kfree(dptr);
 515                         SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "kmem_alloc"
 516                             "(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
 517                             (unsigned long long) size, flags, func, line,
 518                             kmem_alloc_used_read(), kmem_alloc_max);
 519                         goto out;
 520                 }
 521
 522                 kmem_alloc_used_add(size);
 523                 if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
 524                         kmem_alloc_max = kmem_alloc_used_read();
 525
 526                 INIT_HLIST_NODE(&dptr->kd_hlist);
 527                 INIT_LIST_HEAD(&dptr->kd_list);
 528
 529                 dptr->kd_addr = ptr;
 530                 dptr->kd_size = size;
 531                 dptr->kd_line = line;
 532
 533                 spin_lock_irqsave(&kmem_lock, irq_flags);
 534                 hlist_add_head(&dptr->kd_hlist,
 535                     &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
 536                 list_add_tail(&dptr->kd_list, &kmem_list);
 537                 spin_unlock_irqrestore(&kmem_lock, irq_flags);
 538
 539                 SDEBUG_LIMIT(SD_INFO,
 540                     "kmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n",
 541                     (unsigned long long) size, flags, func, line, ptr,
 542                     kmem_alloc_used_read(), kmem_alloc_max);
 543         }
 544 out:
 545         SRETURN(ptr);
 546 }
 547 EXPORT_SYMBOL(kmem_alloc_track);
 548
 549 void
 550 kmem_free_track(const void *ptr, size_t size)
 551 {
 552         kmem_debug_t *dptr;
 553         SENTRY;
 554
 555         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 556             (unsigned long long) size);
 557
 558         dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
 559
 560         /* Must exist in hash due to kmem_alloc() */
 561         ASSERT(dptr);
 562
 563         /* Size must match */
 564         ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
 565             "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
 566             (unsigned long long) size, dptr->kd_func, dptr->kd_line);
 567
 568         kmem_alloc_used_sub(size);
 569         SDEBUG_LIMIT(SD_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 570             (unsigned long long) size, kmem_alloc_used_read(),
 571             kmem_alloc_max);
 572
 573         kfree(dptr->kd_func);
 574
 575         memset((void *)dptr, 0x5a, sizeof(kmem_debug_t));
 576         kfree(dptr);
 577
 578         memset((void *)ptr, 0x5a, size);
 579         kfree(ptr);
 580
 581         SEXIT;
 582 }
 583 EXPORT_SYMBOL(kmem_free_track);
 584
 585 void *
 586 vmem_alloc_track(size_t size, int flags, const char *func, int line)
 587 {
 588         void *ptr = NULL;
 589         kmem_debug_t *dptr;
 590         unsigned long irq_flags;
 591         SENTRY;
 592
 593         ASSERT(flags & KM_SLEEP);
 594
 595         /* Function may be called with KM_NOSLEEP so failure is possible */
 596         dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
 597             flags & ~__GFP_ZERO);
 598         if (unlikely(dptr == NULL)) {
 599                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "debug "
 600                     "vmem_alloc(%ld, 0x%x) at %s:%d failed (%lld/%llu)\n",
 601                     sizeof(kmem_debug_t), flags, func, line,
 602                     vmem_alloc_used_read(), vmem_alloc_max);
 603         } else {
 604                 /*
 605                  * We use __strdup() below because the string pointed to by
 606                  * __FUNCTION__ might not be available by the time we want
 607                  * to print it, since the module might have been unloaded.
 608                  * This can never fail because we have already asserted
 609                  * that flags is KM_SLEEP.
 610                  */
 611                 dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
 612                 if (unlikely(dptr->kd_func == NULL)) {
 613                         kfree(dptr);
 614                         SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
 615                             "debug __strdup() at %s:%d failed (%lld/%llu)\n",
 616                             func, line, vmem_alloc_used_read(), vmem_alloc_max);
 617                         goto out;
 618                 }
 619
 620                 /* Use the correct allocator */
 621                 if (flags & __GFP_ZERO) {
 622                         ptr = vzalloc_nofail(size, flags & ~__GFP_ZERO);
 623                 } else {
 624                         ptr = vmalloc_nofail(size, flags);
 625                 }
 626
 627                 if (unlikely(ptr == NULL)) {
 628                         kfree(dptr->kd_func);
 629                         kfree(dptr);
 630                         SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "vmem_alloc"
 631                             "(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
 632                             (unsigned long long) size, flags, func, line,
 633                             vmem_alloc_used_read(), vmem_alloc_max);
 634                         goto out;
 635                 }
 636
 637                 vmem_alloc_used_add(size);
 638                 if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
 639                         vmem_alloc_max = vmem_alloc_used_read();
 640
 641                 INIT_HLIST_NODE(&dptr->kd_hlist);
 642                 INIT_LIST_HEAD(&dptr->kd_list);
 643
 644                 dptr->kd_addr = ptr;
 645                 dptr->kd_size = size;
 646                 dptr->kd_line = line;
 647
 648                 spin_lock_irqsave(&vmem_lock, irq_flags);
 649                 hlist_add_head(&dptr->kd_hlist,
 650                     &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]);
 651                 list_add_tail(&dptr->kd_list, &vmem_list);
 652                 spin_unlock_irqrestore(&vmem_lock, irq_flags);
 653
 654                 SDEBUG_LIMIT(SD_INFO,
 655                     "vmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n",
 656                     (unsigned long long) size, flags, func, line,
 657                     ptr, vmem_alloc_used_read(), vmem_alloc_max);
 658         }
 659 out:
 660         SRETURN(ptr);
 661 }
 662 EXPORT_SYMBOL(vmem_alloc_track);
 663
 664 void
 665 vmem_free_track(const void *ptr, size_t size)
 666 {
 667         kmem_debug_t *dptr;
 668         SENTRY;
 669
 670         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 671             (unsigned long long) size);
 672
 673         dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr);
 674
 675         /* Must exist in hash due to vmem_alloc() */
 676         ASSERT(dptr);
 677
 678         /* Size must match */
 679         ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
 680             "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
 681             (unsigned long long) size, dptr->kd_func, dptr->kd_line);
 682
 683         vmem_alloc_used_sub(size);
 684         SDEBUG_LIMIT(SD_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 685             (unsigned long long) size, vmem_alloc_used_read(),
 686             vmem_alloc_max);
 687
 688         kfree(dptr->kd_func);
 689
 690         memset((void *)dptr, 0x5a, sizeof(kmem_debug_t));
 691         kfree(dptr);
 692
 693         memset((void *)ptr, 0x5a, size);
 694         vfree(ptr);
 695
 696         SEXIT;
 697 }
 698 EXPORT_SYMBOL(vmem_free_track);
 699
 700 # else /* DEBUG_KMEM_TRACKING */
 701
 702 void *
 703 kmem_alloc_debug(size_t size, int flags, const char *func, int line,
 704     int node_alloc, int node)
 705 {
 706         void *ptr;
 707         SENTRY;
 708
 709         /*
 710          * Marked unlikely because we should never be doing this,
 711          * we tolerate to up 2 pages but a single page is best.
 712          */
 713         if (unlikely((size > PAGE_SIZE * 2) && !(flags & KM_NODEBUG))) {
 714                 SDEBUG(SD_CONSOLE | SD_WARNING,
 715                     "large kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
 716                     (unsigned long long) size, flags, func, line,
 717                     kmem_alloc_used_read(), kmem_alloc_max);
 718                 dump_stack();
 719         }
 720
 721         /* Use the correct allocator */
 722         if (node_alloc) {
 723                 ASSERT(!(flags & __GFP_ZERO));
 724                 ptr = kmalloc_node_nofail(size, flags, node);
 725         } else if (flags & __GFP_ZERO) {
 726                 ptr = kzalloc_nofail(size, flags & (~__GFP_ZERO));
 727         } else {
 728                 ptr = kmalloc_nofail(size, flags);
 729         }
 730
 731         if (unlikely(ptr == NULL)) {
 732                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
 733                     "kmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
 734                     (unsigned long long) size, flags, func, line,
 735                     kmem_alloc_used_read(), kmem_alloc_max);
 736         } else {
 737                 kmem_alloc_used_add(size);
 738                 if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
 739                         kmem_alloc_max = kmem_alloc_used_read();
 740
 741                 SDEBUG_LIMIT(SD_INFO,
 742                     "kmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n",
 743                     (unsigned long long) size, flags, func, line, ptr,
 744                     kmem_alloc_used_read(), kmem_alloc_max);
 745         }
 746
 747         SRETURN(ptr);
 748 }
 749 EXPORT_SYMBOL(kmem_alloc_debug);
 750
 751 void
 752 kmem_free_debug(const void *ptr, size_t size)
 753 {
 754         SENTRY;
 755
 756         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 757             (unsigned long long) size);
 758
 759         kmem_alloc_used_sub(size);
 760         SDEBUG_LIMIT(SD_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 761             (unsigned long long) size, kmem_alloc_used_read(),
 762             kmem_alloc_max);
 763         kfree(ptr);
 764
 765         SEXIT;
 766 }
 767 EXPORT_SYMBOL(kmem_free_debug);
 768
 769 void *
 770 vmem_alloc_debug(size_t size, int flags, const char *func, int line)
 771 {
 772         void *ptr;
 773         SENTRY;
 774
 775         ASSERT(flags & KM_SLEEP);
 776
 777         /* Use the correct allocator */
 778         if (flags & __GFP_ZERO) {
 779                 ptr = vzalloc_nofail(size, flags & (~__GFP_ZERO));
 780         } else {
 781                 ptr = vmalloc_nofail(size, flags);
 782         }
 783
 784         if (unlikely(ptr == NULL)) {
 785                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
 786                     "vmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
 787                     (unsigned long long) size, flags, func, line,
 788                     vmem_alloc_used_read(), vmem_alloc_max);
 789         } else {
 790                 vmem_alloc_used_add(size);
 791                 if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
 792                         vmem_alloc_max = vmem_alloc_used_read();
 793
 794                 SDEBUG_LIMIT(SD_INFO, "vmem_alloc(%llu, 0x%x) = %p "
 795                     "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
 796                     vmem_alloc_used_read(), vmem_alloc_max);
 797         }
 798
 799         SRETURN(ptr);
 800 }
 801 EXPORT_SYMBOL(vmem_alloc_debug);
 802
 803 void
 804 vmem_free_debug(const void *ptr, size_t size)
 805 {
 806         SENTRY;
 807
 808         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 809             (unsigned long long) size);
 810
 811         vmem_alloc_used_sub(size);
 812         SDEBUG_LIMIT(SD_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 813             (unsigned long long) size, vmem_alloc_used_read(),
 814             vmem_alloc_max);
 815         vfree(ptr);
 816
 817         SEXIT;
 818 }
 819 EXPORT_SYMBOL(vmem_free_debug);
 820
 821 # endif /* DEBUG_KMEM_TRACKING */
 822 #endif /* DEBUG_KMEM */
 823
 824 /*
 825  * Slab allocation interfaces
 826  *
 827  * While the Linux slab implementation was inspired by the Solaris
 828  * implementation I cannot use it to emulate the Solaris APIs.  I
 829  * require two features which are not provided by the Linux slab.
 830  *
 831  * 1) Constructors AND destructors.  Recent versions of the Linux
 832  *    kernel have removed support for destructors.  This is a deal
 833  *    breaker for the SPL which contains particularly expensive
 834  *    initializers for mutex's, condition variables, etc.  We also
 835  *    require a minimal level of cleanup for these data types unlike
 836  *    many Linux data type which do need to be explicitly destroyed.
 837  *
 838  * 2) Virtual address space backed slab.  Callers of the Solaris slab
 839  *    expect it to work well for both small are very large allocations.
 840  *    Because of memory fragmentation the Linux slab which is backed
 841  *    by kmalloc'ed memory performs very badly when confronted with
 842  *    large numbers of large allocations.  Basing the slab on the
 843  *    virtual address space removes the need for contiguous pages
 844  *    and greatly improve performance for large allocations.
 845  *
 846  * For these reasons, the SPL has its own slab implementation with
 847  * the needed features.  It is not as highly optimized as either the
 848  * Solaris or Linux slabs, but it should get me most of what is
 849  * needed until it can be optimized or obsoleted by another approach.
 850  *
 851  * One serious concern I do have about this method is the relatively
 852  * small virtual address space on 32bit arches.  This will seriously
 853  * constrain the size of the slab caches and their performance.
 854  *
 855  * XXX: Improve the partial slab list by carefully maintaining a
 856  *      strict ordering of fullest to emptiest slabs based on
 857  *      the slab reference count.  This guarantees the when freeing
 858  *      slabs back to the system we need only linearly traverse the
 859  *      last N slabs in the list to discover all the freeable slabs.
 860  *
 861  * XXX: NUMA awareness for optionally allocating memory close to a
 862  *      particular core.  This can be advantageous if you know the slab
 863  *      object will be short lived and primarily accessed from one core.
 864  *
 865  * XXX: Slab coloring may also yield performance improvements and would
 866  *      be desirable to implement.
 867  */
 868
 869 struct list_head spl_kmem_cache_list;   /* List of caches */
 870 struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
 871 taskq_t *spl_kmem_cache_taskq;          /* Task queue for ageing / reclaim */
 872
 873 static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);
 874
 875 SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker);
 876 SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker,
 877         spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS);
 878
 879 static void *
 880 kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
 881 {
 882         void *ptr;
 883
 884         ASSERT(ISP2(size));
 885
 886         if (skc->skc_flags & KMC_KMEM)
 887                 ptr = (void *)__get_free_pages(flags | __GFP_COMP,
 888                     get_order(size));
 889         else
 890                 ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
 891
 892         /* Resulting allocated memory will be page aligned */
 893         ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
 894
 895         return ptr;
 896 }
 897
 898 static void
 899 kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
 900 {
 901         ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
 902         ASSERT(ISP2(size));
 903
 904         /*
 905          * The Linux direct reclaim path uses this out of band value to
 906          * determine if forward progress is being made.  Normally this is
 907          * incremented by kmem_freepages() which is part of the various
 908          * Linux slab implementations.  However, since we are using none
 909          * of that infrastructure we are responsible for incrementing it.
 910          */
 911         if (current->reclaim_state)
 912                 current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
 913
 914         if (skc->skc_flags & KMC_KMEM)
 915                 free_pages((unsigned long)ptr, get_order(size));
 916         else
 917                 vfree(ptr);
 918 }
 919
 920 /*
 921  * Required space for each aligned sks.
 922  */
 923 static inline uint32_t
 924 spl_sks_size(spl_kmem_cache_t *skc)
 925 {
 926         return P2ROUNDUP_TYPED(sizeof(spl_kmem_slab_t),
 927                skc->skc_obj_align, uint32_t);
 928 }
 929
 930 /*
 931  * Required space for each aligned object.
 932  */
 933 static inline uint32_t
 934 spl_obj_size(spl_kmem_cache_t *skc)
 935 {
 936         uint32_t align = skc->skc_obj_align;
 937
 938         return P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
 939                P2ROUNDUP_TYPED(sizeof(spl_kmem_obj_t), align, uint32_t);
 940 }
 941
 942 /*
 943  * Lookup the spl_kmem_object_t for an object given that object.
 944  */
 945 static inline spl_kmem_obj_t *
 946 spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
 947 {
 948         return obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
 949                skc->skc_obj_align, uint32_t);
 950 }
 951
 952 /*
 953  * Required space for each offslab object taking in to account alignment
 954  * restrictions and the power-of-two requirement of kv_alloc().
 955  */
 956 static inline uint32_t
 957 spl_offslab_size(spl_kmem_cache_t *skc)
 958 {
 959         return 1UL << (highbit(spl_obj_size(skc)) + 1);
 960 }
 961
 962 /*
 963  * It's important that we pack the spl_kmem_obj_t structure and the
 964  * actual objects in to one large address space to minimize the number
 965  * of calls to the allocator.  It is far better to do a few large
 966  * allocations and then subdivide it ourselves.  Now which allocator
 967  * we use requires balancing a few trade offs.
 968  *
 969  * For small objects we use kmem_alloc() because as long as you are
 970  * only requesting a small number of pages (ideally just one) its cheap.
 971  * However, when you start requesting multiple pages with kmem_alloc()
 972  * it gets increasingly expensive since it requires contiguous pages.
 973  * For this reason we shift to vmem_alloc() for slabs of large objects
 974  * which removes the need for contiguous pages.  We do not use
 975  * vmem_alloc() in all cases because there is significant locking
 976  * overhead in __get_vm_area_node().  This function takes a single
 977  * global lock when acquiring an available virtual address range which
 978  * serializes all vmem_alloc()'s for all slab caches.  Using slightly
 979  * different allocation functions for small and large objects should
 980  * give us the best of both worlds.
 981  *
 982  * KMC_ONSLAB                       KMC_OFFSLAB
 983  *
 984  * +------------------------+       +-----------------+
 985  * | spl_kmem_slab_t --+-+  |       | spl_kmem_slab_t |---+-+
 986  * | skc_obj_size    <-+ |  |       +-----------------+   | |
 987  * | spl_kmem_obj_t      |  |                             | |
 988  * | skc_obj_size    <---+  |       +-----------------+   | |
 989  * | spl_kmem_obj_t      |  |       | skc_obj_size    | <-+ |
 990  * | ...                 v  |       | spl_kmem_obj_t  |     |
 991  * +------------------------+       +-----------------+     v
 992  */
 993 static spl_kmem_slab_t *
 994 spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
 995 {
 996         spl_kmem_slab_t *sks;
 997         spl_kmem_obj_t *sko, *n;
 998         void *base, *obj;
 999         uint32_t obj_size, offslab_size = 0;
1000         int i,  rc = 0;
1001
1002         base = kv_alloc(skc, skc->skc_slab_size, flags);
1003         if (base == NULL)
1004                 SRETURN(NULL);
1005
1006         sks = (spl_kmem_slab_t *)base;
1007         sks->sks_magic = SKS_MAGIC;
1008         sks->sks_objs = skc->skc_slab_objs;
1009         sks->sks_age = jiffies;
1010         sks->sks_cache = skc;
1011         INIT_LIST_HEAD(&sks->sks_list);
1012         INIT_LIST_HEAD(&sks->sks_free_list);
1013         sks->sks_ref = 0;
1014         obj_size = spl_obj_size(skc);
1015
1016         if (skc->skc_flags & KMC_OFFSLAB)
1017                 offslab_size = spl_offslab_size(skc);
1018
1019         for (i = 0; i < sks->sks_objs; i++) {
1020                 if (skc->skc_flags & KMC_OFFSLAB) {
1021                         obj = kv_alloc(skc, offslab_size, flags);
1022                         if (!obj)
1023                                 SGOTO(out, rc = -ENOMEM);
1024                 } else {
1025                         obj = base + spl_sks_size(skc) + (i * obj_size);
1026                 }
1027
1028                 ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
1029                 sko = spl_sko_from_obj(skc, obj);
1030                 sko->sko_addr = obj;
1031                 sko->sko_magic = SKO_MAGIC;
1032                 sko->sko_slab = sks;
1033                 INIT_LIST_HEAD(&sko->sko_list);
1034                 list_add_tail(&sko->sko_list, &sks->sks_free_list);
1035         }
1036
1037         list_for_each_entry(sko, &sks->sks_free_list, sko_list)
1038                 if (skc->skc_ctor)
1039                         skc->skc_ctor(sko->sko_addr, skc->skc_private, flags);
1040 out:
1041         if (rc) {
1042                 if (skc->skc_flags & KMC_OFFSLAB)
1043                         list_for_each_entry_safe(sko, n, &sks->sks_free_list,
1044                                                  sko_list)
1045                                 kv_free(skc, sko->sko_addr, offslab_size);
1046
1047                 kv_free(skc, base, skc->skc_slab_size);
1048                 sks = NULL;
1049         }
1050
1051         SRETURN(sks);
1052 }
1053
1054 /*
1055  * Remove a slab from complete or partial list, it must be called with
1056  * the 'skc->skc_lock' held but the actual free must be performed
1057  * outside the lock to prevent deadlocking on vmem addresses.
1058  */
1059 static void
1060 spl_slab_free(spl_kmem_slab_t *sks,
1061               struct list_head *sks_list, struct list_head *sko_list)
1062 {
1063         spl_kmem_cache_t *skc;
1064         SENTRY;
1065
1066         ASSERT(sks->sks_magic == SKS_MAGIC);
1067         ASSERT(sks->sks_ref == 0);
1068
1069         skc = sks->sks_cache;
1070         ASSERT(skc->skc_magic == SKC_MAGIC);
1071         ASSERT(spin_is_locked(&skc->skc_lock));
1072
1073         /*
1074          * Update slab/objects counters in the cache, then remove the
1075          * slab from the skc->skc_partial_list.  Finally add the slab
1076          * and all its objects in to the private work lists where the
1077          * destructors will be called and the memory freed to the system.
1078          */
1079         skc->skc_obj_total -= sks->sks_objs;
1080         skc->skc_slab_total--;
1081         list_del(&sks->sks_list);
1082         list_add(&sks->sks_list, sks_list);
1083         list_splice_init(&sks->sks_free_list, sko_list);
1084
1085         SEXIT;
1086 }
1087
1088 /*
1089  * Traverses all the partial slabs attached to a cache and free those
1090  * which which are currently empty, and have not been touched for
1091  * skc_delay seconds to  avoid thrashing.  The count argument is
1092  * passed to optionally cap the number of slabs reclaimed, a count
1093  * of zero means try and reclaim everything.  When flag is set we
1094  * always free an available slab regardless of age.
1095  */
1096 static void
1097 spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
1098 {
1099         spl_kmem_slab_t *sks, *m;
1100         spl_kmem_obj_t *sko, *n;
1101         LIST_HEAD(sks_list);
1102         LIST_HEAD(sko_list);
1103         uint32_t size = 0;
1104         int i = 0;
1105         SENTRY;
1106
1107         /*
1108          * Move empty slabs and objects which have not been touched in
1109          * skc_delay seconds on to private lists to be freed outside
1110          * the spin lock.  This delay time is important to avoid thrashing
1111          * however when flag is set the delay will not be used.
1112          */
1113         spin_lock(&skc->skc_lock);
1114         list_for_each_entry_safe_reverse(sks,m,&skc->skc_partial_list,sks_list){
1115                 /*
1116                  * All empty slabs are at the end of skc->skc_partial_list,
1117                  * therefore once a non-empty slab is found we can stop
1118                  * scanning.  Additionally, stop when reaching the target
1119                  * reclaim 'count' if a non-zero threshold is given.
1120                  */
1121                 if ((sks->sks_ref > 0) || (count && i >= count))
1122                         break;
1123
1124                 if (time_after(jiffies,sks->sks_age+skc->skc_delay*HZ)||flag) {
1125                         spl_slab_free(sks, &sks_list, &sko_list);
1126                         i++;
1127                 }
1128         }
1129         spin_unlock(&skc->skc_lock);
1130
1131         /*
1132          * The following two loops ensure all the object destructors are
1133          * run, any offslab objects are freed, and the slabs themselves
1134          * are freed.  This is all done outside the skc->skc_lock since
1135          * this allows the destructor to sleep, and allows us to perform
1136          * a conditional reschedule when a freeing a large number of
1137          * objects and slabs back to the system.
1138          */
1139         if (skc->skc_flags & KMC_OFFSLAB)
1140                 size = spl_offslab_size(skc);
1141
1142         list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
1143                 ASSERT(sko->sko_magic == SKO_MAGIC);
1144
1145                 if (skc->skc_dtor)
1146                         skc->skc_dtor(sko->sko_addr, skc->skc_private);
1147
1148                 if (skc->skc_flags & KMC_OFFSLAB)
1149                         kv_free(skc, sko->sko_addr, size);
1150         }
1151
1152         list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
1153                 ASSERT(sks->sks_magic == SKS_MAGIC);
1154                 kv_free(skc, sks, skc->skc_slab_size);
1155         }
1156
1157         SEXIT;
1158 }
1159
1160 static spl_kmem_emergency_t *
1161 spl_emergency_search(struct rb_root *root, void *obj)
1162 {
1163         struct rb_node *node = root->rb_node;
1164         spl_kmem_emergency_t *ske;
1165         unsigned long address = (unsigned long)obj;
1166
1167         while (node) {
1168                 ske = container_of(node, spl_kmem_emergency_t, ske_node);
1169
1170                 if (address < (unsigned long)ske->ske_obj)
1171                         node = node->rb_left;
1172                 else if (address > (unsigned long)ske->ske_obj)
1173                         node = node->rb_right;
1174                 else
1175                         return ske;
1176         }
1177
1178         return NULL;
1179 }
1180
1181 static int
1182 spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
1183 {
1184         struct rb_node **new = &(root->rb_node), *parent = NULL;
1185         spl_kmem_emergency_t *ske_tmp;
1186         unsigned long address = (unsigned long)ske->ske_obj;
1187
1188         while (*new) {
1189                 ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
1190
1191                 parent = *new;
1192                 if (address < (unsigned long)ske_tmp->ske_obj)
1193                         new = &((*new)->rb_left);
1194                 else if (address > (unsigned long)ske_tmp->ske_obj)
1195                         new = &((*new)->rb_right);
1196                 else
1197                         return 0;
1198         }
1199
1200         rb_link_node(&ske->ske_node, parent, new);
1201         rb_insert_color(&ske->ske_node, root);
1202
1203         return 1;
1204 }
1205
1206 /*
1207  * Allocate a single emergency object and track it in a red black tree.
1208  */
1209 static int
1210 spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
1211 {
1212         spl_kmem_emergency_t *ske;
1213         int empty;
1214         SENTRY;
1215
1216         /* Last chance use a partial slab if one now exists */
1217         spin_lock(&skc->skc_lock);
1218         empty = list_empty(&skc->skc_partial_list);
1219         spin_unlock(&skc->skc_lock);
1220         if (!empty)
1221                 SRETURN(-EEXIST);
1222
1223         ske = kmalloc(sizeof(*ske), flags);
1224         if (ske == NULL)
1225                 SRETURN(-ENOMEM);
1226
1227         ske->ske_obj = kmalloc(skc->skc_obj_size, flags);
1228         if (ske->ske_obj == NULL) {
1229                 kfree(ske);
1230                 SRETURN(-ENOMEM);
1231         }
1232
1233         spin_lock(&skc->skc_lock);
1234         empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
1235         if (likely(empty)) {
1236                 skc->skc_obj_total++;
1237                 skc->skc_obj_emergency++;
1238                 if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
1239                         skc->skc_obj_emergency_max = skc->skc_obj_emergency;
1240         }
1241         spin_unlock(&skc->skc_lock);
1242
1243         if (unlikely(!empty)) {
1244                 kfree(ske->ske_obj);
1245                 kfree(ske);
1246                 SRETURN(-EINVAL);
1247         }
1248
1249         if (skc->skc_ctor)
1250                 skc->skc_ctor(ske->ske_obj, skc->skc_private, flags);
1251
1252         *obj = ske->ske_obj;
1253
1254         SRETURN(0);
1255 }
1256
1257 /*
1258  * Locate the passed object in the red black tree and free it.
1259  */
1260 static int
1261 spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
1262 {
1263         spl_kmem_emergency_t *ske;
1264         SENTRY;
1265
1266         spin_lock(&skc->skc_lock);
1267         ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
1268         if (likely(ske)) {
1269                 rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
1270                 skc->skc_obj_emergency--;
1271                 skc->skc_obj_total--;
1272         }
1273         spin_unlock(&skc->skc_lock);
1274
1275         if (unlikely(ske == NULL))
1276                 SRETURN(-ENOENT);
1277
1278         if (skc->skc_dtor)
1279                 skc->skc_dtor(ske->ske_obj, skc->skc_private);
1280
1281         kfree(ske->ske_obj);
1282         kfree(ske);
1283
1284         SRETURN(0);
1285 }
1286
1287 /*
1288  * Release objects from the per-cpu magazine back to their slab.  The flush
1289  * argument contains the max number of entries to remove from the magazine.
1290  */
1291 static void
1292 __spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
1293 {
1294         int i, count = MIN(flush, skm->skm_avail);
1295         SENTRY;
1296
1297         ASSERT(skc->skc_magic == SKC_MAGIC);
1298         ASSERT(skm->skm_magic == SKM_MAGIC);
1299         ASSERT(spin_is_locked(&skc->skc_lock));
1300
1301         for (i = 0; i < count; i++)
1302                 spl_cache_shrink(skc, skm->skm_objs[i]);
1303
1304         skm->skm_avail -= count;
1305         memmove(skm->skm_objs, &(skm->skm_objs[count]),
1306                 sizeof(void *) * skm->skm_avail);
1307
1308         SEXIT;
1309 }
1310
1311 static void
1312 spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
1313 {
1314         spin_lock(&skc->skc_lock);
1315         __spl_cache_flush(skc, skm, flush);
1316         spin_unlock(&skc->skc_lock);
1317 }
1318
1319 static void
1320 spl_magazine_age(void *data)
1321 {
1322         spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
1323         spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
1324
1325         ASSERT(skm->skm_magic == SKM_MAGIC);
1326         ASSERT(skm->skm_cpu == smp_processor_id());
1327         ASSERT(irqs_disabled());
1328
1329         /* There are no available objects or they are too young to age out */
1330         if ((skm->skm_avail == 0) ||
1331             time_before(jiffies, skm->skm_age + skc->skc_delay * HZ))
1332                 return;
1333
1334         /*
1335          * Because we're executing in interrupt context we may have
1336          * interrupted the holder of this lock.  To avoid a potential
1337          * deadlock return if the lock is contended.
1338          */
1339         if (!spin_trylock(&skc->skc_lock))
1340                 return;
1341
1342         __spl_cache_flush(skc, skm, skm->skm_refill);
1343         spin_unlock(&skc->skc_lock);
1344 }
1345
1346 /*
1347  * Called regularly to keep a downward pressure on the cache.
1348  *
1349  * Objects older than skc->skc_delay seconds in the per-cpu magazines will
1350  * be returned to the caches.  This is done to prevent idle magazines from
1351  * holding memory which could be better used elsewhere.  The delay is
1352  * present to prevent thrashing the magazine.
1353  *
1354  * The newly released objects may result in empty partial slabs.  Those
1355  * slabs should be released to the system.  Otherwise moving the objects
1356  * out of the magazines is just wasted work.
1357  */
1358 static void
1359 spl_cache_age(void *data)
1360 {
1361         spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
1362         taskqid_t id = 0;
1363
1364         ASSERT(skc->skc_magic == SKC_MAGIC);
1365
1366         /* Dynamically disabled at run time */
1367         if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE))
1368                 return;
1369
1370         atomic_inc(&skc->skc_ref);
1371
1372         if (!(skc->skc_flags & KMC_NOMAGAZINE))
1373                 spl_on_each_cpu(spl_magazine_age, skc, 1);
1374
1375         spl_slab_reclaim(skc, skc->skc_reap, 0);
1376
1377         while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) {
1378                 id = taskq_dispatch_delay(
1379                     spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP,
1380                     ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
1381
1382                 /* Destroy issued after dispatch immediately cancel it */
1383                 if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id)
1384                         taskq_cancel_id(spl_kmem_cache_taskq, id);
1385         }
1386
1387         spin_lock(&skc->skc_lock);
1388         skc->skc_taskqid = id;
1389         spin_unlock(&skc->skc_lock);
1390
1391         atomic_dec(&skc->skc_ref);
1392 }
1393
1394 /*
1395  * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
1396  * When on-slab we want to target spl_kmem_cache_obj_per_slab.  However,
1397  * for very small objects we may end up with more than this so as not
1398  * to waste space in the minimal allocation of a single page.  Also for
1399  * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min,
1400  * lower than this and we will fail.
1401  */
1402 static int
1403 spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
1404 {
1405         uint32_t sks_size, obj_size, max_size;
1406
1407         if (skc->skc_flags & KMC_OFFSLAB) {
1408                 *objs = spl_kmem_cache_obj_per_slab;
1409                 *size = P2ROUNDUP(sizeof(spl_kmem_slab_t), PAGE_SIZE);
1410                 SRETURN(0);
1411         } else {
1412                 sks_size = spl_sks_size(skc);
1413                 obj_size = spl_obj_size(skc);
1414
1415                 if (skc->skc_flags & KMC_KMEM)
1416                         max_size = ((uint32_t)1 << (MAX_ORDER-3)) * PAGE_SIZE;
1417                 else
1418                         max_size = (spl_kmem_cache_max_size * 1024 * 1024);
1419
1420                 /* Power of two sized slab */
1421                 for (*size = PAGE_SIZE; *size <= max_size; *size *= 2) {
1422                         *objs = (*size - sks_size) / obj_size;
1423                         if (*objs >= spl_kmem_cache_obj_per_slab)
1424                                 SRETURN(0);
1425                 }
1426
1427                 /*
1428                  * Unable to satisfy target objects per slab, fall back to
1429                  * allocating a maximally sized slab and assuming it can
1430                  * contain the minimum objects count use it.  If not fail.
1431                  */
1432                 *size = max_size;
1433                 *objs = (*size - sks_size) / obj_size;
1434                 if (*objs >= (spl_kmem_cache_obj_per_slab_min))
1435                         SRETURN(0);
1436         }
1437
1438         SRETURN(-ENOSPC);
1439 }
1440
1441 /*
1442  * Make a guess at reasonable per-cpu magazine size based on the size of
1443  * each object and the cost of caching N of them in each magazine.  Long
1444  * term this should really adapt based on an observed usage heuristic.
1445  */
1446 static int
1447 spl_magazine_size(spl_kmem_cache_t *skc)
1448 {
1449         uint32_t obj_size = spl_obj_size(skc);
1450         int size;
1451         SENTRY;
1452
1453         /* Per-magazine sizes below assume a 4Kib page size */
1454         if (obj_size > (PAGE_SIZE * 256))
1455                 size = 4;  /* Minimum 4Mib per-magazine */
1456         else if (obj_size > (PAGE_SIZE * 32))
1457                 size = 16; /* Minimum 2Mib per-magazine */
1458         else if (obj_size > (PAGE_SIZE))
1459                 size = 64; /* Minimum 256Kib per-magazine */
1460         else if (obj_size > (PAGE_SIZE / 4))
1461                 size = 128; /* Minimum 128Kib per-magazine */
1462         else
1463                 size = 256;
1464
1465         SRETURN(size);
1466 }
1467
1468 /*
1469  * Allocate a per-cpu magazine to associate with a specific core.
1470  */
1471 static spl_kmem_magazine_t *
1472 spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
1473 {
1474         spl_kmem_magazine_t *skm;
1475         int size = sizeof(spl_kmem_magazine_t) +
1476                    sizeof(void *) * skc->skc_mag_size;
1477         SENTRY;
1478
1479         skm = kmem_alloc_node(size, KM_SLEEP, cpu_to_node(cpu));
1480         if (skm) {
1481                 skm->skm_magic = SKM_MAGIC;
1482                 skm->skm_avail = 0;
1483                 skm->skm_size = skc->skc_mag_size;
1484                 skm->skm_refill = skc->skc_mag_refill;
1485                 skm->skm_cache = skc;
1486                 skm->skm_age = jiffies;
1487                 skm->skm_cpu = cpu;
1488         }
1489
1490         SRETURN(skm);
1491 }
1492
1493 /*
1494  * Free a per-cpu magazine associated with a specific core.
1495  */
1496 static void
1497 spl_magazine_free(spl_kmem_magazine_t *skm)
1498 {
1499         int size = sizeof(spl_kmem_magazine_t) +
1500                    sizeof(void *) * skm->skm_size;
1501
1502         SENTRY;
1503         ASSERT(skm->skm_magic == SKM_MAGIC);
1504         ASSERT(skm->skm_avail == 0);
1505
1506         kmem_free(skm, size);
1507         SEXIT;
1508 }
1509
1510 /*
1511  * Create all pre-cpu magazines of reasonable sizes.
1512  */
1513 static int
1514 spl_magazine_create(spl_kmem_cache_t *skc)
1515 {
1516         int i;
1517         SENTRY;
1518
1519         if (skc->skc_flags & KMC_NOMAGAZINE)
1520                 SRETURN(0);
1521
1522         skc->skc_mag_size = spl_magazine_size(skc);
1523         skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
1524
1525         for_each_online_cpu(i) {
1526                 skc->skc_mag[i] = spl_magazine_alloc(skc, i);
1527                 if (!skc->skc_mag[i]) {
1528                         for (i--; i >= 0; i--)
1529                                 spl_magazine_free(skc->skc_mag[i]);
1530
1531                         SRETURN(-ENOMEM);
1532                 }
1533         }
1534
1535         SRETURN(0);
1536 }
1537
1538 /*
1539  * Destroy all pre-cpu magazines.
1540  */
1541 static void
1542 spl_magazine_destroy(spl_kmem_cache_t *skc)
1543 {
1544         spl_kmem_magazine_t *skm;
1545         int i;
1546         SENTRY;
1547
1548         if (skc->skc_flags & KMC_NOMAGAZINE) {
1549                 SEXIT;
1550                 return;
1551         }
1552
1553         for_each_online_cpu(i) {
1554                 skm = skc->skc_mag[i];
1555                 spl_cache_flush(skc, skm, skm->skm_avail);
1556                 spl_magazine_free(skm);
1557         }
1558
1559         SEXIT;
1560 }
1561
1562 /*
1563  * Create a object cache based on the following arguments:
1564  * name         cache name
1565  * size         cache object size
1566  * align        cache object alignment
1567  * ctor         cache object constructor
1568  * dtor         cache object destructor
1569  * reclaim      cache object reclaim
1570  * priv         cache private data for ctor/dtor/reclaim
1571  * vmp          unused must be NULL
1572  * flags
1573  *      KMC_NOTOUCH     Disable cache object aging (unsupported)
1574  *      KMC_NODEBUG     Disable debugging (unsupported)
1575  *      KMC_NOHASH      Disable hashing (unsupported)
1576  *      KMC_QCACHE      Disable qcache (unsupported)
1577  *      KMC_NOMAGAZINE  Enabled for kmem/vmem, Disabled for Linux slab
1578  *      KMC_KMEM        Force kmem backed cache
1579  *      KMC_VMEM        Force vmem backed cache
1580  *      KMC_SLAB        Force Linux slab backed cache
1581  *      KMC_OFFSLAB     Locate objects off the slab
1582  */
1583 spl_kmem_cache_t *
1584 spl_kmem_cache_create(char *name, size_t size, size_t align,
1585                       spl_kmem_ctor_t ctor,
1586                       spl_kmem_dtor_t dtor,
1587                       spl_kmem_reclaim_t reclaim,
1588                       void *priv, void *vmp, int flags)
1589 {
1590         spl_kmem_cache_t *skc;
1591         int rc;
1592         SENTRY;
1593
1594         ASSERTF(!(flags & KMC_NOMAGAZINE), "Bad KMC_NOMAGAZINE (%x)\n", flags);
1595         ASSERTF(!(flags & KMC_NOHASH), "Bad KMC_NOHASH (%x)\n", flags);
1596         ASSERTF(!(flags & KMC_QCACHE), "Bad KMC_QCACHE (%x)\n", flags);
1597         ASSERT(vmp == NULL);
1598
1599         might_sleep();
1600
1601         /*
1602          * Allocate memory for a new cache an initialize it.  Unfortunately,
1603          * this usually ends up being a large allocation of ~32k because
1604          * we need to allocate enough memory for the worst case number of
1605          * cpus in the magazine, skc_mag[NR_CPUS].  Because of this we
1606          * explicitly pass KM_NODEBUG to suppress the kmem warning
1607          */
1608         skc = kmem_zalloc(sizeof(*skc), KM_SLEEP| KM_NODEBUG);
1609         if (skc == NULL)
1610                 SRETURN(NULL);
1611
1612         skc->skc_magic = SKC_MAGIC;
1613         skc->skc_name_size = strlen(name) + 1;
1614         skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, KM_SLEEP);
1615         if (skc->skc_name == NULL) {
1616                 kmem_free(skc, sizeof(*skc));
1617                 SRETURN(NULL);
1618         }
1619         strncpy(skc->skc_name, name, skc->skc_name_size);
1620
1621         skc->skc_ctor = ctor;
1622         skc->skc_dtor = dtor;
1623         skc->skc_reclaim = reclaim;
1624         skc->skc_private = priv;
1625         skc->skc_vmp = vmp;
1626         skc->skc_linux_cache = NULL;
1627         skc->skc_flags = flags;
1628         skc->skc_obj_size = size;
1629         skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
1630         skc->skc_delay = SPL_KMEM_CACHE_DELAY;
1631         skc->skc_reap = SPL_KMEM_CACHE_REAP;
1632         atomic_set(&skc->skc_ref, 0);
1633
1634         INIT_LIST_HEAD(&skc->skc_list);
1635         INIT_LIST_HEAD(&skc->skc_complete_list);
1636         INIT_LIST_HEAD(&skc->skc_partial_list);
1637         skc->skc_emergency_tree = RB_ROOT;
1638         spin_lock_init(&skc->skc_lock);
1639         init_waitqueue_head(&skc->skc_waitq);
1640         skc->skc_slab_fail = 0;
1641         skc->skc_slab_create = 0;
1642         skc->skc_slab_destroy = 0;
1643         skc->skc_slab_total = 0;
1644         skc->skc_slab_alloc = 0;
1645         skc->skc_slab_max = 0;
1646         skc->skc_obj_total = 0;
1647         skc->skc_obj_alloc = 0;
1648         skc->skc_obj_max = 0;
1649         skc->skc_obj_deadlock = 0;
1650         skc->skc_obj_emergency = 0;
1651         skc->skc_obj_emergency_max = 0;
1652
1653         /*
1654          * Verify the requested alignment restriction is sane.
1655          */
1656         if (align) {
1657                 VERIFY(ISP2(align));
1658                 VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN);
1659                 VERIFY3U(align, <=, PAGE_SIZE);
1660                 skc->skc_obj_align = align;
1661         }
1662
1663         /*
1664          * When no specific type of slab is requested (kmem, vmem, or
1665          * linuxslab) then select a cache type based on the object size
1666          * and default tunables.
1667          */
1668         if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) {
1669
1670                 /*
1671                  * Objects smaller than spl_kmem_cache_slab_limit can
1672                  * use the Linux slab for better space-efficiency.  By
1673                  * default this functionality is disabled until its
1674                  * performance characters are fully understood.
1675                  */
1676                 if (spl_kmem_cache_slab_limit &&
1677                     size <= (size_t)spl_kmem_cache_slab_limit)
1678                         skc->skc_flags |= KMC_SLAB;
1679
1680                 /*
1681                  * Small objects, less than spl_kmem_cache_kmem_limit per
1682                  * object should use kmem because their slabs are small.
1683                  */
1684                 else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit)
1685                         skc->skc_flags |= KMC_KMEM;
1686
1687                 /*
1688                  * All other objects are considered large and are placed
1689                  * on vmem backed slabs.
1690                  */
1691                 else
1692                         skc->skc_flags |= KMC_VMEM;
1693         }
1694
1695         /*
1696          * Given the type of slab allocate the required resources.
1697          */
1698         if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
1699                 rc = spl_slab_size(skc,
1700                     &skc->skc_slab_objs, &skc->skc_slab_size);
1701                 if (rc)
1702                         SGOTO(out, rc);
1703
1704                 rc = spl_magazine_create(skc);
1705                 if (rc)
1706                         SGOTO(out, rc);
1707         } else {
1708                 skc->skc_linux_cache = kmem_cache_create(
1709                     skc->skc_name, size, align, 0, NULL);
1710                 if (skc->skc_linux_cache == NULL)
1711                         SGOTO(out, rc = ENOMEM);
1712
1713                 kmem_cache_set_allocflags(skc, __GFP_COMP);
1714                 skc->skc_flags |= KMC_NOMAGAZINE;
1715         }
1716
1717         if (spl_kmem_cache_expire & KMC_EXPIRE_AGE)
1718                 skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq,
1719                     spl_cache_age, skc, TQ_SLEEP,
1720                     ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
1721
1722         down_write(&spl_kmem_cache_sem);
1723         list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
1724         up_write(&spl_kmem_cache_sem);
1725
1726         SRETURN(skc);
1727 out:
1728         kmem_free(skc->skc_name, skc->skc_name_size);
1729         kmem_free(skc, sizeof(*skc));
1730         SRETURN(NULL);
1731 }
1732 EXPORT_SYMBOL(spl_kmem_cache_create);
1733
1734 /*
1735  * Register a move callback to for cache defragmentation.
1736  * XXX: Unimplemented but harmless to stub out for now.
1737  */
1738 void
1739 spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
1740     kmem_cbrc_t (move)(void *, void *, size_t, void *))
1741 {
1742         ASSERT(move != NULL);
1743 }
1744 EXPORT_SYMBOL(spl_kmem_cache_set_move);
1745
1746 /*
1747  * Destroy a cache and all objects associated with the cache.
1748  */
1749 void
1750 spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
1751 {
1752         DECLARE_WAIT_QUEUE_HEAD(wq);
1753         taskqid_t id;
1754         SENTRY;
1755
1756         ASSERT(skc->skc_magic == SKC_MAGIC);
1757         ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB));
1758
1759         down_write(&spl_kmem_cache_sem);
1760         list_del_init(&skc->skc_list);
1761         up_write(&spl_kmem_cache_sem);
1762
1763         /* Cancel any and wait for any pending delayed tasks */
1764         VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1765
1766         spin_lock(&skc->skc_lock);
1767         id = skc->skc_taskqid;
1768         spin_unlock(&skc->skc_lock);
1769
1770         taskq_cancel_id(spl_kmem_cache_taskq, id);
1771
1772         /* Wait until all current callers complete, this is mainly
1773          * to catch the case where a low memory situation triggers a
1774          * cache reaping action which races with this destroy. */
1775         wait_event(wq, atomic_read(&skc->skc_ref) == 0);
1776
1777         if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
1778                 spl_magazine_destroy(skc);
1779                 spl_slab_reclaim(skc, 0, 1);
1780         } else {
1781                 ASSERT(skc->skc_flags & KMC_SLAB);
1782                 kmem_cache_destroy(skc->skc_linux_cache);
1783         }
1784
1785         spin_lock(&skc->skc_lock);
1786
1787         /* Validate there are no objects in use and free all the
1788          * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
1789         ASSERT3U(skc->skc_slab_alloc, ==, 0);
1790         ASSERT3U(skc->skc_obj_alloc, ==, 0);
1791         ASSERT3U(skc->skc_slab_total, ==, 0);
1792         ASSERT3U(skc->skc_obj_total, ==, 0);
1793         ASSERT3U(skc->skc_obj_emergency, ==, 0);
1794         ASSERT(list_empty(&skc->skc_complete_list));
1795
1796         kmem_free(skc->skc_name, skc->skc_name_size);
1797         spin_unlock(&skc->skc_lock);
1798
1799         kmem_free(skc, sizeof(*skc));
1800
1801         SEXIT;
1802 }
1803 EXPORT_SYMBOL(spl_kmem_cache_destroy);
1804
1805 /*
1806  * Allocate an object from a slab attached to the cache.  This is used to
1807  * repopulate the per-cpu magazine caches in batches when they run low.
1808  */
1809 static void *
1810 spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
1811 {
1812         spl_kmem_obj_t *sko;
1813
1814         ASSERT(skc->skc_magic == SKC_MAGIC);
1815         ASSERT(sks->sks_magic == SKS_MAGIC);
1816         ASSERT(spin_is_locked(&skc->skc_lock));
1817
1818         sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
1819         ASSERT(sko->sko_magic == SKO_MAGIC);
1820         ASSERT(sko->sko_addr != NULL);
1821
1822         /* Remove from sks_free_list */
1823         list_del_init(&sko->sko_list);
1824
1825         sks->sks_age = jiffies;
1826         sks->sks_ref++;
1827         skc->skc_obj_alloc++;
1828
1829         /* Track max obj usage statistics */
1830         if (skc->skc_obj_alloc > skc->skc_obj_max)
1831                 skc->skc_obj_max = skc->skc_obj_alloc;
1832
1833         /* Track max slab usage statistics */
1834         if (sks->sks_ref == 1) {
1835                 skc->skc_slab_alloc++;
1836
1837                 if (skc->skc_slab_alloc > skc->skc_slab_max)
1838                         skc->skc_slab_max = skc->skc_slab_alloc;
1839         }
1840
1841         return sko->sko_addr;
1842 }
1843
1844 /*
1845  * Generic slab allocation function to run by the global work queues.
1846  * It is responsible for allocating a new slab, linking it in to the list
1847  * of partial slabs, and then waking any waiters.
1848  */
1849 static void
1850 spl_cache_grow_work(void *data)
1851 {
1852         spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
1853         spl_kmem_cache_t *skc = ska->ska_cache;
1854         spl_kmem_slab_t *sks;
1855
1856         sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG);
1857         spin_lock(&skc->skc_lock);
1858         if (sks) {
1859                 skc->skc_slab_total++;
1860                 skc->skc_obj_total += sks->sks_objs;
1861                 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1862         }
1863
1864         atomic_dec(&skc->skc_ref);
1865         clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
1866         clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
1867         wake_up_all(&skc->skc_waitq);
1868         spin_unlock(&skc->skc_lock);
1869
1870         kfree(ska);
1871 }
1872
1873 /*
1874  * Returns non-zero when a new slab should be available.
1875  */
1876 static int
1877 spl_cache_grow_wait(spl_kmem_cache_t *skc)
1878 {
1879         return !test_bit(KMC_BIT_GROWING, &skc->skc_flags);
1880 }
1881
1882 static int
1883 spl_cache_reclaim_wait(void *word)
1884 {
1885         schedule();
1886         return 0;
1887 }
1888
1889 /*
1890  * No available objects on any slabs, create a new slab.  Note that this
1891  * functionality is disabled for KMC_SLAB caches which are backed by the
1892  * Linux slab.
1893  */
1894 static int
1895 spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
1896 {
1897         int remaining, rc;
1898         SENTRY;
1899
1900         ASSERT(skc->skc_magic == SKC_MAGIC);
1901         ASSERT((skc->skc_flags & KMC_SLAB) == 0);
1902         might_sleep();
1903         *obj = NULL;
1904
1905         /*
1906          * Before allocating a new slab wait for any reaping to complete and
1907          * then return so the local magazine can be rechecked for new objects.
1908          */
1909         if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
1910                 rc = wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
1911                     spl_cache_reclaim_wait, TASK_UNINTERRUPTIBLE);
1912                 SRETURN(rc ? rc : -EAGAIN);
1913         }
1914
1915         /*
1916          * This is handled by dispatching a work request to the global work
1917          * queue.  This allows us to asynchronously allocate a new slab while
1918          * retaining the ability to safely fall back to a smaller synchronous
1919          * allocations to ensure forward progress is always maintained.
1920          */
1921         if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
1922                 spl_kmem_alloc_t *ska;
1923
1924                 ska = kmalloc(sizeof(*ska), flags);
1925                 if (ska == NULL) {
1926                         clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
1927                         wake_up_all(&skc->skc_waitq);
1928                         SRETURN(-ENOMEM);
1929                 }
1930
1931                 atomic_inc(&skc->skc_ref);
1932                 ska->ska_cache = skc;
1933                 ska->ska_flags = flags & ~__GFP_FS;
1934                 taskq_init_ent(&ska->ska_tqe);
1935                 taskq_dispatch_ent(spl_kmem_cache_taskq,
1936                     spl_cache_grow_work, ska, 0, &ska->ska_tqe);
1937         }
1938
1939         /*
1940          * The goal here is to only detect the rare case where a virtual slab
1941          * allocation has deadlocked.  We must be careful to minimize the use
1942          * of emergency objects which are more expensive to track.  Therefore,
1943          * we set a very long timeout for the asynchronous allocation and if
1944          * the timeout is reached the cache is flagged as deadlocked.  From
1945          * this point only new emergency objects will be allocated until the
1946          * asynchronous allocation completes and clears the deadlocked flag.
1947          */
1948         if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
1949                 rc = spl_emergency_alloc(skc, flags, obj);
1950         } else {
1951                 remaining = wait_event_timeout(skc->skc_waitq,
1952                                                spl_cache_grow_wait(skc), HZ);
1953
1954                 if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) {
1955                         spin_lock(&skc->skc_lock);
1956                         if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
1957                                 set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
1958                                 skc->skc_obj_deadlock++;
1959                         }
1960                         spin_unlock(&skc->skc_lock);
1961                 }
1962
1963                 rc = -ENOMEM;
1964         }
1965
1966         SRETURN(rc);
1967 }
1968
1969 /*
1970  * Refill a per-cpu magazine with objects from the slabs for this cache.
1971  * Ideally the magazine can be repopulated using existing objects which have
1972  * been released, however if we are unable to locate enough free objects new
1973  * slabs of objects will be created.  On success NULL is returned, otherwise
1974  * the address of a single emergency object is returned for use by the caller.
1975  */
1976 static void *
1977 spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
1978 {
1979         spl_kmem_slab_t *sks;
1980         int count = 0, rc, refill;
1981         void *obj = NULL;
1982         SENTRY;
1983
1984         ASSERT(skc->skc_magic == SKC_MAGIC);
1985         ASSERT(skm->skm_magic == SKM_MAGIC);
1986
1987         refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
1988         spin_lock(&skc->skc_lock);
1989
1990         while (refill > 0) {
1991                 /* No slabs available we may need to grow the cache */
1992                 if (list_empty(&skc->skc_partial_list)) {
1993                         spin_unlock(&skc->skc_lock);
1994
1995                         local_irq_enable();
1996                         rc = spl_cache_grow(skc, flags, &obj);
1997                         local_irq_disable();
1998
1999                         /* Emergency object for immediate use by caller */
2000                         if (rc == 0 && obj != NULL)
2001                                 SRETURN(obj);
2002
2003                         if (rc)
2004                                 SGOTO(out, rc);
2005
2006                         /* Rescheduled to different CPU skm is not local */
2007                         if (skm != skc->skc_mag[smp_processor_id()])
2008                                 SGOTO(out, rc);
2009
2010                         /* Potentially rescheduled to the same CPU but
2011                          * allocations may have occurred from this CPU while
2012                          * we were sleeping so recalculate max refill. */
2013                         refill = MIN(refill, skm->skm_size - skm->skm_avail);
2014
2015                         spin_lock(&skc->skc_lock);
2016                         continue;
2017                 }
2018
2019                 /* Grab the next available slab */
2020                 sks = list_entry((&skc->skc_partial_list)->next,
2021                                  spl_kmem_slab_t, sks_list);
2022                 ASSERT(sks->sks_magic == SKS_MAGIC);
2023                 ASSERT(sks->sks_ref < sks->sks_objs);
2024                 ASSERT(!list_empty(&sks->sks_free_list));
2025
2026                 /* Consume as many objects as needed to refill the requested
2027                  * cache.  We must also be careful not to overfill it. */
2028                 while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) {
2029                         ASSERT(skm->skm_avail < skm->skm_size);
2030                         ASSERT(count < skm->skm_size);
2031                         skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks);
2032                 }
2033
2034                 /* Move slab to skc_complete_list when full */
2035                 if (sks->sks_ref == sks->sks_objs) {
2036                         list_del(&sks->sks_list);
2037                         list_add(&sks->sks_list, &skc->skc_complete_list);
2038                 }
2039         }
2040
2041         spin_unlock(&skc->skc_lock);
2042 out:
2043         SRETURN(NULL);
2044 }
2045
2046 /*
2047  * Release an object back to the slab from which it came.
2048  */
2049 static void
2050 spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
2051 {
2052         spl_kmem_slab_t *sks = NULL;
2053         spl_kmem_obj_t *sko = NULL;
2054         SENTRY;
2055
2056         ASSERT(skc->skc_magic == SKC_MAGIC);
2057         ASSERT(spin_is_locked(&skc->skc_lock));
2058
2059         sko = spl_sko_from_obj(skc, obj);
2060         ASSERT(sko->sko_magic == SKO_MAGIC);
2061         sks = sko->sko_slab;
2062         ASSERT(sks->sks_magic == SKS_MAGIC);
2063         ASSERT(sks->sks_cache == skc);
2064         list_add(&sko->sko_list, &sks->sks_free_list);
2065
2066         sks->sks_age = jiffies;
2067         sks->sks_ref--;
2068         skc->skc_obj_alloc--;
2069
2070         /* Move slab to skc_partial_list when no longer full.  Slabs
2071          * are added to the head to keep the partial list is quasi-full
2072          * sorted order.  Fuller at the head, emptier at the tail. */
2073         if (sks->sks_ref == (sks->sks_objs - 1)) {
2074                 list_del(&sks->sks_list);
2075                 list_add(&sks->sks_list, &skc->skc_partial_list);
2076         }
2077
2078         /* Move empty slabs to the end of the partial list so
2079          * they can be easily found and freed during reclamation. */
2080         if (sks->sks_ref == 0) {
2081                 list_del(&sks->sks_list);
2082                 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
2083                 skc->skc_slab_alloc--;
2084         }
2085
2086         SEXIT;
2087 }
2088
2089 /*
2090  * Allocate an object from the per-cpu magazine, or if the magazine
2091  * is empty directly allocate from a slab and repopulate the magazine.
2092  */
2093 void *
2094 spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
2095 {
2096         spl_kmem_magazine_t *skm;
2097         void *obj = NULL;
2098         SENTRY;
2099
2100         ASSERT(skc->skc_magic == SKC_MAGIC);
2101         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
2102         ASSERT(flags & KM_SLEEP);
2103
2104         atomic_inc(&skc->skc_ref);
2105
2106         /*
2107          * Allocate directly from a Linux slab.  All optimizations are left
2108          * to the underlying cache we only need to guarantee that KM_SLEEP
2109          * callers will never fail.
2110          */
2111         if (skc->skc_flags & KMC_SLAB) {
2112                 struct kmem_cache *slc = skc->skc_linux_cache;
2113
2114                 do {
2115                         obj = kmem_cache_alloc(slc, flags | __GFP_COMP);
2116                         if (obj && skc->skc_ctor)
2117                                 skc->skc_ctor(obj, skc->skc_private, flags);
2118
2119                 } while ((obj == NULL) && !(flags & KM_NOSLEEP));
2120
2121                 atomic_dec(&skc->skc_ref);
2122                 SRETURN(obj);
2123         }
2124
2125         local_irq_disable();
2126
2127 restart:
2128         /* Safe to update per-cpu structure without lock, but
2129          * in the restart case we must be careful to reacquire
2130          * the local magazine since this may have changed
2131          * when we need to grow the cache. */
2132         skm = skc->skc_mag[smp_processor_id()];
2133         ASSERTF(skm->skm_magic == SKM_MAGIC, "%x != %x: %s/%p/%p %x/%x/%x\n",
2134                 skm->skm_magic, SKM_MAGIC, skc->skc_name, skc, skm,
2135                 skm->skm_size, skm->skm_refill, skm->skm_avail);
2136
2137         if (likely(skm->skm_avail)) {
2138                 /* Object available in CPU cache, use it */
2139                 obj = skm->skm_objs[--skm->skm_avail];
2140                 skm->skm_age = jiffies;
2141         } else {
2142                 obj = spl_cache_refill(skc, skm, flags);
2143                 if (obj == NULL)
2144                         SGOTO(restart, obj = NULL);
2145         }
2146
2147         local_irq_enable();
2148         ASSERT(obj);
2149         ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
2150
2151         /* Pre-emptively migrate object to CPU L1 cache */
2152         prefetchw(obj);
2153         atomic_dec(&skc->skc_ref);
2154
2155         SRETURN(obj);
2156 }
2157 EXPORT_SYMBOL(spl_kmem_cache_alloc);
2158
2159 /*
2160  * Free an object back to the local per-cpu magazine, there is no
2161  * guarantee that this is the same magazine the object was originally
2162  * allocated from.  We may need to flush entire from the magazine
2163  * back to the slabs to make space.
2164  */
2165 void
2166 spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
2167 {
2168         spl_kmem_magazine_t *skm;
2169         unsigned long flags;
2170         SENTRY;
2171
2172         ASSERT(skc->skc_magic == SKC_MAGIC);
2173         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
2174         atomic_inc(&skc->skc_ref);
2175
2176         /*
2177          * Free the object from the Linux underlying Linux slab.
2178          */
2179         if (skc->skc_flags & KMC_SLAB) {
2180                 if (skc->skc_dtor)
2181                         skc->skc_dtor(obj, skc->skc_private);
2182
2183                 kmem_cache_free(skc->skc_linux_cache, obj);
2184                 goto out;
2185         }
2186
2187         /*
2188          * Only virtual slabs may have emergency objects and these objects
2189          * are guaranteed to have physical addresses.  They must be removed
2190          * from the tree of emergency objects and the freed.
2191          */
2192         if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj))
2193                 SGOTO(out, spl_emergency_free(skc, obj));
2194
2195         local_irq_save(flags);
2196
2197         /* Safe to update per-cpu structure without lock, but
2198          * no remote memory allocation tracking is being performed
2199          * it is entirely possible to allocate an object from one
2200          * CPU cache and return it to another. */
2201         skm = skc->skc_mag[smp_processor_id()];
2202         ASSERT(skm->skm_magic == SKM_MAGIC);
2203
2204         /* Per-CPU cache full, flush it to make space */
2205         if (unlikely(skm->skm_avail >= skm->skm_size))
2206                 spl_cache_flush(skc, skm, skm->skm_refill);
2207
2208         /* Available space in cache, use it */
2209         skm->skm_objs[skm->skm_avail++] = obj;
2210
2211         local_irq_restore(flags);
2212 out:
2213         atomic_dec(&skc->skc_ref);
2214
2215         SEXIT;
2216 }
2217 EXPORT_SYMBOL(spl_kmem_cache_free);
2218
2219 /*
2220  * The generic shrinker function for all caches.  Under Linux a shrinker
2221  * may not be tightly coupled with a slab cache.  In fact Linux always
2222  * systematically tries calling all registered shrinker callbacks which
2223  * report that they contain unused objects.  Because of this we only
2224  * register one shrinker function in the shim layer for all slab caches.
2225  * We always attempt to shrink all caches when this generic shrinker
2226  * is called.  The shrinker should return the number of free objects
2227  * in the cache when called with nr_to_scan == 0 but not attempt to
2228  * free any objects.  When nr_to_scan > 0 it is a request that nr_to_scan
2229  * objects should be freed, which differs from Solaris semantics.
2230  * Solaris semantics are to free all available objects which may (and
2231  * probably will) be more objects than the requested nr_to_scan.
2232  */
2233 static int
2234 __spl_kmem_cache_generic_shrinker(struct shrinker *shrink,
2235     struct shrink_control *sc)
2236 {
2237         spl_kmem_cache_t *skc;
2238         int unused = 0;
2239
2240         down_read(&spl_kmem_cache_sem);
2241         list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
2242                 if (sc->nr_to_scan)
2243                         spl_kmem_cache_reap_now(skc,
2244                            MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1));
2245
2246                 /*
2247                  * Presume everything alloc'ed in reclaimable, this ensures
2248                  * we are called again with nr_to_scan > 0 so can try and
2249                  * reclaim.  The exact number is not important either so
2250                  * we forgo taking this already highly contented lock.
2251                  */
2252                 unused += skc->skc_obj_alloc;
2253         }
2254         up_read(&spl_kmem_cache_sem);
2255
2256         /*
2257          * After performing reclaim always return -1 to indicate we cannot
2258          * perform additional reclaim.  This prevents shrink_slabs() from
2259          * repeatedly invoking this generic shrinker and potentially spinning.
2260          */
2261         if (sc->nr_to_scan)
2262                 return -1;
2263
2264         return unused;
2265 }
2266
2267 SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker);
2268
2269 /*
2270  * Call the registered reclaim function for a cache.  Depending on how
2271  * many and which objects are released it may simply repopulate the
2272  * local magazine which will then need to age-out.  Objects which cannot
2273  * fit in the magazine we will be released back to their slabs which will
2274  * also need to age out before being release.  This is all just best
2275  * effort and we do not want to thrash creating and destroying slabs.
2276  */
2277 void
2278 spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
2279 {
2280         SENTRY;
2281
2282         ASSERT(skc->skc_magic == SKC_MAGIC);
2283         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
2284
2285         atomic_inc(&skc->skc_ref);
2286
2287         /*
2288          * Execute the registered reclaim callback if it exists.  The
2289          * per-cpu caches will be drained when is set KMC_EXPIRE_MEM.
2290          */
2291         if (skc->skc_flags & KMC_SLAB) {
2292                 if (skc->skc_reclaim)
2293                         skc->skc_reclaim(skc->skc_private);
2294
2295                 if (spl_kmem_cache_expire & KMC_EXPIRE_MEM)
2296                         kmem_cache_shrink(skc->skc_linux_cache);
2297
2298                 SGOTO(out, 0);
2299         }
2300
2301         /*
2302          * Prevent concurrent cache reaping when contended.
2303          */
2304         if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags))
2305                 SGOTO(out, 0);
2306
2307         /*
2308          * When a reclaim function is available it may be invoked repeatedly
2309          * until at least a single slab can be freed.  This ensures that we
2310          * do free memory back to the system.  This helps minimize the chance
2311          * of an OOM event when the bulk of memory is used by the slab.
2312          *
2313          * When free slabs are already available the reclaim callback will be
2314          * skipped.  Additionally, if no forward progress is detected despite
2315          * a reclaim function the cache will be skipped to avoid deadlock.
2316          *
2317          * Longer term this would be the correct place to add the code which
2318          * repacks the slabs in order minimize fragmentation.
2319          */
2320         if (skc->skc_reclaim) {
2321                 uint64_t objects = UINT64_MAX;
2322                 int do_reclaim;
2323
2324                 do {
2325                         spin_lock(&skc->skc_lock);
2326                         do_reclaim =
2327                             (skc->skc_slab_total > 0) &&
2328                             ((skc->skc_slab_total - skc->skc_slab_alloc) == 0) &&
2329                             (skc->skc_obj_alloc < objects);
2330
2331                         objects = skc->skc_obj_alloc;
2332                         spin_unlock(&skc->skc_lock);
2333
2334                         if (do_reclaim)
2335                                 skc->skc_reclaim(skc->skc_private);
2336
2337                 } while (do_reclaim);
2338         }
2339
2340         /* Reclaim from the magazine then the slabs ignoring age and delay. */
2341         if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) {
2342                 spl_kmem_magazine_t *skm;
2343                 unsigned long irq_flags;
2344
2345                 local_irq_save(irq_flags);
2346                 skm = skc->skc_mag[smp_processor_id()];
2347                 spl_cache_flush(skc, skm, skm->skm_avail);
2348                 local_irq_restore(irq_flags);
2349         }
2350
2351         spl_slab_reclaim(skc, count, 1);
2352         clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
2353         smp_mb__after_clear_bit();
2354         wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
2355 out:
2356         atomic_dec(&skc->skc_ref);
2357
2358         SEXIT;
2359 }
2360 EXPORT_SYMBOL(spl_kmem_cache_reap_now);
2361
2362 /*
2363  * Reap all free slabs from all registered caches.
2364  */
2365 void
2366 spl_kmem_reap(void)
2367 {
2368         struct shrink_control sc;
2369
2370         sc.nr_to_scan = KMC_REAP_CHUNK;
2371         sc.gfp_mask = GFP_KERNEL;
2372
2373         __spl_kmem_cache_generic_shrinker(NULL, &sc);
2374 }
2375 EXPORT_SYMBOL(spl_kmem_reap);
2376
2377 #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
2378 static char *
2379 spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
2380 {
2381         int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
2382         int i, flag = 1;
2383
2384         ASSERT(str != NULL && len >= 17);
2385         memset(str, 0, len);
2386
2387         /* Check for a fully printable string, and while we are at
2388          * it place the printable characters in the passed buffer. */
2389         for (i = 0; i < size; i++) {
2390                 str[i] = ((char *)(kd->kd_addr))[i];
2391                 if (isprint(str[i])) {
2392                         continue;
2393                 } else {
2394                         /* Minimum number of printable characters found
2395                          * to make it worthwhile to print this as ascii. */
2396                         if (i > min)
2397                                 break;
2398
2399                         flag = 0;
2400                         break;
2401                 }
2402         }
2403
2404         if (!flag) {
2405                 sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
2406                         *((uint8_t *)kd->kd_addr),
2407                         *((uint8_t *)kd->kd_addr + 2),
2408                         *((uint8_t *)kd->kd_addr + 4),
2409                         *((uint8_t *)kd->kd_addr + 6),
2410                         *((uint8_t *)kd->kd_addr + 8),
2411                         *((uint8_t *)kd->kd_addr + 10),
2412                         *((uint8_t *)kd->kd_addr + 12),
2413                         *((uint8_t *)kd->kd_addr + 14));
2414         }
2415
2416         return str;
2417 }
2418
2419 static int
2420 spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
2421 {
2422         int i;
2423         SENTRY;
2424
2425         spin_lock_init(lock);
2426         INIT_LIST_HEAD(list);
2427
2428         for (i = 0; i < size; i++)
2429                 INIT_HLIST_HEAD(&kmem_table[i]);
2430
2431         SRETURN(0);
2432 }
2433
2434 static void
2435 spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
2436 {
2437         unsigned long flags;
2438         kmem_debug_t *kd;
2439         char str[17];
2440         SENTRY;
2441
2442         spin_lock_irqsave(lock, flags);
2443         if (!list_empty(list))
2444                 printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
2445                        "size", "data", "func", "line");
2446
2447         list_for_each_entry(kd, list, kd_list)
2448                 printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
2449                        (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
2450                        kd->kd_func, kd->kd_line);
2451
2452         spin_unlock_irqrestore(lock, flags);
2453         SEXIT;
2454 }
2455 #else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
2456 #define spl_kmem_init_tracking(list, lock, size)
2457 #define spl_kmem_fini_tracking(list, lock)
2458 #endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
2459
2460 static void
2461 spl_kmem_init_globals(void)
2462 {
2463         struct zone *zone;
2464
2465         /* For now all zones are includes, it may be wise to restrict
2466          * this to normal and highmem zones if we see problems. */
2467         for_each_zone(zone) {
2468
2469                 if (!populated_zone(zone))
2470                         continue;
2471
2472                 minfree += min_wmark_pages(zone);
2473                 desfree += low_wmark_pages(zone);
2474                 lotsfree += high_wmark_pages(zone);
2475         }
2476
2477         /* Solaris default values */
2478         swapfs_minfree = MAX(2*1024*1024 >> PAGE_SHIFT, physmem >> 3);
2479         swapfs_reserve = MIN(4*1024*1024 >> PAGE_SHIFT, physmem >> 4);
2480 }
2481
2482 /*
2483  * Called at module init when it is safe to use spl_kallsyms_lookup_name()
2484  */
2485 int
2486 spl_kmem_init_kallsyms_lookup(void)
2487 {
2488 #ifndef HAVE_GET_VMALLOC_INFO
2489         get_vmalloc_info_fn = (get_vmalloc_info_t)
2490                 spl_kallsyms_lookup_name("get_vmalloc_info");
2491         if (!get_vmalloc_info_fn) {
2492                 printk(KERN_ERR "Error: Unknown symbol get_vmalloc_info\n");
2493                 return -EFAULT;
2494         }
2495 #endif /* HAVE_GET_VMALLOC_INFO */
2496
2497 #ifdef HAVE_PGDAT_HELPERS
2498 # ifndef HAVE_FIRST_ONLINE_PGDAT
2499         first_online_pgdat_fn = (first_online_pgdat_t)
2500                 spl_kallsyms_lookup_name("first_online_pgdat");
2501         if (!first_online_pgdat_fn) {
2502                 printk(KERN_ERR "Error: Unknown symbol first_online_pgdat\n");
2503                 return -EFAULT;
2504         }
2505 # endif /* HAVE_FIRST_ONLINE_PGDAT */
2506
2507 # ifndef HAVE_NEXT_ONLINE_PGDAT
2508         next_online_pgdat_fn = (next_online_pgdat_t)
2509                 spl_kallsyms_lookup_name("next_online_pgdat");
2510         if (!next_online_pgdat_fn) {
2511                 printk(KERN_ERR "Error: Unknown symbol next_online_pgdat\n");
2512                 return -EFAULT;
2513         }
2514 # endif /* HAVE_NEXT_ONLINE_PGDAT */
2515
2516 # ifndef HAVE_NEXT_ZONE
2517         next_zone_fn = (next_zone_t)
2518                 spl_kallsyms_lookup_name("next_zone");
2519         if (!next_zone_fn) {
2520                 printk(KERN_ERR "Error: Unknown symbol next_zone\n");
2521                 return -EFAULT;
2522         }
2523 # endif /* HAVE_NEXT_ZONE */
2524
2525 #else /* HAVE_PGDAT_HELPERS */
2526
2527 # ifndef HAVE_PGDAT_LIST
2528         pgdat_list_addr = *(struct pglist_data **)
2529                 spl_kallsyms_lookup_name("pgdat_list");
2530         if (!pgdat_list_addr) {
2531                 printk(KERN_ERR "Error: Unknown symbol pgdat_list\n");
2532                 return -EFAULT;
2533         }
2534 # endif /* HAVE_PGDAT_LIST */
2535 #endif /* HAVE_PGDAT_HELPERS */
2536
2537 #if defined(NEED_GET_ZONE_COUNTS) && !defined(HAVE_GET_ZONE_COUNTS)
2538         get_zone_counts_fn = (get_zone_counts_t)
2539                 spl_kallsyms_lookup_name("get_zone_counts");
2540         if (!get_zone_counts_fn) {
2541                 printk(KERN_ERR "Error: Unknown symbol get_zone_counts\n");
2542                 return -EFAULT;
2543         }
2544 #endif  /* NEED_GET_ZONE_COUNTS && !HAVE_GET_ZONE_COUNTS */
2545
2546         /*
2547          * It is now safe to initialize the global tunings which rely on
2548          * the use of the for_each_zone() macro.  This macro in turns
2549          * depends on the *_pgdat symbols which are now available.
2550          */
2551         spl_kmem_init_globals();
2552
2553 #ifndef HAVE_SHRINK_DCACHE_MEMORY
2554         /* When shrink_dcache_memory_fn == NULL support is disabled */
2555         shrink_dcache_memory_fn = (shrink_dcache_memory_t)
2556                 spl_kallsyms_lookup_name("shrink_dcache_memory");
2557 #endif /* HAVE_SHRINK_DCACHE_MEMORY */
2558
2559 #ifndef HAVE_SHRINK_ICACHE_MEMORY
2560         /* When shrink_icache_memory_fn == NULL support is disabled */
2561         shrink_icache_memory_fn = (shrink_icache_memory_t)
2562                 spl_kallsyms_lookup_name("shrink_icache_memory");
2563 #endif /* HAVE_SHRINK_ICACHE_MEMORY */
2564
2565         return 0;
2566 }
2567
2568 int
2569 spl_kmem_init(void)
2570 {
2571         int rc = 0;
2572         SENTRY;
2573
2574 #ifdef DEBUG_KMEM
2575         kmem_alloc_used_set(0);
2576         vmem_alloc_used_set(0);
2577
2578         spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
2579         spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE);
2580 #endif
2581
2582         init_rwsem(&spl_kmem_cache_sem);
2583         INIT_LIST_HEAD(&spl_kmem_cache_list);
2584         spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
2585             1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE);
2586
2587         spl_register_shrinker(&spl_kmem_cache_shrinker);
2588
2589         SRETURN(rc);
2590 }
2591
2592 void
2593 spl_kmem_fini(void)
2594 {
2595         SENTRY;
2596
2597         spl_unregister_shrinker(&spl_kmem_cache_shrinker);
2598         taskq_destroy(spl_kmem_cache_taskq);
2599
2600 #ifdef DEBUG_KMEM
2601         /* Display all unreclaimed memory addresses, including the
2602          * allocation size and the first few bytes of what's located
2603          * at that address to aid in debugging.  Performance is not
2604          * a serious concern here since it is module unload time. */
2605         if (kmem_alloc_used_read() != 0)
2606                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
2607                     "kmem leaked %ld/%ld bytes\n",
2608                     kmem_alloc_used_read(), kmem_alloc_max);
2609
2610
2611         if (vmem_alloc_used_read() != 0)
2612                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
2613                     "vmem leaked %ld/%ld bytes\n",
2614                     vmem_alloc_used_read(), vmem_alloc_max);
2615
2616         spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
2617         spl_kmem_fini_tracking(&vmem_list, &vmem_lock);
2618 #endif /* DEBUG_KMEM */
2619
2620         SEXIT;
2621 }