1 /*****************************************************************************\
2 * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
3 * Copyright (C) 2007 The Regents of the University of California.
4 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
5 * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
6 * UCRL-CODE-235197
7 *
8 * This file is part of the SPL, Solaris Porting Layer.
9 * For details, see <http://zfsonlinux.org/>.
10 *
11 * The SPL is free software; you can redistribute it and/or modify it
12 * under the terms of the GNU General Public License as published by the
13 * Free Software Foundation; either version 2 of the License, or (at your
14 * option) any later version.
15 *
16 * The SPL is distributed in the hope that it will be useful, but WITHOUT
17 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
18 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
19 * for more details.
20 *
21 * You should have received a copy of the GNU General Public License along
22 * with the SPL. If not, see <http://www.gnu.org/licenses/>.
23 *****************************************************************************
24 * Solaris Porting Layer (SPL) Kmem Implementation.
25 \*****************************************************************************/
26
27 #include <sys/kmem.h>
28
29 /*
30 * Within the scope of spl-kmem.c file the kmem_cache_* definitions
31 * are removed to allow access to the real Linux slab allocator.
32 */
33 #undef kmem_cache_destroy
34 #undef kmem_cache_create
35 #undef kmem_cache_alloc
36 #undef kmem_cache_free
37
38
39 /*
40 * Cache expiration was implemented because it was part of the default Solaris
41 * kmem_cache behavior. The idea is that per-cpu objects which haven't been
42 * accessed in several seconds should be returned to the cache. On the other
43 * hand Linux slabs never move objects back to the slabs unless there is
44 * memory pressure on the system. By default the Linux method is enabled
45 * because it has been shown to improve responsiveness on low memory systems.
46 * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM.
47 */
48 unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM;
49 EXPORT_SYMBOL(spl_kmem_cache_expire);
50 module_param(spl_kmem_cache_expire, uint, 0644);
51 MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)");
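/*
 * Usage note (illustrative): because the parameter above is registered with
 * mode 0644 the policy may be changed at runtime, for example to switch from
 * the default low-memory behavior to age based expiration:
 *
 *   # echo 0x1 > /sys/module/spl/parameters/spl_kmem_cache_expire
 *
 * or it may be set at load time, e.g. "modprobe spl spl_kmem_cache_expire=1".
 */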
52
53 /*
54 * The default behavior is to report the number of objects remaining in the
55 * cache. This allows the Linux VM to repeatedly reclaim objects from the
56 * cache when memory is low to satisfy other memory allocations. Alternately,
57 * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache
58 * is reclaimed. This may increase the likelihood of out of memory events.
59 */
60 unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */;
61 module_param(spl_kmem_cache_reclaim, uint, 0644);
62 MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)");
63
64 unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
65 module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
66 MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");
67
68 unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN;
69 module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644);
70 MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min,
71 "Minimal number of objects per slab");
72
73 unsigned int spl_kmem_cache_max_size = 32;
74 module_param(spl_kmem_cache_max_size, uint, 0644);
75 MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
76
77 /*
78 * For small objects the Linux slab allocator should be used to make the most
79 * efficient use of the memory. However, large objects are not supported by
80 * the Linux slab and therefore the SPL implementation is preferred. A cutoff
81 * of 16K was determined to be optimal for architectures using 4K pages.
82 */
83 #if PAGE_SIZE == 4096
84 unsigned int spl_kmem_cache_slab_limit = 16384;
85 #else
86 unsigned int spl_kmem_cache_slab_limit = 0;
87 #endif
88 module_param(spl_kmem_cache_slab_limit, uint, 0644);
89 MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
90 "Objects less than N bytes use the Linux slab");
91
92 unsigned int spl_kmem_cache_kmem_limit = (PAGE_SIZE / 4);
93 module_param(spl_kmem_cache_kmem_limit, uint, 0644);
94 MODULE_PARM_DESC(spl_kmem_cache_kmem_limit,
95 "Objects less than N bytes use the kmalloc");
96
97 vmem_t *heap_arena = NULL;
98 EXPORT_SYMBOL(heap_arena);
99
100 vmem_t *zio_alloc_arena = NULL;
101 EXPORT_SYMBOL(zio_alloc_arena);
102
103 vmem_t *zio_arena = NULL;
104 EXPORT_SYMBOL(zio_arena);
105
106 size_t
107 vmem_size(vmem_t *vmp, int typemask)
108 {
109 ASSERT3P(vmp, ==, NULL);
110 ASSERT3S(typemask & VMEM_ALLOC, ==, VMEM_ALLOC);
111 ASSERT3S(typemask & VMEM_FREE, ==, VMEM_FREE);
112
113 return (VMALLOC_TOTAL);
114 }
115 EXPORT_SYMBOL(vmem_size);
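/*
 * Example (illustrative): under this shim the only meaningful query is the
 * total size of the virtual address space heap, so callers pass a NULL
 * arena and both type bits:
 *
 *   size_t total = vmem_size(NULL, VMEM_ALLOC | VMEM_FREE);
 */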
116
117 int
118 kmem_debugging(void)
119 {
120 return 0;
121 }
122 EXPORT_SYMBOL(kmem_debugging);
123
124 char *
125 kmem_vasprintf(const char *fmt, va_list ap)
126 {
127 va_list aq;
128 char *ptr;
129
130 do {
131 va_copy(aq, ap);
132 ptr = kvasprintf(GFP_KERNEL, fmt, aq);
133 va_end(aq);
134 } while (ptr == NULL);
135
136 return ptr;
137 }
138 EXPORT_SYMBOL(kmem_vasprintf);
139
140 char *
141 kmem_asprintf(const char *fmt, ...)
142 {
143 va_list ap;
144 char *ptr;
145
146 do {
147 va_start(ap, fmt);
148 ptr = kvasprintf(GFP_KERNEL, fmt, ap);
149 va_end(ap);
150 } while (ptr == NULL);
151
152 return ptr;
153 }
154 EXPORT_SYMBOL(kmem_asprintf);
155
156 static char *
157 __strdup(const char *str, int flags)
158 {
159 char *ptr;
160 int n;
161
162 n = strlen(str);
163 ptr = kmalloc_nofail(n + 1, flags);
164 if (ptr)
165 memcpy(ptr, str, n + 1);
166
167 return ptr;
168 }
169
170 char *
171 strdup(const char *str)
172 {
173 return __strdup(str, KM_SLEEP);
174 }
175 EXPORT_SYMBOL(strdup);
176
177 void
178 strfree(char *str)
179 {
180 kfree(str);
181 }
182 EXPORT_SYMBOL(strfree);
183
184 /*
185 * Memory allocation interfaces and debugging for basic kmem_*
186 * and vmem_* style memory allocation. When DEBUG_KMEM is enabled
187 * the SPL will keep track of the total memory allocated, and
188 * report any memory leaked when the module is unloaded.
189 */
190 #ifdef DEBUG_KMEM
191
192 /* Shim layer memory accounting */
193 # ifdef HAVE_ATOMIC64_T
194 atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
195 unsigned long long kmem_alloc_max = 0;
196 atomic64_t vmem_alloc_used = ATOMIC64_INIT(0);
197 unsigned long long vmem_alloc_max = 0;
198 # else /* HAVE_ATOMIC64_T */
199 atomic_t kmem_alloc_used = ATOMIC_INIT(0);
200 unsigned long long kmem_alloc_max = 0;
201 atomic_t vmem_alloc_used = ATOMIC_INIT(0);
202 unsigned long long vmem_alloc_max = 0;
203 # endif /* HAVE_ATOMIC64_T */
204
205 EXPORT_SYMBOL(kmem_alloc_used);
206 EXPORT_SYMBOL(kmem_alloc_max);
207 EXPORT_SYMBOL(vmem_alloc_used);
208 EXPORT_SYMBOL(vmem_alloc_max);
209
210 /* When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
211 * but also the location of every alloc and free. When the SPL module is
212 * unloaded a list of all leaked addresses and where they were allocated
213 * will be dumped to the console. Enabling this feature has a significant
214 * impact on performance but it makes finding memory leaks straightforward.
215 *
216 * Not surprisingly, with debugging enabled the xmem_locks are very highly
217 * contended, particularly on xfree(). If we want to run with this detailed
218 * debugging enabled for anything other than debugging we need to minimize
219 * the contention by moving to a lock per xmem_table entry model.
220 */
221 # ifdef DEBUG_KMEM_TRACKING
222
223 # define KMEM_HASH_BITS 10
224 # define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS)
225
226 # define VMEM_HASH_BITS 10
227 # define VMEM_TABLE_SIZE (1 << VMEM_HASH_BITS)
228
229 typedef struct kmem_debug {
230 struct hlist_node kd_hlist; /* Hash node linkage */
231 struct list_head kd_list; /* List of all allocations */
232 void *kd_addr; /* Allocation pointer */
233 size_t kd_size; /* Allocation size */
234 const char *kd_func; /* Allocation function */
235 int kd_line; /* Allocation line */
236 } kmem_debug_t;
237
238 spinlock_t kmem_lock;
239 struct hlist_head kmem_table[KMEM_TABLE_SIZE];
240 struct list_head kmem_list;
241
242 spinlock_t vmem_lock;
243 struct hlist_head vmem_table[VMEM_TABLE_SIZE];
244 struct list_head vmem_list;
245
246 EXPORT_SYMBOL(kmem_lock);
247 EXPORT_SYMBOL(kmem_table);
248 EXPORT_SYMBOL(kmem_list);
249
250 EXPORT_SYMBOL(vmem_lock);
251 EXPORT_SYMBOL(vmem_table);
252 EXPORT_SYMBOL(vmem_list);
253
254 static kmem_debug_t *
255 kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, const void *addr)
256 {
257 struct hlist_head *head;
258 struct hlist_node *node;
259 struct kmem_debug *p;
260 unsigned long flags;
261
262 spin_lock_irqsave(lock, flags);
263
264 head = &table[hash_ptr((void *)addr, bits)];
265 hlist_for_each(node, head) {
266 p = list_entry(node, struct kmem_debug, kd_hlist);
267 if (p->kd_addr == addr) {
268 hlist_del_init(&p->kd_hlist);
269 list_del_init(&p->kd_list);
270 spin_unlock_irqrestore(lock, flags);
271 return p;
272 }
273 }
274
275 spin_unlock_irqrestore(lock, flags);
276
277 return (NULL);
278 }
279
280 void *
281 kmem_alloc_track(size_t size, int flags, const char *func, int line,
282 int node_alloc, int node)
283 {
284 void *ptr = NULL;
285 kmem_debug_t *dptr;
286 unsigned long irq_flags;
287
288 /* Function may be called with KM_NOSLEEP so failure is possible */
289 dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
290 flags & ~__GFP_ZERO);
291
292 if (unlikely(dptr == NULL)) {
293 printk(KERN_WARNING "debug kmem_alloc(%ld, 0x%x) at %s:%d "
294 "failed (%lld/%llu)\n", sizeof(kmem_debug_t), flags,
295 func, line, kmem_alloc_used_read(), kmem_alloc_max);
296 } else {
297 /*
298 * Marked unlikely because we should never be doing this,
299 * we tolerate up to 2 pages but a single page is best.
300 */
301 if (unlikely((size > PAGE_SIZE*2) && !(flags & KM_NODEBUG))) {
302 printk(KERN_WARNING "large kmem_alloc(%llu, 0x%x) "
303 "at %s:%d failed (%lld/%llu)\n",
304 (unsigned long long)size, flags, func, line,
305 kmem_alloc_used_read(), kmem_alloc_max);
306 spl_dumpstack();
307 }
308
309 /*
310 * We use __strdup() below because the string pointed to by
311 * __FUNCTION__ might not be available by the time we want
312 * to print it since the module might have been unloaded.
313 * This can only fail in the KM_NOSLEEP case.
314 */
315 dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
316 if (unlikely(dptr->kd_func == NULL)) {
317 kfree(dptr);
318 printk(KERN_WARNING "debug __strdup() at %s:%d "
319 "failed (%lld/%llu)\n", func, line,
320 kmem_alloc_used_read(), kmem_alloc_max);
321 goto out;
322 }
323
324 /* Use the correct allocator */
325 if (node_alloc) {
326 ASSERT(!(flags & __GFP_ZERO));
327 ptr = kmalloc_node_nofail(size, flags, node);
328 } else if (flags & __GFP_ZERO) {
329 ptr = kzalloc_nofail(size, flags & ~__GFP_ZERO);
330 } else {
331 ptr = kmalloc_nofail(size, flags);
332 }
333
334 if (unlikely(ptr == NULL)) {
335 kfree(dptr->kd_func);
336 kfree(dptr);
337 printk(KERN_WARNING "kmem_alloc(%llu, 0x%x) "
338 "at %s:%d failed (%lld/%llu)\n",
339 (unsigned long long) size, flags, func, line,
340 kmem_alloc_used_read(), kmem_alloc_max);
341 goto out;
342 }
343
344 kmem_alloc_used_add(size);
345 if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
346 kmem_alloc_max = kmem_alloc_used_read();
347
348 INIT_HLIST_NODE(&dptr->kd_hlist);
349 INIT_LIST_HEAD(&dptr->kd_list);
350
351 dptr->kd_addr = ptr;
352 dptr->kd_size = size;
353 dptr->kd_line = line;
354
355 spin_lock_irqsave(&kmem_lock, irq_flags);
356 hlist_add_head(&dptr->kd_hlist,
357 &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
358 list_add_tail(&dptr->kd_list, &kmem_list);
359 spin_unlock_irqrestore(&kmem_lock, irq_flags);
360 }
361 out:
362 return (ptr);
363 }
364 EXPORT_SYMBOL(kmem_alloc_track);
365
366 void
367 kmem_free_track(const void *ptr, size_t size)
368 {
369 kmem_debug_t *dptr;
370
371 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
372 (unsigned long long) size);
373
374 /* Must exist in hash due to kmem_alloc() */
375 dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
376 ASSERT(dptr);
377
378 /* Size must match */
379 ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
380 "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
381 (unsigned long long) size, dptr->kd_func, dptr->kd_line);
382
383 kmem_alloc_used_sub(size);
384 kfree(dptr->kd_func);
385
386 memset((void *)dptr, 0x5a, sizeof(kmem_debug_t));
387 kfree(dptr);
388
389 memset((void *)ptr, 0x5a, size);
390 kfree(ptr);
391 }
392 EXPORT_SYMBOL(kmem_free_track);
393
394 void *
395 vmem_alloc_track(size_t size, int flags, const char *func, int line)
396 {
397 void *ptr = NULL;
398 kmem_debug_t *dptr;
399 unsigned long irq_flags;
400
401 ASSERT(flags & KM_SLEEP);
402
403 /* Function may be called with KM_NOSLEEP so failure is possible */
404 dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
405 flags & ~__GFP_ZERO);
406 if (unlikely(dptr == NULL)) {
407 printk(KERN_WARNING "debug vmem_alloc(%ld, 0x%x) "
408 "at %s:%d failed (%lld/%llu)\n",
409 sizeof(kmem_debug_t), flags, func, line,
410 vmem_alloc_used_read(), vmem_alloc_max);
411 } else {
412 /*
413 * We use __strdup() below because the string pointed to by
414 * __FUNCTION__ might not be available by the time we want
415 * to print it, since the module might have been unloaded.
416 * This can never fail because we have already asserted
417 * that flags is KM_SLEEP.
418 */
419 dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
420 if (unlikely(dptr->kd_func == NULL)) {
421 kfree(dptr);
422 printk(KERN_WARNING "debug __strdup() at %s:%d "
423 "failed (%lld/%llu)\n", func, line,
424 vmem_alloc_used_read(), vmem_alloc_max);
425 goto out;
426 }
427
428 /* Use the correct allocator */
429 if (flags & __GFP_ZERO) {
430 ptr = vzalloc_nofail(size, flags & ~__GFP_ZERO);
431 } else {
432 ptr = vmalloc_nofail(size, flags);
433 }
434
435 if (unlikely(ptr == NULL)) {
436 kfree(dptr->kd_func);
437 kfree(dptr);
438 printk(KERN_WARNING "vmem_alloc (%llu, 0x%x) "
439 "at %s:%d failed (%lld/%llu)\n",
440 (unsigned long long) size, flags, func, line,
441 vmem_alloc_used_read(), vmem_alloc_max);
442 goto out;
443 }
444
445 vmem_alloc_used_add(size);
446 if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
447 vmem_alloc_max = vmem_alloc_used_read();
448
449 INIT_HLIST_NODE(&dptr->kd_hlist);
450 INIT_LIST_HEAD(&dptr->kd_list);
451
452 dptr->kd_addr = ptr;
453 dptr->kd_size = size;
454 dptr->kd_line = line;
455
456 spin_lock_irqsave(&vmem_lock, irq_flags);
457 hlist_add_head(&dptr->kd_hlist,
458 &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]);
459 list_add_tail(&dptr->kd_list, &vmem_list);
460 spin_unlock_irqrestore(&vmem_lock, irq_flags);
461 }
462 out:
463 return (ptr);
464 }
465 EXPORT_SYMBOL(vmem_alloc_track);
466
467 void
468 vmem_free_track(const void *ptr, size_t size)
469 {
470 kmem_debug_t *dptr;
471
472 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
473 (unsigned long long) size);
474
475 /* Must exist in hash due to vmem_alloc() */
476 dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr);
477 ASSERT(dptr);
478
479 /* Size must match */
480 ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
481 "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
482 (unsigned long long) size, dptr->kd_func, dptr->kd_line);
483
484 vmem_alloc_used_sub(size);
485 kfree(dptr->kd_func);
486
487 memset((void *)dptr, 0x5a, sizeof(kmem_debug_t));
488 kfree(dptr);
489
490 memset((void *)ptr, 0x5a, size);
491 vfree(ptr);
492 }
493 EXPORT_SYMBOL(vmem_free_track);
494
495 # else /* DEBUG_KMEM_TRACKING */
496
497 void *
498 kmem_alloc_debug(size_t size, int flags, const char *func, int line,
499 int node_alloc, int node)
500 {
501 void *ptr;
502
503 /*
504 * Marked unlikely because we should never be doing this,
505 * we tolerate up to 2 pages but a single page is best.
506 */
507 if (unlikely((size > PAGE_SIZE * 2) && !(flags & KM_NODEBUG))) {
508 printk(KERN_WARNING
509 "large kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
510 (unsigned long long)size, flags, func, line,
511 (unsigned long long)kmem_alloc_used_read(), kmem_alloc_max);
512 spl_dumpstack();
513 }
514
515 /* Use the correct allocator */
516 if (node_alloc) {
517 ASSERT(!(flags & __GFP_ZERO));
518 ptr = kmalloc_node_nofail(size, flags, node);
519 } else if (flags & __GFP_ZERO) {
520 ptr = kzalloc_nofail(size, flags & (~__GFP_ZERO));
521 } else {
522 ptr = kmalloc_nofail(size, flags);
523 }
524
525 if (unlikely(ptr == NULL)) {
526 printk(KERN_WARNING
527 "kmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
528 (unsigned long long)size, flags, func, line,
529 (unsigned long long)kmem_alloc_used_read(), kmem_alloc_max);
530 } else {
531 kmem_alloc_used_add(size);
532 if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
533 kmem_alloc_max = kmem_alloc_used_read();
534 }
535
536 return (ptr);
537 }
538 EXPORT_SYMBOL(kmem_alloc_debug);
539
540 void
541 kmem_free_debug(const void *ptr, size_t size)
542 {
543 ASSERT(ptr || size > 0);
544 kmem_alloc_used_sub(size);
545 kfree(ptr);
546 }
547 EXPORT_SYMBOL(kmem_free_debug);
548
549 void *
550 vmem_alloc_debug(size_t size, int flags, const char *func, int line)
551 {
552 void *ptr;
553
554 ASSERT(flags & KM_SLEEP);
555
556 /* Use the correct allocator */
557 if (flags & __GFP_ZERO) {
558 ptr = vzalloc_nofail(size, flags & (~__GFP_ZERO));
559 } else {
560 ptr = vmalloc_nofail(size, flags);
561 }
562
563 if (unlikely(ptr == NULL)) {
564 printk(KERN_WARNING
565 "vmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
566 (unsigned long long)size, flags, func, line,
567 (unsigned long long)vmem_alloc_used_read(), vmem_alloc_max);
568 } else {
569 vmem_alloc_used_add(size);
570 if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
571 vmem_alloc_max = vmem_alloc_used_read();
572 }
573
574 return (ptr);
575 }
576 EXPORT_SYMBOL(vmem_alloc_debug);
577
578 void
579 vmem_free_debug(const void *ptr, size_t size)
580 {
581 ASSERT(ptr || size > 0);
582 vmem_alloc_used_sub(size);
583 vfree(ptr);
584 }
585 EXPORT_SYMBOL(vmem_free_debug);
586
587 # endif /* DEBUG_KMEM_TRACKING */
588 #endif /* DEBUG_KMEM */
589
590 /*
591 * Slab allocation interfaces
592 *
593 * While the Linux slab implementation was inspired by the Solaris
594 * implementation I cannot use it to emulate the Solaris APIs. I
595 * require two features which are not provided by the Linux slab.
596 *
597 * 1) Constructors AND destructors. Recent versions of the Linux
598 * kernel have removed support for destructors. This is a deal
599 * breaker for the SPL which contains particularly expensive
600 * initializers for mutex's, condition variables, etc. We also
601 * require a minimal level of cleanup for these data types unlike
602 * many Linux data types which do not need to be explicitly destroyed.
603 *
604 * 2) Virtual address space backed slab. Callers of the Solaris slab
605 * expect it to work well for both small and very large allocations.
606 * Because of memory fragmentation the Linux slab which is backed
607 * by kmalloc'ed memory performs very badly when confronted with
608 * large numbers of large allocations. Basing the slab on the
609 * virtual address space removes the need for contiguous pages
610 * and greatly improves performance for large allocations.
611 *
612 * For these reasons, the SPL has its own slab implementation with
613 * the needed features. It is not as highly optimized as either the
614 * Solaris or Linux slabs, but it should get me most of what is
615 * needed until it can be optimized or obsoleted by another approach.
616 *
617 * One serious concern I do have about this method is the relatively
618 * small virtual address space on 32bit arches. This will seriously
619 * constrain the size of the slab caches and their performance.
620 *
621 * XXX: Improve the partial slab list by carefully maintaining a
622 * strict ordering of fullest to emptiest slabs based on
623 * the slab reference count. This guarantees that when freeing
624 * slabs back to the system we need only linearly traverse the
625 * last N slabs in the list to discover all the freeable slabs.
626 *
627 * XXX: NUMA awareness for optionally allocating memory close to a
628 * particular core. This can be advantageous if you know the slab
629 * object will be short lived and primarily accessed from one core.
630 *
631 * XXX: Slab coloring may also yield performance improvements and would
632 * be desirable to implement.
633 */
634
635 struct list_head spl_kmem_cache_list; /* List of caches */
636 struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
637 taskq_t *spl_kmem_cache_taskq; /* Task queue for ageing / reclaim */
638
639 static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);
640
641 SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker);
642 SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker,
643 spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS);
644
645 static void *
646 kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
647 {
648 void *ptr;
649
650 ASSERT(ISP2(size));
651
652 if (skc->skc_flags & KMC_KMEM)
653 ptr = (void *)__get_free_pages(flags | __GFP_COMP,
654 get_order(size));
655 else
656 ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
657
658 /* Resulting allocated memory will be page aligned */
659 ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
660
661 return ptr;
662 }
663
664 static void
665 kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
666 {
667 ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
668 ASSERT(ISP2(size));
669
670 /*
671 * The Linux direct reclaim path uses this out of band value to
672 * determine if forward progress is being made. Normally this is
673 * incremented by kmem_freepages() which is part of the various
674 * Linux slab implementations. However, since we are using none
675 * of that infrastructure we are responsible for incrementing it.
676 */
677 if (current->reclaim_state)
678 current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
679
680 if (skc->skc_flags & KMC_KMEM)
681 free_pages((unsigned long)ptr, get_order(size));
682 else
683 vfree(ptr);
684 }
685
686 /*
687 * Required space for each aligned sks.
688 */
689 static inline uint32_t
690 spl_sks_size(spl_kmem_cache_t *skc)
691 {
692 return P2ROUNDUP_TYPED(sizeof(spl_kmem_slab_t),
693 skc->skc_obj_align, uint32_t);
694 }
695
696 /*
697 * Required space for each aligned object.
698 */
699 static inline uint32_t
700 spl_obj_size(spl_kmem_cache_t *skc)
701 {
702 uint32_t align = skc->skc_obj_align;
703
704 return P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
705 P2ROUNDUP_TYPED(sizeof(spl_kmem_obj_t), align, uint32_t);
706 }
707
708 /*
709 * Lookup the spl_kmem_obj_t for a given object.
710 */
711 static inline spl_kmem_obj_t *
712 spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
713 {
714 return obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
715 skc->skc_obj_align, uint32_t);
716 }
717
718 /*
719 * Required space for each offslab object taking into account alignment
720 * restrictions and the power-of-two requirement of kv_alloc().
721 */
722 static inline uint32_t
723 spl_offslab_size(spl_kmem_cache_t *skc)
724 {
725 return 1UL << (fls64(spl_obj_size(skc)) + 1);
726 }
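/*
 * Worked example (illustrative): for a hypothetical cache whose aligned
 * object size from spl_obj_size() is 20 KiB, fls64() returns 15 and each
 * offslab object is therefore allocated as 1UL << 16 = 64 KiB, satisfying
 * the power-of-two size required by kv_alloc().
 */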
727
728 /*
729 * It's important that we pack the spl_kmem_obj_t structure and the
730 * actual objects into one large address space to minimize the number
731 * of calls to the allocator. It is far better to do a few large
732 * allocations and then subdivide them ourselves. Which allocator
733 * we use requires balancing a few trade-offs.
734 *
735 * For small objects we use kmem_alloc() because as long as you are
736 * only requesting a small number of pages (ideally just one) its cheap.
737 * However, when you start requesting multiple pages with kmem_alloc()
738 * it gets increasingly expensive since it requires contiguous pages.
739 * For this reason we shift to vmem_alloc() for slabs of large objects
740 * which removes the need for contiguous pages. We do not use
741 * vmem_alloc() in all cases because there is significant locking
742 * overhead in __get_vm_area_node(). This function takes a single
743 * global lock when acquiring an available virtual address range which
744 * serializes all vmem_alloc()'s for all slab caches. Using slightly
745 * different allocation functions for small and large objects should
746 * give us the best of both worlds.
747 *
748 * KMC_ONSLAB KMC_OFFSLAB
749 *
750 * +------------------------+ +-----------------+
751 * | spl_kmem_slab_t --+-+ | | spl_kmem_slab_t |---+-+
752 * | skc_obj_size <-+ | | +-----------------+ | |
753 * | spl_kmem_obj_t | | | |
754 * | skc_obj_size <---+ | +-----------------+ | |
755 * | spl_kmem_obj_t | | | skc_obj_size | <-+ |
756 * | ... v | | spl_kmem_obj_t | |
757 * +------------------------+ +-----------------+ v
758 */
759 static spl_kmem_slab_t *
760 spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
761 {
762 spl_kmem_slab_t *sks;
763 spl_kmem_obj_t *sko, *n;
764 void *base, *obj;
765 uint32_t obj_size, offslab_size = 0;
766 int i, rc = 0;
767
768 base = kv_alloc(skc, skc->skc_slab_size, flags);
769 if (base == NULL)
770 return (NULL);
771
772 sks = (spl_kmem_slab_t *)base;
773 sks->sks_magic = SKS_MAGIC;
774 sks->sks_objs = skc->skc_slab_objs;
775 sks->sks_age = jiffies;
776 sks->sks_cache = skc;
777 INIT_LIST_HEAD(&sks->sks_list);
778 INIT_LIST_HEAD(&sks->sks_free_list);
779 sks->sks_ref = 0;
780 obj_size = spl_obj_size(skc);
781
782 if (skc->skc_flags & KMC_OFFSLAB)
783 offslab_size = spl_offslab_size(skc);
784
785 for (i = 0; i < sks->sks_objs; i++) {
786 if (skc->skc_flags & KMC_OFFSLAB) {
787 obj = kv_alloc(skc, offslab_size, flags);
788 if (!obj) {
789 rc = -ENOMEM;
790 goto out;
791 }
792 } else {
793 obj = base + spl_sks_size(skc) + (i * obj_size);
794 }
795
796 ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
797 sko = spl_sko_from_obj(skc, obj);
798 sko->sko_addr = obj;
799 sko->sko_magic = SKO_MAGIC;
800 sko->sko_slab = sks;
801 INIT_LIST_HEAD(&sko->sko_list);
802 list_add_tail(&sko->sko_list, &sks->sks_free_list);
803 }
804
805 out:
806 if (rc) {
807 if (skc->skc_flags & KMC_OFFSLAB)
808 list_for_each_entry_safe(sko, n, &sks->sks_free_list,
809 sko_list)
810 kv_free(skc, sko->sko_addr, offslab_size);
811
812 kv_free(skc, base, skc->skc_slab_size);
813 sks = NULL;
814 }
815
816 return (sks);
817 }
818
819 /*
820 * Remove a slab from the complete or partial list; it must be called with
821 * the 'skc->skc_lock' held but the actual free must be performed
822 * outside the lock to prevent deadlocking on vmem addresses.
823 */
824 static void
825 spl_slab_free(spl_kmem_slab_t *sks,
826 struct list_head *sks_list, struct list_head *sko_list)
827 {
828 spl_kmem_cache_t *skc;
829
830 ASSERT(sks->sks_magic == SKS_MAGIC);
831 ASSERT(sks->sks_ref == 0);
832
833 skc = sks->sks_cache;
834 ASSERT(skc->skc_magic == SKC_MAGIC);
835 ASSERT(spin_is_locked(&skc->skc_lock));
836
837 /*
838 * Update slab/objects counters in the cache, then remove the
839 * slab from the skc->skc_partial_list. Finally add the slab
840 * and all its objects in to the private work lists where the
841 * destructors will be called and the memory freed to the system.
842 */
843 skc->skc_obj_total -= sks->sks_objs;
844 skc->skc_slab_total--;
845 list_del(&sks->sks_list);
846 list_add(&sks->sks_list, sks_list);
847 list_splice_init(&sks->sks_free_list, sko_list);
848 }
849
850 /*
851 * Traverses all the partial slabs attached to a cache and free those
852 * which are currently empty, and have not been touched for
853 * skc_delay seconds to avoid thrashing. The count argument is
854 * passed to optionally cap the number of slabs reclaimed, a count
855 * of zero means try and reclaim everything. When flag is set we
856 * always free an available slab regardless of age.
857 */
858 static void
859 spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
860 {
861 spl_kmem_slab_t *sks, *m;
862 spl_kmem_obj_t *sko, *n;
863 LIST_HEAD(sks_list);
864 LIST_HEAD(sko_list);
865 uint32_t size = 0;
866 int i = 0;
867
868 /*
869 * Move empty slabs and objects which have not been touched in
870 * skc_delay seconds on to private lists to be freed outside
871 * the spin lock. This delay time is important to avoid thrashing
872 * however when flag is set the delay will not be used.
873 */
874 spin_lock(&skc->skc_lock);
875 list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list, sks_list) {
876 /*
877 * All empty slabs are at the end of skc->skc_partial_list,
878 * therefore once a non-empty slab is found we can stop
879 * scanning. Additionally, stop when reaching the target
880 * reclaim 'count' if a non-zero threshold is given.
881 */
882 if ((sks->sks_ref > 0) || (count && i >= count))
883 break;
884
885 if (time_after(jiffies, sks->sks_age + skc->skc_delay * HZ) || flag) {
886 spl_slab_free(sks, &sks_list, &sko_list);
887 i++;
888 }
889 }
890 spin_unlock(&skc->skc_lock);
891
892 /*
893 * The following two loops ensure all the object destructors are
894 * run, any offslab objects are freed, and the slabs themselves
895 * are freed. This is all done outside the skc->skc_lock since
896 * this allows the destructor to sleep, and allows us to perform
897 * a conditional reschedule when freeing a large number of
898 * objects and slabs back to the system.
899 */
900 if (skc->skc_flags & KMC_OFFSLAB)
901 size = spl_offslab_size(skc);
902
903 list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
904 ASSERT(sko->sko_magic == SKO_MAGIC);
905
906 if (skc->skc_flags & KMC_OFFSLAB)
907 kv_free(skc, sko->sko_addr, size);
908 }
909
910 list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
911 ASSERT(sks->sks_magic == SKS_MAGIC);
912 kv_free(skc, sks, skc->skc_slab_size);
913 }
914 }
915
916 static spl_kmem_emergency_t *
917 spl_emergency_search(struct rb_root *root, void *obj)
918 {
919 struct rb_node *node = root->rb_node;
920 spl_kmem_emergency_t *ske;
921 unsigned long address = (unsigned long)obj;
922
923 while (node) {
924 ske = container_of(node, spl_kmem_emergency_t, ske_node);
925
926 if (address < (unsigned long)ske->ske_obj)
927 node = node->rb_left;
928 else if (address > (unsigned long)ske->ske_obj)
929 node = node->rb_right;
930 else
931 return ske;
932 }
933
934 return NULL;
935 }
936
937 static int
938 spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
939 {
940 struct rb_node **new = &(root->rb_node), *parent = NULL;
941 spl_kmem_emergency_t *ske_tmp;
942 unsigned long address = (unsigned long)ske->ske_obj;
943
944 while (*new) {
945 ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
946
947 parent = *new;
948 if (address < (unsigned long)ske_tmp->ske_obj)
949 new = &((*new)->rb_left);
950 else if (address > (unsigned long)ske_tmp->ske_obj)
951 new = &((*new)->rb_right);
952 else
953 return 0;
954 }
955
956 rb_link_node(&ske->ske_node, parent, new);
957 rb_insert_color(&ske->ske_node, root);
958
959 return 1;
960 }
961
962 /*
963 * Allocate a single emergency object and track it in a red black tree.
964 */
965 static int
966 spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
967 {
968 spl_kmem_emergency_t *ske;
969 int empty;
970
971 /* Last chance: use a partial slab if one now exists */
972 spin_lock(&skc->skc_lock);
973 empty = list_empty(&skc->skc_partial_list);
974 spin_unlock(&skc->skc_lock);
975 if (!empty)
976 return (-EEXIST);
977
978 ske = kmalloc(sizeof(*ske), flags);
979 if (ske == NULL)
980 return (-ENOMEM);
981
982 ske->ske_obj = kmalloc(skc->skc_obj_size, flags);
983 if (ske->ske_obj == NULL) {
984 kfree(ske);
985 return (-ENOMEM);
986 }
987
988 spin_lock(&skc->skc_lock);
989 empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
990 if (likely(empty)) {
991 skc->skc_obj_total++;
992 skc->skc_obj_emergency++;
993 if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
994 skc->skc_obj_emergency_max = skc->skc_obj_emergency;
995 }
996 spin_unlock(&skc->skc_lock);
997
998 if (unlikely(!empty)) {
999 kfree(ske->ske_obj);
1000 kfree(ske);
1001 return (-EINVAL);
1002 }
1003
1004 *obj = ske->ske_obj;
1005
1006 return (0);
1007 }
1008
1009 /*
1010 * Locate the passed object in the red black tree and free it.
1011 */
1012 static int
1013 spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
1014 {
1015 spl_kmem_emergency_t *ske;
1016
1017 spin_lock(&skc->skc_lock);
1018 ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
1019 if (likely(ske)) {
1020 rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
1021 skc->skc_obj_emergency--;
1022 skc->skc_obj_total--;
1023 }
1024 spin_unlock(&skc->skc_lock);
1025
1026 if (unlikely(ske == NULL))
1027 return (-ENOENT);
1028
1029 kfree(ske->ske_obj);
1030 kfree(ske);
1031
1032 return (0);
1033 }
1034
1035 /*
1036 * Release objects from the per-cpu magazine back to their slab. The flush
1037 * argument contains the max number of entries to remove from the magazine.
1038 */
1039 static void
1040 __spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
1041 {
1042 int i, count = MIN(flush, skm->skm_avail);
1043
1044 ASSERT(skc->skc_magic == SKC_MAGIC);
1045 ASSERT(skm->skm_magic == SKM_MAGIC);
1046 ASSERT(spin_is_locked(&skc->skc_lock));
1047
1048 for (i = 0; i < count; i++)
1049 spl_cache_shrink(skc, skm->skm_objs[i]);
1050
1051 skm->skm_avail -= count;
1052 memmove(skm->skm_objs, &(skm->skm_objs[count]),
1053 sizeof(void *) * skm->skm_avail);
1054 }
1055
1056 static void
1057 spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
1058 {
1059 spin_lock(&skc->skc_lock);
1060 __spl_cache_flush(skc, skm, flush);
1061 spin_unlock(&skc->skc_lock);
1062 }
1063
1064 static void
1065 spl_magazine_age(void *data)
1066 {
1067 spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
1068 spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
1069
1070 ASSERT(skm->skm_magic == SKM_MAGIC);
1071 ASSERT(skm->skm_cpu == smp_processor_id());
1072 ASSERT(irqs_disabled());
1073
1074 /* There are no available objects or they are too young to age out */
1075 if ((skm->skm_avail == 0) ||
1076 time_before(jiffies, skm->skm_age + skc->skc_delay * HZ))
1077 return;
1078
1079 /*
1080 * Because we're executing in interrupt context we may have
1081 * interrupted the holder of this lock. To avoid a potential
1082 * deadlock return if the lock is contended.
1083 */
1084 if (!spin_trylock(&skc->skc_lock))
1085 return;
1086
1087 __spl_cache_flush(skc, skm, skm->skm_refill);
1088 spin_unlock(&skc->skc_lock);
1089 }
1090
1091 /*
1092 * Called regularly to keep a downward pressure on the cache.
1093 *
1094 * Objects older than skc->skc_delay seconds in the per-cpu magazines will
1095 * be returned to the caches. This is done to prevent idle magazines from
1096 * holding memory which could be better used elsewhere. The delay is
1097 * present to prevent thrashing the magazine.
1098 *
1099 * The newly released objects may result in empty partial slabs. Those
1100 * slabs should be released to the system. Otherwise moving the objects
1101 * out of the magazines is just wasted work.
1102 */
1103 static void
1104 spl_cache_age(void *data)
1105 {
1106 spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
1107 taskqid_t id = 0;
1108
1109 ASSERT(skc->skc_magic == SKC_MAGIC);
1110
1111 /* Dynamically disabled at run time */
1112 if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE))
1113 return;
1114
1115 atomic_inc(&skc->skc_ref);
1116
1117 if (!(skc->skc_flags & KMC_NOMAGAZINE))
1118 on_each_cpu(spl_magazine_age, skc, 1);
1119
1120 spl_slab_reclaim(skc, skc->skc_reap, 0);
1121
1122 while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) {
1123 id = taskq_dispatch_delay(
1124 spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP,
1125 ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
1126
1127 /* If a destroy was issued after dispatch, immediately cancel it */
1128 if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id)
1129 taskq_cancel_id(spl_kmem_cache_taskq, id);
1130 }
1131
1132 spin_lock(&skc->skc_lock);
1133 skc->skc_taskqid = id;
1134 spin_unlock(&skc->skc_lock);
1135
1136 atomic_dec(&skc->skc_ref);
1137 }
1138
1139 /*
1140 * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
1141 * When on-slab we want to target spl_kmem_cache_obj_per_slab. However,
1142 * for very small objects we may end up with more than this so as not
1143 * to waste space in the minimal allocation of a single page. Also for
1144 * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min;
1145 * below this minimum the sizing will fail.
1146 */
1147 static int
1148 spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
1149 {
1150 uint32_t sks_size, obj_size, max_size;
1151
1152 if (skc->skc_flags & KMC_OFFSLAB) {
1153 *objs = spl_kmem_cache_obj_per_slab;
1154 *size = P2ROUNDUP(sizeof(spl_kmem_slab_t), PAGE_SIZE);
1155 return (0);
1156 } else {
1157 sks_size = spl_sks_size(skc);
1158 obj_size = spl_obj_size(skc);
1159
1160 if (skc->skc_flags & KMC_KMEM)
1161 max_size = ((uint32_t)1 << (MAX_ORDER-3)) * PAGE_SIZE;
1162 else
1163 max_size = (spl_kmem_cache_max_size * 1024 * 1024);
1164
1165 /* Power of two sized slab */
1166 for (*size = PAGE_SIZE; *size <= max_size; *size *= 2) {
1167 *objs = (*size - sks_size) / obj_size;
1168 if (*objs >= spl_kmem_cache_obj_per_slab)
1169 return (0);
1170 }
1171
1172 /*
1173 * Unable to satisfy the target objects per slab, fall back to
1174 * allocating a maximally sized slab and use it if it can hold
1175 * at least the minimum object count; otherwise fail.
1176 */
1177 *size = max_size;
1178 *objs = (*size - sks_size) / obj_size;
1179 if (*objs >= (spl_kmem_cache_obj_per_slab_min))
1180 return (0);
1181 }
1182
1183 return (-ENOSPC);
1184 }
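/*
 * Worked example (illustrative, all figures assumed): for an aligned object
 * size of 2048 bytes, an sks header of 128 bytes, and a target of 16 objects
 * per slab, the power-of-two search above yields 1, 3, 7, 15, and finally 31
 * objects for 4K, 8K, 16K, 32K, and 64K slabs, so a 64 KiB slab holding 31
 * objects is selected.
 */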
1185
1186 /*
1187 * Make a guess at reasonable per-cpu magazine size based on the size of
1188 * each object and the cost of caching N of them in each magazine. Long
1189 * term this should really adapt based on an observed usage heuristic.
1190 */
1191 static int
1192 spl_magazine_size(spl_kmem_cache_t *skc)
1193 {
1194 uint32_t obj_size = spl_obj_size(skc);
1195 int size;
1196
1197 /* Per-magazine sizes below assume a 4 KiB page size */
1198 if (obj_size > (PAGE_SIZE * 256))
1199 size = 4; /* Minimum 4 MiB per-magazine */
1200 else if (obj_size > (PAGE_SIZE * 32))
1201 size = 16; /* Minimum 2 MiB per-magazine */
1202 else if (obj_size > (PAGE_SIZE))
1203 size = 64; /* Minimum 256 KiB per-magazine */
1204 else if (obj_size > (PAGE_SIZE / 4))
1205 size = 128; /* Minimum 128 KiB per-magazine */
1206 else
1207 size = 256;
1208
1209 return (size);
1210 }
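/*
 * Example (illustrative): with 4 KiB pages an 8 KiB object falls into the
 * third bucket above, so each per-cpu magazine caches up to 64 objects,
 * roughly 512 KiB per cpu for that cache.
 */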
1211
1212 /*
1213 * Allocate a per-cpu magazine to associate with a specific core.
1214 */
1215 static spl_kmem_magazine_t *
1216 spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
1217 {
1218 spl_kmem_magazine_t *skm;
1219 int size = sizeof(spl_kmem_magazine_t) +
1220 sizeof(void *) * skc->skc_mag_size;
1221
1222 skm = kmem_alloc_node(size, KM_SLEEP, cpu_to_node(cpu));
1223 if (skm) {
1224 skm->skm_magic = SKM_MAGIC;
1225 skm->skm_avail = 0;
1226 skm->skm_size = skc->skc_mag_size;
1227 skm->skm_refill = skc->skc_mag_refill;
1228 skm->skm_cache = skc;
1229 skm->skm_age = jiffies;
1230 skm->skm_cpu = cpu;
1231 }
1232
1233 return (skm);
1234 }
1235
1236 /*
1237 * Free a per-cpu magazine associated with a specific core.
1238 */
1239 static void
1240 spl_magazine_free(spl_kmem_magazine_t *skm)
1241 {
1242 int size = sizeof(spl_kmem_magazine_t) +
1243 sizeof(void *) * skm->skm_size;
1244
1245 ASSERT(skm->skm_magic == SKM_MAGIC);
1246 ASSERT(skm->skm_avail == 0);
1247
1248 kmem_free(skm, size);
1249 }
1250
1251 /*
1252 * Create all per-cpu magazines of reasonable sizes.
1253 */
1254 static int
1255 spl_magazine_create(spl_kmem_cache_t *skc)
1256 {
1257 int i;
1258
1259 if (skc->skc_flags & KMC_NOMAGAZINE)
1260 return (0);
1261
1262 skc->skc_mag_size = spl_magazine_size(skc);
1263 skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
1264
1265 for_each_online_cpu(i) {
1266 skc->skc_mag[i] = spl_magazine_alloc(skc, i);
1267 if (!skc->skc_mag[i]) {
1268 for (i--; i >= 0; i--)
1269 spl_magazine_free(skc->skc_mag[i]);
1270
1271 return (-ENOMEM);
1272 }
1273 }
1274
1275 return (0);
1276 }
1277
1278 /*
1279 * Destroy all per-cpu magazines.
1280 */
1281 static void
1282 spl_magazine_destroy(spl_kmem_cache_t *skc)
1283 {
1284 spl_kmem_magazine_t *skm;
1285 int i;
1286
1287 if (skc->skc_flags & KMC_NOMAGAZINE)
1288 return;
1289
1290 for_each_online_cpu(i) {
1291 skm = skc->skc_mag[i];
1292 spl_cache_flush(skc, skm, skm->skm_avail);
1293 spl_magazine_free(skm);
1294 }
1295 }
1296
1297 /*
1298 * Create an object cache based on the following arguments:
1299 * name cache name
1300 * size cache object size
1301 * align cache object alignment
1302 * ctor cache object constructor
1303 * dtor cache object destructor
1304 * reclaim cache object reclaim
1305 * priv cache private data for ctor/dtor/reclaim
1306 * vmp unused must be NULL
1307 * flags
1308 * KMC_NOTOUCH Disable cache object aging (unsupported)
1309 * KMC_NODEBUG Disable debugging (unsupported)
1310 * KMC_NOHASH Disable hashing (unsupported)
1311 * KMC_QCACHE Disable qcache (unsupported)
1312 * KMC_NOMAGAZINE Enabled for kmem/vmem, Disabled for Linux slab
1313 * KMC_KMEM Force kmem backed cache
1314 * KMC_VMEM Force vmem backed cache
1315 * KMC_SLAB Force Linux slab backed cache
1316 * KMC_OFFSLAB Locate objects off the slab
1317 */
1318 spl_kmem_cache_t *
1319 spl_kmem_cache_create(char *name, size_t size, size_t align,
1320 spl_kmem_ctor_t ctor,
1321 spl_kmem_dtor_t dtor,
1322 spl_kmem_reclaim_t reclaim,
1323 void *priv, void *vmp, int flags)
1324 {
1325 spl_kmem_cache_t *skc;
1326 int rc;
1327
1328 /*
1329 * Unsupported flags
1330 */
1331 ASSERT0(flags & KMC_NOMAGAZINE);
1332 ASSERT0(flags & KMC_NOHASH);
1333 ASSERT0(flags & KMC_QCACHE);
1334 ASSERT(vmp == NULL);
1335
1336 might_sleep();
1337
1338 /*
1339 * Allocate memory for a new cache and initialize it. Unfortunately,
1340 * this usually ends up being a large allocation of ~32k because
1341 * we need to allocate enough memory for the worst case number of
1342 * cpus in the magazine, skc_mag[NR_CPUS]. Because of this we
1343 * explicitly pass KM_NODEBUG to suppress the kmem warning.
1344 */
1345 skc = kmem_zalloc(sizeof(*skc), KM_SLEEP | KM_NODEBUG);
1346 if (skc == NULL)
1347 return (NULL);
1348
1349 skc->skc_magic = SKC_MAGIC;
1350 skc->skc_name_size = strlen(name) + 1;
1351 skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, KM_SLEEP);
1352 if (skc->skc_name == NULL) {
1353 kmem_free(skc, sizeof(*skc));
1354 return (NULL);
1355 }
1356 strncpy(skc->skc_name, name, skc->skc_name_size);
1357
1358 skc->skc_ctor = ctor;
1359 skc->skc_dtor = dtor;
1360 skc->skc_reclaim = reclaim;
1361 skc->skc_private = priv;
1362 skc->skc_vmp = vmp;
1363 skc->skc_linux_cache = NULL;
1364 skc->skc_flags = flags;
1365 skc->skc_obj_size = size;
1366 skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
1367 skc->skc_delay = SPL_KMEM_CACHE_DELAY;
1368 skc->skc_reap = SPL_KMEM_CACHE_REAP;
1369 atomic_set(&skc->skc_ref, 0);
1370
1371 INIT_LIST_HEAD(&skc->skc_list);
1372 INIT_LIST_HEAD(&skc->skc_complete_list);
1373 INIT_LIST_HEAD(&skc->skc_partial_list);
1374 skc->skc_emergency_tree = RB_ROOT;
1375 spin_lock_init(&skc->skc_lock);
1376 init_waitqueue_head(&skc->skc_waitq);
1377 skc->skc_slab_fail = 0;
1378 skc->skc_slab_create = 0;
1379 skc->skc_slab_destroy = 0;
1380 skc->skc_slab_total = 0;
1381 skc->skc_slab_alloc = 0;
1382 skc->skc_slab_max = 0;
1383 skc->skc_obj_total = 0;
1384 skc->skc_obj_alloc = 0;
1385 skc->skc_obj_max = 0;
1386 skc->skc_obj_deadlock = 0;
1387 skc->skc_obj_emergency = 0;
1388 skc->skc_obj_emergency_max = 0;
1389
1390 /*
1391 * Verify the requested alignment restriction is sane.
1392 */
1393 if (align) {
1394 VERIFY(ISP2(align));
1395 VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN);
1396 VERIFY3U(align, <=, PAGE_SIZE);
1397 skc->skc_obj_align = align;
1398 }
1399
1400 /*
1401 * When no specific type of slab is requested (kmem, vmem, or
1402 * Linux slab) select a cache type based on the object size
1403 * and default tunables.
1404 */
1405 if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) {
1406
1407 /*
1408 * Objects smaller than spl_kmem_cache_slab_limit can
1409 * use the Linux slab for better space-efficiency. By
1410 * default this is only enabled on 4 KiB page systems until its
1411 * performance characteristics are fully understood.
1412 */
1413 if (spl_kmem_cache_slab_limit &&
1414 size <= (size_t)spl_kmem_cache_slab_limit)
1415 skc->skc_flags |= KMC_SLAB;
1416
1417 /*
1418 * Small objects, less than spl_kmem_cache_kmem_limit per
1419 * object should use kmem because their slabs are small.
1420 */
1421 else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit)
1422 skc->skc_flags |= KMC_KMEM;
1423
1424 /*
1425 * All other objects are considered large and are placed
1426 * on vmem backed slabs.
1427 */
1428 else
1429 skc->skc_flags |= KMC_VMEM;
1430 }
1431
1432 /*
1433 * Given the type of slab allocate the required resources.
1434 */
1435 if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
1436 rc = spl_slab_size(skc,
1437 &skc->skc_slab_objs, &skc->skc_slab_size);
1438 if (rc)
1439 goto out;
1440
1441 rc = spl_magazine_create(skc);
1442 if (rc)
1443 goto out;
1444 } else {
1445 skc->skc_linux_cache = kmem_cache_create(
1446 skc->skc_name, size, align, 0, NULL);
1447 if (skc->skc_linux_cache == NULL) {
1448 rc = ENOMEM;
1449 goto out;
1450 }
1451
1452 kmem_cache_set_allocflags(skc, __GFP_COMP);
1453 skc->skc_flags |= KMC_NOMAGAZINE;
1454 }
1455
1456 if (spl_kmem_cache_expire & KMC_EXPIRE_AGE)
1457 skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq,
1458 spl_cache_age, skc, TQ_SLEEP,
1459 ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
1460
1461 down_write(&spl_kmem_cache_sem);
1462 list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
1463 up_write(&spl_kmem_cache_sem);
1464
1465 return (skc);
1466 out:
1467 kmem_free(skc->skc_name, skc->skc_name_size);
1468 kmem_free(skc, sizeof(*skc));
1469 return (NULL);
1470 }
1471 EXPORT_SYMBOL(spl_kmem_cache_create);
1472
1473 /*
1474 * Register a move callback for cache defragmentation.
1475 * XXX: Unimplemented but harmless to stub out for now.
1476 */
1477 void
1478 spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
1479 kmem_cbrc_t (move)(void *, void *, size_t, void *))
1480 {
1481 ASSERT(move != NULL);
1482 }
1483 EXPORT_SYMBOL(spl_kmem_cache_set_move);
1484
1485 /*
1486 * Destroy a cache and all objects associated with the cache.
1487 */
1488 void
1489 spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
1490 {
1491 DECLARE_WAIT_QUEUE_HEAD(wq);
1492 taskqid_t id;
1493
1494 ASSERT(skc->skc_magic == SKC_MAGIC);
1495 ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB));
1496
1497 down_write(&spl_kmem_cache_sem);
1498 list_del_init(&skc->skc_list);
1499 up_write(&spl_kmem_cache_sem);
1500
1501 /* Cancel and wait for any pending delayed tasks */
1502 VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1503
1504 spin_lock(&skc->skc_lock);
1505 id = skc->skc_taskqid;
1506 spin_unlock(&skc->skc_lock);
1507
1508 taskq_cancel_id(spl_kmem_cache_taskq, id);
1509
1510 /* Wait until all current callers complete; this is mainly
1511 * to catch the case where a low memory situation triggers a
1512 * cache reaping action which races with this destroy. */
1513 wait_event(wq, atomic_read(&skc->skc_ref) == 0);
1514
1515 if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
1516 spl_magazine_destroy(skc);
1517 spl_slab_reclaim(skc, 0, 1);
1518 } else {
1519 ASSERT(skc->skc_flags & KMC_SLAB);
1520 kmem_cache_destroy(skc->skc_linux_cache);
1521 }
1522
1523 spin_lock(&skc->skc_lock);
1524
1525 /* Validate there are no objects in use and free all the
1526 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
1527 ASSERT3U(skc->skc_slab_alloc, ==, 0);
1528 ASSERT3U(skc->skc_obj_alloc, ==, 0);
1529 ASSERT3U(skc->skc_slab_total, ==, 0);
1530 ASSERT3U(skc->skc_obj_total, ==, 0);
1531 ASSERT3U(skc->skc_obj_emergency, ==, 0);
1532 ASSERT(list_empty(&skc->skc_complete_list));
1533
1534 kmem_free(skc->skc_name, skc->skc_name_size);
1535 spin_unlock(&skc->skc_lock);
1536
1537 kmem_free(skc, sizeof(*skc));
1538 }
1539 EXPORT_SYMBOL(spl_kmem_cache_destroy);
1540
1541 /*
1542 * Allocate an object from a slab attached to the cache. This is used to
1543 * repopulate the per-cpu magazine caches in batches when they run low.
1544 */
1545 static void *
1546 spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
1547 {
1548 spl_kmem_obj_t *sko;
1549
1550 ASSERT(skc->skc_magic == SKC_MAGIC);
1551 ASSERT(sks->sks_magic == SKS_MAGIC);
1552 ASSERT(spin_is_locked(&skc->skc_lock));
1553
1554 sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
1555 ASSERT(sko->sko_magic == SKO_MAGIC);
1556 ASSERT(sko->sko_addr != NULL);
1557
1558 /* Remove from sks_free_list */
1559 list_del_init(&sko->sko_list);
1560
1561 sks->sks_age = jiffies;
1562 sks->sks_ref++;
1563 skc->skc_obj_alloc++;
1564
1565 /* Track max obj usage statistics */
1566 if (skc->skc_obj_alloc > skc->skc_obj_max)
1567 skc->skc_obj_max = skc->skc_obj_alloc;
1568
1569 /* Track max slab usage statistics */
1570 if (sks->sks_ref == 1) {
1571 skc->skc_slab_alloc++;
1572
1573 if (skc->skc_slab_alloc > skc->skc_slab_max)
1574 skc->skc_slab_max = skc->skc_slab_alloc;
1575 }
1576
1577 return sko->sko_addr;
1578 }
1579
1580 /*
1581 * Generic slab allocation function to be run by the global work queues.
1582 * It is responsible for allocating a new slab, linking it in to the list
1583 * of partial slabs, and then waking any waiters.
1584 */
1585 static void
1586 spl_cache_grow_work(void *data)
1587 {
1588 spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
1589 spl_kmem_cache_t *skc = ska->ska_cache;
1590 spl_kmem_slab_t *sks;
1591
1592 sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG);
1593 spin_lock(&skc->skc_lock);
1594 if (sks) {
1595 skc->skc_slab_total++;
1596 skc->skc_obj_total += sks->sks_objs;
1597 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1598 }
1599
1600 atomic_dec(&skc->skc_ref);
1601 clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
1602 clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
1603 wake_up_all(&skc->skc_waitq);
1604 spin_unlock(&skc->skc_lock);
1605
1606 kfree(ska);
1607 }
1608
1609 /*
1610 * Returns non-zero when a new slab should be available.
1611 */
1612 static int
1613 spl_cache_grow_wait(spl_kmem_cache_t *skc)
1614 {
1615 return !test_bit(KMC_BIT_GROWING, &skc->skc_flags);
1616 }
1617
1618 /*
1619 * No available objects on any slabs, create a new slab. Note that this
1620 * functionality is disabled for KMC_SLAB caches which are backed by the
1621 * Linux slab.
1622 */
1623 static int
1624 spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
1625 {
1626 int remaining, rc;
1627
1628 ASSERT(skc->skc_magic == SKC_MAGIC);
1629 ASSERT((skc->skc_flags & KMC_SLAB) == 0);
1630 might_sleep();
1631 *obj = NULL;
1632
1633 /*
1634 * Before allocating a new slab wait for any reaping to complete and
1635 * then return so the local magazine can be rechecked for new objects.
1636 */
1637 if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
1638 rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
1639 TASK_UNINTERRUPTIBLE);
1640 return (rc ? rc : -EAGAIN);
1641 }
1642
1643 /*
1644 * This is handled by dispatching a work request to the global work
1645 * queue. This allows us to asynchronously allocate a new slab while
1646 * retaining the ability to safely fall back to a smaller synchronous
1647 * allocations to ensure forward progress is always maintained.
1648 */
1649 if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
1650 spl_kmem_alloc_t *ska;
1651
1652 ska = kmalloc(sizeof(*ska), flags);
1653 if (ska == NULL) {
1654 clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
1655 wake_up_all(&skc->skc_waitq);
1656 return (-ENOMEM);
1657 }
1658
1659 atomic_inc(&skc->skc_ref);
1660 ska->ska_cache = skc;
1661 ska->ska_flags = flags & ~__GFP_FS;
1662 taskq_init_ent(&ska->ska_tqe);
1663 taskq_dispatch_ent(spl_kmem_cache_taskq,
1664 spl_cache_grow_work, ska, 0, &ska->ska_tqe);
1665 }
1666
1667 /*
1668 * The goal here is to only detect the rare case where a virtual slab
1669 * allocation has deadlocked. We must be careful to minimize the use
1670 * of emergency objects which are more expensive to track. Therefore,
1671 * we set a very long timeout for the asynchronous allocation and if
1672 * the timeout is reached the cache is flagged as deadlocked. From
1673 * this point only new emergency objects will be allocated until the
1674 * asynchronous allocation completes and clears the deadlocked flag.
1675 */
1676 if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
1677 rc = spl_emergency_alloc(skc, flags, obj);
1678 } else {
1679 remaining = wait_event_timeout(skc->skc_waitq,
1680 spl_cache_grow_wait(skc), HZ);
1681
1682 if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) {
1683 spin_lock(&skc->skc_lock);
1684 if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
1685 set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
1686 skc->skc_obj_deadlock++;
1687 }
1688 spin_unlock(&skc->skc_lock);
1689 }
1690
1691 rc = -ENOMEM;
1692 }
1693
1694 return (rc);
1695 }
1696
1697 /*
1698 * Refill a per-cpu magazine with objects from the slabs for this cache.
1699 * Ideally the magazine can be repopulated using existing objects which have
1700 * been released, however if we are unable to locate enough free objects new
1701 * slabs of objects will be created. On success NULL is returned, otherwise
1702 * the address of a single emergency object is returned for use by the caller.
1703 */
1704 static void *
1705 spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
1706 {
1707 spl_kmem_slab_t *sks;
1708 int count = 0, rc, refill;
1709 void *obj = NULL;
1710
1711 ASSERT(skc->skc_magic == SKC_MAGIC);
1712 ASSERT(skm->skm_magic == SKM_MAGIC);
1713
1714 refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
1715 spin_lock(&skc->skc_lock);
1716
1717 while (refill > 0) {
1718 /* No slabs available we may need to grow the cache */
1719 if (list_empty(&skc->skc_partial_list)) {
1720 spin_unlock(&skc->skc_lock);
1721
1722 local_irq_enable();
1723 rc = spl_cache_grow(skc, flags, &obj);
1724 local_irq_disable();
1725
1726 /* Emergency object for immediate use by caller */
1727 if (rc == 0 && obj != NULL)
1728 return (obj);
1729
1730 if (rc)
1731 goto out;
1732
1733 /* Rescheduled to a different CPU, skm is not local */
1734 if (skm != skc->skc_mag[smp_processor_id()])
1735 goto out;
1736
1737 /* Potentially rescheduled to the same CPU but
1738 * allocations may have occurred from this CPU while
1739 * we were sleeping so recalculate max refill. */
1740 refill = MIN(refill, skm->skm_size - skm->skm_avail);
1741
1742 spin_lock(&skc->skc_lock);
1743 continue;
1744 }
1745
1746 /* Grab the next available slab */
1747 sks = list_entry((&skc->skc_partial_list)->next,
1748 spl_kmem_slab_t, sks_list);
1749 ASSERT(sks->sks_magic == SKS_MAGIC);
1750 ASSERT(sks->sks_ref < sks->sks_objs);
1751 ASSERT(!list_empty(&sks->sks_free_list));
1752
1753 /* Consume as many objects as needed to refill the requested
1754 * cache. We must also be careful not to overfill it. */
1755 while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) {
1756 ASSERT(skm->skm_avail < skm->skm_size);
1757 ASSERT(count < skm->skm_size);
1758 skm->skm_objs[skm->skm_avail++] = spl_cache_obj(skc, sks);
1759 }
1760
1761 /* Move slab to skc_complete_list when full */
1762 if (sks->sks_ref == sks->sks_objs) {
1763 list_del(&sks->sks_list);
1764 list_add(&sks->sks_list, &skc->skc_complete_list);
1765 }
1766 }
1767
1768 spin_unlock(&skc->skc_lock);
1769 out:
1770 return (NULL);
1771 }
1772
1773 /*
1774 * Release an object back to the slab from which it came.
1775 */
1776 static void
1777 spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
1778 {
1779 spl_kmem_slab_t *sks = NULL;
1780 spl_kmem_obj_t *sko = NULL;
1781
1782 ASSERT(skc->skc_magic == SKC_MAGIC);
1783 ASSERT(spin_is_locked(&skc->skc_lock));
1784
1785 sko = spl_sko_from_obj(skc, obj);
1786 ASSERT(sko->sko_magic == SKO_MAGIC);
1787 sks = sko->sko_slab;
1788 ASSERT(sks->sks_magic == SKS_MAGIC);
1789 ASSERT(sks->sks_cache == skc);
1790 list_add(&sko->sko_list, &sks->sks_free_list);
1791
1792 sks->sks_age = jiffies;
1793 sks->sks_ref--;
1794 skc->skc_obj_alloc--;
1795
1796 /* Move slab to skc_partial_list when no longer full. Slabs
1797 * are added to the head to keep the partial list in quasi-full
1798 * sorted order. Fuller at the head, emptier at the tail. */
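/* For example (hypothetical counts): head -> [7/8 used] -> [3/8] ->
 * [1/8] -> [0/8] -> tail, so reclamation can harvest the empty slabs
 * which accumulate at the tail. */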
1799 if (sks->sks_ref == (sks->sks_objs - 1)) {
1800 list_del(&sks->sks_list);
1801 list_add(&sks->sks_list, &skc->skc_partial_list);
1802 }
1803
1804 /* Move empty slabs to the end of the partial list so
1805 * they can be easily found and freed during reclamation. */
1806 if (sks->sks_ref == 0) {
1807 list_del(&sks->sks_list);
1808 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1809 skc->skc_slab_alloc--;
1810 }
1811 }
1812
1813 /*
1814 * Allocate an object from the per-cpu magazine, or, if the magazine
1815 * is empty, allocate directly from a slab and repopulate the magazine.
1816 */
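/*
 * Usage sketch (illustrative only): 'skc' is assumed to have been created
 * elsewhere with spl_kmem_cache_create(), and KM_SLEEP callers are
 * guaranteed to eventually succeed.
 *
 *	void *obj;
 *
 *	obj = spl_kmem_cache_alloc(skc, KM_SLEEP);
 *	... use the object ...
 *	spl_kmem_cache_free(skc, obj);
 */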
1817 void *
1818 spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
1819 {
1820 spl_kmem_magazine_t *skm;
1821 void *obj = NULL;
1822
1823 ASSERT(skc->skc_magic == SKC_MAGIC);
1824 ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1825 ASSERT(flags & KM_SLEEP);
1826
1827 atomic_inc(&skc->skc_ref);
1828
1829 /*
1830 * Allocate directly from a Linux slab. All optimizations are left
1831 * to the underlying cache; we only need to guarantee that KM_SLEEP
1832 * callers will never fail.
1833 */
1834 if (skc->skc_flags & KMC_SLAB) {
1835 struct kmem_cache *slc = skc->skc_linux_cache;
1836
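/* Retry until the underlying Linux cache satisfies the allocation;
 * only KM_NOSLEEP callers may see a NULL object here. */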
1837 do {
1838 obj = kmem_cache_alloc(slc, flags | __GFP_COMP);
1839 } while ((obj == NULL) && !(flags & KM_NOSLEEP));
1840
1841 goto ret;
1842 }
1843
1844 local_irq_disable();
1845
1846 restart:
1847 /* Safe to update the per-cpu structure without a lock, but
1848 * in the restart case we must be careful to reacquire
1849 * the local magazine since it may have changed while
1850 * we were growing the cache. */
1851 skm = skc->skc_mag[smp_processor_id()];
1852 ASSERT(skm->skm_magic == SKM_MAGIC);
1853
1854 if (likely(skm->skm_avail)) {
1855 /* Object available in CPU cache, use it */
1856 obj = skm->skm_objs[--skm->skm_avail];
1857 skm->skm_age = jiffies;
1858 } else {
1859 obj = spl_cache_refill(skc, skm, flags);
1860 if (obj == NULL)
1861 goto restart;
1862 }
1863
1864 local_irq_enable();
1865 ASSERT(obj);
1866 ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
1867
1868 ret:
1869 /* Pre-emptively migrate object to CPU L1 cache */
1870 if (obj) {
1871 if (skc->skc_ctor)
1872 skc->skc_ctor(obj, skc->skc_private, flags);
1873 else
1874 prefetchw(obj);
1875 }
1876
1877 atomic_dec(&skc->skc_ref);
1878
1879 return (obj);
1880 }
1881
1882 EXPORT_SYMBOL(spl_kmem_cache_alloc);
1883
1884 /*
1885 * Free an object back to the local per-cpu magazine; there is no
1886 * guarantee that this is the same magazine the object was originally
1887 * allocated from. We may need to flush entries from the magazine
1888 * back to the slabs to make space.
1889 */
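/*
 * Illustrative example (assumed values): with skm_size = 32, skm_avail = 32
 * and skm_refill = 16, the free path below first flushes up to 16 objects
 * back to their slabs via spl_cache_flush() to make room, and then stores
 * the freed object in the magazine.
 */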
1890 void
1891 spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
1892 {
1893 spl_kmem_magazine_t *skm;
1894 unsigned long flags;
1895
1896 ASSERT(skc->skc_magic == SKC_MAGIC);
1897 ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1898 atomic_inc(&skc->skc_ref);
1899
1900 /*
1901 * Run the destructor
1902 */
1903 if (skc->skc_dtor)
1904 skc->skc_dtor(obj, skc->skc_private);
1905
1906 /*
1907 * Free the object back to the underlying Linux slab.
1908 */
1909 if (skc->skc_flags & KMC_SLAB) {
1910 kmem_cache_free(skc->skc_linux_cache, obj);
1911 goto out;
1912 }
1913
1914 /*
1915 * Only virtual slabs may have emergency objects and these objects
1916 * are guaranteed to have physical addresses. They must be removed
1917 * from the tree of emergency objects and then freed.
1918 */
1919 if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj)) {
1920 spl_emergency_free(skc, obj);
1921 goto out;
1922 }
1923
1924 local_irq_save(flags);
1925
1926 /* Safe to update the per-cpu structure without a lock, but
1927 * since no remote memory allocation tracking is performed
1928 * it is entirely possible to allocate an object from one
1929 * CPU cache and return it to another. */
1930 skm = skc->skc_mag[smp_processor_id()];
1931 ASSERT(skm->skm_magic == SKM_MAGIC);
1932
1933 /* Per-CPU cache full, flush it to make space */
1934 if (unlikely(skm->skm_avail >= skm->skm_size))
1935 spl_cache_flush(skc, skm, skm->skm_refill);
1936
1937 /* Available space in cache, use it */
1938 skm->skm_objs[skm->skm_avail++] = obj;
1939
1940 local_irq_restore(flags);
1941 out:
1942 atomic_dec(&skc->skc_ref);
1943 }
1944 EXPORT_SYMBOL(spl_kmem_cache_free);
1945
1946 /*
1947 * The generic shrinker function for all caches. Under Linux a shrinker
1948 * may not be tightly coupled with a slab cache. In fact Linux always
1949 * systematically tries calling all registered shrinker callbacks which
1950 * report that they contain unused objects. Because of this we only
1951 * register one shrinker function in the shim layer for all slab caches.
1952 * We always attempt to shrink all caches when this generic shrinker
1953 * is called.
1954 *
1955 * If sc->nr_to_scan is zero, the caller is requesting a query of the
1956 * number of objects which can potentially be freed. If it is nonzero,
1957 * the request is to free that many objects.
1958 *
1959 * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks
1960 * in struct shrinker and also require the shrinker to return the number
1961 * of objects freed.
1962 *
1963 * Older kernels require the shrinker to return the number of freeable
1964 * objects remaining after freeing nr_to_scan of them.
1965 *
1966 * Linux semantics differ from those under Solaris, which are to
1967 * free all available objects which may (and probably will) be more
1968 * objects than the requested nr_to_scan.
1969 */
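/*
 * Illustrative sketch (hypothetical values) of the two cases handled below,
 * as a caller of this callback might set them up:
 *
 *	struct shrink_control sc;
 *
 *	sc.gfp_mask = GFP_KERNEL;
 *	sc.nr_to_scan = 0;	(query: return the count of freeable objects)
 *	sc.nr_to_scan = 64;	(scan: try to reap roughly 64 objects)
 *
 * With HAVE_SPLIT_SHRINKER_CALLBACK defined the scan case returns the number
 * of objects actually freed; otherwise the number still allocated is returned.
 */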
1970 static spl_shrinker_t
1971 __spl_kmem_cache_generic_shrinker(struct shrinker *shrink,
1972 struct shrink_control *sc)
1973 {
1974 spl_kmem_cache_t *skc;
1975 int alloc = 0;
1976
1977 down_read(&spl_kmem_cache_sem);
1978 list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
1979 if (sc->nr_to_scan) {
1980 #ifdef HAVE_SPLIT_SHRINKER_CALLBACK
1981 uint64_t oldalloc = skc->skc_obj_alloc;
1982 spl_kmem_cache_reap_now(skc,
1983 MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1));
1984 if (oldalloc > skc->skc_obj_alloc)
1985 alloc += oldalloc - skc->skc_obj_alloc;
1986 #else
1987 spl_kmem_cache_reap_now(skc,
1988 MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1));
1989 alloc += skc->skc_obj_alloc;
1990 #endif /* HAVE_SPLIT_SHRINKER_CALLBACK */
1991 } else {
1992 /* Request to query number of freeable objects */
1993 alloc += skc->skc_obj_alloc;
1994 }
1995 }
1996 up_read(&spl_kmem_cache_sem);
1997
1998 /*
1999 * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass.
2000 * This functionality only exists to work around a rare issue where
2001 * shrink_slab() is repeatedly invoked by many cores causing the
2002 * system to thrash.
2003 */
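/*
 * Note: spl_kmem_cache_reclaim is exposed as a module parameter, so this
 * behavior can presumably be toggled at runtime via
 * /sys/module/spl/parameters/spl_kmem_cache_reclaim (path assumes the
 * module is loaded under the name "spl").
 */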
2004 if ((spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) && sc->nr_to_scan)
2005 return (SHRINK_STOP);
2006
2007 return (MAX(alloc, 0));
2008 }
2009
2010 SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker);
2011
2012 /*
2013 * Call the registered reclaim function for a cache. Depending on how
2014 * many and which objects are released it may simply repopulate the
2015 * local magazine which will then need to age-out. Objects which cannot
2016 * fit in the magazine will be released back to their slabs, which will
2017 * also need to age out before being released. This is all just best
2018 * effort and we do not want to thrash creating and destroying slabs.
2019 */
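/*
 * Illustrative example (assumed values): the generic shrinker above scales
 * the object-based sc->nr_to_scan down by the per-slab object count before
 * calling this function, e.g. with skc_slab_objs = 32 (fls64(32) = 6) and
 * sc->nr_to_scan = 512 the requested count is MAX(512 >> 6, 1) = 8.
 */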
2020 void
2021 spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
2022 {
2023 ASSERT(skc->skc_magic == SKC_MAGIC);
2024 ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
2025
2026 atomic_inc(&skc->skc_ref);
2027
2028 /*
2029 * Execute the registered reclaim callback if it exists. The
2030 * per-cpu caches will be drained when KMC_EXPIRE_MEM is set.
2031 */
2032 if (skc->skc_flags & KMC_SLAB) {
2033 if (skc->skc_reclaim)
2034 skc->skc_reclaim(skc->skc_private);
2035
2036 if (spl_kmem_cache_expire & KMC_EXPIRE_MEM)
2037 kmem_cache_shrink(skc->skc_linux_cache);
2038
2039 goto out;
2040 }
2041
2042 /*
2043 * Prevent concurrent cache reaping when contended.
2044 */
2045 if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags))
2046 goto out;
2047
2048 /*
2049 * When a reclaim function is available it may be invoked repeatedly
2050 * until at least a single slab can be freed. This ensures that we
2051 * do free memory back to the system. This helps minimize the chance
2052 * of an OOM event when the bulk of memory is used by the slab.
2053 *
2054 * When free slabs are already available the reclaim callback will be
2055 * skipped. Additionally, if no forward progress is detected despite
2056 * invoking the reclaim function, the cache is skipped to avoid deadlock.
2057 *
2058 * Longer term this would be the correct place to add the code which
2059 * repacks the slabs in order to minimize fragmentation.
2060 */
2061 if (skc->skc_reclaim) {
2062 uint64_t objects = UINT64_MAX;
2063 int do_reclaim;
2064
2065 do {
2066 spin_lock(&skc->skc_lock);
2067 do_reclaim =
2068 (skc->skc_slab_total > 0) &&
2069 ((skc->skc_slab_total - skc->skc_slab_alloc) == 0) &&
2070 (skc->skc_obj_alloc < objects);
2071
2072 objects = skc->skc_obj_alloc;
2073 spin_unlock(&skc->skc_lock);
2074
2075 if (do_reclaim)
2076 skc->skc_reclaim(skc->skc_private);
2077
2078 } while (do_reclaim);
2079 }
2080
2081 /* Reclaim from the magazine then the slabs ignoring age and delay. */
2082 if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) {
2083 spl_kmem_magazine_t *skm;
2084 unsigned long irq_flags;
2085
2086 local_irq_save(irq_flags);
2087 skm = skc->skc_mag[smp_processor_id()];
2088 spl_cache_flush(skc, skm, skm->skm_avail);
2089 local_irq_restore(irq_flags);
2090 }
2091
2092 spl_slab_reclaim(skc, count, 1);
2093 clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
2094 smp_wmb();
2095 wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
2096 out:
2097 atomic_dec(&skc->skc_ref);
2098 }
2099 EXPORT_SYMBOL(spl_kmem_cache_reap_now);
2100
2101 /*
2102 * Reap all free slabs from all registered caches.
2103 */
2104 void
2105 spl_kmem_reap(void)
2106 {
2107 struct shrink_control sc;
2108
2109 sc.nr_to_scan = KMC_REAP_CHUNK;
2110 sc.gfp_mask = GFP_KERNEL;
2111
2112 (void) __spl_kmem_cache_generic_shrinker(NULL, &sc);
2113 }
2114 EXPORT_SYMBOL(spl_kmem_reap);
2115
2116 #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
2117 static char *
2118 spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
2119 {
2120 int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
2121 int i, flag = 1;
2122
2123 ASSERT(str != NULL && len >= 17);
2124 memset(str, 0, len);
2125
2126 /* Check for a fully printable string, and while we are at
2127 * it place the printable characters in the passed buffer. */
2128 for (i = 0; i < size; i++) {
2129 str[i] = ((char *)(kd->kd_addr))[i];
2130 if (isprint(str[i])) {
2131 continue;
2132 } else {
2133 /* Enough printable characters were found to make
2134 * it worthwhile to print this buffer as ascii. */
2135 if (i > min)
2136 break;
2137
2138 flag = 0;
2139 break;
2140 }
2141 }
2142
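/* Not printable: dump every other byte of the first 16 bytes as hex. */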
2143 if (!flag) {
2144 sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
2145 *((uint8_t *)kd->kd_addr),
2146 *((uint8_t *)kd->kd_addr + 2),
2147 *((uint8_t *)kd->kd_addr + 4),
2148 *((uint8_t *)kd->kd_addr + 6),
2149 *((uint8_t *)kd->kd_addr + 8),
2150 *((uint8_t *)kd->kd_addr + 10),
2151 *((uint8_t *)kd->kd_addr + 12),
2152 *((uint8_t *)kd->kd_addr + 14));
2153 }
2154
2155 return (str);
2156 }
2157
2158 static int
2159 spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
2160 {
2161 int i;
2162
2163 spin_lock_init(lock);
2164 INIT_LIST_HEAD(list);
2165
2166 for (i = 0; i < size; i++)
2167 INIT_HLIST_HEAD(&kmem_table[i]);
2168
2169 return (0);
2170 }
2171
2172 static void
2173 spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
2174 {
2175 unsigned long flags;
2176 kmem_debug_t *kd;
2177 char str[17];
2178
2179 spin_lock_irqsave(lock, flags);
2180 if (!list_empty(list))
2181 printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
2182 "size", "data", "func", "line");
2183
2184 list_for_each_entry(kd, list, kd_list)
2185 printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
2186 (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
2187 kd->kd_func, kd->kd_line);
2188
2189 spin_unlock_irqrestore(lock, flags);
2190 }
2191 #else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
2192 #define spl_kmem_init_tracking(list, lock, size)
2193 #define spl_kmem_fini_tracking(list, lock)
2194 #endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
2195
2196 int
2197 spl_kmem_init(void)
2198 {
2199 int rc = 0;
2200
2201 #ifdef DEBUG_KMEM
2202 kmem_alloc_used_set(0);
2203 vmem_alloc_used_set(0);
2204
2205 spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
2206 spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE);
2207 #endif
2208
2209 init_rwsem(&spl_kmem_cache_sem);
2210 INIT_LIST_HEAD(&spl_kmem_cache_list);
2211 spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
2212 1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE);
2213
2214 spl_register_shrinker(&spl_kmem_cache_shrinker);
2215
2216 return (rc);
2217 }
2218
2219 void
2220 spl_kmem_fini(void)
2221 {
2222 spl_unregister_shrinker(&spl_kmem_cache_shrinker);
2223 taskq_destroy(spl_kmem_cache_taskq);
2224
2225 #ifdef DEBUG_KMEM
2226 /* Display all unreclaimed memory addresses, including the
2227 * allocation size and the first few bytes of what's located
2228 * at that address to aid in debugging. Performance is not
2229 * a serious concern here since it is module unload time. */
2230 if (kmem_alloc_used_read() != 0)
2231 printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n",
2232 kmem_alloc_used_read(), kmem_alloc_max);
2233
2234 if (vmem_alloc_used_read() != 0)
2235 printk(KERN_WARNING "vmem leaked %ld/%llu bytes\n",
2236 vmem_alloc_used_read(), vmem_alloc_max);
2237
2238 spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
2239 spl_kmem_fini_tracking(&vmem_list, &vmem_lock);
2240 #endif /* DEBUG_KMEM */
2241 }