module/os/linux/spl/spl-kmem-cache.c

   1 /*
   2  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
   3  *  Copyright (C) 2007 The Regents of the University of California.
   4  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
   5  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
   6  *  UCRL-CODE-235197
   7  *
   8  *  This file is part of the SPL, Solaris Porting Layer.
   9  *  For details, see <http://zfsonlinux.org/>.
  10  *
  11  *  The SPL is free software; you can redistribute it and/or modify it
  12  *  under the terms of the GNU General Public License as published by the
  13  *  Free Software Foundation; either version 2 of the License, or (at your
  14  *  option) any later version.
  15  *
  16  *  The SPL is distributed in the hope that it will be useful, but WITHOUT
  17  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  18  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  19  *  for more details.
  20  *
  21  *  You should have received a copy of the GNU General Public License along
  22  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  23  */
  24
  25 #include <linux/percpu_compat.h>
  26 #include <sys/kmem.h>
  27 #include <sys/kmem_cache.h>
  28 #include <sys/taskq.h>
  29 #include <sys/timer.h>
  30 #include <sys/vmem.h>
  31 #include <sys/wait.h>
  32 #include <linux/slab.h>
  33 #include <linux/swap.h>
  34 #include <linux/prefetch.h>
  35
   36 /*
   37  * Within the scope of this file (spl-kmem-cache.c) the kmem_cache_*
   38  * macro definitions are #undef'd so that these names refer to the real
   39  * Linux slab allocator functions.  */
   40 #undef kmem_cache_destroy
   41 #undef kmem_cache_create
   42 #undef kmem_cache_alloc
   43 #undef kmem_cache_free
  44
  45
   46 /*
   47  * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}()
   48  * with smp_mb__{before,after}_atomic() because they were redundant. This is
   49  * only used inside our SLAB allocator, so we implement an internal wrapper
   50  * here to give us smp_mb__{before,after}_atomic() on older kernels.
       *
       * Each wrapper sits behind an #ifndef, so it is only defined when the
       * name is not already a macro, and it simply forwards its argument to
       * the matching smp_mb__{before,after}_clear_bit() barrier.
   51  */
   52 #ifndef smp_mb__before_atomic
   53 #define smp_mb__before_atomic(x) smp_mb__before_clear_bit(x)
   54 #endif
   55
   56 #ifndef smp_mb__after_atomic
   57 #define smp_mb__after_atomic(x) smp_mb__after_clear_bit(x)
   58 #endif
  59
   60 /* BEGIN CSTYLED */
   61
   62 /*
   63  * Cache magazines are an optimization designed to minimize the cost of
   64  * allocating memory.  They do this by keeping a per-cpu cache of recently
   65  * freed objects, which can then be reallocated without taking a lock. This
   66  * can improve performance on highly contended caches.  However, because
   67  * objects in magazines will prevent otherwise empty slabs from being
   68  * immediately released this may not be ideal for low memory machines.
   69  *
   70  * For this reason spl_kmem_cache_magazine_size can be used to set a maximum
   71  * magazine size.  When this value is set to 0 the magazine size will be
   72  * automatically determined based on the object size.  Otherwise magazines
   73  * will be limited to 2-256 objects per magazine (i.e. per cpu).  Magazines
   74  * may never be entirely disabled in this implementation.
   75  */
   76 unsigned int spl_kmem_cache_magazine_size = 0;
   77 module_param(spl_kmem_cache_magazine_size, uint, 0444);
   78 MODULE_PARM_DESC(spl_kmem_cache_magazine_size,
   79         "Default magazine size (2-256), set automatically (0)");
   80
   81 /*
   82  * The default behavior is to report the number of objects remaining in the
   83  * cache.  This allows the Linux VM to repeatedly reclaim objects from the
   84  * cache when memory is low to satisfy other memory allocations.  Alternately,
   85  * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache
   86  * is reclaimed.  This may increase the likelihood of out of memory events.
   87  */
   88 unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */;
   89 module_param(spl_kmem_cache_reclaim, uint, 0644);
   90 MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)");
   91
      /*
       * Target number of objects per slab.  spl_slab_size() multiplies this
       * by the aligned object size to obtain the initial slab size target.
       */
   92 unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
   93 module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
   94 MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");
   95
      /*
       * Upper bound on the size of a single slab, expressed in megabytes;
       * spl_slab_size() scales it by 1024 * 1024 before applying it.
       */
   96 unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE;
   97 module_param(spl_kmem_cache_max_size, uint, 0644);
   98 MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
   99
  100 /*
  101  * For small objects the Linux slab allocator should be used to make the most
  102  * efficient use of the memory.  However, large objects are not supported by
  103  * the Linux slab and therefore the SPL implementation is preferred.  A cutoff
  104  * of 16K was determined to be optimal for architectures using 4K pages.
       *
       * With any other page size the limit defaults to 0.  A zero limit means
       * spl_kmem_cache_create() never picks the Linux slab automatically,
       * because it only does so when this value is non-zero.
  105  */
  106 #if PAGE_SIZE == 4096
  107 unsigned int spl_kmem_cache_slab_limit = 16384;
  108 #else
  109 unsigned int spl_kmem_cache_slab_limit = 0;
  110 #endif
  111 module_param(spl_kmem_cache_slab_limit, uint, 0644);
  112 MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
  113         "Objects less than N bytes use the Linux slab");
  114
  115 /*
  116  * This value defaults to a threshold designed to avoid allocations which
  117  * have been deemed costly by the kernel.
  118  */
  119 unsigned int spl_kmem_cache_kmem_limit =
  120         ((1 << (PAGE_ALLOC_COSTLY_ORDER - 1)) * PAGE_SIZE) /
  121         SPL_KMEM_CACHE_OBJ_PER_SLAB;
  122 module_param(spl_kmem_cache_kmem_limit, uint, 0644);
  123 MODULE_PARM_DESC(spl_kmem_cache_kmem_limit,
  124         "Objects less than N bytes use the kmalloc");
  125
  126 /*
  127  * The number of threads available to allocate new slabs for caches.  This
  128  * should not need to be tuned but it is available for performance analysis.
  129  */
  130 unsigned int spl_kmem_cache_kmem_threads = 4;
  131 module_param(spl_kmem_cache_kmem_threads, uint, 0444);
  132 MODULE_PARM_DESC(spl_kmem_cache_kmem_threads,
  133         "Number of spl_kmem_cache threads");
  134 /* END CSTYLED */
 135
 136 /*
 137  * Slab allocation interfaces
 138  *
 139  * While the Linux slab implementation was inspired by the Solaris
 140  * implementation I cannot use it to emulate the Solaris APIs.  I
 141  * require two features which are not provided by the Linux slab.
 142  *
 143  * 1) Constructors AND destructors.  Recent versions of the Linux
 144  *    kernel have removed support for destructors.  This is a deal
 145  *    breaker for the SPL which contains particularly expensive
 146  *    initializers for mutex's, condition variables, etc.  We also
 147  *    require a minimal level of cleanup for these data types unlike
  148  *    many Linux data types which do not need to be explicitly destroyed.
 149  *
  150  * 2) Virtual address space backed slab.  Callers of the Solaris slab
  151  *    expect it to work well for both small and very large allocations.
  152  *    Because of memory fragmentation the Linux slab which is backed
  153  *    by kmalloc'ed memory performs very badly when confronted with
  154  *    large numbers of large allocations.  Basing the slab on the
  155  *    virtual address space removes the need for contiguous pages
  156  *    and greatly improves performance for large allocations.
 157  *
 158  * For these reasons, the SPL has its own slab implementation with
 159  * the needed features.  It is not as highly optimized as either the
 160  * Solaris or Linux slabs, but it should get me most of what is
 161  * needed until it can be optimized or obsoleted by another approach.
 162  *
 163  * One serious concern I do have about this method is the relatively
 164  * small virtual address space on 32bit arches.  This will seriously
 165  * constrain the size of the slab caches and their performance.
 166  */
 167
  168 struct list_head spl_kmem_cache_list;   /* List of caches */
  169 struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
  170 taskq_t *spl_kmem_cache_taskq;          /* Task queue for aging / reclaim */
  171
      /*
       * Forward declaration: spl_cache_flush() below calls spl_cache_shrink()
       * before any definition of it appears in this part of the file.
       */
  172 static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);
 173
 174 static void *
 175 kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
 176 {
 177         gfp_t lflags = kmem_flags_convert(flags);
 178         void *ptr;
 179
 180         if (skc->skc_flags & KMC_KMEM) {
 181                 ASSERT(ISP2(size));
 182                 ptr = (void *)__get_free_pages(lflags, get_order(size));
 183         } else {
 184                 ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM);
 185         }
 186
 187         /* Resulting allocated memory will be page aligned */
 188         ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
 189
 190         return (ptr);
 191 }
 192
 193 static void
 194 kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
 195 {
 196         ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
 197
 198         /*
 199          * The Linux direct reclaim path uses this out of band value to
 200          * determine if forward progress is being made.  Normally this is
 201          * incremented by kmem_freepages() which is part of the various
 202          * Linux slab implementations.  However, since we are using none
 203          * of that infrastructure we are responsible for incrementing it.
 204          */
 205         if (current->reclaim_state)
 206                 current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
 207
 208         if (skc->skc_flags & KMC_KMEM) {
 209                 ASSERT(ISP2(size));
 210                 free_pages((unsigned long)ptr, get_order(size));
 211         } else {
 212                 vfree(ptr);
 213         }
 214 }
 215
 216 /*
 217  * Required space for each aligned sks.
 218  */
 219 static inline uint32_t
 220 spl_sks_size(spl_kmem_cache_t *skc)
 221 {
 222         return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t),
 223             skc->skc_obj_align, uint32_t));
 224 }
 225
 226 /*
 227  * Required space for each aligned object.
 228  */
 229 static inline uint32_t
 230 spl_obj_size(spl_kmem_cache_t *skc)
 231 {
 232         uint32_t align = skc->skc_obj_align;
 233
 234         return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
 235             P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t));
 236 }
 237
 238 uint64_t
 239 spl_kmem_cache_inuse(kmem_cache_t *cache)
 240 {
 241         return (cache->skc_obj_total);
 242 }
 243 EXPORT_SYMBOL(spl_kmem_cache_inuse);
 244
 245 uint64_t
 246 spl_kmem_cache_entry_size(kmem_cache_t *cache)
 247 {
 248         return (cache->skc_obj_size);
 249 }
 250 EXPORT_SYMBOL(spl_kmem_cache_entry_size);
 251
 252 /*
 253  * Lookup the spl_kmem_object_t for an object given that object.
 254  */
 255 static inline spl_kmem_obj_t *
 256 spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
 257 {
 258         return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
 259             skc->skc_obj_align, uint32_t));
 260 }
 261
 262 /*
 263  * It's important that we pack the spl_kmem_obj_t structure and the
 264  * actual objects in to one large address space to minimize the number
 265  * of calls to the allocator.  It is far better to do a few large
 266  * allocations and then subdivide it ourselves.  Now which allocator
 267  * we use requires balancing a few trade offs.
 268  *
  269  * For small objects we use kmem_alloc() because as long as you are
  270  * only requesting a small number of pages (ideally just one) it's cheap.
 271  * However, when you start requesting multiple pages with kmem_alloc()
 272  * it gets increasingly expensive since it requires contiguous pages.
 273  * For this reason we shift to vmem_alloc() for slabs of large objects
 274  * which removes the need for contiguous pages.  We do not use
 275  * vmem_alloc() in all cases because there is significant locking
 276  * overhead in __get_vm_area_node().  This function takes a single
 277  * global lock when acquiring an available virtual address range which
 278  * serializes all vmem_alloc()'s for all slab caches.  Using slightly
 279  * different allocation functions for small and large objects should
 280  * give us the best of both worlds.
 281  *
 282  * +------------------------+
 283  * | spl_kmem_slab_t --+-+  |
 284  * | skc_obj_size    <-+ |  |
 285  * | spl_kmem_obj_t      |  |
 286  * | skc_obj_size    <---+  |
 287  * | spl_kmem_obj_t      |  |
 288  * | ...                 v  |
 289  * +------------------------+
 290  */
 291 static spl_kmem_slab_t *
 292 spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
 293 {
 294         spl_kmem_slab_t *sks;
 295         void *base;
 296         uint32_t obj_size;
 297
 298         base = kv_alloc(skc, skc->skc_slab_size, flags);
 299         if (base == NULL)
 300                 return (NULL);
 301
 302         sks = (spl_kmem_slab_t *)base;
 303         sks->sks_magic = SKS_MAGIC;
 304         sks->sks_objs = skc->skc_slab_objs;
 305         sks->sks_age = jiffies;
 306         sks->sks_cache = skc;
 307         INIT_LIST_HEAD(&sks->sks_list);
 308         INIT_LIST_HEAD(&sks->sks_free_list);
 309         sks->sks_ref = 0;
 310         obj_size = spl_obj_size(skc);
 311
 312         for (int i = 0; i < sks->sks_objs; i++) {
 313                 void *obj = base + spl_sks_size(skc) + (i * obj_size);
 314
 315                 ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
 316                 spl_kmem_obj_t *sko = spl_sko_from_obj(skc, obj);
 317                 sko->sko_addr = obj;
 318                 sko->sko_magic = SKO_MAGIC;
 319                 sko->sko_slab = sks;
 320                 INIT_LIST_HEAD(&sko->sko_list);
 321                 list_add_tail(&sko->sko_list, &sks->sks_free_list);
 322         }
 323
 324         return (sks);
 325 }
 326
 327 /*
 328  * Remove a slab from complete or partial list, it must be called with
 329  * the 'skc->skc_lock' held but the actual free must be performed
 330  * outside the lock to prevent deadlocking on vmem addresses.
 331  */
 332 static void
 333 spl_slab_free(spl_kmem_slab_t *sks,
 334     struct list_head *sks_list, struct list_head *sko_list)
 335 {
 336         spl_kmem_cache_t *skc;
 337
 338         ASSERT(sks->sks_magic == SKS_MAGIC);
 339         ASSERT(sks->sks_ref == 0);
 340
 341         skc = sks->sks_cache;
 342         ASSERT(skc->skc_magic == SKC_MAGIC);
 343
 344         /*
 345          * Update slab/objects counters in the cache, then remove the
 346          * slab from the skc->skc_partial_list.  Finally add the slab
 347          * and all its objects in to the private work lists where the
 348          * destructors will be called and the memory freed to the system.
 349          */
 350         skc->skc_obj_total -= sks->sks_objs;
 351         skc->skc_slab_total--;
 352         list_del(&sks->sks_list);
 353         list_add(&sks->sks_list, sks_list);
 354         list_splice_init(&sks->sks_free_list, sko_list);
 355 }
 356
 357 /*
 358  * Reclaim empty slabs at the end of the partial list.
 359  */
 360 static void
 361 spl_slab_reclaim(spl_kmem_cache_t *skc)
 362 {
 363         spl_kmem_slab_t *sks = NULL, *m = NULL;
 364         spl_kmem_obj_t *sko = NULL, *n = NULL;
 365         LIST_HEAD(sks_list);
 366         LIST_HEAD(sko_list);
 367
 368         /*
 369          * Empty slabs and objects must be moved to a private list so they
 370          * can be safely freed outside the spin lock.  All empty slabs are
 371          * at the end of skc->skc_partial_list, therefore once a non-empty
 372          * slab is found we can stop scanning.
 373          */
 374         spin_lock(&skc->skc_lock);
 375         list_for_each_entry_safe_reverse(sks, m,
 376             &skc->skc_partial_list, sks_list) {
 377
 378                 if (sks->sks_ref > 0)
 379                         break;
 380
 381                 spl_slab_free(sks, &sks_list, &sko_list);
 382         }
 383         spin_unlock(&skc->skc_lock);
 384
 385         /*
 386          * The following two loops ensure all the object destructors are run,
 387          * and the slabs themselves are freed.  This is all done outside the
 388          * skc->skc_lock since this allows the destructor to sleep, and
 389          * allows us to perform a conditional reschedule when a freeing a
 390          * large number of objects and slabs back to the system.
 391          */
 392
 393         list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
 394                 ASSERT(sko->sko_magic == SKO_MAGIC);
 395         }
 396
 397         list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
 398                 ASSERT(sks->sks_magic == SKS_MAGIC);
 399                 kv_free(skc, sks, skc->skc_slab_size);
 400         }
 401 }
 402
 403 static spl_kmem_emergency_t *
 404 spl_emergency_search(struct rb_root *root, void *obj)
 405 {
 406         struct rb_node *node = root->rb_node;
 407         spl_kmem_emergency_t *ske;
 408         unsigned long address = (unsigned long)obj;
 409
 410         while (node) {
 411                 ske = container_of(node, spl_kmem_emergency_t, ske_node);
 412
 413                 if (address < ske->ske_obj)
 414                         node = node->rb_left;
 415                 else if (address > ske->ske_obj)
 416                         node = node->rb_right;
 417                 else
 418                         return (ske);
 419         }
 420
 421         return (NULL);
 422 }
 423
 424 static int
 425 spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
 426 {
 427         struct rb_node **new = &(root->rb_node), *parent = NULL;
 428         spl_kmem_emergency_t *ske_tmp;
 429         unsigned long address = ske->ske_obj;
 430
 431         while (*new) {
 432                 ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
 433
 434                 parent = *new;
 435                 if (address < ske_tmp->ske_obj)
 436                         new = &((*new)->rb_left);
 437                 else if (address > ske_tmp->ske_obj)
 438                         new = &((*new)->rb_right);
 439                 else
 440                         return (0);
 441         }
 442
 443         rb_link_node(&ske->ske_node, parent, new);
 444         rb_insert_color(&ske->ske_node, root);
 445
 446         return (1);
 447 }
 448
 449 /*
 450  * Allocate a single emergency object and track it in a red black tree.
 451  */
 452 static int
 453 spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
 454 {
 455         gfp_t lflags = kmem_flags_convert(flags);
 456         spl_kmem_emergency_t *ske;
 457         int order = get_order(skc->skc_obj_size);
 458         int empty;
 459
 460         /* Last chance use a partial slab if one now exists */
 461         spin_lock(&skc->skc_lock);
 462         empty = list_empty(&skc->skc_partial_list);
 463         spin_unlock(&skc->skc_lock);
 464         if (!empty)
 465                 return (-EEXIST);
 466
 467         ske = kmalloc(sizeof (*ske), lflags);
 468         if (ske == NULL)
 469                 return (-ENOMEM);
 470
 471         ske->ske_obj = __get_free_pages(lflags, order);
 472         if (ske->ske_obj == 0) {
 473                 kfree(ske);
 474                 return (-ENOMEM);
 475         }
 476
 477         spin_lock(&skc->skc_lock);
 478         empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
 479         if (likely(empty)) {
 480                 skc->skc_obj_total++;
 481                 skc->skc_obj_emergency++;
 482                 if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
 483                         skc->skc_obj_emergency_max = skc->skc_obj_emergency;
 484         }
 485         spin_unlock(&skc->skc_lock);
 486
 487         if (unlikely(!empty)) {
 488                 free_pages(ske->ske_obj, order);
 489                 kfree(ske);
 490                 return (-EINVAL);
 491         }
 492
 493         *obj = (void *)ske->ske_obj;
 494
 495         return (0);
 496 }
 497
 498 /*
 499  * Locate the passed object in the red black tree and free it.
 500  */
 501 static int
 502 spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
 503 {
 504         spl_kmem_emergency_t *ske;
 505         int order = get_order(skc->skc_obj_size);
 506
 507         spin_lock(&skc->skc_lock);
 508         ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
 509         if (ske) {
 510                 rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
 511                 skc->skc_obj_emergency--;
 512                 skc->skc_obj_total--;
 513         }
 514         spin_unlock(&skc->skc_lock);
 515
 516         if (ske == NULL)
 517                 return (-ENOENT);
 518
 519         free_pages(ske->ske_obj, order);
 520         kfree(ske);
 521
 522         return (0);
 523 }
 524
 525 /*
 526  * Release objects from the per-cpu magazine back to their slab.  The flush
 527  * argument contains the max number of entries to remove from the magazine.
 528  */
 529 static void
 530 spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
 531 {
 532         spin_lock(&skc->skc_lock);
 533
 534         ASSERT(skc->skc_magic == SKC_MAGIC);
 535         ASSERT(skm->skm_magic == SKM_MAGIC);
 536
 537         int count = MIN(flush, skm->skm_avail);
 538         for (int i = 0; i < count; i++)
 539                 spl_cache_shrink(skc, skm->skm_objs[i]);
 540
 541         skm->skm_avail -= count;
 542         memmove(skm->skm_objs, &(skm->skm_objs[count]),
 543             sizeof (void *) * skm->skm_avail);
 544
 545         spin_unlock(&skc->skc_lock);
 546 }
 547
 548 /*
 549  * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
 550  * When on-slab we want to target spl_kmem_cache_obj_per_slab.  However,
 551  * for very small objects we may end up with more than this so as not
 552  * to waste space in the minimal allocation of a single page.  Also for
 553  * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min,
 554  * lower than this and we will fail.
 555  */
 556 static int
 557 spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
 558 {
 559         uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs;
 560
 561         sks_size = spl_sks_size(skc);
 562         obj_size = spl_obj_size(skc);
 563         max_size = (spl_kmem_cache_max_size * 1024 * 1024);
 564         tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size);
 565
 566         /*
 567          * KMC_KMEM slabs are allocated by __get_free_pages() which
 568          * rounds up to the nearest order.  Knowing this the size
 569          * should be rounded up to the next power of two with a hard
 570          * maximum defined by the maximum allowed allocation order.
 571          */
 572         if (skc->skc_flags & KMC_KMEM) {
 573                 max_size = SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE;
 574                 tgt_size = MIN(max_size,
 575                     PAGE_SIZE * (1 << MAX(get_order(tgt_size) - 1, 1)));
 576         }
 577
 578         if (tgt_size <= max_size) {
 579                 tgt_objs = (tgt_size - sks_size) / obj_size;
 580         } else {
 581                 tgt_objs = (max_size - sks_size) / obj_size;
 582                 tgt_size = (tgt_objs * obj_size) + sks_size;
 583         }
 584
 585         if (tgt_objs == 0)
 586                 return (-ENOSPC);
 587
 588         *objs = tgt_objs;
 589         *size = tgt_size;
 590
 591         return (0);
 592 }
 593
 594 /*
 595  * Make a guess at reasonable per-cpu magazine size based on the size of
 596  * each object and the cost of caching N of them in each magazine.  Long
 597  * term this should really adapt based on an observed usage heuristic.
 598  */
 599 static int
 600 spl_magazine_size(spl_kmem_cache_t *skc)
 601 {
 602         uint32_t obj_size = spl_obj_size(skc);
 603         int size;
 604
 605         if (spl_kmem_cache_magazine_size > 0)
 606                 return (MAX(MIN(spl_kmem_cache_magazine_size, 256), 2));
 607
 608         /* Per-magazine sizes below assume a 4Kib page size */
 609         if (obj_size > (PAGE_SIZE * 256))
 610                 size = 4;  /* Minimum 4Mib per-magazine */
 611         else if (obj_size > (PAGE_SIZE * 32))
 612                 size = 16; /* Minimum 2Mib per-magazine */
 613         else if (obj_size > (PAGE_SIZE))
 614                 size = 64; /* Minimum 256Kib per-magazine */
 615         else if (obj_size > (PAGE_SIZE / 4))
 616                 size = 128; /* Minimum 128Kib per-magazine */
 617         else
 618                 size = 256;
 619
 620         return (size);
 621 }
 622
 623 /*
 624  * Allocate a per-cpu magazine to associate with a specific core.
 625  */
 626 static spl_kmem_magazine_t *
 627 spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
 628 {
 629         spl_kmem_magazine_t *skm;
 630         int size = sizeof (spl_kmem_magazine_t) +
 631             sizeof (void *) * skc->skc_mag_size;
 632
 633         skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
 634         if (skm) {
 635                 skm->skm_magic = SKM_MAGIC;
 636                 skm->skm_avail = 0;
 637                 skm->skm_size = skc->skc_mag_size;
 638                 skm->skm_refill = skc->skc_mag_refill;
 639                 skm->skm_cache = skc;
 640                 skm->skm_cpu = cpu;
 641         }
 642
 643         return (skm);
 644 }
 645
  646 /*
  647  * Free a per-cpu magazine associated with a specific core.
       *
       * The magazine is released with kfree().  It must already be empty
       * (skm_avail == 0), which is asserted here; spl_magazine_destroy()
       * flushes each magazine before handing it to this function.
  648  */
  649 static void
  650 spl_magazine_free(spl_kmem_magazine_t *skm)
  651 {
  652         ASSERT(skm->skm_magic == SKM_MAGIC);
  653         ASSERT(skm->skm_avail == 0);
  654         kfree(skm);
  655 }
 656
 657 /*
 658  * Create all pre-cpu magazines of reasonable sizes.
 659  */
 660 static int
 661 spl_magazine_create(spl_kmem_cache_t *skc)
 662 {
 663         int i = 0;
 664
 665         if (skc->skc_flags & KMC_NOMAGAZINE)
 666                 return (0);
 667
 668         skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) *
 669             num_possible_cpus(), kmem_flags_convert(KM_SLEEP));
 670         skc->skc_mag_size = spl_magazine_size(skc);
 671         skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
 672
 673         for_each_possible_cpu(i) {
 674                 skc->skc_mag[i] = spl_magazine_alloc(skc, i);
 675                 if (!skc->skc_mag[i]) {
 676                         for (i--; i >= 0; i--)
 677                                 spl_magazine_free(skc->skc_mag[i]);
 678
 679                         kfree(skc->skc_mag);
 680                         return (-ENOMEM);
 681                 }
 682         }
 683
 684         return (0);
 685 }
 686
 687 /*
 688  * Destroy all pre-cpu magazines.
 689  */
 690 static void
 691 spl_magazine_destroy(spl_kmem_cache_t *skc)
 692 {
 693         spl_kmem_magazine_t *skm;
 694         int i = 0;
 695
 696         if (skc->skc_flags & KMC_NOMAGAZINE)
 697                 return;
 698
 699         for_each_possible_cpu(i) {
 700                 skm = skc->skc_mag[i];
 701                 spl_cache_flush(skc, skm, skm->skm_avail);
 702                 spl_magazine_free(skm);
 703         }
 704
 705         kfree(skc->skc_mag);
 706 }
 707
 708 /*
  709  * Create an object cache based on the following arguments:
 710  * name         cache name
 711  * size         cache object size
 712  * align        cache object alignment
 713  * ctor         cache object constructor
 714  * dtor         cache object destructor
 715  * reclaim      cache object reclaim
 716  * priv         cache private data for ctor/dtor/reclaim
 717  * vmp          unused must be NULL
 718  * flags
 719  *      KMC_KMEM        Force SPL kmem backed cache
 720  *      KMC_VMEM        Force SPL vmem backed cache
 721  *      KMC_KVMEM       Force kvmem backed SPL cache
 722  *      KMC_SLAB        Force Linux slab backed cache
 723  *      KMC_NODEBUG     Disable debugging (unsupported)
 724  *      KMC_NOHASH      Disable hashing (unsupported)
 725  *      KMC_QCACHE      Disable qcache (unsupported)
 726  *      KMC_NOMAGAZINE  Enabled for kmem/vmem, Disabled for Linux slab
 727  */
 728 spl_kmem_cache_t *
 729 spl_kmem_cache_create(char *name, size_t size, size_t align,
 730     spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, void *reclaim,
 731     void *priv, void *vmp, int flags)
 732 {
 733         gfp_t lflags = kmem_flags_convert(KM_SLEEP);
 734         spl_kmem_cache_t *skc;
 735         int rc;
 736
 737         /*
 738          * Unsupported flags
 739          */
 740         ASSERT0(flags & KMC_NOMAGAZINE);
 741         ASSERT0(flags & KMC_NOHASH);
 742         ASSERT0(flags & KMC_QCACHE);
 743         ASSERT(vmp == NULL);
 744         ASSERT(reclaim == NULL);
 745
 746         might_sleep();
 747
 748         skc = kzalloc(sizeof (*skc), lflags);
 749         if (skc == NULL)
 750                 return (NULL);
 751
 752         skc->skc_magic = SKC_MAGIC;
 753         skc->skc_name_size = strlen(name) + 1;
 754         skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags);
 755         if (skc->skc_name == NULL) {
 756                 kfree(skc);
 757                 return (NULL);
 758         }
 759         strncpy(skc->skc_name, name, skc->skc_name_size);
 760
 761         skc->skc_ctor = ctor;
 762         skc->skc_dtor = dtor;
 763         skc->skc_private = priv;
 764         skc->skc_vmp = vmp;
 765         skc->skc_linux_cache = NULL;
 766         skc->skc_flags = flags;
 767         skc->skc_obj_size = size;
 768         skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
 769         atomic_set(&skc->skc_ref, 0);
 770
 771         INIT_LIST_HEAD(&skc->skc_list);
 772         INIT_LIST_HEAD(&skc->skc_complete_list);
 773         INIT_LIST_HEAD(&skc->skc_partial_list);
 774         skc->skc_emergency_tree = RB_ROOT;
 775         spin_lock_init(&skc->skc_lock);
 776         init_waitqueue_head(&skc->skc_waitq);
 777         skc->skc_slab_fail = 0;
 778         skc->skc_slab_create = 0;
 779         skc->skc_slab_destroy = 0;
 780         skc->skc_slab_total = 0;
 781         skc->skc_slab_alloc = 0;
 782         skc->skc_slab_max = 0;
 783         skc->skc_obj_total = 0;
 784         skc->skc_obj_alloc = 0;
 785         skc->skc_obj_max = 0;
 786         skc->skc_obj_deadlock = 0;
 787         skc->skc_obj_emergency = 0;
 788         skc->skc_obj_emergency_max = 0;
 789
 790         rc = percpu_counter_init_common(&skc->skc_linux_alloc, 0,
 791             GFP_KERNEL);
 792         if (rc != 0) {
 793                 kfree(skc);
 794                 return (NULL);
 795         }
 796
 797         /*
 798          * Verify the requested alignment restriction is sane.
 799          */
 800         if (align) {
 801                 VERIFY(ISP2(align));
 802                 VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN);
 803                 VERIFY3U(align, <=, PAGE_SIZE);
 804                 skc->skc_obj_align = align;
 805         }
 806
 807         /*
 808          * When no specific type of slab is requested (kmem, vmem, or
 809          * linuxslab) then select a cache type based on the object size
 810          * and default tunables.
 811          */
 812         if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB | KMC_KVMEM))) {
 813                 if (spl_kmem_cache_slab_limit &&
 814                     size <= (size_t)spl_kmem_cache_slab_limit) {
 815                         /*
 816                          * Objects smaller than spl_kmem_cache_slab_limit can
 817                          * use the Linux slab for better space-efficiency.
 818                          */
 819                         skc->skc_flags |= KMC_SLAB;
 820                 } else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit) {
 821                         /*
 822                          * Small objects, less than spl_kmem_cache_kmem_limit
 823                          * per object should use kmem because their slabs are
 824                          * small.
 825                          */
 826                         skc->skc_flags |= KMC_KMEM;
 827                 } else {
 828                         /*
 829                          * All other objects are considered large and are
 830                          * placed on kvmem backed slabs.
 831                          */
 832                         skc->skc_flags |= KMC_KVMEM;
 833                 }
 834         }
 835
 836         /*
 837          * Given the type of slab allocate the required resources.
 838          */
 839         if (skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_KVMEM)) {
 840                 rc = spl_slab_size(skc,
 841                     &skc->skc_slab_objs, &skc->skc_slab_size);
 842                 if (rc)
 843                         goto out;
 844
 845                 rc = spl_magazine_create(skc);
 846                 if (rc)
 847                         goto out;
 848         } else {
 849                 unsigned long slabflags = 0;
 850
 851                 if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) {
 852                         rc = EINVAL;
 853                         goto out;
 854                 }
 855
 856 #if defined(SLAB_USERCOPY)
 857                 /*
 858                  * Required for PAX-enabled kernels if the slab is to be
 859                  * used for copying between user and kernel space.
 860                  */
 861                 slabflags |= SLAB_USERCOPY;
 862 #endif
 863
 864 #if defined(HAVE_KMEM_CACHE_CREATE_USERCOPY)
 865                 /*
 866                  * Newer grsec patchset uses kmem_cache_create_usercopy()
 867                  * instead of SLAB_USERCOPY flag
 868                  */
 869                 skc->skc_linux_cache = kmem_cache_create_usercopy(
 870                     skc->skc_name, size, align, slabflags, 0, size, NULL);
 871 #else
 872                 skc->skc_linux_cache = kmem_cache_create(
 873                     skc->skc_name, size, align, slabflags, NULL);
 874 #endif
 875                 if (skc->skc_linux_cache == NULL) {
 876                         rc = ENOMEM;
 877                         goto out;
 878                 }
 879
 880                 skc->skc_flags |= KMC_NOMAGAZINE;
 881         }
 882
 883         down_write(&spl_kmem_cache_sem);
 884         list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
 885         up_write(&spl_kmem_cache_sem);
 886
 887         return (skc);
 888 out:
 889         kfree(skc->skc_name);
 890         percpu_counter_destroy(&skc->skc_linux_alloc);
 891         kfree(skc);
 892         return (NULL);
 893 }
 894 EXPORT_SYMBOL(spl_kmem_cache_create);
 895
 896 /*
 897  * Register a move callback for cache defragmentation.
 898  * XXX: Unimplemented but harmless to stub out for now.
 899  */
 900 void
 901 spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
 902     kmem_cbrc_t (move)(void *, void *, size_t, void *))
 903 {
 904         ASSERT(move != NULL);
 905 }
 906 EXPORT_SYMBOL(spl_kmem_cache_set_move);
 907
 908 /*
 909  * Destroy a cache and all objects associated with the cache.
 910  */
 911 void
 912 spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
 913 {
 914         DECLARE_WAIT_QUEUE_HEAD(wq);
 915         taskqid_t id;
 916
 917         ASSERT(skc->skc_magic == SKC_MAGIC);
 918         ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_KVMEM | KMC_SLAB));
 919
 920         down_write(&spl_kmem_cache_sem);
 921         list_del_init(&skc->skc_list);
 922         up_write(&spl_kmem_cache_sem);
 923
 924         /* Cancel any and wait for any pending delayed tasks */
 925         VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
 926
 927         spin_lock(&skc->skc_lock);
 928         id = skc->skc_taskqid;
 929         spin_unlock(&skc->skc_lock);
 930
 931         taskq_cancel_id(spl_kmem_cache_taskq, id);
 932
 933         /*
 934          * Wait until all current callers complete, this is mainly
 935          * to catch the case where a low memory situation triggers a
 936          * cache reaping action which races with this destroy.
 937          */
 938         wait_event(wq, atomic_read(&skc->skc_ref) == 0);
 939
 940         if (skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_KVMEM)) {
 941                 spl_magazine_destroy(skc);
 942                 spl_slab_reclaim(skc);
 943         } else {
 944                 ASSERT(skc->skc_flags & KMC_SLAB);
 945                 kmem_cache_destroy(skc->skc_linux_cache);
 946         }
 947
 948         spin_lock(&skc->skc_lock);
 949
 950         /*
 951          * Validate there are no objects in use and free all the
 952          * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers.
 953          */
 954         ASSERT3U(skc->skc_slab_alloc, ==, 0);
 955         ASSERT3U(skc->skc_obj_alloc, ==, 0);
 956         ASSERT3U(skc->skc_slab_total, ==, 0);
 957         ASSERT3U(skc->skc_obj_total, ==, 0);
 958         ASSERT3U(skc->skc_obj_emergency, ==, 0);
 959         ASSERT(list_empty(&skc->skc_complete_list));
 960
 961         ASSERT3U(percpu_counter_sum(&skc->skc_linux_alloc), ==, 0);
 962         percpu_counter_destroy(&skc->skc_linux_alloc);
 963
 964         spin_unlock(&skc->skc_lock);
 965
 966         kfree(skc->skc_name);
 967         kfree(skc);
 968 }
 969 EXPORT_SYMBOL(spl_kmem_cache_destroy);
 970
 971 /*
 972  * Allocate an object from a slab attached to the cache.  This is used to
 973  * repopulate the per-cpu magazine caches in batches when they run low.
 974  */
 975 static void *
 976 spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
 977 {
 978         spl_kmem_obj_t *sko;
 979
 980         ASSERT(skc->skc_magic == SKC_MAGIC);
 981         ASSERT(sks->sks_magic == SKS_MAGIC);
 982
 983         sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
 984         ASSERT(sko->sko_magic == SKO_MAGIC);
 985         ASSERT(sko->sko_addr != NULL);
 986
 987         /* Remove from sks_free_list */
 988         list_del_init(&sko->sko_list);
 989
 990         sks->sks_age = jiffies;
 991         sks->sks_ref++;
 992         skc->skc_obj_alloc++;
 993
 994         /* Track max obj usage statistics */
 995         if (skc->skc_obj_alloc > skc->skc_obj_max)
 996                 skc->skc_obj_max = skc->skc_obj_alloc;
 997
 998         /* Track max slab usage statistics */
 999         if (sks->sks_ref == 1) {
1000                 skc->skc_slab_alloc++;
1001
1002                 if (skc->skc_slab_alloc > skc->skc_slab_max)
1003                         skc->skc_slab_max = skc->skc_slab_alloc;
1004         }
1005
1006         return (sko->sko_addr);
1007 }
1008
1009 /*
1010  * Generic slab allocation function to run by the global work queues.
1011  * It is responsible for allocating a new slab, linking it in to the list
1012  * of partial slabs, and then waking any waiters.
1013  */
1014 static int
1015 __spl_cache_grow(spl_kmem_cache_t *skc, int flags)
1016 {
1017         spl_kmem_slab_t *sks;
1018
1019         fstrans_cookie_t cookie = spl_fstrans_mark();
1020         sks = spl_slab_alloc(skc, flags);
1021         spl_fstrans_unmark(cookie);
1022
1023         spin_lock(&skc->skc_lock);
1024         if (sks) {
1025                 skc->skc_slab_total++;
1026                 skc->skc_obj_total += sks->sks_objs;
1027                 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1028
1029                 smp_mb__before_atomic();
1030                 clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
1031                 smp_mb__after_atomic();
1032         }
1033         spin_unlock(&skc->skc_lock);
1034
1035         return (sks == NULL ? -ENOMEM : 0);
1036 }
1037
1038 static void
1039 spl_cache_grow_work(void *data)
1040 {
1041         spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
1042         spl_kmem_cache_t *skc = ska->ska_cache;
1043
1044         int error = __spl_cache_grow(skc, ska->ska_flags);
1045
1046         atomic_dec(&skc->skc_ref);
1047         smp_mb__before_atomic();
1048         clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
1049         smp_mb__after_atomic();
1050         if (error == 0)
1051                 wake_up_all(&skc->skc_waitq);
1052
1053         kfree(ska);
1054 }
1055
1056 /*
1057  * Returns non-zero when a new slab should be available.
1058  */
1059 static int
1060 spl_cache_grow_wait(spl_kmem_cache_t *skc)
1061 {
1062         return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags));
1063 }
1064
1065 /*
1066  * No available objects on any slabs, create a new slab.  Note that this
1067  * functionality is disabled for KMC_SLAB caches which are backed by the
1068  * Linux slab.
1069  */
1070 static int
1071 spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
1072 {
1073         int remaining, rc = 0;
1074
1075         ASSERT0(flags & ~KM_PUBLIC_MASK);
1076         ASSERT(skc->skc_magic == SKC_MAGIC);
1077         ASSERT((skc->skc_flags & KMC_SLAB) == 0);
1078         might_sleep();
1079         *obj = NULL;
1080
1081         /*
1082          * Before allocating a new slab wait for any reaping to complete and
1083          * then return so the local magazine can be rechecked for new objects.
1084          */
1085         if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
1086                 rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
1087                     TASK_UNINTERRUPTIBLE);
1088                 return (rc ? rc : -EAGAIN);
1089         }
1090
1091         /*
1092          * To reduce the overhead of context switch and improve NUMA locality,
1093          * it tries to allocate a new slab in the current process context with
1094          * KM_NOSLEEP flag. If it fails, it will launch a new taskq to do the
1095          * allocation.
1096          *
1097          * However, this can't be applied to KVM_VMEM due to a bug that
1098          * spl_vmalloc() doesn't honor gfp flags in page table allocation.
1099          */
1100         if (!(skc->skc_flags & KMC_VMEM) && !(skc->skc_flags & KMC_KVMEM)) {
1101                 rc = __spl_cache_grow(skc, flags | KM_NOSLEEP);
1102                 if (rc == 0) {
1103                         wake_up_all(&skc->skc_waitq);
1104                         return (0);
1105                 }
1106         }
1107
1108         /*
1109          * This is handled by dispatching a work request to the global work
1110          * queue.  This allows us to asynchronously allocate a new slab while
1111          * retaining the ability to safely fall back to a smaller synchronous
1112          * allocations to ensure forward progress is always maintained.
1113          */
1114         if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
1115                 spl_kmem_alloc_t *ska;
1116
1117                 ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags));
1118                 if (ska == NULL) {
1119                         clear_bit_unlock(KMC_BIT_GROWING, &skc->skc_flags);
1120                         smp_mb__after_atomic();
1121                         wake_up_all(&skc->skc_waitq);
1122                         return (-ENOMEM);
1123                 }
1124
1125                 atomic_inc(&skc->skc_ref);
1126                 ska->ska_cache = skc;
1127                 ska->ska_flags = flags;
1128                 taskq_init_ent(&ska->ska_tqe);
1129                 taskq_dispatch_ent(spl_kmem_cache_taskq,
1130                     spl_cache_grow_work, ska, 0, &ska->ska_tqe);
1131         }
1132
1133         /*
1134          * The goal here is to only detect the rare case where a virtual slab
1135          * allocation has deadlocked.  We must be careful to minimize the use
1136          * of emergency objects which are more expensive to track.  Therefore,
1137          * we set a very long timeout for the asynchronous allocation and if
1138          * the timeout is reached the cache is flagged as deadlocked.  From
1139          * this point only new emergency objects will be allocated until the
1140          * asynchronous allocation completes and clears the deadlocked flag.
1141          */
1142         if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
1143                 rc = spl_emergency_alloc(skc, flags, obj);
1144         } else {
1145                 remaining = wait_event_timeout(skc->skc_waitq,
1146                     spl_cache_grow_wait(skc), HZ / 10);
1147
1148                 if (!remaining) {
1149                         spin_lock(&skc->skc_lock);
1150                         if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
1151                                 set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
1152                                 skc->skc_obj_deadlock++;
1153                         }
1154                         spin_unlock(&skc->skc_lock);
1155                 }
1156
1157                 rc = -ENOMEM;
1158         }
1159
1160         return (rc);
1161 }
1162
1163 /*
1164  * Refill a per-cpu magazine with objects from the slabs for this cache.
1165  * Ideally the magazine can be repopulated using existing objects which have
1166  * been released, however if we are unable to locate enough free objects new
1167  * slabs of objects will be created.  On success NULL is returned, otherwise
1168  * the address of a single emergency object is returned for use by the caller.
1169  */
1170 static void *
1171 spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
1172 {
1173         spl_kmem_slab_t *sks;
1174         int count = 0, rc, refill;
1175         void *obj = NULL;
1176
1177         ASSERT(skc->skc_magic == SKC_MAGIC);
1178         ASSERT(skm->skm_magic == SKM_MAGIC);
1179
1180         refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
1181         spin_lock(&skc->skc_lock);
1182
1183         while (refill > 0) {
1184                 /* No slabs available we may need to grow the cache */
1185                 if (list_empty(&skc->skc_partial_list)) {
1186                         spin_unlock(&skc->skc_lock);
1187
1188                         local_irq_enable();
1189                         rc = spl_cache_grow(skc, flags, &obj);
1190                         local_irq_disable();
1191
1192                         /* Emergency object for immediate use by caller */
1193                         if (rc == 0 && obj != NULL)
1194                                 return (obj);
1195
1196                         if (rc)
1197                                 goto out;
1198
1199                         /* Rescheduled to different CPU skm is not local */
1200                         if (skm != skc->skc_mag[smp_processor_id()])
1201                                 goto out;
1202
1203                         /*
1204                          * Potentially rescheduled to the same CPU but
1205                          * allocations may have occurred from this CPU while
1206                          * we were sleeping so recalculate max refill.
1207                          */
1208                         refill = MIN(refill, skm->skm_size - skm->skm_avail);
1209
1210                         spin_lock(&skc->skc_lock);
1211                         continue;
1212                 }
1213
1214                 /* Grab the next available slab */
1215                 sks = list_entry((&skc->skc_partial_list)->next,
1216                     spl_kmem_slab_t, sks_list);
1217                 ASSERT(sks->sks_magic == SKS_MAGIC);
1218                 ASSERT(sks->sks_ref < sks->sks_objs);
1219                 ASSERT(!list_empty(&sks->sks_free_list));
1220
1221                 /*
1222                  * Consume as many objects as needed to refill the requested
1223                  * cache.  We must also be careful not to overfill it.
1224                  */
1225                 while (sks->sks_ref < sks->sks_objs && refill-- > 0 &&
1226                     ++count) {
1227                         ASSERT(skm->skm_avail < skm->skm_size);
1228                         ASSERT(count < skm->skm_size);
1229                         skm->skm_objs[skm->skm_avail++] =
1230                             spl_cache_obj(skc, sks);
1231                 }
1232
1233                 /* Move slab to skc_complete_list when full */
1234                 if (sks->sks_ref == sks->sks_objs) {
1235                         list_del(&sks->sks_list);
1236                         list_add(&sks->sks_list, &skc->skc_complete_list);
1237                 }
1238         }
1239
1240         spin_unlock(&skc->skc_lock);
1241 out:
1242         return (NULL);
1243 }
1244
1245 /*
1246  * Release an object back to the slab from which it came.
1247  */
1248 static void
1249 spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
1250 {
1251         spl_kmem_slab_t *sks = NULL;
1252         spl_kmem_obj_t *sko = NULL;
1253
1254         ASSERT(skc->skc_magic == SKC_MAGIC);
1255
1256         sko = spl_sko_from_obj(skc, obj);
1257         ASSERT(sko->sko_magic == SKO_MAGIC);
1258         sks = sko->sko_slab;
1259         ASSERT(sks->sks_magic == SKS_MAGIC);
1260         ASSERT(sks->sks_cache == skc);
1261         list_add(&sko->sko_list, &sks->sks_free_list);
1262
1263         sks->sks_age = jiffies;
1264         sks->sks_ref--;
1265         skc->skc_obj_alloc--;
1266
1267         /*
1268          * Move slab to skc_partial_list when no longer full.  Slabs
1269          * are added to the head to keep the partial list is quasi-full
1270          * sorted order.  Fuller at the head, emptier at the tail.
1271          */
1272         if (sks->sks_ref == (sks->sks_objs - 1)) {
1273                 list_del(&sks->sks_list);
1274                 list_add(&sks->sks_list, &skc->skc_partial_list);
1275         }
1276
1277         /*
1278          * Move empty slabs to the end of the partial list so
1279          * they can be easily found and freed during reclamation.
1280          */
1281         if (sks->sks_ref == 0) {
1282                 list_del(&sks->sks_list);
1283                 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1284                 skc->skc_slab_alloc--;
1285         }
1286 }
1287
1288 /*
1289  * Allocate an object from the per-cpu magazine, or if the magazine
1290  * is empty directly allocate from a slab and repopulate the magazine.
1291  */
1292 void *
1293 spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
1294 {
1295         spl_kmem_magazine_t *skm;
1296         void *obj = NULL;
1297
1298         ASSERT0(flags & ~KM_PUBLIC_MASK);
1299         ASSERT(skc->skc_magic == SKC_MAGIC);
1300         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1301
1302         /*
1303          * Allocate directly from a Linux slab.  All optimizations are left
1304          * to the underlying cache we only need to guarantee that KM_SLEEP
1305          * callers will never fail.
1306          */
1307         if (skc->skc_flags & KMC_SLAB) {
1308                 struct kmem_cache *slc = skc->skc_linux_cache;
1309                 do {
1310                         obj = kmem_cache_alloc(slc, kmem_flags_convert(flags));
1311                 } while ((obj == NULL) && !(flags & KM_NOSLEEP));
1312
1313                 if (obj != NULL) {
1314                         /*
1315                          * Even though we leave everything up to the
1316                          * underlying cache we still keep track of
1317                          * how many objects we've allocated in it for
1318                          * better debuggability.
1319                          */
1320                         percpu_counter_inc(&skc->skc_linux_alloc);
1321                 }
1322                 goto ret;
1323         }
1324
1325         local_irq_disable();
1326
1327 restart:
1328         /*
1329          * Safe to update per-cpu structure without lock, but
1330          * in the restart case we must be careful to reacquire
1331          * the local magazine since this may have changed
1332          * when we need to grow the cache.
1333          */
1334         skm = skc->skc_mag[smp_processor_id()];
1335         ASSERT(skm->skm_magic == SKM_MAGIC);
1336
1337         if (likely(skm->skm_avail)) {
1338                 /* Object available in CPU cache, use it */
1339                 obj = skm->skm_objs[--skm->skm_avail];
1340         } else {
1341                 obj = spl_cache_refill(skc, skm, flags);
1342                 if ((obj == NULL) && !(flags & KM_NOSLEEP))
1343                         goto restart;
1344
1345                 local_irq_enable();
1346                 goto ret;
1347         }
1348
1349         local_irq_enable();
1350         ASSERT(obj);
1351         ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
1352
1353 ret:
1354         /* Pre-emptively migrate object to CPU L1 cache */
1355         if (obj) {
1356                 if (obj && skc->skc_ctor)
1357                         skc->skc_ctor(obj, skc->skc_private, flags);
1358                 else
1359                         prefetchw(obj);
1360         }
1361
1362         return (obj);
1363 }
1364 EXPORT_SYMBOL(spl_kmem_cache_alloc);
1365
1366 /*
1367  * Free an object back to the local per-cpu magazine, there is no
1368  * guarantee that this is the same magazine the object was originally
1369  * allocated from.  We may need to flush entire from the magazine
1370  * back to the slabs to make space.
1371  */
1372 void
1373 spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
1374 {
1375         spl_kmem_magazine_t *skm;
1376         unsigned long flags;
1377         int do_reclaim = 0;
1378         int do_emergency = 0;
1379
1380         ASSERT(skc->skc_magic == SKC_MAGIC);
1381         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1382
1383         /*
1384          * Run the destructor
1385          */
1386         if (skc->skc_dtor)
1387                 skc->skc_dtor(obj, skc->skc_private);
1388
1389         /*
1390          * Free the object from the Linux underlying Linux slab.
1391          */
1392         if (skc->skc_flags & KMC_SLAB) {
1393                 kmem_cache_free(skc->skc_linux_cache, obj);
1394                 percpu_counter_dec(&skc->skc_linux_alloc);
1395                 return;
1396         }
1397
1398         /*
1399          * While a cache has outstanding emergency objects all freed objects
1400          * must be checked.  However, since emergency objects will never use
1401          * a virtual address these objects can be safely excluded as an
1402          * optimization.
1403          */
1404         if (!is_vmalloc_addr(obj)) {
1405                 spin_lock(&skc->skc_lock);
1406                 do_emergency = (skc->skc_obj_emergency > 0);
1407                 spin_unlock(&skc->skc_lock);
1408
1409                 if (do_emergency && (spl_emergency_free(skc, obj) == 0))
1410                         return;
1411         }
1412
1413         local_irq_save(flags);
1414
1415         /*
1416          * Safe to update per-cpu structure without lock, but
1417          * no remote memory allocation tracking is being performed
1418          * it is entirely possible to allocate an object from one
1419          * CPU cache and return it to another.
1420          */
1421         skm = skc->skc_mag[smp_processor_id()];
1422         ASSERT(skm->skm_magic == SKM_MAGIC);
1423
1424         /*
1425          * Per-CPU cache full, flush it to make space for this object,
1426          * this may result in an empty slab which can be reclaimed once
1427          * interrupts are re-enabled.
1428          */
1429         if (unlikely(skm->skm_avail >= skm->skm_size)) {
1430                 spl_cache_flush(skc, skm, skm->skm_refill);
1431                 do_reclaim = 1;
1432         }
1433
1434         /* Available space in cache, use it */
1435         skm->skm_objs[skm->skm_avail++] = obj;
1436
1437         local_irq_restore(flags);
1438
1439         if (do_reclaim)
1440                 spl_slab_reclaim(skc);
1441 }
1442 EXPORT_SYMBOL(spl_kmem_cache_free);
1443
1444 /*
1445  * Depending on how many and which objects are released it may simply
1446  * repopulate the local magazine which will then need to age-out.  Objects
1447  * which cannot fit in the magazine will be released back to their slabs
1448  * which will also need to age out before being released.  This is all just
1449  * best effort and we do not want to thrash creating and destroying slabs.
1450  */
1451 void
1452 spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
1453 {
1454         ASSERT(skc->skc_magic == SKC_MAGIC);
1455         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1456
1457         if (skc->skc_flags & KMC_SLAB)
1458                 return;
1459
1460         atomic_inc(&skc->skc_ref);
1461
1462         /*
1463          * Prevent concurrent cache reaping when contended.
1464          */
1465         if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags))
1466                 goto out;
1467
1468         /* Reclaim from the magazine and free all now empty slabs. */
1469         unsigned long irq_flags;
1470         local_irq_save(irq_flags);
1471         spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
1472         spl_cache_flush(skc, skm, skm->skm_avail);
1473         local_irq_restore(irq_flags);
1474
1475         spl_slab_reclaim(skc);
1476         clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags);
1477         smp_mb__after_atomic();
1478         wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
1479 out:
1480         atomic_dec(&skc->skc_ref);
1481 }
1482 EXPORT_SYMBOL(spl_kmem_cache_reap_now);
1483
/*
 * Report whether a cache reap is currently in progress.
 *
 * This is stubbed out for code consistency with other platforms and
 * always returns 0.  There is existing logic to prevent concurrent
 * reaping (see KMC_BIT_REAPING in spl_kmem_cache_reap_now()), so while
 * this is ugly it should do no harm.
 *
 * The parameter list is spelled (void) so that the definition is a proper
 * prototype; an empty () leaves the arguments unchecked and triggers
 * -Wstrict-prototypes.
 */
int
spl_kmem_cache_reap_active(void)
{
        return (0);
}
1494 EXPORT_SYMBOL(spl_kmem_cache_reap_active);
1495
1496 /*
1497  * Reap all free slabs from all registered caches.
1498  */
1499 void
1500 spl_kmem_reap(void)
1501 {
1502         spl_kmem_cache_t *skc = NULL;
1503
1504         down_read(&spl_kmem_cache_sem);
1505         list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
1506                 spl_kmem_cache_reap_now(skc);
1507         }
1508         up_read(&spl_kmem_cache_sem);
1509 }
1510 EXPORT_SYMBOL(spl_kmem_reap);
1511
1512 int
1513 spl_kmem_cache_init(void)
1514 {
1515         init_rwsem(&spl_kmem_cache_sem);
1516         INIT_LIST_HEAD(&spl_kmem_cache_list);
1517         spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
1518             spl_kmem_cache_kmem_threads, maxclsyspri,
1519             spl_kmem_cache_kmem_threads * 8, INT_MAX,
1520             TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
1521
1522         return (0);
1523 }
1524
1525 void
1526 spl_kmem_cache_fini(void)
1527 {
1528         taskq_destroy(spl_kmem_cache_taskq);
1529 }