module/spl/spl-kmem-cache.c

   1 /*****************************************************************************\
   2  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
   3  *  Copyright (C) 2007 The Regents of the University of California.
   4  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
   5  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
   6  *  UCRL-CODE-235197
   7  *
   8  *  This file is part of the SPL, Solaris Porting Layer.
   9  *  For details, see <http://zfsonlinux.org/>.
  10  *
  11  *  The SPL is free software; you can redistribute it and/or modify it
  12  *  under the terms of the GNU General Public License as published by the
  13  *  Free Software Foundation; either version 2 of the License, or (at your
  14  *  option) any later version.
  15  *
  16  *  The SPL is distributed in the hope that it will be useful, but WITHOUT
  17  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  18  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  19  *  for more details.
  20  *
  21  *  You should have received a copy of the GNU General Public License along
  22  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  23  *****************************************************************************
  24  *  Solaris Porting Layer (SPL) Kmem Implementation.
  25 \*****************************************************************************/
  26
  27 #include <sys/kmem.h>
  28 #include <sys/kmem_cache.h>
  29 #include <sys/taskq.h>
  30 #include <sys/timer.h>
  31 #include <sys/vmem.h>
  32 #include <linux/slab.h>
  33 #include <linux/swap.h>
  34 #include <linux/mm_compat.h>
  35 #include <linux/wait_compat.h>
  36
  37 /*
  38  * Within the scope of spl-kmem.c file the kmem_cache_* definitions
  39  * are removed to allow access to the real Linux slab allocator.
  40  */
  41 #undef kmem_cache_destroy
  42 #undef kmem_cache_create
  43 #undef kmem_cache_alloc
  44 #undef kmem_cache_free
  45
  46
  47 /*
  48  * Cache expiration was implemented because it was part of the default Solaris
  49  * kmem_cache behavior.  The idea is that per-cpu objects which haven't been
  50  * accessed in several seconds should be returned to the cache.  On the other
  51  * hand Linux slabs never move objects back to the slabs unless there is
  52  * memory pressure on the system.  By default the Linux method is enabled
  53  * because it has been shown to improve responsiveness on low memory systems.
  54  * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM.
  55  */
  56 unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM;
  57 EXPORT_SYMBOL(spl_kmem_cache_expire);
  58 module_param(spl_kmem_cache_expire, uint, 0644);
  59 MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)");
  60
  61 /*
  62  * The default behavior is to report the number of objects remaining in the
  63  * cache.  This allows the Linux VM to repeatedly reclaim objects from the
  64  * cache when memory is low satisfy other memory allocations.  Alternately,
  65  * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache
  66  * is reclaimed.  This may increase the likelihood of out of memory events.
  67  */
  68 unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */;
  69 module_param(spl_kmem_cache_reclaim, uint, 0644);
  70 MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)");
  71
  72 unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
  73 module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
  74 MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");
  75
  76 unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN;
  77 module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644);
  78 MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min,
  79     "Minimal number of objects per slab");
  80
  81 unsigned int spl_kmem_cache_max_size = 32;
  82 module_param(spl_kmem_cache_max_size, uint, 0644);
  83 MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
  84
  85 /*
  86  * For small objects the Linux slab allocator should be used to make the most
  87  * efficient use of the memory.  However, large objects are not supported by
  88  * the Linux slab and therefore the SPL implementation is preferred.  A cutoff
  89  * of 16K was determined to be optimal for architectures using 4K pages.
  90  */
  91 #if PAGE_SIZE == 4096
  92 unsigned int spl_kmem_cache_slab_limit = 16384;
  93 #else
  94 unsigned int spl_kmem_cache_slab_limit = 0;
  95 #endif
  96 module_param(spl_kmem_cache_slab_limit, uint, 0644);
  97 MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
  98     "Objects less than N bytes use the Linux slab");
  99
 100 unsigned int spl_kmem_cache_kmem_limit = (PAGE_SIZE / 4);
 101 module_param(spl_kmem_cache_kmem_limit, uint, 0644);
 102 MODULE_PARM_DESC(spl_kmem_cache_kmem_limit,
 103     "Objects less than N bytes use the kmalloc");
 104
 105 /*
 106  * Slab allocation interfaces
 107  *
 108  * While the Linux slab implementation was inspired by the Solaris
 109  * implementation I cannot use it to emulate the Solaris APIs.  I
 110  * require two features which are not provided by the Linux slab.
 111  *
 112  * 1) Constructors AND destructors.  Recent versions of the Linux
 113  *    kernel have removed support for destructors.  This is a deal
 114  *    breaker for the SPL which contains particularly expensive
 115  *    initializers for mutex's, condition variables, etc.  We also
 116  *    require a minimal level of cleanup for these data types unlike
 117  *    many Linux data type which do need to be explicitly destroyed.
 118  *
 119  * 2) Virtual address space backed slab.  Callers of the Solaris slab
 120  *    expect it to work well for both small are very large allocations.
 121  *    Because of memory fragmentation the Linux slab which is backed
 122  *    by kmalloc'ed memory performs very badly when confronted with
 123  *    large numbers of large allocations.  Basing the slab on the
 124  *    virtual address space removes the need for contiguous pages
 125  *    and greatly improve performance for large allocations.
 126  *
 127  * For these reasons, the SPL has its own slab implementation with
 128  * the needed features.  It is not as highly optimized as either the
 129  * Solaris or Linux slabs, but it should get me most of what is
 130  * needed until it can be optimized or obsoleted by another approach.
 131  *
 132  * One serious concern I do have about this method is the relatively
 133  * small virtual address space on 32bit arches.  This will seriously
 134  * constrain the size of the slab caches and their performance.
 135  *
 136  * XXX: Improve the partial slab list by carefully maintaining a
 137  *      strict ordering of fullest to emptiest slabs based on
 138  *      the slab reference count.  This guarantees the when freeing
 139  *      slabs back to the system we need only linearly traverse the
 140  *      last N slabs in the list to discover all the freeable slabs.
 141  *
 142  * XXX: NUMA awareness for optionally allocating memory close to a
 143  *      particular core.  This can be advantageous if you know the slab
 144  *      object will be short lived and primarily accessed from one core.
 145  *
 146  * XXX: Slab coloring may also yield performance improvements and would
 147  *      be desirable to implement.
 148  */
 149
 150 struct list_head spl_kmem_cache_list;   /* List of caches */
 151 struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
 152 taskq_t *spl_kmem_cache_taskq;          /* Task queue for ageing / reclaim */
 153
 154 static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);
 155
 156 SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker);
 157 SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker,
 158         spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS);
 159
 160 static void *
 161 kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
 162 {
 163         void *ptr;
 164
 165         ASSERT(ISP2(size));
 166
 167         if (skc->skc_flags & KMC_KMEM)
 168                 ptr = (void *)__get_free_pages(flags | __GFP_COMP,
 169                     get_order(size));
 170         else
 171                 ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
 172
 173         /* Resulting allocated memory will be page aligned */
 174         ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
 175
 176         return ptr;
 177 }
 178
 179 static void
 180 kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
 181 {
 182         ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
 183         ASSERT(ISP2(size));
 184
 185         /*
 186          * The Linux direct reclaim path uses this out of band value to
 187          * determine if forward progress is being made.  Normally this is
 188          * incremented by kmem_freepages() which is part of the various
 189          * Linux slab implementations.  However, since we are using none
 190          * of that infrastructure we are responsible for incrementing it.
 191          */
 192         if (current->reclaim_state)
 193                 current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
 194
 195         if (skc->skc_flags & KMC_KMEM)
 196                 free_pages((unsigned long)ptr, get_order(size));
 197         else
 198                 vfree(ptr);
 199 }
 200
 201 /*
 202  * Required space for each aligned sks.
 203  */
 204 static inline uint32_t
 205 spl_sks_size(spl_kmem_cache_t *skc)
 206 {
 207         return P2ROUNDUP_TYPED(sizeof(spl_kmem_slab_t),
 208                skc->skc_obj_align, uint32_t);
 209 }
 210
 211 /*
 212  * Required space for each aligned object.
 213  */
 214 static inline uint32_t
 215 spl_obj_size(spl_kmem_cache_t *skc)
 216 {
 217         uint32_t align = skc->skc_obj_align;
 218
 219         return P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
 220                P2ROUNDUP_TYPED(sizeof(spl_kmem_obj_t), align, uint32_t);
 221 }
 222
 223 /*
 224  * Lookup the spl_kmem_object_t for an object given that object.
 225  */
 226 static inline spl_kmem_obj_t *
 227 spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
 228 {
 229         return obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
 230                skc->skc_obj_align, uint32_t);
 231 }
 232
 233 /*
 234  * Required space for each offslab object taking in to account alignment
 235  * restrictions and the power-of-two requirement of kv_alloc().
 236  */
 237 static inline uint32_t
 238 spl_offslab_size(spl_kmem_cache_t *skc)
 239 {
 240         return 1UL << (fls64(spl_obj_size(skc)) + 1);
 241 }
 242
 243 /*
 244  * It's important that we pack the spl_kmem_obj_t structure and the
 245  * actual objects in to one large address space to minimize the number
 246  * of calls to the allocator.  It is far better to do a few large
 247  * allocations and then subdivide it ourselves.  Now which allocator
 248  * we use requires balancing a few trade offs.
 249  *
 250  * For small objects we use kmem_alloc() because as long as you are
 251  * only requesting a small number of pages (ideally just one) its cheap.
 252  * However, when you start requesting multiple pages with kmem_alloc()
 253  * it gets increasingly expensive since it requires contiguous pages.
 254  * For this reason we shift to vmem_alloc() for slabs of large objects
 255  * which removes the need for contiguous pages.  We do not use
 256  * vmem_alloc() in all cases because there is significant locking
 257  * overhead in __get_vm_area_node().  This function takes a single
 258  * global lock when acquiring an available virtual address range which
 259  * serializes all vmem_alloc()'s for all slab caches.  Using slightly
 260  * different allocation functions for small and large objects should
 261  * give us the best of both worlds.
 262  *
 263  * KMC_ONSLAB                       KMC_OFFSLAB
 264  *
 265  * +------------------------+       +-----------------+
 266  * | spl_kmem_slab_t --+-+  |       | spl_kmem_slab_t |---+-+
 267  * | skc_obj_size    <-+ |  |       +-----------------+   | |
 268  * | spl_kmem_obj_t      |  |                             | |
 269  * | skc_obj_size    <---+  |       +-----------------+   | |
 270  * | spl_kmem_obj_t      |  |       | skc_obj_size    | <-+ |
 271  * | ...                 v  |       | spl_kmem_obj_t  |     |
 272  * +------------------------+       +-----------------+     v
 273  */
 274 static spl_kmem_slab_t *
 275 spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
 276 {
 277         spl_kmem_slab_t *sks;
 278         spl_kmem_obj_t *sko, *n;
 279         void *base, *obj;
 280         uint32_t obj_size, offslab_size = 0;
 281         int i,  rc = 0;
 282
 283         base = kv_alloc(skc, skc->skc_slab_size, flags);
 284         if (base == NULL)
 285                 return (NULL);
 286
 287         sks = (spl_kmem_slab_t *)base;
 288         sks->sks_magic = SKS_MAGIC;
 289         sks->sks_objs = skc->skc_slab_objs;
 290         sks->sks_age = jiffies;
 291         sks->sks_cache = skc;
 292         INIT_LIST_HEAD(&sks->sks_list);
 293         INIT_LIST_HEAD(&sks->sks_free_list);
 294         sks->sks_ref = 0;
 295         obj_size = spl_obj_size(skc);
 296
 297         if (skc->skc_flags & KMC_OFFSLAB)
 298                 offslab_size = spl_offslab_size(skc);
 299
 300         for (i = 0; i < sks->sks_objs; i++) {
 301                 if (skc->skc_flags & KMC_OFFSLAB) {
 302                         obj = kv_alloc(skc, offslab_size, flags);
 303                         if (!obj) {
 304                                 rc = -ENOMEM;
 305                                 goto out;
 306                         }
 307                 } else {
 308                         obj = base + spl_sks_size(skc) + (i * obj_size);
 309                 }
 310
 311                 ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
 312                 sko = spl_sko_from_obj(skc, obj);
 313                 sko->sko_addr = obj;
 314                 sko->sko_magic = SKO_MAGIC;
 315                 sko->sko_slab = sks;
 316                 INIT_LIST_HEAD(&sko->sko_list);
 317                 list_add_tail(&sko->sko_list, &sks->sks_free_list);
 318         }
 319
 320 out:
 321         if (rc) {
 322                 if (skc->skc_flags & KMC_OFFSLAB)
 323                         list_for_each_entry_safe(sko, n, &sks->sks_free_list,
 324                                                  sko_list)
 325                                 kv_free(skc, sko->sko_addr, offslab_size);
 326
 327                 kv_free(skc, base, skc->skc_slab_size);
 328                 sks = NULL;
 329         }
 330
 331         return (sks);
 332 }
 333
 334 /*
 335  * Remove a slab from complete or partial list, it must be called with
 336  * the 'skc->skc_lock' held but the actual free must be performed
 337  * outside the lock to prevent deadlocking on vmem addresses.
 338  */
 339 static void
 340 spl_slab_free(spl_kmem_slab_t *sks,
 341               struct list_head *sks_list, struct list_head *sko_list)
 342 {
 343         spl_kmem_cache_t *skc;
 344
 345         ASSERT(sks->sks_magic == SKS_MAGIC);
 346         ASSERT(sks->sks_ref == 0);
 347
 348         skc = sks->sks_cache;
 349         ASSERT(skc->skc_magic == SKC_MAGIC);
 350         ASSERT(spin_is_locked(&skc->skc_lock));
 351
 352         /*
 353          * Update slab/objects counters in the cache, then remove the
 354          * slab from the skc->skc_partial_list.  Finally add the slab
 355          * and all its objects in to the private work lists where the
 356          * destructors will be called and the memory freed to the system.
 357          */
 358         skc->skc_obj_total -= sks->sks_objs;
 359         skc->skc_slab_total--;
 360         list_del(&sks->sks_list);
 361         list_add(&sks->sks_list, sks_list);
 362         list_splice_init(&sks->sks_free_list, sko_list);
 363 }
 364
 365 /*
 366  * Traverses all the partial slabs attached to a cache and free those
 367  * which which are currently empty, and have not been touched for
 368  * skc_delay seconds to  avoid thrashing.  The count argument is
 369  * passed to optionally cap the number of slabs reclaimed, a count
 370  * of zero means try and reclaim everything.  When flag is set we
 371  * always free an available slab regardless of age.
 372  */
 373 static void
 374 spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
 375 {
 376         spl_kmem_slab_t *sks, *m;
 377         spl_kmem_obj_t *sko, *n;
 378         LIST_HEAD(sks_list);
 379         LIST_HEAD(sko_list);
 380         uint32_t size = 0;
 381         int i = 0;
 382
 383         /*
 384          * Move empty slabs and objects which have not been touched in
 385          * skc_delay seconds on to private lists to be freed outside
 386          * the spin lock.  This delay time is important to avoid thrashing
 387          * however when flag is set the delay will not be used.
 388          */
 389         spin_lock(&skc->skc_lock);
 390         list_for_each_entry_safe_reverse(sks,m,&skc->skc_partial_list,sks_list){
 391                 /*
 392                  * All empty slabs are at the end of skc->skc_partial_list,
 393                  * therefore once a non-empty slab is found we can stop
 394                  * scanning.  Additionally, stop when reaching the target
 395                  * reclaim 'count' if a non-zero threshold is given.
 396                  */
 397                 if ((sks->sks_ref > 0) || (count && i >= count))
 398                         break;
 399
 400                 if (time_after(jiffies,sks->sks_age+skc->skc_delay*HZ)||flag) {
 401                         spl_slab_free(sks, &sks_list, &sko_list);
 402                         i++;
 403                 }
 404         }
 405         spin_unlock(&skc->skc_lock);
 406
 407         /*
 408          * The following two loops ensure all the object destructors are
 409          * run, any offslab objects are freed, and the slabs themselves
 410          * are freed.  This is all done outside the skc->skc_lock since
 411          * this allows the destructor to sleep, and allows us to perform
 412          * a conditional reschedule when a freeing a large number of
 413          * objects and slabs back to the system.
 414          */
 415         if (skc->skc_flags & KMC_OFFSLAB)
 416                 size = spl_offslab_size(skc);
 417
 418         list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
 419                 ASSERT(sko->sko_magic == SKO_MAGIC);
 420
 421                 if (skc->skc_flags & KMC_OFFSLAB)
 422                         kv_free(skc, sko->sko_addr, size);
 423         }
 424
 425         list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
 426                 ASSERT(sks->sks_magic == SKS_MAGIC);
 427                 kv_free(skc, sks, skc->skc_slab_size);
 428         }
 429 }
 430
 431 static spl_kmem_emergency_t *
 432 spl_emergency_search(struct rb_root *root, void *obj)
 433 {
 434         struct rb_node *node = root->rb_node;
 435         spl_kmem_emergency_t *ske;
 436         unsigned long address = (unsigned long)obj;
 437
 438         while (node) {
 439                 ske = container_of(node, spl_kmem_emergency_t, ske_node);
 440
 441                 if (address < (unsigned long)ske->ske_obj)
 442                         node = node->rb_left;
 443                 else if (address > (unsigned long)ske->ske_obj)
 444                         node = node->rb_right;
 445                 else
 446                         return ske;
 447         }
 448
 449         return NULL;
 450 }
 451
 452 static int
 453 spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
 454 {
 455         struct rb_node **new = &(root->rb_node), *parent = NULL;
 456         spl_kmem_emergency_t *ske_tmp;
 457         unsigned long address = (unsigned long)ske->ske_obj;
 458
 459         while (*new) {
 460                 ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
 461
 462                 parent = *new;
 463                 if (address < (unsigned long)ske_tmp->ske_obj)
 464                         new = &((*new)->rb_left);
 465                 else if (address > (unsigned long)ske_tmp->ske_obj)
 466                         new = &((*new)->rb_right);
 467                 else
 468                         return 0;
 469         }
 470
 471         rb_link_node(&ske->ske_node, parent, new);
 472         rb_insert_color(&ske->ske_node, root);
 473
 474         return 1;
 475 }
 476
 477 /*
 478  * Allocate a single emergency object and track it in a red black tree.
 479  */
 480 static int
 481 spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
 482 {
 483         spl_kmem_emergency_t *ske;
 484         int empty;
 485
 486         /* Last chance use a partial slab if one now exists */
 487         spin_lock(&skc->skc_lock);
 488         empty = list_empty(&skc->skc_partial_list);
 489         spin_unlock(&skc->skc_lock);
 490         if (!empty)
 491                 return (-EEXIST);
 492
 493         ske = kmalloc(sizeof(*ske), flags);
 494         if (ske == NULL)
 495                 return (-ENOMEM);
 496
 497         ske->ske_obj = kmalloc(skc->skc_obj_size, flags);
 498         if (ske->ske_obj == NULL) {
 499                 kfree(ske);
 500                 return (-ENOMEM);
 501         }
 502
 503         spin_lock(&skc->skc_lock);
 504         empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
 505         if (likely(empty)) {
 506                 skc->skc_obj_total++;
 507                 skc->skc_obj_emergency++;
 508                 if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
 509                         skc->skc_obj_emergency_max = skc->skc_obj_emergency;
 510         }
 511         spin_unlock(&skc->skc_lock);
 512
 513         if (unlikely(!empty)) {
 514                 kfree(ske->ske_obj);
 515                 kfree(ske);
 516                 return (-EINVAL);
 517         }
 518
 519         *obj = ske->ske_obj;
 520
 521         return (0);
 522 }
 523
 524 /*
 525  * Locate the passed object in the red black tree and free it.
 526  */
 527 static int
 528 spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
 529 {
 530         spl_kmem_emergency_t *ske;
 531
 532         spin_lock(&skc->skc_lock);
 533         ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
 534         if (likely(ske)) {
 535                 rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
 536                 skc->skc_obj_emergency--;
 537                 skc->skc_obj_total--;
 538         }
 539         spin_unlock(&skc->skc_lock);
 540
 541         if (unlikely(ske == NULL))
 542                 return (-ENOENT);
 543
 544         kfree(ske->ske_obj);
 545         kfree(ske);
 546
 547         return (0);
 548 }
 549
 550 /*
 551  * Release objects from the per-cpu magazine back to their slab.  The flush
 552  * argument contains the max number of entries to remove from the magazine.
 553  */
 554 static void
 555 __spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
 556 {
 557         int i, count = MIN(flush, skm->skm_avail);
 558
 559         ASSERT(skc->skc_magic == SKC_MAGIC);
 560         ASSERT(skm->skm_magic == SKM_MAGIC);
 561         ASSERT(spin_is_locked(&skc->skc_lock));
 562
 563         for (i = 0; i < count; i++)
 564                 spl_cache_shrink(skc, skm->skm_objs[i]);
 565
 566         skm->skm_avail -= count;
 567         memmove(skm->skm_objs, &(skm->skm_objs[count]),
 568                 sizeof(void *) * skm->skm_avail);
 569 }
 570
 571 static void
 572 spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
 573 {
 574         spin_lock(&skc->skc_lock);
 575         __spl_cache_flush(skc, skm, flush);
 576         spin_unlock(&skc->skc_lock);
 577 }
 578
 579 static void
 580 spl_magazine_age(void *data)
 581 {
 582         spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
 583         spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
 584
 585         ASSERT(skm->skm_magic == SKM_MAGIC);
 586         ASSERT(skm->skm_cpu == smp_processor_id());
 587         ASSERT(irqs_disabled());
 588
 589         /* There are no available objects or they are too young to age out */
 590         if ((skm->skm_avail == 0) ||
 591             time_before(jiffies, skm->skm_age + skc->skc_delay * HZ))
 592                 return;
 593
 594         /*
 595          * Because we're executing in interrupt context we may have
 596          * interrupted the holder of this lock.  To avoid a potential
 597          * deadlock return if the lock is contended.
 598          */
 599         if (!spin_trylock(&skc->skc_lock))
 600                 return;
 601
 602         __spl_cache_flush(skc, skm, skm->skm_refill);
 603         spin_unlock(&skc->skc_lock);
 604 }
 605
 606 /*
 607  * Called regularly to keep a downward pressure on the cache.
 608  *
 609  * Objects older than skc->skc_delay seconds in the per-cpu magazines will
 610  * be returned to the caches.  This is done to prevent idle magazines from
 611  * holding memory which could be better used elsewhere.  The delay is
 612  * present to prevent thrashing the magazine.
 613  *
 614  * The newly released objects may result in empty partial slabs.  Those
 615  * slabs should be released to the system.  Otherwise moving the objects
 616  * out of the magazines is just wasted work.
 617  */
 618 static void
 619 spl_cache_age(void *data)
 620 {
 621         spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
 622         taskqid_t id = 0;
 623
 624         ASSERT(skc->skc_magic == SKC_MAGIC);
 625
 626         /* Dynamically disabled at run time */
 627         if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE))
 628                 return;
 629
 630         atomic_inc(&skc->skc_ref);
 631
 632         if (!(skc->skc_flags & KMC_NOMAGAZINE))
 633                 on_each_cpu(spl_magazine_age, skc, 1);
 634
 635         spl_slab_reclaim(skc, skc->skc_reap, 0);
 636
 637         while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) {
 638                 id = taskq_dispatch_delay(
 639                     spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP,
 640                     ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
 641
 642                 /* Destroy issued after dispatch immediately cancel it */
 643                 if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id)
 644                         taskq_cancel_id(spl_kmem_cache_taskq, id);
 645         }
 646
 647         spin_lock(&skc->skc_lock);
 648         skc->skc_taskqid = id;
 649         spin_unlock(&skc->skc_lock);
 650
 651         atomic_dec(&skc->skc_ref);
 652 }
 653
 654 /*
 655  * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
 656  * When on-slab we want to target spl_kmem_cache_obj_per_slab.  However,
 657  * for very small objects we may end up with more than this so as not
 658  * to waste space in the minimal allocation of a single page.  Also for
 659  * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min,
 660  * lower than this and we will fail.
 661  */
 662 static int
 663 spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
 664 {
 665         uint32_t sks_size, obj_size, max_size;
 666
 667         if (skc->skc_flags & KMC_OFFSLAB) {
 668                 *objs = spl_kmem_cache_obj_per_slab;
 669                 *size = P2ROUNDUP(sizeof(spl_kmem_slab_t), PAGE_SIZE);
 670                 return (0);
 671         } else {
 672                 sks_size = spl_sks_size(skc);
 673                 obj_size = spl_obj_size(skc);
 674
 675                 if (skc->skc_flags & KMC_KMEM)
 676                         max_size = ((uint32_t)1 << (MAX_ORDER-3)) * PAGE_SIZE;
 677                 else
 678                         max_size = (spl_kmem_cache_max_size * 1024 * 1024);
 679
 680                 /* Power of two sized slab */
 681                 for (*size = PAGE_SIZE; *size <= max_size; *size *= 2) {
 682                         *objs = (*size - sks_size) / obj_size;
 683                         if (*objs >= spl_kmem_cache_obj_per_slab)
 684                                 return (0);
 685                 }
 686
 687                 /*
 688                  * Unable to satisfy target objects per slab, fall back to
 689                  * allocating a maximally sized slab and assuming it can
 690                  * contain the minimum objects count use it.  If not fail.
 691                  */
 692                 *size = max_size;
 693                 *objs = (*size - sks_size) / obj_size;
 694                 if (*objs >= (spl_kmem_cache_obj_per_slab_min))
 695                         return (0);
 696         }
 697
 698         return (-ENOSPC);
 699 }
 700
 701 /*
 702  * Make a guess at reasonable per-cpu magazine size based on the size of
 703  * each object and the cost of caching N of them in each magazine.  Long
 704  * term this should really adapt based on an observed usage heuristic.
 705  */
 706 static int
 707 spl_magazine_size(spl_kmem_cache_t *skc)
 708 {
 709         uint32_t obj_size = spl_obj_size(skc);
 710         int size;
 711
 712         /* Per-magazine sizes below assume a 4Kib page size */
 713         if (obj_size > (PAGE_SIZE * 256))
 714                 size = 4;  /* Minimum 4Mib per-magazine */
 715         else if (obj_size > (PAGE_SIZE * 32))
 716                 size = 16; /* Minimum 2Mib per-magazine */
 717         else if (obj_size > (PAGE_SIZE))
 718                 size = 64; /* Minimum 256Kib per-magazine */
 719         else if (obj_size > (PAGE_SIZE / 4))
 720                 size = 128; /* Minimum 128Kib per-magazine */
 721         else
 722                 size = 256;
 723
 724         return (size);
 725 }
 726
 727 /*
 728  * Allocate a per-cpu magazine to associate with a specific core.
 729  */
 730 static spl_kmem_magazine_t *
 731 spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
 732 {
 733         spl_kmem_magazine_t *skm;
 734         int size = sizeof(spl_kmem_magazine_t) +
 735                    sizeof(void *) * skc->skc_mag_size;
 736
 737         skm = kmem_alloc_node(size, KM_SLEEP, cpu_to_node(cpu));
 738         if (skm) {
 739                 skm->skm_magic = SKM_MAGIC;
 740                 skm->skm_avail = 0;
 741                 skm->skm_size = skc->skc_mag_size;
 742                 skm->skm_refill = skc->skc_mag_refill;
 743                 skm->skm_cache = skc;
 744                 skm->skm_age = jiffies;
 745                 skm->skm_cpu = cpu;
 746         }
 747
 748         return (skm);
 749 }
 750
 751 /*
 752  * Free a per-cpu magazine associated with a specific core.
 753  */
 754 static void
 755 spl_magazine_free(spl_kmem_magazine_t *skm)
 756 {
 757         int size = sizeof(spl_kmem_magazine_t) +
 758                    sizeof(void *) * skm->skm_size;
 759
 760         ASSERT(skm->skm_magic == SKM_MAGIC);
 761         ASSERT(skm->skm_avail == 0);
 762
 763         kmem_free(skm, size);
 764 }
 765
 766 /*
 767  * Create all pre-cpu magazines of reasonable sizes.
 768  */
 769 static int
 770 spl_magazine_create(spl_kmem_cache_t *skc)
 771 {
 772         int i;
 773
 774         if (skc->skc_flags & KMC_NOMAGAZINE)
 775                 return (0);
 776
 777         skc->skc_mag_size = spl_magazine_size(skc);
 778         skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
 779
 780         for_each_online_cpu(i) {
 781                 skc->skc_mag[i] = spl_magazine_alloc(skc, i);
 782                 if (!skc->skc_mag[i]) {
 783                         for (i--; i >= 0; i--)
 784                                 spl_magazine_free(skc->skc_mag[i]);
 785
 786                         return (-ENOMEM);
 787                 }
 788         }
 789
 790         return (0);
 791 }
 792
 793 /*
 794  * Destroy all pre-cpu magazines.
 795  */
 796 static void
 797 spl_magazine_destroy(spl_kmem_cache_t *skc)
 798 {
 799         spl_kmem_magazine_t *skm;
 800         int i;
 801
 802         if (skc->skc_flags & KMC_NOMAGAZINE)
 803                 return;
 804
 805         for_each_online_cpu(i) {
 806                 skm = skc->skc_mag[i];
 807                 spl_cache_flush(skc, skm, skm->skm_avail);
 808                 spl_magazine_free(skm);
 809         }
 810 }
 811
 812 /*
 813  * Create a object cache based on the following arguments:
 814  * name         cache name
 815  * size         cache object size
 816  * align        cache object alignment
 817  * ctor         cache object constructor
 818  * dtor         cache object destructor
 819  * reclaim      cache object reclaim
 820  * priv         cache private data for ctor/dtor/reclaim
 821  * vmp          unused must be NULL
 822  * flags
 823  *      KMC_NOTOUCH     Disable cache object aging (unsupported)
 824  *      KMC_NODEBUG     Disable debugging (unsupported)
 825  *      KMC_NOHASH      Disable hashing (unsupported)
 826  *      KMC_QCACHE      Disable qcache (unsupported)
 827  *      KMC_NOMAGAZINE  Enabled for kmem/vmem, Disabled for Linux slab
 828  *      KMC_KMEM        Force kmem backed cache
 829  *      KMC_VMEM        Force vmem backed cache
 830  *      KMC_SLAB        Force Linux slab backed cache
 831  *      KMC_OFFSLAB     Locate objects off the slab
 832  */
 833 spl_kmem_cache_t *
 834 spl_kmem_cache_create(char *name, size_t size, size_t align,
 835                       spl_kmem_ctor_t ctor,
 836                       spl_kmem_dtor_t dtor,
 837                       spl_kmem_reclaim_t reclaim,
 838                       void *priv, void *vmp, int flags)
 839 {
 840         spl_kmem_cache_t *skc;
 841         int rc;
 842
 843         /*
 844          * Unsupported flags
 845          */
 846         ASSERT0(flags & KMC_NOMAGAZINE);
 847         ASSERT0(flags & KMC_NOHASH);
 848         ASSERT0(flags & KMC_QCACHE);
 849         ASSERT(vmp == NULL);
 850
 851         might_sleep();
 852
 853         /*
 854          * Allocate memory for a new cache an initialize it.  Unfortunately,
 855          * this usually ends up being a large allocation of ~32k because
 856          * we need to allocate enough memory for the worst case number of
 857          * cpus in the magazine, skc_mag[NR_CPUS].  Because of this we
 858          * explicitly pass KM_NODEBUG to suppress the kmem warning
 859          */
 860         skc = kmem_zalloc(sizeof(*skc), KM_SLEEP| KM_NODEBUG);
 861         if (skc == NULL)
 862                 return (NULL);
 863
 864         skc->skc_magic = SKC_MAGIC;
 865         skc->skc_name_size = strlen(name) + 1;
 866         skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, KM_SLEEP);
 867         if (skc->skc_name == NULL) {
 868                 kmem_free(skc, sizeof(*skc));
 869                 return (NULL);
 870         }
 871         strncpy(skc->skc_name, name, skc->skc_name_size);
 872
 873         skc->skc_ctor = ctor;
 874         skc->skc_dtor = dtor;
 875         skc->skc_reclaim = reclaim;
 876         skc->skc_private = priv;
 877         skc->skc_vmp = vmp;
 878         skc->skc_linux_cache = NULL;
 879         skc->skc_flags = flags;
 880         skc->skc_obj_size = size;
 881         skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
 882         skc->skc_delay = SPL_KMEM_CACHE_DELAY;
 883         skc->skc_reap = SPL_KMEM_CACHE_REAP;
 884         atomic_set(&skc->skc_ref, 0);
 885
 886         INIT_LIST_HEAD(&skc->skc_list);
 887         INIT_LIST_HEAD(&skc->skc_complete_list);
 888         INIT_LIST_HEAD(&skc->skc_partial_list);
 889         skc->skc_emergency_tree = RB_ROOT;
 890         spin_lock_init(&skc->skc_lock);
 891         init_waitqueue_head(&skc->skc_waitq);
 892         skc->skc_slab_fail = 0;
 893         skc->skc_slab_create = 0;
 894         skc->skc_slab_destroy = 0;
 895         skc->skc_slab_total = 0;
 896         skc->skc_slab_alloc = 0;
 897         skc->skc_slab_max = 0;
 898         skc->skc_obj_total = 0;
 899         skc->skc_obj_alloc = 0;
 900         skc->skc_obj_max = 0;
 901         skc->skc_obj_deadlock = 0;
 902         skc->skc_obj_emergency = 0;
 903         skc->skc_obj_emergency_max = 0;
 904
 905         /*
 906          * Verify the requested alignment restriction is sane.
 907          */
 908         if (align) {
 909                 VERIFY(ISP2(align));
 910                 VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN);
 911                 VERIFY3U(align, <=, PAGE_SIZE);
 912                 skc->skc_obj_align = align;
 913         }
 914
 915         /*
 916          * When no specific type of slab is requested (kmem, vmem, or
 917          * linuxslab) then select a cache type based on the object size
 918          * and default tunables.
 919          */
 920         if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) {
 921
 922                 /*
 923                  * Objects smaller than spl_kmem_cache_slab_limit can
 924                  * use the Linux slab for better space-efficiency.  By
 925                  * default this functionality is disabled until its
 926                  * performance characters are fully understood.
 927                  */
 928                 if (spl_kmem_cache_slab_limit &&
 929                     size <= (size_t)spl_kmem_cache_slab_limit)
 930                         skc->skc_flags |= KMC_SLAB;
 931
 932                 /*
 933                  * Small objects, less than spl_kmem_cache_kmem_limit per
 934                  * object should use kmem because their slabs are small.
 935                  */
 936                 else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit)
 937                         skc->skc_flags |= KMC_KMEM;
 938
 939                 /*
 940                  * All other objects are considered large and are placed
 941                  * on vmem backed slabs.
 942                  */
 943                 else
 944                         skc->skc_flags |= KMC_VMEM;
 945         }
 946
 947         /*
 948          * Given the type of slab allocate the required resources.
 949          */
 950         if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
 951                 rc = spl_slab_size(skc,
 952                     &skc->skc_slab_objs, &skc->skc_slab_size);
 953                 if (rc)
 954                         goto out;
 955
 956                 rc = spl_magazine_create(skc);
 957                 if (rc)
 958                         goto out;
 959         } else {
 960                 skc->skc_linux_cache = kmem_cache_create(
 961                     skc->skc_name, size, align, 0, NULL);
 962                 if (skc->skc_linux_cache == NULL) {
 963                         rc = ENOMEM;
 964                         goto out;
 965                 }
 966
 967                 kmem_cache_set_allocflags(skc, __GFP_COMP);
 968                 skc->skc_flags |= KMC_NOMAGAZINE;
 969         }
 970
 971         if (spl_kmem_cache_expire & KMC_EXPIRE_AGE)
 972                 skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq,
 973                     spl_cache_age, skc, TQ_SLEEP,
 974                     ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
 975
 976         down_write(&spl_kmem_cache_sem);
 977         list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
 978         up_write(&spl_kmem_cache_sem);
 979
 980         return (skc);
 981 out:
 982         kmem_free(skc->skc_name, skc->skc_name_size);
 983         kmem_free(skc, sizeof(*skc));
 984         return (NULL);
 985 }
 986 EXPORT_SYMBOL(spl_kmem_cache_create);
 987
 988 /*
 989  * Register a move callback to for cache defragmentation.
 990  * XXX: Unimplemented but harmless to stub out for now.
 991  */
 992 void
 993 spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
 994     kmem_cbrc_t (move)(void *, void *, size_t, void *))
 995 {
 996         ASSERT(move != NULL);
 997 }
 998 EXPORT_SYMBOL(spl_kmem_cache_set_move);
 999
1000 /*
1001  * Destroy a cache and all objects associated with the cache.
1002  */
1003 void
1004 spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
1005 {
1006         DECLARE_WAIT_QUEUE_HEAD(wq);
1007         taskqid_t id;
1008
1009         ASSERT(skc->skc_magic == SKC_MAGIC);
1010         ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB));
1011
1012         down_write(&spl_kmem_cache_sem);
1013         list_del_init(&skc->skc_list);
1014         up_write(&spl_kmem_cache_sem);
1015
1016         /* Cancel any and wait for any pending delayed tasks */
1017         VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1018
1019         spin_lock(&skc->skc_lock);
1020         id = skc->skc_taskqid;
1021         spin_unlock(&skc->skc_lock);
1022
1023         taskq_cancel_id(spl_kmem_cache_taskq, id);
1024
1025         /* Wait until all current callers complete, this is mainly
1026          * to catch the case where a low memory situation triggers a
1027          * cache reaping action which races with this destroy. */
1028         wait_event(wq, atomic_read(&skc->skc_ref) == 0);
1029
1030         if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
1031                 spl_magazine_destroy(skc);
1032                 spl_slab_reclaim(skc, 0, 1);
1033         } else {
1034                 ASSERT(skc->skc_flags & KMC_SLAB);
1035                 kmem_cache_destroy(skc->skc_linux_cache);
1036         }
1037
1038         spin_lock(&skc->skc_lock);
1039
1040         /* Validate there are no objects in use and free all the
1041          * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
1042         ASSERT3U(skc->skc_slab_alloc, ==, 0);
1043         ASSERT3U(skc->skc_obj_alloc, ==, 0);
1044         ASSERT3U(skc->skc_slab_total, ==, 0);
1045         ASSERT3U(skc->skc_obj_total, ==, 0);
1046         ASSERT3U(skc->skc_obj_emergency, ==, 0);
1047         ASSERT(list_empty(&skc->skc_complete_list));
1048
1049         kmem_free(skc->skc_name, skc->skc_name_size);
1050         spin_unlock(&skc->skc_lock);
1051
1052         kmem_free(skc, sizeof(*skc));
1053 }
1054 EXPORT_SYMBOL(spl_kmem_cache_destroy);
1055
1056 /*
1057  * Allocate an object from a slab attached to the cache.  This is used to
1058  * repopulate the per-cpu magazine caches in batches when they run low.
1059  */
1060 static void *
1061 spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
1062 {
1063         spl_kmem_obj_t *sko;
1064
1065         ASSERT(skc->skc_magic == SKC_MAGIC);
1066         ASSERT(sks->sks_magic == SKS_MAGIC);
1067         ASSERT(spin_is_locked(&skc->skc_lock));
1068
1069         sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
1070         ASSERT(sko->sko_magic == SKO_MAGIC);
1071         ASSERT(sko->sko_addr != NULL);
1072
1073         /* Remove from sks_free_list */
1074         list_del_init(&sko->sko_list);
1075
1076         sks->sks_age = jiffies;
1077         sks->sks_ref++;
1078         skc->skc_obj_alloc++;
1079
1080         /* Track max obj usage statistics */
1081         if (skc->skc_obj_alloc > skc->skc_obj_max)
1082                 skc->skc_obj_max = skc->skc_obj_alloc;
1083
1084         /* Track max slab usage statistics */
1085         if (sks->sks_ref == 1) {
1086                 skc->skc_slab_alloc++;
1087
1088                 if (skc->skc_slab_alloc > skc->skc_slab_max)
1089                         skc->skc_slab_max = skc->skc_slab_alloc;
1090         }
1091
1092         return sko->sko_addr;
1093 }
1094
1095 /*
1096  * Generic slab allocation function to run by the global work queues.
1097  * It is responsible for allocating a new slab, linking it in to the list
1098  * of partial slabs, and then waking any waiters.
1099  */
1100 static void
1101 spl_cache_grow_work(void *data)
1102 {
1103         spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
1104         spl_kmem_cache_t *skc = ska->ska_cache;
1105         spl_kmem_slab_t *sks;
1106
1107         sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG);
1108         spin_lock(&skc->skc_lock);
1109         if (sks) {
1110                 skc->skc_slab_total++;
1111                 skc->skc_obj_total += sks->sks_objs;
1112                 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1113         }
1114
1115         atomic_dec(&skc->skc_ref);
1116         clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
1117         clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
1118         wake_up_all(&skc->skc_waitq);
1119         spin_unlock(&skc->skc_lock);
1120
1121         kfree(ska);
1122 }
1123
1124 /*
1125  * Returns non-zero when a new slab should be available.
1126  */
1127 static int
1128 spl_cache_grow_wait(spl_kmem_cache_t *skc)
1129 {
1130         return !test_bit(KMC_BIT_GROWING, &skc->skc_flags);
1131 }
1132
1133 /*
1134  * No available objects on any slabs, create a new slab.  Note that this
1135  * functionality is disabled for KMC_SLAB caches which are backed by the
1136  * Linux slab.
1137  */
1138 static int
1139 spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
1140 {
1141         int remaining, rc;
1142
1143         ASSERT(skc->skc_magic == SKC_MAGIC);
1144         ASSERT((skc->skc_flags & KMC_SLAB) == 0);
1145         might_sleep();
1146         *obj = NULL;
1147
1148         /*
1149          * Before allocating a new slab wait for any reaping to complete and
1150          * then return so the local magazine can be rechecked for new objects.
1151          */
1152         if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
1153                 rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
1154                     TASK_UNINTERRUPTIBLE);
1155                 return (rc ? rc : -EAGAIN);
1156         }
1157
1158         /*
1159          * This is handled by dispatching a work request to the global work
1160          * queue.  This allows us to asynchronously allocate a new slab while
1161          * retaining the ability to safely fall back to a smaller synchronous
1162          * allocations to ensure forward progress is always maintained.
1163          */
1164         if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
1165                 spl_kmem_alloc_t *ska;
1166
1167                 ska = kmalloc(sizeof(*ska), flags);
1168                 if (ska == NULL) {
1169                         clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
1170                         wake_up_all(&skc->skc_waitq);
1171                         return (-ENOMEM);
1172                 }
1173
1174                 atomic_inc(&skc->skc_ref);
1175                 ska->ska_cache = skc;
1176                 ska->ska_flags = flags & ~__GFP_FS;
1177                 taskq_init_ent(&ska->ska_tqe);
1178                 taskq_dispatch_ent(spl_kmem_cache_taskq,
1179                     spl_cache_grow_work, ska, 0, &ska->ska_tqe);
1180         }
1181
1182         /*
1183          * The goal here is to only detect the rare case where a virtual slab
1184          * allocation has deadlocked.  We must be careful to minimize the use
1185          * of emergency objects which are more expensive to track.  Therefore,
1186          * we set a very long timeout for the asynchronous allocation and if
1187          * the timeout is reached the cache is flagged as deadlocked.  From
1188          * this point only new emergency objects will be allocated until the
1189          * asynchronous allocation completes and clears the deadlocked flag.
1190          */
1191         if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
1192                 rc = spl_emergency_alloc(skc, flags, obj);
1193         } else {
1194                 remaining = wait_event_timeout(skc->skc_waitq,
1195                                                spl_cache_grow_wait(skc), HZ);
1196
1197                 if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) {
1198                         spin_lock(&skc->skc_lock);
1199                         if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
1200                                 set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
1201                                 skc->skc_obj_deadlock++;
1202                         }
1203                         spin_unlock(&skc->skc_lock);
1204                 }
1205
1206                 rc = -ENOMEM;
1207         }
1208
1209         return (rc);
1210 }
1211
1212 /*
1213  * Refill a per-cpu magazine with objects from the slabs for this cache.
1214  * Ideally the magazine can be repopulated using existing objects which have
1215  * been released, however if we are unable to locate enough free objects new
1216  * slabs of objects will be created.  On success NULL is returned, otherwise
1217  * the address of a single emergency object is returned for use by the caller.
1218  */
1219 static void *
1220 spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
1221 {
1222         spl_kmem_slab_t *sks;
1223         int count = 0, rc, refill;
1224         void *obj = NULL;
1225
1226         ASSERT(skc->skc_magic == SKC_MAGIC);
1227         ASSERT(skm->skm_magic == SKM_MAGIC);
1228
1229         refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
1230         spin_lock(&skc->skc_lock);
1231
1232         while (refill > 0) {
1233                 /* No slabs available we may need to grow the cache */
1234                 if (list_empty(&skc->skc_partial_list)) {
1235                         spin_unlock(&skc->skc_lock);
1236
1237                         local_irq_enable();
1238                         rc = spl_cache_grow(skc, flags, &obj);
1239                         local_irq_disable();
1240
1241                         /* Emergency object for immediate use by caller */
1242                         if (rc == 0 && obj != NULL)
1243                                 return (obj);
1244
1245                         if (rc)
1246                                 goto out;
1247
1248                         /* Rescheduled to different CPU skm is not local */
1249                         if (skm != skc->skc_mag[smp_processor_id()])
1250                                 goto out;
1251
1252                         /* Potentially rescheduled to the same CPU but
1253                          * allocations may have occurred from this CPU while
1254                          * we were sleeping so recalculate max refill. */
1255                         refill = MIN(refill, skm->skm_size - skm->skm_avail);
1256
1257                         spin_lock(&skc->skc_lock);
1258                         continue;
1259                 }
1260
1261                 /* Grab the next available slab */
1262                 sks = list_entry((&skc->skc_partial_list)->next,
1263                                  spl_kmem_slab_t, sks_list);
1264                 ASSERT(sks->sks_magic == SKS_MAGIC);
1265                 ASSERT(sks->sks_ref < sks->sks_objs);
1266                 ASSERT(!list_empty(&sks->sks_free_list));
1267
1268                 /* Consume as many objects as needed to refill the requested
1269                  * cache.  We must also be careful not to overfill it. */
1270                 while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) {
1271                         ASSERT(skm->skm_avail < skm->skm_size);
1272                         ASSERT(count < skm->skm_size);
1273                         skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks);
1274                 }
1275
1276                 /* Move slab to skc_complete_list when full */
1277                 if (sks->sks_ref == sks->sks_objs) {
1278                         list_del(&sks->sks_list);
1279                         list_add(&sks->sks_list, &skc->skc_complete_list);
1280                 }
1281         }
1282
1283         spin_unlock(&skc->skc_lock);
1284 out:
1285         return (NULL);
1286 }
1287
1288 /*
1289  * Release an object back to the slab from which it came.
1290  */
1291 static void
1292 spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
1293 {
1294         spl_kmem_slab_t *sks = NULL;
1295         spl_kmem_obj_t *sko = NULL;
1296
1297         ASSERT(skc->skc_magic == SKC_MAGIC);
1298         ASSERT(spin_is_locked(&skc->skc_lock));
1299
1300         sko = spl_sko_from_obj(skc, obj);
1301         ASSERT(sko->sko_magic == SKO_MAGIC);
1302         sks = sko->sko_slab;
1303         ASSERT(sks->sks_magic == SKS_MAGIC);
1304         ASSERT(sks->sks_cache == skc);
1305         list_add(&sko->sko_list, &sks->sks_free_list);
1306
1307         sks->sks_age = jiffies;
1308         sks->sks_ref--;
1309         skc->skc_obj_alloc--;
1310
1311         /* Move slab to skc_partial_list when no longer full.  Slabs
1312          * are added to the head to keep the partial list is quasi-full
1313          * sorted order.  Fuller at the head, emptier at the tail. */
1314         if (sks->sks_ref == (sks->sks_objs - 1)) {
1315                 list_del(&sks->sks_list);
1316                 list_add(&sks->sks_list, &skc->skc_partial_list);
1317         }
1318
1319         /* Move empty slabs to the end of the partial list so
1320          * they can be easily found and freed during reclamation. */
1321         if (sks->sks_ref == 0) {
1322                 list_del(&sks->sks_list);
1323                 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1324                 skc->skc_slab_alloc--;
1325         }
1326 }
1327
1328 /*
1329  * Allocate an object from the per-cpu magazine, or if the magazine
1330  * is empty directly allocate from a slab and repopulate the magazine.
1331  */
1332 void *
1333 spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
1334 {
1335         spl_kmem_magazine_t *skm;
1336         void *obj = NULL;
1337
1338         ASSERT(skc->skc_magic == SKC_MAGIC);
1339         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1340         ASSERT(flags & KM_SLEEP);
1341
1342         atomic_inc(&skc->skc_ref);
1343
1344         /*
1345          * Allocate directly from a Linux slab.  All optimizations are left
1346          * to the underlying cache we only need to guarantee that KM_SLEEP
1347          * callers will never fail.
1348          */
1349         if (skc->skc_flags & KMC_SLAB) {
1350                 struct kmem_cache *slc = skc->skc_linux_cache;
1351
1352                 do {
1353                         obj = kmem_cache_alloc(slc, flags | __GFP_COMP);
1354                 } while ((obj == NULL) && !(flags & KM_NOSLEEP));
1355
1356                 goto ret;
1357         }
1358
1359         local_irq_disable();
1360
1361 restart:
1362         /* Safe to update per-cpu structure without lock, but
1363          * in the restart case we must be careful to reacquire
1364          * the local magazine since this may have changed
1365          * when we need to grow the cache. */
1366         skm = skc->skc_mag[smp_processor_id()];
1367         ASSERT(skm->skm_magic == SKM_MAGIC);
1368
1369         if (likely(skm->skm_avail)) {
1370                 /* Object available in CPU cache, use it */
1371                 obj = skm->skm_objs[--skm->skm_avail];
1372                 skm->skm_age = jiffies;
1373         } else {
1374                 obj = spl_cache_refill(skc, skm, flags);
1375                 if (obj == NULL)
1376                         goto restart;
1377         }
1378
1379         local_irq_enable();
1380         ASSERT(obj);
1381         ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
1382
1383 ret:
1384         /* Pre-emptively migrate object to CPU L1 cache */
1385         if (obj) {
1386                 if (obj && skc->skc_ctor)
1387                         skc->skc_ctor(obj, skc->skc_private, flags);
1388                 else
1389                         prefetchw(obj);
1390         }
1391
1392         atomic_dec(&skc->skc_ref);
1393
1394         return (obj);
1395 }
1396
1397 EXPORT_SYMBOL(spl_kmem_cache_alloc);
1398
1399 /*
1400  * Free an object back to the local per-cpu magazine, there is no
1401  * guarantee that this is the same magazine the object was originally
1402  * allocated from.  We may need to flush entire from the magazine
1403  * back to the slabs to make space.
1404  */
1405 void
1406 spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
1407 {
1408         spl_kmem_magazine_t *skm;
1409         unsigned long flags;
1410
1411         ASSERT(skc->skc_magic == SKC_MAGIC);
1412         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1413         atomic_inc(&skc->skc_ref);
1414
1415         /*
1416          * Run the destructor
1417          */
1418         if (skc->skc_dtor)
1419                 skc->skc_dtor(obj, skc->skc_private);
1420
1421         /*
1422          * Free the object from the Linux underlying Linux slab.
1423          */
1424         if (skc->skc_flags & KMC_SLAB) {
1425                 kmem_cache_free(skc->skc_linux_cache, obj);
1426                 goto out;
1427         }
1428
1429         /*
1430          * Only virtual slabs may have emergency objects and these objects
1431          * are guaranteed to have physical addresses.  They must be removed
1432          * from the tree of emergency objects and the freed.
1433          */
1434         if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj)) {
1435                 spl_emergency_free(skc, obj);
1436                 goto out;
1437         }
1438
1439         local_irq_save(flags);
1440
1441         /* Safe to update per-cpu structure without lock, but
1442          * no remote memory allocation tracking is being performed
1443          * it is entirely possible to allocate an object from one
1444          * CPU cache and return it to another. */
1445         skm = skc->skc_mag[smp_processor_id()];
1446         ASSERT(skm->skm_magic == SKM_MAGIC);
1447
1448         /* Per-CPU cache full, flush it to make space */
1449         if (unlikely(skm->skm_avail >= skm->skm_size))
1450                 spl_cache_flush(skc, skm, skm->skm_refill);
1451
1452         /* Available space in cache, use it */
1453         skm->skm_objs[skm->skm_avail++] = obj;
1454
1455         local_irq_restore(flags);
1456 out:
1457         atomic_dec(&skc->skc_ref);
1458 }
1459 EXPORT_SYMBOL(spl_kmem_cache_free);
1460
1461 /*
1462  * The generic shrinker function for all caches.  Under Linux a shrinker
1463  * may not be tightly coupled with a slab cache.  In fact Linux always
1464  * systematically tries calling all registered shrinker callbacks which
1465  * report that they contain unused objects.  Because of this we only
1466  * register one shrinker function in the shim layer for all slab caches.
1467  * We always attempt to shrink all caches when this generic shrinker
1468  * is called.
1469  *
1470  * If sc->nr_to_scan is zero, the caller is requesting a query of the
1471  * number of objects which can potentially be freed.  If it is nonzero,
1472  * the request is to free that many objects.
1473  *
1474  * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks
1475  * in struct shrinker and also require the shrinker to return the number
1476  * of objects freed.
1477  *
1478  * Older kernels require the shrinker to return the number of freeable
1479  * objects following the freeing of nr_to_free.
1480  *
1481  * Linux semantics differ from those under Solaris, which are to
1482  * free all available objects which may (and probably will) be more
1483  * objects than the requested nr_to_scan.
1484  */
1485 static spl_shrinker_t
1486 __spl_kmem_cache_generic_shrinker(struct shrinker *shrink,
1487     struct shrink_control *sc)
1488 {
1489         spl_kmem_cache_t *skc;
1490         int alloc = 0;
1491
1492         down_read(&spl_kmem_cache_sem);
1493         list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
1494                 if (sc->nr_to_scan) {
1495 #ifdef HAVE_SPLIT_SHRINKER_CALLBACK
1496                         uint64_t oldalloc = skc->skc_obj_alloc;
1497                         spl_kmem_cache_reap_now(skc,
1498                            MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1));
1499                         if (oldalloc > skc->skc_obj_alloc)
1500                                 alloc += oldalloc - skc->skc_obj_alloc;
1501 #else
1502                         spl_kmem_cache_reap_now(skc,
1503                            MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1));
1504                         alloc += skc->skc_obj_alloc;
1505 #endif /* HAVE_SPLIT_SHRINKER_CALLBACK */
1506                 } else {
1507                         /* Request to query number of freeable objects */
1508                         alloc += skc->skc_obj_alloc;
1509                 }
1510         }
1511         up_read(&spl_kmem_cache_sem);
1512
1513         /*
1514          * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass.
1515          * This functionality only exists to work around a rare issue where
1516          * shrink_slabs() is repeatedly invoked by many cores causing the
1517          * system to thrash.
1518          */
1519         if ((spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) && sc->nr_to_scan)
1520                 return (SHRINK_STOP);
1521
1522         return (MAX(alloc, 0));
1523 }
1524
1525 SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker);
1526
1527 /*
1528  * Call the registered reclaim function for a cache.  Depending on how
1529  * many and which objects are released it may simply repopulate the
1530  * local magazine which will then need to age-out.  Objects which cannot
1531  * fit in the magazine we will be released back to their slabs which will
1532  * also need to age out before being release.  This is all just best
1533  * effort and we do not want to thrash creating and destroying slabs.
1534  */
1535 void
1536 spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
1537 {
1538         ASSERT(skc->skc_magic == SKC_MAGIC);
1539         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1540
1541         atomic_inc(&skc->skc_ref);
1542
1543         /*
1544          * Execute the registered reclaim callback if it exists.  The
1545          * per-cpu caches will be drained when is set KMC_EXPIRE_MEM.
1546          */
1547         if (skc->skc_flags & KMC_SLAB) {
1548                 if (skc->skc_reclaim)
1549                         skc->skc_reclaim(skc->skc_private);
1550
1551                 if (spl_kmem_cache_expire & KMC_EXPIRE_MEM)
1552                         kmem_cache_shrink(skc->skc_linux_cache);
1553
1554                 goto out;
1555         }
1556
1557         /*
1558          * Prevent concurrent cache reaping when contended.
1559          */
1560         if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags))
1561                 goto out;
1562
1563         /*
1564          * When a reclaim function is available it may be invoked repeatedly
1565          * until at least a single slab can be freed.  This ensures that we
1566          * do free memory back to the system.  This helps minimize the chance
1567          * of an OOM event when the bulk of memory is used by the slab.
1568          *
1569          * When free slabs are already available the reclaim callback will be
1570          * skipped.  Additionally, if no forward progress is detected despite
1571          * a reclaim function the cache will be skipped to avoid deadlock.
1572          *
1573          * Longer term this would be the correct place to add the code which
1574          * repacks the slabs in order minimize fragmentation.
1575          */
1576         if (skc->skc_reclaim) {
1577                 uint64_t objects = UINT64_MAX;
1578                 int do_reclaim;
1579
1580                 do {
1581                         spin_lock(&skc->skc_lock);
1582                         do_reclaim =
1583                             (skc->skc_slab_total > 0) &&
1584                             ((skc->skc_slab_total - skc->skc_slab_alloc) == 0) &&
1585                             (skc->skc_obj_alloc < objects);
1586
1587                         objects = skc->skc_obj_alloc;
1588                         spin_unlock(&skc->skc_lock);
1589
1590                         if (do_reclaim)
1591                                 skc->skc_reclaim(skc->skc_private);
1592
1593                 } while (do_reclaim);
1594         }
1595
1596         /* Reclaim from the magazine then the slabs ignoring age and delay. */
1597         if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) {
1598                 spl_kmem_magazine_t *skm;
1599                 unsigned long irq_flags;
1600
1601                 local_irq_save(irq_flags);
1602                 skm = skc->skc_mag[smp_processor_id()];
1603                 spl_cache_flush(skc, skm, skm->skm_avail);
1604                 local_irq_restore(irq_flags);
1605         }
1606
1607         spl_slab_reclaim(skc, count, 1);
1608         clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
1609         smp_wmb();
1610         wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
1611 out:
1612         atomic_dec(&skc->skc_ref);
1613 }
1614 EXPORT_SYMBOL(spl_kmem_cache_reap_now);
1615
1616 /*
1617  * Reap all free slabs from all registered caches.
1618  */
1619 void
1620 spl_kmem_reap(void)
1621 {
1622         struct shrink_control sc;
1623
1624         sc.nr_to_scan = KMC_REAP_CHUNK;
1625         sc.gfp_mask = GFP_KERNEL;
1626
1627         (void) __spl_kmem_cache_generic_shrinker(NULL, &sc);
1628 }
1629 EXPORT_SYMBOL(spl_kmem_reap);
1630
1631 int
1632 spl_kmem_cache_init(void)
1633 {
1634         init_rwsem(&spl_kmem_cache_sem);
1635         INIT_LIST_HEAD(&spl_kmem_cache_list);
1636         spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
1637             1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE);
1638         spl_register_shrinker(&spl_kmem_cache_shrinker);
1639
1640         return (0);
1641 }
1642
1643 void
1644 spl_kmem_cache_fini(void)
1645 {
1646         spl_unregister_shrinker(&spl_kmem_cache_shrinker);
1647         taskq_destroy(spl_kmem_cache_taskq);
1648 }