1 /*
2 * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
3 * Copyright (C) 2007 The Regents of the University of California.
4 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
5 * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
6 * UCRL-CODE-235197
7 *
8 * This file is part of the SPL, Solaris Porting Layer.
9 * For details, see <http://zfsonlinux.org/>.
10 *
11 * The SPL is free software; you can redistribute it and/or modify it
12 * under the terms of the GNU General Public License as published by the
13 * Free Software Foundation; either version 2 of the License, or (at your
14 * option) any later version.
15 *
16 * The SPL is distributed in the hope that it will be useful, but WITHOUT
17 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
18 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
19 * for more details.
20 *
21 * You should have received a copy of the GNU General Public License along
22 * with the SPL. If not, see <http://www.gnu.org/licenses/>.
23 */
24
25 #include <sys/kmem.h>
26 #include <sys/kmem_cache.h>
27 #include <sys/taskq.h>
28 #include <sys/timer.h>
29 #include <sys/vmem.h>
30 #include <linux/slab.h>
31 #include <linux/swap.h>
32 #include <linux/mm_compat.h>
33 #include <linux/wait_compat.h>
34
35 /*
36 * Within the scope of spl-kmem.c file the kmem_cache_* definitions
37 * are removed to allow access to the real Linux slab allocator.
38 */
39 #undef kmem_cache_destroy
40 #undef kmem_cache_create
41 #undef kmem_cache_alloc
42 #undef kmem_cache_free
43
44
45 /*
46 * Cache expiration was implemented because it was part of the default Solaris
47 * kmem_cache behavior. The idea is that per-cpu objects which haven't been
48 * accessed in several seconds should be returned to the cache. On the other
49 * hand Linux slabs never move objects back to the slabs unless there is
50 * memory pressure on the system. By default the Linux method is enabled
51 * because it has been shown to improve responsiveness on low memory systems.
52 * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM.
53 */
54 unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM;
55 EXPORT_SYMBOL(spl_kmem_cache_expire);
56 module_param(spl_kmem_cache_expire, uint, 0644);
57 MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)");
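/*
 * Illustrative usage, assuming the standard module_param() sysfs layout
 * (not something defined in this file): because the parameter above is
 * registered with mode 0644 it can typically be changed at runtime by
 * writing 0x1 (expire by age) or 0x2 (expire on low memory) to
 * /sys/module/spl/parameters/spl_kmem_cache_expire.
 */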
58
59 /*
60 * The default behavior is to report the number of objects remaining in the
61 * cache. This allows the Linux VM to repeatedly reclaim objects from the
62 * cache when memory is low to satisfy other memory allocations. Alternately,
63 * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache
64 * is reclaimed. This may increase the likelihood of out of memory events.
65 */
66 unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */;
67 module_param(spl_kmem_cache_reclaim, uint, 0644);
68 MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)");
69
70 unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
71 module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
72 MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");
73
74 unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN;
75 module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644);
76 MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min,
77 "Minimal number of objects per slab");
78
79 unsigned int spl_kmem_cache_max_size = 32;
80 module_param(spl_kmem_cache_max_size, uint, 0644);
81 MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
82
83 /*
84 * For small objects the Linux slab allocator should be used to make the most
85 * efficient use of the memory. However, large objects are not supported by
86 * the Linux slab and therefore the SPL implementation is preferred. A cutoff
87 * of 16K was determined to be optimal for architectures using 4K pages.
88 */
89 #if PAGE_SIZE == 4096
90 unsigned int spl_kmem_cache_slab_limit = 16384;
91 #else
92 unsigned int spl_kmem_cache_slab_limit = 0;
93 #endif
94 module_param(spl_kmem_cache_slab_limit, uint, 0644);
95 MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
96 "Objects less than N bytes use the Linux slab");
97
98 unsigned int spl_kmem_cache_kmem_limit = (PAGE_SIZE / 4);
99 module_param(spl_kmem_cache_kmem_limit, uint, 0644);
100 MODULE_PARM_DESC(spl_kmem_cache_kmem_limit,
101 "Objects less than N bytes use the kmalloc");
102
103 /*
104 * Slab allocation interfaces
105 *
106 * While the Linux slab implementation was inspired by the Solaris
107 * implementation I cannot use it to emulate the Solaris APIs. I
108 * require two features which are not provided by the Linux slab.
109 *
110 * 1) Constructors AND destructors. Recent versions of the Linux
111 * kernel have removed support for destructors. This is a deal
112 * breaker for the SPL which contains particularly expensive
113 * initializers for mutexes, condition variables, etc. We also
114 * require a minimal level of cleanup for these data types, unlike
115 * many Linux data types which do not need to be explicitly destroyed.
116 *
117 * 2) Virtual address space backed slab. Callers of the Solaris slab
118 * expect it to work well for both small and very large allocations.
119 * Because of memory fragmentation the Linux slab which is backed
120 * by kmalloc'ed memory performs very badly when confronted with
121 * large numbers of large allocations. Basing the slab on the
122 * virtual address space removes the need for contiguous pages
123 * and greatly improves performance for large allocations.
124 *
125 * For these reasons, the SPL has its own slab implementation with
126 * the needed features. It is not as highly optimized as either the
127 * Solaris or Linux slabs, but it should get me most of what is
128 * needed until it can be optimized or obsoleted by another approach.
129 *
130 * One serious concern I do have about this method is the relatively
131 * small virtual address space on 32bit arches. This will seriously
132 * constrain the size of the slab caches and their performance.
133 *
134 * XXX: Improve the partial slab list by carefully maintaining a
135 * strict ordering of fullest to emptiest slabs based on
136 * the slab reference count. This guarantees that when freeing
137 * slabs back to the system we need only linearly traverse the
138 * last N slabs in the list to discover all the freeable slabs.
139 *
140 * XXX: NUMA awareness for optionally allocating memory close to a
141 * particular core. This can be advantageous if you know the slab
142 * object will be short lived and primarily accessed from one core.
143 *
144 * XXX: Slab coloring may also yield performance improvements and would
145 * be desirable to implement.
146 */
147
148 struct list_head spl_kmem_cache_list; /* List of caches */
149 struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
150 taskq_t *spl_kmem_cache_taskq; /* Task queue for ageing / reclaim */
151
152 static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);
153
154 SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker);
155 SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker,
156 spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS);
157
158 static void *
159 kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
160 {
161 void *ptr;
162
163 ASSERT(ISP2(size));
164
165 if (skc->skc_flags & KMC_KMEM)
166 ptr = (void *)__get_free_pages(flags | __GFP_COMP,
167 get_order(size));
168 else
169 ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
170
171 /* Resulting allocated memory will be page aligned */
172 ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
173
174 return (ptr);
175 }
176
177 static void
178 kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
179 {
180 ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
181 ASSERT(ISP2(size));
182
183 /*
184 * The Linux direct reclaim path uses this out of band value to
185 * determine if forward progress is being made. Normally this is
186 * incremented by kmem_freepages() which is part of the various
187 * Linux slab implementations. However, since we are using none
188 * of that infrastructure we are responsible for incrementing it.
189 */
190 if (current->reclaim_state)
191 current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
192
193 if (skc->skc_flags & KMC_KMEM)
194 free_pages((unsigned long)ptr, get_order(size));
195 else
196 vfree(ptr);
197 }
198
199 /*
200 * Required space for each aligned sks.
201 */
202 static inline uint32_t
203 spl_sks_size(spl_kmem_cache_t *skc)
204 {
205 return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t),
206 skc->skc_obj_align, uint32_t));
207 }
208
209 /*
210 * Required space for each aligned object.
211 */
212 static inline uint32_t
213 spl_obj_size(spl_kmem_cache_t *skc)
214 {
215 uint32_t align = skc->skc_obj_align;
216
217 return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
218 P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t));
219 }
220
221 /*
222 * Look up the spl_kmem_obj_t for a given object.
223 */
224 static inline spl_kmem_obj_t *
225 spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
226 {
227 return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
228 skc->skc_obj_align, uint32_t));
229 }
230
231 /*
232 * Required space for each offslab object taking into account alignment
233 * restrictions and the power-of-two requirement of kv_alloc().
234 */
235 static inline uint32_t
236 spl_offslab_size(spl_kmem_cache_t *skc)
237 {
238 return (1UL << (fls64(spl_obj_size(skc)) + 1));
239 }
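/*
 * Worked example of the rounding above (illustrative figures only): for a
 * spl_obj_size() of 3000 bytes fls64() returns 12, so the offslab
 * allocation size becomes 1UL << 13 = 8192 bytes, satisfying the
 * power-of-two requirement of kv_alloc() with headroom for alignment.
 */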
240
241 /*
242 * It's important that we pack the spl_kmem_obj_t structure and the
243 * actual objects into one large address space to minimize the number
244 * of calls to the allocator. It is far better to do a few large
245 * allocations and then subdivide them ourselves. Now which allocator
246 * we use requires balancing a few trade-offs.
247 *
248 * For small objects we use kmem_alloc() because as long as you are
249 * only requesting a small number of pages (ideally just one) it's cheap.
250 * However, when you start requesting multiple pages with kmem_alloc()
251 * it gets increasingly expensive since it requires contiguous pages.
252 * For this reason we shift to vmem_alloc() for slabs of large objects
253 * which removes the need for contiguous pages. We do not use
254 * vmem_alloc() in all cases because there is significant locking
255 * overhead in __get_vm_area_node(). This function takes a single
256 * global lock when acquiring an available virtual address range which
257 * serializes all vmem_alloc()'s for all slab caches. Using slightly
258 * different allocation functions for small and large objects should
259 * give us the best of both worlds.
260 *
261 * KMC_ONSLAB KMC_OFFSLAB
262 *
263 * +------------------------+ +-----------------+
264 * | spl_kmem_slab_t --+-+ | | spl_kmem_slab_t |---+-+
265 * | skc_obj_size <-+ | | +-----------------+ | |
266 * | spl_kmem_obj_t | | | |
267 * | skc_obj_size <---+ | +-----------------+ | |
268 * | spl_kmem_obj_t | | | skc_obj_size | <-+ |
269 * | ... v | | spl_kmem_obj_t | |
270 * +------------------------+ +-----------------+ v
271 */
272 static spl_kmem_slab_t *
273 spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
274 {
275 spl_kmem_slab_t *sks;
276 spl_kmem_obj_t *sko, *n;
277 void *base, *obj;
278 uint32_t obj_size, offslab_size = 0;
279 int i, rc = 0;
280
281 base = kv_alloc(skc, skc->skc_slab_size, flags);
282 if (base == NULL)
283 return (NULL);
284
285 sks = (spl_kmem_slab_t *)base;
286 sks->sks_magic = SKS_MAGIC;
287 sks->sks_objs = skc->skc_slab_objs;
288 sks->sks_age = jiffies;
289 sks->sks_cache = skc;
290 INIT_LIST_HEAD(&sks->sks_list);
291 INIT_LIST_HEAD(&sks->sks_free_list);
292 sks->sks_ref = 0;
293 obj_size = spl_obj_size(skc);
294
295 if (skc->skc_flags & KMC_OFFSLAB)
296 offslab_size = spl_offslab_size(skc);
297
298 for (i = 0; i < sks->sks_objs; i++) {
299 if (skc->skc_flags & KMC_OFFSLAB) {
300 obj = kv_alloc(skc, offslab_size, flags);
301 if (!obj) {
302 rc = -ENOMEM;
303 goto out;
304 }
305 } else {
306 obj = base + spl_sks_size(skc) + (i * obj_size);
307 }
308
309 ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
310 sko = spl_sko_from_obj(skc, obj);
311 sko->sko_addr = obj;
312 sko->sko_magic = SKO_MAGIC;
313 sko->sko_slab = sks;
314 INIT_LIST_HEAD(&sko->sko_list);
315 list_add_tail(&sko->sko_list, &sks->sks_free_list);
316 }
317
318 out:
319 if (rc) {
320 if (skc->skc_flags & KMC_OFFSLAB)
321 list_for_each_entry_safe(sko,
322 n, &sks->sks_free_list, sko_list)
323 kv_free(skc, sko->sko_addr, offslab_size);
324
325 kv_free(skc, base, skc->skc_slab_size);
326 sks = NULL;
327 }
328
329 return (sks);
330 }
331
332 /*
333 * Remove a slab from the complete or partial list. It must be called with
334 * the 'skc->skc_lock' held but the actual free must be performed
335 * outside the lock to prevent deadlocking on vmem addresses.
336 */
337 static void
338 spl_slab_free(spl_kmem_slab_t *sks,
339 struct list_head *sks_list, struct list_head *sko_list)
340 {
341 spl_kmem_cache_t *skc;
342
343 ASSERT(sks->sks_magic == SKS_MAGIC);
344 ASSERT(sks->sks_ref == 0);
345
346 skc = sks->sks_cache;
347 ASSERT(skc->skc_magic == SKC_MAGIC);
348 ASSERT(spin_is_locked(&skc->skc_lock));
349
350 /*
351 * Update slab/objects counters in the cache, then remove the
352 * slab from the skc->skc_partial_list. Finally add the slab
353 * and all its objects in to the private work lists where the
354 * destructors will be called and the memory freed to the system.
355 */
356 skc->skc_obj_total -= sks->sks_objs;
357 skc->skc_slab_total--;
358 list_del(&sks->sks_list);
359 list_add(&sks->sks_list, sks_list);
360 list_splice_init(&sks->sks_free_list, sko_list);
361 }
362
363 /*
364 * Traverse all the partial slabs attached to a cache and free those
365 * which are currently empty and have not been touched for
366 * skc_delay seconds to avoid thrashing. The count argument is
367 * passed to optionally cap the number of slabs reclaimed; a count
368 * of zero means try to reclaim everything. When flag is set we
369 * always free an available slab regardless of age.
370 */
371 static void
372 spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
373 {
374 spl_kmem_slab_t *sks, *m;
375 spl_kmem_obj_t *sko, *n;
376 LIST_HEAD(sks_list);
377 LIST_HEAD(sko_list);
378 uint32_t size = 0;
379 int i = 0;
380
381 /*
382 * Move empty slabs and objects which have not been touched in
383 * skc_delay seconds onto private lists to be freed outside
384 * the spin lock. This delay time is important to avoid thrashing;
385 * however, when flag is set the delay will not be used.
386 */
387 spin_lock(&skc->skc_lock);
388 list_for_each_entry_safe_reverse(sks, m,
389 &skc->skc_partial_list, sks_list) {
390 /*
391 * All empty slabs are at the end of skc->skc_partial_list,
392 * therefore once a non-empty slab is found we can stop
393 * scanning. Additionally, stop when reaching the target
394 * reclaim 'count' if a non-zero threshold is given.
395 */
396 if ((sks->sks_ref > 0) || (count && i >= count))
397 break;
398
399 if (time_after(jiffies, sks->sks_age + skc->skc_delay * HZ) ||
400 flag) {
401 spl_slab_free(sks, &sks_list, &sko_list);
402 i++;
403 }
404 }
405 spin_unlock(&skc->skc_lock);
406
407 /*
408 * The following two loops ensure all the object destructors are
409 * run, any offslab objects are freed, and the slabs themselves
410 * are freed. This is all done outside the skc->skc_lock since
411 * this allows the destructor to sleep, and allows us to perform
412 * a conditional reschedule when freeing a large number of
413 * objects and slabs back to the system.
414 */
415 if (skc->skc_flags & KMC_OFFSLAB)
416 size = spl_offslab_size(skc);
417
418 list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
419 ASSERT(sko->sko_magic == SKO_MAGIC);
420
421 if (skc->skc_flags & KMC_OFFSLAB)
422 kv_free(skc, sko->sko_addr, size);
423 }
424
425 list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
426 ASSERT(sks->sks_magic == SKS_MAGIC);
427 kv_free(skc, sks, skc->skc_slab_size);
428 }
429 }
430
431 static spl_kmem_emergency_t *
432 spl_emergency_search(struct rb_root *root, void *obj)
433 {
434 struct rb_node *node = root->rb_node;
435 spl_kmem_emergency_t *ske;
436 unsigned long address = (unsigned long)obj;
437
438 while (node) {
439 ske = container_of(node, spl_kmem_emergency_t, ske_node);
440
441 if (address < (unsigned long)ske->ske_obj)
442 node = node->rb_left;
443 else if (address > (unsigned long)ske->ske_obj)
444 node = node->rb_right;
445 else
446 return (ske);
447 }
448
449 return (NULL);
450 }
451
452 static int
453 spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
454 {
455 struct rb_node **new = &(root->rb_node), *parent = NULL;
456 spl_kmem_emergency_t *ske_tmp;
457 unsigned long address = (unsigned long)ske->ske_obj;
458
459 while (*new) {
460 ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
461
462 parent = *new;
463 if (address < (unsigned long)ske_tmp->ske_obj)
464 new = &((*new)->rb_left);
465 else if (address > (unsigned long)ske_tmp->ske_obj)
466 new = &((*new)->rb_right);
467 else
468 return (0);
469 }
470
471 rb_link_node(&ske->ske_node, parent, new);
472 rb_insert_color(&ske->ske_node, root);
473
474 return (1);
475 }
476
477 /*
478 * Allocate a single emergency object and track it in a red black tree.
479 */
480 static int
481 spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
482 {
483 spl_kmem_emergency_t *ske;
484 int empty;
485
486 /* Last chance: use a partial slab if one now exists */
487 spin_lock(&skc->skc_lock);
488 empty = list_empty(&skc->skc_partial_list);
489 spin_unlock(&skc->skc_lock);
490 if (!empty)
491 return (-EEXIST);
492
493 ske = kmalloc(sizeof (*ske), flags);
494 if (ske == NULL)
495 return (-ENOMEM);
496
497 ske->ske_obj = kmalloc(skc->skc_obj_size, flags);
498 if (ske->ske_obj == NULL) {
499 kfree(ske);
500 return (-ENOMEM);
501 }
502
503 spin_lock(&skc->skc_lock);
504 empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
505 if (likely(empty)) {
506 skc->skc_obj_total++;
507 skc->skc_obj_emergency++;
508 if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
509 skc->skc_obj_emergency_max = skc->skc_obj_emergency;
510 }
511 spin_unlock(&skc->skc_lock);
512
513 if (unlikely(!empty)) {
514 kfree(ske->ske_obj);
515 kfree(ske);
516 return (-EINVAL);
517 }
518
519 *obj = ske->ske_obj;
520
521 return (0);
522 }
523
524 /*
525 * Locate the passed object in the red black tree and free it.
526 */
527 static int
528 spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
529 {
530 spl_kmem_emergency_t *ske;
531
532 spin_lock(&skc->skc_lock);
533 ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
534 if (likely(ske)) {
535 rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
536 skc->skc_obj_emergency--;
537 skc->skc_obj_total--;
538 }
539 spin_unlock(&skc->skc_lock);
540
541 if (unlikely(ske == NULL))
542 return (-ENOENT);
543
544 kfree(ske->ske_obj);
545 kfree(ske);
546
547 return (0);
548 }
549
550 /*
551 * Release objects from the per-cpu magazine back to their slab. The flush
552 * argument contains the max number of entries to remove from the magazine.
553 */
554 static void
555 __spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
556 {
557 int i, count = MIN(flush, skm->skm_avail);
558
559 ASSERT(skc->skc_magic == SKC_MAGIC);
560 ASSERT(skm->skm_magic == SKM_MAGIC);
561 ASSERT(spin_is_locked(&skc->skc_lock));
562
563 for (i = 0; i < count; i++)
564 spl_cache_shrink(skc, skm->skm_objs[i]);
565
566 skm->skm_avail -= count;
567 memmove(skm->skm_objs, &(skm->skm_objs[count]),
568 sizeof (void *) * skm->skm_avail);
569 }
570
571 static void
572 spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
573 {
574 spin_lock(&skc->skc_lock);
575 __spl_cache_flush(skc, skm, flush);
576 spin_unlock(&skc->skc_lock);
577 }
578
579 static void
580 spl_magazine_age(void *data)
581 {
582 spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
583 spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
584
585 ASSERT(skm->skm_magic == SKM_MAGIC);
586 ASSERT(skm->skm_cpu == smp_processor_id());
587 ASSERT(irqs_disabled());
588
589 /* There are no available objects or they are too young to age out */
590 if ((skm->skm_avail == 0) ||
591 time_before(jiffies, skm->skm_age + skc->skc_delay * HZ))
592 return;
593
594 /*
595 * Because we're executing in interrupt context we may have
596 * interrupted the holder of this lock. To avoid a potential
597 * deadlock return if the lock is contended.
598 */
599 if (!spin_trylock(&skc->skc_lock))
600 return;
601
602 __spl_cache_flush(skc, skm, skm->skm_refill);
603 spin_unlock(&skc->skc_lock);
604 }
605
606 /*
607 * Called regularly to keep a downward pressure on the cache.
608 *
609 * Objects older than skc->skc_delay seconds in the per-cpu magazines will
610 * be returned to the caches. This is done to prevent idle magazines from
611 * holding memory which could be better used elsewhere. The delay is
612 * present to prevent thrashing the magazine.
613 *
614 * The newly released objects may result in empty partial slabs. Those
615 * slabs should be released to the system. Otherwise moving the objects
616 * out of the magazines is just wasted work.
617 */
618 static void
619 spl_cache_age(void *data)
620 {
621 spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
622 taskqid_t id = 0;
623
624 ASSERT(skc->skc_magic == SKC_MAGIC);
625
626 /* Dynamically disabled at run time */
627 if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE))
628 return;
629
630 atomic_inc(&skc->skc_ref);
631
632 if (!(skc->skc_flags & KMC_NOMAGAZINE))
633 on_each_cpu(spl_magazine_age, skc, 1);
634
635 spl_slab_reclaim(skc, skc->skc_reap, 0);
636
637 while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) {
638 id = taskq_dispatch_delay(
639 spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP,
640 ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
641
642 /* Destroy issued after dispatch, immediately cancel it */
643 if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id)
644 taskq_cancel_id(spl_kmem_cache_taskq, id);
645 }
646
647 spin_lock(&skc->skc_lock);
648 skc->skc_taskqid = id;
649 spin_unlock(&skc->skc_lock);
650
651 atomic_dec(&skc->skc_ref);
652 }
653
654 /*
655 * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
656 * When on-slab we want to target spl_kmem_cache_obj_per_slab. However,
657 * for very small objects we may end up with more than this so as not
658 * to waste space in the minimal allocation of a single page. Also, for
659 * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min;
660 * any lower than this and we will fail.
661 */
662 static int
663 spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
664 {
665 uint32_t sks_size, obj_size, max_size;
666
667 if (skc->skc_flags & KMC_OFFSLAB) {
668 *objs = spl_kmem_cache_obj_per_slab;
669 *size = P2ROUNDUP(sizeof (spl_kmem_slab_t), PAGE_SIZE);
670 return (0);
671 } else {
672 sks_size = spl_sks_size(skc);
673 obj_size = spl_obj_size(skc);
674
675 if (skc->skc_flags & KMC_KMEM)
676 max_size = ((uint32_t)1 << (MAX_ORDER-3)) * PAGE_SIZE;
677 else
678 max_size = (spl_kmem_cache_max_size * 1024 * 1024);
679
680 /* Power of two sized slab */
681 for (*size = PAGE_SIZE; *size <= max_size; *size *= 2) {
682 *objs = (*size - sks_size) / obj_size;
683 if (*objs >= spl_kmem_cache_obj_per_slab)
684 return (0);
685 }
686
687 /*
688 * Unable to satisfy the target objects per slab, fall back to
689 * allocating a maximally sized slab and, assuming it can
690 * contain the minimum object count, use it. If not, fail.
691 */
692 *size = max_size;
693 *objs = (*size - sks_size) / obj_size;
694 if (*objs >= (spl_kmem_cache_obj_per_slab_min))
695 return (0);
696 }
697
698 return (-ENOSPC);
699 }
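/*
 * Illustrative sizing walk-through, assuming 4 KiB pages (figures are
 * examples, not derived from any particular cache): for an on-slab cache
 * whose spl_obj_size() is roughly 1 KiB, the loop above first considers a
 * 4 KiB slab (about 3 objects after the sks header), then 8 KiB, and so
 * on, stopping at the first power-of-two slab size that holds at least
 * spl_kmem_cache_obj_per_slab objects.
 */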
700
701 /*
702 * Make a guess at a reasonable per-cpu magazine size based on the size of
703 * each object and the cost of caching N of them in each magazine. Long
704 * term this should really adapt based on an observed usage heuristic.
705 */
706 static int
707 spl_magazine_size(spl_kmem_cache_t *skc)
708 {
709 uint32_t obj_size = spl_obj_size(skc);
710 int size;
711
712 /* Per-magazine sizes below assume a 4 KiB page size */
713 if (obj_size > (PAGE_SIZE * 256))
714 size = 4; /* Minimum 4 MiB per-magazine */
715 else if (obj_size > (PAGE_SIZE * 32))
716 size = 16; /* Minimum 2 MiB per-magazine */
717 else if (obj_size > (PAGE_SIZE))
718 size = 64; /* Minimum 256 KiB per-magazine */
719 else if (obj_size > (PAGE_SIZE / 4))
720 size = 128; /* Minimum 128 KiB per-magazine */
721 else
722 size = 256;
723
724 return (size);
725 }
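/*
 * Worked example, assuming 4 KiB pages: an 8 KiB object falls into the
 * third branch above (larger than PAGE_SIZE but not larger than 32 pages)
 * and gets a 64 entry magazine, i.e. at least 512 KiB of cached objects
 * per CPU when the magazine is full.
 */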
726
727 /*
728 * Allocate a per-cpu magazine to associate with a specific core.
729 */
730 static spl_kmem_magazine_t *
731 spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
732 {
733 spl_kmem_magazine_t *skm;
734 int size = sizeof (spl_kmem_magazine_t) +
735 sizeof (void *) * skc->skc_mag_size;
736
737 skm = kmem_alloc_node(size, KM_SLEEP, cpu_to_node(cpu));
738 if (skm) {
739 skm->skm_magic = SKM_MAGIC;
740 skm->skm_avail = 0;
741 skm->skm_size = skc->skc_mag_size;
742 skm->skm_refill = skc->skc_mag_refill;
743 skm->skm_cache = skc;
744 skm->skm_age = jiffies;
745 skm->skm_cpu = cpu;
746 }
747
748 return (skm);
749 }
750
751 /*
752 * Free a per-cpu magazine associated with a specific core.
753 */
754 static void
755 spl_magazine_free(spl_kmem_magazine_t *skm)
756 {
757 int size = sizeof (spl_kmem_magazine_t) +
758 sizeof (void *) * skm->skm_size;
759
760 ASSERT(skm->skm_magic == SKM_MAGIC);
761 ASSERT(skm->skm_avail == 0);
762
763 kmem_free(skm, size);
764 }
765
766 /*
767 * Create all per-cpu magazines of reasonable sizes.
768 */
769 static int
770 spl_magazine_create(spl_kmem_cache_t *skc)
771 {
772 int i;
773
774 if (skc->skc_flags & KMC_NOMAGAZINE)
775 return (0);
776
777 skc->skc_mag_size = spl_magazine_size(skc);
778 skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
779
780 for_each_online_cpu(i) {
781 skc->skc_mag[i] = spl_magazine_alloc(skc, i);
782 if (!skc->skc_mag[i]) {
783 for (i--; i >= 0; i--)
784 spl_magazine_free(skc->skc_mag[i]);
785
786 return (-ENOMEM);
787 }
788 }
789
790 return (0);
791 }
792
793 /*
794 * Destroy all per-cpu magazines.
795 */
796 static void
797 spl_magazine_destroy(spl_kmem_cache_t *skc)
798 {
799 spl_kmem_magazine_t *skm;
800 int i;
801
802 if (skc->skc_flags & KMC_NOMAGAZINE)
803 return;
804
805 for_each_online_cpu(i) {
806 skm = skc->skc_mag[i];
807 spl_cache_flush(skc, skm, skm->skm_avail);
808 spl_magazine_free(skm);
809 }
810 }
811
812 /*
813 * Create an object cache based on the following arguments:
814 * name cache name
815 * size cache object size
816 * align cache object alignment
817 * ctor cache object constructor
818 * dtor cache object destructor
819 * reclaim cache object reclaim
820 * priv cache private data for ctor/dtor/reclaim
821 * vmp unused, must be NULL
822 * flags
823 * KMC_NOTOUCH Disable cache object aging (unsupported)
824 * KMC_NODEBUG Disable debugging (unsupported)
825 * KMC_NOHASH Disable hashing (unsupported)
826 * KMC_QCACHE Disable qcache (unsupported)
827 * KMC_NOMAGAZINE Enabled for kmem/vmem, Disabled for Linux slab
828 * KMC_KMEM Force kmem backed cache
829 * KMC_VMEM Force vmem backed cache
830 * KMC_SLAB Force Linux slab backed cache
831 * KMC_OFFSLAB Locate objects off the slab
832 */
833 spl_kmem_cache_t *
834 spl_kmem_cache_create(char *name, size_t size, size_t align,
835 spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim,
836 void *priv, void *vmp, int flags)
837 {
838 spl_kmem_cache_t *skc;
839 int rc;
840
841 /*
842 * Unsupported flags
843 */
844 ASSERT0(flags & KMC_NOMAGAZINE);
845 ASSERT0(flags & KMC_NOHASH);
846 ASSERT0(flags & KMC_QCACHE);
847 ASSERT(vmp == NULL);
848
849 might_sleep();
850
851 /*
852 * Allocate memory for a new cache and initialize it. Unfortunately,
853 * this usually ends up being a large allocation of ~32k because
854 * we need to allocate enough memory for the worst case number of
855 * cpus in the magazine, skc_mag[NR_CPUS]. Because of this we
856 * explicitly pass KM_NODEBUG to suppress the kmem warning
857 */
858 skc = kmem_zalloc(sizeof (*skc), KM_SLEEP| KM_NODEBUG);
859 if (skc == NULL)
860 return (NULL);
861
862 skc->skc_magic = SKC_MAGIC;
863 skc->skc_name_size = strlen(name) + 1;
864 skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, KM_SLEEP);
865 if (skc->skc_name == NULL) {
866 kmem_free(skc, sizeof (*skc));
867 return (NULL);
868 }
869 strncpy(skc->skc_name, name, skc->skc_name_size);
870
871 skc->skc_ctor = ctor;
872 skc->skc_dtor = dtor;
873 skc->skc_reclaim = reclaim;
874 skc->skc_private = priv;
875 skc->skc_vmp = vmp;
876 skc->skc_linux_cache = NULL;
877 skc->skc_flags = flags;
878 skc->skc_obj_size = size;
879 skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
880 skc->skc_delay = SPL_KMEM_CACHE_DELAY;
881 skc->skc_reap = SPL_KMEM_CACHE_REAP;
882 atomic_set(&skc->skc_ref, 0);
883
884 INIT_LIST_HEAD(&skc->skc_list);
885 INIT_LIST_HEAD(&skc->skc_complete_list);
886 INIT_LIST_HEAD(&skc->skc_partial_list);
887 skc->skc_emergency_tree = RB_ROOT;
888 spin_lock_init(&skc->skc_lock);
889 init_waitqueue_head(&skc->skc_waitq);
890 skc->skc_slab_fail = 0;
891 skc->skc_slab_create = 0;
892 skc->skc_slab_destroy = 0;
893 skc->skc_slab_total = 0;
894 skc->skc_slab_alloc = 0;
895 skc->skc_slab_max = 0;
896 skc->skc_obj_total = 0;
897 skc->skc_obj_alloc = 0;
898 skc->skc_obj_max = 0;
899 skc->skc_obj_deadlock = 0;
900 skc->skc_obj_emergency = 0;
901 skc->skc_obj_emergency_max = 0;
902
903 /*
904 * Verify the requested alignment restriction is sane.
905 */
906 if (align) {
907 VERIFY(ISP2(align));
908 VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN);
909 VERIFY3U(align, <=, PAGE_SIZE);
910 skc->skc_obj_align = align;
911 }
912
913 /*
914 * When no specific type of slab is requested (kmem, vmem, or
915 * linuxslab) then select a cache type based on the object size
916 * and default tunables.
917 */
918 if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) {
919
920 /*
921 * Objects smaller than spl_kmem_cache_slab_limit can
922 * use the Linux slab for better space-efficiency. By
923 * default this functionality is disabled until its
924 * performance characteristics are fully understood.
925 */
926 if (spl_kmem_cache_slab_limit &&
927 size <= (size_t)spl_kmem_cache_slab_limit)
928 skc->skc_flags |= KMC_SLAB;
929
930 /*
931 * Small objects, less than spl_kmem_cache_kmem_limit per
932 * object, should use kmem because their slabs are small.
933 */
934 else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit)
935 skc->skc_flags |= KMC_KMEM;
936
937 /*
938 * All other objects are considered large and are placed
939 * on vmem backed slabs.
940 */
941 else
942 skc->skc_flags |= KMC_VMEM;
943 }
944
945 /*
946 * Given the type of slab allocate the required resources.
947 */
948 if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
949 rc = spl_slab_size(skc,
950 &skc->skc_slab_objs, &skc->skc_slab_size);
951 if (rc)
952 goto out;
953
954 rc = spl_magazine_create(skc);
955 if (rc)
956 goto out;
957 } else {
958 skc->skc_linux_cache = kmem_cache_create(
959 skc->skc_name, size, align, 0, NULL);
960 if (skc->skc_linux_cache == NULL) {
961 rc = ENOMEM;
962 goto out;
963 }
964
965 kmem_cache_set_allocflags(skc, __GFP_COMP);
966 skc->skc_flags |= KMC_NOMAGAZINE;
967 }
968
969 if (spl_kmem_cache_expire & KMC_EXPIRE_AGE)
970 skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq,
971 spl_cache_age, skc, TQ_SLEEP,
972 ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
973
974 down_write(&spl_kmem_cache_sem);
975 list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
976 up_write(&spl_kmem_cache_sem);
977
978 return (skc);
979 out:
980 kmem_free(skc->skc_name, skc->skc_name_size);
981 kmem_free(skc, sizeof (*skc));
982 return (NULL);
983 }
984 EXPORT_SYMBOL(spl_kmem_cache_create);
985
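/*
 * Caller-side sketch (hypothetical names, shown only to illustrate how the
 * interfaces exported from this file fit together; error handling omitted):
 *
 *	static spl_kmem_cache_t *my_cache;
 *	void *obj;
 *
 *	my_cache = spl_kmem_cache_create("my_cache", sizeof (my_obj_t), 0,
 *	    my_ctor, my_dtor, NULL, NULL, NULL, 0);
 *	obj = spl_kmem_cache_alloc(my_cache, KM_SLEEP);
 *	...
 *	spl_kmem_cache_free(my_cache, obj);
 *	spl_kmem_cache_destroy(my_cache);
 */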
986 /*
987 * Register a move callback for cache defragmentation.
988 * XXX: Unimplemented but harmless to stub out for now.
989 */
990 void
991 spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
992 kmem_cbrc_t (move)(void *, void *, size_t, void *))
993 {
994 ASSERT(move != NULL);
995 }
996 EXPORT_SYMBOL(spl_kmem_cache_set_move);
997
998 /*
999 * Destroy a cache and all objects associated with the cache.
1000 */
1001 void
1002 spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
1003 {
1004 DECLARE_WAIT_QUEUE_HEAD(wq);
1005 taskqid_t id;
1006
1007 ASSERT(skc->skc_magic == SKC_MAGIC);
1008 ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB));
1009
1010 down_write(&spl_kmem_cache_sem);
1011 list_del_init(&skc->skc_list);
1012 up_write(&spl_kmem_cache_sem);
1013
1014 /* Cancel and wait for any pending delayed tasks */
1015 VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1016
1017 spin_lock(&skc->skc_lock);
1018 id = skc->skc_taskqid;
1019 spin_unlock(&skc->skc_lock);
1020
1021 taskq_cancel_id(spl_kmem_cache_taskq, id);
1022
1023 /*
1024 * Wait until all current callers complete; this is mainly
1025 * to catch the case where a low memory situation triggers a
1026 * cache reaping action which races with this destroy.
1027 */
1028 wait_event(wq, atomic_read(&skc->skc_ref) == 0);
1029
1030 if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
1031 spl_magazine_destroy(skc);
1032 spl_slab_reclaim(skc, 0, 1);
1033 } else {
1034 ASSERT(skc->skc_flags & KMC_SLAB);
1035 kmem_cache_destroy(skc->skc_linux_cache);
1036 }
1037
1038 spin_lock(&skc->skc_lock);
1039
1040 /*
1041 * Validate there are no objects in use and free all the
1042 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers.
1043 */
1044 ASSERT3U(skc->skc_slab_alloc, ==, 0);
1045 ASSERT3U(skc->skc_obj_alloc, ==, 0);
1046 ASSERT3U(skc->skc_slab_total, ==, 0);
1047 ASSERT3U(skc->skc_obj_total, ==, 0);
1048 ASSERT3U(skc->skc_obj_emergency, ==, 0);
1049 ASSERT(list_empty(&skc->skc_complete_list));
1050
1051 kmem_free(skc->skc_name, skc->skc_name_size);
1052 spin_unlock(&skc->skc_lock);
1053
1054 kmem_free(skc, sizeof (*skc));
1055 }
1056 EXPORT_SYMBOL(spl_kmem_cache_destroy);
1057
1058 /*
1059 * Allocate an object from a slab attached to the cache. This is used to
1060 * repopulate the per-cpu magazine caches in batches when they run low.
1061 */
1062 static void *
1063 spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
1064 {
1065 spl_kmem_obj_t *sko;
1066
1067 ASSERT(skc->skc_magic == SKC_MAGIC);
1068 ASSERT(sks->sks_magic == SKS_MAGIC);
1069 ASSERT(spin_is_locked(&skc->skc_lock));
1070
1071 sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
1072 ASSERT(sko->sko_magic == SKO_MAGIC);
1073 ASSERT(sko->sko_addr != NULL);
1074
1075 /* Remove from sks_free_list */
1076 list_del_init(&sko->sko_list);
1077
1078 sks->sks_age = jiffies;
1079 sks->sks_ref++;
1080 skc->skc_obj_alloc++;
1081
1082 /* Track max obj usage statistics */
1083 if (skc->skc_obj_alloc > skc->skc_obj_max)
1084 skc->skc_obj_max = skc->skc_obj_alloc;
1085
1086 /* Track max slab usage statistics */
1087 if (sks->sks_ref == 1) {
1088 skc->skc_slab_alloc++;
1089
1090 if (skc->skc_slab_alloc > skc->skc_slab_max)
1091 skc->skc_slab_max = skc->skc_slab_alloc;
1092 }
1093
1094 return (sko->sko_addr);
1095 }
1096
1097 /*
1098 * Generic slab allocation function to be run by the global work queues.
1099 * It is responsible for allocating a new slab, linking it into the list
1100 * of partial slabs, and then waking any waiters.
1101 */
1102 static void
1103 spl_cache_grow_work(void *data)
1104 {
1105 spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
1106 spl_kmem_cache_t *skc = ska->ska_cache;
1107 spl_kmem_slab_t *sks;
1108
1109 sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG);
1110 spin_lock(&skc->skc_lock);
1111 if (sks) {
1112 skc->skc_slab_total++;
1113 skc->skc_obj_total += sks->sks_objs;
1114 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1115 }
1116
1117 atomic_dec(&skc->skc_ref);
1118 clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
1119 clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
1120 wake_up_all(&skc->skc_waitq);
1121 spin_unlock(&skc->skc_lock);
1122
1123 kfree(ska);
1124 }
1125
1126 /*
1127 * Returns non-zero when a new slab should be available.
1128 */
1129 static int
1130 spl_cache_grow_wait(spl_kmem_cache_t *skc)
1131 {
1132 return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags));
1133 }
1134
1135 /*
1136 * No available objects on any slabs, create a new slab. Note that this
1137 * functionality is disabled for KMC_SLAB caches which are backed by the
1138 * Linux slab.
1139 */
1140 static int
1141 spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
1142 {
1143 int remaining, rc;
1144
1145 ASSERT(skc->skc_magic == SKC_MAGIC);
1146 ASSERT((skc->skc_flags & KMC_SLAB) == 0);
1147 might_sleep();
1148 *obj = NULL;
1149
1150 /*
1151 * Before allocating a new slab wait for any reaping to complete and
1152 * then return so the local magazine can be rechecked for new objects.
1153 */
1154 if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
1155 rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
1156 TASK_UNINTERRUPTIBLE);
1157 return (rc ? rc : -EAGAIN);
1158 }
1159
1160 /*
1161 * This is handled by dispatching a work request to the global work
1162 * queue. This allows us to asynchronously allocate a new slab while
1163 * retaining the ability to safely fall back to smaller synchronous
1164 * allocations to ensure forward progress is always maintained.
1165 */
1166 if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
1167 spl_kmem_alloc_t *ska;
1168
1169 ska = kmalloc(sizeof (*ska), flags);
1170 if (ska == NULL) {
1171 clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
1172 wake_up_all(&skc->skc_waitq);
1173 return (-ENOMEM);
1174 }
1175
1176 atomic_inc(&skc->skc_ref);
1177 ska->ska_cache = skc;
1178 ska->ska_flags = flags & ~__GFP_FS;
1179 taskq_init_ent(&ska->ska_tqe);
1180 taskq_dispatch_ent(spl_kmem_cache_taskq,
1181 spl_cache_grow_work, ska, 0, &ska->ska_tqe);
1182 }
1183
1184 /*
1185 * The goal here is to only detect the rare case where a virtual slab
1186 * allocation has deadlocked. We must be careful to minimize the use
1187 * of emergency objects which are more expensive to track. Therefore,
1188 * we set a very long timeout for the asynchronous allocation and if
1189 * the timeout is reached the cache is flagged as deadlocked. From
1190 * this point only new emergency objects will be allocated until the
1191 * asynchronous allocation completes and clears the deadlocked flag.
1192 */
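/*
 * Concretely (a summary of the logic below, timings taken from the code):
 * if the asynchronous grow has not completed within HZ jiffies (about one
 * second) and the cache is vmem backed, KMC_BIT_DEADLOCKED is set and
 * subsequent calls fall back to spl_emergency_alloc() until
 * spl_cache_grow_work() clears the flag.
 */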
1193 if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
1194 rc = spl_emergency_alloc(skc, flags, obj);
1195 } else {
1196 remaining = wait_event_timeout(skc->skc_waitq,
1197 spl_cache_grow_wait(skc), HZ);
1198
1199 if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) {
1200 spin_lock(&skc->skc_lock);
1201 if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
1202 set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
1203 skc->skc_obj_deadlock++;
1204 }
1205 spin_unlock(&skc->skc_lock);
1206 }
1207
1208 rc = -ENOMEM;
1209 }
1210
1211 return (rc);
1212 }
1213
1214 /*
1215 * Refill a per-cpu magazine with objects from the slabs for this cache.
1216 * Ideally the magazine can be repopulated using existing objects which have
1217 * been released; however, if we are unable to locate enough free objects new
1218 * slabs of objects will be created. On success NULL is returned, otherwise
1219 * the address of a single emergency object is returned for use by the caller.
1220 */
1221 static void *
1222 spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
1223 {
1224 spl_kmem_slab_t *sks;
1225 int count = 0, rc, refill;
1226 void *obj = NULL;
1227
1228 ASSERT(skc->skc_magic == SKC_MAGIC);
1229 ASSERT(skm->skm_magic == SKM_MAGIC);
1230
1231 refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
1232 spin_lock(&skc->skc_lock);
1233
1234 while (refill > 0) {
1235 /* No slabs available we may need to grow the cache */
1236 if (list_empty(&skc->skc_partial_list)) {
1237 spin_unlock(&skc->skc_lock);
1238
1239 local_irq_enable();
1240 rc = spl_cache_grow(skc, flags, &obj);
1241 local_irq_disable();
1242
1243 /* Emergency object for immediate use by caller */
1244 if (rc == 0 && obj != NULL)
1245 return (obj);
1246
1247 if (rc)
1248 goto out;
1249
1250 /* Rescheduled to a different CPU, skm is not local */
1251 if (skm != skc->skc_mag[smp_processor_id()])
1252 goto out;
1253
1254 /*
1255 * Potentially rescheduled to the same CPU but
1256 * allocations may have occurred from this CPU while
1257 * we were sleeping so recalculate max refill.
1258 */
1259 refill = MIN(refill, skm->skm_size - skm->skm_avail);
1260
1261 spin_lock(&skc->skc_lock);
1262 continue;
1263 }
1264
1265 /* Grab the next available slab */
1266 sks = list_entry((&skc->skc_partial_list)->next,
1267 spl_kmem_slab_t, sks_list);
1268 ASSERT(sks->sks_magic == SKS_MAGIC);
1269 ASSERT(sks->sks_ref < sks->sks_objs);
1270 ASSERT(!list_empty(&sks->sks_free_list));
1271
1272 /*
1273 * Consume as many objects as needed to refill the requested
1274 * cache. We must also be careful not to overfill it.
1275 */
1276 while (sks->sks_ref < sks->sks_objs && refill-- > 0 &&
1277 ++count) {
1278 ASSERT(skm->skm_avail < skm->skm_size);
1279 ASSERT(count < skm->skm_size);
1280 skm->skm_objs[skm->skm_avail++] =
1281 spl_cache_obj(skc, sks);
1282 }
1283
1284 /* Move slab to skc_complete_list when full */
1285 if (sks->sks_ref == sks->sks_objs) {
1286 list_del(&sks->sks_list);
1287 list_add(&sks->sks_list, &skc->skc_complete_list);
1288 }
1289 }
1290
1291 spin_unlock(&skc->skc_lock);
1292 out:
1293 return (NULL);
1294 }
1295
1296 /*
1297 * Release an object back to the slab from which it came.
1298 */
1299 static void
1300 spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
1301 {
1302 spl_kmem_slab_t *sks = NULL;
1303 spl_kmem_obj_t *sko = NULL;
1304
1305 ASSERT(skc->skc_magic == SKC_MAGIC);
1306 ASSERT(spin_is_locked(&skc->skc_lock));
1307
1308 sko = spl_sko_from_obj(skc, obj);
1309 ASSERT(sko->sko_magic == SKO_MAGIC);
1310 sks = sko->sko_slab;
1311 ASSERT(sks->sks_magic == SKS_MAGIC);
1312 ASSERT(sks->sks_cache == skc);
1313 list_add(&sko->sko_list, &sks->sks_free_list);
1314
1315 sks->sks_age = jiffies;
1316 sks->sks_ref--;
1317 skc->skc_obj_alloc--;
1318
1319 /*
1320 * Move slab to skc_partial_list when no longer full. Slabs
1321 * are added to the head to keep the partial list in quasi-full
1322 * sorted order. Fuller at the head, emptier at the tail.
1323 */
1324 if (sks->sks_ref == (sks->sks_objs - 1)) {
1325 list_del(&sks->sks_list);
1326 list_add(&sks->sks_list, &skc->skc_partial_list);
1327 }
1328
1329 /*
1330 * Move empty slabs to the end of the partial list so
1331 * they can be easily found and freed during reclamation.
1332 */
1333 if (sks->sks_ref == 0) {
1334 list_del(&sks->sks_list);
1335 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1336 skc->skc_slab_alloc--;
1337 }
1338 }
1339
1340 /*
1341 * Allocate an object from the per-cpu magazine, or if the magazine
1342 * is empty directly allocate from a slab and repopulate the magazine.
1343 */
1344 void *
1345 spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
1346 {
1347 spl_kmem_magazine_t *skm;
1348 void *obj = NULL;
1349
1350 ASSERT(skc->skc_magic == SKC_MAGIC);
1351 ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1352 ASSERT(flags & KM_SLEEP);
1353
1354 atomic_inc(&skc->skc_ref);
1355
1356 /*
1357 * Allocate directly from a Linux slab. All optimizations are left
1358 * to the underlying cache; we only need to guarantee that KM_SLEEP
1359 * callers will never fail.
1360 */
1361 if (skc->skc_flags & KMC_SLAB) {
1362 struct kmem_cache *slc = skc->skc_linux_cache;
1363
1364 do {
1365 obj = kmem_cache_alloc(slc, flags | __GFP_COMP);
1366 } while ((obj == NULL) && !(flags & KM_NOSLEEP));
1367
1368 goto ret;
1369 }
1370
1371 local_irq_disable();
1372
1373 restart:
1374 /*
1375 * Safe to update per-cpu structure without lock, but
1376 * in the restart case we must be careful to reacquire
1377 * the local magazine since this may have changed
1378 * when we need to grow the cache.
1379 */
1380 skm = skc->skc_mag[smp_processor_id()];
1381 ASSERT(skm->skm_magic == SKM_MAGIC);
1382
1383 if (likely(skm->skm_avail)) {
1384 /* Object available in CPU cache, use it */
1385 obj = skm->skm_objs[--skm->skm_avail];
1386 skm->skm_age = jiffies;
1387 } else {
1388 obj = spl_cache_refill(skc, skm, flags);
1389 if (obj == NULL)
1390 goto restart;
1391 }
1392
1393 local_irq_enable();
1394 ASSERT(obj);
1395 ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
1396
1397 ret:
1398 /* Pre-emptively migrate object to CPU L1 cache */
1399 if (obj) {
1400 if (obj && skc->skc_ctor)
1401 skc->skc_ctor(obj, skc->skc_private, flags);
1402 else
1403 prefetchw(obj);
1404 }
1405
1406 atomic_dec(&skc->skc_ref);
1407
1408 return (obj);
1409 }
1410
1411 EXPORT_SYMBOL(spl_kmem_cache_alloc);
1412
1413 /*
1414 * Free an object back to the local per-cpu magazine; there is no
1415 * guarantee that this is the same magazine the object was originally
1416 * allocated from. We may need to flush entries from the magazine
1417 * back to the slabs to make space.
1418 */
1419 void
1420 spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
1421 {
1422 spl_kmem_magazine_t *skm;
1423 unsigned long flags;
1424
1425 ASSERT(skc->skc_magic == SKC_MAGIC);
1426 ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1427 atomic_inc(&skc->skc_ref);
1428
1429 /*
1430 * Run the destructor
1431 */
1432 if (skc->skc_dtor)
1433 skc->skc_dtor(obj, skc->skc_private);
1434
1435 /*
1436 * Free the object back to the underlying Linux slab.
1437 */
1438 if (skc->skc_flags & KMC_SLAB) {
1439 kmem_cache_free(skc->skc_linux_cache, obj);
1440 goto out;
1441 }
1442
1443 /*
1444 * Only virtual slabs may have emergency objects and these objects
1445 * are guaranteed to have physical addresses. They must be removed
1446 * from the tree of emergency objects and then freed.
1447 */
1448 if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj)) {
1449 spl_emergency_free(skc, obj);
1450 goto out;
1451 }
1452
1453 local_irq_save(flags);
1454
1455 /*
1456 * Safe to update the per-cpu structure without a lock, but
1457 * since no remote memory allocation tracking is being performed
1458 * it is entirely possible to allocate an object from one
1459 * CPU cache and return it to another.
1460 */
1461 skm = skc->skc_mag[smp_processor_id()];
1462 ASSERT(skm->skm_magic == SKM_MAGIC);
1463
1464 /* Per-CPU cache full, flush it to make space */
1465 if (unlikely(skm->skm_avail >= skm->skm_size))
1466 spl_cache_flush(skc, skm, skm->skm_refill);
1467
1468 /* Available space in cache, use it */
1469 skm->skm_objs[skm->skm_avail++] = obj;
1470
1471 local_irq_restore(flags);
1472 out:
1473 atomic_dec(&skc->skc_ref);
1474 }
1475 EXPORT_SYMBOL(spl_kmem_cache_free);
1476
1477 /*
1478 * The generic shrinker function for all caches. Under Linux a shrinker
1479 * may not be tightly coupled with a slab cache. In fact Linux always
1480 * systematically tries calling all registered shrinker callbacks which
1481 * report that they contain unused objects. Because of this we only
1482 * register one shrinker function in the shim layer for all slab caches.
1483 * We always attempt to shrink all caches when this generic shrinker
1484 * is called.
1485 *
1486 * If sc->nr_to_scan is zero, the caller is requesting a query of the
1487 * number of objects which can potentially be freed. If it is nonzero,
1488 * the request is to free that many objects.
1489 *
1490 * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks
1491 * in struct shrinker and also require the shrinker to return the number
1492 * of objects freed.
1493 *
1494 * Older kernels require the shrinker to return the number of freeable
1495 * objects following the freeing of nr_to_scan objects.
1496 *
1497 * Linux semantics differ from those under Solaris, which are to
1498 * free all available objects which may (and probably will) be more
1499 * objects than the requested nr_to_scan.
1500 */
1501 static spl_shrinker_t
1502 __spl_kmem_cache_generic_shrinker(struct shrinker *shrink,
1503 struct shrink_control *sc)
1504 {
1505 spl_kmem_cache_t *skc;
1506 int alloc = 0;
1507
1508 down_read(&spl_kmem_cache_sem);
1509 list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
1510 if (sc->nr_to_scan) {
1511 #ifdef HAVE_SPLIT_SHRINKER_CALLBACK
1512 uint64_t oldalloc = skc->skc_obj_alloc;
1513 spl_kmem_cache_reap_now(skc,
1514 MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1));
1515 if (oldalloc > skc->skc_obj_alloc)
1516 alloc += oldalloc - skc->skc_obj_alloc;
1517 #else
1518 spl_kmem_cache_reap_now(skc,
1519 MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1));
1520 alloc += skc->skc_obj_alloc;
1521 #endif /* HAVE_SPLIT_SHRINKER_CALLBACK */
1522 } else {
1523 /* Request to query number of freeable objects */
1524 alloc += skc->skc_obj_alloc;
1525 }
1526 }
1527 up_read(&spl_kmem_cache_sem);
1528
1529 /*
1530 * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass.
1531 * This functionality only exists to work around a rare issue where
1532 * shrink_slabs() is repeatedly invoked by many cores causing the
1533 * system to thrash.
1534 */
1535 if ((spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) && sc->nr_to_scan)
1536 return (SHRINK_STOP);
1537
1538 return (MAX(alloc, 0));
1539 }
1540
1541 SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker);
1542
1543 /*
1544 * Call the registered reclaim function for a cache. Depending on how
1545 * many and which objects are released it may simply repopulate the
1546 * local magazine which will then need to age out. Objects which cannot
1547 * fit in the magazine will be released back to their slabs, which will
1548 * also need to age out before being released. This is all just best
1549 * effort and we do not want to thrash creating and destroying slabs.
1550 */
1551 void
1552 spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
1553 {
1554 ASSERT(skc->skc_magic == SKC_MAGIC);
1555 ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1556
1557 atomic_inc(&skc->skc_ref);
1558
1559 /*
1560 * Execute the registered reclaim callback if it exists. The
1561 * per-cpu caches will be drained when KMC_EXPIRE_MEM is set.
1562 */
1563 if (skc->skc_flags & KMC_SLAB) {
1564 if (skc->skc_reclaim)
1565 skc->skc_reclaim(skc->skc_private);
1566
1567 if (spl_kmem_cache_expire & KMC_EXPIRE_MEM)
1568 kmem_cache_shrink(skc->skc_linux_cache);
1569
1570 goto out;
1571 }
1572
1573 /*
1574 * Prevent concurrent cache reaping when contended.
1575 */
1576 if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags))
1577 goto out;
1578
1579 /*
1580 * When a reclaim function is available it may be invoked repeatedly
1581 * until at least a single slab can be freed. This ensures that we
1582 * do free memory back to the system. This helps minimize the chance
1583 * of an OOM event when the bulk of memory is used by the slab.
1584 *
1585 * When free slabs are already available the reclaim callback will be
1586 * skipped. Additionally, if no forward progress is detected despite
1587 * a reclaim function the cache will be skipped to avoid deadlock.
1588 *
1589 * Longer term this would be the correct place to add the code which
1590 * repacks the slabs in order to minimize fragmentation.
1591 */
1592 if (skc->skc_reclaim) {
1593 uint64_t objects = UINT64_MAX;
1594 int do_reclaim;
1595
1596 do {
1597 spin_lock(&skc->skc_lock);
1598 do_reclaim =
1599 (skc->skc_slab_total > 0) &&
1600 ((skc->skc_slab_total-skc->skc_slab_alloc) == 0) &&
1601 (skc->skc_obj_alloc < objects);
1602
1603 objects = skc->skc_obj_alloc;
1604 spin_unlock(&skc->skc_lock);
1605
1606 if (do_reclaim)
1607 skc->skc_reclaim(skc->skc_private);
1608
1609 } while (do_reclaim);
1610 }
1611
1612 /* Reclaim from the magazine then the slabs ignoring age and delay. */
1613 if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) {
1614 spl_kmem_magazine_t *skm;
1615 unsigned long irq_flags;
1616
1617 local_irq_save(irq_flags);
1618 skm = skc->skc_mag[smp_processor_id()];
1619 spl_cache_flush(skc, skm, skm->skm_avail);
1620 local_irq_restore(irq_flags);
1621 }
1622
1623 spl_slab_reclaim(skc, count, 1);
1624 clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
1625 smp_wmb();
1626 wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
1627 out:
1628 atomic_dec(&skc->skc_ref);
1629 }
1630 EXPORT_SYMBOL(spl_kmem_cache_reap_now);
1631
1632 /*
1633 * Reap all free slabs from all registered caches.
1634 */
1635 void
1636 spl_kmem_reap(void)
1637 {
1638 struct shrink_control sc;
1639
1640 sc.nr_to_scan = KMC_REAP_CHUNK;
1641 sc.gfp_mask = GFP_KERNEL;
1642
1643 (void) __spl_kmem_cache_generic_shrinker(NULL, &sc);
1644 }
1645 EXPORT_SYMBOL(spl_kmem_reap);
1646
1647 int
1648 spl_kmem_cache_init(void)
1649 {
1650 init_rwsem(&spl_kmem_cache_sem);
1651 INIT_LIST_HEAD(&spl_kmem_cache_list);
1652 spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
1653 1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE);
1654 spl_register_shrinker(&spl_kmem_cache_shrinker);
1655
1656 return (0);
1657 }
1658
1659 void
1660 spl_kmem_cache_fini(void)
1661 {
1662 spl_unregister_shrinker(&spl_kmem_cache_shrinker);
1663 taskq_destroy(spl_kmem_cache_taskq);
1664 }