module/spl/spl-kmem-cache.c

   1 /*
   2  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
   3  *  Copyright (C) 2007 The Regents of the University of California.
   4  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
   5  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
   6  *  UCRL-CODE-235197
   7  *
   8  *  This file is part of the SPL, Solaris Porting Layer.
   9  *  For details, see <http://zfsonlinux.org/>.
  10  *
  11  *  The SPL is free software; you can redistribute it and/or modify it
  12  *  under the terms of the GNU General Public License as published by the
  13  *  Free Software Foundation; either version 2 of the License, or (at your
  14  *  option) any later version.
  15  *
  16  *  The SPL is distributed in the hope that it will be useful, but WITHOUT
  17  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  18  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  19  *  for more details.
  20  *
  21  *  You should have received a copy of the GNU General Public License along
  22  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  23  */
  24
  25 #include <sys/kmem.h>
  26 #include <sys/kmem_cache.h>
  27 #include <sys/taskq.h>
  28 #include <sys/timer.h>
  29 #include <sys/vmem.h>
  30 #include <linux/slab.h>
  31 #include <linux/swap.h>
  32 #include <linux/mm_compat.h>
  33 #include <linux/wait_compat.h>
  34
  35 /*
  36  * Within the scope of spl-kmem.c file the kmem_cache_* definitions
  37  * are removed to allow access to the real Linux slab allocator.
  38  */
  39 #undef kmem_cache_destroy
  40 #undef kmem_cache_create
  41 #undef kmem_cache_alloc
  42 #undef kmem_cache_free
  43
  44
  45 /*
  46  * Cache expiration was implemented because it was part of the default Solaris
  47  * kmem_cache behavior.  The idea is that per-cpu objects which haven't been
  48  * accessed in several seconds should be returned to the cache.  On the other
  49  * hand Linux slabs never move objects back to the slabs unless there is
  50  * memory pressure on the system.  By default the Linux method is enabled
  51  * because it has been shown to improve responsiveness on low memory systems.
  52  * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM.
  53  */
  54 unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM;
  55 EXPORT_SYMBOL(spl_kmem_cache_expire);
  56 module_param(spl_kmem_cache_expire, uint, 0644);
  57 MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)");
  58
  59 /*
  60  * The default behavior is to report the number of objects remaining in the
  61  * cache.  This allows the Linux VM to repeatedly reclaim objects from the
  62  * cache when memory is low to satisfy other memory allocations.  Alternately,
  63  * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache
  64  * is reclaimed.  This may increase the likelihood of out of memory events.
  65  */
  66 unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */;
  67 module_param(spl_kmem_cache_reclaim, uint, 0644);
  68 MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)");
  69
  70 unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB; /* target objects per slab used by spl_slab_size() */
  71 module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
  72 MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");
  73
  74 unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN; /* fewest objects a maximally sized slab may hold before spl_slab_size() fails */
  75 module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644);
  76 MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min,
  77         "Minimal number of objects per slab");
  78
  79 unsigned int spl_kmem_cache_max_size = 32; /* slab size cap in MB, applied by spl_slab_size() to caches without KMC_KMEM */
  80 module_param(spl_kmem_cache_max_size, uint, 0644);
  81 MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
  82
  83 /*
  84  * For small objects the Linux slab allocator should be used to make the most
  85  * efficient use of the memory.  However, large objects are not supported by
  86  * the Linux slab and therefore the SPL implementation is preferred.  A cutoff
  87  * of 16K was determined to be optimal for architectures using 4K pages.
  88  */
  89 #if PAGE_SIZE == 4096
  90 unsigned int spl_kmem_cache_slab_limit = 16384;
  91 #else
  92 unsigned int spl_kmem_cache_slab_limit = 0;
  93 #endif
  94 module_param(spl_kmem_cache_slab_limit, uint, 0644);
  95 MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
  96         "Objects less than N bytes use the Linux slab");
  97
  98 unsigned int spl_kmem_cache_kmem_limit = (PAGE_SIZE / 4);
  99 module_param(spl_kmem_cache_kmem_limit, uint, 0644);
 100 MODULE_PARM_DESC(spl_kmem_cache_kmem_limit,
 101         "Objects less than N bytes use the kmalloc");
 102
 103 /*
 104  * Slab allocation interfaces
 105  *
 106  * While the Linux slab implementation was inspired by the Solaris
 107  * implementation I cannot use it to emulate the Solaris APIs.  I
 108  * require two features which are not provided by the Linux slab.
 109  *
 110  * 1) Constructors AND destructors.  Recent versions of the Linux
 111  *    kernel have removed support for destructors.  This is a deal
 112  *    breaker for the SPL which contains particularly expensive
 113  *    initializers for mutex's, condition variables, etc.  We also
 114  *    require a minimal level of cleanup for these data types unlike
 115  *    many Linux data types which do not need to be explicitly destroyed.
 116  *
 117  * 2) Virtual address space backed slab.  Callers of the Solaris slab
 118  *    expect it to work well for both small and very large allocations.
 119  *    Because of memory fragmentation the Linux slab which is backed
 120  *    by kmalloc'ed memory performs very badly when confronted with
 121  *    large numbers of large allocations.  Basing the slab on the
 122  *    virtual address space removes the need for contiguous pages
 123  *    and greatly improves performance for large allocations.
 124  *
 125  * For these reasons, the SPL has its own slab implementation with
 126  * the needed features.  It is not as highly optimized as either the
 127  * Solaris or Linux slabs, but it should get me most of what is
 128  * needed until it can be optimized or obsoleted by another approach.
 129  *
 130  * One serious concern I do have about this method is the relatively
 131  * small virtual address space on 32bit arches.  This will seriously
 132  * constrain the size of the slab caches and their performance.
 133  */
 134
 135 struct list_head spl_kmem_cache_list;   /* List of caches */
 136 struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
 137 taskq_t *spl_kmem_cache_taskq;          /* Task queue for ageing / reclaim; spl_cache_age() re-dispatches itself on it */
 138
 139 static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj); /* forward declaration, called by __spl_cache_flush(); definition is outside this view */
 140
 141 SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker); /* presumably forward declares the shrinker callback - confirm against the macro */
 142 SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker,
 143         spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS);
 144
 145 static void *
 146 kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
 147 {
 148         gfp_t lflags = kmem_flags_convert(flags);
 149         void *ptr;
 150
 151         ASSERT(ISP2(size));
 152
 153         if (skc->skc_flags & KMC_KMEM)
 154                 ptr = (void *)__get_free_pages(lflags, get_order(size));
 155         else
 156                 ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL);
 157
 158         /* Resulting allocated memory will be page aligned */
 159         ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
 160
 161         return (ptr);
 162 }
 163
 164 static void
 165 kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
 166 {
 167         ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
 168         ASSERT(ISP2(size));
 169
 170         /*
 171          * The Linux direct reclaim path uses this out of band value to
 172          * determine if forward progress is being made.  Normally this is
 173          * incremented by kmem_freepages() which is part of the various
 174          * Linux slab implementations.  However, since we are using none
 175          * of that infrastructure we are responsible for incrementing it.
 176          */
 177         if (current->reclaim_state)
 178                 current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
 179
 180         if (skc->skc_flags & KMC_KMEM)
 181                 free_pages((unsigned long)ptr, get_order(size));
 182         else
 183                 vfree(ptr);
 184 }
 185
 186 /*
 187  * Required space for each aligned sks.
 188  */
 189 static inline uint32_t
 190 spl_sks_size(spl_kmem_cache_t *skc)
 191 {
 192         return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t),
 193             skc->skc_obj_align, uint32_t));
 194 }
 195
 196 /*
 197  * Required space for each aligned object.
 198  */
 199 static inline uint32_t
 200 spl_obj_size(spl_kmem_cache_t *skc)
 201 {
 202         uint32_t align = skc->skc_obj_align;
 203
 204         return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
 205             P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t));
 206 }
 207
 208 /*
 209  * Lookup the spl_kmem_object_t for an object given that object.
 210  */
 211 static inline spl_kmem_obj_t *
 212 spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
 213 {
 214         return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
 215             skc->skc_obj_align, uint32_t));
 216 }
 217
 218 /*
 219  * Required space for each offslab object taking in to account alignment
 220  * restrictions and the power-of-two requirement of kv_alloc().
 221  */
 222 static inline uint32_t
 223 spl_offslab_size(spl_kmem_cache_t *skc)
 224 {
 225         return (1UL << (fls64(spl_obj_size(skc)) + 1));
 226 }
 227
 228 /*
 229  * It's important that we pack the spl_kmem_obj_t structure and the
 230  * actual objects in to one large address space to minimize the number
 231  * of calls to the allocator.  It is far better to do a few large
 232  * allocations and then subdivide it ourselves.  Now which allocator
 233  * we use requires balancing a few trade offs.
 234  *
 235  * For small objects we use kmem_alloc() because as long as you are
 236  * only requesting a small number of pages (ideally just one) its cheap.
 237  * However, when you start requesting multiple pages with kmem_alloc()
 238  * it gets increasingly expensive since it requires contiguous pages.
 239  * For this reason we shift to vmem_alloc() for slabs of large objects
 240  * which removes the need for contiguous pages.  We do not use
 241  * vmem_alloc() in all cases because there is significant locking
 242  * overhead in __get_vm_area_node().  This function takes a single
 243  * global lock when acquiring an available virtual address range which
 244  * serializes all vmem_alloc()'s for all slab caches.  Using slightly
 245  * different allocation functions for small and large objects should
 246  * give us the best of both worlds.
 247  *
 248  * KMC_ONSLAB                       KMC_OFFSLAB
 249  *
 250  * +------------------------+       +-----------------+
 251  * | spl_kmem_slab_t --+-+  |       | spl_kmem_slab_t |---+-+
 252  * | skc_obj_size    <-+ |  |       +-----------------+   | |
 253  * | spl_kmem_obj_t      |  |                             | |
 254  * | skc_obj_size    <---+  |       +-----------------+   | |
 255  * | spl_kmem_obj_t      |  |       | skc_obj_size    | <-+ |
 256  * | ...                 v  |       | spl_kmem_obj_t  |     |
 257  * +------------------------+       +-----------------+     v
 258  */
 259 static spl_kmem_slab_t *
 260 spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
 261 {
 262         spl_kmem_slab_t *sks;
 263         spl_kmem_obj_t *sko, *n;
 264         void *base, *obj;
 265         uint32_t obj_size, offslab_size = 0;
 266         int i,  rc = 0;
 267
 268         base = kv_alloc(skc, skc->skc_slab_size, flags);
 269         if (base == NULL)
 270                 return (NULL);
 271
 272         sks = (spl_kmem_slab_t *)base;
 273         sks->sks_magic = SKS_MAGIC;
 274         sks->sks_objs = skc->skc_slab_objs;
 275         sks->sks_age = jiffies;
 276         sks->sks_cache = skc;
 277         INIT_LIST_HEAD(&sks->sks_list);
 278         INIT_LIST_HEAD(&sks->sks_free_list);
 279         sks->sks_ref = 0;
 280         obj_size = spl_obj_size(skc);
 281
 282         if (skc->skc_flags & KMC_OFFSLAB)
 283                 offslab_size = spl_offslab_size(skc);
 284
 285         for (i = 0; i < sks->sks_objs; i++) {
 286                 if (skc->skc_flags & KMC_OFFSLAB) {
 287                         obj = kv_alloc(skc, offslab_size, flags);
 288                         if (!obj) {
 289                                 rc = -ENOMEM;
 290                                 goto out;
 291                         }
 292                 } else {
 293                         obj = base + spl_sks_size(skc) + (i * obj_size);
 294                 }
 295
 296                 ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
 297                 sko = spl_sko_from_obj(skc, obj);
 298                 sko->sko_addr = obj;
 299                 sko->sko_magic = SKO_MAGIC;
 300                 sko->sko_slab = sks;
 301                 INIT_LIST_HEAD(&sko->sko_list);
 302                 list_add_tail(&sko->sko_list, &sks->sks_free_list);
 303         }
 304
 305 out:
 306         if (rc) {
 307                 if (skc->skc_flags & KMC_OFFSLAB)
 308                         list_for_each_entry_safe(sko,
 309                             n, &sks->sks_free_list, sko_list)
 310                                 kv_free(skc, sko->sko_addr, offslab_size);
 311
 312                 kv_free(skc, base, skc->skc_slab_size);
 313                 sks = NULL;
 314         }
 315
 316         return (sks);
 317 }
 318
 319 /*
 320  * Remove a slab from complete or partial list, it must be called with
 321  * the 'skc->skc_lock' held but the actual free must be performed
 322  * outside the lock to prevent deadlocking on vmem addresses.
 323  */
 324 static void
 325 spl_slab_free(spl_kmem_slab_t *sks,
 326     struct list_head *sks_list, struct list_head *sko_list)
 327 {
 328         spl_kmem_cache_t *skc;
 329
 330         ASSERT(sks->sks_magic == SKS_MAGIC);
 331         ASSERT(sks->sks_ref == 0);
 332
 333         skc = sks->sks_cache;
 334         ASSERT(skc->skc_magic == SKC_MAGIC);
 335         ASSERT(spin_is_locked(&skc->skc_lock));
 336
 337         /*
 338          * Update slab/objects counters in the cache, then remove the
 339          * slab from the skc->skc_partial_list.  Finally add the slab
 340          * and all its objects in to the private work lists where the
 341          * destructors will be called and the memory freed to the system.
 342          */
 343         skc->skc_obj_total -= sks->sks_objs;
 344         skc->skc_slab_total--;
 345         list_del(&sks->sks_list);
 346         list_add(&sks->sks_list, sks_list);
 347         list_splice_init(&sks->sks_free_list, sko_list);
 348 }
 349
 350 /*
 351  * Traverse all the partial slabs attached to a cache and free those which
 352  * are currently empty, and have not been touched for skc_delay seconds to
 353  * avoid thrashing.  The count argument is passed to optionally cap the
 354  * number of slabs reclaimed, a count of zero means try and reclaim
 355  * everything.  When flag the is set available slabs freed regardless of age.
 356  */
 357 static void
 358 spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
 359 {
 360         spl_kmem_slab_t *sks, *m;
 361         spl_kmem_obj_t *sko, *n;
 362         LIST_HEAD(sks_list);
 363         LIST_HEAD(sko_list);
 364         uint32_t size = 0;
 365         int i = 0;
 366
 367         /*
 368          * Move empty slabs and objects which have not been touched in
 369          * skc_delay seconds on to private lists to be freed outside
 370          * the spin lock.  This delay time is important to avoid thrashing
 371          * however when flag is set the delay will not be used.
 372          */
 373         spin_lock(&skc->skc_lock);
 374         list_for_each_entry_safe_reverse(sks, m,
 375             &skc->skc_partial_list, sks_list) {
 376                 /*
 377                  * All empty slabs are at the end of skc->skc_partial_list,
 378                  * therefore once a non-empty slab is found we can stop
 379                  * scanning.  Additionally, stop when reaching the target
 380                  * reclaim 'count' if a non-zero threshold is given.
 381                  */
 382                 if ((sks->sks_ref > 0) || (count && i >= count))
 383                         break;
 384
 385                 if (time_after(jiffies, sks->sks_age + skc->skc_delay * HZ) ||
 386                     flag) {
 387                         spl_slab_free(sks, &sks_list, &sko_list);
 388                         i++;
 389                 }
 390         }
 391         spin_unlock(&skc->skc_lock);
 392
 393         /*
 394          * The following two loops ensure all the object destructors are
 395          * run, any offslab objects are freed, and the slabs themselves
 396          * are freed.  This is all done outside the skc->skc_lock since
 397          * this allows the destructor to sleep, and allows us to perform
 398          * a conditional reschedule when a freeing a large number of
 399          * objects and slabs back to the system.
 400          */
 401         if (skc->skc_flags & KMC_OFFSLAB)
 402                 size = spl_offslab_size(skc);
 403
 404         list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
 405                 ASSERT(sko->sko_magic == SKO_MAGIC);
 406
 407                 if (skc->skc_flags & KMC_OFFSLAB)
 408                         kv_free(skc, sko->sko_addr, size);
 409         }
 410
 411         list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
 412                 ASSERT(sks->sks_magic == SKS_MAGIC);
 413                 kv_free(skc, sks, skc->skc_slab_size);
 414         }
 415 }
 416
 417 static spl_kmem_emergency_t *
 418 spl_emergency_search(struct rb_root *root, void *obj)
 419 {
 420         struct rb_node *node = root->rb_node;
 421         spl_kmem_emergency_t *ske;
 422         unsigned long address = (unsigned long)obj;
 423
 424         while (node) {
 425                 ske = container_of(node, spl_kmem_emergency_t, ske_node);
 426
 427                 if (address < (unsigned long)ske->ske_obj)
 428                         node = node->rb_left;
 429                 else if (address > (unsigned long)ske->ske_obj)
 430                         node = node->rb_right;
 431                 else
 432                         return (ske);
 433         }
 434
 435         return (NULL);
 436 }
 437
 438 static int
 439 spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
 440 {
 441         struct rb_node **new = &(root->rb_node), *parent = NULL;
 442         spl_kmem_emergency_t *ske_tmp;
 443         unsigned long address = (unsigned long)ske->ske_obj;
 444
 445         while (*new) {
 446                 ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
 447
 448                 parent = *new;
 449                 if (address < (unsigned long)ske_tmp->ske_obj)
 450                         new = &((*new)->rb_left);
 451                 else if (address > (unsigned long)ske_tmp->ske_obj)
 452                         new = &((*new)->rb_right);
 453                 else
 454                         return (0);
 455         }
 456
 457         rb_link_node(&ske->ske_node, parent, new);
 458         rb_insert_color(&ske->ske_node, root);
 459
 460         return (1);
 461 }
 462
 463 /*
 464  * Allocate a single emergency object and track it in a red black tree.
 465  */
 466 static int
 467 spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
 468 {
 469         gfp_t lflags = kmem_flags_convert(flags);
 470         spl_kmem_emergency_t *ske;
 471         int empty;
 472
 473         /* Last chance use a partial slab if one now exists */
 474         spin_lock(&skc->skc_lock);
 475         empty = list_empty(&skc->skc_partial_list);
 476         spin_unlock(&skc->skc_lock);
 477         if (!empty)
 478                 return (-EEXIST);
 479
 480         ske = kmalloc(sizeof (*ske), lflags);
 481         if (ske == NULL)
 482                 return (-ENOMEM);
 483
 484         ske->ske_obj = kmalloc(skc->skc_obj_size, lflags);
 485         if (ske->ske_obj == NULL) {
 486                 kfree(ske);
 487                 return (-ENOMEM);
 488         }
 489
 490         spin_lock(&skc->skc_lock);
 491         empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
 492         if (likely(empty)) {
 493                 skc->skc_obj_total++;
 494                 skc->skc_obj_emergency++;
 495                 if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
 496                         skc->skc_obj_emergency_max = skc->skc_obj_emergency;
 497         }
 498         spin_unlock(&skc->skc_lock);
 499
 500         if (unlikely(!empty)) {
 501                 kfree(ske->ske_obj);
 502                 kfree(ske);
 503                 return (-EINVAL);
 504         }
 505
 506         *obj = ske->ske_obj;
 507
 508         return (0);
 509 }
 510
 511 /*
 512  * Locate the passed object in the red black tree and free it.
 513  */
 514 static int
 515 spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
 516 {
 517         spl_kmem_emergency_t *ske;
 518
 519         spin_lock(&skc->skc_lock);
 520         ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
 521         if (likely(ske)) {
 522                 rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
 523                 skc->skc_obj_emergency--;
 524                 skc->skc_obj_total--;
 525         }
 526         spin_unlock(&skc->skc_lock);
 527
 528         if (unlikely(ske == NULL))
 529                 return (-ENOENT);
 530
 531         kfree(ske->ske_obj);
 532         kfree(ske);
 533
 534         return (0);
 535 }
 536
 537 /*
 538  * Release objects from the per-cpu magazine back to their slab.  The flush
 539  * argument contains the max number of entries to remove from the magazine.
 540  */
 541 static void
 542 __spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
 543 {
 544         int i, count = MIN(flush, skm->skm_avail);
 545
 546         ASSERT(skc->skc_magic == SKC_MAGIC);
 547         ASSERT(skm->skm_magic == SKM_MAGIC);
 548         ASSERT(spin_is_locked(&skc->skc_lock));
 549
 550         for (i = 0; i < count; i++)
 551                 spl_cache_shrink(skc, skm->skm_objs[i]);
 552
 553         skm->skm_avail -= count;
 554         memmove(skm->skm_objs, &(skm->skm_objs[count]),
 555             sizeof (void *) * skm->skm_avail);
 556 }
 557
 558 static void
 559 spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
 560 {
 561         spin_lock(&skc->skc_lock);
 562         __spl_cache_flush(skc, skm, flush);
 563         spin_unlock(&skc->skc_lock);
 564 }
 565
 566 static void
 567 spl_magazine_age(void *data)
 568 {
 569         spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
 570         spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
 571
 572         ASSERT(skm->skm_magic == SKM_MAGIC);
 573         ASSERT(skm->skm_cpu == smp_processor_id());
 574         ASSERT(irqs_disabled());
 575
 576         /* There are no available objects or they are too young to age out */
 577         if ((skm->skm_avail == 0) ||
 578             time_before(jiffies, skm->skm_age + skc->skc_delay * HZ))
 579                 return;
 580
 581         /*
 582          * Because we're executing in interrupt context we may have
 583          * interrupted the holder of this lock.  To avoid a potential
 584          * deadlock return if the lock is contended.
 585          */
 586         if (!spin_trylock(&skc->skc_lock))
 587                 return;
 588
 589         __spl_cache_flush(skc, skm, skm->skm_refill);
 590         spin_unlock(&skc->skc_lock);
 591 }
 592
 593 /*
 594  * Called regularly to keep a downward pressure on the cache.
 595  *
 596  * Objects older than skc->skc_delay seconds in the per-cpu magazines will
 597  * be returned to the caches.  This is done to prevent idle magazines from
 598  * holding memory which could be better used elsewhere.  The delay is
 599  * present to prevent thrashing the magazine.
 600  *
 601  * The newly released objects may result in empty partial slabs.  Those
 602  * slabs should be released to the system.  Otherwise moving the objects
 603  * out of the magazines is just wasted work.
 604  */
 605 static void
 606 spl_cache_age(void *data)
 607 {
 608         spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
 609         taskqid_t id = 0;
 610
 611         ASSERT(skc->skc_magic == SKC_MAGIC);
 612
 613         /* Dynamically disabled at run time */
 614         if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE))
 615                 return;
 616
 617         atomic_inc(&skc->skc_ref);
 618
 619         if (!(skc->skc_flags & KMC_NOMAGAZINE))
 620                 on_each_cpu(spl_magazine_age, skc, 1);
 621
 622         spl_slab_reclaim(skc, skc->skc_reap, 0);
 623
 624         while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) {
 625                 id = taskq_dispatch_delay(
 626                     spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP,
 627                     ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
 628
 629                 /* Destroy issued after dispatch immediately cancel it */
 630                 if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id)
 631                         taskq_cancel_id(spl_kmem_cache_taskq, id);
 632         }
 633
 634         spin_lock(&skc->skc_lock);
 635         skc->skc_taskqid = id;
 636         spin_unlock(&skc->skc_lock);
 637
 638         atomic_dec(&skc->skc_ref);
 639 }
 640
 641 /*
 642  * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
 643  * When on-slab we want to target spl_kmem_cache_obj_per_slab.  However,
 644  * for very small objects we may end up with more than this so as not
 645  * to waste space in the minimal allocation of a single page.  Also for
 646  * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min,
 647  * lower than this and we will fail.
 648  */
 649 static int
 650 spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
 651 {
 652         uint32_t sks_size, obj_size, max_size;
 653
 654         if (skc->skc_flags & KMC_OFFSLAB) {
 655                 *objs = spl_kmem_cache_obj_per_slab;
 656                 *size = P2ROUNDUP(sizeof (spl_kmem_slab_t), PAGE_SIZE);
 657                 return (0);
 658         } else {
 659                 sks_size = spl_sks_size(skc);
 660                 obj_size = spl_obj_size(skc);
 661
 662                 if (skc->skc_flags & KMC_KMEM)
 663                         max_size = ((uint32_t)1 << (MAX_ORDER-3)) * PAGE_SIZE;
 664                 else
 665                         max_size = (spl_kmem_cache_max_size * 1024 * 1024);
 666
 667                 /* Power of two sized slab */
 668                 for (*size = PAGE_SIZE; *size <= max_size; *size *= 2) {
 669                         *objs = (*size - sks_size) / obj_size;
 670                         if (*objs >= spl_kmem_cache_obj_per_slab)
 671                                 return (0);
 672                 }
 673
 674                 /*
 675                  * Unable to satisfy target objects per slab, fall back to
 676                  * allocating a maximally sized slab and assuming it can
 677                  * contain the minimum objects count use it.  If not fail.
 678                  */
 679                 *size = max_size;
 680                 *objs = (*size - sks_size) / obj_size;
 681                 if (*objs >= (spl_kmem_cache_obj_per_slab_min))
 682                         return (0);
 683         }
 684
 685         return (-ENOSPC);
 686 }
 687
 688 /*
 689  * Make a guess at reasonable per-cpu magazine size based on the size of
 690  * each object and the cost of caching N of them in each magazine.  Long
 691  * term this should really adapt based on an observed usage heuristic.
 692  */
 693 static int
 694 spl_magazine_size(spl_kmem_cache_t *skc)
 695 {
 696         uint32_t obj_size = spl_obj_size(skc);
 697         int size;
 698
 699         /* Per-magazine sizes below assume a 4Kib page size */
 700         if (obj_size > (PAGE_SIZE * 256))
 701                 size = 4;  /* Minimum 4Mib per-magazine */
 702         else if (obj_size > (PAGE_SIZE * 32))
 703                 size = 16; /* Minimum 2Mib per-magazine */
 704         else if (obj_size > (PAGE_SIZE))
 705                 size = 64; /* Minimum 256Kib per-magazine */
 706         else if (obj_size > (PAGE_SIZE / 4))
 707                 size = 128; /* Minimum 128Kib per-magazine */
 708         else
 709                 size = 256;
 710
 711         return (size);
 712 }
 713
 714 /*
 715  * Allocate a per-cpu magazine to associate with a specific core.
 716  */
 717 static spl_kmem_magazine_t *
 718 spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
 719 {
 720         spl_kmem_magazine_t *skm;
 721         int size = sizeof (spl_kmem_magazine_t) +
 722             sizeof (void *) * skc->skc_mag_size;
 723
 724         skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
 725         if (skm) {
 726                 skm->skm_magic = SKM_MAGIC;
 727                 skm->skm_avail = 0;
 728                 skm->skm_size = skc->skc_mag_size;
 729                 skm->skm_refill = skc->skc_mag_refill;
 730                 skm->skm_cache = skc;
 731                 skm->skm_age = jiffies;
 732                 skm->skm_cpu = cpu;
 733         }
 734
 735         return (skm);
 736 }
 737
 738 /*
 739  * Free a per-cpu magazine associated with a specific core.
 740  */
 741 static void
 742 spl_magazine_free(spl_kmem_magazine_t *skm)
 743 {
 744         ASSERT(skm->skm_magic == SKM_MAGIC);
 745         ASSERT(skm->skm_avail == 0);
 746         kfree(skm);
 747 }
 748
 749 /*
 750  * Create all pre-cpu magazines of reasonable sizes.
 751  */
 752 static int
 753 spl_magazine_create(spl_kmem_cache_t *skc)
 754 {
 755         int i;
 756
 757         if (skc->skc_flags & KMC_NOMAGAZINE)
 758                 return (0);
 759
 760         skc->skc_mag_size = spl_magazine_size(skc);
 761         skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
 762
 763         for_each_online_cpu(i) {
 764                 skc->skc_mag[i] = spl_magazine_alloc(skc, i);
 765                 if (!skc->skc_mag[i]) {
 766                         for (i--; i >= 0; i--)
 767                                 spl_magazine_free(skc->skc_mag[i]);
 768
 769                         return (-ENOMEM);
 770                 }
 771         }
 772
 773         return (0);
 774 }
 775
 776 /*
 777  * Destroy all pre-cpu magazines.
 778  */
 779 static void
 780 spl_magazine_destroy(spl_kmem_cache_t *skc)
 781 {
 782         spl_kmem_magazine_t *skm;
 783         int i;
 784
 785         if (skc->skc_flags & KMC_NOMAGAZINE)
 786                 return;
 787
 788         for_each_online_cpu(i) {
 789                 skm = skc->skc_mag[i];
 790                 spl_cache_flush(skc, skm, skm->skm_avail);
 791                 spl_magazine_free(skm);
 792         }
 793 }
 794
 795 /*
 796  * Create an object cache based on the following arguments:
 797  * name         cache name
 798  * size         cache object size
 799  * align        cache object alignment
 800  * ctor         cache object constructor
 801  * dtor         cache object destructor
 802  * reclaim      cache object reclaim
 803  * priv         cache private data for ctor/dtor/reclaim
 804  * vmp          unused must be NULL
 805  * flags
 806  *      KMC_NOTOUCH     Disable cache object aging (unsupported)
 807  *      KMC_NODEBUG     Disable debugging (unsupported)
 808  *      KMC_NOHASH      Disable hashing (unsupported)
 809  *      KMC_QCACHE      Disable qcache (unsupported)
 810  *      KMC_NOMAGAZINE  Enabled for kmem/vmem, Disabled for Linux slab
 811  *      KMC_KMEM        Force kmem backed cache
 812  *      KMC_VMEM        Force vmem backed cache
 813  *      KMC_SLAB        Force Linux slab backed cache
 814  *      KMC_OFFSLAB     Locate objects off the slab
 815  */
 816 spl_kmem_cache_t *
 817 spl_kmem_cache_create(char *name, size_t size, size_t align,
 818     spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim,
 819     void *priv, void *vmp, int flags)
 820 {
 821         gfp_t lflags = kmem_flags_convert(KM_SLEEP);
 822         spl_kmem_cache_t *skc;
 823         int rc;
 824
 825         /*
 826          * Unsupported flags
 827          */
 828         ASSERT0(flags & KMC_NOMAGAZINE);
 829         ASSERT0(flags & KMC_NOHASH);
 830         ASSERT0(flags & KMC_QCACHE);
 831         ASSERT(vmp == NULL);
 832
 833         might_sleep();
 834
 835         /*
 836          * Allocate memory for a new cache and initialize it.  Unfortunately,
 837          * this usually ends up being a large allocation of ~32k because
 838          * we need to allocate enough memory for the worst case number of
 839          * cpus in the magazine, skc_mag[NR_CPUS].
 840          */
 841         skc = kzalloc(sizeof (*skc), lflags);
 842         if (skc == NULL)
 843                 return (NULL);
 844
 845         skc->skc_magic = SKC_MAGIC;
 846         skc->skc_name_size = strlen(name) + 1;
 847         skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags);
 848         if (skc->skc_name == NULL) {
 849                 kfree(skc);
 850                 return (NULL);
 851         }
 852         strncpy(skc->skc_name, name, skc->skc_name_size);
 853
 854         skc->skc_ctor = ctor;
 855         skc->skc_dtor = dtor;
 856         skc->skc_reclaim = reclaim;
 857         skc->skc_private = priv;
 858         skc->skc_vmp = vmp;
 859         skc->skc_linux_cache = NULL;
 860         skc->skc_flags = flags;
 861         skc->skc_obj_size = size;
 862         skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
 863         skc->skc_delay = SPL_KMEM_CACHE_DELAY;
 864         skc->skc_reap = SPL_KMEM_CACHE_REAP;
 865         atomic_set(&skc->skc_ref, 0);
 866
 867         INIT_LIST_HEAD(&skc->skc_list);
 868         INIT_LIST_HEAD(&skc->skc_complete_list);
 869         INIT_LIST_HEAD(&skc->skc_partial_list);
 870         skc->skc_emergency_tree = RB_ROOT;
 871         spin_lock_init(&skc->skc_lock);
 872         init_waitqueue_head(&skc->skc_waitq);
 873         skc->skc_slab_fail = 0;
 874         skc->skc_slab_create = 0;
 875         skc->skc_slab_destroy = 0;
 876         skc->skc_slab_total = 0;
 877         skc->skc_slab_alloc = 0;
 878         skc->skc_slab_max = 0;
 879         skc->skc_obj_total = 0;
 880         skc->skc_obj_alloc = 0;
 881         skc->skc_obj_max = 0;
 882         skc->skc_obj_deadlock = 0;
 883         skc->skc_obj_emergency = 0;
 884         skc->skc_obj_emergency_max = 0;
 885
 886         /*
 887          * Verify the requested alignment restriction is sane.
 888          */
 889         if (align) {
 890                 VERIFY(ISP2(align));
 891                 VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN);
 892                 VERIFY3U(align, <=, PAGE_SIZE);
 893                 skc->skc_obj_align = align;
 894         }
 895
 896         /*
 897          * When no specific type of slab is requested (kmem, vmem, or
 898          * linuxslab) then select a cache type based on the object size
 899          * and default tunables.
 900          */
 901         if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) {
 902
 903                 /*
 904                  * Objects smaller than spl_kmem_cache_slab_limit can
 905                  * use the Linux slab for better space-efficiency.  By
 906                  * default this functionality is disabled until its
 907                  * performance characteristics are fully understood.
 908                  */
 909                 if (spl_kmem_cache_slab_limit &&
 910                     size <= (size_t)spl_kmem_cache_slab_limit)
 911                         skc->skc_flags |= KMC_SLAB;
 912
 913                 /*
 914                  * Small objects, less than spl_kmem_cache_kmem_limit per
 915                  * object should use kmem because their slabs are small.
 916                  */
 917                 else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit)
 918                         skc->skc_flags |= KMC_KMEM;
 919
 920                 /*
 921                  * All other objects are considered large and are placed
 922                  * on vmem backed slabs.
 923                  */
 924                 else
 925                         skc->skc_flags |= KMC_VMEM;
 926         }
 927
 928         /*
 929          * Given the type of slab allocate the required resources.
 930          */
 931         if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
 932                 rc = spl_slab_size(skc,
 933                     &skc->skc_slab_objs, &skc->skc_slab_size);
 934                 if (rc)
 935                         goto out;
 936
 937                 rc = spl_magazine_create(skc);
 938                 if (rc)
 939                         goto out;
 940         } else {
 941                 skc->skc_linux_cache = kmem_cache_create(
 942                     skc->skc_name, size, align, 0, NULL);
 943                 if (skc->skc_linux_cache == NULL) {
 944                         rc = ENOMEM;
 945                         goto out;
 946                 }
 947
 948 #if defined(HAVE_KMEM_CACHE_ALLOCFLAGS)
 949                 skc->skc_linux_cache->allocflags |= __GFP_COMP;
 950 #elif defined(HAVE_KMEM_CACHE_GFPFLAGS)
 951                 skc->skc_linux_cache->gfpflags |= __GFP_COMP;
 952 #endif
 953                 skc->skc_flags |= KMC_NOMAGAZINE;
 954         }
 955
 956         if (spl_kmem_cache_expire & KMC_EXPIRE_AGE)
 957                 skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq,
 958                     spl_cache_age, skc, TQ_SLEEP,
 959                     ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
 960
 961         down_write(&spl_kmem_cache_sem);
 962         list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
 963         up_write(&spl_kmem_cache_sem);
 964
 965         return (skc);
 966 out:
 967         kfree(skc->skc_name);
 968         kfree(skc);
 969         return (NULL);
 970 }
 971 EXPORT_SYMBOL(spl_kmem_cache_create);
 972
/*
 * Register a move callback for cache defragmentation.
 * XXX: Unimplemented but harmless to stub out for now.
 *
 * The callback is only checked to be non-NULL; it is not stored and
 * the cache itself is left untouched.
 */
void
spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
    kmem_cbrc_t (move)(void *, void *, size_t, void *))
{
        ASSERT(move != NULL);
}
EXPORT_SYMBOL(spl_kmem_cache_set_move);
 984
 985 /*
 986  * Destroy a cache and all objects associated with the cache.
 987  */
 988 void
 989 spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
 990 {
 991         DECLARE_WAIT_QUEUE_HEAD(wq);
 992         taskqid_t id;
 993
 994         ASSERT(skc->skc_magic == SKC_MAGIC);
 995         ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB));
 996
 997         down_write(&spl_kmem_cache_sem);
 998         list_del_init(&skc->skc_list);
 999         up_write(&spl_kmem_cache_sem);
1000
1001         /* Cancel any and wait for any pending delayed tasks */
1002         VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1003
1004         spin_lock(&skc->skc_lock);
1005         id = skc->skc_taskqid;
1006         spin_unlock(&skc->skc_lock);
1007
1008         taskq_cancel_id(spl_kmem_cache_taskq, id);
1009
1010         /*
1011          * Wait until all current callers complete, this is mainly
1012          * to catch the case where a low memory situation triggers a
1013          * cache reaping action which races with this destroy.
1014          */
1015         wait_event(wq, atomic_read(&skc->skc_ref) == 0);
1016
1017         if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
1018                 spl_magazine_destroy(skc);
1019                 spl_slab_reclaim(skc, 0, 1);
1020         } else {
1021                 ASSERT(skc->skc_flags & KMC_SLAB);
1022                 kmem_cache_destroy(skc->skc_linux_cache);
1023         }
1024
1025         spin_lock(&skc->skc_lock);
1026
1027         /*
1028          * Validate there are no objects in use and free all the
1029          * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers.
1030          */
1031         ASSERT3U(skc->skc_slab_alloc, ==, 0);
1032         ASSERT3U(skc->skc_obj_alloc, ==, 0);
1033         ASSERT3U(skc->skc_slab_total, ==, 0);
1034         ASSERT3U(skc->skc_obj_total, ==, 0);
1035         ASSERT3U(skc->skc_obj_emergency, ==, 0);
1036         ASSERT(list_empty(&skc->skc_complete_list));
1037
1038         spin_unlock(&skc->skc_lock);
1039
1040         kfree(skc->skc_name);
1041         kfree(skc);
1042 }
1043 EXPORT_SYMBOL(spl_kmem_cache_destroy);
1044
1045 /*
1046  * Allocate an object from a slab attached to the cache.  This is used to
1047  * repopulate the per-cpu magazine caches in batches when they run low.
1048  */
1049 static void *
1050 spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
1051 {
1052         spl_kmem_obj_t *sko;
1053
1054         ASSERT(skc->skc_magic == SKC_MAGIC);
1055         ASSERT(sks->sks_magic == SKS_MAGIC);
1056         ASSERT(spin_is_locked(&skc->skc_lock));
1057
1058         sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
1059         ASSERT(sko->sko_magic == SKO_MAGIC);
1060         ASSERT(sko->sko_addr != NULL);
1061
1062         /* Remove from sks_free_list */
1063         list_del_init(&sko->sko_list);
1064
1065         sks->sks_age = jiffies;
1066         sks->sks_ref++;
1067         skc->skc_obj_alloc++;
1068
1069         /* Track max obj usage statistics */
1070         if (skc->skc_obj_alloc > skc->skc_obj_max)
1071                 skc->skc_obj_max = skc->skc_obj_alloc;
1072
1073         /* Track max slab usage statistics */
1074         if (sks->sks_ref == 1) {
1075                 skc->skc_slab_alloc++;
1076
1077                 if (skc->skc_slab_alloc > skc->skc_slab_max)
1078                         skc->skc_slab_max = skc->skc_slab_alloc;
1079         }
1080
1081         return (sko->sko_addr);
1082 }
1083
1084 /*
1085  * Generic slab allocation function to run by the global work queues.
1086  * It is responsible for allocating a new slab, linking it in to the list
1087  * of partial slabs, and then waking any waiters.
1088  */
1089 static void
1090 spl_cache_grow_work(void *data)
1091 {
1092         spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
1093         spl_kmem_cache_t *skc = ska->ska_cache;
1094         spl_kmem_slab_t *sks;
1095
1096 #if defined(PF_MEMALLOC_NOIO)
1097         unsigned noio_flag = memalloc_noio_save();
1098         sks = spl_slab_alloc(skc, ska->ska_flags);
1099         memalloc_noio_restore(noio_flag);
1100 #else
1101         fstrans_cookie_t cookie = spl_fstrans_mark();
1102         sks = spl_slab_alloc(skc, ska->ska_flags);
1103         spl_fstrans_unmark(cookie);
1104 #endif
1105         spin_lock(&skc->skc_lock);
1106         if (sks) {
1107                 skc->skc_slab_total++;
1108                 skc->skc_obj_total += sks->sks_objs;
1109                 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1110         }
1111
1112         atomic_dec(&skc->skc_ref);
1113         clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
1114         clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
1115         wake_up_all(&skc->skc_waitq);
1116         spin_unlock(&skc->skc_lock);
1117
1118         kfree(ska);
1119 }
1120
1121 /*
1122  * Returns non-zero when a new slab should be available.
1123  */
1124 static int
1125 spl_cache_grow_wait(spl_kmem_cache_t *skc)
1126 {
1127         return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags));
1128 }
1129
1130 /*
1131  * No available objects on any slabs, create a new slab.  Note that this
1132  * functionality is disabled for KMC_SLAB caches which are backed by the
1133  * Linux slab.
1134  */
1135 static int
1136 spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
1137 {
1138         int remaining, rc = 0;
1139
1140         ASSERT0(flags & ~KM_PUBLIC_MASK);
1141         ASSERT(skc->skc_magic == SKC_MAGIC);
1142         ASSERT((skc->skc_flags & KMC_SLAB) == 0);
1143         might_sleep();
1144         *obj = NULL;
1145
1146         /*
1147          * Before allocating a new slab wait for any reaping to complete and
1148          * then return so the local magazine can be rechecked for new objects.
1149          */
1150         if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
1151                 rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
1152                     TASK_UNINTERRUPTIBLE);
1153                 return (rc ? rc : -EAGAIN);
1154         }
1155
1156         /*
1157          * This is handled by dispatching a work request to the global work
1158          * queue.  This allows us to asynchronously allocate a new slab while
1159          * retaining the ability to safely fall back to a smaller synchronous
1160          * allocations to ensure forward progress is always maintained.
1161          */
1162         if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
1163                 spl_kmem_alloc_t *ska;
1164
1165                 ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags));
1166                 if (ska == NULL) {
1167                         clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
1168                         wake_up_all(&skc->skc_waitq);
1169                         return (-ENOMEM);
1170                 }
1171
1172                 atomic_inc(&skc->skc_ref);
1173                 ska->ska_cache = skc;
1174                 ska->ska_flags = flags;
1175                 taskq_init_ent(&ska->ska_tqe);
1176                 taskq_dispatch_ent(spl_kmem_cache_taskq,
1177                     spl_cache_grow_work, ska, 0, &ska->ska_tqe);
1178         }
1179
1180         /*
1181          * The goal here is to only detect the rare case where a virtual slab
1182          * allocation has deadlocked.  We must be careful to minimize the use
1183          * of emergency objects which are more expensive to track.  Therefore,
1184          * we set a very long timeout for the asynchronous allocation and if
1185          * the timeout is reached the cache is flagged as deadlocked.  From
1186          * this point only new emergency objects will be allocated until the
1187          * asynchronous allocation completes and clears the deadlocked flag.
1188          */
1189         if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
1190                 rc = spl_emergency_alloc(skc, flags, obj);
1191         } else {
1192                 remaining = wait_event_timeout(skc->skc_waitq,
1193                     spl_cache_grow_wait(skc), HZ);
1194
1195                 if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) {
1196                         spin_lock(&skc->skc_lock);
1197                         if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
1198                                 set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
1199                                 skc->skc_obj_deadlock++;
1200                         }
1201                         spin_unlock(&skc->skc_lock);
1202                 }
1203
1204                 rc = -ENOMEM;
1205         }
1206
1207         return (rc);
1208 }
1209
1210 /*
1211  * Refill a per-cpu magazine with objects from the slabs for this cache.
1212  * Ideally the magazine can be repopulated using existing objects which have
1213  * been released, however if we are unable to locate enough free objects new
1214  * slabs of objects will be created.  On success NULL is returned, otherwise
1215  * the address of a single emergency object is returned for use by the caller.
1216  */
1217 static void *
1218 spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
1219 {
1220         spl_kmem_slab_t *sks;
1221         int count = 0, rc, refill;
1222         void *obj = NULL;
1223
1224         ASSERT(skc->skc_magic == SKC_MAGIC);
1225         ASSERT(skm->skm_magic == SKM_MAGIC);
1226
1227         refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
1228         spin_lock(&skc->skc_lock);
1229
1230         while (refill > 0) {
1231                 /* No slabs available we may need to grow the cache */
1232                 if (list_empty(&skc->skc_partial_list)) {
1233                         spin_unlock(&skc->skc_lock);
1234
1235                         local_irq_enable();
1236                         rc = spl_cache_grow(skc, flags, &obj);
1237                         local_irq_disable();
1238
1239                         /* Emergency object for immediate use by caller */
1240                         if (rc == 0 && obj != NULL)
1241                                 return (obj);
1242
1243                         if (rc)
1244                                 goto out;
1245
1246                         /* Rescheduled to different CPU skm is not local */
1247                         if (skm != skc->skc_mag[smp_processor_id()])
1248                                 goto out;
1249
1250                         /*
1251                          * Potentially rescheduled to the same CPU but
1252                          * allocations may have occurred from this CPU while
1253                          * we were sleeping so recalculate max refill.
1254                          */
1255                         refill = MIN(refill, skm->skm_size - skm->skm_avail);
1256
1257                         spin_lock(&skc->skc_lock);
1258                         continue;
1259                 }
1260
1261                 /* Grab the next available slab */
1262                 sks = list_entry((&skc->skc_partial_list)->next,
1263                     spl_kmem_slab_t, sks_list);
1264                 ASSERT(sks->sks_magic == SKS_MAGIC);
1265                 ASSERT(sks->sks_ref < sks->sks_objs);
1266                 ASSERT(!list_empty(&sks->sks_free_list));
1267
1268                 /*
1269                  * Consume as many objects as needed to refill the requested
1270                  * cache.  We must also be careful not to overfill it.
1271                  */
1272                 while (sks->sks_ref < sks->sks_objs && refill-- > 0 &&
1273                     ++count) {
1274                         ASSERT(skm->skm_avail < skm->skm_size);
1275                         ASSERT(count < skm->skm_size);
1276                         skm->skm_objs[skm->skm_avail++] =
1277                             spl_cache_obj(skc, sks);
1278                 }
1279
1280                 /* Move slab to skc_complete_list when full */
1281                 if (sks->sks_ref == sks->sks_objs) {
1282                         list_del(&sks->sks_list);
1283                         list_add(&sks->sks_list, &skc->skc_complete_list);
1284                 }
1285         }
1286
1287         spin_unlock(&skc->skc_lock);
1288 out:
1289         return (NULL);
1290 }
1291
1292 /*
1293  * Release an object back to the slab from which it came.
1294  */
1295 static void
1296 spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
1297 {
1298         spl_kmem_slab_t *sks = NULL;
1299         spl_kmem_obj_t *sko = NULL;
1300
1301         ASSERT(skc->skc_magic == SKC_MAGIC);
1302         ASSERT(spin_is_locked(&skc->skc_lock));
1303
1304         sko = spl_sko_from_obj(skc, obj);
1305         ASSERT(sko->sko_magic == SKO_MAGIC);
1306         sks = sko->sko_slab;
1307         ASSERT(sks->sks_magic == SKS_MAGIC);
1308         ASSERT(sks->sks_cache == skc);
1309         list_add(&sko->sko_list, &sks->sks_free_list);
1310
1311         sks->sks_age = jiffies;
1312         sks->sks_ref--;
1313         skc->skc_obj_alloc--;
1314
1315         /*
1316          * Move slab to skc_partial_list when no longer full.  Slabs
1317          * are added to the head to keep the partial list is quasi-full
1318          * sorted order.  Fuller at the head, emptier at the tail.
1319          */
1320         if (sks->sks_ref == (sks->sks_objs - 1)) {
1321                 list_del(&sks->sks_list);
1322                 list_add(&sks->sks_list, &skc->skc_partial_list);
1323         }
1324
1325         /*
1326          * Move empty slabs to the end of the partial list so
1327          * they can be easily found and freed during reclamation.
1328          */
1329         if (sks->sks_ref == 0) {
1330                 list_del(&sks->sks_list);
1331                 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1332                 skc->skc_slab_alloc--;
1333         }
1334 }
1335
1336 /*
1337  * Allocate an object from the per-cpu magazine, or if the magazine
1338  * is empty directly allocate from a slab and repopulate the magazine.
1339  */
1340 void *
1341 spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
1342 {
1343         spl_kmem_magazine_t *skm;
1344         void *obj = NULL;
1345
1346         ASSERT0(flags & ~KM_PUBLIC_MASK);
1347         ASSERT(skc->skc_magic == SKC_MAGIC);
1348         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1349
1350         atomic_inc(&skc->skc_ref);
1351
1352         /*
1353          * Allocate directly from a Linux slab.  All optimizations are left
1354          * to the underlying cache we only need to guarantee that KM_SLEEP
1355          * callers will never fail.
1356          */
1357         if (skc->skc_flags & KMC_SLAB) {
1358                 struct kmem_cache *slc = skc->skc_linux_cache;
1359                 do {
1360                         obj = kmem_cache_alloc(slc, kmem_flags_convert(flags));
1361                 } while ((obj == NULL) && !(flags & KM_NOSLEEP));
1362
1363                 goto ret;
1364         }
1365
1366         local_irq_disable();
1367
1368 restart:
1369         /*
1370          * Safe to update per-cpu structure without lock, but
1371          * in the restart case we must be careful to reacquire
1372          * the local magazine since this may have changed
1373          * when we need to grow the cache.
1374          */
1375         skm = skc->skc_mag[smp_processor_id()];
1376         ASSERT(skm->skm_magic == SKM_MAGIC);
1377
1378         if (likely(skm->skm_avail)) {
1379                 /* Object available in CPU cache, use it */
1380                 obj = skm->skm_objs[--skm->skm_avail];
1381                 skm->skm_age = jiffies;
1382         } else {
1383                 obj = spl_cache_refill(skc, skm, flags);
1384                 if (obj == NULL)
1385                         goto restart;
1386         }
1387
1388         local_irq_enable();
1389         ASSERT(obj);
1390         ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
1391
1392 ret:
1393         /* Pre-emptively migrate object to CPU L1 cache */
1394         if (obj) {
1395                 if (obj && skc->skc_ctor)
1396                         skc->skc_ctor(obj, skc->skc_private, flags);
1397                 else
1398                         prefetchw(obj);
1399         }
1400
1401         atomic_dec(&skc->skc_ref);
1402
1403         return (obj);
1404 }
1405
1406 EXPORT_SYMBOL(spl_kmem_cache_alloc);
1407
1408 /*
1409  * Free an object back to the local per-cpu magazine, there is no
1410  * guarantee that this is the same magazine the object was originally
1411  * allocated from.  We may need to flush entire from the magazine
1412  * back to the slabs to make space.
1413  */
1414 void
1415 spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
1416 {
1417         spl_kmem_magazine_t *skm;
1418         unsigned long flags;
1419
1420         ASSERT(skc->skc_magic == SKC_MAGIC);
1421         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1422         atomic_inc(&skc->skc_ref);
1423
1424         /*
1425          * Run the destructor
1426          */
1427         if (skc->skc_dtor)
1428                 skc->skc_dtor(obj, skc->skc_private);
1429
1430         /*
1431          * Free the object from the Linux underlying Linux slab.
1432          */
1433         if (skc->skc_flags & KMC_SLAB) {
1434                 kmem_cache_free(skc->skc_linux_cache, obj);
1435                 goto out;
1436         }
1437
1438         /*
1439          * Only virtual slabs may have emergency objects and these objects
1440          * are guaranteed to have physical addresses.  They must be removed
1441          * from the tree of emergency objects and the freed.
1442          */
1443         if ((skc->skc_flags & KMC_VMEM) && !is_vmalloc_addr(obj)) {
1444                 spl_emergency_free(skc, obj);
1445                 goto out;
1446         }
1447
1448         local_irq_save(flags);
1449
1450         /*
1451          * Safe to update per-cpu structure without lock, but
1452          * no remote memory allocation tracking is being performed
1453          * it is entirely possible to allocate an object from one
1454          * CPU cache and return it to another.
1455          */
1456         skm = skc->skc_mag[smp_processor_id()];
1457         ASSERT(skm->skm_magic == SKM_MAGIC);
1458
1459         /* Per-CPU cache full, flush it to make space */
1460         if (unlikely(skm->skm_avail >= skm->skm_size))
1461                 spl_cache_flush(skc, skm, skm->skm_refill);
1462
1463         /* Available space in cache, use it */
1464         skm->skm_objs[skm->skm_avail++] = obj;
1465
1466         local_irq_restore(flags);
1467 out:
1468         atomic_dec(&skc->skc_ref);
1469 }
1470 EXPORT_SYMBOL(spl_kmem_cache_free);
1471
1472 /*
1473  * The generic shrinker function for all caches.  Under Linux a shrinker
1474  * may not be tightly coupled with a slab cache.  In fact Linux always
1475  * systematically tries calling all registered shrinker callbacks which
1476  * report that they contain unused objects.  Because of this we only
1477  * register one shrinker function in the shim layer for all slab caches.
1478  * We always attempt to shrink all caches when this generic shrinker
1479  * is called.
1480  *
1481  * If sc->nr_to_scan is zero, the caller is requesting a query of the
1482  * number of objects which can potentially be freed.  If it is nonzero,
1483  * the request is to free that many objects.
1484  *
1485  * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks
1486  * in struct shrinker and also require the shrinker to return the number
1487  * of objects freed.
1488  *
1489  * Older kernels require the shrinker to return the number of freeable
1490  * objects following the freeing of nr_to_free.
1491  *
1492  * Linux semantics differ from those under Solaris, which are to
1493  * free all available objects which may (and probably will) be more
1494  * objects than the requested nr_to_scan.
1495  */
1496 static spl_shrinker_t
1497 __spl_kmem_cache_generic_shrinker(struct shrinker *shrink,
1498     struct shrink_control *sc)
1499 {
1500         spl_kmem_cache_t *skc;
1501         int alloc = 0;
1502
1503         down_read(&spl_kmem_cache_sem);
1504         list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
1505                 if (sc->nr_to_scan) {
1506 #ifdef HAVE_SPLIT_SHRINKER_CALLBACK
1507                         uint64_t oldalloc = skc->skc_obj_alloc;
1508                         spl_kmem_cache_reap_now(skc,
1509                             MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1));
1510                         if (oldalloc > skc->skc_obj_alloc)
1511                                 alloc += oldalloc - skc->skc_obj_alloc;
1512 #else
1513                         spl_kmem_cache_reap_now(skc,
1514                             MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1));
1515                         alloc += skc->skc_obj_alloc;
1516 #endif /* HAVE_SPLIT_SHRINKER_CALLBACK */
1517                 } else {
1518                         /* Request to query number of freeable objects */
1519                         alloc += skc->skc_obj_alloc;
1520                 }
1521         }
1522         up_read(&spl_kmem_cache_sem);
1523
1524         /*
1525          * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass.
1526          * This functionality only exists to work around a rare issue where
1527          * shrink_slabs() is repeatedly invoked by many cores causing the
1528          * system to thrash.
1529          */
1530         if ((spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) && sc->nr_to_scan)
1531                 return (SHRINK_STOP);
1532
1533         return (MAX(alloc, 0));
1534 }
1535
1536 SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker);
1537
1538 /*
1539  * Call the registered reclaim function for a cache.  Depending on how
1540  * many and which objects are released it may simply repopulate the
1541  * local magazine which will then need to age-out.  Objects which cannot
1542  * fit in the magazine we will be released back to their slabs which will
1543  * also need to age out before being release.  This is all just best
1544  * effort and we do not want to thrash creating and destroying slabs.
1545  */
1546 void
1547 spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
1548 {
1549         ASSERT(skc->skc_magic == SKC_MAGIC);
1550         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1551
1552         atomic_inc(&skc->skc_ref);
1553
1554         /*
1555          * Execute the registered reclaim callback if it exists.  The
1556          * per-cpu caches will be drained when is set KMC_EXPIRE_MEM.
1557          */
1558         if (skc->skc_flags & KMC_SLAB) {
1559                 if (skc->skc_reclaim)
1560                         skc->skc_reclaim(skc->skc_private);
1561
1562                 if (spl_kmem_cache_expire & KMC_EXPIRE_MEM)
1563                         kmem_cache_shrink(skc->skc_linux_cache);
1564
1565                 goto out;
1566         }
1567
1568         /*
1569          * Prevent concurrent cache reaping when contended.
1570          */
1571         if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags))
1572                 goto out;
1573
1574         /*
1575          * When a reclaim function is available it may be invoked repeatedly
1576          * until at least a single slab can be freed.  This ensures that we
1577          * do free memory back to the system.  This helps minimize the chance
1578          * of an OOM event when the bulk of memory is used by the slab.
1579          *
1580          * When free slabs are already available the reclaim callback will be
1581          * skipped.  Additionally, if no forward progress is detected despite
1582          * a reclaim function the cache will be skipped to avoid deadlock.
1583          *
1584          * Longer term this would be the correct place to add the code which
1585          * repacks the slabs in order minimize fragmentation.
1586          */
1587         if (skc->skc_reclaim) {
1588                 uint64_t objects = UINT64_MAX;
1589                 int do_reclaim;
1590
1591                 do {
1592                         spin_lock(&skc->skc_lock);
1593                         do_reclaim =
1594                             (skc->skc_slab_total > 0) &&
1595                             ((skc->skc_slab_total-skc->skc_slab_alloc) == 0) &&
1596                             (skc->skc_obj_alloc < objects);
1597
1598                         objects = skc->skc_obj_alloc;
1599                         spin_unlock(&skc->skc_lock);
1600
1601                         if (do_reclaim)
1602                                 skc->skc_reclaim(skc->skc_private);
1603
1604                 } while (do_reclaim);
1605         }
1606
1607         /* Reclaim from the magazine then the slabs ignoring age and delay. */
1608         if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) {
1609                 spl_kmem_magazine_t *skm;
1610                 unsigned long irq_flags;
1611
1612                 local_irq_save(irq_flags);
1613                 skm = skc->skc_mag[smp_processor_id()];
1614                 spl_cache_flush(skc, skm, skm->skm_avail);
1615                 local_irq_restore(irq_flags);
1616         }
1617
1618         spl_slab_reclaim(skc, count, 1);
1619         clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
1620         smp_wmb();
1621         wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
1622 out:
1623         atomic_dec(&skc->skc_ref);
1624 }
1625 EXPORT_SYMBOL(spl_kmem_cache_reap_now);
1626
1627 /*
1628  * Reap all free slabs from all registered caches.
1629  */
1630 void
1631 spl_kmem_reap(void)
1632 {
1633         struct shrink_control sc;
1634
1635         sc.nr_to_scan = KMC_REAP_CHUNK;
1636         sc.gfp_mask = GFP_KERNEL;
1637
1638         (void) __spl_kmem_cache_generic_shrinker(NULL, &sc);
1639 }
1640 EXPORT_SYMBOL(spl_kmem_reap);
1641
1642 int
1643 spl_kmem_cache_init(void)
1644 {
1645         init_rwsem(&spl_kmem_cache_sem);
1646         INIT_LIST_HEAD(&spl_kmem_cache_list);
1647         spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
1648             1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE);
1649         spl_register_shrinker(&spl_kmem_cache_shrinker);
1650
1651         return (0);
1652 }
1653
/*
 * Tear down the global cache state set up by spl_kmem_cache_init().
 */
void
spl_kmem_cache_fini(void)
{
        /* Unregister the shrinker before destroying the cache taskq. */
        spl_unregister_shrinker(&spl_kmem_cache_shrinker);
        taskq_destroy(spl_kmem_cache_taskq);
}