module/spl/spl-kmem.c

   1 /*****************************************************************************\
   2  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
   3  *  Copyright (C) 2007 The Regents of the University of California.
   4  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
   5  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
   6  *  UCRL-CODE-235197
   7  *
   8  *  This file is part of the SPL, Solaris Porting Layer.
   9  *  For details, see <http://github.com/behlendorf/spl/>.
  10  *
  11  *  The SPL is free software; you can redistribute it and/or modify it
  12  *  under the terms of the GNU General Public License as published by the
  13  *  Free Software Foundation; either version 2 of the License, or (at your
  14  *  option) any later version.
  15  *
  16  *  The SPL is distributed in the hope that it will be useful, but WITHOUT
  17  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  18  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  19  *  for more details.
  20  *
  21  *  You should have received a copy of the GNU General Public License along
  22  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  23  *****************************************************************************
  24  *  Solaris Porting Layer (SPL) Kmem Implementation.
  25 \*****************************************************************************/
  26
  27 #include <sys/kmem.h>
  28
  29 #ifdef DEBUG_SUBSYSTEM
  30 # undef DEBUG_SUBSYSTEM
  31 #endif
  32
  33 #define DEBUG_SUBSYSTEM S_KMEM
  34
  35 /*
  36  * The minimum amount of memory, measured in pages, to be free at all
  37  * times on the system.  This is similar to Linux's zone->pages_min
  38  * multiplied by the number of zones and is sized based on that.
  39  */
  40 pgcnt_t minfree = 0;
  41 EXPORT_SYMBOL(minfree);
  42
  43 /*
  44  * The desired amount of memory, measured in pages, to be free at all
  45  * times on the system.  This is similar to Linux's zone->pages_low
  46  * multiplied by the number of zones and is sized based on that.
  47  * Assuming all zones are being used roughly equally, when we drop
  48  * below this threshold async page reclamation is triggered.
  49  */
  50 pgcnt_t desfree = 0;
  51 EXPORT_SYMBOL(desfree);
  52
  53 /*
  54  * When above this amount of memory, measured in pages, the system is
  55  * determined to have enough free memory.  This is similar to Linux's
  56  * zone->pages_high multiplied by the number of zones and is sized based
  57  * on that.  Assuming all zones are being used roughly equally, when
  58  * async page reclamation reaches this threshold it stops.
  59  */
  60 pgcnt_t lotsfree = 0;
  61 EXPORT_SYMBOL(lotsfree);
  62
  63 /* Unused, always 0 in this implementation */
  64 pgcnt_t needfree = 0;
  65 EXPORT_SYMBOL(needfree);
  66
     /*
      * Exported page-count tunables, both defined as 0 here.
      * NOTE(review): presumably these mirror the Solaris swapfs tunables
      * of the same names -- confirm against the callers.
      */
  67 pgcnt_t swapfs_minfree = 0;
  68 EXPORT_SYMBOL(swapfs_minfree);
  69
  70 pgcnt_t swapfs_reserve = 0;
  71 EXPORT_SYMBOL(swapfs_reserve);
  72
     /*
      * Exported vmem arena handles, all defined as NULL here.  vmem_size()
      * in this file asserts that the arena passed to it is NULL.
      */
  73 vmem_t *heap_arena = NULL;
  74 EXPORT_SYMBOL(heap_arena);
  75
  76 vmem_t *zio_alloc_arena = NULL;
  77 EXPORT_SYMBOL(zio_alloc_arena);
  78
  79 vmem_t *zio_arena = NULL;
  80 EXPORT_SYMBOL(zio_arena);
  81
  82 #ifndef HAVE_GET_VMALLOC_INFO
     /*
      * Fallback pointers for kernel helpers.  Each pointer below is only
      * defined when the matching HAVE_* macro is absent, starts out as
      * SYMBOL_POISON, and is exported.
      * NOTE(review): presumably each one is replaced with the real symbol
      * address during module initialization -- confirm in the init code.
      */
  83 get_vmalloc_info_t get_vmalloc_info_fn = SYMBOL_POISON;
  84 EXPORT_SYMBOL(get_vmalloc_info_fn);
  85 #endif /* HAVE_GET_VMALLOC_INFO */
  86
  87 #ifdef HAVE_PGDAT_HELPERS
     /* Per-helper fallbacks used when the pgdat helper interface exists */
  88 # ifndef HAVE_FIRST_ONLINE_PGDAT
  89 first_online_pgdat_t first_online_pgdat_fn = SYMBOL_POISON;
  90 EXPORT_SYMBOL(first_online_pgdat_fn);
  91 # endif /* HAVE_FIRST_ONLINE_PGDAT */
  92
  93 # ifndef HAVE_NEXT_ONLINE_PGDAT
  94 next_online_pgdat_t next_online_pgdat_fn = SYMBOL_POISON;
  95 EXPORT_SYMBOL(next_online_pgdat_fn);
  96 # endif /* HAVE_NEXT_ONLINE_PGDAT */
  97
  98 # ifndef HAVE_NEXT_ZONE
  99 next_zone_t next_zone_fn = SYMBOL_POISON;
 100 EXPORT_SYMBOL(next_zone_fn);
 101 # endif /* HAVE_NEXT_ZONE */
 102
 103 #else /* HAVE_PGDAT_HELPERS */
 104
     /* Without the helpers only a pointer to the pgdat list is kept */
 105 # ifndef HAVE_PGDAT_LIST
 106 struct pglist_data *pgdat_list_addr = SYMBOL_POISON;
 107 EXPORT_SYMBOL(pgdat_list_addr);
 108 # endif /* HAVE_PGDAT_LIST */
 109
 110 #endif /* HAVE_PGDAT_HELPERS */
 111
 112 #ifdef NEED_GET_ZONE_COUNTS
 113 # ifndef HAVE_GET_ZONE_COUNTS
 114 get_zone_counts_t get_zone_counts_fn = SYMBOL_POISON;
 115 EXPORT_SYMBOL(get_zone_counts_fn);
 116 # endif /* HAVE_GET_ZONE_COUNTS */
 117
 118 unsigned long
 119 spl_global_page_state(spl_zone_stat_item_t item)
 120 {
 121         unsigned long active;
 122         unsigned long inactive;
 123         unsigned long free;
 124
 125         get_zone_counts(&active, &inactive, &free);
 126         switch (item) {
 127         case SPL_NR_FREE_PAGES: return free;
 128         case SPL_NR_INACTIVE:   return inactive;
 129         case SPL_NR_ACTIVE:     return active;
 130         default:                ASSERT(0); /* Unsupported */
 131         }
 132
 133         return 0;
 134 }
 135 #else
 136 # ifdef HAVE_GLOBAL_PAGE_STATE
 137 unsigned long
 138 spl_global_page_state(spl_zone_stat_item_t item)
 139 {
 140         unsigned long pages = 0;
 141
 142         switch (item) {
 143         case SPL_NR_FREE_PAGES:
 144 #  ifdef HAVE_ZONE_STAT_ITEM_NR_FREE_PAGES
 145                 pages += global_page_state(NR_FREE_PAGES);
 146 #  endif
 147                 break;
 148         case SPL_NR_INACTIVE:
 149 #  ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE
 150                 pages += global_page_state(NR_INACTIVE);
 151 #  endif
 152 #  ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE_ANON
 153                 pages += global_page_state(NR_INACTIVE_ANON);
 154 #  endif
 155 #  ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE_FILE
 156                 pages += global_page_state(NR_INACTIVE_FILE);
 157 #  endif
 158                 break;
 159         case SPL_NR_ACTIVE:
 160 #  ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE
 161                 pages += global_page_state(NR_ACTIVE);
 162 #  endif
 163 #  ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE_ANON
 164                 pages += global_page_state(NR_ACTIVE_ANON);
 165 #  endif
 166 #  ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE_FILE
 167                 pages += global_page_state(NR_ACTIVE_FILE);
 168 #  endif
 169                 break;
 170         default:
 171                 ASSERT(0); /* Unsupported */
 172         }
 173
 174         return pages;
 175 }
 176 # else
 177 #  error "Both global_page_state() and get_zone_counts() unavailable"
 178 # endif /* HAVE_GLOBAL_PAGE_STATE */
 179 #endif /* NEED_GET_ZONE_COUNTS */
 180 EXPORT_SYMBOL(spl_global_page_state);
 181
 182 pgcnt_t
 183 spl_kmem_availrmem(void)
 184 {
 185         /* The amount of easily available memory */
 186         return (spl_global_page_state(SPL_NR_FREE_PAGES) +
 187                 spl_global_page_state(SPL_NR_INACTIVE));
 188 }
 189 EXPORT_SYMBOL(spl_kmem_availrmem);
 190
 191 size_t
 192 vmem_size(vmem_t *vmp, int typemask)
 193 {
 194         struct vmalloc_info vmi;
 195         size_t size = 0;
 196
 197         ASSERT(vmp == NULL);
 198         ASSERT(typemask & (VMEM_ALLOC | VMEM_FREE));
 199
 200         get_vmalloc_info(&vmi);
 201         if (typemask & VMEM_ALLOC)
 202                 size += (size_t)vmi.used;
 203
 204         if (typemask & VMEM_FREE)
 205                 size += (size_t)(VMALLOC_TOTAL - vmi.used);
 206
 207         return size;
 208 }
 209 EXPORT_SYMBOL(vmem_size);
 210
 211 int
 212 kmem_debugging(void)
 213 {
 214         return 0;
 215 }
 216 EXPORT_SYMBOL(kmem_debugging);
 217
 218 #ifndef HAVE_KVASPRINTF
 219 /* Simplified asprintf. */
 220 char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
 221 {
 222         unsigned int len;
 223         char *p;
 224         va_list aq;
 225
 226         va_copy(aq, ap);
 227         len = vsnprintf(NULL, 0, fmt, aq);
 228         va_end(aq);
 229
 230         p = kmalloc(len+1, gfp);
 231         if (!p)
 232                 return NULL;
 233
 234         vsnprintf(p, len+1, fmt, ap);
 235
 236         return p;
 237 }
 238 EXPORT_SYMBOL(kvasprintf);
 239 #endif /* HAVE_KVASPRINTF */
 240
 241 char *
 242 kmem_asprintf(const char *fmt, ...)
 243 {
 244         va_list args;
 245         char *ptr;
 246
 247         va_start(args, fmt);
 248         do {
 249                 ptr = kvasprintf(GFP_KERNEL, fmt, args);
 250         } while (ptr == NULL);
 251         va_end(args);
 252
 253         return ptr;
 254 }
 255 EXPORT_SYMBOL(kmem_asprintf);
 256
 257 /*
 258  * Memory allocation interfaces and debugging for basic kmem_*
 259  * and vmem_* style memory allocation.  When DEBUG_KMEM is enabled
 260  * the SPL will keep track of the total memory allocated, and
 261  * report any memory leaked when the module is unloaded.
 262  */
 263 #ifdef DEBUG_KMEM
 264
 265 /*
      * Shim layer memory accounting.  The *_alloc_used counters are atomic
      * (atomic64_t when HAVE_ATOMIC64_T is defined, atomic_t otherwise).
      * The *_alloc_max high-water marks are plain unsigned long long values
      * which the allocation paths in this file update without holding a
      * lock.
      */
 266 # ifdef HAVE_ATOMIC64_T
 267 atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
 268 unsigned long long kmem_alloc_max = 0;
 269 atomic64_t vmem_alloc_used = ATOMIC64_INIT(0);
 270 unsigned long long vmem_alloc_max = 0;
 271 # else
 272 atomic_t kmem_alloc_used = ATOMIC_INIT(0);
 273 unsigned long long kmem_alloc_max = 0;
 274 atomic_t vmem_alloc_used = ATOMIC_INIT(0);
 275 unsigned long long vmem_alloc_max = 0;
 276 # endif /* HAVE_ATOMIC64_T */
 277
 278 EXPORT_SYMBOL(kmem_alloc_used);
 279 EXPORT_SYMBOL(kmem_alloc_max);
 280 EXPORT_SYMBOL(vmem_alloc_used);
 281 EXPORT_SYMBOL(vmem_alloc_max);
 282
 283 /* When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
 284  * but also the location of every alloc and free.  When the SPL module is
 285  * unloaded a list of all leaked addresses and where they were allocated
 286  * will be dumped to the console.  Enabling this feature has a significant
 287  * impact on performance but it makes finding memory leaks straightforward.
 288  *
 289  * Not surprisingly with debugging enabled the xmem_locks are very highly
 290  * contended particularly on xfree().  If we want to run with this detailed
 291  * debugging enabled for anything other than debugging we need to minimize
 292  * the contention by moving to a lock per xmem_table entry model.
 293  */
 294 # ifdef DEBUG_KMEM_TRACKING
 295
     /* Hash tables have 2^HASH_BITS buckets, indexed via hash_ptr() */
 296 #  define KMEM_HASH_BITS          10
 297 #  define KMEM_TABLE_SIZE         (1 << KMEM_HASH_BITS)
 298
 299 #  define VMEM_HASH_BITS          10
 300 #  define VMEM_TABLE_SIZE         (1 << VMEM_HASH_BITS)
 301
     /* One tracking record per live allocation (kmem and vmem alike) */
 302 typedef struct kmem_debug {
 303         struct hlist_node kd_hlist;     /* Hash node linkage */
 304         struct list_head kd_list;       /* List of all allocations */
 305         void *kd_addr;                  /* Allocation pointer */
 306         size_t kd_size;                 /* Allocation size */
 307         const char *kd_func;            /* Allocation function */
 308         int kd_line;                    /* Allocation line */
 309 } kmem_debug_t;
 310
     /*
      * kmem_lock is held by the tracking code in this file whenever it
      * modifies kmem_table or kmem_list; vmem_lock plays the same role
      * for vmem_table and vmem_list.
      */
 311 spinlock_t kmem_lock;
 312 struct hlist_head kmem_table[KMEM_TABLE_SIZE];
 313 struct list_head kmem_list;
 314
 315 spinlock_t vmem_lock;
 316 struct hlist_head vmem_table[VMEM_TABLE_SIZE];
 317 struct list_head vmem_list;
 318
 319 EXPORT_SYMBOL(kmem_lock);
 320 EXPORT_SYMBOL(kmem_table);
 321 EXPORT_SYMBOL(kmem_list);
 322
 323 EXPORT_SYMBOL(vmem_lock);
 324 EXPORT_SYMBOL(vmem_table);
 325 EXPORT_SYMBOL(vmem_list);
 326 # endif /* DEBUG_KMEM_TRACKING */
 327 #endif /* DEBUG_KMEM */
 328
 329 /*
 330  * Slab allocation interfaces
 331  *
 332  * While the Linux slab implementation was inspired by the Solaris
 333  * implementation I cannot use it to emulate the Solaris APIs.  I
 334  * require two features which are not provided by the Linux slab.
 335  *
 336  * 1) Constructors AND destructors.  Recent versions of the Linux
 337  *    kernel have removed support for destructors.  This is a deal
 338  *    breaker for the SPL which contains particularly expensive
 339  *    initializers for mutex's, condition variables, etc.  We also
 340  *    require a minimal level of cleanup for these data types unlike
 341  *    many Linux data types which do not need to be explicitly destroyed.
 342  *
 343  * 2) Virtual address space backed slab.  Callers of the Solaris slab
 344  *    expect it to work well for both small and very large allocations.
 345  *    Because of memory fragmentation the Linux slab which is backed
 346  *    by kmalloc'ed memory performs very badly when confronted with
 347  *    large numbers of large allocations.  Basing the slab on the
 348  *    virtual address space removes the need for contiguous pages
 349  *    and greatly improves performance for large allocations.
 350  *
 351  * For these reasons, the SPL has its own slab implementation with
 352  * the needed features.  It is not as highly optimized as either the
 353  * Solaris or Linux slabs, but it should get me most of what is
 354  * needed until it can be optimized or obsoleted by another approach.
 355  *
 356  * One serious concern I do have about this method is the relatively
 357  * small virtual address space on 32bit arches.  This will seriously
 358  * constrain the size of the slab caches and their performance.
 359  *
 360  * XXX: Improve the partial slab list by carefully maintaining a
 361  *      strict ordering of fullest to emptiest slabs based on
 362  *      the slab reference count.  This guarantees that when freeing
 363  *      slabs back to the system we need only linearly traverse the
 364  *      last N slabs in the list to discover all the freeable slabs.
 365  *
 366  * XXX: NUMA awareness for optionally allocating memory close to a
 367  *      particular core.  This can be advantageous if you know the slab
 368  *      object will be short lived and primarily accessed from one core.
 369  *
 370  * XXX: Slab coloring may also yield performance improvements and would
 371  *      be desirable to implement.
 372  */
 373
 374 struct list_head spl_kmem_cache_list;   /* List of caches */
 375 struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
 376
     /* Forward declaration; the definition is not in this part of the file */
 377 static int spl_cache_flush(spl_kmem_cache_t *skc,
 378                            spl_kmem_magazine_t *skm, int flush);
 379
     /*
      * Shrinker handle.  With HAVE_SET_SHRINKER only a pointer is kept;
      * otherwise a static struct shrinker is defined whose .shrink callback
      * is spl_kmem_cache_generic_shrinker() and whose .seeks value is
      * KMC_DEFAULT_SEEKS.
      * NOTE(review): presumably the pointer form is filled in by
      * set_shrinker() during initialization -- confirm.
      */
 380 #ifdef HAVE_SET_SHRINKER
 381 static struct shrinker *spl_kmem_cache_shrinker;
 382 #else
 383 static int spl_kmem_cache_generic_shrinker(int nr_to_scan,
 384                                            unsigned int gfp_mask);
 385 static struct shrinker spl_kmem_cache_shrinker = {
 386         .shrink = spl_kmem_cache_generic_shrinker,
 387         .seeks = KMC_DEFAULT_SEEKS,
 388 };
 389 #endif /* HAVE_SET_SHRINKER */
 390
 391 #ifdef DEBUG_KMEM
 392 # ifdef DEBUG_KMEM_TRACKING
 393
 394 static kmem_debug_t *
 395 kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits,
 396                 void *addr)
 397 {
 398         struct hlist_head *head;
 399         struct hlist_node *node;
 400         struct kmem_debug *p;
 401         unsigned long flags;
 402         ENTRY;
 403
 404         spin_lock_irqsave(lock, flags);
 405
 406         head = &table[hash_ptr(addr, bits)];
 407         hlist_for_each_entry_rcu(p, node, head, kd_hlist) {
 408                 if (p->kd_addr == addr) {
 409                         hlist_del_init(&p->kd_hlist);
 410                         list_del_init(&p->kd_list);
 411                         spin_unlock_irqrestore(lock, flags);
 412                         return p;
 413                 }
 414         }
 415
 416         spin_unlock_irqrestore(lock, flags);
 417
 418         RETURN(NULL);
 419 }
 420
 421 void *
 422 kmem_alloc_track(size_t size, int flags, const char *func, int line,
 423     int node_alloc, int node)
 424 {
 425         void *ptr = NULL;
 426         kmem_debug_t *dptr;
 427         unsigned long irq_flags;
 428         ENTRY;
 429
 430         dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
 431             flags & ~__GFP_ZERO);
 432
 433         if (dptr == NULL) {
 434                 CWARN("kmem_alloc(%ld, 0x%x) debug failed\n",
 435                     sizeof(kmem_debug_t), flags);
 436         } else {
 437                 /* Marked unlikely because we should never be doing this,
 438                  * we tolerate to up 2 pages but a single page is best.   */
 439                 if (unlikely((size > PAGE_SIZE*2) && !(flags & KM_NODEBUG))) {
 440                         CWARN("Large kmem_alloc(%llu, 0x%x) (%lld/%llu)\n",
 441                             (unsigned long long) size, flags,
 442                             kmem_alloc_used_read(), kmem_alloc_max);
 443                         spl_debug_dumpstack(NULL);
 444                 }
 445
 446                 /* We use kstrdup() below because the string pointed to by
 447                  * __FUNCTION__ might not be available by the time we want
 448                  * to print it since the module might have been unloaded. */
 449                 dptr->kd_func = kstrdup(func, flags & ~__GFP_ZERO);
 450                 if (unlikely(dptr->kd_func == NULL)) {
 451                         kfree(dptr);
 452                         CWARN("kstrdup() failed in kmem_alloc(%llu, 0x%x) "
 453                             "(%lld/%llu)\n", (unsigned long long) size, flags,
 454                             kmem_alloc_used_read(), kmem_alloc_max);
 455                         goto out;
 456                 }
 457
 458                 /* Use the correct allocator */
 459                 if (node_alloc) {
 460                         ASSERT(!(flags & __GFP_ZERO));
 461                         ptr = kmalloc_node_nofail(size, flags, node);
 462                 } else if (flags & __GFP_ZERO) {
 463                         ptr = kzalloc_nofail(size, flags & ~__GFP_ZERO);
 464                 } else {
 465                         ptr = kmalloc_nofail(size, flags);
 466                 }
 467
 468                 if (unlikely(ptr == NULL)) {
 469                         kfree(dptr->kd_func);
 470                         kfree(dptr);
 471                         CWARN("kmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
 472                             (unsigned long long) size, flags,
 473                             kmem_alloc_used_read(), kmem_alloc_max);
 474                         goto out;
 475                 }
 476
 477                 kmem_alloc_used_add(size);
 478                 if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
 479                         kmem_alloc_max = kmem_alloc_used_read();
 480
 481                 INIT_HLIST_NODE(&dptr->kd_hlist);
 482                 INIT_LIST_HEAD(&dptr->kd_list);
 483
 484                 dptr->kd_addr = ptr;
 485                 dptr->kd_size = size;
 486                 dptr->kd_line = line;
 487
 488                 spin_lock_irqsave(&kmem_lock, irq_flags);
 489                 hlist_add_head_rcu(&dptr->kd_hlist,
 490                     &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
 491                 list_add_tail(&dptr->kd_list, &kmem_list);
 492                 spin_unlock_irqrestore(&kmem_lock, irq_flags);
 493
 494                 CDEBUG_LIMIT(D_INFO, "kmem_alloc(%llu, 0x%x) = %p "
 495                     "(%lld/%llu)\n", (unsigned long long) size, flags,
 496                     ptr, kmem_alloc_used_read(),
 497                     kmem_alloc_max);
 498         }
 499 out:
 500         RETURN(ptr);
 501 }
 502 EXPORT_SYMBOL(kmem_alloc_track);
 503
 504 void
 505 kmem_free_track(void *ptr, size_t size)
 506 {
 507         kmem_debug_t *dptr;
 508         ENTRY;
 509
 510         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 511             (unsigned long long) size);
 512
 513         dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
 514
 515         ASSERT(dptr); /* Must exist in hash due to kmem_alloc() */
 516
 517         /* Size must match */
 518         ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
 519             "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
 520             (unsigned long long) size, dptr->kd_func, dptr->kd_line);
 521
 522         kmem_alloc_used_sub(size);
 523         CDEBUG_LIMIT(D_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 524             (unsigned long long) size, kmem_alloc_used_read(),
 525             kmem_alloc_max);
 526
 527         kfree(dptr->kd_func);
 528
 529         memset(dptr, 0x5a, sizeof(kmem_debug_t));
 530         kfree(dptr);
 531
 532         memset(ptr, 0x5a, size);
 533         kfree(ptr);
 534
 535         EXIT;
 536 }
 537 EXPORT_SYMBOL(kmem_free_track);
 538
 539 void *
 540 vmem_alloc_track(size_t size, int flags, const char *func, int line)
 541 {
 542         void *ptr = NULL;
 543         kmem_debug_t *dptr;
 544         unsigned long irq_flags;
 545         ENTRY;
 546
 547         ASSERT(flags & KM_SLEEP);
 548
 549         dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
 550             flags & ~__GFP_ZERO);
 551         if (dptr == NULL) {
 552                 CWARN("vmem_alloc(%ld, 0x%x) debug failed\n",
 553                     sizeof(kmem_debug_t), flags);
 554         } else {
 555                 /* We use kstrdup() below because the string pointed to by
 556                  * __FUNCTION__ might not be available by the time we want
 557                  * to print it, since the module might have been unloaded. */
 558                 dptr->kd_func = kstrdup(func, flags & ~__GFP_ZERO);
 559                 if (unlikely(dptr->kd_func == NULL)) {
 560                         kfree(dptr);
 561                         CWARN("kstrdup() failed in vmem_alloc(%llu, 0x%x) "
 562                             "(%lld/%llu)\n", (unsigned long long) size, flags,
 563                             vmem_alloc_used_read(), vmem_alloc_max);
 564                         goto out;
 565                 }
 566
 567                 ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO,
 568                     PAGE_KERNEL);
 569
 570                 if (unlikely(ptr == NULL)) {
 571                         kfree(dptr->kd_func);
 572                         kfree(dptr);
 573                         CWARN("vmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
 574                             (unsigned long long) size, flags,
 575                             vmem_alloc_used_read(), vmem_alloc_max);
 576                         goto out;
 577                 }
 578
 579                 if (flags & __GFP_ZERO)
 580                         memset(ptr, 0, size);
 581
 582                 vmem_alloc_used_add(size);
 583                 if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
 584                         vmem_alloc_max = vmem_alloc_used_read();
 585
 586                 INIT_HLIST_NODE(&dptr->kd_hlist);
 587                 INIT_LIST_HEAD(&dptr->kd_list);
 588
 589                 dptr->kd_addr = ptr;
 590                 dptr->kd_size = size;
 591                 dptr->kd_line = line;
 592
 593                 spin_lock_irqsave(&vmem_lock, irq_flags);
 594                 hlist_add_head_rcu(&dptr->kd_hlist,
 595                     &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]);
 596                 list_add_tail(&dptr->kd_list, &vmem_list);
 597                 spin_unlock_irqrestore(&vmem_lock, irq_flags);
 598
 599                 CDEBUG_LIMIT(D_INFO, "vmem_alloc(%llu, 0x%x) = %p "
 600                     "(%lld/%llu)\n", (unsigned long long) size, flags,
 601                     ptr, vmem_alloc_used_read(),
 602                     vmem_alloc_max);
 603         }
 604 out:
 605         RETURN(ptr);
 606 }
 607 EXPORT_SYMBOL(vmem_alloc_track);
 608
 609 void
 610 vmem_free_track(void *ptr, size_t size)
 611 {
 612         kmem_debug_t *dptr;
 613         ENTRY;
 614
 615         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 616             (unsigned long long) size);
 617
 618         dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr);
 619         ASSERT(dptr); /* Must exist in hash due to vmem_alloc() */
 620
 621         /* Size must match */
 622         ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
 623             "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
 624             (unsigned long long) size, dptr->kd_func, dptr->kd_line);
 625
 626         vmem_alloc_used_sub(size);
 627         CDEBUG_LIMIT(D_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 628             (unsigned long long) size, vmem_alloc_used_read(),
 629             vmem_alloc_max);
 630
 631         kfree(dptr->kd_func);
 632
 633         memset(dptr, 0x5a, sizeof(kmem_debug_t));
 634         kfree(dptr);
 635
 636         memset(ptr, 0x5a, size);
 637         vfree(ptr);
 638
 639         EXIT;
 640 }
 641 EXPORT_SYMBOL(vmem_free_track);
 642
 643 # else /* DEBUG_KMEM_TRACKING */
 644
 645 void *
 646 kmem_alloc_debug(size_t size, int flags, const char *func, int line,
 647     int node_alloc, int node)
 648 {
 649         void *ptr;
 650         ENTRY;
 651
 652         /* Marked unlikely because we should never be doing this,
 653          * we tolerate to up 2 pages but a single page is best.   */
 654         if (unlikely((size > PAGE_SIZE * 2) && !(flags & KM_NODEBUG))) {
 655                 CWARN("Large kmem_alloc(%llu, 0x%x) (%lld/%llu)\n",
 656                     (unsigned long long) size, flags,
 657                     kmem_alloc_used_read(), kmem_alloc_max);
 658                 spl_debug_dumpstack(NULL);
 659         }
 660
 661         /* Use the correct allocator */
 662         if (node_alloc) {
 663                 ASSERT(!(flags & __GFP_ZERO));
 664                 ptr = kmalloc_node_nofail(size, flags, node);
 665         } else if (flags & __GFP_ZERO) {
 666                 ptr = kzalloc_nofail(size, flags & (~__GFP_ZERO));
 667         } else {
 668                 ptr = kmalloc_nofail(size, flags);
 669         }
 670
 671         if (ptr == NULL) {
 672                 CWARN("kmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
 673                     (unsigned long long) size, flags,
 674                     kmem_alloc_used_read(), kmem_alloc_max);
 675         } else {
 676                 kmem_alloc_used_add(size);
 677                 if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
 678                         kmem_alloc_max = kmem_alloc_used_read();
 679
 680                 CDEBUG_LIMIT(D_INFO, "kmem_alloc(%llu, 0x%x) = %p "
 681                        "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
 682                        kmem_alloc_used_read(), kmem_alloc_max);
 683         }
 684         RETURN(ptr);
 685 }
 686 EXPORT_SYMBOL(kmem_alloc_debug);
 687
 688 void
 689 kmem_free_debug(void *ptr, size_t size)
 690 {
 691         ENTRY;
 692
 693         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 694             (unsigned long long) size);
 695
 696         kmem_alloc_used_sub(size);
 697         CDEBUG_LIMIT(D_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 698             (unsigned long long) size, kmem_alloc_used_read(),
 699             kmem_alloc_max);
 700
 701         memset(ptr, 0x5a, size);
 702         kfree(ptr);
 703
 704         EXIT;
 705 }
 706 EXPORT_SYMBOL(kmem_free_debug);
 707
 708 void *
 709 vmem_alloc_debug(size_t size, int flags, const char *func, int line)
 710 {
 711         void *ptr;
 712         ENTRY;
 713
 714         ASSERT(flags & KM_SLEEP);
 715
 716         ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO,
 717             PAGE_KERNEL);
 718         if (ptr == NULL) {
 719                 CWARN("vmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
 720                     (unsigned long long) size, flags,
 721                     vmem_alloc_used_read(), vmem_alloc_max);
 722         } else {
 723                 if (flags & __GFP_ZERO)
 724                         memset(ptr, 0, size);
 725
 726                 vmem_alloc_used_add(size);
 727                 if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
 728                         vmem_alloc_max = vmem_alloc_used_read();
 729
 730                 CDEBUG_LIMIT(D_INFO, "vmem_alloc(%llu, 0x%x) = %p "
 731                     "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
 732                     vmem_alloc_used_read(), vmem_alloc_max);
 733         }
 734
 735         RETURN(ptr);
 736 }
 737 EXPORT_SYMBOL(vmem_alloc_debug);
 738
 739 void
 740 vmem_free_debug(void *ptr, size_t size)
 741 {
 742         ENTRY;
 743
 744         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 745             (unsigned long long) size);
 746
 747         vmem_alloc_used_sub(size);
 748         CDEBUG_LIMIT(D_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 749             (unsigned long long) size, vmem_alloc_used_read(),
 750             vmem_alloc_max);
 751
 752         memset(ptr, 0x5a, size);
 753         vfree(ptr);
 754
 755         EXIT;
 756 }
 757 EXPORT_SYMBOL(vmem_free_debug);
 758
 759 # endif /* DEBUG_KMEM_TRACKING */
 760 #endif /* DEBUG_KMEM */
 761
 762 static void *
 763 kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
 764 {
 765         void *ptr;
 766
 767         ASSERT(ISP2(size));
 768
 769         if (skc->skc_flags & KMC_KMEM)
 770                 ptr = (void *)__get_free_pages(flags, get_order(size));
 771         else
 772                 ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
 773
 774         /* Resulting allocated memory will be page aligned */
 775         ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
 776
 777         return ptr;
 778 }
 779
 780 static void
 781 kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
 782 {
 783         ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
 784         ASSERT(ISP2(size));
 785
 786         if (skc->skc_flags & KMC_KMEM)
 787                 free_pages((unsigned long)ptr, get_order(size));
 788         else
 789                 vfree(ptr);
 790 }
 791
 792 /*
 793  * Required space for each aligned sks.
 794  */
 795 static inline uint32_t
 796 spl_sks_size(spl_kmem_cache_t *skc)
 797 {
 798         return P2ROUNDUP_TYPED(sizeof(spl_kmem_slab_t),
 799                skc->skc_obj_align, uint32_t);
 800 }
 801
 802 /*
 803  * Required space for each aligned object.
 804  */
 805 static inline uint32_t
 806 spl_obj_size(spl_kmem_cache_t *skc)
 807 {
 808         uint32_t align = skc->skc_obj_align;
 809
 810         return P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
 811                P2ROUNDUP_TYPED(sizeof(spl_kmem_obj_t), align, uint32_t);
 812 }
 813
 814 /*
 815  * Lookup the spl_kmem_object_t for an object given that object.
 816  */
 817 static inline spl_kmem_obj_t *
 818 spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
 819 {
 820         return obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
 821                skc->skc_obj_align, uint32_t);
 822 }
 823
 824 /*
 825  * Required space for each offslab object taking in to account alignment
 826  * restrictions and the power-of-two requirement of kv_alloc().
 827  */
 828 static inline uint32_t
 829 spl_offslab_size(spl_kmem_cache_t *skc)
 830 {
 831         return 1UL << (highbit(spl_obj_size(skc)) + 1);
 832 }
 833
 834 /*
 835  * It's important that we pack the spl_kmem_obj_t structure and the
 836  * actual objects in to one large address space to minimize the number
 837  * of calls to the allocator.  It is far better to do a few large
 838  * allocations and then subdivide it ourselves.  Now which allocator
 839  * we use requires balancing a few trade offs.
 840  *
 841  * For small objects we use kmem_alloc() because as long as you are
 842  * only requesting a small number of pages (ideally just one) it's cheap.
 843  * However, when you start requesting multiple pages with kmem_alloc()
 844  * it gets increasingly expensive since it requires contiguous pages.
 845  * For this reason we shift to vmem_alloc() for slabs of large objects
 846  * which removes the need for contiguous pages.  We do not use
 847  * vmem_alloc() in all cases because there is significant locking
 848  * overhead in __get_vm_area_node().  This function takes a single
 849  * global lock when acquiring an available virtual address range which
 850  * serializes all vmem_alloc()'s for all slab caches.  Using slightly
 851  * different allocation functions for small and large objects should
 852  * give us the best of both worlds.
 853  *
 854  * KMC_ONSLAB                       KMC_OFFSLAB
 855  *
 856  * +------------------------+       +-----------------+
 857  * | spl_kmem_slab_t --+-+  |       | spl_kmem_slab_t |---+-+
 858  * | skc_obj_size    <-+ |  |       +-----------------+   | |
 859  * | spl_kmem_obj_t      |  |                             | |
 860  * | skc_obj_size    <---+  |       +-----------------+   | |
 861  * | spl_kmem_obj_t      |  |       | skc_obj_size    | <-+ |
 862  * | ...                 v  |       | spl_kmem_obj_t  |     |
 863  * +------------------------+       +-----------------+     v
 864  */
 865 static spl_kmem_slab_t *
 866 spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
 867 {
 868         spl_kmem_slab_t *sks;
 869         spl_kmem_obj_t *sko, *n;
 870         void *base, *obj;
 871         uint32_t obj_size, offslab_size = 0;
 872         int i,  rc = 0;
 873
 874         base = kv_alloc(skc, skc->skc_slab_size, flags);
 875         if (base == NULL)
 876                 RETURN(NULL);
 877
 878         sks = (spl_kmem_slab_t *)base;
 879         sks->sks_magic = SKS_MAGIC;
 880         sks->sks_objs = skc->skc_slab_objs;
 881         sks->sks_age = jiffies;
 882         sks->sks_cache = skc;
 883         INIT_LIST_HEAD(&sks->sks_list);
 884         INIT_LIST_HEAD(&sks->sks_free_list);
 885         sks->sks_ref = 0;
 886         obj_size = spl_obj_size(skc);
 887
 888         if (skc->skc_flags * KMC_OFFSLAB)
 889                 offslab_size = spl_offslab_size(skc);
 890
 891         for (i = 0; i < sks->sks_objs; i++) {
 892                 if (skc->skc_flags & KMC_OFFSLAB) {
 893                         obj = kv_alloc(skc, offslab_size, flags);
 894                         if (!obj)
 895                                 GOTO(out, rc = -ENOMEM);
 896                 } else {
 897                         obj = base + spl_sks_size(skc) + (i * obj_size);
 898                 }
 899
 900                 ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
 901                 sko = spl_sko_from_obj(skc, obj);
 902                 sko->sko_addr = obj;
 903                 sko->sko_magic = SKO_MAGIC;
 904                 sko->sko_slab = sks;
 905                 INIT_LIST_HEAD(&sko->sko_list);
 906                 list_add_tail(&sko->sko_list, &sks->sks_free_list);
 907         }
 908
 909         list_for_each_entry(sko, &sks->sks_free_list, sko_list)
 910                 if (skc->skc_ctor)
 911                         skc->skc_ctor(sko->sko_addr, skc->skc_private, flags);
 912 out:
 913         if (rc) {
 914                 if (skc->skc_flags & KMC_OFFSLAB)
 915                         list_for_each_entry_safe(sko, n, &sks->sks_free_list,
 916                                                  sko_list)
 917                                 kv_free(skc, sko->sko_addr, offslab_size);
 918
 919                 kv_free(skc, base, skc->skc_slab_size);
 920                 sks = NULL;
 921         }
 922
 923         RETURN(sks);
 924 }
 925
 926 /*
 927  * Remove a slab from complete or partial list, it must be called with
 928  * the 'skc->skc_lock' held but the actual free must be performed
 929  * outside the lock to prevent deadlocking on vmem addresses.
 930  */
 931 static void
 932 spl_slab_free(spl_kmem_slab_t *sks,
 933               struct list_head *sks_list, struct list_head *sko_list)
 934 {
 935         spl_kmem_cache_t *skc;
 936         ENTRY;
 937
 938         ASSERT(sks->sks_magic == SKS_MAGIC);
 939         ASSERT(sks->sks_ref == 0);
 940
 941         skc = sks->sks_cache;
 942         ASSERT(skc->skc_magic == SKC_MAGIC);
 943         ASSERT(spin_is_locked(&skc->skc_lock));
 944
 945         /*
 946          * Update slab/objects counters in the cache, then remove the
 947          * slab from the skc->skc_partial_list.  Finally add the slab
 948          * and all its objects in to the private work lists where the
 949          * destructors will be called and the memory freed to the system.
 950          */
 951         skc->skc_obj_total -= sks->sks_objs;
 952         skc->skc_slab_total--;
 953         list_del(&sks->sks_list);
 954         list_add(&sks->sks_list, sks_list);
 955         list_splice_init(&sks->sks_free_list, sko_list);
 956
 957         EXIT;
 958 }
 959
 960 /*
 961  * Traverses all the partial slabs attached to a cache and free those
 962  * which which are currently empty, and have not been touched for
 963  * skc_delay seconds to  avoid thrashing.  The count argument is
 964  * passed to optionally cap the number of slabs reclaimed, a count
 965  * of zero means try and reclaim everything.  When flag is set we
 966  * always free an available slab regardless of age.
 967  */
 968 static void
 969 spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
 970 {
 971         spl_kmem_slab_t *sks, *m;
 972         spl_kmem_obj_t *sko, *n;
 973         LIST_HEAD(sks_list);
 974         LIST_HEAD(sko_list);
 975         uint32_t size = 0;
 976         int i = 0;
 977         ENTRY;
 978
 979         /*
 980          * Move empty slabs and objects which have not been touched in
 981          * skc_delay seconds on to private lists to be freed outside
 982          * the spin lock.  This delay time is important to avoid thrashing
 983          * however when flag is set the delay will not be used.
 984          */
 985         spin_lock(&skc->skc_lock);
 986         list_for_each_entry_safe_reverse(sks,m,&skc->skc_partial_list,sks_list){
 987                 /*
 988                  * All empty slabs are at the end of skc->skc_partial_list,
 989                  * therefore once a non-empty slab is found we can stop
 990                  * scanning.  Additionally, stop when reaching the target
 991                  * reclaim 'count' if a non-zero threshhold is given.
 992                  */
 993                 if ((sks->sks_ref > 0) || (count && i > count))
 994                         break;
 995
 996                 if (time_after(jiffies,sks->sks_age+skc->skc_delay*HZ)||flag) {
 997                         spl_slab_free(sks, &sks_list, &sko_list);
 998                         i++;
 999                 }
1000         }
1001         spin_unlock(&skc->skc_lock);
1002
1003         /*
1004          * The following two loops ensure all the object destructors are
1005          * run, any offslab objects are freed, and the slabs themselves
1006          * are freed.  This is all done outside the skc->skc_lock since
1007          * this allows the destructor to sleep, and allows us to perform
1008          * a conditional reschedule when a freeing a large number of
1009          * objects and slabs back to the system.
1010          */
1011         if (skc->skc_flags & KMC_OFFSLAB)
1012                 size = spl_offslab_size(skc);
1013
1014         list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
1015                 ASSERT(sko->sko_magic == SKO_MAGIC);
1016
1017                 if (skc->skc_dtor)
1018                         skc->skc_dtor(sko->sko_addr, skc->skc_private);
1019
1020                 if (skc->skc_flags & KMC_OFFSLAB)
1021                         kv_free(skc, sko->sko_addr, size);
1022
1023                 cond_resched();
1024         }
1025
1026         list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
1027                 ASSERT(sks->sks_magic == SKS_MAGIC);
1028                 kv_free(skc, sks, skc->skc_slab_size);
1029                 cond_resched();
1030         }
1031
1032         EXIT;
1033 }
1034
1035 /*
1036  * Called regularly on all caches to age objects out of the magazines
1037  * which have not been access in skc->skc_delay seconds.  This prevents
1038  * idle magazines from holding memory which might be better used by
1039  * other caches or parts of the system.  The delay is present to
1040  * prevent thrashing the magazine.
1041  */
1042 static void
1043 spl_magazine_age(void *data)
1044 {
1045         spl_kmem_magazine_t *skm =
1046                 spl_get_work_data(data, spl_kmem_magazine_t, skm_work.work);
1047         spl_kmem_cache_t *skc = skm->skm_cache;
1048         int i = smp_processor_id();
1049
1050         ASSERT(skm->skm_magic == SKM_MAGIC);
1051         ASSERT(skc->skc_magic == SKC_MAGIC);
1052         ASSERT(skc->skc_mag[i] == skm);
1053
1054         if (skm->skm_avail > 0 &&
1055             time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
1056                 (void)spl_cache_flush(skc, skm, skm->skm_refill);
1057
1058         if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags))
1059                 schedule_delayed_work_on(i, &skm->skm_work,
1060                                          skc->skc_delay / 3 * HZ);
1061 }
1062
1063 /*
1064  * Called regularly to keep a downward pressure on the size of idle
1065  * magazines and to release free slabs from the cache.  This function
1066  * never calls the registered reclaim function, that only occures
1067  * under memory pressure or with a direct call to spl_kmem_reap().
1068  */
1069 static void
1070 spl_cache_age(void *data)
1071 {
1072         spl_kmem_cache_t *skc =
1073                 spl_get_work_data(data, spl_kmem_cache_t, skc_work.work);
1074
1075         ASSERT(skc->skc_magic == SKC_MAGIC);
1076         spl_slab_reclaim(skc, skc->skc_reap, 0);
1077
1078         if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags))
1079                 schedule_delayed_work(&skc->skc_work, skc->skc_delay / 3 * HZ);
1080 }
1081
1082 /*
1083  * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
1084  * When on-slab we want to target SPL_KMEM_CACHE_OBJ_PER_SLAB.  However,
1085  * for very small objects we may end up with more than this so as not
1086  * to waste space in the minimal allocation of a single page.  Also for
1087  * very large objects we may use as few as SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN,
1088  * lower than this and we will fail.
1089  */
1090 static int
1091 spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
1092 {
1093         uint32_t sks_size, obj_size, max_size;
1094
1095         if (skc->skc_flags & KMC_OFFSLAB) {
1096                 *objs = SPL_KMEM_CACHE_OBJ_PER_SLAB;
1097                 *size = sizeof(spl_kmem_slab_t);
1098         } else {
1099                 sks_size = spl_sks_size(skc);
1100                 obj_size = spl_obj_size(skc);
1101
1102                 if (skc->skc_flags & KMC_KMEM)
1103                         max_size = ((uint32_t)1 << (MAX_ORDER-3)) * PAGE_SIZE;
1104                 else
1105                         max_size = (32 * 1024 * 1024);
1106
1107                 /* Power of two sized slab */
1108                 for (*size = PAGE_SIZE; *size <= max_size; *size *= 2) {
1109                         *objs = (*size - sks_size) / obj_size;
1110                         if (*objs >= SPL_KMEM_CACHE_OBJ_PER_SLAB)
1111                                 RETURN(0);
1112                 }
1113
1114                 /*
1115                  * Unable to satisfy target objects per slab, fall back to
1116                  * allocating a maximally sized slab and assuming it can
1117                  * contain the minimum objects count use it.  If not fail.
1118                  */
1119                 *size = max_size;
1120                 *objs = (*size - sks_size) / obj_size;
1121                 if (*objs >= SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN)
1122                         RETURN(0);
1123         }
1124
1125         RETURN(-ENOSPC);
1126 }
1127
1128 /*
1129  * Make a guess at reasonable per-cpu magazine size based on the size of
1130  * each object and the cost of caching N of them in each magazine.  Long
1131  * term this should really adapt based on an observed usage heuristic.
1132  */
1133 static int
1134 spl_magazine_size(spl_kmem_cache_t *skc)
1135 {
1136         uint32_t obj_size = spl_obj_size(skc);
1137         int size;
1138         ENTRY;
1139
1140         /* Per-magazine sizes below assume a 4Kib page size */
1141         if (obj_size > (PAGE_SIZE * 256))
1142                 size = 4;  /* Minimum 4Mib per-magazine */
1143         else if (obj_size > (PAGE_SIZE * 32))
1144                 size = 16; /* Minimum 2Mib per-magazine */
1145         else if (obj_size > (PAGE_SIZE))
1146                 size = 64; /* Minimum 256Kib per-magazine */
1147         else if (obj_size > (PAGE_SIZE / 4))
1148                 size = 128; /* Minimum 128Kib per-magazine */
1149         else
1150                 size = 256;
1151
1152         RETURN(size);
1153 }
1154
1155 /*
1156  * Allocate a per-cpu magazine to assoicate with a specific core.
1157  */
1158 static spl_kmem_magazine_t *
1159 spl_magazine_alloc(spl_kmem_cache_t *skc, int node)
1160 {
1161         spl_kmem_magazine_t *skm;
1162         int size = sizeof(spl_kmem_magazine_t) +
1163                    sizeof(void *) * skc->skc_mag_size;
1164         ENTRY;
1165
1166         skm = kmem_alloc_node(size, KM_SLEEP, node);
1167         if (skm) {
1168                 skm->skm_magic = SKM_MAGIC;
1169                 skm->skm_avail = 0;
1170                 skm->skm_size = skc->skc_mag_size;
1171                 skm->skm_refill = skc->skc_mag_refill;
1172                 skm->skm_cache = skc;
1173                 spl_init_delayed_work(&skm->skm_work, spl_magazine_age, skm);
1174                 skm->skm_age = jiffies;
1175         }
1176
1177         RETURN(skm);
1178 }
1179
1180 /*
1181  * Free a per-cpu magazine assoicated with a specific core.
1182  */
1183 static void
1184 spl_magazine_free(spl_kmem_magazine_t *skm)
1185 {
1186         int size = sizeof(spl_kmem_magazine_t) +
1187                    sizeof(void *) * skm->skm_size;
1188
1189         ENTRY;
1190         ASSERT(skm->skm_magic == SKM_MAGIC);
1191         ASSERT(skm->skm_avail == 0);
1192
1193         kmem_free(skm, size);
1194         EXIT;
1195 }
1196
1197 /*
1198  * Create all pre-cpu magazines of reasonable sizes.
1199  */
1200 static int
1201 spl_magazine_create(spl_kmem_cache_t *skc)
1202 {
1203         int i;
1204         ENTRY;
1205
1206         skc->skc_mag_size = spl_magazine_size(skc);
1207         skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
1208
1209         for_each_online_cpu(i) {
1210                 skc->skc_mag[i] = spl_magazine_alloc(skc, cpu_to_node(i));
1211                 if (!skc->skc_mag[i]) {
1212                         for (i--; i >= 0; i--)
1213                                 spl_magazine_free(skc->skc_mag[i]);
1214
1215                         RETURN(-ENOMEM);
1216                 }
1217         }
1218
1219         /* Only after everything is allocated schedule magazine work */
1220         for_each_online_cpu(i)
1221                 schedule_delayed_work_on(i, &skc->skc_mag[i]->skm_work,
1222                                          skc->skc_delay / 3 * HZ);
1223
1224         RETURN(0);
1225 }
1226
1227 /*
1228  * Destroy all pre-cpu magazines.
1229  */
1230 static void
1231 spl_magazine_destroy(spl_kmem_cache_t *skc)
1232 {
1233         spl_kmem_magazine_t *skm;
1234         int i;
1235         ENTRY;
1236
1237         for_each_online_cpu(i) {
1238                 skm = skc->skc_mag[i];
1239                 (void)spl_cache_flush(skc, skm, skm->skm_avail);
1240                 spl_magazine_free(skm);
1241         }
1242
1243         EXIT;
1244 }
1245
1246 /*
1247  * Create a object cache based on the following arguments:
1248  * name         cache name
1249  * size         cache object size
1250  * align        cache object alignment
1251  * ctor         cache object constructor
1252  * dtor         cache object destructor
1253  * reclaim      cache object reclaim
1254  * priv         cache private data for ctor/dtor/reclaim
1255  * vmp          unused must be NULL
1256  * flags
1257  *      KMC_NOTOUCH     Disable cache object aging (unsupported)
1258  *      KMC_NODEBUG     Disable debugging (unsupported)
1259  *      KMC_NOMAGAZINE  Disable magazine (unsupported)
1260  *      KMC_NOHASH      Disable hashing (unsupported)
1261  *      KMC_QCACHE      Disable qcache (unsupported)
1262  *      KMC_KMEM        Force kmem backed cache
1263  *      KMC_VMEM        Force vmem backed cache
1264  *      KMC_OFFSLAB     Locate objects off the slab
1265  */
1266 spl_kmem_cache_t *
1267 spl_kmem_cache_create(char *name, size_t size, size_t align,
1268                       spl_kmem_ctor_t ctor,
1269                       spl_kmem_dtor_t dtor,
1270                       spl_kmem_reclaim_t reclaim,
1271                       void *priv, void *vmp, int flags)
1272 {
1273         spl_kmem_cache_t *skc;
1274         int rc, kmem_flags = KM_SLEEP;
1275         ENTRY;
1276
1277         ASSERTF(!(flags & KMC_NOMAGAZINE), "Bad KMC_NOMAGAZINE (%x)\n", flags);
1278         ASSERTF(!(flags & KMC_NOHASH), "Bad KMC_NOHASH (%x)\n", flags);
1279         ASSERTF(!(flags & KMC_QCACHE), "Bad KMC_QCACHE (%x)\n", flags);
1280         ASSERT(vmp == NULL);
1281
1282         /* We may be called when there is a non-zero preempt_count or
1283          * interrupts are disabled is which case we must not sleep.
1284          */
1285         if (current_thread_info()->preempt_count || irqs_disabled())
1286                 kmem_flags = KM_NOSLEEP;
1287
1288         /* Allocate memry for a new cache an initialize it.  Unfortunately,
1289          * this usually ends up being a large allocation of ~32k because
1290          * we need to allocate enough memory for the worst case number of
1291          * cpus in the magazine, skc_mag[NR_CPUS].  Because of this we
1292          * explicitly pass KM_NODEBUG to suppress the kmem warning */
1293         skc = (spl_kmem_cache_t *)kmem_zalloc(sizeof(*skc),
1294                                               kmem_flags | KM_NODEBUG);
1295         if (skc == NULL)
1296                 RETURN(NULL);
1297
1298         skc->skc_magic = SKC_MAGIC;
1299         skc->skc_name_size = strlen(name) + 1;
1300         skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, kmem_flags);
1301         if (skc->skc_name == NULL) {
1302                 kmem_free(skc, sizeof(*skc));
1303                 RETURN(NULL);
1304         }
1305         strncpy(skc->skc_name, name, skc->skc_name_size);
1306
1307         skc->skc_ctor = ctor;
1308         skc->skc_dtor = dtor;
1309         skc->skc_reclaim = reclaim;
1310         skc->skc_private = priv;
1311         skc->skc_vmp = vmp;
1312         skc->skc_flags = flags;
1313         skc->skc_obj_size = size;
1314         skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
1315         skc->skc_delay = SPL_KMEM_CACHE_DELAY;
1316         skc->skc_reap = SPL_KMEM_CACHE_REAP;
1317         atomic_set(&skc->skc_ref, 0);
1318
1319         INIT_LIST_HEAD(&skc->skc_list);
1320         INIT_LIST_HEAD(&skc->skc_complete_list);
1321         INIT_LIST_HEAD(&skc->skc_partial_list);
1322         spin_lock_init(&skc->skc_lock);
1323         skc->skc_slab_fail = 0;
1324         skc->skc_slab_create = 0;
1325         skc->skc_slab_destroy = 0;
1326         skc->skc_slab_total = 0;
1327         skc->skc_slab_alloc = 0;
1328         skc->skc_slab_max = 0;
1329         skc->skc_obj_total = 0;
1330         skc->skc_obj_alloc = 0;
1331         skc->skc_obj_max = 0;
1332
1333         if (align) {
1334                 VERIFY(ISP2(align));
1335                 VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN); /* Min alignment */
1336                 VERIFY3U(align, <=, PAGE_SIZE);            /* Max alignment */
1337                 skc->skc_obj_align = align;
1338         }
1339
1340         /* If none passed select a cache type based on object size */
1341         if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM))) {
1342                 if (spl_obj_size(skc) < (PAGE_SIZE / 8))
1343                         skc->skc_flags |= KMC_KMEM;
1344                 else
1345                         skc->skc_flags |= KMC_VMEM;
1346         }
1347
1348         rc = spl_slab_size(skc, &skc->skc_slab_objs, &skc->skc_slab_size);
1349         if (rc)
1350                 GOTO(out, rc);
1351
1352         rc = spl_magazine_create(skc);
1353         if (rc)
1354                 GOTO(out, rc);
1355
1356         spl_init_delayed_work(&skc->skc_work, spl_cache_age, skc);
1357         schedule_delayed_work(&skc->skc_work, skc->skc_delay / 3 * HZ);
1358
1359         down_write(&spl_kmem_cache_sem);
1360         list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
1361         up_write(&spl_kmem_cache_sem);
1362
1363         RETURN(skc);
1364 out:
1365         kmem_free(skc->skc_name, skc->skc_name_size);
1366         kmem_free(skc, sizeof(*skc));
1367         RETURN(NULL);
1368 }
1369 EXPORT_SYMBOL(spl_kmem_cache_create);
1370
1371 /*
1372  * Destroy a cache and all objects assoicated with the cache.
1373  */
1374 void
1375 spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
1376 {
1377         DECLARE_WAIT_QUEUE_HEAD(wq);
1378         int i;
1379         ENTRY;
1380
1381         ASSERT(skc->skc_magic == SKC_MAGIC);
1382
1383         down_write(&spl_kmem_cache_sem);
1384         list_del_init(&skc->skc_list);
1385         up_write(&spl_kmem_cache_sem);
1386
1387         /* Cancel any and wait for any pending delayed work */
1388         ASSERT(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1389         cancel_delayed_work(&skc->skc_work);
1390         for_each_online_cpu(i)
1391                 cancel_delayed_work(&skc->skc_mag[i]->skm_work);
1392
1393         flush_scheduled_work();
1394
1395         /* Wait until all current callers complete, this is mainly
1396          * to catch the case where a low memory situation triggers a
1397          * cache reaping action which races with this destroy. */
1398         wait_event(wq, atomic_read(&skc->skc_ref) == 0);
1399
1400         spl_magazine_destroy(skc);
1401         spl_slab_reclaim(skc, 0, 1);
1402         spin_lock(&skc->skc_lock);
1403
1404         /* Validate there are no objects in use and free all the
1405          * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
1406         ASSERT3U(skc->skc_slab_alloc, ==, 0);
1407         ASSERT3U(skc->skc_obj_alloc, ==, 0);
1408         ASSERT3U(skc->skc_slab_total, ==, 0);
1409         ASSERT3U(skc->skc_obj_total, ==, 0);
1410         ASSERT(list_empty(&skc->skc_complete_list));
1411
1412         kmem_free(skc->skc_name, skc->skc_name_size);
1413         spin_unlock(&skc->skc_lock);
1414
1415         kmem_free(skc, sizeof(*skc));
1416
1417         EXIT;
1418 }
1419 EXPORT_SYMBOL(spl_kmem_cache_destroy);
1420
1421 /*
1422  * Allocate an object from a slab attached to the cache.  This is used to
1423  * repopulate the per-cpu magazine caches in batches when they run low.
1424  */
1425 static void *
1426 spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
1427 {
1428         spl_kmem_obj_t *sko;
1429
1430         ASSERT(skc->skc_magic == SKC_MAGIC);
1431         ASSERT(sks->sks_magic == SKS_MAGIC);
1432         ASSERT(spin_is_locked(&skc->skc_lock));
1433
1434         sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
1435         ASSERT(sko->sko_magic == SKO_MAGIC);
1436         ASSERT(sko->sko_addr != NULL);
1437
1438         /* Remove from sks_free_list */
1439         list_del_init(&sko->sko_list);
1440
1441         sks->sks_age = jiffies;
1442         sks->sks_ref++;
1443         skc->skc_obj_alloc++;
1444
1445         /* Track max obj usage statistics */
1446         if (skc->skc_obj_alloc > skc->skc_obj_max)
1447                 skc->skc_obj_max = skc->skc_obj_alloc;
1448
1449         /* Track max slab usage statistics */
1450         if (sks->sks_ref == 1) {
1451                 skc->skc_slab_alloc++;
1452
1453                 if (skc->skc_slab_alloc > skc->skc_slab_max)
1454                         skc->skc_slab_max = skc->skc_slab_alloc;
1455         }
1456
1457         return sko->sko_addr;
1458 }
1459
1460 /*
1461  * No available objects on any slabsi, create a new slab.  Since this
1462  * is an expensive operation we do it without holding the spinlock and
1463  * only briefly aquire it when we link in the fully allocated and
1464  * constructed slab.
1465  */
1466 static spl_kmem_slab_t *
1467 spl_cache_grow(spl_kmem_cache_t *skc, int flags)
1468 {
1469         spl_kmem_slab_t *sks;
1470         ENTRY;
1471
1472         ASSERT(skc->skc_magic == SKC_MAGIC);
1473         local_irq_enable();
1474         might_sleep();
1475
1476         /*
1477          * Before allocating a new slab check if the slab is being reaped.
1478          * If it is there is a good chance we can wait until it finishes
1479          * and then use one of the newly freed but not aged-out slabs.
1480          */
1481         if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
1482                 schedule();
1483                 GOTO(out, sks= NULL);
1484         }
1485
1486         /* Allocate a new slab for the cache */
1487         sks = spl_slab_alloc(skc, flags | __GFP_NORETRY | KM_NODEBUG);
1488         if (sks == NULL)
1489                 GOTO(out, sks = NULL);
1490
1491         /* Link the new empty slab in to the end of skc_partial_list. */
1492         spin_lock(&skc->skc_lock);
1493         skc->skc_slab_total++;
1494         skc->skc_obj_total += sks->sks_objs;
1495         list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1496         spin_unlock(&skc->skc_lock);
1497 out:
1498         local_irq_disable();
1499
1500         RETURN(sks);
1501 }
1502
/*
 * Refill a per-cpu magazine with objects from the slabs for this
 * cache.  Ideally the magazine can be repopulated using existing
 * objects which have been released, however if we are unable to
 * locate enough free objects new slabs of objects will be created.
 *
 * skc    cache the magazine belongs to
 * skm    magazine to refill
 * flags  allocation flags passed on to spl_cache_grow()
 *
 * Returns the number of objects added to the magazine.  This may be
 * fewer than requested, including zero, when the cache could not be
 * grown or 'skm' stopped being this cpu's magazine while the cache was
 * being grown.
 */
static int
spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
{
        spl_kmem_slab_t *sks;
        int rc = 0, refill;
        ENTRY;

        ASSERT(skc->skc_magic == SKC_MAGIC);
        ASSERT(skm->skm_magic == SKM_MAGIC);

        /* Never request more than the magazine has room for */
        refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
        spin_lock(&skc->skc_lock);

        while (refill > 0) {
                /* No slabs available we may need to grow the cache */
                if (list_empty(&skc->skc_partial_list)) {
                        /* Growing is done without the lock held, both
                         * early exits below jump to 'out' which is
                         * located after the final unlock. */
                        spin_unlock(&skc->skc_lock);

                        sks = spl_cache_grow(skc, flags);
                        if (!sks)
                                GOTO(out, rc);

                        /* Rescheduled to different CPU skm is not local */
                        if (skm != skc->skc_mag[smp_processor_id()])
                                GOTO(out, rc);

                        /* Potentially rescheduled to the same CPU but
                         * allocations may have occurred from this CPU while
                         * we were sleeping so recalculate max refill. */
                        refill = MIN(refill, skm->skm_size - skm->skm_avail);

                        spin_lock(&skc->skc_lock);
                        continue;
                }

                /* Grab the next available slab */
                sks = list_entry((&skc->skc_partial_list)->next,
                                 spl_kmem_slab_t, sks_list);
                ASSERT(sks->sks_magic == SKS_MAGIC);
                ASSERT(sks->sks_ref < sks->sks_objs);
                ASSERT(!list_empty(&sks->sks_free_list));

                /* Consume as many objects as needed to refill the requested
                 * cache.  We must also be careful not to overfill it.  The
                 * condition short-circuits, 'refill' is only decremented
                 * while the slab still has a free object and 'rc' only
                 * counts objects which are actually moved. */
                while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++rc) {
                        ASSERT(skm->skm_avail < skm->skm_size);
                        ASSERT(rc < skm->skm_size);
                        skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks);
                }

                /* Move slab to skc_complete_list when full */
                if (sks->sks_ref == sks->sks_objs) {
                        list_del(&sks->sks_list);
                        list_add(&sks->sks_list, &skc->skc_complete_list);
                }
        }

        spin_unlock(&skc->skc_lock);
out:
        /* Returns the number of entries added to cache */
        RETURN(rc);
}
1572 /*
1573  * Release an object back to the slab from which it came.
1574  */
1575 static void
1576 spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
1577 {
1578         spl_kmem_slab_t *sks = NULL;
1579         spl_kmem_obj_t *sko = NULL;
1580         ENTRY;
1581
1582         ASSERT(skc->skc_magic == SKC_MAGIC);
1583         ASSERT(spin_is_locked(&skc->skc_lock));
1584
1585         sko = spl_sko_from_obj(skc, obj);
1586         ASSERT(sko->sko_magic == SKO_MAGIC);
1587         sks = sko->sko_slab;
1588         ASSERT(sks->sks_magic == SKS_MAGIC);
1589         ASSERT(sks->sks_cache == skc);
1590         list_add(&sko->sko_list, &sks->sks_free_list);
1591
1592         sks->sks_age = jiffies;
1593         sks->sks_ref--;
1594         skc->skc_obj_alloc--;
1595
1596         /* Move slab to skc_partial_list when no longer full.  Slabs
1597          * are added to the head to keep the partial list is quasi-full
1598          * sorted order.  Fuller at the head, emptier at the tail. */
1599         if (sks->sks_ref == (sks->sks_objs - 1)) {
1600                 list_del(&sks->sks_list);
1601                 list_add(&sks->sks_list, &skc->skc_partial_list);
1602         }
1603
1604         /* Move emply slabs to the end of the partial list so
1605          * they can be easily found and freed during reclamation. */
1606         if (sks->sks_ref == 0) {
1607                 list_del(&sks->sks_list);
1608                 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1609                 skc->skc_slab_alloc--;
1610         }
1611
1612         EXIT;
1613 }
1614
1615 /*
1616  * Release a batch of objects from a per-cpu magazine back to their
1617  * respective slabs.  This occurs when we exceed the magazine size,
1618  * are under memory pressure, when the cache is idle, or during
1619  * cache cleanup.  The flush argument contains the number of entries
1620  * to remove from the magazine.
1621  */
1622 static int
1623 spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
1624 {
1625         int i, count = MIN(flush, skm->skm_avail);
1626         ENTRY;
1627
1628         ASSERT(skc->skc_magic == SKC_MAGIC);
1629         ASSERT(skm->skm_magic == SKM_MAGIC);
1630
1631         /*
1632          * XXX: Currently we simply return objects from the magazine to
1633          * the slabs in fifo order.  The ideal thing to do from a memory
1634          * fragmentation standpoint is to cheaply determine the set of
1635          * objects in the magazine which will result in the largest
1636          * number of free slabs if released from the magazine.
1637          */
1638         spin_lock(&skc->skc_lock);
1639         for (i = 0; i < count; i++)
1640                 spl_cache_shrink(skc, skm->skm_objs[i]);
1641
1642         skm->skm_avail -= count;
1643         memmove(skm->skm_objs, &(skm->skm_objs[count]),
1644                 sizeof(void *) * skm->skm_avail);
1645
1646         spin_unlock(&skc->skc_lock);
1647
1648         RETURN(count);
1649 }
1650
1651 /*
1652  * Allocate an object from the per-cpu magazine, or if the magazine
1653  * is empty directly allocate from a slab and repopulate the magazine.
1654  */
1655 void *
1656 spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
1657 {
1658         spl_kmem_magazine_t *skm;
1659         unsigned long irq_flags;
1660         void *obj = NULL;
1661         ENTRY;
1662
1663         ASSERT(skc->skc_magic == SKC_MAGIC);
1664         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1665         ASSERT(flags & KM_SLEEP);
1666         atomic_inc(&skc->skc_ref);
1667         local_irq_save(irq_flags);
1668
1669 restart:
1670         /* Safe to update per-cpu structure without lock, but
1671          * in the restart case we must be careful to reaquire
1672          * the local magazine since this may have changed
1673          * when we need to grow the cache. */
1674         skm = skc->skc_mag[smp_processor_id()];
1675         ASSERTF(skm->skm_magic == SKM_MAGIC, "%x != %x: %s/%p/%p %x/%x/%x\n",
1676                 skm->skm_magic, SKM_MAGIC, skc->skc_name, skc, skm,
1677                 skm->skm_size, skm->skm_refill, skm->skm_avail);
1678
1679         if (likely(skm->skm_avail)) {
1680                 /* Object available in CPU cache, use it */
1681                 obj = skm->skm_objs[--skm->skm_avail];
1682                 skm->skm_age = jiffies;
1683         } else {
1684                 /* Per-CPU cache empty, directly allocate from
1685                  * the slab and refill the per-CPU cache. */
1686                 (void)spl_cache_refill(skc, skm, flags);
1687                 GOTO(restart, obj = NULL);
1688         }
1689
1690         local_irq_restore(irq_flags);
1691         ASSERT(obj);
1692         ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
1693
1694         /* Pre-emptively migrate object to CPU L1 cache */
1695         prefetchw(obj);
1696         atomic_dec(&skc->skc_ref);
1697
1698         RETURN(obj);
1699 }
1700 EXPORT_SYMBOL(spl_kmem_cache_alloc);
1701
1702 /*
1703  * Free an object back to the local per-cpu magazine, there is no
1704  * guarantee that this is the same magazine the object was originally
1705  * allocated from.  We may need to flush entire from the magazine
1706  * back to the slabs to make space.
1707  */
1708 void
1709 spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
1710 {
1711         spl_kmem_magazine_t *skm;
1712         unsigned long flags;
1713         ENTRY;
1714
1715         ASSERT(skc->skc_magic == SKC_MAGIC);
1716         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1717         atomic_inc(&skc->skc_ref);
1718         local_irq_save(flags);
1719
1720         /* Safe to update per-cpu structure without lock, but
1721          * no remote memory allocation tracking is being performed
1722          * it is entirely possible to allocate an object from one
1723          * CPU cache and return it to another. */
1724         skm = skc->skc_mag[smp_processor_id()];
1725         ASSERT(skm->skm_magic == SKM_MAGIC);
1726
1727         /* Per-CPU cache full, flush it to make space */
1728         if (unlikely(skm->skm_avail >= skm->skm_size))
1729                 (void)spl_cache_flush(skc, skm, skm->skm_refill);
1730
1731         /* Available space in cache, use it */
1732         skm->skm_objs[skm->skm_avail++] = obj;
1733
1734         local_irq_restore(flags);
1735         atomic_dec(&skc->skc_ref);
1736
1737         EXIT;
1738 }
1739 EXPORT_SYMBOL(spl_kmem_cache_free);
1740
1741 /*
1742  * The generic shrinker function for all caches.  Under linux a shrinker
1743  * may not be tightly coupled with a slab cache.  In fact linux always
1744  * systematically trys calling all registered shrinker callbacks which
1745  * report that they contain unused objects.  Because of this we only
1746  * register one shrinker function in the shim layer for all slab caches.
1747  * We always attempt to shrink all caches when this generic shrinker
1748  * is called.  The shrinker should return the number of free objects
1749  * in the cache when called with nr_to_scan == 0 but not attempt to
1750  * free any objects.  When nr_to_scan > 0 it is a request that nr_to_scan
1751  * objects should be freed, because Solaris semantics are to free
1752  * all available objects we may free more objects than requested.
1753  */
1754 static int
1755 spl_kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask)
1756 {
1757         spl_kmem_cache_t *skc;
1758         int unused = 0;
1759
1760         down_read(&spl_kmem_cache_sem);
1761         list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
1762                 if (nr_to_scan)
1763                         spl_kmem_cache_reap_now(skc);
1764
1765                 /*
1766                  * Presume everything alloc'ed in reclaimable, this ensures
1767                  * we are called again with nr_to_scan > 0 so can try and
1768                  * reclaim.  The exact number is not important either so
1769                  * we forgo taking this already highly contented lock.
1770                  */
1771                 unused += skc->skc_obj_alloc;
1772         }
1773         up_read(&spl_kmem_cache_sem);
1774
1775         return (unused * sysctl_vfs_cache_pressure) / 100;
1776 }
1777
1778 /*
1779  * Call the registered reclaim function for a cache.  Depending on how
1780  * many and which objects are released it may simply repopulate the
1781  * local magazine which will then need to age-out.  Objects which cannot
1782  * fit in the magazine we will be released back to their slabs which will
1783  * also need to age out before being release.  This is all just best
1784  * effort and we do not want to thrash creating and destroying slabs.
1785  */
1786 void
1787 spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
1788 {
1789         ENTRY;
1790
1791         ASSERT(skc->skc_magic == SKC_MAGIC);
1792         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1793
1794         /* Prevent concurrent cache reaping when contended */
1795         if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
1796                 EXIT;
1797                 return;
1798         }
1799
1800         atomic_inc(&skc->skc_ref);
1801
1802         if (skc->skc_reclaim)
1803                 skc->skc_reclaim(skc->skc_private);
1804
1805         spl_slab_reclaim(skc, skc->skc_reap, 0);
1806         clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
1807         atomic_dec(&skc->skc_ref);
1808
1809         EXIT;
1810 }
1811 EXPORT_SYMBOL(spl_kmem_cache_reap_now);
1812
1813 /*
1814  * Reap all free slabs from all registered caches.
1815  */
1816 void
1817 spl_kmem_reap(void)
1818 {
1819         spl_kmem_cache_generic_shrinker(KMC_REAP_CHUNK, GFP_KERNEL);
1820 }
1821 EXPORT_SYMBOL(spl_kmem_reap);
1822
1823 #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
1824 static char *
1825 spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
1826 {
1827         int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
1828         int i, flag = 1;
1829
1830         ASSERT(str != NULL && len >= 17);
1831         memset(str, 0, len);
1832
1833         /* Check for a fully printable string, and while we are at
1834          * it place the printable characters in the passed buffer. */
1835         for (i = 0; i < size; i++) {
1836                 str[i] = ((char *)(kd->kd_addr))[i];
1837                 if (isprint(str[i])) {
1838                         continue;
1839                 } else {
1840                         /* Minimum number of printable characters found
1841                          * to make it worthwhile to print this as ascii. */
1842                         if (i > min)
1843                                 break;
1844
1845                         flag = 0;
1846                         break;
1847                 }
1848         }
1849
1850         if (!flag) {
1851                 sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
1852                         *((uint8_t *)kd->kd_addr),
1853                         *((uint8_t *)kd->kd_addr + 2),
1854                         *((uint8_t *)kd->kd_addr + 4),
1855                         *((uint8_t *)kd->kd_addr + 6),
1856                         *((uint8_t *)kd->kd_addr + 8),
1857                         *((uint8_t *)kd->kd_addr + 10),
1858                         *((uint8_t *)kd->kd_addr + 12),
1859                         *((uint8_t *)kd->kd_addr + 14));
1860         }
1861
1862         return str;
1863 }
1864
1865 static int
1866 spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
1867 {
1868         int i;
1869         ENTRY;
1870
1871         spin_lock_init(lock);
1872         INIT_LIST_HEAD(list);
1873
1874         for (i = 0; i < size; i++)
1875                 INIT_HLIST_HEAD(&kmem_table[i]);
1876
1877         RETURN(0);
1878 }
1879
1880 static void
1881 spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
1882 {
1883         unsigned long flags;
1884         kmem_debug_t *kd;
1885         char str[17];
1886         ENTRY;
1887
1888         spin_lock_irqsave(lock, flags);
1889         if (!list_empty(list))
1890                 printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
1891                        "size", "data", "func", "line");
1892
1893         list_for_each_entry(kd, list, kd_list)
1894                 printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
1895                        (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
1896                        kd->kd_func, kd->kd_line);
1897
1898         spin_unlock_irqrestore(lock, flags);
1899         EXIT;
1900 }
1901 #else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
/* Without DEBUG_KMEM and DEBUG_KMEM_TRACKING both tracking hooks
 * expand to nothing, so their call sites compile away. */
#define spl_kmem_init_tracking(list, lock, size)
#define spl_kmem_fini_tracking(list, lock)
1904 #endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
1905
1906 static void
1907 spl_kmem_init_globals(void)
1908 {
1909         struct zone *zone;
1910
1911         /* For now all zones are includes, it may be wise to restrict
1912          * this to normal and highmem zones if we see problems. */
1913         for_each_zone(zone) {
1914
1915                 if (!populated_zone(zone))
1916                         continue;
1917
1918                 minfree += min_wmark_pages(zone);
1919                 desfree += low_wmark_pages(zone);
1920                 lotsfree += high_wmark_pages(zone);
1921         }
1922
1923         /* Solaris default values */
1924         swapfs_minfree = MAX(2*1024*1024 >> PAGE_SHIFT, physmem >> 3);
1925         swapfs_reserve = MIN(4*1024*1024 >> PAGE_SHIFT, physmem >> 4);
1926 }
1927
1928 /*
1929  * Called at module init when it is safe to use spl_kallsyms_lookup_name()
1930  */
1931 int
1932 spl_kmem_init_kallsyms_lookup(void)
1933 {
1934 #ifndef HAVE_GET_VMALLOC_INFO
1935         get_vmalloc_info_fn = (get_vmalloc_info_t)
1936                 spl_kallsyms_lookup_name("get_vmalloc_info");
1937         if (!get_vmalloc_info_fn) {
1938                 printk(KERN_ERR "Error: Unknown symbol get_vmalloc_info\n");
1939                 return -EFAULT;
1940         }
1941 #endif /* HAVE_GET_VMALLOC_INFO */
1942
1943 #ifdef HAVE_PGDAT_HELPERS
1944 # ifndef HAVE_FIRST_ONLINE_PGDAT
1945         first_online_pgdat_fn = (first_online_pgdat_t)
1946                 spl_kallsyms_lookup_name("first_online_pgdat");
1947         if (!first_online_pgdat_fn) {
1948                 printk(KERN_ERR "Error: Unknown symbol first_online_pgdat\n");
1949                 return -EFAULT;
1950         }
1951 # endif /* HAVE_FIRST_ONLINE_PGDAT */
1952
1953 # ifndef HAVE_NEXT_ONLINE_PGDAT
1954         next_online_pgdat_fn = (next_online_pgdat_t)
1955                 spl_kallsyms_lookup_name("next_online_pgdat");
1956         if (!next_online_pgdat_fn) {
1957                 printk(KERN_ERR "Error: Unknown symbol next_online_pgdat\n");
1958                 return -EFAULT;
1959         }
1960 # endif /* HAVE_NEXT_ONLINE_PGDAT */
1961
1962 # ifndef HAVE_NEXT_ZONE
1963         next_zone_fn = (next_zone_t)
1964                 spl_kallsyms_lookup_name("next_zone");
1965         if (!next_zone_fn) {
1966                 printk(KERN_ERR "Error: Unknown symbol next_zone\n");
1967                 return -EFAULT;
1968         }
1969 # endif /* HAVE_NEXT_ZONE */
1970
1971 #else /* HAVE_PGDAT_HELPERS */
1972
1973 # ifndef HAVE_PGDAT_LIST
1974         pgdat_list_addr = *(struct pglist_data **)
1975                 spl_kallsyms_lookup_name("pgdat_list");
1976         if (!pgdat_list_addr) {
1977                 printk(KERN_ERR "Error: Unknown symbol pgdat_list\n");
1978                 return -EFAULT;
1979         }
1980 # endif /* HAVE_PGDAT_LIST */
1981 #endif /* HAVE_PGDAT_HELPERS */
1982
1983 #if defined(NEED_GET_ZONE_COUNTS) && !defined(HAVE_GET_ZONE_COUNTS)
1984         get_zone_counts_fn = (get_zone_counts_t)
1985                 spl_kallsyms_lookup_name("get_zone_counts");
1986         if (!get_zone_counts_fn) {
1987                 printk(KERN_ERR "Error: Unknown symbol get_zone_counts\n");
1988                 return -EFAULT;
1989         }
1990 #endif  /* NEED_GET_ZONE_COUNTS && !HAVE_GET_ZONE_COUNTS */
1991
1992         /*
1993          * It is now safe to initialize the global tunings which rely on
1994          * the use of the for_each_zone() macro.  This macro in turns
1995          * depends on the *_pgdat symbols which are now available.
1996          */
1997         spl_kmem_init_globals();
1998
1999         return 0;
2000 }
2001
2002 int
2003 spl_kmem_init(void)
2004 {
2005         int rc = 0;
2006         ENTRY;
2007
2008         init_rwsem(&spl_kmem_cache_sem);
2009         INIT_LIST_HEAD(&spl_kmem_cache_list);
2010
2011 #ifdef HAVE_SET_SHRINKER
2012         spl_kmem_cache_shrinker = set_shrinker(KMC_DEFAULT_SEEKS,
2013                                                spl_kmem_cache_generic_shrinker);
2014         if (spl_kmem_cache_shrinker == NULL)
2015                 RETURN(rc = -ENOMEM);
2016 #else
2017         register_shrinker(&spl_kmem_cache_shrinker);
2018 #endif
2019
2020 #ifdef DEBUG_KMEM
2021         kmem_alloc_used_set(0);
2022         vmem_alloc_used_set(0);
2023
2024         spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
2025         spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE);
2026 #endif
2027         RETURN(rc);
2028 }
2029
2030 void
2031 spl_kmem_fini(void)
2032 {
2033 #ifdef DEBUG_KMEM
2034         /* Display all unreclaimed memory addresses, including the
2035          * allocation size and the first few bytes of what's located
2036          * at that address to aid in debugging.  Performance is not
2037          * a serious concern here since it is module unload time. */
2038         if (kmem_alloc_used_read() != 0)
2039                 CWARN("kmem leaked %ld/%ld bytes\n",
2040                       kmem_alloc_used_read(), kmem_alloc_max);
2041
2042
2043         if (vmem_alloc_used_read() != 0)
2044                 CWARN("vmem leaked %ld/%ld bytes\n",
2045                       vmem_alloc_used_read(), vmem_alloc_max);
2046
2047         spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
2048         spl_kmem_fini_tracking(&vmem_list, &vmem_lock);
2049 #endif /* DEBUG_KMEM */
2050         ENTRY;
2051
2052 #ifdef HAVE_SET_SHRINKER
2053         remove_shrinker(spl_kmem_cache_shrinker);
2054 #else
2055         unregister_shrinker(&spl_kmem_cache_shrinker);
2056 #endif
2057
2058         EXIT;
2059 }