module/spl/spl-kmem.c

   1 /*****************************************************************************\
   2  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
   3  *  Copyright (C) 2007 The Regents of the University of California.
   4  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
   5  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
   6  *  UCRL-CODE-235197
   7  *
   8  *  This file is part of the SPL, Solaris Porting Layer.
   9  *  For details, see <http://github.com/behlendorf/spl/>.
  10  *
  11  *  The SPL is free software; you can redistribute it and/or modify it
  12  *  under the terms of the GNU General Public License as published by the
  13  *  Free Software Foundation; either version 2 of the License, or (at your
  14  *  option) any later version.
  15  *
  16  *  The SPL is distributed in the hope that it will be useful, but WITHOUT
  17  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  18  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  19  *  for more details.
  20  *
  21  *  You should have received a copy of the GNU General Public License along
  22  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  23  *****************************************************************************
  24  *  Solaris Porting Layer (SPL) Kmem Implementation.
  25 \*****************************************************************************/
  26
  27 #include <sys/kmem.h>
  28 #include <spl-debug.h>
  29
  30 #ifdef SS_DEBUG_SUBSYS
  31 #undef SS_DEBUG_SUBSYS
  32 #endif
  33
  34 #define SS_DEBUG_SUBSYS SS_KMEM
  35
  36 /*
  37  * Cache expiration was implemented because it was part of the default Solaris
  38  * kmem_cache behavior.  The idea is that per-cpu objects which haven't been
  39  * accessed in several seconds should be returned to the cache.  On the other
  40  * hand Linux slabs never move objects back to the slabs unless there is
  41  * memory pressure on the system.  By default both methods are disabled, but
  42  * may be enabled by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM.
  43  */
  44 unsigned int spl_kmem_cache_expire = 0;
  45 EXPORT_SYMBOL(spl_kmem_cache_expire);
  46 module_param(spl_kmem_cache_expire, uint, 0644);
  47 MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)");
  48
  49 /*
  50  * The minimum amount of memory measured in pages to be free at all
  51  * times on the system.  This is similar to Linux's zone->pages_min
  52  * multiplied by the number of zones and is sized based on that.
  53  */
  54 pgcnt_t minfree = 0;
  55 EXPORT_SYMBOL(minfree);
  56
  57 /*
  58  * The desired amount of memory measured in pages to be free at all
  59  * times on the system.  This is similar to Linux's zone->pages_low
  60  * multiplied by the number of zones and is sized based on that.
  61  * Assuming all zones are being used roughly equally, when we drop
  62  * below this threshold asynchronous page reclamation is triggered.
  63  */
  64 pgcnt_t desfree = 0;
  65 EXPORT_SYMBOL(desfree);
  66
  67 /*
  68  * When above this amount of memory measures in pages the system is
  69  * determined to have enough free memory.  This is similar to Linux's
  70  * zone->pages_high multiplied by the number of zones and is sized based
  71  * on that.  Assuming all zones are being used roughly equally, when
  72  * asynchronous page reclamation reaches this threshold it stops.
  73  */
  74 pgcnt_t lotsfree = 0;
  75 EXPORT_SYMBOL(lotsfree);
  76
  77 /* Unused always 0 in this implementation */
  78 pgcnt_t needfree = 0;
  79 EXPORT_SYMBOL(needfree);
  80
  81 pgcnt_t swapfs_minfree = 0;
  82 EXPORT_SYMBOL(swapfs_minfree);
  83
  84 pgcnt_t swapfs_reserve = 0;
  85 EXPORT_SYMBOL(swapfs_reserve);
  86
  87 vmem_t *heap_arena = NULL;
  88 EXPORT_SYMBOL(heap_arena);
  89
  90 vmem_t *zio_alloc_arena = NULL;
  91 EXPORT_SYMBOL(zio_alloc_arena);
  92
  93 vmem_t *zio_arena = NULL;
  94 EXPORT_SYMBOL(zio_arena);
  95
  96 #ifndef HAVE_GET_VMALLOC_INFO
  97 get_vmalloc_info_t get_vmalloc_info_fn = SYMBOL_POISON;
  98 EXPORT_SYMBOL(get_vmalloc_info_fn);
  99 #endif /* HAVE_GET_VMALLOC_INFO */
 100
 101 #ifdef HAVE_PGDAT_HELPERS
 102 # ifndef HAVE_FIRST_ONLINE_PGDAT
 103 first_online_pgdat_t first_online_pgdat_fn = SYMBOL_POISON;
 104 EXPORT_SYMBOL(first_online_pgdat_fn);
 105 # endif /* HAVE_FIRST_ONLINE_PGDAT */
 106
 107 # ifndef HAVE_NEXT_ONLINE_PGDAT
 108 next_online_pgdat_t next_online_pgdat_fn = SYMBOL_POISON;
 109 EXPORT_SYMBOL(next_online_pgdat_fn);
 110 # endif /* HAVE_NEXT_ONLINE_PGDAT */
 111
 112 # ifndef HAVE_NEXT_ZONE
 113 next_zone_t next_zone_fn = SYMBOL_POISON;
 114 EXPORT_SYMBOL(next_zone_fn);
 115 # endif /* HAVE_NEXT_ZONE */
 116
 117 #else /* HAVE_PGDAT_HELPERS */
 118
 119 # ifndef HAVE_PGDAT_LIST
 120 struct pglist_data *pgdat_list_addr = SYMBOL_POISON;
 121 EXPORT_SYMBOL(pgdat_list_addr);
 122 # endif /* HAVE_PGDAT_LIST */
 123
 124 #endif /* HAVE_PGDAT_HELPERS */
 125
 126 #ifdef NEED_GET_ZONE_COUNTS
 127 # ifndef HAVE_GET_ZONE_COUNTS
 128 get_zone_counts_t get_zone_counts_fn = SYMBOL_POISON;
 129 EXPORT_SYMBOL(get_zone_counts_fn);
 130 # endif /* HAVE_GET_ZONE_COUNTS */
 131
 132 unsigned long
 133 spl_global_page_state(spl_zone_stat_item_t item)
 134 {
 135         unsigned long active;
 136         unsigned long inactive;
 137         unsigned long free;
 138
 139         get_zone_counts(&active, &inactive, &free);
 140         switch (item) {
 141         case SPL_NR_FREE_PAGES: return free;
 142         case SPL_NR_INACTIVE:   return inactive;
 143         case SPL_NR_ACTIVE:     return active;
 144         default:                ASSERT(0); /* Unsupported */
 145         }
 146
 147         return 0;
 148 }
 149 #else
 150 # ifdef HAVE_GLOBAL_PAGE_STATE
 151 unsigned long
 152 spl_global_page_state(spl_zone_stat_item_t item)
 153 {
 154         unsigned long pages = 0;
 155
 156         switch (item) {
 157         case SPL_NR_FREE_PAGES:
 158 #  ifdef HAVE_ZONE_STAT_ITEM_NR_FREE_PAGES
 159                 pages += global_page_state(NR_FREE_PAGES);
 160 #  endif
 161                 break;
 162         case SPL_NR_INACTIVE:
 163 #  ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE
 164                 pages += global_page_state(NR_INACTIVE);
 165 #  endif
 166 #  ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE_ANON
 167                 pages += global_page_state(NR_INACTIVE_ANON);
 168 #  endif
 169 #  ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE_FILE
 170                 pages += global_page_state(NR_INACTIVE_FILE);
 171 #  endif
 172                 break;
 173         case SPL_NR_ACTIVE:
 174 #  ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE
 175                 pages += global_page_state(NR_ACTIVE);
 176 #  endif
 177 #  ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE_ANON
 178                 pages += global_page_state(NR_ACTIVE_ANON);
 179 #  endif
 180 #  ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE_FILE
 181                 pages += global_page_state(NR_ACTIVE_FILE);
 182 #  endif
 183                 break;
 184         default:
 185                 ASSERT(0); /* Unsupported */
 186         }
 187
 188         return pages;
 189 }
 190 # else
 191 #  error "Both global_page_state() and get_zone_counts() unavailable"
 192 # endif /* HAVE_GLOBAL_PAGE_STATE */
 193 #endif /* NEED_GET_ZONE_COUNTS */
 194 EXPORT_SYMBOL(spl_global_page_state);
 195
 196 #ifndef HAVE_SHRINK_DCACHE_MEMORY
 197 shrink_dcache_memory_t shrink_dcache_memory_fn = SYMBOL_POISON;
 198 EXPORT_SYMBOL(shrink_dcache_memory_fn);
 199 #endif /* HAVE_SHRINK_DCACHE_MEMORY */
 200
 201 #ifndef HAVE_SHRINK_ICACHE_MEMORY
 202 shrink_icache_memory_t shrink_icache_memory_fn = SYMBOL_POISON;
 203 EXPORT_SYMBOL(shrink_icache_memory_fn);
 204 #endif /* HAVE_SHRINK_ICACHE_MEMORY */
 205
 206 pgcnt_t
 207 spl_kmem_availrmem(void)
 208 {
 209         /* The amount of easily available memory */
 210         return (spl_global_page_state(SPL_NR_FREE_PAGES) +
 211                 spl_global_page_state(SPL_NR_INACTIVE));
 212 }
 213 EXPORT_SYMBOL(spl_kmem_availrmem);
 214
 215 size_t
 216 vmem_size(vmem_t *vmp, int typemask)
 217 {
 218         struct vmalloc_info vmi;
 219         size_t size = 0;
 220
 221         ASSERT(vmp == NULL);
 222         ASSERT(typemask & (VMEM_ALLOC | VMEM_FREE));
 223
 224         get_vmalloc_info(&vmi);
 225         if (typemask & VMEM_ALLOC)
 226                 size += (size_t)vmi.used;
 227
 228         if (typemask & VMEM_FREE)
 229                 size += (size_t)(VMALLOC_TOTAL - vmi.used);
 230
 231         return size;
 232 }
 233 EXPORT_SYMBOL(vmem_size);
 234
 235 int
 236 kmem_debugging(void)
 237 {
 238         return 0;
 239 }
 240 EXPORT_SYMBOL(kmem_debugging);
 241
 242 #ifndef HAVE_KVASPRINTF
 243 /* Simplified asprintf. */
 244 char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
 245 {
 246         unsigned int len;
 247         char *p;
 248         va_list aq;
 249
 250         va_copy(aq, ap);
 251         len = vsnprintf(NULL, 0, fmt, aq);
 252         va_end(aq);
 253
 254         p = kmalloc(len+1, gfp);
 255         if (!p)
 256                 return NULL;
 257
 258         vsnprintf(p, len+1, fmt, ap);
 259
 260         return p;
 261 }
 262 EXPORT_SYMBOL(kvasprintf);
 263 #endif /* HAVE_KVASPRINTF */
 264
 265 char *
 266 kmem_vasprintf(const char *fmt, va_list ap)
 267 {
 268         va_list aq;
 269         char *ptr;
 270
 271         do {
 272                 va_copy(aq, ap);
 273                 ptr = kvasprintf(GFP_KERNEL, fmt, aq);
 274                 va_end(aq);
 275         } while (ptr == NULL);
 276
 277         return ptr;
 278 }
 279 EXPORT_SYMBOL(kmem_vasprintf);
 280
 281 char *
 282 kmem_asprintf(const char *fmt, ...)
 283 {
 284         va_list ap;
 285         char *ptr;
 286
 287         do {
 288                 va_start(ap, fmt);
 289                 ptr = kvasprintf(GFP_KERNEL, fmt, ap);
 290                 va_end(ap);
 291         } while (ptr == NULL);
 292
 293         return ptr;
 294 }
 295 EXPORT_SYMBOL(kmem_asprintf);
 296
 297 static char *
 298 __strdup(const char *str, int flags)
 299 {
 300         char *ptr;
 301         int n;
 302
 303         n = strlen(str);
 304         ptr = kmalloc_nofail(n + 1, flags);
 305         if (ptr)
 306                 memcpy(ptr, str, n + 1);
 307
 308         return ptr;
 309 }
 310
 311 char *
 312 strdup(const char *str)
 313 {
 314         return __strdup(str, KM_SLEEP);
 315 }
 316 EXPORT_SYMBOL(strdup);
 317
 318 void
 319 strfree(char *str)
 320 {
 321         kfree(str);
 322 }
 323 EXPORT_SYMBOL(strfree);
 324
 325 /*
 326  * Memory allocation interfaces and debugging for basic kmem_*
 327  * and vmem_* style memory allocation.  When DEBUG_KMEM is enabled
 328  * the SPL will keep track of the total memory allocated, and
 329  * report any memory leaked when the module is unloaded.
 330  */
 331 #ifdef DEBUG_KMEM
 332
 333 /* Shim layer memory accounting */
 334 # ifdef HAVE_ATOMIC64_T
 335 atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
 336 unsigned long long kmem_alloc_max = 0;
 337 atomic64_t vmem_alloc_used = ATOMIC64_INIT(0);
 338 unsigned long long vmem_alloc_max = 0;
 339 # else  /* HAVE_ATOMIC64_T */
 340 atomic_t kmem_alloc_used = ATOMIC_INIT(0);
 341 unsigned long long kmem_alloc_max = 0;
 342 atomic_t vmem_alloc_used = ATOMIC_INIT(0);
 343 unsigned long long vmem_alloc_max = 0;
 344 # endif /* HAVE_ATOMIC64_T */
 345
 346 EXPORT_SYMBOL(kmem_alloc_used);
 347 EXPORT_SYMBOL(kmem_alloc_max);
 348 EXPORT_SYMBOL(vmem_alloc_used);
 349 EXPORT_SYMBOL(vmem_alloc_max);
 350
 351 /* When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
 352  * but also the location of every alloc and free.  When the SPL module is
 353  * unloaded a list of all leaked addresses and where they were allocated
 354  * will be dumped to the console.  Enabling this feature has a significant
 355  * impact on performance but it makes finding memory leaks straight forward.
 356  *
 357  * Not surprisingly with debugging enabled the xmem_locks are very highly
 358  * contended particularly on xfree().  If we want to run with this detailed
 359  * debugging enabled for anything other than debugging  we need to minimize
 360  * the contention by moving to a lock per xmem_table entry model.
 361  */
 362 # ifdef DEBUG_KMEM_TRACKING
 363
 364 #  define KMEM_HASH_BITS          10
 365 #  define KMEM_TABLE_SIZE         (1 << KMEM_HASH_BITS)
 366
 367 #  define VMEM_HASH_BITS          10
 368 #  define VMEM_TABLE_SIZE         (1 << VMEM_HASH_BITS)
 369
 370 typedef struct kmem_debug {
 371         struct hlist_node kd_hlist;     /* Hash node linkage */
 372         struct list_head kd_list;       /* List of all allocations */
 373         void *kd_addr;                  /* Allocation pointer */
 374         size_t kd_size;                 /* Allocation size */
 375         const char *kd_func;            /* Allocation function */
 376         int kd_line;                    /* Allocation line */
 377 } kmem_debug_t;
 378
 379 spinlock_t kmem_lock;
 380 struct hlist_head kmem_table[KMEM_TABLE_SIZE];
 381 struct list_head kmem_list;
 382
 383 spinlock_t vmem_lock;
 384 struct hlist_head vmem_table[VMEM_TABLE_SIZE];
 385 struct list_head vmem_list;
 386
 387 EXPORT_SYMBOL(kmem_lock);
 388 EXPORT_SYMBOL(kmem_table);
 389 EXPORT_SYMBOL(kmem_list);
 390
 391 EXPORT_SYMBOL(vmem_lock);
 392 EXPORT_SYMBOL(vmem_table);
 393 EXPORT_SYMBOL(vmem_list);
 394
 395 static kmem_debug_t *
 396 kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, const void *addr)
 397 {
 398         struct hlist_head *head;
 399         struct hlist_node *node;
 400         struct kmem_debug *p;
 401         unsigned long flags;
 402         SENTRY;
 403
 404         spin_lock_irqsave(lock, flags);
 405
 406         head = &table[hash_ptr(addr, bits)];
 407         hlist_for_each_entry_rcu(p, node, head, kd_hlist) {
 408                 if (p->kd_addr == addr) {
 409                         hlist_del_init(&p->kd_hlist);
 410                         list_del_init(&p->kd_list);
 411                         spin_unlock_irqrestore(lock, flags);
 412                         return p;
 413                 }
 414         }
 415
 416         spin_unlock_irqrestore(lock, flags);
 417
 418         SRETURN(NULL);
 419 }
 420
 421 void *
 422 kmem_alloc_track(size_t size, int flags, const char *func, int line,
 423     int node_alloc, int node)
 424 {
 425         void *ptr = NULL;
 426         kmem_debug_t *dptr;
 427         unsigned long irq_flags;
 428         SENTRY;
 429
 430         /* Function may be called with KM_NOSLEEP so failure is possible */
 431         dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
 432             flags & ~__GFP_ZERO);
 433
 434         if (unlikely(dptr == NULL)) {
 435                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "debug "
 436                     "kmem_alloc(%ld, 0x%x) at %s:%d failed (%lld/%llu)\n",
 437                     sizeof(kmem_debug_t), flags, func, line,
 438                     kmem_alloc_used_read(), kmem_alloc_max);
 439         } else {
 440                 /*
 441                  * Marked unlikely because we should never be doing this,
 442                  * we tolerate to up 2 pages but a single page is best.
 443                  */
 444                 if (unlikely((size > PAGE_SIZE*2) && !(flags & KM_NODEBUG))) {
 445                         SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "large "
 446                             "kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
 447                             (unsigned long long) size, flags, func, line,
 448                             kmem_alloc_used_read(), kmem_alloc_max);
 449                         spl_debug_dumpstack(NULL);
 450                 }
 451
 452                 /*
 453                  *  We use __strdup() below because the string pointed to by
 454                  * __FUNCTION__ might not be available by the time we want
 455                  * to print it since the module might have been unloaded.
 456                  * This can only fail in the KM_NOSLEEP case.
 457                  */
 458                 dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
 459                 if (unlikely(dptr->kd_func == NULL)) {
 460                         kfree(dptr);
 461                         SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
 462                             "debug __strdup() at %s:%d failed (%lld/%llu)\n",
 463                             func, line, kmem_alloc_used_read(), kmem_alloc_max);
 464                         goto out;
 465                 }
 466
 467                 /* Use the correct allocator */
 468                 if (node_alloc) {
 469                         ASSERT(!(flags & __GFP_ZERO));
 470                         ptr = kmalloc_node_nofail(size, flags, node);
 471                 } else if (flags & __GFP_ZERO) {
 472                         ptr = kzalloc_nofail(size, flags & ~__GFP_ZERO);
 473                 } else {
 474                         ptr = kmalloc_nofail(size, flags);
 475                 }
 476
 477                 if (unlikely(ptr == NULL)) {
 478                         kfree(dptr->kd_func);
 479                         kfree(dptr);
 480                         SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "kmem_alloc"
 481                             "(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
 482                             (unsigned long long) size, flags, func, line,
 483                             kmem_alloc_used_read(), kmem_alloc_max);
 484                         goto out;
 485                 }
 486
 487                 kmem_alloc_used_add(size);
 488                 if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
 489                         kmem_alloc_max = kmem_alloc_used_read();
 490
 491                 INIT_HLIST_NODE(&dptr->kd_hlist);
 492                 INIT_LIST_HEAD(&dptr->kd_list);
 493
 494                 dptr->kd_addr = ptr;
 495                 dptr->kd_size = size;
 496                 dptr->kd_line = line;
 497
 498                 spin_lock_irqsave(&kmem_lock, irq_flags);
 499                 hlist_add_head_rcu(&dptr->kd_hlist,
 500                     &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
 501                 list_add_tail(&dptr->kd_list, &kmem_list);
 502                 spin_unlock_irqrestore(&kmem_lock, irq_flags);
 503
 504                 SDEBUG_LIMIT(SD_INFO,
 505                     "kmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n",
 506                     (unsigned long long) size, flags, func, line, ptr,
 507                     kmem_alloc_used_read(), kmem_alloc_max);
 508         }
 509 out:
 510         SRETURN(ptr);
 511 }
 512 EXPORT_SYMBOL(kmem_alloc_track);
 513
 514 void
 515 kmem_free_track(const void *ptr, size_t size)
 516 {
 517         kmem_debug_t *dptr;
 518         SENTRY;
 519
 520         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 521             (unsigned long long) size);
 522
 523         dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
 524
 525         /* Must exist in hash due to kmem_alloc() */
 526         ASSERT(dptr);
 527
 528         /* Size must match */
 529         ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
 530             "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
 531             (unsigned long long) size, dptr->kd_func, dptr->kd_line);
 532
 533         kmem_alloc_used_sub(size);
 534         SDEBUG_LIMIT(SD_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 535             (unsigned long long) size, kmem_alloc_used_read(),
 536             kmem_alloc_max);
 537
 538         kfree(dptr->kd_func);
 539
 540         memset(dptr, 0x5a, sizeof(kmem_debug_t));
 541         kfree(dptr);
 542
 543         memset(ptr, 0x5a, size);
 544         kfree(ptr);
 545
 546         SEXIT;
 547 }
 548 EXPORT_SYMBOL(kmem_free_track);
 549
 550 void *
 551 vmem_alloc_track(size_t size, int flags, const char *func, int line)
 552 {
 553         void *ptr = NULL;
 554         kmem_debug_t *dptr;
 555         unsigned long irq_flags;
 556         SENTRY;
 557
 558         ASSERT(flags & KM_SLEEP);
 559
 560         /* Function may be called with KM_NOSLEEP so failure is possible */
 561         dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
 562             flags & ~__GFP_ZERO);
 563         if (unlikely(dptr == NULL)) {
 564                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "debug "
 565                     "vmem_alloc(%ld, 0x%x) at %s:%d failed (%lld/%llu)\n",
 566                     sizeof(kmem_debug_t), flags, func, line,
 567                     vmem_alloc_used_read(), vmem_alloc_max);
 568         } else {
 569                 /*
 570                  * We use __strdup() below because the string pointed to by
 571                  * __FUNCTION__ might not be available by the time we want
 572                  * to print it, since the module might have been unloaded.
 573                  * This can never fail because we have already asserted
 574                  * that flags is KM_SLEEP.
 575                  */
 576                 dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
 577                 if (unlikely(dptr->kd_func == NULL)) {
 578                         kfree(dptr);
 579                         SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
 580                             "debug __strdup() at %s:%d failed (%lld/%llu)\n",
 581                             func, line, vmem_alloc_used_read(), vmem_alloc_max);
 582                         goto out;
 583                 }
 584
 585                 /* Use the correct allocator */
 586                 if (flags & __GFP_ZERO) {
 587                         ptr = vzalloc_nofail(size, flags & ~__GFP_ZERO);
 588                 } else {
 589                         ptr = vmalloc_nofail(size, flags);
 590                 }
 591
 592                 if (unlikely(ptr == NULL)) {
 593                         kfree(dptr->kd_func);
 594                         kfree(dptr);
 595                         SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "vmem_alloc"
 596                             "(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
 597                             (unsigned long long) size, flags, func, line,
 598                             vmem_alloc_used_read(), vmem_alloc_max);
 599                         goto out;
 600                 }
 601
 602                 vmem_alloc_used_add(size);
 603                 if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
 604                         vmem_alloc_max = vmem_alloc_used_read();
 605
 606                 INIT_HLIST_NODE(&dptr->kd_hlist);
 607                 INIT_LIST_HEAD(&dptr->kd_list);
 608
 609                 dptr->kd_addr = ptr;
 610                 dptr->kd_size = size;
 611                 dptr->kd_line = line;
 612
 613                 spin_lock_irqsave(&vmem_lock, irq_flags);
 614                 hlist_add_head_rcu(&dptr->kd_hlist,
 615                     &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]);
 616                 list_add_tail(&dptr->kd_list, &vmem_list);
 617                 spin_unlock_irqrestore(&vmem_lock, irq_flags);
 618
 619                 SDEBUG_LIMIT(SD_INFO,
 620                     "vmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n",
 621                     (unsigned long long) size, flags, func, line,
 622                     ptr, vmem_alloc_used_read(), vmem_alloc_max);
 623         }
 624 out:
 625         SRETURN(ptr);
 626 }
 627 EXPORT_SYMBOL(vmem_alloc_track);
 628
 629 void
 630 vmem_free_track(const void *ptr, size_t size)
 631 {
 632         kmem_debug_t *dptr;
 633         SENTRY;
 634
 635         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 636             (unsigned long long) size);
 637
 638         dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr);
 639
 640         /* Must exist in hash due to vmem_alloc() */
 641         ASSERT(dptr);
 642
 643         /* Size must match */
 644         ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
 645             "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
 646             (unsigned long long) size, dptr->kd_func, dptr->kd_line);
 647
 648         vmem_alloc_used_sub(size);
 649         SDEBUG_LIMIT(SD_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 650             (unsigned long long) size, vmem_alloc_used_read(),
 651             vmem_alloc_max);
 652
 653         kfree(dptr->kd_func);
 654
 655         memset(dptr, 0x5a, sizeof(kmem_debug_t));
 656         kfree(dptr);
 657
 658         memset(ptr, 0x5a, size);
 659         vfree(ptr);
 660
 661         SEXIT;
 662 }
 663 EXPORT_SYMBOL(vmem_free_track);
 664
 665 # else /* DEBUG_KMEM_TRACKING */
 666
 667 void *
 668 kmem_alloc_debug(size_t size, int flags, const char *func, int line,
 669     int node_alloc, int node)
 670 {
 671         void *ptr;
 672         SENTRY;
 673
 674         /*
 675          * Marked unlikely because we should never be doing this,
 676          * we tolerate to up 2 pages but a single page is best.
 677          */
 678         if (unlikely((size > PAGE_SIZE * 2) && !(flags & KM_NODEBUG))) {
 679                 SDEBUG(SD_CONSOLE | SD_WARNING,
 680                     "large kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
 681                     (unsigned long long) size, flags, func, line,
 682                     kmem_alloc_used_read(), kmem_alloc_max);
 683                 dump_stack();
 684         }
 685
 686         /* Use the correct allocator */
 687         if (node_alloc) {
 688                 ASSERT(!(flags & __GFP_ZERO));
 689                 ptr = kmalloc_node_nofail(size, flags, node);
 690         } else if (flags & __GFP_ZERO) {
 691                 ptr = kzalloc_nofail(size, flags & (~__GFP_ZERO));
 692         } else {
 693                 ptr = kmalloc_nofail(size, flags);
 694         }
 695
 696         if (unlikely(ptr == NULL)) {
 697                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
 698                     "kmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
 699                     (unsigned long long) size, flags, func, line,
 700                     kmem_alloc_used_read(), kmem_alloc_max);
 701         } else {
 702                 kmem_alloc_used_add(size);
 703                 if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
 704                         kmem_alloc_max = kmem_alloc_used_read();
 705
 706                 SDEBUG_LIMIT(SD_INFO,
 707                     "kmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n",
 708                     (unsigned long long) size, flags, func, line, ptr,
 709                     kmem_alloc_used_read(), kmem_alloc_max);
 710         }
 711
 712         SRETURN(ptr);
 713 }
 714 EXPORT_SYMBOL(kmem_alloc_debug);
 715
 716 void
 717 kmem_free_debug(const void *ptr, size_t size)
 718 {
 719         SENTRY;
 720
 721         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 722             (unsigned long long) size);
 723
 724         kmem_alloc_used_sub(size);
 725         SDEBUG_LIMIT(SD_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 726             (unsigned long long) size, kmem_alloc_used_read(),
 727             kmem_alloc_max);
 728         kfree(ptr);
 729
 730         SEXIT;
 731 }
 732 EXPORT_SYMBOL(kmem_free_debug);
 733
 734 void *
 735 vmem_alloc_debug(size_t size, int flags, const char *func, int line)
 736 {
 737         void *ptr;
 738         SENTRY;
 739
 740         ASSERT(flags & KM_SLEEP);
 741
 742         /* Use the correct allocator */
 743         if (flags & __GFP_ZERO) {
 744                 ptr = vzalloc_nofail(size, flags & (~__GFP_ZERO));
 745         } else {
 746                 ptr = vmalloc_nofail(size, flags);
 747         }
 748
 749         if (unlikely(ptr == NULL)) {
 750                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
 751                     "vmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
 752                     (unsigned long long) size, flags, func, line,
 753                     vmem_alloc_used_read(), vmem_alloc_max);
 754         } else {
 755                 vmem_alloc_used_add(size);
 756                 if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
 757                         vmem_alloc_max = vmem_alloc_used_read();
 758
 759                 SDEBUG_LIMIT(SD_INFO, "vmem_alloc(%llu, 0x%x) = %p "
 760                     "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
 761                     vmem_alloc_used_read(), vmem_alloc_max);
 762         }
 763
 764         SRETURN(ptr);
 765 }
 766 EXPORT_SYMBOL(vmem_alloc_debug);
 767
 768 void
 769 vmem_free_debug(const void *ptr, size_t size)
 770 {
 771         SENTRY;
 772
 773         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 774             (unsigned long long) size);
 775
 776         vmem_alloc_used_sub(size);
 777         SDEBUG_LIMIT(SD_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 778             (unsigned long long) size, vmem_alloc_used_read(),
 779             vmem_alloc_max);
 780         vfree(ptr);
 781
 782         SEXIT;
 783 }
 784 EXPORT_SYMBOL(vmem_free_debug);
 785
 786 # endif /* DEBUG_KMEM_TRACKING */
 787 #endif /* DEBUG_KMEM */
 788
 789 /*
 790  * Slab allocation interfaces
 791  *
 792  * While the Linux slab implementation was inspired by the Solaris
 793  * implementation I cannot use it to emulate the Solaris APIs.  I
 794  * require two features which are not provided by the Linux slab.
 795  *
 796  * 1) Constructors AND destructors.  Recent versions of the Linux
 797  *    kernel have removed support for destructors.  This is a deal
 798  *    breaker for the SPL which contains particularly expensive
 799  *    initializers for mutex's, condition variables, etc.  We also
 800  *    require a minimal level of cleanup for these data types unlike
 801  *    many Linux data type which do need to be explicitly destroyed.
 802  *
 803  * 2) Virtual address space backed slab.  Callers of the Solaris slab
 804  *    expect it to work well for both small are very large allocations.
 805  *    Because of memory fragmentation the Linux slab which is backed
 806  *    by kmalloc'ed memory performs very badly when confronted with
 807  *    large numbers of large allocations.  Basing the slab on the
 808  *    virtual address space removes the need for contiguous pages
 809  *    and greatly improve performance for large allocations.
 810  *
 811  * For these reasons, the SPL has its own slab implementation with
 812  * the needed features.  It is not as highly optimized as either the
 813  * Solaris or Linux slabs, but it should get me most of what is
 814  * needed until it can be optimized or obsoleted by another approach.
 815  *
 816  * One serious concern I do have about this method is the relatively
 817  * small virtual address space on 32bit arches.  This will seriously
 818  * constrain the size of the slab caches and their performance.
 819  *
 820  * XXX: Improve the partial slab list by carefully maintaining a
 821  *      strict ordering of fullest to emptiest slabs based on
 822  *      the slab reference count.  This guarantees the when freeing
 823  *      slabs back to the system we need only linearly traverse the
 824  *      last N slabs in the list to discover all the freeable slabs.
 825  *
 826  * XXX: NUMA awareness for optionally allocating memory close to a
 827  *      particular core.  This can be advantageous if you know the slab
 828  *      object will be short lived and primarily accessed from one core.
 829  *
 830  * XXX: Slab coloring may also yield performance improvements and would
 831  *      be desirable to implement.
 832  */
 833
 834 struct list_head spl_kmem_cache_list;   /* List of caches */
 835 struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
 836 taskq_t *spl_kmem_cache_taskq;          /* Task queue for ageing / reclaim */
 837
 838 static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);
 839
 840 SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker);
 841 SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker,
 842         spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS);
 843
 844 static void *
 845 kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
 846 {
 847         void *ptr;
 848
 849         ASSERT(ISP2(size));
 850
 851         if (skc->skc_flags & KMC_KMEM)
 852                 ptr = (void *)__get_free_pages(flags, get_order(size));
 853         else
 854                 ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
 855
 856         /* Resulting allocated memory will be page aligned */
 857         ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
 858
 859         return ptr;
 860 }
 861
 862 static void
 863 kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
 864 {
 865         ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
 866         ASSERT(ISP2(size));
 867
 868         /*
 869          * The Linux direct reclaim path uses this out of band value to
 870          * determine if forward progress is being made.  Normally this is
 871          * incremented by kmem_freepages() which is part of the various
 872          * Linux slab implementations.  However, since we are using none
 873          * of that infrastructure we are responsible for incrementing it.
 874          */
 875         if (current->reclaim_state)
 876                 current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
 877
 878         if (skc->skc_flags & KMC_KMEM)
 879                 free_pages((unsigned long)ptr, get_order(size));
 880         else
 881                 vfree(ptr);
 882 }
 883
 884 /*
 885  * Required space for each aligned sks.
 886  */
 887 static inline uint32_t
 888 spl_sks_size(spl_kmem_cache_t *skc)
 889 {
 890         return P2ROUNDUP_TYPED(sizeof(spl_kmem_slab_t),
 891                skc->skc_obj_align, uint32_t);
 892 }
 893
 894 /*
 895  * Required space for each aligned object.
 896  */
 897 static inline uint32_t
 898 spl_obj_size(spl_kmem_cache_t *skc)
 899 {
 900         uint32_t align = skc->skc_obj_align;
 901
 902         return P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
 903                P2ROUNDUP_TYPED(sizeof(spl_kmem_obj_t), align, uint32_t);
 904 }
 905
 906 /*
 907  * Lookup the spl_kmem_object_t for an object given that object.
 908  */
 909 static inline spl_kmem_obj_t *
 910 spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
 911 {
 912         return obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
 913                skc->skc_obj_align, uint32_t);
 914 }
 915
 916 /*
 917  * Required space for each offslab object taking in to account alignment
 918  * restrictions and the power-of-two requirement of kv_alloc().
 919  */
 920 static inline uint32_t
 921 spl_offslab_size(spl_kmem_cache_t *skc)
 922 {
 923         return 1UL << (highbit(spl_obj_size(skc)) + 1);
 924 }
 925
 926 /*
 927  * It's important that we pack the spl_kmem_obj_t structure and the
 928  * actual objects in to one large address space to minimize the number
 929  * of calls to the allocator.  It is far better to do a few large
 930  * allocations and then subdivide it ourselves.  Now which allocator
 931  * we use requires balancing a few trade offs.
 932  *
 933  * For small objects we use kmem_alloc() because as long as you are
 934  * only requesting a small number of pages (ideally just one) its cheap.
 935  * However, when you start requesting multiple pages with kmem_alloc()
 936  * it gets increasingly expensive since it requires contiguous pages.
 937  * For this reason we shift to vmem_alloc() for slabs of large objects
 938  * which removes the need for contiguous pages.  We do not use
 939  * vmem_alloc() in all cases because there is significant locking
 940  * overhead in __get_vm_area_node().  This function takes a single
 941  * global lock when acquiring an available virtual address range which
 942  * serializes all vmem_alloc()'s for all slab caches.  Using slightly
 943  * different allocation functions for small and large objects should
 944  * give us the best of both worlds.
 945  *
 946  * KMC_ONSLAB                       KMC_OFFSLAB
 947  *
 948  * +------------------------+       +-----------------+
 949  * | spl_kmem_slab_t --+-+  |       | spl_kmem_slab_t |---+-+
 950  * | skc_obj_size    <-+ |  |       +-----------------+   | |
 951  * | spl_kmem_obj_t      |  |                             | |
 952  * | skc_obj_size    <---+  |       +-----------------+   | |
 953  * | spl_kmem_obj_t      |  |       | skc_obj_size    | <-+ |
 954  * | ...                 v  |       | spl_kmem_obj_t  |     |
 955  * +------------------------+       +-----------------+     v
 956  */
 957 static spl_kmem_slab_t *
 958 spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
 959 {
 960         spl_kmem_slab_t *sks;
 961         spl_kmem_obj_t *sko, *n;
 962         void *base, *obj;
 963         uint32_t obj_size, offslab_size = 0;
 964         int i,  rc = 0;
 965
 966         base = kv_alloc(skc, skc->skc_slab_size, flags);
 967         if (base == NULL)
 968                 SRETURN(NULL);
 969
 970         sks = (spl_kmem_slab_t *)base;
 971         sks->sks_magic = SKS_MAGIC;
 972         sks->sks_objs = skc->skc_slab_objs;
 973         sks->sks_age = jiffies;
 974         sks->sks_cache = skc;
 975         INIT_LIST_HEAD(&sks->sks_list);
 976         INIT_LIST_HEAD(&sks->sks_free_list);
 977         sks->sks_ref = 0;
 978         obj_size = spl_obj_size(skc);
 979
 980         if (skc->skc_flags & KMC_OFFSLAB)
 981                 offslab_size = spl_offslab_size(skc);
 982
 983         for (i = 0; i < sks->sks_objs; i++) {
 984                 if (skc->skc_flags & KMC_OFFSLAB) {
 985                         obj = kv_alloc(skc, offslab_size, flags);
 986                         if (!obj)
 987                                 SGOTO(out, rc = -ENOMEM);
 988                 } else {
 989                         obj = base + spl_sks_size(skc) + (i * obj_size);
 990                 }
 991
 992                 ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
 993                 sko = spl_sko_from_obj(skc, obj);
 994                 sko->sko_addr = obj;
 995                 sko->sko_magic = SKO_MAGIC;
 996                 sko->sko_slab = sks;
 997                 INIT_LIST_HEAD(&sko->sko_list);
 998                 list_add_tail(&sko->sko_list, &sks->sks_free_list);
 999         }
1000
1001         list_for_each_entry(sko, &sks->sks_free_list, sko_list)
1002                 if (skc->skc_ctor)
1003                         skc->skc_ctor(sko->sko_addr, skc->skc_private, flags);
1004 out:
1005         if (rc) {
1006                 if (skc->skc_flags & KMC_OFFSLAB)
1007                         list_for_each_entry_safe(sko, n, &sks->sks_free_list,
1008                                                  sko_list)
1009                                 kv_free(skc, sko->sko_addr, offslab_size);
1010
1011                 kv_free(skc, base, skc->skc_slab_size);
1012                 sks = NULL;
1013         }
1014
1015         SRETURN(sks);
1016 }
1017
1018 /*
1019  * Remove a slab from complete or partial list, it must be called with
1020  * the 'skc->skc_lock' held but the actual free must be performed
1021  * outside the lock to prevent deadlocking on vmem addresses.
1022  */
1023 static void
1024 spl_slab_free(spl_kmem_slab_t *sks,
1025               struct list_head *sks_list, struct list_head *sko_list)
1026 {
1027         spl_kmem_cache_t *skc;
1028         SENTRY;
1029
1030         ASSERT(sks->sks_magic == SKS_MAGIC);
1031         ASSERT(sks->sks_ref == 0);
1032
1033         skc = sks->sks_cache;
1034         ASSERT(skc->skc_magic == SKC_MAGIC);
1035         ASSERT(spin_is_locked(&skc->skc_lock));
1036
1037         /*
1038          * Update slab/objects counters in the cache, then remove the
1039          * slab from the skc->skc_partial_list.  Finally add the slab
1040          * and all its objects in to the private work lists where the
1041          * destructors will be called and the memory freed to the system.
1042          */
1043         skc->skc_obj_total -= sks->sks_objs;
1044         skc->skc_slab_total--;
1045         list_del(&sks->sks_list);
1046         list_add(&sks->sks_list, sks_list);
1047         list_splice_init(&sks->sks_free_list, sko_list);
1048
1049         SEXIT;
1050 }
1051
1052 /*
1053  * Traverses all the partial slabs attached to a cache and free those
1054  * which which are currently empty, and have not been touched for
1055  * skc_delay seconds to  avoid thrashing.  The count argument is
1056  * passed to optionally cap the number of slabs reclaimed, a count
1057  * of zero means try and reclaim everything.  When flag is set we
1058  * always free an available slab regardless of age.
1059  */
1060 static void
1061 spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
1062 {
1063         spl_kmem_slab_t *sks, *m;
1064         spl_kmem_obj_t *sko, *n;
1065         LIST_HEAD(sks_list);
1066         LIST_HEAD(sko_list);
1067         uint32_t size = 0;
1068         int i = 0;
1069         SENTRY;
1070
1071         /*
1072          * Move empty slabs and objects which have not been touched in
1073          * skc_delay seconds on to private lists to be freed outside
1074          * the spin lock.  This delay time is important to avoid thrashing
1075          * however when flag is set the delay will not be used.
1076          */
1077         spin_lock(&skc->skc_lock);
1078         list_for_each_entry_safe_reverse(sks,m,&skc->skc_partial_list,sks_list){
1079                 /*
1080                  * All empty slabs are at the end of skc->skc_partial_list,
1081                  * therefore once a non-empty slab is found we can stop
1082                  * scanning.  Additionally, stop when reaching the target
1083                  * reclaim 'count' if a non-zero threshold is given.
1084                  */
1085                 if ((sks->sks_ref > 0) || (count && i >= count))
1086                         break;
1087
1088                 if (time_after(jiffies,sks->sks_age+skc->skc_delay*HZ)||flag) {
1089                         spl_slab_free(sks, &sks_list, &sko_list);
1090                         i++;
1091                 }
1092         }
1093         spin_unlock(&skc->skc_lock);
1094
1095         /*
1096          * The following two loops ensure all the object destructors are
1097          * run, any offslab objects are freed, and the slabs themselves
1098          * are freed.  This is all done outside the skc->skc_lock since
1099          * this allows the destructor to sleep, and allows us to perform
1100          * a conditional reschedule when a freeing a large number of
1101          * objects and slabs back to the system.
1102          */
1103         if (skc->skc_flags & KMC_OFFSLAB)
1104                 size = spl_offslab_size(skc);
1105
1106         list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
1107                 ASSERT(sko->sko_magic == SKO_MAGIC);
1108
1109                 if (skc->skc_dtor)
1110                         skc->skc_dtor(sko->sko_addr, skc->skc_private);
1111
1112                 if (skc->skc_flags & KMC_OFFSLAB)
1113                         kv_free(skc, sko->sko_addr, size);
1114
1115                 cond_resched();
1116         }
1117
1118         list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
1119                 ASSERT(sks->sks_magic == SKS_MAGIC);
1120                 kv_free(skc, sks, skc->skc_slab_size);
1121                 cond_resched();
1122         }
1123
1124         SEXIT;
1125 }
1126
1127 static spl_kmem_emergency_t *
1128 spl_emergency_search(struct rb_root *root, void *obj)
1129 {
1130         struct rb_node *node = root->rb_node;
1131         spl_kmem_emergency_t *ske;
1132         unsigned long address = (unsigned long)obj;
1133
1134         while (node) {
1135                 ske = container_of(node, spl_kmem_emergency_t, ske_node);
1136
1137                 if (address < (unsigned long)ske->ske_obj)
1138                         node = node->rb_left;
1139                 else if (address > (unsigned long)ske->ske_obj)
1140                         node = node->rb_right;
1141                 else
1142                         return ske;
1143         }
1144
1145         return NULL;
1146 }
1147
1148 static int
1149 spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
1150 {
1151         struct rb_node **new = &(root->rb_node), *parent = NULL;
1152         spl_kmem_emergency_t *ske_tmp;
1153         unsigned long address = (unsigned long)ske->ske_obj;
1154
1155         while (*new) {
1156                 ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
1157
1158                 parent = *new;
1159                 if (address < (unsigned long)ske_tmp->ske_obj)
1160                         new = &((*new)->rb_left);
1161                 else if (address > (unsigned long)ske_tmp->ske_obj)
1162                         new = &((*new)->rb_right);
1163                 else
1164                         return 0;
1165         }
1166
1167         rb_link_node(&ske->ske_node, parent, new);
1168         rb_insert_color(&ske->ske_node, root);
1169
1170         return 1;
1171 }
1172
1173 /*
1174  * Allocate a single emergency object and track it in a red black tree.
1175  */
1176 static int
1177 spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
1178 {
1179         spl_kmem_emergency_t *ske;
1180         int empty;
1181         SENTRY;
1182
1183         /* Last chance use a partial slab if one now exists */
1184         spin_lock(&skc->skc_lock);
1185         empty = list_empty(&skc->skc_partial_list);
1186         spin_unlock(&skc->skc_lock);
1187         if (!empty)
1188                 SRETURN(-EEXIST);
1189
1190         ske = kmalloc(sizeof(*ske), flags);
1191         if (ske == NULL)
1192                 SRETURN(-ENOMEM);
1193
1194         ske->ske_obj = kmalloc(skc->skc_obj_size, flags);
1195         if (ske->ske_obj == NULL) {
1196                 kfree(ske);
1197                 SRETURN(-ENOMEM);
1198         }
1199
1200         spin_lock(&skc->skc_lock);
1201         empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
1202         if (likely(empty)) {
1203                 skc->skc_obj_total++;
1204                 skc->skc_obj_emergency++;
1205                 if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
1206                         skc->skc_obj_emergency_max = skc->skc_obj_emergency;
1207         }
1208         spin_unlock(&skc->skc_lock);
1209
1210         if (unlikely(!empty)) {
1211                 kfree(ske->ske_obj);
1212                 kfree(ske);
1213                 SRETURN(-EINVAL);
1214         }
1215
1216         if (skc->skc_ctor)
1217                 skc->skc_ctor(ske->ske_obj, skc->skc_private, flags);
1218
1219         *obj = ske->ske_obj;
1220
1221         SRETURN(0);
1222 }
1223
1224 /*
1225  * Locate the passed object in the red black tree and free it.
1226  */
1227 static int
1228 spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
1229 {
1230         spl_kmem_emergency_t *ske;
1231         SENTRY;
1232
1233         spin_lock(&skc->skc_lock);
1234         ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
1235         if (likely(ske)) {
1236                 rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
1237                 skc->skc_obj_emergency--;
1238                 skc->skc_obj_total--;
1239         }
1240         spin_unlock(&skc->skc_lock);
1241
1242         if (unlikely(ske == NULL))
1243                 SRETURN(-ENOENT);
1244
1245         if (skc->skc_dtor)
1246                 skc->skc_dtor(ske->ske_obj, skc->skc_private);
1247
1248         kfree(ske->ske_obj);
1249         kfree(ske);
1250
1251         SRETURN(0);
1252 }
1253
1254 /*
1255  * Release objects from the per-cpu magazine back to their slab.  The flush
1256  * argument contains the max number of entries to remove from the magazine.
1257  */
1258 static void
1259 __spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
1260 {
1261         int i, count = MIN(flush, skm->skm_avail);
1262         SENTRY;
1263
1264         ASSERT(skc->skc_magic == SKC_MAGIC);
1265         ASSERT(skm->skm_magic == SKM_MAGIC);
1266         ASSERT(spin_is_locked(&skc->skc_lock));
1267
1268         for (i = 0; i < count; i++)
1269                 spl_cache_shrink(skc, skm->skm_objs[i]);
1270
1271         skm->skm_avail -= count;
1272         memmove(skm->skm_objs, &(skm->skm_objs[count]),
1273                 sizeof(void *) * skm->skm_avail);
1274
1275         SEXIT;
1276 }
1277
1278 static void
1279 spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
1280 {
1281         spin_lock(&skc->skc_lock);
1282         __spl_cache_flush(skc, skm, flush);
1283         spin_unlock(&skc->skc_lock);
1284 }
1285
1286 static void
1287 spl_magazine_age(void *data)
1288 {
1289         spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
1290         spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
1291
1292         ASSERT(skm->skm_magic == SKM_MAGIC);
1293         ASSERT(skm->skm_cpu == smp_processor_id());
1294         ASSERT(irqs_disabled());
1295
1296         /* There are no available objects or they are too young to age out */
1297         if ((skm->skm_avail == 0) ||
1298             time_before(jiffies, skm->skm_age + skc->skc_delay * HZ))
1299                 return;
1300
1301         /*
1302          * Because we're executing in interrupt context we may have
1303          * interrupted the holder of this lock.  To avoid a potential
1304          * deadlock return if the lock is contended.
1305          */
1306         if (!spin_trylock(&skc->skc_lock))
1307                 return;
1308
1309         __spl_cache_flush(skc, skm, skm->skm_refill);
1310         spin_unlock(&skc->skc_lock);
1311 }
1312
1313 /*
1314  * Called regularly to keep a downward pressure on the cache.
1315  *
1316  * Objects older than skc->skc_delay seconds in the per-cpu magazines will
1317  * be returned to the caches.  This is done to prevent idle magazines from
1318  * holding memory which could be better used elsewhere.  The delay is
1319  * present to prevent thrashing the magazine.
1320  *
1321  * The newly released objects may result in empty partial slabs.  Those
1322  * slabs should be released to the system.  Otherwise moving the objects
1323  * out of the magazines is just wasted work.
1324  */
1325 static void
1326 spl_cache_age(void *data)
1327 {
1328         spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
1329         taskqid_t id = 0;
1330
1331         ASSERT(skc->skc_magic == SKC_MAGIC);
1332
1333         /* Dynamically disabled at run time */
1334         if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE))
1335                 return;
1336
1337         atomic_inc(&skc->skc_ref);
1338         spl_on_each_cpu(spl_magazine_age, skc, 1);
1339         spl_slab_reclaim(skc, skc->skc_reap, 0);
1340
1341         while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) {
1342                 id = taskq_dispatch_delay(
1343                     spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP,
1344                     ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
1345
1346                 /* Destroy issued after dispatch immediately cancel it */
1347                 if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id)
1348                         taskq_cancel_id(spl_kmem_cache_taskq, id);
1349         }
1350
1351         spin_lock(&skc->skc_lock);
1352         skc->skc_taskqid = id;
1353         spin_unlock(&skc->skc_lock);
1354
1355         atomic_dec(&skc->skc_ref);
1356 }
1357
1358 /*
1359  * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
1360  * When on-slab we want to target SPL_KMEM_CACHE_OBJ_PER_SLAB.  However,
1361  * for very small objects we may end up with more than this so as not
1362  * to waste space in the minimal allocation of a single page.  Also for
1363  * very large objects we may use as few as SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN,
1364  * lower than this and we will fail.
1365  */
1366 static int
1367 spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
1368 {
1369         uint32_t sks_size, obj_size, max_size;
1370
1371         if (skc->skc_flags & KMC_OFFSLAB) {
1372                 *objs = SPL_KMEM_CACHE_OBJ_PER_SLAB;
1373                 *size = sizeof(spl_kmem_slab_t);
1374         } else {
1375                 sks_size = spl_sks_size(skc);
1376                 obj_size = spl_obj_size(skc);
1377
1378                 if (skc->skc_flags & KMC_KMEM)
1379                         max_size = ((uint32_t)1 << (MAX_ORDER-3)) * PAGE_SIZE;
1380                 else
1381                         max_size = (32 * 1024 * 1024);
1382
1383                 /* Power of two sized slab */
1384                 for (*size = PAGE_SIZE; *size <= max_size; *size *= 2) {
1385                         *objs = (*size - sks_size) / obj_size;
1386                         if (*objs >= SPL_KMEM_CACHE_OBJ_PER_SLAB)
1387                                 SRETURN(0);
1388                 }
1389
1390                 /*
1391                  * Unable to satisfy target objects per slab, fall back to
1392                  * allocating a maximally sized slab and assuming it can
1393                  * contain the minimum objects count use it.  If not fail.
1394                  */
1395                 *size = max_size;
1396                 *objs = (*size - sks_size) / obj_size;
1397                 if (*objs >= SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN)
1398                         SRETURN(0);
1399         }
1400
1401         SRETURN(-ENOSPC);
1402 }
1403
1404 /*
1405  * Make a guess at reasonable per-cpu magazine size based on the size of
1406  * each object and the cost of caching N of them in each magazine.  Long
1407  * term this should really adapt based on an observed usage heuristic.
1408  */
1409 static int
1410 spl_magazine_size(spl_kmem_cache_t *skc)
1411 {
1412         uint32_t obj_size = spl_obj_size(skc);
1413         int size;
1414         SENTRY;
1415
1416         /* Per-magazine sizes below assume a 4Kib page size */
1417         if (obj_size > (PAGE_SIZE * 256))
1418                 size = 4;  /* Minimum 4Mib per-magazine */
1419         else if (obj_size > (PAGE_SIZE * 32))
1420                 size = 16; /* Minimum 2Mib per-magazine */
1421         else if (obj_size > (PAGE_SIZE))
1422                 size = 64; /* Minimum 256Kib per-magazine */
1423         else if (obj_size > (PAGE_SIZE / 4))
1424                 size = 128; /* Minimum 128Kib per-magazine */
1425         else
1426                 size = 256;
1427
1428         SRETURN(size);
1429 }
1430
1431 /*
1432  * Allocate a per-cpu magazine to associate with a specific core.
1433  */
1434 static spl_kmem_magazine_t *
1435 spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
1436 {
1437         spl_kmem_magazine_t *skm;
1438         int size = sizeof(spl_kmem_magazine_t) +
1439                    sizeof(void *) * skc->skc_mag_size;
1440         SENTRY;
1441
1442         skm = kmem_alloc_node(size, KM_SLEEP, cpu_to_node(cpu));
1443         if (skm) {
1444                 skm->skm_magic = SKM_MAGIC;
1445                 skm->skm_avail = 0;
1446                 skm->skm_size = skc->skc_mag_size;
1447                 skm->skm_refill = skc->skc_mag_refill;
1448                 skm->skm_cache = skc;
1449                 skm->skm_age = jiffies;
1450                 skm->skm_cpu = cpu;
1451         }
1452
1453         SRETURN(skm);
1454 }
1455
1456 /*
1457  * Free a per-cpu magazine associated with a specific core.
1458  */
1459 static void
1460 spl_magazine_free(spl_kmem_magazine_t *skm)
1461 {
1462         int size = sizeof(spl_kmem_magazine_t) +
1463                    sizeof(void *) * skm->skm_size;
1464
1465         SENTRY;
1466         ASSERT(skm->skm_magic == SKM_MAGIC);
1467         ASSERT(skm->skm_avail == 0);
1468
1469         kmem_free(skm, size);
1470         SEXIT;
1471 }
1472
1473 /*
1474  * Create all pre-cpu magazines of reasonable sizes.
1475  */
1476 static int
1477 spl_magazine_create(spl_kmem_cache_t *skc)
1478 {
1479         int i;
1480         SENTRY;
1481
1482         skc->skc_mag_size = spl_magazine_size(skc);
1483         skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
1484
1485         for_each_online_cpu(i) {
1486                 skc->skc_mag[i] = spl_magazine_alloc(skc, i);
1487                 if (!skc->skc_mag[i]) {
1488                         for (i--; i >= 0; i--)
1489                                 spl_magazine_free(skc->skc_mag[i]);
1490
1491                         SRETURN(-ENOMEM);
1492                 }
1493         }
1494
1495         SRETURN(0);
1496 }
1497
1498 /*
1499  * Destroy all pre-cpu magazines.
1500  */
1501 static void
1502 spl_magazine_destroy(spl_kmem_cache_t *skc)
1503 {
1504         spl_kmem_magazine_t *skm;
1505         int i;
1506         SENTRY;
1507
1508         for_each_online_cpu(i) {
1509                 skm = skc->skc_mag[i];
1510                 spl_cache_flush(skc, skm, skm->skm_avail);
1511                 spl_magazine_free(skm);
1512         }
1513
1514         SEXIT;
1515 }
1516
1517 /*
1518  * Create a object cache based on the following arguments:
1519  * name         cache name
1520  * size         cache object size
1521  * align        cache object alignment
1522  * ctor         cache object constructor
1523  * dtor         cache object destructor
1524  * reclaim      cache object reclaim
1525  * priv         cache private data for ctor/dtor/reclaim
1526  * vmp          unused must be NULL
1527  * flags
1528  *      KMC_NOTOUCH     Disable cache object aging (unsupported)
1529  *      KMC_NODEBUG     Disable debugging (unsupported)
1530  *      KMC_NOMAGAZINE  Disable magazine (unsupported)
1531  *      KMC_NOHASH      Disable hashing (unsupported)
1532  *      KMC_QCACHE      Disable qcache (unsupported)
1533  *      KMC_KMEM        Force kmem backed cache
1534  *      KMC_VMEM        Force vmem backed cache
1535  *      KMC_OFFSLAB     Locate objects off the slab
1536  */
1537 spl_kmem_cache_t *
1538 spl_kmem_cache_create(char *name, size_t size, size_t align,
1539                       spl_kmem_ctor_t ctor,
1540                       spl_kmem_dtor_t dtor,
1541                       spl_kmem_reclaim_t reclaim,
1542                       void *priv, void *vmp, int flags)
1543 {
1544         spl_kmem_cache_t *skc;
1545         int rc;
1546         SENTRY;
1547
1548         ASSERTF(!(flags & KMC_NOMAGAZINE), "Bad KMC_NOMAGAZINE (%x)\n", flags);
1549         ASSERTF(!(flags & KMC_NOHASH), "Bad KMC_NOHASH (%x)\n", flags);
1550         ASSERTF(!(flags & KMC_QCACHE), "Bad KMC_QCACHE (%x)\n", flags);
1551         ASSERT(vmp == NULL);
1552
1553         might_sleep();
1554
1555         /*
1556          * Allocate memory for a new cache an initialize it.  Unfortunately,
1557          * this usually ends up being a large allocation of ~32k because
1558          * we need to allocate enough memory for the worst case number of
1559          * cpus in the magazine, skc_mag[NR_CPUS].  Because of this we
1560          * explicitly pass KM_NODEBUG to suppress the kmem warning
1561          */
1562         skc = kmem_zalloc(sizeof(*skc), KM_SLEEP| KM_NODEBUG);
1563         if (skc == NULL)
1564                 SRETURN(NULL);
1565
1566         skc->skc_magic = SKC_MAGIC;
1567         skc->skc_name_size = strlen(name) + 1;
1568         skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, KM_SLEEP);
1569         if (skc->skc_name == NULL) {
1570                 kmem_free(skc, sizeof(*skc));
1571                 SRETURN(NULL);
1572         }
1573         strncpy(skc->skc_name, name, skc->skc_name_size);
1574
1575         skc->skc_ctor = ctor;
1576         skc->skc_dtor = dtor;
1577         skc->skc_reclaim = reclaim;
1578         skc->skc_private = priv;
1579         skc->skc_vmp = vmp;
1580         skc->skc_flags = flags;
1581         skc->skc_obj_size = size;
1582         skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
1583         skc->skc_delay = SPL_KMEM_CACHE_DELAY;
1584         skc->skc_reap = SPL_KMEM_CACHE_REAP;
1585         atomic_set(&skc->skc_ref, 0);
1586
1587         INIT_LIST_HEAD(&skc->skc_list);
1588         INIT_LIST_HEAD(&skc->skc_complete_list);
1589         INIT_LIST_HEAD(&skc->skc_partial_list);
1590         skc->skc_emergency_tree = RB_ROOT;
1591         spin_lock_init(&skc->skc_lock);
1592         init_waitqueue_head(&skc->skc_waitq);
1593         skc->skc_slab_fail = 0;
1594         skc->skc_slab_create = 0;
1595         skc->skc_slab_destroy = 0;
1596         skc->skc_slab_total = 0;
1597         skc->skc_slab_alloc = 0;
1598         skc->skc_slab_max = 0;
1599         skc->skc_obj_total = 0;
1600         skc->skc_obj_alloc = 0;
1601         skc->skc_obj_max = 0;
1602         skc->skc_obj_deadlock = 0;
1603         skc->skc_obj_emergency = 0;
1604         skc->skc_obj_emergency_max = 0;
1605
1606         if (align) {
1607                 VERIFY(ISP2(align));
1608                 VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN); /* Min alignment */
1609                 VERIFY3U(align, <=, PAGE_SIZE);            /* Max alignment */
1610                 skc->skc_obj_align = align;
1611         }
1612
1613         /* If none passed select a cache type based on object size */
1614         if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM))) {
1615                 if (spl_obj_size(skc) < (PAGE_SIZE / 8))
1616                         skc->skc_flags |= KMC_KMEM;
1617                 else
1618                         skc->skc_flags |= KMC_VMEM;
1619         }
1620
1621         rc = spl_slab_size(skc, &skc->skc_slab_objs, &skc->skc_slab_size);
1622         if (rc)
1623                 SGOTO(out, rc);
1624
1625         rc = spl_magazine_create(skc);
1626         if (rc)
1627                 SGOTO(out, rc);
1628
1629         if (spl_kmem_cache_expire & KMC_EXPIRE_AGE)
1630                 skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq,
1631                     spl_cache_age, skc, TQ_SLEEP,
1632                     ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
1633
1634         down_write(&spl_kmem_cache_sem);
1635         list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
1636         up_write(&spl_kmem_cache_sem);
1637
1638         SRETURN(skc);
1639 out:
1640         kmem_free(skc->skc_name, skc->skc_name_size);
1641         kmem_free(skc, sizeof(*skc));
1642         SRETURN(NULL);
1643 }
1644 EXPORT_SYMBOL(spl_kmem_cache_create);
1645
1646 /*
1647  * Register a move callback to for cache defragmentation.
1648  * XXX: Unimplemented but harmless to stub out for now.
1649  */
1650 void
1651 spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
1652     kmem_cbrc_t (move)(void *, void *, size_t, void *))
1653 {
1654         ASSERT(move != NULL);
1655 }
1656 EXPORT_SYMBOL(spl_kmem_cache_set_move);
1657
1658 /*
1659  * Destroy a cache and all objects associated with the cache.
1660  */
1661 void
1662 spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
1663 {
1664         DECLARE_WAIT_QUEUE_HEAD(wq);
1665         taskqid_t id;
1666         SENTRY;
1667
1668         ASSERT(skc->skc_magic == SKC_MAGIC);
1669
1670         down_write(&spl_kmem_cache_sem);
1671         list_del_init(&skc->skc_list);
1672         up_write(&spl_kmem_cache_sem);
1673
1674         /* Cancel any and wait for any pending delayed tasks */
1675         VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1676
1677         spin_lock(&skc->skc_lock);
1678         id = skc->skc_taskqid;
1679         spin_unlock(&skc->skc_lock);
1680
1681         taskq_cancel_id(spl_kmem_cache_taskq, id);
1682
1683         /* Wait until all current callers complete, this is mainly
1684          * to catch the case where a low memory situation triggers a
1685          * cache reaping action which races with this destroy. */
1686         wait_event(wq, atomic_read(&skc->skc_ref) == 0);
1687
1688         spl_magazine_destroy(skc);
1689         spl_slab_reclaim(skc, 0, 1);
1690         spin_lock(&skc->skc_lock);
1691
1692         /* Validate there are no objects in use and free all the
1693          * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
1694         ASSERT3U(skc->skc_slab_alloc, ==, 0);
1695         ASSERT3U(skc->skc_obj_alloc, ==, 0);
1696         ASSERT3U(skc->skc_slab_total, ==, 0);
1697         ASSERT3U(skc->skc_obj_total, ==, 0);
1698         ASSERT3U(skc->skc_obj_emergency, ==, 0);
1699         ASSERT(list_empty(&skc->skc_complete_list));
1700
1701         kmem_free(skc->skc_name, skc->skc_name_size);
1702         spin_unlock(&skc->skc_lock);
1703
1704         kmem_free(skc, sizeof(*skc));
1705
1706         SEXIT;
1707 }
1708 EXPORT_SYMBOL(spl_kmem_cache_destroy);
1709
1710 /*
1711  * Allocate an object from a slab attached to the cache.  This is used to
1712  * repopulate the per-cpu magazine caches in batches when they run low.
1713  */
1714 static void *
1715 spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
1716 {
1717         spl_kmem_obj_t *sko;
1718
1719         ASSERT(skc->skc_magic == SKC_MAGIC);
1720         ASSERT(sks->sks_magic == SKS_MAGIC);
1721         ASSERT(spin_is_locked(&skc->skc_lock));
1722
1723         sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
1724         ASSERT(sko->sko_magic == SKO_MAGIC);
1725         ASSERT(sko->sko_addr != NULL);
1726
1727         /* Remove from sks_free_list */
1728         list_del_init(&sko->sko_list);
1729
1730         sks->sks_age = jiffies;
1731         sks->sks_ref++;
1732         skc->skc_obj_alloc++;
1733
1734         /* Track max obj usage statistics */
1735         if (skc->skc_obj_alloc > skc->skc_obj_max)
1736                 skc->skc_obj_max = skc->skc_obj_alloc;
1737
1738         /* Track max slab usage statistics */
1739         if (sks->sks_ref == 1) {
1740                 skc->skc_slab_alloc++;
1741
1742                 if (skc->skc_slab_alloc > skc->skc_slab_max)
1743                         skc->skc_slab_max = skc->skc_slab_alloc;
1744         }
1745
1746         return sko->sko_addr;
1747 }
1748
1749 /*
1750  * Generic slab allocation function to run by the global work queues.
1751  * It is responsible for allocating a new slab, linking it in to the list
1752  * of partial slabs, and then waking any waiters.
1753  */
1754 static void
1755 spl_cache_grow_work(void *data)
1756 {
1757         spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
1758         spl_kmem_cache_t *skc = ska->ska_cache;
1759         spl_kmem_slab_t *sks;
1760
1761         sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG);
1762         spin_lock(&skc->skc_lock);
1763         if (sks) {
1764                 skc->skc_slab_total++;
1765                 skc->skc_obj_total += sks->sks_objs;
1766                 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1767         }
1768
1769         atomic_dec(&skc->skc_ref);
1770         clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
1771         clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
1772         wake_up_all(&skc->skc_waitq);
1773         spin_unlock(&skc->skc_lock);
1774
1775         kfree(ska);
1776 }
1777
1778 /*
1779  * Returns non-zero when a new slab should be available.
1780  */
1781 static int
1782 spl_cache_grow_wait(spl_kmem_cache_t *skc)
1783 {
1784         return !test_bit(KMC_BIT_GROWING, &skc->skc_flags);
1785 }
1786
1787 static int
1788 spl_cache_reclaim_wait(void *word)
1789 {
1790         schedule();
1791         return 0;
1792 }
1793
1794 /*
1795  * No available objects on any slabs, create a new slab.
1796  */
1797 static int
1798 spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
1799 {
1800         int remaining, rc;
1801         SENTRY;
1802
1803         ASSERT(skc->skc_magic == SKC_MAGIC);
1804         might_sleep();
1805         *obj = NULL;
1806
1807         /*
1808          * Before allocating a new slab wait for any reaping to complete and
1809          * then return so the local magazine can be rechecked for new objects.
1810          */
1811         if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
1812                 rc = wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
1813                     spl_cache_reclaim_wait, TASK_UNINTERRUPTIBLE);
1814                 SRETURN(rc ? rc : -EAGAIN);
1815         }
1816
1817         /*
1818          * This is handled by dispatching a work request to the global work
1819          * queue.  This allows us to asynchronously allocate a new slab while
1820          * retaining the ability to safely fall back to a smaller synchronous
1821          * allocations to ensure forward progress is always maintained.
1822          */
1823         if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
1824                 spl_kmem_alloc_t *ska;
1825
1826                 ska = kmalloc(sizeof(*ska), flags);
1827                 if (ska == NULL) {
1828                         clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
1829                         wake_up_all(&skc->skc_waitq);
1830                         SRETURN(-ENOMEM);
1831                 }
1832
1833                 atomic_inc(&skc->skc_ref);
1834                 ska->ska_cache = skc;
1835                 ska->ska_flags = flags & ~__GFP_FS;
1836                 taskq_init_ent(&ska->ska_tqe);
1837                 taskq_dispatch_ent(spl_kmem_cache_taskq,
1838                     spl_cache_grow_work, ska, 0, &ska->ska_tqe);
1839         }
1840
1841         /*
1842          * The goal here is to only detect the rare case where a virtual slab
1843          * allocation has deadlocked.  We must be careful to minimize the use
1844          * of emergency objects which are more expensive to track.  Therefore,
1845          * we set a very long timeout for the asynchronous allocation and if
1846          * the timeout is reached the cache is flagged as deadlocked.  From
1847          * this point only new emergency objects will be allocated until the
1848          * asynchronous allocation completes and clears the deadlocked flag.
1849          */
1850         if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
1851                 rc = spl_emergency_alloc(skc, flags, obj);
1852         } else {
1853                 remaining = wait_event_timeout(skc->skc_waitq,
1854                                                spl_cache_grow_wait(skc), HZ);
1855
1856                 if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) {
1857                         spin_lock(&skc->skc_lock);
1858                         if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
1859                                 set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
1860                                 skc->skc_obj_deadlock++;
1861                         }
1862                         spin_unlock(&skc->skc_lock);
1863                 }
1864
1865                 rc = -ENOMEM;
1866         }
1867
1868         SRETURN(rc);
1869 }
1870
1871 /*
1872  * Refill a per-cpu magazine with objects from the slabs for this cache.
1873  * Ideally the magazine can be repopulated using existing objects which have
1874  * been released, however if we are unable to locate enough free objects new
1875  * slabs of objects will be created.  On success NULL is returned, otherwise
1876  * the address of a single emergency object is returned for use by the caller.
1877  */
1878 static void *
1879 spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
1880 {
1881         spl_kmem_slab_t *sks;
1882         int count = 0, rc, refill;
1883         void *obj = NULL;
1884         SENTRY;
1885
1886         ASSERT(skc->skc_magic == SKC_MAGIC);
1887         ASSERT(skm->skm_magic == SKM_MAGIC);
1888
1889         refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
1890         spin_lock(&skc->skc_lock);
1891
1892         while (refill > 0) {
1893                 /* No slabs available we may need to grow the cache */
1894                 if (list_empty(&skc->skc_partial_list)) {
1895                         spin_unlock(&skc->skc_lock);
1896
1897                         local_irq_enable();
1898                         rc = spl_cache_grow(skc, flags, &obj);
1899                         local_irq_disable();
1900
1901                         /* Emergency object for immediate use by caller */
1902                         if (rc == 0 && obj != NULL)
1903                                 SRETURN(obj);
1904
1905                         if (rc)
1906                                 SGOTO(out, rc);
1907
1908                         /* Rescheduled to different CPU skm is not local */
1909                         if (skm != skc->skc_mag[smp_processor_id()])
1910                                 SGOTO(out, rc);
1911
1912                         /* Potentially rescheduled to the same CPU but
1913                          * allocations may have occurred from this CPU while
1914                          * we were sleeping so recalculate max refill. */
1915                         refill = MIN(refill, skm->skm_size - skm->skm_avail);
1916
1917                         spin_lock(&skc->skc_lock);
1918                         continue;
1919                 }
1920
1921                 /* Grab the next available slab */
1922                 sks = list_entry((&skc->skc_partial_list)->next,
1923                                  spl_kmem_slab_t, sks_list);
1924                 ASSERT(sks->sks_magic == SKS_MAGIC);
1925                 ASSERT(sks->sks_ref < sks->sks_objs);
1926                 ASSERT(!list_empty(&sks->sks_free_list));
1927
1928                 /* Consume as many objects as needed to refill the requested
1929                  * cache.  We must also be careful not to overfill it. */
1930                 while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) {
1931                         ASSERT(skm->skm_avail < skm->skm_size);
1932                         ASSERT(count < skm->skm_size);
1933                         skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks);
1934                 }
1935
1936                 /* Move slab to skc_complete_list when full */
1937                 if (sks->sks_ref == sks->sks_objs) {
1938                         list_del(&sks->sks_list);
1939                         list_add(&sks->sks_list, &skc->skc_complete_list);
1940                 }
1941         }
1942
1943         spin_unlock(&skc->skc_lock);
1944 out:
1945         SRETURN(NULL);
1946 }
1947
1948 /*
1949  * Release an object back to the slab from which it came.
1950  */
1951 static void
1952 spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
1953 {
1954         spl_kmem_slab_t *sks = NULL;
1955         spl_kmem_obj_t *sko = NULL;
1956         SENTRY;
1957
1958         ASSERT(skc->skc_magic == SKC_MAGIC);
1959         ASSERT(spin_is_locked(&skc->skc_lock));
1960
1961         sko = spl_sko_from_obj(skc, obj);
1962         ASSERT(sko->sko_magic == SKO_MAGIC);
1963         sks = sko->sko_slab;
1964         ASSERT(sks->sks_magic == SKS_MAGIC);
1965         ASSERT(sks->sks_cache == skc);
1966         list_add(&sko->sko_list, &sks->sks_free_list);
1967
1968         sks->sks_age = jiffies;
1969         sks->sks_ref--;
1970         skc->skc_obj_alloc--;
1971
1972         /* Move slab to skc_partial_list when no longer full.  Slabs
1973          * are added to the head to keep the partial list is quasi-full
1974          * sorted order.  Fuller at the head, emptier at the tail. */
1975         if (sks->sks_ref == (sks->sks_objs - 1)) {
1976                 list_del(&sks->sks_list);
1977                 list_add(&sks->sks_list, &skc->skc_partial_list);
1978         }
1979
1980         /* Move empty slabs to the end of the partial list so
1981          * they can be easily found and freed during reclamation. */
1982         if (sks->sks_ref == 0) {
1983                 list_del(&sks->sks_list);
1984                 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1985                 skc->skc_slab_alloc--;
1986         }
1987
1988         SEXIT;
1989 }
1990
1991 /*
1992  * Allocate an object from the per-cpu magazine, or if the magazine
1993  * is empty directly allocate from a slab and repopulate the magazine.
1994  */
1995 void *
1996 spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
1997 {
1998         spl_kmem_magazine_t *skm;
1999         unsigned long irq_flags;
2000         void *obj = NULL;
2001         SENTRY;
2002
2003         ASSERT(skc->skc_magic == SKC_MAGIC);
2004         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
2005         ASSERT(flags & KM_SLEEP);
2006         atomic_inc(&skc->skc_ref);
2007         local_irq_save(irq_flags);
2008
2009 restart:
2010         /* Safe to update per-cpu structure without lock, but
2011          * in the restart case we must be careful to reacquire
2012          * the local magazine since this may have changed
2013          * when we need to grow the cache. */
2014         skm = skc->skc_mag[smp_processor_id()];
2015         ASSERTF(skm->skm_magic == SKM_MAGIC, "%x != %x: %s/%p/%p %x/%x/%x\n",
2016                 skm->skm_magic, SKM_MAGIC, skc->skc_name, skc, skm,
2017                 skm->skm_size, skm->skm_refill, skm->skm_avail);
2018
2019         if (likely(skm->skm_avail)) {
2020                 /* Object available in CPU cache, use it */
2021                 obj = skm->skm_objs[--skm->skm_avail];
2022                 skm->skm_age = jiffies;
2023         } else {
2024                 obj = spl_cache_refill(skc, skm, flags);
2025                 if (obj == NULL)
2026                         SGOTO(restart, obj = NULL);
2027         }
2028
2029         local_irq_restore(irq_flags);
2030         ASSERT(obj);
2031         ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
2032
2033         /* Pre-emptively migrate object to CPU L1 cache */
2034         prefetchw(obj);
2035         atomic_dec(&skc->skc_ref);
2036
2037         SRETURN(obj);
2038 }
2039 EXPORT_SYMBOL(spl_kmem_cache_alloc);
2040
2041 /*
2042  * Free an object back to the local per-cpu magazine, there is no
2043  * guarantee that this is the same magazine the object was originally
2044  * allocated from.  We may need to flush entire from the magazine
2045  * back to the slabs to make space.
2046  */
2047 void
2048 spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
2049 {
2050         spl_kmem_magazine_t *skm;
2051         unsigned long flags;
2052         SENTRY;
2053
2054         ASSERT(skc->skc_magic == SKC_MAGIC);
2055         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
2056         atomic_inc(&skc->skc_ref);
2057
2058         /*
2059          * Only virtual slabs may have emergency objects and these objects
2060          * are guaranteed to have physical addresses.  They must be removed
2061          * from the tree of emergency objects and the freed.
2062          */
2063         if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj))
2064                 SGOTO(out, spl_emergency_free(skc, obj));
2065
2066         local_irq_save(flags);
2067
2068         /* Safe to update per-cpu structure without lock, but
2069          * no remote memory allocation tracking is being performed
2070          * it is entirely possible to allocate an object from one
2071          * CPU cache and return it to another. */
2072         skm = skc->skc_mag[smp_processor_id()];
2073         ASSERT(skm->skm_magic == SKM_MAGIC);
2074
2075         /* Per-CPU cache full, flush it to make space */
2076         if (unlikely(skm->skm_avail >= skm->skm_size))
2077                 spl_cache_flush(skc, skm, skm->skm_refill);
2078
2079         /* Available space in cache, use it */
2080         skm->skm_objs[skm->skm_avail++] = obj;
2081
2082         local_irq_restore(flags);
2083 out:
2084         atomic_dec(&skc->skc_ref);
2085
2086         SEXIT;
2087 }
2088 EXPORT_SYMBOL(spl_kmem_cache_free);
2089
2090 /*
2091  * The generic shrinker function for all caches.  Under Linux a shrinker
2092  * may not be tightly coupled with a slab cache.  In fact Linux always
2093  * systematically tries calling all registered shrinker callbacks which
2094  * report that they contain unused objects.  Because of this we only
2095  * register one shrinker function in the shim layer for all slab caches.
2096  * We always attempt to shrink all caches when this generic shrinker
2097  * is called.  The shrinker should return the number of free objects
2098  * in the cache when called with nr_to_scan == 0 but not attempt to
2099  * free any objects.  When nr_to_scan > 0 it is a request that nr_to_scan
2100  * objects should be freed, which differs from Solaris semantics.
2101  * Solaris semantics are to free all available objects which may (and
2102  * probably will) be more objects than the requested nr_to_scan.
2103  */
2104 static int
2105 __spl_kmem_cache_generic_shrinker(struct shrinker *shrink,
2106     struct shrink_control *sc)
2107 {
2108         spl_kmem_cache_t *skc;
2109         int unused = 0;
2110
2111         down_read(&spl_kmem_cache_sem);
2112         list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
2113                 if (sc->nr_to_scan)
2114                         spl_kmem_cache_reap_now(skc,
2115                            MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1));
2116
2117                 /*
2118                  * Presume everything alloc'ed in reclaimable, this ensures
2119                  * we are called again with nr_to_scan > 0 so can try and
2120                  * reclaim.  The exact number is not important either so
2121                  * we forgo taking this already highly contented lock.
2122                  */
2123                 unused += skc->skc_obj_alloc;
2124         }
2125         up_read(&spl_kmem_cache_sem);
2126
2127         return (unused * sysctl_vfs_cache_pressure) / 100;
2128 }
2129
2130 SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker);
2131
2132 /*
2133  * Call the registered reclaim function for a cache.  Depending on how
2134  * many and which objects are released it may simply repopulate the
2135  * local magazine which will then need to age-out.  Objects which cannot
2136  * fit in the magazine we will be released back to their slabs which will
2137  * also need to age out before being release.  This is all just best
2138  * effort and we do not want to thrash creating and destroying slabs.
2139  */
2140 void
2141 spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
2142 {
2143         SENTRY;
2144
2145         ASSERT(skc->skc_magic == SKC_MAGIC);
2146         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
2147
2148         /* Prevent concurrent cache reaping when contended */
2149         if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
2150                 SEXIT;
2151                 return;
2152         }
2153
2154         atomic_inc(&skc->skc_ref);
2155
2156         /*
2157          * When a reclaim function is available it may be invoked repeatedly
2158          * until at least a single slab can be freed.  This ensures that we
2159          * do free memory back to the system.  This helps minimize the chance
2160          * of an OOM event when the bulk of memory is used by the slab.
2161          *
2162          * When free slabs are already available the reclaim callback will be
2163          * skipped.  Additionally, if no forward progress is detected despite
2164          * a reclaim function the cache will be skipped to avoid deadlock.
2165          *
2166          * Longer term this would be the correct place to add the code which
2167          * repacks the slabs in order minimize fragmentation.
2168          */
2169         if (skc->skc_reclaim) {
2170                 uint64_t objects = UINT64_MAX;
2171                 int do_reclaim;
2172
2173                 do {
2174                         spin_lock(&skc->skc_lock);
2175                         do_reclaim =
2176                             (skc->skc_slab_total > 0) &&
2177                             ((skc->skc_slab_total - skc->skc_slab_alloc) == 0) &&
2178                             (skc->skc_obj_alloc < objects);
2179
2180                         objects = skc->skc_obj_alloc;
2181                         spin_unlock(&skc->skc_lock);
2182
2183                         if (do_reclaim)
2184                                 skc->skc_reclaim(skc->skc_private);
2185
2186                 } while (do_reclaim);
2187         }
2188
2189         /* Reclaim from the magazine then the slabs ignoring age and delay. */
2190         if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) {
2191                 spl_kmem_magazine_t *skm;
2192                 int i;
2193
2194                 for_each_online_cpu(i) {
2195                         skm = skc->skc_mag[i];
2196                         spl_cache_flush(skc, skm, skm->skm_avail);
2197                 }
2198         }
2199
2200         spl_slab_reclaim(skc, count, 1);
2201         clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
2202         smp_mb__after_clear_bit();
2203         wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
2204
2205         atomic_dec(&skc->skc_ref);
2206
2207         SEXIT;
2208 }
2209 EXPORT_SYMBOL(spl_kmem_cache_reap_now);
2210
2211 /*
2212  * Reap all free slabs from all registered caches.
2213  */
2214 void
2215 spl_kmem_reap(void)
2216 {
2217         struct shrink_control sc;
2218
2219         sc.nr_to_scan = KMC_REAP_CHUNK;
2220         sc.gfp_mask = GFP_KERNEL;
2221
2222         __spl_kmem_cache_generic_shrinker(NULL, &sc);
2223 }
2224 EXPORT_SYMBOL(spl_kmem_reap);
2225
2226 #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
2227 static char *
2228 spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
2229 {
2230         int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
2231         int i, flag = 1;
2232
2233         ASSERT(str != NULL && len >= 17);
2234         memset(str, 0, len);
2235
2236         /* Check for a fully printable string, and while we are at
2237          * it place the printable characters in the passed buffer. */
2238         for (i = 0; i < size; i++) {
2239                 str[i] = ((char *)(kd->kd_addr))[i];
2240                 if (isprint(str[i])) {
2241                         continue;
2242                 } else {
2243                         /* Minimum number of printable characters found
2244                          * to make it worthwhile to print this as ascii. */
2245                         if (i > min)
2246                                 break;
2247
2248                         flag = 0;
2249                         break;
2250                 }
2251         }
2252
2253         if (!flag) {
2254                 sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
2255                         *((uint8_t *)kd->kd_addr),
2256                         *((uint8_t *)kd->kd_addr + 2),
2257                         *((uint8_t *)kd->kd_addr + 4),
2258                         *((uint8_t *)kd->kd_addr + 6),
2259                         *((uint8_t *)kd->kd_addr + 8),
2260                         *((uint8_t *)kd->kd_addr + 10),
2261                         *((uint8_t *)kd->kd_addr + 12),
2262                         *((uint8_t *)kd->kd_addr + 14));
2263         }
2264
2265         return str;
2266 }
2267
2268 static int
2269 spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
2270 {
2271         int i;
2272         SENTRY;
2273
2274         spin_lock_init(lock);
2275         INIT_LIST_HEAD(list);
2276
2277         for (i = 0; i < size; i++)
2278                 INIT_HLIST_HEAD(&kmem_table[i]);
2279
2280         SRETURN(0);
2281 }
2282
2283 static void
2284 spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
2285 {
2286         unsigned long flags;
2287         kmem_debug_t *kd;
2288         char str[17];
2289         SENTRY;
2290
2291         spin_lock_irqsave(lock, flags);
2292         if (!list_empty(list))
2293                 printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
2294                        "size", "data", "func", "line");
2295
2296         list_for_each_entry(kd, list, kd_list)
2297                 printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
2298                        (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
2299                        kd->kd_func, kd->kd_line);
2300
2301         spin_unlock_irqrestore(lock, flags);
2302         SEXIT;
2303 }
2304 #else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
2305 #define spl_kmem_init_tracking(list, lock, size)
2306 #define spl_kmem_fini_tracking(list, lock)
2307 #endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
2308
2309 static void
2310 spl_kmem_init_globals(void)
2311 {
2312         struct zone *zone;
2313
2314         /* For now all zones are includes, it may be wise to restrict
2315          * this to normal and highmem zones if we see problems. */
2316         for_each_zone(zone) {
2317
2318                 if (!populated_zone(zone))
2319                         continue;
2320
2321                 minfree += min_wmark_pages(zone);
2322                 desfree += low_wmark_pages(zone);
2323                 lotsfree += high_wmark_pages(zone);
2324         }
2325
2326         /* Solaris default values */
2327         swapfs_minfree = MAX(2*1024*1024 >> PAGE_SHIFT, physmem >> 3);
2328         swapfs_reserve = MIN(4*1024*1024 >> PAGE_SHIFT, physmem >> 4);
2329 }
2330
2331 /*
2332  * Called at module init when it is safe to use spl_kallsyms_lookup_name()
2333  */
2334 int
2335 spl_kmem_init_kallsyms_lookup(void)
2336 {
2337 #ifndef HAVE_GET_VMALLOC_INFO
2338         get_vmalloc_info_fn = (get_vmalloc_info_t)
2339                 spl_kallsyms_lookup_name("get_vmalloc_info");
2340         if (!get_vmalloc_info_fn) {
2341                 printk(KERN_ERR "Error: Unknown symbol get_vmalloc_info\n");
2342                 return -EFAULT;
2343         }
2344 #endif /* HAVE_GET_VMALLOC_INFO */
2345
2346 #ifdef HAVE_PGDAT_HELPERS
2347 # ifndef HAVE_FIRST_ONLINE_PGDAT
2348         first_online_pgdat_fn = (first_online_pgdat_t)
2349                 spl_kallsyms_lookup_name("first_online_pgdat");
2350         if (!first_online_pgdat_fn) {
2351                 printk(KERN_ERR "Error: Unknown symbol first_online_pgdat\n");
2352                 return -EFAULT;
2353         }
2354 # endif /* HAVE_FIRST_ONLINE_PGDAT */
2355
2356 # ifndef HAVE_NEXT_ONLINE_PGDAT
2357         next_online_pgdat_fn = (next_online_pgdat_t)
2358                 spl_kallsyms_lookup_name("next_online_pgdat");
2359         if (!next_online_pgdat_fn) {
2360                 printk(KERN_ERR "Error: Unknown symbol next_online_pgdat\n");
2361                 return -EFAULT;
2362         }
2363 # endif /* HAVE_NEXT_ONLINE_PGDAT */
2364
2365 # ifndef HAVE_NEXT_ZONE
2366         next_zone_fn = (next_zone_t)
2367                 spl_kallsyms_lookup_name("next_zone");
2368         if (!next_zone_fn) {
2369                 printk(KERN_ERR "Error: Unknown symbol next_zone\n");
2370                 return -EFAULT;
2371         }
2372 # endif /* HAVE_NEXT_ZONE */
2373
2374 #else /* HAVE_PGDAT_HELPERS */
2375
2376 # ifndef HAVE_PGDAT_LIST
2377         pgdat_list_addr = *(struct pglist_data **)
2378                 spl_kallsyms_lookup_name("pgdat_list");
2379         if (!pgdat_list_addr) {
2380                 printk(KERN_ERR "Error: Unknown symbol pgdat_list\n");
2381                 return -EFAULT;
2382         }
2383 # endif /* HAVE_PGDAT_LIST */
2384 #endif /* HAVE_PGDAT_HELPERS */
2385
2386 #if defined(NEED_GET_ZONE_COUNTS) && !defined(HAVE_GET_ZONE_COUNTS)
2387         get_zone_counts_fn = (get_zone_counts_t)
2388                 spl_kallsyms_lookup_name("get_zone_counts");
2389         if (!get_zone_counts_fn) {
2390                 printk(KERN_ERR "Error: Unknown symbol get_zone_counts\n");
2391                 return -EFAULT;
2392         }
2393 #endif  /* NEED_GET_ZONE_COUNTS && !HAVE_GET_ZONE_COUNTS */
2394
2395         /*
2396          * It is now safe to initialize the global tunings which rely on
2397          * the use of the for_each_zone() macro.  This macro in turns
2398          * depends on the *_pgdat symbols which are now available.
2399          */
2400         spl_kmem_init_globals();
2401
2402 #ifndef HAVE_SHRINK_DCACHE_MEMORY
2403         /* When shrink_dcache_memory_fn == NULL support is disabled */
2404         shrink_dcache_memory_fn = (shrink_dcache_memory_t)
2405                 spl_kallsyms_lookup_name("shrink_dcache_memory");
2406 #endif /* HAVE_SHRINK_DCACHE_MEMORY */
2407
2408 #ifndef HAVE_SHRINK_ICACHE_MEMORY
2409         /* When shrink_icache_memory_fn == NULL support is disabled */
2410         shrink_icache_memory_fn = (shrink_icache_memory_t)
2411                 spl_kallsyms_lookup_name("shrink_icache_memory");
2412 #endif /* HAVE_SHRINK_ICACHE_MEMORY */
2413
2414         return 0;
2415 }
2416
2417 int
2418 spl_kmem_init(void)
2419 {
2420         int rc = 0;
2421         SENTRY;
2422
2423         init_rwsem(&spl_kmem_cache_sem);
2424         INIT_LIST_HEAD(&spl_kmem_cache_list);
2425         spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
2426             1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE);
2427
2428         spl_register_shrinker(&spl_kmem_cache_shrinker);
2429
2430 #ifdef DEBUG_KMEM
2431         kmem_alloc_used_set(0);
2432         vmem_alloc_used_set(0);
2433
2434         spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
2435         spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE);
2436 #endif
2437         SRETURN(rc);
2438 }
2439
2440 void
2441 spl_kmem_fini(void)
2442 {
2443 #ifdef DEBUG_KMEM
2444         /* Display all unreclaimed memory addresses, including the
2445          * allocation size and the first few bytes of what's located
2446          * at that address to aid in debugging.  Performance is not
2447          * a serious concern here since it is module unload time. */
2448         if (kmem_alloc_used_read() != 0)
2449                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
2450                     "kmem leaked %ld/%ld bytes\n",
2451                     kmem_alloc_used_read(), kmem_alloc_max);
2452
2453
2454         if (vmem_alloc_used_read() != 0)
2455                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
2456                     "vmem leaked %ld/%ld bytes\n",
2457                     vmem_alloc_used_read(), vmem_alloc_max);
2458
2459         spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
2460         spl_kmem_fini_tracking(&vmem_list, &vmem_lock);
2461 #endif /* DEBUG_KMEM */
2462         SENTRY;
2463
2464         spl_unregister_shrinker(&spl_kmem_cache_shrinker);
2465         taskq_destroy(spl_kmem_cache_taskq);
2466
2467         SEXIT;
2468 }