/*****************************************************************************\
 *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
 *  Copyright (C) 2007 The Regents of the University of California.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
 *
 *  This file is part of the SPL, Solaris Porting Layer.
 *  For details, see <http://zfsonlinux.org/>.
 *
 *  The SPL is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU General Public License as published by the
 *  Free Software Foundation; either version 2 of the License, or (at your
 *  option) any later version.
 *
 *  The SPL is distributed in the hope that it will be useful, but WITHOUT
 *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
 *****************************************************************************
 *  Solaris Porting Layer (SPL) Kmem Implementation.
\*****************************************************************************/
#include <sys/kmem.h>
#include <spl-debug.h>

#ifdef SS_DEBUG_SUBSYS
#undef SS_DEBUG_SUBSYS
#endif

#define SS_DEBUG_SUBSYS SS_KMEM
/*
 * Cache expiration was implemented because it was part of the default Solaris
 * kmem_cache behavior.  The idea is that per-cpu objects which haven't been
 * accessed in several seconds should be returned to the cache.  On the other
 * hand Linux slabs never move objects back to the slabs unless there is
 * memory pressure on the system.  By default both methods are disabled, but
 * may be enabled by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM.
 */
unsigned int spl_kmem_cache_expire = 0;
EXPORT_SYMBOL(spl_kmem_cache_expire);
module_param(spl_kmem_cache_expire, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)");
/*
 * The minimum amount of memory measured in pages to be free at all
 * times on the system.  This is similar to Linux's zone->pages_min
 * multiplied by the number of zones and is sized based on that.
 */
pgcnt_t minfree = 0;
EXPORT_SYMBOL(minfree);
/*
 * The desired amount of memory measured in pages to be free at all
 * times on the system.  This is similar to Linux's zone->pages_low
 * multiplied by the number of zones and is sized based on that.
 * Assuming all zones are being used roughly equally, when we drop
 * below this threshold asynchronous page reclamation is triggered.
 */
pgcnt_t desfree = 0;
EXPORT_SYMBOL(desfree);
/*
 * When above this amount of memory, measured in pages, the system is
 * determined to have enough free memory.  This is similar to Linux's
 * zone->pages_high multiplied by the number of zones and is sized based
 * on that.  Assuming all zones are being used roughly equally, when
 * asynchronous page reclamation reaches this threshold it stops.
 */
pgcnt_t lotsfree = 0;
EXPORT_SYMBOL(lotsfree);
/* Unused, always 0 in this implementation. */
pgcnt_t needfree = 0;
EXPORT_SYMBOL(needfree);
pgcnt_t swapfs_minfree = 0;
EXPORT_SYMBOL(swapfs_minfree);

pgcnt_t swapfs_reserve = 0;
EXPORT_SYMBOL(swapfs_reserve);

vmem_t *heap_arena = NULL;
EXPORT_SYMBOL(heap_arena);

vmem_t *zio_alloc_arena = NULL;
EXPORT_SYMBOL(zio_alloc_arena);

vmem_t *zio_arena = NULL;
EXPORT_SYMBOL(zio_arena);
#ifndef HAVE_GET_VMALLOC_INFO
get_vmalloc_info_t get_vmalloc_info_fn = SYMBOL_POISON;
EXPORT_SYMBOL(get_vmalloc_info_fn);
#endif /* HAVE_GET_VMALLOC_INFO */

#ifdef HAVE_PGDAT_HELPERS
# ifndef HAVE_FIRST_ONLINE_PGDAT
first_online_pgdat_t first_online_pgdat_fn = SYMBOL_POISON;
EXPORT_SYMBOL(first_online_pgdat_fn);
# endif /* HAVE_FIRST_ONLINE_PGDAT */

# ifndef HAVE_NEXT_ONLINE_PGDAT
next_online_pgdat_t next_online_pgdat_fn = SYMBOL_POISON;
EXPORT_SYMBOL(next_online_pgdat_fn);
# endif /* HAVE_NEXT_ONLINE_PGDAT */

# ifndef HAVE_NEXT_ZONE
next_zone_t next_zone_fn = SYMBOL_POISON;
EXPORT_SYMBOL(next_zone_fn);
# endif /* HAVE_NEXT_ZONE */

#else /* HAVE_PGDAT_HELPERS */

# ifndef HAVE_PGDAT_LIST
struct pglist_data *pgdat_list_addr = SYMBOL_POISON;
EXPORT_SYMBOL(pgdat_list_addr);
# endif /* HAVE_PGDAT_LIST */

#endif /* HAVE_PGDAT_HELPERS */

#ifdef NEED_GET_ZONE_COUNTS
# ifndef HAVE_GET_ZONE_COUNTS
get_zone_counts_t get_zone_counts_fn = SYMBOL_POISON;
EXPORT_SYMBOL(get_zone_counts_fn);
# endif /* HAVE_GET_ZONE_COUNTS */
unsigned long
spl_global_page_state(spl_zone_stat_item_t item)
{
	unsigned long active;
	unsigned long inactive;
	unsigned long free;

	get_zone_counts(&active, &inactive, &free);
	switch (item) {
	case SPL_NR_FREE_PAGES: return free;
	case SPL_NR_INACTIVE:   return inactive;
	case SPL_NR_ACTIVE:     return active;
	default:                ASSERT(0); /* Unsupported */
	}

	return 0;
}
#else /* NEED_GET_ZONE_COUNTS */
# ifdef HAVE_GLOBAL_PAGE_STATE
unsigned long
spl_global_page_state(spl_zone_stat_item_t item)
{
	unsigned long pages = 0;

	switch (item) {
	case SPL_NR_FREE_PAGES:
# ifdef HAVE_ZONE_STAT_ITEM_NR_FREE_PAGES
		pages += global_page_state(NR_FREE_PAGES);
# endif
		break;
	case SPL_NR_INACTIVE:
# ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE
		pages += global_page_state(NR_INACTIVE);
# endif
# ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE_ANON
		pages += global_page_state(NR_INACTIVE_ANON);
# endif
# ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE_FILE
		pages += global_page_state(NR_INACTIVE_FILE);
# endif
		break;
	case SPL_NR_ACTIVE:
# ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE
		pages += global_page_state(NR_ACTIVE);
# endif
# ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE_ANON
		pages += global_page_state(NR_ACTIVE_ANON);
# endif
# ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE_FILE
		pages += global_page_state(NR_ACTIVE_FILE);
# endif
		break;
	default:
		ASSERT(0); /* Unsupported */
	}

	return (pages);
}
# else
# error "Both global_page_state() and get_zone_counts() unavailable"
# endif /* HAVE_GLOBAL_PAGE_STATE */
#endif /* NEED_GET_ZONE_COUNTS */
EXPORT_SYMBOL(spl_global_page_state);
#ifndef HAVE_SHRINK_DCACHE_MEMORY
shrink_dcache_memory_t shrink_dcache_memory_fn = SYMBOL_POISON;
EXPORT_SYMBOL(shrink_dcache_memory_fn);
#endif /* HAVE_SHRINK_DCACHE_MEMORY */

#ifndef HAVE_SHRINK_ICACHE_MEMORY
shrink_icache_memory_t shrink_icache_memory_fn = SYMBOL_POISON;
EXPORT_SYMBOL(shrink_icache_memory_fn);
#endif /* HAVE_SHRINK_ICACHE_MEMORY */
pgcnt_t
spl_kmem_availrmem(void)
{
	/* The amount of easily available memory */
	return (spl_global_page_state(SPL_NR_FREE_PAGES) +
	        spl_global_page_state(SPL_NR_INACTIVE));
}
EXPORT_SYMBOL(spl_kmem_availrmem);
size_t
vmem_size(vmem_t *vmp, int typemask)
{
	struct vmalloc_info vmi;
	size_t size = 0;

	ASSERT(typemask & (VMEM_ALLOC | VMEM_FREE));

	get_vmalloc_info(&vmi);
	if (typemask & VMEM_ALLOC)
		size += (size_t)vmi.used;

	if (typemask & VMEM_FREE)
		size += (size_t)(VMALLOC_TOTAL - vmi.used);

	return size;
}
EXPORT_SYMBOL(vmem_size);
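
/*
 * Usage sketch (illustrative only, not part of the original source):
 * callers can query how much of the kernel virtual address space is
 * consumed or still available, for example:
 *
 *	size_t used  = vmem_size(NULL, VMEM_ALLOC);
 *	size_t avail = vmem_size(NULL, VMEM_FREE);
 *	size_t total = vmem_size(NULL, VMEM_ALLOC | VMEM_FREE);
 *
 * The vmem_t pointer is unused by this implementation and is expected
 * to be NULL.
 */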
int
kmem_debugging(void)
{
	return 0;
}
EXPORT_SYMBOL(kmem_debugging);
#ifndef HAVE_KVASPRINTF
/* Simplified asprintf. */
char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
{
	unsigned int len;
	char *p;
	va_list aq;

	va_copy(aq, ap);
	len = vsnprintf(NULL, 0, fmt, aq);
	va_end(aq);

	p = kmalloc(len+1, gfp);
	if (!p)
		return NULL;

	vsnprintf(p, len+1, fmt, ap);

	return p;
}
EXPORT_SYMBOL(kvasprintf);
#endif /* HAVE_KVASPRINTF */
char *
kmem_vasprintf(const char *fmt, va_list ap)
{
	va_list aq;
	char *ptr;

	do {
		va_copy(aq, ap);
		ptr = kvasprintf(GFP_KERNEL, fmt, aq);
		va_end(aq);
	} while (ptr == NULL);

	return ptr;
}
EXPORT_SYMBOL(kmem_vasprintf);
char *
kmem_asprintf(const char *fmt, ...)
{
	va_list ap;
	char *ptr;

	do {
		va_start(ap, fmt);
		ptr = kvasprintf(GFP_KERNEL, fmt, ap);
		va_end(ap);
	} while (ptr == NULL);

	return ptr;
}
EXPORT_SYMBOL(kmem_asprintf);
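
/*
 * Usage sketch (illustrative only, not part of the original source):
 * kmem_asprintf() retries until the allocation succeeds, so the caller
 * never sees NULL and must release the result with strfree():
 *
 *	char *msg = kmem_asprintf("cache-%s-%d", name, id);
 *	...
 *	strfree(msg);
 *
 * The "name" and "id" variables above are hypothetical.
 */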
static char *
__strdup(const char *str, int flags)
{
	char *ptr;
	int n;

	n = strlen(str);
	ptr = kmalloc_nofail(n + 1, flags);
	if (ptr)
		memcpy(ptr, str, n + 1);

	return ptr;
}

char *
strdup(const char *str)
{
	return __strdup(str, KM_SLEEP);
}
EXPORT_SYMBOL(strdup);

void
strfree(char *str)
{
	kfree(str);
}
EXPORT_SYMBOL(strfree);
/*
 * Memory allocation interfaces and debugging for basic kmem_*
 * and vmem_* style memory allocation.  When DEBUG_KMEM is enabled
 * the SPL will keep track of the total memory allocated, and
 * report any memory leaked when the module is unloaded.
 */
#ifdef DEBUG_KMEM

/* Shim layer memory accounting */
# ifdef HAVE_ATOMIC64_T
atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
unsigned long long kmem_alloc_max = 0;
atomic64_t vmem_alloc_used = ATOMIC64_INIT(0);
unsigned long long vmem_alloc_max = 0;
# else  /* HAVE_ATOMIC64_T */
atomic_t kmem_alloc_used = ATOMIC_INIT(0);
unsigned long long kmem_alloc_max = 0;
atomic_t vmem_alloc_used = ATOMIC_INIT(0);
unsigned long long vmem_alloc_max = 0;
# endif /* HAVE_ATOMIC64_T */

EXPORT_SYMBOL(kmem_alloc_used);
EXPORT_SYMBOL(kmem_alloc_max);
EXPORT_SYMBOL(vmem_alloc_used);
EXPORT_SYMBOL(vmem_alloc_max);
/*
 * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
 * but also the location of every alloc and free.  When the SPL module is
 * unloaded a list of all leaked addresses and where they were allocated
 * will be dumped to the console.  Enabling this feature has a significant
 * impact on performance but it makes finding memory leaks straightforward.
 *
 * Not surprisingly, with debugging enabled the xmem_locks are very highly
 * contended, particularly on xfree().  If we want to run with this detailed
 * debugging enabled for anything other than debugging we need to minimize
 * the contention by moving to a lock per xmem_table entry model.
 */
# ifdef DEBUG_KMEM_TRACKING

# define KMEM_HASH_BITS		10
# define KMEM_TABLE_SIZE	(1 << KMEM_HASH_BITS)

# define VMEM_HASH_BITS		10
# define VMEM_TABLE_SIZE	(1 << VMEM_HASH_BITS)
typedef struct kmem_debug {
	struct hlist_node kd_hlist;	/* Hash node linkage */
	struct list_head kd_list;	/* List of all allocations */
	void *kd_addr;			/* Allocation pointer */
	size_t kd_size;			/* Allocation size */
	const char *kd_func;		/* Allocation function */
	int kd_line;			/* Allocation line */
} kmem_debug_t;

spinlock_t kmem_lock;
struct hlist_head kmem_table[KMEM_TABLE_SIZE];
struct list_head kmem_list;

spinlock_t vmem_lock;
struct hlist_head vmem_table[VMEM_TABLE_SIZE];
struct list_head vmem_list;

EXPORT_SYMBOL(kmem_lock);
EXPORT_SYMBOL(kmem_table);
EXPORT_SYMBOL(kmem_list);

EXPORT_SYMBOL(vmem_lock);
EXPORT_SYMBOL(vmem_table);
EXPORT_SYMBOL(vmem_list);
static kmem_debug_t *
kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, const void *addr)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct kmem_debug *p;
	unsigned long flags;

	spin_lock_irqsave(lock, flags);

	head = &table[hash_ptr((void *)addr, bits)];
	hlist_for_each(node, head) {
		p = list_entry(node, struct kmem_debug, kd_hlist);
		if (p->kd_addr == addr) {
			hlist_del_init(&p->kd_hlist);
			list_del_init(&p->kd_list);
			spin_unlock_irqrestore(lock, flags);
			return p;
		}
	}

	spin_unlock_irqrestore(lock, flags);

	return NULL;
}
void *
kmem_alloc_track(size_t size, int flags, const char *func, int line,
    int node_alloc, int node)
{
	void *ptr = NULL;
	kmem_debug_t *dptr;
	unsigned long irq_flags;

	/* Function may be called with KM_NOSLEEP so failure is possible */
	dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
	    flags & ~__GFP_ZERO);
	if (unlikely(dptr == NULL)) {
		SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "debug "
		    "kmem_alloc(%ld, 0x%x) at %s:%d failed (%lld/%llu)\n",
		    sizeof(kmem_debug_t), flags, func, line,
		    kmem_alloc_used_read(), kmem_alloc_max);
		goto out;
	}

	/*
	 * Marked unlikely because we should never be doing this;
	 * we tolerate up to 2 pages but a single page is best.
	 */
	if (unlikely((size > PAGE_SIZE*2) && !(flags & KM_NODEBUG))) {
		SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "large "
		    "kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
		    (unsigned long long) size, flags, func, line,
		    kmem_alloc_used_read(), kmem_alloc_max);
		spl_debug_dumpstack(NULL);
	}

	/*
	 * We use __strdup() below because the string pointed to by
	 * __FUNCTION__ might not be available by the time we want
	 * to print it since the module might have been unloaded.
	 * This can only fail in the KM_NOSLEEP case.
	 */
	dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
	if (unlikely(dptr->kd_func == NULL)) {
		kfree(dptr);
		SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
		    "debug __strdup() at %s:%d failed (%lld/%llu)\n",
		    func, line, kmem_alloc_used_read(), kmem_alloc_max);
		goto out;
	}

	/* Use the correct allocator */
	if (node_alloc) {
		ASSERT(!(flags & __GFP_ZERO));
		ptr = kmalloc_node_nofail(size, flags, node);
	} else if (flags & __GFP_ZERO) {
		ptr = kzalloc_nofail(size, flags & ~__GFP_ZERO);
	} else {
		ptr = kmalloc_nofail(size, flags);
	}

	if (unlikely(ptr == NULL)) {
		kfree(dptr->kd_func);
		kfree(dptr);
		SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "kmem_alloc"
		    "(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
		    (unsigned long long) size, flags, func, line,
		    kmem_alloc_used_read(), kmem_alloc_max);
		goto out;
	}

	kmem_alloc_used_add(size);
	if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
		kmem_alloc_max = kmem_alloc_used_read();

	INIT_HLIST_NODE(&dptr->kd_hlist);
	INIT_LIST_HEAD(&dptr->kd_list);

	dptr->kd_addr = ptr;
	dptr->kd_size = size;
	dptr->kd_line = line;

	spin_lock_irqsave(&kmem_lock, irq_flags);
	hlist_add_head(&dptr->kd_hlist,
	    &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
	list_add_tail(&dptr->kd_list, &kmem_list);
	spin_unlock_irqrestore(&kmem_lock, irq_flags);

	SDEBUG_LIMIT(SD_INFO,
	    "kmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n",
	    (unsigned long long) size, flags, func, line, ptr,
	    kmem_alloc_used_read(), kmem_alloc_max);
out:
	return ptr;
}
EXPORT_SYMBOL(kmem_alloc_track);
void
kmem_free_track(const void *ptr, size_t size)
{
	kmem_debug_t *dptr;

	ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
	    (unsigned long long) size);

	dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);

	/* Must exist in hash due to kmem_alloc() */
	ASSERT(dptr);

	/* Size must match */
	ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
	    "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
	    (unsigned long long) size, dptr->kd_func, dptr->kd_line);

	kmem_alloc_used_sub(size);
	SDEBUG_LIMIT(SD_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
	    (unsigned long long) size, kmem_alloc_used_read(),
	    kmem_alloc_max);

	kfree(dptr->kd_func);

	memset((void *)dptr, 0x5a, sizeof(kmem_debug_t));
	kfree(dptr);

	memset((void *)ptr, 0x5a, size);
	kfree(ptr);
}
EXPORT_SYMBOL(kmem_free_track);
void *
vmem_alloc_track(size_t size, int flags, const char *func, int line)
{
	void *ptr = NULL;
	kmem_debug_t *dptr;
	unsigned long irq_flags;

	ASSERT(flags & KM_SLEEP);

	/* Function may be called with KM_NOSLEEP so failure is possible */
	dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
	    flags & ~__GFP_ZERO);
	if (unlikely(dptr == NULL)) {
		SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "debug "
		    "vmem_alloc(%ld, 0x%x) at %s:%d failed (%lld/%llu)\n",
		    sizeof(kmem_debug_t), flags, func, line,
		    vmem_alloc_used_read(), vmem_alloc_max);
		goto out;
	}

	/*
	 * We use __strdup() below because the string pointed to by
	 * __FUNCTION__ might not be available by the time we want
	 * to print it, since the module might have been unloaded.
	 * This can never fail because we have already asserted
	 * that flags is KM_SLEEP.
	 */
	dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
	if (unlikely(dptr->kd_func == NULL)) {
		kfree(dptr);
		SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
		    "debug __strdup() at %s:%d failed (%lld/%llu)\n",
		    func, line, vmem_alloc_used_read(), vmem_alloc_max);
		goto out;
	}

	/* Use the correct allocator */
	if (flags & __GFP_ZERO) {
		ptr = vzalloc_nofail(size, flags & ~__GFP_ZERO);
	} else {
		ptr = vmalloc_nofail(size, flags);
	}

	if (unlikely(ptr == NULL)) {
		kfree(dptr->kd_func);
		kfree(dptr);
		SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "vmem_alloc"
		    "(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
		    (unsigned long long) size, flags, func, line,
		    vmem_alloc_used_read(), vmem_alloc_max);
		goto out;
	}

	vmem_alloc_used_add(size);
	if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
		vmem_alloc_max = vmem_alloc_used_read();

	INIT_HLIST_NODE(&dptr->kd_hlist);
	INIT_LIST_HEAD(&dptr->kd_list);

	dptr->kd_addr = ptr;
	dptr->kd_size = size;
	dptr->kd_line = line;

	spin_lock_irqsave(&vmem_lock, irq_flags);
	hlist_add_head(&dptr->kd_hlist,
	    &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]);
	list_add_tail(&dptr->kd_list, &vmem_list);
	spin_unlock_irqrestore(&vmem_lock, irq_flags);

	SDEBUG_LIMIT(SD_INFO,
	    "vmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n",
	    (unsigned long long) size, flags, func, line,
	    ptr, vmem_alloc_used_read(), vmem_alloc_max);
out:
	return ptr;
}
EXPORT_SYMBOL(vmem_alloc_track);
void
vmem_free_track(const void *ptr, size_t size)
{
	kmem_debug_t *dptr;

	ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
	    (unsigned long long) size);

	dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr);

	/* Must exist in hash due to vmem_alloc() */
	ASSERT(dptr);

	/* Size must match */
	ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
	    "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
	    (unsigned long long) size, dptr->kd_func, dptr->kd_line);

	vmem_alloc_used_sub(size);
	SDEBUG_LIMIT(SD_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
	    (unsigned long long) size, vmem_alloc_used_read(),
	    vmem_alloc_max);

	kfree(dptr->kd_func);

	memset((void *)dptr, 0x5a, sizeof(kmem_debug_t));
	kfree(dptr);

	memset((void *)ptr, 0x5a, size);
	vfree(ptr);
}
EXPORT_SYMBOL(vmem_free_track);
# else /* DEBUG_KMEM_TRACKING */

void *
kmem_alloc_debug(size_t size, int flags, const char *func, int line,
    int node_alloc, int node)
{
	void *ptr;

	/*
	 * Marked unlikely because we should never be doing this;
	 * we tolerate up to 2 pages but a single page is best.
	 */
	if (unlikely((size > PAGE_SIZE * 2) && !(flags & KM_NODEBUG))) {
		SDEBUG(SD_CONSOLE | SD_WARNING,
		    "large kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
		    (unsigned long long) size, flags, func, line,
		    kmem_alloc_used_read(), kmem_alloc_max);
	}

	/* Use the correct allocator */
	if (node_alloc) {
		ASSERT(!(flags & __GFP_ZERO));
		ptr = kmalloc_node_nofail(size, flags, node);
	} else if (flags & __GFP_ZERO) {
		ptr = kzalloc_nofail(size, flags & (~__GFP_ZERO));
	} else {
		ptr = kmalloc_nofail(size, flags);
	}

	if (unlikely(ptr == NULL)) {
		SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
		    "kmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
		    (unsigned long long) size, flags, func, line,
		    kmem_alloc_used_read(), kmem_alloc_max);
	} else {
		kmem_alloc_used_add(size);
		if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
			kmem_alloc_max = kmem_alloc_used_read();

		SDEBUG_LIMIT(SD_INFO,
		    "kmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n",
		    (unsigned long long) size, flags, func, line, ptr,
		    kmem_alloc_used_read(), kmem_alloc_max);
	}

	return ptr;
}
EXPORT_SYMBOL(kmem_alloc_debug);
void
kmem_free_debug(const void *ptr, size_t size)
{
	ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
	    (unsigned long long) size);

	kmem_alloc_used_sub(size);
	SDEBUG_LIMIT(SD_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
	    (unsigned long long) size, kmem_alloc_used_read(),
	    kmem_alloc_max);

	kfree(ptr);
}
EXPORT_SYMBOL(kmem_free_debug);
void *
vmem_alloc_debug(size_t size, int flags, const char *func, int line)
{
	void *ptr;

	ASSERT(flags & KM_SLEEP);

	/* Use the correct allocator */
	if (flags & __GFP_ZERO) {
		ptr = vzalloc_nofail(size, flags & (~__GFP_ZERO));
	} else {
		ptr = vmalloc_nofail(size, flags);
	}

	if (unlikely(ptr == NULL)) {
		SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
		    "vmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
		    (unsigned long long) size, flags, func, line,
		    vmem_alloc_used_read(), vmem_alloc_max);
	} else {
		vmem_alloc_used_add(size);
		if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
			vmem_alloc_max = vmem_alloc_used_read();

		SDEBUG_LIMIT(SD_INFO, "vmem_alloc(%llu, 0x%x) = %p "
		    "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
		    vmem_alloc_used_read(), vmem_alloc_max);
	}

	return ptr;
}
EXPORT_SYMBOL(vmem_alloc_debug);
void
vmem_free_debug(const void *ptr, size_t size)
{
	ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
	    (unsigned long long) size);

	vmem_alloc_used_sub(size);
	SDEBUG_LIMIT(SD_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
	    (unsigned long long) size, vmem_alloc_used_read(),
	    vmem_alloc_max);

	vfree(ptr);
}
EXPORT_SYMBOL(vmem_free_debug);

# endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */
/*
 * Slab allocation interfaces
 *
 * While the Linux slab implementation was inspired by the Solaris
 * implementation I cannot use it to emulate the Solaris APIs.  I
 * require two features which are not provided by the Linux slab.
 *
 * 1) Constructors AND destructors.  Recent versions of the Linux
 *    kernel have removed support for destructors.  This is a deal
 *    breaker for the SPL which contains particularly expensive
 *    initializers for mutexes, condition variables, etc.  We also
 *    require a minimal level of cleanup for these data types, unlike
 *    many Linux data types which do not need to be explicitly destroyed.
 *
 * 2) Virtual address space backed slab.  Callers of the Solaris slab
 *    expect it to work well for both small and very large allocations.
 *    Because of memory fragmentation the Linux slab, which is backed
 *    by kmalloc'ed memory, performs very badly when confronted with
 *    large numbers of large allocations.  Basing the slab on the
 *    virtual address space removes the need for contiguous pages
 *    and greatly improves performance for large allocations.
 *
 * For these reasons, the SPL has its own slab implementation with
 * the needed features.  It is not as highly optimized as either the
 * Solaris or Linux slabs, but it should get me most of what is
 * needed until it can be optimized or obsoleted by another approach.
 *
 * One serious concern I do have about this method is the relatively
 * small virtual address space on 32bit arches.  This will seriously
 * constrain the size of the slab caches and their performance.
 *
 * XXX: Improve the partial slab list by carefully maintaining a
 *      strict ordering of fullest to emptiest slabs based on
 *      the slab reference count.  This guarantees that when freeing
 *      slabs back to the system we need only linearly traverse the
 *      last N slabs in the list to discover all the freeable slabs.
 *
 * XXX: NUMA awareness for optionally allocating memory close to a
 *      particular core.  This can be advantageous if you know the slab
 *      object will be short lived and primarily accessed from one core.
 *
 * XXX: Slab coloring may also yield performance improvements and would
 *      be desirable to implement.
 */
struct list_head spl_kmem_cache_list;	/* List of caches */
struct rw_semaphore spl_kmem_cache_sem;	/* Cache list lock */
taskq_t *spl_kmem_cache_taskq;		/* Task queue for ageing / reclaim */

static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);

SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker);
SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker,
    spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS);
static void *
kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
{
	void *ptr;

	if (skc->skc_flags & KMC_KMEM)
		ptr = (void *)__get_free_pages(flags, get_order(size));
	else
		ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);

	/* Resulting allocated memory will be page aligned */
	ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));

	return ptr;
}
static void
kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
{
	ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));

	/*
	 * The Linux direct reclaim path uses this out of band value to
	 * determine if forward progress is being made.  Normally this is
	 * incremented by kmem_freepages() which is part of the various
	 * Linux slab implementations.  However, since we are using none
	 * of that infrastructure we are responsible for incrementing it.
	 */
	if (current->reclaim_state)
		current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;

	if (skc->skc_flags & KMC_KMEM)
		free_pages((unsigned long)ptr, get_order(size));
	else
		vfree(ptr);
}
/*
 * Required space for each aligned sks.
 */
static inline uint32_t
spl_sks_size(spl_kmem_cache_t *skc)
{
	return P2ROUNDUP_TYPED(sizeof(spl_kmem_slab_t),
	       skc->skc_obj_align, uint32_t);
}
/*
 * Required space for each aligned object.
 */
static inline uint32_t
spl_obj_size(spl_kmem_cache_t *skc)
{
	uint32_t align = skc->skc_obj_align;

	return P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
	       P2ROUNDUP_TYPED(sizeof(spl_kmem_obj_t), align, uint32_t);
}
/*
 * Lookup the spl_kmem_object_t for an object given that object.
 */
static inline spl_kmem_obj_t *
spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
{
	return obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
	       skc->skc_obj_align, uint32_t);
}
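
/*
 * Layout sketch (illustrative only, not part of the original source):
 * with the default 8 byte alignment and a 100 byte object, the object
 * occupies bytes [0, 100) of its buffer and its spl_kmem_obj_t header
 * begins at offset P2ROUNDUP(100, 8) == 104, which is exactly where
 * spl_sko_from_obj() points:
 *
 *	spl_kmem_obj_t *sko = spl_sko_from_obj(skc, obj);
 *	ASSERT(sko->sko_magic == SKO_MAGIC);
 */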
/*
 * Required space for each offslab object taking in to account alignment
 * restrictions and the power-of-two requirement of kv_alloc().
 */
static inline uint32_t
spl_offslab_size(spl_kmem_cache_t *skc)
{
	return 1UL << (highbit(spl_obj_size(skc)) + 1);
}
/*
 * It's important that we pack the spl_kmem_obj_t structure and the
 * actual objects in to one large address space to minimize the number
 * of calls to the allocator.  It is far better to do a few large
 * allocations and then subdivide it ourselves.  Now which allocator
 * we use requires balancing a few trade offs.
 *
 * For small objects we use kmem_alloc() because as long as you are
 * only requesting a small number of pages (ideally just one) it's cheap.
 * However, when you start requesting multiple pages with kmem_alloc()
 * it gets increasingly expensive since it requires contiguous pages.
 * For this reason we shift to vmem_alloc() for slabs of large objects
 * which removes the need for contiguous pages.  We do not use
 * vmem_alloc() in all cases because there is significant locking
 * overhead in __get_vm_area_node().  This function takes a single
 * global lock when acquiring an available virtual address range which
 * serializes all vmem_alloc()'s for all slab caches.  Using slightly
 * different allocation functions for small and large objects should
 * give us the best of both worlds.
 *
 *   KMC_ONSLAB                       KMC_OFFSLAB
 *
 *   +------------------------+       +-----------------+
 *   | spl_kmem_slab_t --+-+  |       | spl_kmem_slab_t |---+-+
 *   | skc_obj_size    <-+ |  |       +-----------------+   | |
 *   | spl_kmem_obj_t      |  |                             | |
 *   | skc_obj_size    <---+  |       +-----------------+   | |
 *   | spl_kmem_obj_t      |  |       | skc_obj_size    | <-+ |
 *   | ...                 v  |       | spl_kmem_obj_t  |     |
 *   +------------------------+       +-----------------+     v
 */
static spl_kmem_slab_t *
spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
{
	spl_kmem_slab_t *sks;
	spl_kmem_obj_t *sko, *n;
	void *base, *obj;
	uint32_t obj_size, offslab_size = 0;
	int i, rc = 0;

	base = kv_alloc(skc, skc->skc_slab_size, flags);
	if (base == NULL)
		SRETURN(NULL);

	sks = (spl_kmem_slab_t *)base;
	sks->sks_magic = SKS_MAGIC;
	sks->sks_objs = skc->skc_slab_objs;
	sks->sks_age = jiffies;
	sks->sks_cache = skc;
	INIT_LIST_HEAD(&sks->sks_list);
	INIT_LIST_HEAD(&sks->sks_free_list);
	sks->sks_ref = 0;

	obj_size = spl_obj_size(skc);

	if (skc->skc_flags & KMC_OFFSLAB)
		offslab_size = spl_offslab_size(skc);

	for (i = 0; i < sks->sks_objs; i++) {
		if (skc->skc_flags & KMC_OFFSLAB) {
			obj = kv_alloc(skc, offslab_size, flags);
			if (!obj)
				SGOTO(out, rc = -ENOMEM);
		} else {
			obj = base + spl_sks_size(skc) + (i * obj_size);
		}

		ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
		sko = spl_sko_from_obj(skc, obj);
		sko->sko_addr = obj;
		sko->sko_magic = SKO_MAGIC;
		sko->sko_slab = sks;
		INIT_LIST_HEAD(&sko->sko_list);
		list_add_tail(&sko->sko_list, &sks->sks_free_list);
	}

	list_for_each_entry(sko, &sks->sks_free_list, sko_list)
		if (skc->skc_ctor)
			skc->skc_ctor(sko->sko_addr, skc->skc_private, flags);
out:
	if (rc) {
		if (skc->skc_flags & KMC_OFFSLAB)
			list_for_each_entry_safe(sko, n, &sks->sks_free_list,
			    sko_list)
				kv_free(skc, sko->sko_addr, offslab_size);

		kv_free(skc, base, skc->skc_slab_size);
		sks = NULL;
	}

	SRETURN(sks);
}
/*
 * Remove a slab from complete or partial list, it must be called with
 * the 'skc->skc_lock' held but the actual free must be performed
 * outside the lock to prevent deadlocking on vmem addresses.
 */
static void
spl_slab_free(spl_kmem_slab_t *sks,
    struct list_head *sks_list, struct list_head *sko_list)
{
	spl_kmem_cache_t *skc;

	ASSERT(sks->sks_magic == SKS_MAGIC);
	ASSERT(sks->sks_ref == 0);

	skc = sks->sks_cache;
	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(spin_is_locked(&skc->skc_lock));

	/*
	 * Update slab/objects counters in the cache, then remove the
	 * slab from the skc->skc_partial_list.  Finally add the slab
	 * and all its objects in to the private work lists where the
	 * destructors will be called and the memory freed to the system.
	 */
	skc->skc_obj_total -= sks->sks_objs;
	skc->skc_slab_total--;
	list_del(&sks->sks_list);
	list_add(&sks->sks_list, sks_list);
	list_splice_init(&sks->sks_free_list, sko_list);
}
/*
 * Traverses all the partial slabs attached to a cache and frees those
 * which are currently empty, and have not been touched for
 * skc_delay seconds to avoid thrashing.  The count argument is
 * passed to optionally cap the number of slabs reclaimed, a count
 * of zero means try and reclaim everything.  When flag is set we
 * always free an available slab regardless of age.
 */
static void
spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
{
	spl_kmem_slab_t *sks, *m;
	spl_kmem_obj_t *sko, *n;
	LIST_HEAD(sks_list);
	LIST_HEAD(sko_list);
	uint32_t size = 0;
	int i = 0;

	/*
	 * Move empty slabs and objects which have not been touched in
	 * skc_delay seconds on to private lists to be freed outside
	 * the spin lock.  This delay time is important to avoid thrashing,
	 * however when flag is set the delay will not be used.
	 */
	spin_lock(&skc->skc_lock);
	list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list, sks_list) {
		/*
		 * All empty slabs are at the end of skc->skc_partial_list,
		 * therefore once a non-empty slab is found we can stop
		 * scanning.  Additionally, stop when reaching the target
		 * reclaim 'count' if a non-zero threshold is given.
		 */
		if ((sks->sks_ref > 0) || (count && i >= count))
			break;

		if (time_after(jiffies, sks->sks_age + skc->skc_delay * HZ) || flag) {
			spl_slab_free(sks, &sks_list, &sko_list);
			i++;
		}
	}
	spin_unlock(&skc->skc_lock);

	/*
	 * The following two loops ensure all the object destructors are
	 * run, any offslab objects are freed, and the slabs themselves
	 * are freed.  This is all done outside the skc->skc_lock since
	 * this allows the destructor to sleep, and allows us to perform
	 * a conditional reschedule when freeing a large number of
	 * objects and slabs back to the system.
	 */
	if (skc->skc_flags & KMC_OFFSLAB)
		size = spl_offslab_size(skc);

	list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
		ASSERT(sko->sko_magic == SKO_MAGIC);

		if (skc->skc_dtor)
			skc->skc_dtor(sko->sko_addr, skc->skc_private);

		if (skc->skc_flags & KMC_OFFSLAB)
			kv_free(skc, sko->sko_addr, size);
	}

	list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
		ASSERT(sks->sks_magic == SKS_MAGIC);
		kv_free(skc, sks, skc->skc_slab_size);
	}
}
static spl_kmem_emergency_t *
spl_emergency_search(struct rb_root *root, void *obj)
{
	struct rb_node *node = root->rb_node;
	spl_kmem_emergency_t *ske;
	unsigned long address = (unsigned long)obj;

	while (node) {
		ske = container_of(node, spl_kmem_emergency_t, ske_node);

		if (address < (unsigned long)ske->ske_obj)
			node = node->rb_left;
		else if (address > (unsigned long)ske->ske_obj)
			node = node->rb_right;
		else
			return ske;
	}

	return NULL;
}
static int
spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
{
	struct rb_node **new = &(root->rb_node), *parent = NULL;
	spl_kmem_emergency_t *ske_tmp;
	unsigned long address = (unsigned long)ske->ske_obj;

	while (*new) {
		ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);

		parent = *new;
		if (address < (unsigned long)ske_tmp->ske_obj)
			new = &((*new)->rb_left);
		else if (address > (unsigned long)ske_tmp->ske_obj)
			new = &((*new)->rb_right);
		else
			return 0;
	}

	rb_link_node(&ske->ske_node, parent, new);
	rb_insert_color(&ske->ske_node, root);

	return 1;
}
/*
 * Allocate a single emergency object and track it in a red black tree.
 */
static int
spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
{
	spl_kmem_emergency_t *ske;
	int empty;

	/* Last chance use a partial slab if one now exists */
	spin_lock(&skc->skc_lock);
	empty = list_empty(&skc->skc_partial_list);
	spin_unlock(&skc->skc_lock);
	if (!empty)
		SRETURN(-EEXIST);

	ske = kmalloc(sizeof(*ske), flags);
	if (ske == NULL)
		SRETURN(-ENOMEM);

	ske->ske_obj = kmalloc(skc->skc_obj_size, flags);
	if (ske->ske_obj == NULL) {
		kfree(ske);
		SRETURN(-ENOMEM);
	}

	spin_lock(&skc->skc_lock);
	empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
	if (likely(empty)) {
		skc->skc_obj_total++;
		skc->skc_obj_emergency++;
		if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
			skc->skc_obj_emergency_max = skc->skc_obj_emergency;
	}
	spin_unlock(&skc->skc_lock);

	if (unlikely(!empty)) {
		kfree(ske->ske_obj);
		kfree(ske);
		SRETURN(-EINVAL);
	}

	if (skc->skc_ctor)
		skc->skc_ctor(ske->ske_obj, skc->skc_private, flags);

	*obj = ske->ske_obj;

	SRETURN(0);
}
/*
 * Locate the passed object in the red black tree and free it.
 */
static int
spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
{
	spl_kmem_emergency_t *ske;

	spin_lock(&skc->skc_lock);
	ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
	if (likely(ske)) {
		rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
		skc->skc_obj_emergency--;
		skc->skc_obj_total--;
	}
	spin_unlock(&skc->skc_lock);

	if (unlikely(ske == NULL))
		SRETURN(-ENOENT);

	if (skc->skc_dtor)
		skc->skc_dtor(ske->ske_obj, skc->skc_private);

	kfree(ske->ske_obj);
	kfree(ske);

	SRETURN(0);
}
/*
 * Release objects from the per-cpu magazine back to their slab.  The flush
 * argument contains the max number of entries to remove from the magazine.
 */
static void
__spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
{
	int i, count = MIN(flush, skm->skm_avail);

	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(skm->skm_magic == SKM_MAGIC);
	ASSERT(spin_is_locked(&skc->skc_lock));

	for (i = 0; i < count; i++)
		spl_cache_shrink(skc, skm->skm_objs[i]);

	skm->skm_avail -= count;
	memmove(skm->skm_objs, &(skm->skm_objs[count]),
	    sizeof(void *) * skm->skm_avail);
}
static void
spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
{
	spin_lock(&skc->skc_lock);
	__spl_cache_flush(skc, skm, flush);
	spin_unlock(&skc->skc_lock);
}
static void
spl_magazine_age(void *data)
{
	spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
	spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];

	ASSERT(skm->skm_magic == SKM_MAGIC);
	ASSERT(skm->skm_cpu == smp_processor_id());
	ASSERT(irqs_disabled());

	/* There are no available objects or they are too young to age out */
	if ((skm->skm_avail == 0) ||
	    time_before(jiffies, skm->skm_age + skc->skc_delay * HZ))
		return;

	/*
	 * Because we're executing in interrupt context we may have
	 * interrupted the holder of this lock.  To avoid a potential
	 * deadlock return if the lock is contended.
	 */
	if (!spin_trylock(&skc->skc_lock))
		return;

	__spl_cache_flush(skc, skm, skm->skm_refill);
	spin_unlock(&skc->skc_lock);
}
/*
 * Called regularly to keep a downward pressure on the cache.
 *
 * Objects older than skc->skc_delay seconds in the per-cpu magazines will
 * be returned to the slabs.  This is done to prevent idle magazines from
 * holding memory which could be better used elsewhere.  The delay is
 * present to prevent thrashing the magazine.
 *
 * The newly released objects may result in empty partial slabs.  Those
 * slabs should be released to the system.  Otherwise moving the objects
 * out of the magazines is just wasted work.
 */
static void
spl_cache_age(void *data)
{
	spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
	taskqid_t id = 0;

	ASSERT(skc->skc_magic == SKC_MAGIC);

	/* Dynamically disabled at run time */
	if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE))
		return;

	atomic_inc(&skc->skc_ref);
	spl_on_each_cpu(spl_magazine_age, skc, 1);
	spl_slab_reclaim(skc, skc->skc_reap, 0);

	while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) {
		id = taskq_dispatch_delay(
		    spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP,
		    ddi_get_lbolt() + skc->skc_delay / 3 * HZ);

		/* Destroy issued after dispatch immediately cancel it */
		if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id)
			taskq_cancel_id(spl_kmem_cache_taskq, id);
	}

	spin_lock(&skc->skc_lock);
	skc->skc_taskqid = id;
	spin_unlock(&skc->skc_lock);

	atomic_dec(&skc->skc_ref);
}
/*
 * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
 * When on-slab we want to target SPL_KMEM_CACHE_OBJ_PER_SLAB.  However,
 * for very small objects we may end up with more than this so as not
 * to waste space in the minimal allocation of a single page.  Also for
 * very large objects we may use as few as SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN,
 * lower than this and we will fail.
 */
static int
spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
{
	uint32_t sks_size, obj_size, max_size;

	if (skc->skc_flags & KMC_OFFSLAB) {
		*objs = SPL_KMEM_CACHE_OBJ_PER_SLAB;
		*size = P2ROUNDUP(sizeof(spl_kmem_slab_t), PAGE_SIZE);
		SRETURN(0);
	} else {
		sks_size = spl_sks_size(skc);
		obj_size = spl_obj_size(skc);

		if (skc->skc_flags & KMC_KMEM)
			max_size = ((uint32_t)1 << (MAX_ORDER-3)) * PAGE_SIZE;
		else
			max_size = (32 * 1024 * 1024);

		/* Power of two sized slab */
		for (*size = PAGE_SIZE; *size <= max_size; *size *= 2) {
			*objs = (*size - sks_size) / obj_size;
			if (*objs >= SPL_KMEM_CACHE_OBJ_PER_SLAB)
				SRETURN(0);
		}

		/*
		 * Unable to satisfy target objects per slab, fall back to
		 * allocating a maximally sized slab and assuming it can
		 * contain the minimum objects count use it.  If not fail.
		 */
		*size = max_size;
		*objs = (*size - sks_size) / obj_size;
		if (*objs >= SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN)
			SRETURN(0);
	}

	SRETURN(-ENOSPC);
}
/*
 * Make a guess at reasonable per-cpu magazine size based on the size of
 * each object and the cost of caching N of them in each magazine.  Long
 * term this should really adapt based on an observed usage heuristic.
 */
static int
spl_magazine_size(spl_kmem_cache_t *skc)
{
	uint32_t obj_size = spl_obj_size(skc);
	int size;

	/* Per-magazine sizes below assume a 4Kib page size */
	if (obj_size > (PAGE_SIZE * 256))
		size = 4;	/* Minimum 4Mib per-magazine */
	else if (obj_size > (PAGE_SIZE * 32))
		size = 16;	/* Minimum 2Mib per-magazine */
	else if (obj_size > (PAGE_SIZE))
		size = 64;	/* Minimum 256Kib per-magazine */
	else if (obj_size > (PAGE_SIZE / 4))
		size = 128;	/* Minimum 128Kib per-magazine */
	else
		size = 256;

	return size;
}
/*
 * Allocate a per-cpu magazine to associate with a specific core.
 */
static spl_kmem_magazine_t *
spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
{
	spl_kmem_magazine_t *skm;
	int size = sizeof(spl_kmem_magazine_t) +
	           sizeof(void *) * skc->skc_mag_size;

	skm = kmem_alloc_node(size, KM_SLEEP, cpu_to_node(cpu));
	if (skm) {
		skm->skm_magic = SKM_MAGIC;
		skm->skm_avail = 0;
		skm->skm_size = skc->skc_mag_size;
		skm->skm_refill = skc->skc_mag_refill;
		skm->skm_cache = skc;
		skm->skm_age = jiffies;
		skm->skm_cpu = cpu;
	}

	return skm;
}
/*
 * Free a per-cpu magazine associated with a specific core.
 */
static void
spl_magazine_free(spl_kmem_magazine_t *skm)
{
	int size = sizeof(spl_kmem_magazine_t) +
	           sizeof(void *) * skm->skm_size;

	ASSERT(skm->skm_magic == SKM_MAGIC);
	ASSERT(skm->skm_avail == 0);

	kmem_free(skm, size);
}
/*
 * Create all per-cpu magazines of reasonable sizes.
 */
static int
spl_magazine_create(spl_kmem_cache_t *skc)
{
	int i;

	skc->skc_mag_size = spl_magazine_size(skc);
	skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;

	for_each_online_cpu(i) {
		skc->skc_mag[i] = spl_magazine_alloc(skc, i);
		if (!skc->skc_mag[i]) {
			for (i--; i >= 0; i--)
				spl_magazine_free(skc->skc_mag[i]);

			return -ENOMEM;
		}
	}

	return 0;
}
/*
 * Destroy all per-cpu magazines.
 */
static void
spl_magazine_destroy(spl_kmem_cache_t *skc)
{
	spl_kmem_magazine_t *skm;
	int i;

	for_each_online_cpu(i) {
		skm = skc->skc_mag[i];
		spl_cache_flush(skc, skm, skm->skm_avail);
		spl_magazine_free(skm);
	}
}
/*
 * Create an object cache based on the following arguments:
 * name		cache name
 * size		cache object size
 * align	cache object alignment
 * ctor		cache object constructor
 * dtor		cache object destructor
 * reclaim	cache object reclaim
 * priv		cache private data for ctor/dtor/reclaim
 * vmp		unused, must be NULL
 * flags
 *	KMC_NOTOUCH	Disable cache object aging (unsupported)
 *	KMC_NODEBUG	Disable debugging (unsupported)
 *	KMC_NOMAGAZINE	Disable magazine (unsupported)
 *	KMC_NOHASH	Disable hashing (unsupported)
 *	KMC_QCACHE	Disable qcache (unsupported)
 *	KMC_KMEM	Force kmem backed cache
 *	KMC_VMEM	Force vmem backed cache
 *	KMC_OFFSLAB	Locate objects off the slab
 */
spl_kmem_cache_t *
spl_kmem_cache_create(char *name, size_t size, size_t align,
    spl_kmem_ctor_t ctor,
    spl_kmem_dtor_t dtor,
    spl_kmem_reclaim_t reclaim,
    void *priv, void *vmp, int flags)
{
	spl_kmem_cache_t *skc;
	int rc;

	ASSERTF(!(flags & KMC_NOMAGAZINE), "Bad KMC_NOMAGAZINE (%x)\n", flags);
	ASSERTF(!(flags & KMC_NOHASH), "Bad KMC_NOHASH (%x)\n", flags);
	ASSERTF(!(flags & KMC_QCACHE), "Bad KMC_QCACHE (%x)\n", flags);
	ASSERT(vmp == NULL);

	/*
	 * Allocate memory for a new cache and initialize it.  Unfortunately,
	 * this usually ends up being a large allocation of ~32k because
	 * we need to allocate enough memory for the worst case number of
	 * cpus in the magazine, skc_mag[NR_CPUS].  Because of this we
	 * explicitly pass KM_NODEBUG to suppress the kmem warning.
	 */
	skc = kmem_zalloc(sizeof(*skc), KM_SLEEP | KM_NODEBUG);
	if (skc == NULL)
		SRETURN(NULL);

	skc->skc_magic = SKC_MAGIC;
	skc->skc_name_size = strlen(name) + 1;
	skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, KM_SLEEP);
	if (skc->skc_name == NULL) {
		kmem_free(skc, sizeof(*skc));
		SRETURN(NULL);
	}
	strncpy(skc->skc_name, name, skc->skc_name_size);

	skc->skc_ctor = ctor;
	skc->skc_dtor = dtor;
	skc->skc_reclaim = reclaim;
	skc->skc_private = priv;
	skc->skc_flags = flags;
	skc->skc_obj_size = size;
	skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
	skc->skc_delay = SPL_KMEM_CACHE_DELAY;
	skc->skc_reap = SPL_KMEM_CACHE_REAP;
	atomic_set(&skc->skc_ref, 0);

	INIT_LIST_HEAD(&skc->skc_list);
	INIT_LIST_HEAD(&skc->skc_complete_list);
	INIT_LIST_HEAD(&skc->skc_partial_list);
	skc->skc_emergency_tree = RB_ROOT;
	spin_lock_init(&skc->skc_lock);
	init_waitqueue_head(&skc->skc_waitq);
	skc->skc_slab_fail = 0;
	skc->skc_slab_create = 0;
	skc->skc_slab_destroy = 0;
	skc->skc_slab_total = 0;
	skc->skc_slab_alloc = 0;
	skc->skc_slab_max = 0;
	skc->skc_obj_total = 0;
	skc->skc_obj_alloc = 0;
	skc->skc_obj_max = 0;
	skc->skc_obj_deadlock = 0;
	skc->skc_obj_emergency = 0;
	skc->skc_obj_emergency_max = 0;

	if (align) {
		VERIFY(ISP2(align));
		VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN); /* Min alignment */
		VERIFY3U(align, <=, PAGE_SIZE);            /* Max alignment */
		skc->skc_obj_align = align;
	}

	/* If none passed select a cache type based on object size */
	if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM))) {
		if (spl_obj_size(skc) < (PAGE_SIZE / 8))
			skc->skc_flags |= KMC_KMEM;
		else
			skc->skc_flags |= KMC_VMEM;
	}

	rc = spl_slab_size(skc, &skc->skc_slab_objs, &skc->skc_slab_size);
	if (rc)
		SGOTO(out, rc);

	rc = spl_magazine_create(skc);
	if (rc)
		SGOTO(out, rc);

	if (spl_kmem_cache_expire & KMC_EXPIRE_AGE)
		skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq,
		    spl_cache_age, skc, TQ_SLEEP,
		    ddi_get_lbolt() + skc->skc_delay / 3 * HZ);

	down_write(&spl_kmem_cache_sem);
	list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
	up_write(&spl_kmem_cache_sem);

	SRETURN(skc);
out:
	kmem_free(skc->skc_name, skc->skc_name_size);
	kmem_free(skc, sizeof(*skc));
	SRETURN(NULL);
}
EXPORT_SYMBOL(spl_kmem_cache_create);
/*
 * Register a move callback for cache defragmentation.
 * XXX: Unimplemented but harmless to stub out for now.
 */
void
spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
    kmem_cbrc_t (move)(void *, void *, size_t, void *))
{
	ASSERT(move != NULL);
}
EXPORT_SYMBOL(spl_kmem_cache_set_move);
/*
 * Destroy a cache and all objects associated with the cache.
 */
void
spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
{
	DECLARE_WAIT_QUEUE_HEAD(wq);
	taskqid_t id;

	ASSERT(skc->skc_magic == SKC_MAGIC);

	down_write(&spl_kmem_cache_sem);
	list_del_init(&skc->skc_list);
	up_write(&spl_kmem_cache_sem);

	/* Cancel and wait for any pending delayed tasks */
	VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));

	spin_lock(&skc->skc_lock);
	id = skc->skc_taskqid;
	spin_unlock(&skc->skc_lock);

	taskq_cancel_id(spl_kmem_cache_taskq, id);

	/*
	 * Wait until all current callers complete, this is mainly
	 * to catch the case where a low memory situation triggers a
	 * cache reaping action which races with this destroy.
	 */
	wait_event(wq, atomic_read(&skc->skc_ref) == 0);

	spl_magazine_destroy(skc);
	spl_slab_reclaim(skc, 0, 1);
	spin_lock(&skc->skc_lock);

	/*
	 * Validate there are no objects in use and free all the
	 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers.
	 */
	ASSERT3U(skc->skc_slab_alloc, ==, 0);
	ASSERT3U(skc->skc_obj_alloc, ==, 0);
	ASSERT3U(skc->skc_slab_total, ==, 0);
	ASSERT3U(skc->skc_obj_total, ==, 0);
	ASSERT3U(skc->skc_obj_emergency, ==, 0);
	ASSERT(list_empty(&skc->skc_complete_list));

	kmem_free(skc->skc_name, skc->skc_name_size);
	spin_unlock(&skc->skc_lock);

	kmem_free(skc, sizeof(*skc));
}
EXPORT_SYMBOL(spl_kmem_cache_destroy);
/*
 * Allocate an object from a slab attached to the cache.  This is used to
 * repopulate the per-cpu magazine caches in batches when they run low.
 */
static void *
spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
{
	spl_kmem_obj_t *sko;

	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(sks->sks_magic == SKS_MAGIC);
	ASSERT(spin_is_locked(&skc->skc_lock));

	sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
	ASSERT(sko->sko_magic == SKO_MAGIC);
	ASSERT(sko->sko_addr != NULL);

	/* Remove from sks_free_list */
	list_del_init(&sko->sko_list);

	sks->sks_age = jiffies;
	sks->sks_ref++;
	skc->skc_obj_alloc++;

	/* Track max obj usage statistics */
	if (skc->skc_obj_alloc > skc->skc_obj_max)
		skc->skc_obj_max = skc->skc_obj_alloc;

	/* Track max slab usage statistics */
	if (sks->sks_ref == 1) {
		skc->skc_slab_alloc++;

		if (skc->skc_slab_alloc > skc->skc_slab_max)
			skc->skc_slab_max = skc->skc_slab_alloc;
	}

	return sko->sko_addr;
}
/*
 * Generic slab allocation function run by the global work queues.
 * It is responsible for allocating a new slab, linking it in to the list
 * of partial slabs, and then waking any waiters.
 */
static void
spl_cache_grow_work(void *data)
{
	spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
	spl_kmem_cache_t *skc = ska->ska_cache;
	spl_kmem_slab_t *sks;

	sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG);
	spin_lock(&skc->skc_lock);
	if (sks) {
		skc->skc_slab_total++;
		skc->skc_obj_total += sks->sks_objs;
		list_add_tail(&sks->sks_list, &skc->skc_partial_list);
	}

	atomic_dec(&skc->skc_ref);
	clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
	clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
	wake_up_all(&skc->skc_waitq);
	spin_unlock(&skc->skc_lock);

	kfree(ska);
}
/*
 * Returns non-zero when a new slab should be available.
 */
static int
spl_cache_grow_wait(spl_kmem_cache_t *skc)
{
	return !test_bit(KMC_BIT_GROWING, &skc->skc_flags);
}

static int
spl_cache_reclaim_wait(void *word)
{
	schedule();
	return 0;
}
/*
 * No available objects on any slabs, create a new slab.
 */
static int
spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
{
	int remaining, rc;

	ASSERT(skc->skc_magic == SKC_MAGIC);

	/*
	 * Before allocating a new slab wait for any reaping to complete and
	 * then return so the local magazine can be rechecked for new objects.
	 */
	if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
		rc = wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
		    spl_cache_reclaim_wait, TASK_UNINTERRUPTIBLE);
		SRETURN(rc ? rc : -EAGAIN);
	}

	/*
	 * This is handled by dispatching a work request to the global work
	 * queue.  This allows us to asynchronously allocate a new slab while
	 * retaining the ability to safely fall back to a smaller synchronous
	 * allocation to ensure forward progress is always maintained.
	 */
	if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
		spl_kmem_alloc_t *ska;

		ska = kmalloc(sizeof(*ska), flags);
		if (ska == NULL) {
			clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
			wake_up_all(&skc->skc_waitq);
			SRETURN(-ENOMEM);
		}

		atomic_inc(&skc->skc_ref);
		ska->ska_cache = skc;
		ska->ska_flags = flags & ~__GFP_FS;
		taskq_init_ent(&ska->ska_tqe);
		taskq_dispatch_ent(spl_kmem_cache_taskq,
		    spl_cache_grow_work, ska, 0, &ska->ska_tqe);
	}

	/*
	 * The goal here is to only detect the rare case where a virtual slab
	 * allocation has deadlocked.  We must be careful to minimize the use
	 * of emergency objects which are more expensive to track.  Therefore,
	 * we set a very long timeout for the asynchronous allocation and if
	 * the timeout is reached the cache is flagged as deadlocked.  From
	 * this point only new emergency objects will be allocated until the
	 * asynchronous allocation completes and clears the deadlocked flag.
	 */
	if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
		rc = spl_emergency_alloc(skc, flags, obj);
	} else {
		remaining = wait_event_timeout(skc->skc_waitq,
		    spl_cache_grow_wait(skc), HZ);

		if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) {
			spin_lock(&skc->skc_lock);
			if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
				set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
				skc->skc_obj_deadlock++;
			}
			spin_unlock(&skc->skc_lock);
		}

		rc = -ENOMEM;
	}

	SRETURN(rc);
}
/*
 * Refill a per-cpu magazine with objects from the slabs for this cache.
 * Ideally the magazine can be repopulated using existing objects which have
 * been released, however if we are unable to locate enough free objects new
 * slabs of objects will be created.  On success NULL is returned, otherwise
 * the address of a single emergency object is returned for use by the caller.
 */
static void *
spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
{
	spl_kmem_slab_t *sks;
	int count = 0, rc, refill;
	void *obj = NULL;

	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(skm->skm_magic == SKM_MAGIC);

	refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
	spin_lock(&skc->skc_lock);

	while (refill > 0) {
		/* No slabs available we may need to grow the cache */
		if (list_empty(&skc->skc_partial_list)) {
			spin_unlock(&skc->skc_lock);

			local_irq_enable();
			rc = spl_cache_grow(skc, flags, &obj);
			local_irq_disable();

			/* Emergency object for immediate use by caller */
			if (rc == 0 && obj != NULL)
				SRETURN(obj);

			if (rc)
				SGOTO(out, rc);

			/* Rescheduled to different CPU skm is not local */
			if (skm != skc->skc_mag[smp_processor_id()])
				SGOTO(out, rc);

			/*
			 * Potentially rescheduled to the same CPU but
			 * allocations may have occurred from this CPU while
			 * we were sleeping so recalculate max refill.
			 */
			refill = MIN(refill, skm->skm_size - skm->skm_avail);

			spin_lock(&skc->skc_lock);
			continue;
		}

		/* Grab the next available slab */
		sks = list_entry((&skc->skc_partial_list)->next,
		    spl_kmem_slab_t, sks_list);
		ASSERT(sks->sks_magic == SKS_MAGIC);
		ASSERT(sks->sks_ref < sks->sks_objs);
		ASSERT(!list_empty(&sks->sks_free_list));

		/*
		 * Consume as many objects as needed to refill the requested
		 * cache.  We must also be careful not to overfill it.
		 */
		while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) {
			ASSERT(skm->skm_avail < skm->skm_size);
			ASSERT(count < skm->skm_size);
			skm->skm_objs[skm->skm_avail++] = spl_cache_obj(skc, sks);
		}

		/* Move slab to skc_complete_list when full */
		if (sks->sks_ref == sks->sks_objs) {
			list_del(&sks->sks_list);
			list_add(&sks->sks_list, &skc->skc_complete_list);
		}
	}

	spin_unlock(&skc->skc_lock);
out:
	SRETURN(NULL);
}
/*
 * Release an object back to the slab from which it came.
 */
static void
spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
{
	spl_kmem_slab_t *sks = NULL;
	spl_kmem_obj_t *sko = NULL;

	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(spin_is_locked(&skc->skc_lock));

	sko = spl_sko_from_obj(skc, obj);
	ASSERT(sko->sko_magic == SKO_MAGIC);
	sks = sko->sko_slab;
	ASSERT(sks->sks_magic == SKS_MAGIC);
	ASSERT(sks->sks_cache == skc);
	list_add(&sko->sko_list, &sks->sks_free_list);

	sks->sks_age = jiffies;
	sks->sks_ref--;
	skc->skc_obj_alloc--;

	/*
	 * Move slab to skc_partial_list when no longer full.  Slabs
	 * are added to the head to keep the partial list in quasi-full
	 * sorted order.  Fuller at the head, emptier at the tail.
	 */
	if (sks->sks_ref == (sks->sks_objs - 1)) {
		list_del(&sks->sks_list);
		list_add(&sks->sks_list, &skc->skc_partial_list);
	}

	/*
	 * Move empty slabs to the end of the partial list so
	 * they can be easily found and freed during reclamation.
	 */
	if (sks->sks_ref == 0) {
		list_del(&sks->sks_list);
		list_add_tail(&sks->sks_list, &skc->skc_partial_list);
		skc->skc_slab_alloc--;
	}
}
/*
 * Allocate an object from the per-cpu magazine, or if the magazine
 * is empty directly allocate from a slab and repopulate the magazine.
 */
void *
spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
{
	spl_kmem_magazine_t *skm;
	void *obj = NULL;

	ASSERT(skc->skc_magic == SKC_MAGIC);
	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
	ASSERT(flags & KM_SLEEP);
	atomic_inc(&skc->skc_ref);
	local_irq_disable();

restart:
	/*
	 * Safe to update per-cpu structure without lock, but
	 * in the restart case we must be careful to reacquire
	 * the local magazine since this may have changed
	 * when we needed to grow the cache.
	 */
	skm = skc->skc_mag[smp_processor_id()];
	ASSERTF(skm->skm_magic == SKM_MAGIC, "%x != %x: %s/%p/%p %x/%x/%x\n",
	    skm->skm_magic, SKM_MAGIC, skc->skc_name, skc, skm,
	    skm->skm_size, skm->skm_refill, skm->skm_avail);

	if (likely(skm->skm_avail)) {
		/* Object available in CPU cache, use it */
		obj = skm->skm_objs[--skm->skm_avail];
		skm->skm_age = jiffies;
	} else {
		obj = spl_cache_refill(skc, skm, flags);
		if (obj == NULL)
			SGOTO(restart, obj = NULL);
	}

	local_irq_enable();
	ASSERT(obj);
	ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));

	/* Pre-emptively migrate object to CPU L1 cache */
	prefetchw(obj);
	atomic_dec(&skc->skc_ref);

	SRETURN(obj);
}
EXPORT_SYMBOL(spl_kmem_cache_alloc);
/*
 * Free an object back to the local per-cpu magazine.  There is no
 * guarantee that this is the same magazine the object was originally
 * allocated from.  We may need to flush entire magazines back to the
 * slabs to make space.
 */
void
spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
{
        spl_kmem_magazine_t *skm;
        unsigned long flags;
        SENTRY;

        ASSERT(skc->skc_magic == SKC_MAGIC);
        ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
        atomic_inc(&skc->skc_ref);

        /*
         * Only virtual slabs may have emergency objects and these objects
         * are guaranteed to have physical addresses.  They must be removed
         * from the tree of emergency objects and then freed.
         */
        if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj))
                SGOTO(out, spl_emergency_free(skc, obj));

        local_irq_save(flags);

        /*
         * Safe to update the per-cpu structure without a lock, but since
         * no remote memory allocation tracking is performed it is entirely
         * possible to allocate an object from one CPU cache and return it
         * to another.
         */
        skm = skc->skc_mag[smp_processor_id()];
        ASSERT(skm->skm_magic == SKM_MAGIC);

        /* Per-CPU cache full, flush it to make space */
        if (unlikely(skm->skm_avail >= skm->skm_size))
                spl_cache_flush(skc, skm, skm->skm_refill);

        /* Available space in cache, use it */
        skm->skm_objs[skm->skm_avail++] = obj;

        local_irq_restore(flags);
out:
        atomic_dec(&skc->skc_ref);

        SEXIT;
}
EXPORT_SYMBOL(spl_kmem_cache_free);
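/*
 * Worked example (not from the original source) of the flush policy above,
 * assuming a magazine with skm_size = 32 and skm_refill = 16: a free into
 * a full magazine first flushes 16 objects back to their slabs via
 * spl_cache_flush(), then stores the freed object, leaving 17 of the 32
 * slots in use.  The magazine sizes are illustrative only.
 */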
/*
 * The generic shrinker function for all caches.  Under Linux a shrinker
 * is not necessarily tightly coupled with a slab cache.  In fact Linux
 * always systematically tries calling all registered shrinker callbacks
 * which report that they contain unused objects.  Because of this we only
 * register one shrinker function in the shim layer for all slab caches.
 * We always attempt to shrink all caches when this generic shrinker is
 * called.  The shrinker should return the number of free objects in the
 * cache when called with nr_to_scan == 0 but not attempt to free any
 * objects.  When nr_to_scan > 0 it is a request that nr_to_scan objects
 * should be freed, which differs from Solaris semantics.  Solaris
 * semantics are to free all available objects, which may (and probably
 * will) be more objects than the requested nr_to_scan.
 */
static int
__spl_kmem_cache_generic_shrinker(struct shrinker *shrink,
    struct shrink_control *sc)
{
        spl_kmem_cache_t *skc;
        int unused = 0;

        down_read(&spl_kmem_cache_sem);
        list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
                if (sc->nr_to_scan)
                        spl_kmem_cache_reap_now(skc,
                            MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1));

                /*
                 * Presume everything allocated is reclaimable; this ensures
                 * we are called again with nr_to_scan > 0 so we can try to
                 * reclaim.  The exact number is not important, so we forgo
                 * taking this already highly contended lock.
                 */
                unused += skc->skc_obj_alloc;
        }
        up_read(&spl_kmem_cache_sem);

        /*
         * After performing reclaim always return -1 to indicate we cannot
         * perform additional reclaim.  This prevents shrink_slab() from
         * repeatedly invoking this generic shrinker and potentially spinning.
         */
        if (sc->nr_to_scan)
                return -1;

        return unused;
}

SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker);
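/*
 * Illustrative sketch (not part of the original source) of the two-pass
 * shrinker protocol described above: a query pass with nr_to_scan == 0
 * returns the object count without freeing anything, while a scan pass
 * with nr_to_scan > 0 reaps roughly that many objects and returns -1.
 * The shrink_control values are hypothetical.
 */
#if 0
static void
example_shrinker_protocol(void)
{
        struct shrink_control sc;
        int count;

        sc.gfp_mask = GFP_KERNEL;

        sc.nr_to_scan = 0;                /* count pass: no freeing */
        count = __spl_kmem_cache_generic_shrinker(NULL, &sc);

        sc.nr_to_scan = count;            /* scan pass: request reclaim */
        (void) __spl_kmem_cache_generic_shrinker(NULL, &sc);
}
#endif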
/*
 * Call the registered reclaim function for a cache.  Depending on how
 * many and which objects are released it may simply repopulate the
 * local magazine, which will then need to age out.  Objects which cannot
 * fit in the magazine will be released back to their slabs, which will
 * also need to age out before being released.  This is all just best
 * effort and we do not want to thrash creating and destroying slabs.
 */
void
spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
{
        SENTRY;

        ASSERT(skc->skc_magic == SKC_MAGIC);
        ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));

        /* Prevent concurrent cache reaping when contended */
        if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
                SEXIT;
                return;
        }

        atomic_inc(&skc->skc_ref);

        /*
         * When a reclaim function is available it may be invoked repeatedly
         * until at least a single slab can be freed.  This ensures that we
         * do free memory back to the system.  This helps minimize the chance
         * of an OOM event when the bulk of memory is used by the slab.
         *
         * When free slabs are already available the reclaim callback will be
         * skipped.  Additionally, if no forward progress is detected despite
         * a reclaim function the cache will be skipped to avoid deadlock.
         *
         * Longer term this would be the correct place to add the code which
         * repacks the slabs in order to minimize fragmentation.
         */
        if (skc->skc_reclaim) {
                uint64_t objects = UINT64_MAX;
                int do_reclaim;

                do {
                        spin_lock(&skc->skc_lock);
                        do_reclaim =
                            (skc->skc_slab_total > 0) &&
                            ((skc->skc_slab_total - skc->skc_slab_alloc) == 0) &&
                            (skc->skc_obj_alloc < objects);

                        objects = skc->skc_obj_alloc;
                        spin_unlock(&skc->skc_lock);

                        if (do_reclaim)
                                skc->skc_reclaim(skc->skc_private);

                } while (do_reclaim);
        }

        /* Reclaim from the magazine then the slabs ignoring age and delay. */
        if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) {
                spl_kmem_magazine_t *skm;
                unsigned long irq_flags;

                local_irq_save(irq_flags);
                skm = skc->skc_mag[smp_processor_id()];
                spl_cache_flush(skc, skm, skm->skm_avail);
                local_irq_restore(irq_flags);
        }

        spl_slab_reclaim(skc, count, 1);
        clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
        smp_mb__after_clear_bit();
        wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);

        atomic_dec(&skc->skc_ref);

        SEXIT;
}
EXPORT_SYMBOL(spl_kmem_cache_reap_now);
/*
 * Reap all free slabs from all registered caches.
 */
void
spl_kmem_reap(void)
{
        struct shrink_control sc;

        sc.nr_to_scan = KMC_REAP_CHUNK;
        sc.gfp_mask = GFP_KERNEL;

        __spl_kmem_cache_generic_shrinker(NULL, &sc);
}
EXPORT_SYMBOL(spl_kmem_reap);
#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
static char *
spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
{
        int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
        int i, flag = 1;

        ASSERT(str != NULL && len >= 17);
        memset(str, 0, len);

        /*
         * Check for a fully printable string, and while we are at it
         * place the printable characters in the passed buffer.
         */
        for (i = 0; i < size; i++) {
                str[i] = ((char *)(kd->kd_addr))[i];
                if (isprint(str[i])) {
                        continue;
                } else {
                        /*
                         * Minimum number of printable characters found
                         * to make it worthwhile to print this as ascii.
                         */
                        if (i > min)
                                break;

                        flag = 0;
                        break;
                }
        }

        if (!flag) {
                sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
                    *((uint8_t *)kd->kd_addr),
                    *((uint8_t *)kd->kd_addr + 2),
                    *((uint8_t *)kd->kd_addr + 4),
                    *((uint8_t *)kd->kd_addr + 6),
                    *((uint8_t *)kd->kd_addr + 8),
                    *((uint8_t *)kd->kd_addr + 10),
                    *((uint8_t *)kd->kd_addr + 12),
                    *((uint8_t *)kd->kd_addr + 14));
        }

        return str;
}

static int
spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
{
        int i;
        SENTRY;

        spin_lock_init(lock);
        INIT_LIST_HEAD(list);

        for (i = 0; i < size; i++)
                INIT_HLIST_HEAD(&kmem_table[i]);

        SRETURN(0);
}

static void
spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
{
        unsigned long flags;
        kmem_debug_t *kd;
        char str[17];
        SENTRY;

        spin_lock_irqsave(lock, flags);
        if (!list_empty(list))
                printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
                    "size", "data", "func", "line");

        list_for_each_entry(kd, list, kd_list)
                printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
                    (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
                    kd->kd_func, kd->kd_line);

        spin_unlock_irqrestore(lock, flags);
        SEXIT;
}
#else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
#define spl_kmem_init_tracking(list, lock, size)
#define spl_kmem_fini_tracking(list, lock)
#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
static void
spl_kmem_init_globals(void)
{
        struct zone *zone;

        /*
         * For now all zones are included; it may be wise to restrict
         * this to the normal and highmem zones if we see problems.
         */
        for_each_zone(zone) {

                if (!populated_zone(zone))
                        continue;

                minfree += min_wmark_pages(zone);
                desfree += low_wmark_pages(zone);
                lotsfree += high_wmark_pages(zone);
        }

        /* Solaris default values */
        swapfs_minfree = MAX(2*1024*1024 >> PAGE_SHIFT, physmem >> 3);
        swapfs_reserve = MIN(4*1024*1024 >> PAGE_SHIFT, physmem >> 4);
}
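/*
 * Worked example (not from the original source) of the defaults above,
 * assuming 4 KiB pages (PAGE_SHIFT == 12) and 1 GiB of RAM
 * (physmem == 262144 pages): 2*1024*1024 >> 12 = 512 pages and
 * physmem >> 3 = 32768 pages, so swapfs_minfree = MAX(512, 32768) =
 * 32768 pages (128 MiB); similarly swapfs_reserve = MIN(1024, 16384) =
 * 1024 pages (4 MiB).
 */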
/*
 * Called at module init when it is safe to use spl_kallsyms_lookup_name()
 */
int
spl_kmem_init_kallsyms_lookup(void)
{
#ifndef HAVE_GET_VMALLOC_INFO
        get_vmalloc_info_fn = (get_vmalloc_info_t)
                spl_kallsyms_lookup_name("get_vmalloc_info");
        if (!get_vmalloc_info_fn) {
                printk(KERN_ERR "Error: Unknown symbol get_vmalloc_info\n");
                return -EFAULT;
        }
#endif /* HAVE_GET_VMALLOC_INFO */

#ifdef HAVE_PGDAT_HELPERS
# ifndef HAVE_FIRST_ONLINE_PGDAT
        first_online_pgdat_fn = (first_online_pgdat_t)
                spl_kallsyms_lookup_name("first_online_pgdat");
        if (!first_online_pgdat_fn) {
                printk(KERN_ERR "Error: Unknown symbol first_online_pgdat\n");
                return -EFAULT;
        }
# endif /* HAVE_FIRST_ONLINE_PGDAT */

# ifndef HAVE_NEXT_ONLINE_PGDAT
        next_online_pgdat_fn = (next_online_pgdat_t)
                spl_kallsyms_lookup_name("next_online_pgdat");
        if (!next_online_pgdat_fn) {
                printk(KERN_ERR "Error: Unknown symbol next_online_pgdat\n");
                return -EFAULT;
        }
# endif /* HAVE_NEXT_ONLINE_PGDAT */

# ifndef HAVE_NEXT_ZONE
        next_zone_fn = (next_zone_t)
                spl_kallsyms_lookup_name("next_zone");
        if (!next_zone_fn) {
                printk(KERN_ERR "Error: Unknown symbol next_zone\n");
                return -EFAULT;
        }
# endif /* HAVE_NEXT_ZONE */

#else /* HAVE_PGDAT_HELPERS */

# ifndef HAVE_PGDAT_LIST
        pgdat_list_addr = *(struct pglist_data **)
                spl_kallsyms_lookup_name("pgdat_list");
        if (!pgdat_list_addr) {
                printk(KERN_ERR "Error: Unknown symbol pgdat_list\n");
                return -EFAULT;
        }
# endif /* HAVE_PGDAT_LIST */
#endif /* HAVE_PGDAT_HELPERS */

#if defined(NEED_GET_ZONE_COUNTS) && !defined(HAVE_GET_ZONE_COUNTS)
        get_zone_counts_fn = (get_zone_counts_t)
                spl_kallsyms_lookup_name("get_zone_counts");
        if (!get_zone_counts_fn) {
                printk(KERN_ERR "Error: Unknown symbol get_zone_counts\n");
                return -EFAULT;
        }
#endif /* NEED_GET_ZONE_COUNTS && !HAVE_GET_ZONE_COUNTS */

        /*
         * It is now safe to initialize the global tunings which rely on
         * the use of the for_each_zone() macro.  This macro in turn
         * depends on the *_pgdat symbols which are now available.
         */
        spl_kmem_init_globals();

#ifndef HAVE_SHRINK_DCACHE_MEMORY
        /* When shrink_dcache_memory_fn == NULL support is disabled */
        shrink_dcache_memory_fn = (shrink_dcache_memory_t)
                spl_kallsyms_lookup_name("shrink_dcache_memory");
#endif /* HAVE_SHRINK_DCACHE_MEMORY */

#ifndef HAVE_SHRINK_ICACHE_MEMORY
        /* When shrink_icache_memory_fn == NULL support is disabled */
        shrink_icache_memory_fn = (shrink_icache_memory_t)
                spl_kallsyms_lookup_name("shrink_icache_memory");
#endif /* HAVE_SHRINK_ICACHE_MEMORY */

        return 0;
}
int
spl_kmem_init(void)
{
        int rc = 0;
        SENTRY;

#ifdef DEBUG_KMEM
        kmem_alloc_used_set(0);
        vmem_alloc_used_set(0);

        spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
        spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE);
#endif /* DEBUG_KMEM */

        init_rwsem(&spl_kmem_cache_sem);
        INIT_LIST_HEAD(&spl_kmem_cache_list);
        spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
            1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE);

        spl_register_shrinker(&spl_kmem_cache_shrinker);

        SRETURN(rc);
}

void
spl_kmem_fini(void)
{
        SENTRY;

        spl_unregister_shrinker(&spl_kmem_cache_shrinker);
        taskq_destroy(spl_kmem_cache_taskq);

#ifdef DEBUG_KMEM
        /*
         * Display all unreclaimed memory addresses, including the
         * allocation size and the first few bytes of what's located
         * at that address to aid in debugging.  Performance is not
         * a serious concern here since it is module unload time.
         */
        if (kmem_alloc_used_read() != 0)
                SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
                    "kmem leaked %ld/%ld bytes\n",
                    kmem_alloc_used_read(), kmem_alloc_max);

        if (vmem_alloc_used_read() != 0)
                SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
                    "vmem leaked %ld/%ld bytes\n",
                    vmem_alloc_used_read(), vmem_alloc_max);

        spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
        spl_kmem_fini_tracking(&vmem_list, &vmem_lock);
#endif /* DEBUG_KMEM */

        SEXIT;
}