module/spl/spl-kmem.c

   1 /*****************************************************************************\
   2  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
   3  *  Copyright (C) 2007 The Regents of the University of California.
   4  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
   5  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
   6  *  UCRL-CODE-235197
   7  *
   8  *  This file is part of the SPL, Solaris Porting Layer.
   9  *  For details, see <http://github.com/behlendorf/spl/>.
  10  *
  11  *  The SPL is free software; you can redistribute it and/or modify it
  12  *  under the terms of the GNU General Public License as published by the
  13  *  Free Software Foundation; either version 2 of the License, or (at your
  14  *  option) any later version.
  15  *
  16  *  The SPL is distributed in the hope that it will be useful, but WITHOUT
  17  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  18  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  19  *  for more details.
  20  *
  21  *  You should have received a copy of the GNU General Public License along
  22  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  23  *****************************************************************************
  24  *  Solaris Porting Layer (SPL) Kmem Implementation.
  25 \*****************************************************************************/
  26
  27 #include <sys/kmem.h>
  28 #include <spl-debug.h>
  29
  30 #ifdef SS_DEBUG_SUBSYS
  31 #undef SS_DEBUG_SUBSYS
  32 #endif
  33
  34 #define SS_DEBUG_SUBSYS SS_KMEM
  35
  36 /*
  37  * The minimum amount of memory measured in pages to be free at all
  38  * times on the system.  This is similar to Linux's zone->pages_min
  39  * multipled by the number of zones and is sized based on that.
  40  */
  41 pgcnt_t minfree = 0;
  42 EXPORT_SYMBOL(minfree);
  43
  44 /*
  45  * The desired amount of memory measured in pages to be free at all
  46  * times on the system.  This is similar to Linux's zone->pages_low
  47  * multipled by the number of zones and is sized based on that.
  48  * Assuming all zones are being used roughly equally, when we drop
  49  * below this threshold async page reclamation is triggered.
  50  */
  51 pgcnt_t desfree = 0;
  52 EXPORT_SYMBOL(desfree);
  53
  54 /*
  55  * When above this amount of memory measures in pages the system is
  56  * determined to have enough free memory.  This is similar to Linux's
  57  * zone->pages_high multipled by the number of zones and is sized based
  58  * on that.  Assuming all zones are being used roughly equally, when
  59  * async page reclamation reaches this threshold it stops.
  60  */
  61 pgcnt_t lotsfree = 0;
  62 EXPORT_SYMBOL(lotsfree);
  63
  64 /* Unused always 0 in this implementation */
  65 pgcnt_t needfree = 0;
  66 EXPORT_SYMBOL(needfree);
  67
  68 pgcnt_t swapfs_minfree = 0;
  69 EXPORT_SYMBOL(swapfs_minfree);
  70
  71 pgcnt_t swapfs_reserve = 0;
  72 EXPORT_SYMBOL(swapfs_reserve);
  73
  74 vmem_t *heap_arena = NULL;
  75 EXPORT_SYMBOL(heap_arena);
  76
  77 vmem_t *zio_alloc_arena = NULL;
  78 EXPORT_SYMBOL(zio_alloc_arena);
  79
  80 vmem_t *zio_arena = NULL;
  81 EXPORT_SYMBOL(zio_arena);
  82
  83 #ifndef HAVE_GET_VMALLOC_INFO
  84 get_vmalloc_info_t get_vmalloc_info_fn = SYMBOL_POISON;
  85 EXPORT_SYMBOL(get_vmalloc_info_fn);
  86 #endif /* HAVE_GET_VMALLOC_INFO */
  87
  88 #ifdef HAVE_PGDAT_HELPERS
  89 # ifndef HAVE_FIRST_ONLINE_PGDAT
  90 first_online_pgdat_t first_online_pgdat_fn = SYMBOL_POISON;
  91 EXPORT_SYMBOL(first_online_pgdat_fn);
  92 # endif /* HAVE_FIRST_ONLINE_PGDAT */
  93
  94 # ifndef HAVE_NEXT_ONLINE_PGDAT
  95 next_online_pgdat_t next_online_pgdat_fn = SYMBOL_POISON;
  96 EXPORT_SYMBOL(next_online_pgdat_fn);
  97 # endif /* HAVE_NEXT_ONLINE_PGDAT */
  98
  99 # ifndef HAVE_NEXT_ZONE
 100 next_zone_t next_zone_fn = SYMBOL_POISON;
 101 EXPORT_SYMBOL(next_zone_fn);
 102 # endif /* HAVE_NEXT_ZONE */
 103
 104 #else /* HAVE_PGDAT_HELPERS */
 105
 106 # ifndef HAVE_PGDAT_LIST
 107 struct pglist_data *pgdat_list_addr = SYMBOL_POISON;
 108 EXPORT_SYMBOL(pgdat_list_addr);
 109 # endif /* HAVE_PGDAT_LIST */
 110
 111 #endif /* HAVE_PGDAT_HELPERS */
 112
 113 #ifdef NEED_GET_ZONE_COUNTS
 114 # ifndef HAVE_GET_ZONE_COUNTS
 115 get_zone_counts_t get_zone_counts_fn = SYMBOL_POISON;
 116 EXPORT_SYMBOL(get_zone_counts_fn);
 117 # endif /* HAVE_GET_ZONE_COUNTS */
 118
 119 unsigned long
 120 spl_global_page_state(spl_zone_stat_item_t item)
 121 {
 122         unsigned long active;
 123         unsigned long inactive;
 124         unsigned long free;
 125
 126         get_zone_counts(&active, &inactive, &free);
 127         switch (item) {
 128         case SPL_NR_FREE_PAGES: return free;
 129         case SPL_NR_INACTIVE:   return inactive;
 130         case SPL_NR_ACTIVE:     return active;
 131         default:                ASSERT(0); /* Unsupported */
 132         }
 133
 134         return 0;
 135 }
 136 #else
 137 # ifdef HAVE_GLOBAL_PAGE_STATE
 138 unsigned long
 139 spl_global_page_state(spl_zone_stat_item_t item)
 140 {
 141         unsigned long pages = 0;
 142
 143         switch (item) {
 144         case SPL_NR_FREE_PAGES:
 145 #  ifdef HAVE_ZONE_STAT_ITEM_NR_FREE_PAGES
 146                 pages += global_page_state(NR_FREE_PAGES);
 147 #  endif
 148                 break;
 149         case SPL_NR_INACTIVE:
 150 #  ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE
 151                 pages += global_page_state(NR_INACTIVE);
 152 #  endif
 153 #  ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE_ANON
 154                 pages += global_page_state(NR_INACTIVE_ANON);
 155 #  endif
 156 #  ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE_FILE
 157                 pages += global_page_state(NR_INACTIVE_FILE);
 158 #  endif
 159                 break;
 160         case SPL_NR_ACTIVE:
 161 #  ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE
 162                 pages += global_page_state(NR_ACTIVE);
 163 #  endif
 164 #  ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE_ANON
 165                 pages += global_page_state(NR_ACTIVE_ANON);
 166 #  endif
 167 #  ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE_FILE
 168                 pages += global_page_state(NR_ACTIVE_FILE);
 169 #  endif
 170                 break;
 171         default:
 172                 ASSERT(0); /* Unsupported */
 173         }
 174
 175         return pages;
 176 }
 177 # else
 178 #  error "Both global_page_state() and get_zone_counts() unavailable"
 179 # endif /* HAVE_GLOBAL_PAGE_STATE */
 180 #endif /* NEED_GET_ZONE_COUNTS */
 181 EXPORT_SYMBOL(spl_global_page_state);
 182
 183 #ifndef HAVE_INVALIDATE_INODES
 184 invalidate_inodes_t invalidate_inodes_fn = SYMBOL_POISON;
 185 EXPORT_SYMBOL(invalidate_inodes_fn);
 186 #endif /* HAVE_INVALIDATE_INODES */
 187
 188 pgcnt_t
 189 spl_kmem_availrmem(void)
 190 {
 191         /* The amount of easily available memory */
 192         return (spl_global_page_state(SPL_NR_FREE_PAGES) +
 193                 spl_global_page_state(SPL_NR_INACTIVE));
 194 }
 195 EXPORT_SYMBOL(spl_kmem_availrmem);
 196
 197 size_t
 198 vmem_size(vmem_t *vmp, int typemask)
 199 {
 200         struct vmalloc_info vmi;
 201         size_t size = 0;
 202
 203         ASSERT(vmp == NULL);
 204         ASSERT(typemask & (VMEM_ALLOC | VMEM_FREE));
 205
 206         get_vmalloc_info(&vmi);
 207         if (typemask & VMEM_ALLOC)
 208                 size += (size_t)vmi.used;
 209
 210         if (typemask & VMEM_FREE)
 211                 size += (size_t)(VMALLOC_TOTAL - vmi.used);
 212
 213         return size;
 214 }
 215 EXPORT_SYMBOL(vmem_size);
 216
 217 int
 218 kmem_debugging(void)
 219 {
 220         return 0;
 221 }
 222 EXPORT_SYMBOL(kmem_debugging);
 223
 224 #ifndef HAVE_KVASPRINTF
 225 /* Simplified asprintf. */
 226 char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
 227 {
 228         unsigned int len;
 229         char *p;
 230         va_list aq;
 231
 232         va_copy(aq, ap);
 233         len = vsnprintf(NULL, 0, fmt, aq);
 234         va_end(aq);
 235
 236         p = kmalloc(len+1, gfp);
 237         if (!p)
 238                 return NULL;
 239
 240         vsnprintf(p, len+1, fmt, ap);
 241
 242         return p;
 243 }
 244 EXPORT_SYMBOL(kvasprintf);
 245 #endif /* HAVE_KVASPRINTF */
 246
 247 char *
 248 kmem_vasprintf(const char *fmt, va_list ap)
 249 {
 250         va_list aq;
 251         char *ptr;
 252
 253         do {
 254                 va_copy(aq, ap);
 255                 ptr = kvasprintf(GFP_KERNEL, fmt, aq);
 256                 va_end(aq);
 257         } while (ptr == NULL);
 258
 259         return ptr;
 260 }
 261 EXPORT_SYMBOL(kmem_vasprintf);
 262
 263 char *
 264 kmem_asprintf(const char *fmt, ...)
 265 {
 266         va_list ap;
 267         char *ptr;
 268
 269         do {
 270                 va_start(ap, fmt);
 271                 ptr = kvasprintf(GFP_KERNEL, fmt, ap);
 272                 va_end(ap);
 273         } while (ptr == NULL);
 274
 275         return ptr;
 276 }
 277 EXPORT_SYMBOL(kmem_asprintf);
 278
 279 static char *
 280 __strdup(const char *str, int flags)
 281 {
 282         char *ptr;
 283         int n;
 284
 285         n = strlen(str);
 286         ptr = kmalloc_nofail(n + 1, flags);
 287         if (ptr)
 288                 memcpy(ptr, str, n + 1);
 289
 290         return ptr;
 291 }
 292
 293 char *
 294 strdup(const char *str)
 295 {
 296         return __strdup(str, KM_SLEEP);
 297 }
 298 EXPORT_SYMBOL(strdup);
 299
 300 void
 301 strfree(char *str)
 302 {
 303         kfree(str);
 304 }
 305 EXPORT_SYMBOL(strfree);
 306
 307 /*
 308  * Memory allocation interfaces and debugging for basic kmem_*
 309  * and vmem_* style memory allocation.  When DEBUG_KMEM is enabled
 310  * the SPL will keep track of the total memory allocated, and
 311  * report any memory leaked when the module is unloaded.
 312  */
 313 #ifdef DEBUG_KMEM
 314
 315 /* Shim layer memory accounting */
 316 # ifdef HAVE_ATOMIC64_T
 317 atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
 318 unsigned long long kmem_alloc_max = 0;
 319 atomic64_t vmem_alloc_used = ATOMIC64_INIT(0);
 320 unsigned long long vmem_alloc_max = 0;
 321 # else  /* HAVE_ATOMIC64_T */
 322 atomic_t kmem_alloc_used = ATOMIC_INIT(0);
 323 unsigned long long kmem_alloc_max = 0;
 324 atomic_t vmem_alloc_used = ATOMIC_INIT(0);
 325 unsigned long long vmem_alloc_max = 0;
 326 # endif /* HAVE_ATOMIC64_T */
 327
 328 EXPORT_SYMBOL(kmem_alloc_used);
 329 EXPORT_SYMBOL(kmem_alloc_max);
 330 EXPORT_SYMBOL(vmem_alloc_used);
 331 EXPORT_SYMBOL(vmem_alloc_max);
 332
 333 /* When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
 334  * but also the location of every alloc and free.  When the SPL module is
 335  * unloaded a list of all leaked addresses and where they were allocated
 336  * will be dumped to the console.  Enabling this feature has a significant
 337  * impact on performance but it makes finding memory leaks straight forward.
 338  *
 339  * Not surprisingly with debugging enabled the xmem_locks are very highly
 340  * contended particularly on xfree().  If we want to run with this detailed
 341  * debugging enabled for anything other than debugging  we need to minimize
 342  * the contention by moving to a lock per xmem_table entry model.
 343  */
 344 # ifdef DEBUG_KMEM_TRACKING
 345
 346 #  define KMEM_HASH_BITS          10
 347 #  define KMEM_TABLE_SIZE         (1 << KMEM_HASH_BITS)
 348
 349 #  define VMEM_HASH_BITS          10
 350 #  define VMEM_TABLE_SIZE         (1 << VMEM_HASH_BITS)
 351
 352 typedef struct kmem_debug {
 353         struct hlist_node kd_hlist;     /* Hash node linkage */
 354         struct list_head kd_list;       /* List of all allocations */
 355         void *kd_addr;                  /* Allocation pointer */
 356         size_t kd_size;                 /* Allocation size */
 357         const char *kd_func;            /* Allocation function */
 358         int kd_line;                    /* Allocation line */
 359 } kmem_debug_t;
 360
 361 spinlock_t kmem_lock;
 362 struct hlist_head kmem_table[KMEM_TABLE_SIZE];
 363 struct list_head kmem_list;
 364
 365 spinlock_t vmem_lock;
 366 struct hlist_head vmem_table[VMEM_TABLE_SIZE];
 367 struct list_head vmem_list;
 368
 369 EXPORT_SYMBOL(kmem_lock);
 370 EXPORT_SYMBOL(kmem_table);
 371 EXPORT_SYMBOL(kmem_list);
 372
 373 EXPORT_SYMBOL(vmem_lock);
 374 EXPORT_SYMBOL(vmem_table);
 375 EXPORT_SYMBOL(vmem_list);
 376
 377 static kmem_debug_t *
 378 kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, void *addr)
 379 {
 380         struct hlist_head *head;
 381         struct hlist_node *node;
 382         struct kmem_debug *p;
 383         unsigned long flags;
 384         SENTRY;
 385
 386         spin_lock_irqsave(lock, flags);
 387
 388         head = &table[hash_ptr(addr, bits)];
 389         hlist_for_each_entry_rcu(p, node, head, kd_hlist) {
 390                 if (p->kd_addr == addr) {
 391                         hlist_del_init(&p->kd_hlist);
 392                         list_del_init(&p->kd_list);
 393                         spin_unlock_irqrestore(lock, flags);
 394                         return p;
 395                 }
 396         }
 397
 398         spin_unlock_irqrestore(lock, flags);
 399
 400         SRETURN(NULL);
 401 }
 402
 403 void *
 404 kmem_alloc_track(size_t size, int flags, const char *func, int line,
 405     int node_alloc, int node)
 406 {
 407         void *ptr = NULL;
 408         kmem_debug_t *dptr;
 409         unsigned long irq_flags;
 410         SENTRY;
 411
 412         /* Function may be called with KM_NOSLEEP so failure is possible */
 413         dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
 414             flags & ~__GFP_ZERO);
 415
 416         if (unlikely(dptr == NULL)) {
 417                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "debug "
 418                     "kmem_alloc(%ld, 0x%x) at %s:%d failed (%lld/%llu)\n",
 419                     sizeof(kmem_debug_t), flags, func, line,
 420                     kmem_alloc_used_read(), kmem_alloc_max);
 421         } else {
 422                 /*
 423                  * Marked unlikely because we should never be doing this,
 424                  * we tolerate to up 2 pages but a single page is best.
 425                  */
 426                 if (unlikely((size > PAGE_SIZE*2) && !(flags & KM_NODEBUG))) {
 427                         SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "large "
 428                             "kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
 429                             (unsigned long long) size, flags, func, line,
 430                             kmem_alloc_used_read(), kmem_alloc_max);
 431                         spl_debug_dumpstack(NULL);
 432                 }
 433
 434                 /*
 435                  *  We use __strdup() below because the string pointed to by
 436                  * __FUNCTION__ might not be available by the time we want
 437                  * to print it since the module might have been unloaded.
 438                  * This can only fail in the KM_NOSLEEP case.
 439                  */
 440                 dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
 441                 if (unlikely(dptr->kd_func == NULL)) {
 442                         kfree(dptr);
 443                         SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
 444                             "debug __strdup() at %s:%d failed (%lld/%llu)\n",
 445                             func, line, kmem_alloc_used_read(), kmem_alloc_max);
 446                         goto out;
 447                 }
 448
 449                 /* Use the correct allocator */
 450                 if (node_alloc) {
 451                         ASSERT(!(flags & __GFP_ZERO));
 452                         ptr = kmalloc_node_nofail(size, flags, node);
 453                 } else if (flags & __GFP_ZERO) {
 454                         ptr = kzalloc_nofail(size, flags & ~__GFP_ZERO);
 455                 } else {
 456                         ptr = kmalloc_nofail(size, flags);
 457                 }
 458
 459                 if (unlikely(ptr == NULL)) {
 460                         kfree(dptr->kd_func);
 461                         kfree(dptr);
 462                         SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "kmem_alloc"
 463                             "(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
 464                             (unsigned long long) size, flags, func, line,
 465                             kmem_alloc_used_read(), kmem_alloc_max);
 466                         goto out;
 467                 }
 468
 469                 kmem_alloc_used_add(size);
 470                 if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
 471                         kmem_alloc_max = kmem_alloc_used_read();
 472
 473                 INIT_HLIST_NODE(&dptr->kd_hlist);
 474                 INIT_LIST_HEAD(&dptr->kd_list);
 475
 476                 dptr->kd_addr = ptr;
 477                 dptr->kd_size = size;
 478                 dptr->kd_line = line;
 479
 480                 spin_lock_irqsave(&kmem_lock, irq_flags);
 481                 hlist_add_head_rcu(&dptr->kd_hlist,
 482                     &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
 483                 list_add_tail(&dptr->kd_list, &kmem_list);
 484                 spin_unlock_irqrestore(&kmem_lock, irq_flags);
 485
 486                 SDEBUG_LIMIT(SD_INFO,
 487                     "kmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n",
 488                     (unsigned long long) size, flags, func, line, ptr,
 489                     kmem_alloc_used_read(), kmem_alloc_max);
 490         }
 491 out:
 492         SRETURN(ptr);
 493 }
 494 EXPORT_SYMBOL(kmem_alloc_track);
 495
 496 void
 497 kmem_free_track(void *ptr, size_t size)
 498 {
 499         kmem_debug_t *dptr;
 500         SENTRY;
 501
 502         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 503             (unsigned long long) size);
 504
 505         dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
 506
 507         /* Must exist in hash due to kmem_alloc() */
 508         ASSERT(dptr);
 509
 510         /* Size must match */
 511         ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
 512             "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
 513             (unsigned long long) size, dptr->kd_func, dptr->kd_line);
 514
 515         kmem_alloc_used_sub(size);
 516         SDEBUG_LIMIT(SD_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 517             (unsigned long long) size, kmem_alloc_used_read(),
 518             kmem_alloc_max);
 519
 520         kfree(dptr->kd_func);
 521
 522         memset(dptr, 0x5a, sizeof(kmem_debug_t));
 523         kfree(dptr);
 524
 525         memset(ptr, 0x5a, size);
 526         kfree(ptr);
 527
 528         SEXIT;
 529 }
 530 EXPORT_SYMBOL(kmem_free_track);
 531
 532 void *
 533 vmem_alloc_track(size_t size, int flags, const char *func, int line)
 534 {
 535         void *ptr = NULL;
 536         kmem_debug_t *dptr;
 537         unsigned long irq_flags;
 538         SENTRY;
 539
 540         ASSERT(flags & KM_SLEEP);
 541
 542         /* Function may be called with KM_NOSLEEP so failure is possible */
 543         dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
 544             flags & ~__GFP_ZERO);
 545         if (unlikely(dptr == NULL)) {
 546                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "debug "
 547                     "vmem_alloc(%ld, 0x%x) at %s:%d failed (%lld/%llu)\n",
 548                     sizeof(kmem_debug_t), flags, func, line,
 549                     vmem_alloc_used_read(), vmem_alloc_max);
 550         } else {
 551                 /*
 552                  * We use __strdup() below because the string pointed to by
 553                  * __FUNCTION__ might not be available by the time we want
 554                  * to print it, since the module might have been unloaded.
 555                  * This can never fail because we have already asserted
 556                  * that flags is KM_SLEEP.
 557                  */
 558                 dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
 559                 if (unlikely(dptr->kd_func == NULL)) {
 560                         kfree(dptr);
 561                         SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
 562                             "debug __strdup() at %s:%d failed (%lld/%llu)\n",
 563                             func, line, vmem_alloc_used_read(), vmem_alloc_max);
 564                         goto out;
 565                 }
 566
 567                 /* Use the correct allocator */
 568                 if (flags & __GFP_ZERO) {
 569                         ptr = vzalloc_nofail(size, flags & ~__GFP_ZERO);
 570                 } else {
 571                         ptr = vmalloc_nofail(size, flags);
 572                 }
 573
 574                 if (unlikely(ptr == NULL)) {
 575                         kfree(dptr->kd_func);
 576                         kfree(dptr);
 577                         SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "vmem_alloc"
 578                             "(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
 579                             (unsigned long long) size, flags, func, line,
 580                             vmem_alloc_used_read(), vmem_alloc_max);
 581                         goto out;
 582                 }
 583
 584                 vmem_alloc_used_add(size);
 585                 if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
 586                         vmem_alloc_max = vmem_alloc_used_read();
 587
 588                 INIT_HLIST_NODE(&dptr->kd_hlist);
 589                 INIT_LIST_HEAD(&dptr->kd_list);
 590
 591                 dptr->kd_addr = ptr;
 592                 dptr->kd_size = size;
 593                 dptr->kd_line = line;
 594
 595                 spin_lock_irqsave(&vmem_lock, irq_flags);
 596                 hlist_add_head_rcu(&dptr->kd_hlist,
 597                     &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]);
 598                 list_add_tail(&dptr->kd_list, &vmem_list);
 599                 spin_unlock_irqrestore(&vmem_lock, irq_flags);
 600
 601                 SDEBUG_LIMIT(SD_INFO,
 602                     "vmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n",
 603                     (unsigned long long) size, flags, func, line,
 604                     ptr, vmem_alloc_used_read(), vmem_alloc_max);
 605         }
 606 out:
 607         SRETURN(ptr);
 608 }
 609 EXPORT_SYMBOL(vmem_alloc_track);
 610
 611 void
 612 vmem_free_track(void *ptr, size_t size)
 613 {
 614         kmem_debug_t *dptr;
 615         SENTRY;
 616
 617         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 618             (unsigned long long) size);
 619
 620         dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr);
 621
 622         /* Must exist in hash due to vmem_alloc() */
 623         ASSERT(dptr);
 624
 625         /* Size must match */
 626         ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
 627             "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
 628             (unsigned long long) size, dptr->kd_func, dptr->kd_line);
 629
 630         vmem_alloc_used_sub(size);
 631         SDEBUG_LIMIT(SD_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 632             (unsigned long long) size, vmem_alloc_used_read(),
 633             vmem_alloc_max);
 634
 635         kfree(dptr->kd_func);
 636
 637         memset(dptr, 0x5a, sizeof(kmem_debug_t));
 638         kfree(dptr);
 639
 640         memset(ptr, 0x5a, size);
 641         vfree(ptr);
 642
 643         SEXIT;
 644 }
 645 EXPORT_SYMBOL(vmem_free_track);
 646
 647 # else /* DEBUG_KMEM_TRACKING */
 648
 649 void *
 650 kmem_alloc_debug(size_t size, int flags, const char *func, int line,
 651     int node_alloc, int node)
 652 {
 653         void *ptr;
 654         SENTRY;
 655
 656         /*
 657          * Marked unlikely because we should never be doing this,
 658          * we tolerate to up 2 pages but a single page is best.
 659          */
 660         if (unlikely((size > PAGE_SIZE * 2) && !(flags & KM_NODEBUG))) {
 661                 SDEBUG(SD_CONSOLE | SD_WARNING,
 662                     "large kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
 663                     (unsigned long long) size, flags, func, line,
 664                     kmem_alloc_used_read(), kmem_alloc_max);
 665                 spl_debug_dumpstack(NULL);
 666         }
 667
 668         /* Use the correct allocator */
 669         if (node_alloc) {
 670                 ASSERT(!(flags & __GFP_ZERO));
 671                 ptr = kmalloc_node_nofail(size, flags, node);
 672         } else if (flags & __GFP_ZERO) {
 673                 ptr = kzalloc_nofail(size, flags & (~__GFP_ZERO));
 674         } else {
 675                 ptr = kmalloc_nofail(size, flags);
 676         }
 677
 678         if (unlikely(ptr == NULL)) {
 679                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
 680                     "kmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
 681                     (unsigned long long) size, flags, func, line,
 682                     kmem_alloc_used_read(), kmem_alloc_max);
 683         } else {
 684                 kmem_alloc_used_add(size);
 685                 if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
 686                         kmem_alloc_max = kmem_alloc_used_read();
 687
 688                 SDEBUG_LIMIT(SD_INFO,
 689                     "kmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n",
 690                     (unsigned long long) size, flags, func, line, ptr,
 691                     kmem_alloc_used_read(), kmem_alloc_max);
 692         }
 693
 694         SRETURN(ptr);
 695 }
 696 EXPORT_SYMBOL(kmem_alloc_debug);
 697
 698 void
 699 kmem_free_debug(void *ptr, size_t size)
 700 {
 701         SENTRY;
 702
 703         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 704             (unsigned long long) size);
 705
 706         kmem_alloc_used_sub(size);
 707         SDEBUG_LIMIT(SD_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 708             (unsigned long long) size, kmem_alloc_used_read(),
 709             kmem_alloc_max);
 710         kfree(ptr);
 711
 712         SEXIT;
 713 }
 714 EXPORT_SYMBOL(kmem_free_debug);
 715
 716 void *
 717 vmem_alloc_debug(size_t size, int flags, const char *func, int line)
 718 {
 719         void *ptr;
 720         SENTRY;
 721
 722         ASSERT(flags & KM_SLEEP);
 723
 724         /* Use the correct allocator */
 725         if (flags & __GFP_ZERO) {
 726                 ptr = vzalloc_nofail(size, flags & (~__GFP_ZERO));
 727         } else {
 728                 ptr = vmalloc_nofail(size, flags);
 729         }
 730
 731         if (unlikely(ptr == NULL)) {
 732                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
 733                     "vmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
 734                     (unsigned long long) size, flags, func, line,
 735                     vmem_alloc_used_read(), vmem_alloc_max);
 736         } else {
 737                 vmem_alloc_used_add(size);
 738                 if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
 739                         vmem_alloc_max = vmem_alloc_used_read();
 740
 741                 SDEBUG_LIMIT(SD_INFO, "vmem_alloc(%llu, 0x%x) = %p "
 742                     "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
 743                     vmem_alloc_used_read(), vmem_alloc_max);
 744         }
 745
 746         SRETURN(ptr);
 747 }
 748 EXPORT_SYMBOL(vmem_alloc_debug);
 749
 750 void
 751 vmem_free_debug(void *ptr, size_t size)
 752 {
 753         SENTRY;
 754
 755         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 756             (unsigned long long) size);
 757
 758         vmem_alloc_used_sub(size);
 759         SDEBUG_LIMIT(SD_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 760             (unsigned long long) size, vmem_alloc_used_read(),
 761             vmem_alloc_max);
 762         vfree(ptr);
 763
 764         SEXIT;
 765 }
 766 EXPORT_SYMBOL(vmem_free_debug);
 767
 768 # endif /* DEBUG_KMEM_TRACKING */
 769 #endif /* DEBUG_KMEM */
 770
 771 /*
 772  * Slab allocation interfaces
 773  *
 774  * While the Linux slab implementation was inspired by the Solaris
 775  * implemenation I cannot use it to emulate the Solaris APIs.  I
 776  * require two features which are not provided by the Linux slab.
 777  *
 778  * 1) Constructors AND destructors.  Recent versions of the Linux
 779  *    kernel have removed support for destructors.  This is a deal
 780  *    breaker for the SPL which contains particularly expensive
 781  *    initializers for mutex's, condition variables, etc.  We also
 782  *    require a minimal level of cleanup for these data types unlike
 783  *    many Linux data type which do need to be explicitly destroyed.
 784  *
 785  * 2) Virtual address space backed slab.  Callers of the Solaris slab
 786  *    expect it to work well for both small are very large allocations.
 787  *    Because of memory fragmentation the Linux slab which is backed
 788  *    by kmalloc'ed memory performs very badly when confronted with
 789  *    large numbers of large allocations.  Basing the slab on the
 790  *    virtual address space removes the need for contigeous pages
 791  *    and greatly improve performance for large allocations.
 792  *
 793  * For these reasons, the SPL has its own slab implementation with
 794  * the needed features.  It is not as highly optimized as either the
 795  * Solaris or Linux slabs, but it should get me most of what is
 796  * needed until it can be optimized or obsoleted by another approach.
 797  *
 798  * One serious concern I do have about this method is the relatively
 799  * small virtual address space on 32bit arches.  This will seriously
 800  * constrain the size of the slab caches and their performance.
 801  *
 802  * XXX: Improve the partial slab list by carefully maintaining a
 803  *      strict ordering of fullest to emptiest slabs based on
 804  *      the slab reference count.  This gaurentees the when freeing
 805  *      slabs back to the system we need only linearly traverse the
 806  *      last N slabs in the list to discover all the freeable slabs.
 807  *
 808  * XXX: NUMA awareness for optionally allocating memory close to a
 809  *      particular core.  This can be adventageous if you know the slab
 810  *      object will be short lived and primarily accessed from one core.
 811  *
 812  * XXX: Slab coloring may also yield performance improvements and would
 813  *      be desirable to implement.
 814  */
 815
 816 struct list_head spl_kmem_cache_list;   /* List of caches */
 817 struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
 818
 819 static int spl_cache_flush(spl_kmem_cache_t *skc,
 820                            spl_kmem_magazine_t *skm, int flush);
 821
 822 #ifdef HAVE_SET_SHRINKER
 823 static struct shrinker *spl_kmem_cache_shrinker;
 824 #else
 825 # ifdef HAVE_3ARGS_SHRINKER_CALLBACK
 826 static int spl_kmem_cache_generic_shrinker(struct shrinker *shrinker_cb,
 827     int nr_to_scan, unsigned int gfp_mask);
 828 # else
 829 static int spl_kmem_cache_generic_shrinker(
 830     int nr_to_scan, unsigned int gfp_mask);
 831 # endif /* HAVE_3ARGS_SHRINKER_CALLBACK */
 832 static struct shrinker spl_kmem_cache_shrinker = {
 833         .shrink = spl_kmem_cache_generic_shrinker,
 834         .seeks = KMC_DEFAULT_SEEKS,
 835 };
 836 #endif /* HAVE_SET_SHRINKER */
 837
 838 static void *
 839 kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
 840 {
 841         void *ptr;
 842
 843         ASSERT(ISP2(size));
 844
 845         if (skc->skc_flags & KMC_KMEM)
 846                 ptr = (void *)__get_free_pages(flags, get_order(size));
 847         else
 848                 ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
 849
 850         /* Resulting allocated memory will be page aligned */
 851         ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
 852
 853         return ptr;
 854 }
 855
 856 static void
 857 kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
 858 {
 859         ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
 860         ASSERT(ISP2(size));
 861
 862         if (skc->skc_flags & KMC_KMEM)
 863                 free_pages((unsigned long)ptr, get_order(size));
 864         else
 865                 vfree(ptr);
 866 }
 867
 868 /*
 869  * Required space for each aligned sks.
 870  */
 871 static inline uint32_t
 872 spl_sks_size(spl_kmem_cache_t *skc)
 873 {
 874         return P2ROUNDUP_TYPED(sizeof(spl_kmem_slab_t),
 875                skc->skc_obj_align, uint32_t);
 876 }
 877
 878 /*
 879  * Required space for each aligned object.
 880  */
 881 static inline uint32_t
 882 spl_obj_size(spl_kmem_cache_t *skc)
 883 {
 884         uint32_t align = skc->skc_obj_align;
 885
 886         return P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
 887                P2ROUNDUP_TYPED(sizeof(spl_kmem_obj_t), align, uint32_t);
 888 }
 889
 890 /*
 891  * Lookup the spl_kmem_object_t for an object given that object.
 892  */
 893 static inline spl_kmem_obj_t *
 894 spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
 895 {
 896         return obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
 897                skc->skc_obj_align, uint32_t);
 898 }
 899
 900 /*
 901  * Required space for each offslab object taking in to account alignment
 902  * restrictions and the power-of-two requirement of kv_alloc().
 903  */
 904 static inline uint32_t
 905 spl_offslab_size(spl_kmem_cache_t *skc)
 906 {
 907         return 1UL << (highbit(spl_obj_size(skc)) + 1);
 908 }
 909
 910 /*
 911  * It's important that we pack the spl_kmem_obj_t structure and the
 912  * actual objects in to one large address space to minimize the number
 913  * of calls to the allocator.  It is far better to do a few large
 914  * allocations and then subdivide it ourselves.  Now which allocator
 915  * we use requires balancing a few trade offs.
 916  *
 917  * For small objects we use kmem_alloc() because as long as you are
 918  * only requesting a small number of pages (ideally just one) its cheap.
 919  * However, when you start requesting multiple pages with kmem_alloc()
 920  * it gets increasingly expensive since it requires contigeous pages.
 921  * For this reason we shift to vmem_alloc() for slabs of large objects
 922  * which removes the need for contigeous pages.  We do not use
 923  * vmem_alloc() in all cases because there is significant locking
 924  * overhead in __get_vm_area_node().  This function takes a single
 925  * global lock when aquiring an available virtual address range which
 926  * serializes all vmem_alloc()'s for all slab caches.  Using slightly
 927  * different allocation functions for small and large objects should
 928  * give us the best of both worlds.
 929  *
 930  * KMC_ONSLAB                       KMC_OFFSLAB
 931  *
 932  * +------------------------+       +-----------------+
 933  * | spl_kmem_slab_t --+-+  |       | spl_kmem_slab_t |---+-+
 934  * | skc_obj_size    <-+ |  |       +-----------------+   | |
 935  * | spl_kmem_obj_t      |  |                             | |
 936  * | skc_obj_size    <---+  |       +-----------------+   | |
 937  * | spl_kmem_obj_t      |  |       | skc_obj_size    | <-+ |
 938  * | ...                 v  |       | spl_kmem_obj_t  |     |
 939  * +------------------------+       +-----------------+     v
 940  */
 941 static spl_kmem_slab_t *
 942 spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
 943 {
 944         spl_kmem_slab_t *sks;
 945         spl_kmem_obj_t *sko, *n;
 946         void *base, *obj;
 947         uint32_t obj_size, offslab_size = 0;
 948         int i,  rc = 0;
 949
 950         base = kv_alloc(skc, skc->skc_slab_size, flags);
 951         if (base == NULL)
 952                 SRETURN(NULL);
 953
 954         sks = (spl_kmem_slab_t *)base;
 955         sks->sks_magic = SKS_MAGIC;
 956         sks->sks_objs = skc->skc_slab_objs;
 957         sks->sks_age = jiffies;
 958         sks->sks_cache = skc;
 959         INIT_LIST_HEAD(&sks->sks_list);
 960         INIT_LIST_HEAD(&sks->sks_free_list);
 961         sks->sks_ref = 0;
 962         obj_size = spl_obj_size(skc);
 963
 964         if (skc->skc_flags * KMC_OFFSLAB)
 965                 offslab_size = spl_offslab_size(skc);
 966
 967         for (i = 0; i < sks->sks_objs; i++) {
 968                 if (skc->skc_flags & KMC_OFFSLAB) {
 969                         obj = kv_alloc(skc, offslab_size, flags);
 970                         if (!obj)
 971                                 SGOTO(out, rc = -ENOMEM);
 972                 } else {
 973                         obj = base + spl_sks_size(skc) + (i * obj_size);
 974                 }
 975
 976                 ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
 977                 sko = spl_sko_from_obj(skc, obj);
 978                 sko->sko_addr = obj;
 979                 sko->sko_magic = SKO_MAGIC;
 980                 sko->sko_slab = sks;
 981                 INIT_LIST_HEAD(&sko->sko_list);
 982                 list_add_tail(&sko->sko_list, &sks->sks_free_list);
 983         }
 984
 985         list_for_each_entry(sko, &sks->sks_free_list, sko_list)
 986                 if (skc->skc_ctor)
 987                         skc->skc_ctor(sko->sko_addr, skc->skc_private, flags);
 988 out:
 989         if (rc) {
 990                 if (skc->skc_flags & KMC_OFFSLAB)
 991                         list_for_each_entry_safe(sko, n, &sks->sks_free_list,
 992                                                  sko_list)
 993                                 kv_free(skc, sko->sko_addr, offslab_size);
 994
 995                 kv_free(skc, base, skc->skc_slab_size);
 996                 sks = NULL;
 997         }
 998
 999         SRETURN(sks);
1000 }
1001
1002 /*
1003  * Remove a slab from complete or partial list, it must be called with
1004  * the 'skc->skc_lock' held but the actual free must be performed
1005  * outside the lock to prevent deadlocking on vmem addresses.
1006  */
1007 static void
1008 spl_slab_free(spl_kmem_slab_t *sks,
1009               struct list_head *sks_list, struct list_head *sko_list)
1010 {
1011         spl_kmem_cache_t *skc;
1012         SENTRY;
1013
1014         ASSERT(sks->sks_magic == SKS_MAGIC);
1015         ASSERT(sks->sks_ref == 0);
1016
1017         skc = sks->sks_cache;
1018         ASSERT(skc->skc_magic == SKC_MAGIC);
1019         ASSERT(spin_is_locked(&skc->skc_lock));
1020
1021         /*
1022          * Update slab/objects counters in the cache, then remove the
1023          * slab from the skc->skc_partial_list.  Finally add the slab
1024          * and all its objects in to the private work lists where the
1025          * destructors will be called and the memory freed to the system.
1026          */
1027         skc->skc_obj_total -= sks->sks_objs;
1028         skc->skc_slab_total--;
1029         list_del(&sks->sks_list);
1030         list_add(&sks->sks_list, sks_list);
1031         list_splice_init(&sks->sks_free_list, sko_list);
1032
1033         SEXIT;
1034 }
1035
1036 /*
1037  * Traverses all the partial slabs attached to a cache and free those
1038  * which which are currently empty, and have not been touched for
1039  * skc_delay seconds to  avoid thrashing.  The count argument is
1040  * passed to optionally cap the number of slabs reclaimed, a count
1041  * of zero means try and reclaim everything.  When flag is set we
1042  * always free an available slab regardless of age.
1043  */
1044 static void
1045 spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
1046 {
1047         spl_kmem_slab_t *sks, *m;
1048         spl_kmem_obj_t *sko, *n;
1049         LIST_HEAD(sks_list);
1050         LIST_HEAD(sko_list);
1051         uint32_t size = 0;
1052         int i = 0;
1053         SENTRY;
1054
1055         /*
1056          * Move empty slabs and objects which have not been touched in
1057          * skc_delay seconds on to private lists to be freed outside
1058          * the spin lock.  This delay time is important to avoid thrashing
1059          * however when flag is set the delay will not be used.
1060          */
1061         spin_lock(&skc->skc_lock);
1062         list_for_each_entry_safe_reverse(sks,m,&skc->skc_partial_list,sks_list){
1063                 /*
1064                  * All empty slabs are at the end of skc->skc_partial_list,
1065                  * therefore once a non-empty slab is found we can stop
1066                  * scanning.  Additionally, stop when reaching the target
1067                  * reclaim 'count' if a non-zero threshhold is given.
1068                  */
1069                 if ((sks->sks_ref > 0) || (count && i > count))
1070                         break;
1071
1072                 if (time_after(jiffies,sks->sks_age+skc->skc_delay*HZ)||flag) {
1073                         spl_slab_free(sks, &sks_list, &sko_list);
1074                         i++;
1075                 }
1076         }
1077         spin_unlock(&skc->skc_lock);
1078
1079         /*
1080          * The following two loops ensure all the object destructors are
1081          * run, any offslab objects are freed, and the slabs themselves
1082          * are freed.  This is all done outside the skc->skc_lock since
1083          * this allows the destructor to sleep, and allows us to perform
1084          * a conditional reschedule when a freeing a large number of
1085          * objects and slabs back to the system.
1086          */
1087         if (skc->skc_flags & KMC_OFFSLAB)
1088                 size = spl_offslab_size(skc);
1089
1090         list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
1091                 ASSERT(sko->sko_magic == SKO_MAGIC);
1092
1093                 if (skc->skc_dtor)
1094                         skc->skc_dtor(sko->sko_addr, skc->skc_private);
1095
1096                 if (skc->skc_flags & KMC_OFFSLAB)
1097                         kv_free(skc, sko->sko_addr, size);
1098
1099                 cond_resched();
1100         }
1101
1102         list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
1103                 ASSERT(sks->sks_magic == SKS_MAGIC);
1104                 kv_free(skc, sks, skc->skc_slab_size);
1105                 cond_resched();
1106         }
1107
1108         SEXIT;
1109 }
1110
1111 /*
1112  * Called regularly on all caches to age objects out of the magazines
1113  * which have not been access in skc->skc_delay seconds.  This prevents
1114  * idle magazines from holding memory which might be better used by
1115  * other caches or parts of the system.  The delay is present to
1116  * prevent thrashing the magazine.
1117  */
1118 static void
1119 spl_magazine_age(void *data)
1120 {
1121         spl_kmem_magazine_t *skm =
1122                 spl_get_work_data(data, spl_kmem_magazine_t, skm_work.work);
1123         spl_kmem_cache_t *skc = skm->skm_cache;
1124         int i = smp_processor_id();
1125
1126         ASSERT(skm->skm_magic == SKM_MAGIC);
1127         ASSERT(skc->skc_magic == SKC_MAGIC);
1128         ASSERT(skc->skc_mag[i] == skm);
1129
1130         if (skm->skm_avail > 0 &&
1131             time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
1132                 (void)spl_cache_flush(skc, skm, skm->skm_refill);
1133
1134         if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags))
1135                 schedule_delayed_work_on(i, &skm->skm_work,
1136                                          skc->skc_delay / 3 * HZ);
1137 }
1138
1139 /*
1140  * Called regularly to keep a downward pressure on the size of idle
1141  * magazines and to release free slabs from the cache.  This function
1142  * never calls the registered reclaim function, that only occures
1143  * under memory pressure or with a direct call to spl_kmem_reap().
1144  */
1145 static void
1146 spl_cache_age(void *data)
1147 {
1148         spl_kmem_cache_t *skc =
1149                 spl_get_work_data(data, spl_kmem_cache_t, skc_work.work);
1150
1151         ASSERT(skc->skc_magic == SKC_MAGIC);
1152         spl_slab_reclaim(skc, skc->skc_reap, 0);
1153
1154         if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags))
1155                 schedule_delayed_work(&skc->skc_work, skc->skc_delay / 3 * HZ);
1156 }
1157
1158 /*
1159  * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
1160  * When on-slab we want to target SPL_KMEM_CACHE_OBJ_PER_SLAB.  However,
1161  * for very small objects we may end up with more than this so as not
1162  * to waste space in the minimal allocation of a single page.  Also for
1163  * very large objects we may use as few as SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN,
1164  * lower than this and we will fail.
1165  */
1166 static int
1167 spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
1168 {
1169         uint32_t sks_size, obj_size, max_size;
1170
1171         if (skc->skc_flags & KMC_OFFSLAB) {
1172                 *objs = SPL_KMEM_CACHE_OBJ_PER_SLAB;
1173                 *size = sizeof(spl_kmem_slab_t);
1174         } else {
1175                 sks_size = spl_sks_size(skc);
1176                 obj_size = spl_obj_size(skc);
1177
1178                 if (skc->skc_flags & KMC_KMEM)
1179                         max_size = ((uint32_t)1 << (MAX_ORDER-3)) * PAGE_SIZE;
1180                 else
1181                         max_size = (32 * 1024 * 1024);
1182
1183                 /* Power of two sized slab */
1184                 for (*size = PAGE_SIZE; *size <= max_size; *size *= 2) {
1185                         *objs = (*size - sks_size) / obj_size;
1186                         if (*objs >= SPL_KMEM_CACHE_OBJ_PER_SLAB)
1187                                 SRETURN(0);
1188                 }
1189
1190                 /*
1191                  * Unable to satisfy target objects per slab, fall back to
1192                  * allocating a maximally sized slab and assuming it can
1193                  * contain the minimum objects count use it.  If not fail.
1194                  */
1195                 *size = max_size;
1196                 *objs = (*size - sks_size) / obj_size;
1197                 if (*objs >= SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN)
1198                         SRETURN(0);
1199         }
1200
1201         SRETURN(-ENOSPC);
1202 }
1203
1204 /*
1205  * Make a guess at reasonable per-cpu magazine size based on the size of
1206  * each object and the cost of caching N of them in each magazine.  Long
1207  * term this should really adapt based on an observed usage heuristic.
1208  */
1209 static int
1210 spl_magazine_size(spl_kmem_cache_t *skc)
1211 {
1212         uint32_t obj_size = spl_obj_size(skc);
1213         int size;
1214         SENTRY;
1215
1216         /* Per-magazine sizes below assume a 4Kib page size */
1217         if (obj_size > (PAGE_SIZE * 256))
1218                 size = 4;  /* Minimum 4Mib per-magazine */
1219         else if (obj_size > (PAGE_SIZE * 32))
1220                 size = 16; /* Minimum 2Mib per-magazine */
1221         else if (obj_size > (PAGE_SIZE))
1222                 size = 64; /* Minimum 256Kib per-magazine */
1223         else if (obj_size > (PAGE_SIZE / 4))
1224                 size = 128; /* Minimum 128Kib per-magazine */
1225         else
1226                 size = 256;
1227
1228         SRETURN(size);
1229 }
1230
1231 /*
1232  * Allocate a per-cpu magazine to assoicate with a specific core.
1233  */
1234 static spl_kmem_magazine_t *
1235 spl_magazine_alloc(spl_kmem_cache_t *skc, int node)
1236 {
1237         spl_kmem_magazine_t *skm;
1238         int size = sizeof(spl_kmem_magazine_t) +
1239                    sizeof(void *) * skc->skc_mag_size;
1240         SENTRY;
1241
1242         skm = kmem_alloc_node(size, KM_SLEEP, node);
1243         if (skm) {
1244                 skm->skm_magic = SKM_MAGIC;
1245                 skm->skm_avail = 0;
1246                 skm->skm_size = skc->skc_mag_size;
1247                 skm->skm_refill = skc->skc_mag_refill;
1248                 skm->skm_cache = skc;
1249                 spl_init_delayed_work(&skm->skm_work, spl_magazine_age, skm);
1250                 skm->skm_age = jiffies;
1251         }
1252
1253         SRETURN(skm);
1254 }
1255
1256 /*
1257  * Free a per-cpu magazine assoicated with a specific core.
1258  */
1259 static void
1260 spl_magazine_free(spl_kmem_magazine_t *skm)
1261 {
1262         int size = sizeof(spl_kmem_magazine_t) +
1263                    sizeof(void *) * skm->skm_size;
1264
1265         SENTRY;
1266         ASSERT(skm->skm_magic == SKM_MAGIC);
1267         ASSERT(skm->skm_avail == 0);
1268
1269         kmem_free(skm, size);
1270         SEXIT;
1271 }
1272
1273 /*
1274  * Create all pre-cpu magazines of reasonable sizes.
1275  */
1276 static int
1277 spl_magazine_create(spl_kmem_cache_t *skc)
1278 {
1279         int i;
1280         SENTRY;
1281
1282         skc->skc_mag_size = spl_magazine_size(skc);
1283         skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
1284
1285         for_each_online_cpu(i) {
1286                 skc->skc_mag[i] = spl_magazine_alloc(skc, cpu_to_node(i));
1287                 if (!skc->skc_mag[i]) {
1288                         for (i--; i >= 0; i--)
1289                                 spl_magazine_free(skc->skc_mag[i]);
1290
1291                         SRETURN(-ENOMEM);
1292                 }
1293         }
1294
1295         /* Only after everything is allocated schedule magazine work */
1296         for_each_online_cpu(i)
1297                 schedule_delayed_work_on(i, &skc->skc_mag[i]->skm_work,
1298                                          skc->skc_delay / 3 * HZ);
1299
1300         SRETURN(0);
1301 }
1302
1303 /*
1304  * Destroy all pre-cpu magazines.
1305  */
1306 static void
1307 spl_magazine_destroy(spl_kmem_cache_t *skc)
1308 {
1309         spl_kmem_magazine_t *skm;
1310         int i;
1311         SENTRY;
1312
1313         for_each_online_cpu(i) {
1314                 skm = skc->skc_mag[i];
1315                 (void)spl_cache_flush(skc, skm, skm->skm_avail);
1316                 spl_magazine_free(skm);
1317         }
1318
1319         SEXIT;
1320 }
1321
1322 /*
1323  * Create a object cache based on the following arguments:
1324  * name         cache name
1325  * size         cache object size
1326  * align        cache object alignment
1327  * ctor         cache object constructor
1328  * dtor         cache object destructor
1329  * reclaim      cache object reclaim
1330  * priv         cache private data for ctor/dtor/reclaim
1331  * vmp          unused must be NULL
1332  * flags
1333  *      KMC_NOTOUCH     Disable cache object aging (unsupported)
1334  *      KMC_NODEBUG     Disable debugging (unsupported)
1335  *      KMC_NOMAGAZINE  Disable magazine (unsupported)
1336  *      KMC_NOHASH      Disable hashing (unsupported)
1337  *      KMC_QCACHE      Disable qcache (unsupported)
1338  *      KMC_KMEM        Force kmem backed cache
1339  *      KMC_VMEM        Force vmem backed cache
1340  *      KMC_OFFSLAB     Locate objects off the slab
1341  */
1342 spl_kmem_cache_t *
1343 spl_kmem_cache_create(char *name, size_t size, size_t align,
1344                       spl_kmem_ctor_t ctor,
1345                       spl_kmem_dtor_t dtor,
1346                       spl_kmem_reclaim_t reclaim,
1347                       void *priv, void *vmp, int flags)
1348 {
1349         spl_kmem_cache_t *skc;
1350         int rc, kmem_flags = KM_SLEEP;
1351         SENTRY;
1352
1353         ASSERTF(!(flags & KMC_NOMAGAZINE), "Bad KMC_NOMAGAZINE (%x)\n", flags);
1354         ASSERTF(!(flags & KMC_NOHASH), "Bad KMC_NOHASH (%x)\n", flags);
1355         ASSERTF(!(flags & KMC_QCACHE), "Bad KMC_QCACHE (%x)\n", flags);
1356         ASSERT(vmp == NULL);
1357
1358         /* We may be called when there is a non-zero preempt_count or
1359          * interrupts are disabled is which case we must not sleep.
1360          */
1361         if (current_thread_info()->preempt_count || irqs_disabled())
1362                 kmem_flags = KM_NOSLEEP;
1363
1364         /* Allocate memry for a new cache an initialize it.  Unfortunately,
1365          * this usually ends up being a large allocation of ~32k because
1366          * we need to allocate enough memory for the worst case number of
1367          * cpus in the magazine, skc_mag[NR_CPUS].  Because of this we
1368          * explicitly pass KM_NODEBUG to suppress the kmem warning */
1369         skc = (spl_kmem_cache_t *)kmem_zalloc(sizeof(*skc),
1370                                               kmem_flags | KM_NODEBUG);
1371         if (skc == NULL)
1372                 SRETURN(NULL);
1373
1374         skc->skc_magic = SKC_MAGIC;
1375         skc->skc_name_size = strlen(name) + 1;
1376         skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, kmem_flags);
1377         if (skc->skc_name == NULL) {
1378                 kmem_free(skc, sizeof(*skc));
1379                 SRETURN(NULL);
1380         }
1381         strncpy(skc->skc_name, name, skc->skc_name_size);
1382
1383         skc->skc_ctor = ctor;
1384         skc->skc_dtor = dtor;
1385         skc->skc_reclaim = reclaim;
1386         skc->skc_private = priv;
1387         skc->skc_vmp = vmp;
1388         skc->skc_flags = flags;
1389         skc->skc_obj_size = size;
1390         skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
1391         skc->skc_delay = SPL_KMEM_CACHE_DELAY;
1392         skc->skc_reap = SPL_KMEM_CACHE_REAP;
1393         atomic_set(&skc->skc_ref, 0);
1394
1395         INIT_LIST_HEAD(&skc->skc_list);
1396         INIT_LIST_HEAD(&skc->skc_complete_list);
1397         INIT_LIST_HEAD(&skc->skc_partial_list);
1398         spin_lock_init(&skc->skc_lock);
1399         skc->skc_slab_fail = 0;
1400         skc->skc_slab_create = 0;
1401         skc->skc_slab_destroy = 0;
1402         skc->skc_slab_total = 0;
1403         skc->skc_slab_alloc = 0;
1404         skc->skc_slab_max = 0;
1405         skc->skc_obj_total = 0;
1406         skc->skc_obj_alloc = 0;
1407         skc->skc_obj_max = 0;
1408
1409         if (align) {
1410                 VERIFY(ISP2(align));
1411                 VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN); /* Min alignment */
1412                 VERIFY3U(align, <=, PAGE_SIZE);            /* Max alignment */
1413                 skc->skc_obj_align = align;
1414         }
1415
1416         /* If none passed select a cache type based on object size */
1417         if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM))) {
1418                 if (spl_obj_size(skc) < (PAGE_SIZE / 8))
1419                         skc->skc_flags |= KMC_KMEM;
1420                 else
1421                         skc->skc_flags |= KMC_VMEM;
1422         }
1423
1424         rc = spl_slab_size(skc, &skc->skc_slab_objs, &skc->skc_slab_size);
1425         if (rc)
1426                 SGOTO(out, rc);
1427
1428         rc = spl_magazine_create(skc);
1429         if (rc)
1430                 SGOTO(out, rc);
1431
1432         spl_init_delayed_work(&skc->skc_work, spl_cache_age, skc);
1433         schedule_delayed_work(&skc->skc_work, skc->skc_delay / 3 * HZ);
1434
1435         down_write(&spl_kmem_cache_sem);
1436         list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
1437         up_write(&spl_kmem_cache_sem);
1438
1439         SRETURN(skc);
1440 out:
1441         kmem_free(skc->skc_name, skc->skc_name_size);
1442         kmem_free(skc, sizeof(*skc));
1443         SRETURN(NULL);
1444 }
1445 EXPORT_SYMBOL(spl_kmem_cache_create);
1446
1447 /*
1448  * Register a move callback to for cache defragmentation.
1449  * XXX: Unimplemented but harmless to stub out for now.
1450  */
1451 void
1452 spl_kmem_cache_set_move(kmem_cache_t *skc,
1453     kmem_cbrc_t (move)(void *, void *, size_t, void *))
1454 {
1455         ASSERT(move != NULL);
1456 }
1457 EXPORT_SYMBOL(spl_kmem_cache_set_move);
1458
1459 /*
1460  * Destroy a cache and all objects assoicated with the cache.
1461  */
1462 void
1463 spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
1464 {
1465         DECLARE_WAIT_QUEUE_HEAD(wq);
1466         int i;
1467         SENTRY;
1468
1469         ASSERT(skc->skc_magic == SKC_MAGIC);
1470
1471         down_write(&spl_kmem_cache_sem);
1472         list_del_init(&skc->skc_list);
1473         up_write(&spl_kmem_cache_sem);
1474
1475         /* Cancel any and wait for any pending delayed work */
1476         ASSERT(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1477         cancel_delayed_work(&skc->skc_work);
1478         for_each_online_cpu(i)
1479                 cancel_delayed_work(&skc->skc_mag[i]->skm_work);
1480
1481         flush_scheduled_work();
1482
1483         /* Wait until all current callers complete, this is mainly
1484          * to catch the case where a low memory situation triggers a
1485          * cache reaping action which races with this destroy. */
1486         wait_event(wq, atomic_read(&skc->skc_ref) == 0);
1487
1488         spl_magazine_destroy(skc);
1489         spl_slab_reclaim(skc, 0, 1);
1490         spin_lock(&skc->skc_lock);
1491
1492         /* Validate there are no objects in use and free all the
1493          * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
1494         ASSERT3U(skc->skc_slab_alloc, ==, 0);
1495         ASSERT3U(skc->skc_obj_alloc, ==, 0);
1496         ASSERT3U(skc->skc_slab_total, ==, 0);
1497         ASSERT3U(skc->skc_obj_total, ==, 0);
1498         ASSERT(list_empty(&skc->skc_complete_list));
1499
1500         kmem_free(skc->skc_name, skc->skc_name_size);
1501         spin_unlock(&skc->skc_lock);
1502
1503         kmem_free(skc, sizeof(*skc));
1504
1505         SEXIT;
1506 }
1507 EXPORT_SYMBOL(spl_kmem_cache_destroy);
1508
1509 /*
1510  * Allocate an object from a slab attached to the cache.  This is used to
1511  * repopulate the per-cpu magazine caches in batches when they run low.
1512  */
1513 static void *
1514 spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
1515 {
1516         spl_kmem_obj_t *sko;
1517
1518         ASSERT(skc->skc_magic == SKC_MAGIC);
1519         ASSERT(sks->sks_magic == SKS_MAGIC);
1520         ASSERT(spin_is_locked(&skc->skc_lock));
1521
1522         sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
1523         ASSERT(sko->sko_magic == SKO_MAGIC);
1524         ASSERT(sko->sko_addr != NULL);
1525
1526         /* Remove from sks_free_list */
1527         list_del_init(&sko->sko_list);
1528
1529         sks->sks_age = jiffies;
1530         sks->sks_ref++;
1531         skc->skc_obj_alloc++;
1532
1533         /* Track max obj usage statistics */
1534         if (skc->skc_obj_alloc > skc->skc_obj_max)
1535                 skc->skc_obj_max = skc->skc_obj_alloc;
1536
1537         /* Track max slab usage statistics */
1538         if (sks->sks_ref == 1) {
1539                 skc->skc_slab_alloc++;
1540
1541                 if (skc->skc_slab_alloc > skc->skc_slab_max)
1542                         skc->skc_slab_max = skc->skc_slab_alloc;
1543         }
1544
1545         return sko->sko_addr;
1546 }
1547
1548 /*
1549  * No available objects on any slabsi, create a new slab.  Since this
1550  * is an expensive operation we do it without holding the spinlock and
1551  * only briefly aquire it when we link in the fully allocated and
1552  * constructed slab.
1553  */
1554 static spl_kmem_slab_t *
1555 spl_cache_grow(spl_kmem_cache_t *skc, int flags)
1556 {
1557         spl_kmem_slab_t *sks;
1558         SENTRY;
1559
1560         ASSERT(skc->skc_magic == SKC_MAGIC);
1561         local_irq_enable();
1562         might_sleep();
1563
1564         /*
1565          * Before allocating a new slab check if the slab is being reaped.
1566          * If it is there is a good chance we can wait until it finishes
1567          * and then use one of the newly freed but not aged-out slabs.
1568          */
1569         if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
1570                 schedule();
1571                 SGOTO(out, sks= NULL);
1572         }
1573
1574         /* Allocate a new slab for the cache */
1575         sks = spl_slab_alloc(skc, flags | __GFP_NORETRY | KM_NODEBUG);
1576         if (sks == NULL)
1577                 SGOTO(out, sks = NULL);
1578
1579         /* Link the new empty slab in to the end of skc_partial_list. */
1580         spin_lock(&skc->skc_lock);
1581         skc->skc_slab_total++;
1582         skc->skc_obj_total += sks->sks_objs;
1583         list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1584         spin_unlock(&skc->skc_lock);
1585 out:
1586         local_irq_disable();
1587
1588         SRETURN(sks);
1589 }
1590
1591 /*
1592  * Refill a per-cpu magazine with objects from the slabs for this
1593  * cache.  Ideally the magazine can be repopulated using existing
1594  * objects which have been released, however if we are unable to
1595  * locate enough free objects new slabs of objects will be created.
1596  */
1597 static int
1598 spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
1599 {
1600         spl_kmem_slab_t *sks;
1601         int rc = 0, refill;
1602         SENTRY;
1603
1604         ASSERT(skc->skc_magic == SKC_MAGIC);
1605         ASSERT(skm->skm_magic == SKM_MAGIC);
1606
1607         refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
1608         spin_lock(&skc->skc_lock);
1609
1610         while (refill > 0) {
1611                 /* No slabs available we may need to grow the cache */
1612                 if (list_empty(&skc->skc_partial_list)) {
1613                         spin_unlock(&skc->skc_lock);
1614
1615                         sks = spl_cache_grow(skc, flags);
1616                         if (!sks)
1617                                 SGOTO(out, rc);
1618
1619                         /* Rescheduled to different CPU skm is not local */
1620                         if (skm != skc->skc_mag[smp_processor_id()])
1621                                 SGOTO(out, rc);
1622
1623                         /* Potentially rescheduled to the same CPU but
1624                          * allocations may have occured from this CPU while
1625                          * we were sleeping so recalculate max refill. */
1626                         refill = MIN(refill, skm->skm_size - skm->skm_avail);
1627
1628                         spin_lock(&skc->skc_lock);
1629                         continue;
1630                 }
1631
1632                 /* Grab the next available slab */
1633                 sks = list_entry((&skc->skc_partial_list)->next,
1634                                  spl_kmem_slab_t, sks_list);
1635                 ASSERT(sks->sks_magic == SKS_MAGIC);
1636                 ASSERT(sks->sks_ref < sks->sks_objs);
1637                 ASSERT(!list_empty(&sks->sks_free_list));
1638
1639                 /* Consume as many objects as needed to refill the requested
1640                  * cache.  We must also be careful not to overfill it. */
1641                 while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++rc) {
1642                         ASSERT(skm->skm_avail < skm->skm_size);
1643                         ASSERT(rc < skm->skm_size);
1644                         skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks);
1645                 }
1646
1647                 /* Move slab to skc_complete_list when full */
1648                 if (sks->sks_ref == sks->sks_objs) {
1649                         list_del(&sks->sks_list);
1650                         list_add(&sks->sks_list, &skc->skc_complete_list);
1651                 }
1652         }
1653
1654         spin_unlock(&skc->skc_lock);
1655 out:
1656         /* Returns the number of entries added to cache */
1657         SRETURN(rc);
1658 }
1659
1660 /*
1661  * Release an object back to the slab from which it came.
1662  */
1663 static void
1664 spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
1665 {
1666         spl_kmem_slab_t *sks = NULL;
1667         spl_kmem_obj_t *sko = NULL;
1668         SENTRY;
1669
1670         ASSERT(skc->skc_magic == SKC_MAGIC);
1671         ASSERT(spin_is_locked(&skc->skc_lock));
1672
1673         sko = spl_sko_from_obj(skc, obj);
1674         ASSERT(sko->sko_magic == SKO_MAGIC);
1675         sks = sko->sko_slab;
1676         ASSERT(sks->sks_magic == SKS_MAGIC);
1677         ASSERT(sks->sks_cache == skc);
1678         list_add(&sko->sko_list, &sks->sks_free_list);
1679
1680         sks->sks_age = jiffies;
1681         sks->sks_ref--;
1682         skc->skc_obj_alloc--;
1683
1684         /* Move slab to skc_partial_list when no longer full.  Slabs
1685          * are added to the head to keep the partial list is quasi-full
1686          * sorted order.  Fuller at the head, emptier at the tail. */
1687         if (sks->sks_ref == (sks->sks_objs - 1)) {
1688                 list_del(&sks->sks_list);
1689                 list_add(&sks->sks_list, &skc->skc_partial_list);
1690         }
1691
1692         /* Move emply slabs to the end of the partial list so
1693          * they can be easily found and freed during reclamation. */
1694         if (sks->sks_ref == 0) {
1695                 list_del(&sks->sks_list);
1696                 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1697                 skc->skc_slab_alloc--;
1698         }
1699
1700         SEXIT;
1701 }
1702
1703 /*
1704  * Release a batch of objects from a per-cpu magazine back to their
1705  * respective slabs.  This occurs when we exceed the magazine size,
1706  * are under memory pressure, when the cache is idle, or during
1707  * cache cleanup.  The flush argument contains the number of entries
1708  * to remove from the magazine.
1709  */
1710 static int
1711 spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
1712 {
1713         int i, count = MIN(flush, skm->skm_avail);
1714         SENTRY;
1715
1716         ASSERT(skc->skc_magic == SKC_MAGIC);
1717         ASSERT(skm->skm_magic == SKM_MAGIC);
1718
1719         /*
1720          * XXX: Currently we simply return objects from the magazine to
1721          * the slabs in fifo order.  The ideal thing to do from a memory
1722          * fragmentation standpoint is to cheaply determine the set of
1723          * objects in the magazine which will result in the largest
1724          * number of free slabs if released from the magazine.
1725          */
1726         spin_lock(&skc->skc_lock);
1727         for (i = 0; i < count; i++)
1728                 spl_cache_shrink(skc, skm->skm_objs[i]);
1729
1730         skm->skm_avail -= count;
1731         memmove(skm->skm_objs, &(skm->skm_objs[count]),
1732                 sizeof(void *) * skm->skm_avail);
1733
1734         spin_unlock(&skc->skc_lock);
1735
1736         SRETURN(count);
1737 }
1738
1739 /*
1740  * Allocate an object from the per-cpu magazine, or if the magazine
1741  * is empty directly allocate from a slab and repopulate the magazine.
1742  */
1743 void *
1744 spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
1745 {
1746         spl_kmem_magazine_t *skm;
1747         unsigned long irq_flags;
1748         void *obj = NULL;
1749         SENTRY;
1750
1751         ASSERT(skc->skc_magic == SKC_MAGIC);
1752         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1753         ASSERT(flags & KM_SLEEP);
1754         atomic_inc(&skc->skc_ref);
1755         local_irq_save(irq_flags);
1756
1757 restart:
1758         /* Safe to update per-cpu structure without lock, but
1759          * in the restart case we must be careful to reaquire
1760          * the local magazine since this may have changed
1761          * when we need to grow the cache. */
1762         skm = skc->skc_mag[smp_processor_id()];
1763         ASSERTF(skm->skm_magic == SKM_MAGIC, "%x != %x: %s/%p/%p %x/%x/%x\n",
1764                 skm->skm_magic, SKM_MAGIC, skc->skc_name, skc, skm,
1765                 skm->skm_size, skm->skm_refill, skm->skm_avail);
1766
1767         if (likely(skm->skm_avail)) {
1768                 /* Object available in CPU cache, use it */
1769                 obj = skm->skm_objs[--skm->skm_avail];
1770                 skm->skm_age = jiffies;
1771         } else {
1772                 /* Per-CPU cache empty, directly allocate from
1773                  * the slab and refill the per-CPU cache. */
1774                 (void)spl_cache_refill(skc, skm, flags);
1775                 SGOTO(restart, obj = NULL);
1776         }
1777
1778         local_irq_restore(irq_flags);
1779         ASSERT(obj);
1780         ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
1781
1782         /* Pre-emptively migrate object to CPU L1 cache */
1783         prefetchw(obj);
1784         atomic_dec(&skc->skc_ref);
1785
1786         SRETURN(obj);
1787 }
1788 EXPORT_SYMBOL(spl_kmem_cache_alloc);
1789
1790 /*
1791  * Free an object back to the local per-cpu magazine, there is no
1792  * guarantee that this is the same magazine the object was originally
1793  * allocated from.  We may need to flush entire from the magazine
1794  * back to the slabs to make space.
1795  */
1796 void
1797 spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
1798 {
1799         spl_kmem_magazine_t *skm;
1800         unsigned long flags;
1801         SENTRY;
1802
1803         ASSERT(skc->skc_magic == SKC_MAGIC);
1804         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1805         atomic_inc(&skc->skc_ref);
1806         local_irq_save(flags);
1807
1808         /* Safe to update per-cpu structure without lock, but
1809          * no remote memory allocation tracking is being performed
1810          * it is entirely possible to allocate an object from one
1811          * CPU cache and return it to another. */
1812         skm = skc->skc_mag[smp_processor_id()];
1813         ASSERT(skm->skm_magic == SKM_MAGIC);
1814
1815         /* Per-CPU cache full, flush it to make space */
1816         if (unlikely(skm->skm_avail >= skm->skm_size))
1817                 (void)spl_cache_flush(skc, skm, skm->skm_refill);
1818
1819         /* Available space in cache, use it */
1820         skm->skm_objs[skm->skm_avail++] = obj;
1821
1822         local_irq_restore(flags);
1823         atomic_dec(&skc->skc_ref);
1824
1825         SEXIT;
1826 }
1827 EXPORT_SYMBOL(spl_kmem_cache_free);
1828
1829 /*
1830  * The generic shrinker function for all caches.  Under linux a shrinker
1831  * may not be tightly coupled with a slab cache.  In fact linux always
1832  * systematically trys calling all registered shrinker callbacks which
1833  * report that they contain unused objects.  Because of this we only
1834  * register one shrinker function in the shim layer for all slab caches.
1835  * We always attempt to shrink all caches when this generic shrinker
1836  * is called.  The shrinker should return the number of free objects
1837  * in the cache when called with nr_to_scan == 0 but not attempt to
1838  * free any objects.  When nr_to_scan > 0 it is a request that nr_to_scan
1839  * objects should be freed, because Solaris semantics are to free
1840  * all available objects we may free more objects than requested.
1841  */
1842 #ifdef HAVE_3ARGS_SHRINKER_CALLBACK
1843 static int
1844 spl_kmem_cache_generic_shrinker(struct shrinker *shrinker_cb,
1845                                 int nr_to_scan, unsigned int gfp_mask)
1846 #else
1847 static int
1848 spl_kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask)
1849 #endif /* HAVE_3ARGS_SHRINKER_CALLBACK */
1850 {
1851         spl_kmem_cache_t *skc;
1852         int unused = 0;
1853
1854         down_read(&spl_kmem_cache_sem);
1855         list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
1856                 if (nr_to_scan)
1857                         spl_kmem_cache_reap_now(skc);
1858
1859                 /*
1860                  * Presume everything alloc'ed in reclaimable, this ensures
1861                  * we are called again with nr_to_scan > 0 so can try and
1862                  * reclaim.  The exact number is not important either so
1863                  * we forgo taking this already highly contented lock.
1864                  */
1865                 unused += skc->skc_obj_alloc;
1866         }
1867         up_read(&spl_kmem_cache_sem);
1868
1869         return (unused * sysctl_vfs_cache_pressure) / 100;
1870 }
1871
1872 /*
1873  * Call the registered reclaim function for a cache.  Depending on how
1874  * many and which objects are released it may simply repopulate the
1875  * local magazine which will then need to age-out.  Objects which cannot
1876  * fit in the magazine we will be released back to their slabs which will
1877  * also need to age out before being release.  This is all just best
1878  * effort and we do not want to thrash creating and destroying slabs.
1879  */
1880 void
1881 spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
1882 {
1883         SENTRY;
1884
1885         ASSERT(skc->skc_magic == SKC_MAGIC);
1886         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1887
1888         /* Prevent concurrent cache reaping when contended */
1889         if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
1890                 SEXIT;
1891                 return;
1892         }
1893
1894         atomic_inc(&skc->skc_ref);
1895
1896         if (skc->skc_reclaim)
1897                 skc->skc_reclaim(skc->skc_private);
1898
1899         spl_slab_reclaim(skc, skc->skc_reap, 0);
1900         clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
1901         atomic_dec(&skc->skc_ref);
1902
1903         SEXIT;
1904 }
1905 EXPORT_SYMBOL(spl_kmem_cache_reap_now);
1906
1907 /*
1908  * Reap all free slabs from all registered caches.
1909  */
1910 void
1911 spl_kmem_reap(void)
1912 {
1913 #ifdef HAVE_3ARGS_SHRINKER_CALLBACK
1914         spl_kmem_cache_generic_shrinker(NULL, KMC_REAP_CHUNK, GFP_KERNEL);
1915 #else
1916         spl_kmem_cache_generic_shrinker(KMC_REAP_CHUNK, GFP_KERNEL);
1917 #endif /* HAVE_3ARGS_SHRINKER_CALLBACK */
1918 }
1919 EXPORT_SYMBOL(spl_kmem_reap);
1920
1921 #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
1922 static char *
1923 spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
1924 {
1925         int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
1926         int i, flag = 1;
1927
1928         ASSERT(str != NULL && len >= 17);
1929         memset(str, 0, len);
1930
1931         /* Check for a fully printable string, and while we are at
1932          * it place the printable characters in the passed buffer. */
1933         for (i = 0; i < size; i++) {
1934                 str[i] = ((char *)(kd->kd_addr))[i];
1935                 if (isprint(str[i])) {
1936                         continue;
1937                 } else {
1938                         /* Minimum number of printable characters found
1939                          * to make it worthwhile to print this as ascii. */
1940                         if (i > min)
1941                                 break;
1942
1943                         flag = 0;
1944                         break;
1945                 }
1946         }
1947
1948         if (!flag) {
1949                 sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
1950                         *((uint8_t *)kd->kd_addr),
1951                         *((uint8_t *)kd->kd_addr + 2),
1952                         *((uint8_t *)kd->kd_addr + 4),
1953                         *((uint8_t *)kd->kd_addr + 6),
1954                         *((uint8_t *)kd->kd_addr + 8),
1955                         *((uint8_t *)kd->kd_addr + 10),
1956                         *((uint8_t *)kd->kd_addr + 12),
1957                         *((uint8_t *)kd->kd_addr + 14));
1958         }
1959
1960         return str;
1961 }
1962
1963 static int
1964 spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
1965 {
1966         int i;
1967         SENTRY;
1968
1969         spin_lock_init(lock);
1970         INIT_LIST_HEAD(list);
1971
1972         for (i = 0; i < size; i++)
1973                 INIT_HLIST_HEAD(&kmem_table[i]);
1974
1975         SRETURN(0);
1976 }
1977
1978 static void
1979 spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
1980 {
1981         unsigned long flags;
1982         kmem_debug_t *kd;
1983         char str[17];
1984         SENTRY;
1985
1986         spin_lock_irqsave(lock, flags);
1987         if (!list_empty(list))
1988                 printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
1989                        "size", "data", "func", "line");
1990
1991         list_for_each_entry(kd, list, kd_list)
1992                 printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
1993                        (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
1994                        kd->kd_func, kd->kd_line);
1995
1996         spin_unlock_irqrestore(lock, flags);
1997         SEXIT;
1998 }
1999 #else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
2000 #define spl_kmem_init_tracking(list, lock, size)
2001 #define spl_kmem_fini_tracking(list, lock)
2002 #endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
2003
2004 static void
2005 spl_kmem_init_globals(void)
2006 {
2007         struct zone *zone;
2008
2009         /* For now all zones are includes, it may be wise to restrict
2010          * this to normal and highmem zones if we see problems. */
2011         for_each_zone(zone) {
2012
2013                 if (!populated_zone(zone))
2014                         continue;
2015
2016                 minfree += min_wmark_pages(zone);
2017                 desfree += low_wmark_pages(zone);
2018                 lotsfree += high_wmark_pages(zone);
2019         }
2020
2021         /* Solaris default values */
2022         swapfs_minfree = MAX(2*1024*1024 >> PAGE_SHIFT, physmem >> 3);
2023         swapfs_reserve = MIN(4*1024*1024 >> PAGE_SHIFT, physmem >> 4);
2024 }
2025
2026 /*
2027  * Called at module init when it is safe to use spl_kallsyms_lookup_name()
2028  */
2029 int
2030 spl_kmem_init_kallsyms_lookup(void)
2031 {
2032 #ifndef HAVE_GET_VMALLOC_INFO
2033         get_vmalloc_info_fn = (get_vmalloc_info_t)
2034                 spl_kallsyms_lookup_name("get_vmalloc_info");
2035         if (!get_vmalloc_info_fn) {
2036                 printk(KERN_ERR "Error: Unknown symbol get_vmalloc_info\n");
2037                 return -EFAULT;
2038         }
2039 #endif /* HAVE_GET_VMALLOC_INFO */
2040
2041 #ifdef HAVE_PGDAT_HELPERS
2042 # ifndef HAVE_FIRST_ONLINE_PGDAT
2043         first_online_pgdat_fn = (first_online_pgdat_t)
2044                 spl_kallsyms_lookup_name("first_online_pgdat");
2045         if (!first_online_pgdat_fn) {
2046                 printk(KERN_ERR "Error: Unknown symbol first_online_pgdat\n");
2047                 return -EFAULT;
2048         }
2049 # endif /* HAVE_FIRST_ONLINE_PGDAT */
2050
2051 # ifndef HAVE_NEXT_ONLINE_PGDAT
2052         next_online_pgdat_fn = (next_online_pgdat_t)
2053                 spl_kallsyms_lookup_name("next_online_pgdat");
2054         if (!next_online_pgdat_fn) {
2055                 printk(KERN_ERR "Error: Unknown symbol next_online_pgdat\n");
2056                 return -EFAULT;
2057         }
2058 # endif /* HAVE_NEXT_ONLINE_PGDAT */
2059
2060 # ifndef HAVE_NEXT_ZONE
2061         next_zone_fn = (next_zone_t)
2062                 spl_kallsyms_lookup_name("next_zone");
2063         if (!next_zone_fn) {
2064                 printk(KERN_ERR "Error: Unknown symbol next_zone\n");
2065                 return -EFAULT;
2066         }
2067 # endif /* HAVE_NEXT_ZONE */
2068
2069 #else /* HAVE_PGDAT_HELPERS */
2070
2071 # ifndef HAVE_PGDAT_LIST
2072         pgdat_list_addr = *(struct pglist_data **)
2073                 spl_kallsyms_lookup_name("pgdat_list");
2074         if (!pgdat_list_addr) {
2075                 printk(KERN_ERR "Error: Unknown symbol pgdat_list\n");
2076                 return -EFAULT;
2077         }
2078 # endif /* HAVE_PGDAT_LIST */
2079 #endif /* HAVE_PGDAT_HELPERS */
2080
2081 #if defined(NEED_GET_ZONE_COUNTS) && !defined(HAVE_GET_ZONE_COUNTS)
2082         get_zone_counts_fn = (get_zone_counts_t)
2083                 spl_kallsyms_lookup_name("get_zone_counts");
2084         if (!get_zone_counts_fn) {
2085                 printk(KERN_ERR "Error: Unknown symbol get_zone_counts\n");
2086                 return -EFAULT;
2087         }
2088 #endif  /* NEED_GET_ZONE_COUNTS && !HAVE_GET_ZONE_COUNTS */
2089
2090         /*
2091          * It is now safe to initialize the global tunings which rely on
2092          * the use of the for_each_zone() macro.  This macro in turns
2093          * depends on the *_pgdat symbols which are now available.
2094          */
2095         spl_kmem_init_globals();
2096
2097 #ifndef HAVE_INVALIDATE_INODES
2098         invalidate_inodes_fn = (invalidate_inodes_t)
2099         spl_kallsyms_lookup_name("invalidate_inodes");
2100         if (!invalidate_inodes_fn) {
2101                 printk(KERN_ERR "Error: Unknown symbol invalidate_inodes\n");
2102                 return -EFAULT;
2103         }
2104 #endif /* HAVE_INVALIDATE_INODES */
2105
2106         return 0;
2107 }
2108
2109 int
2110 spl_kmem_init(void)
2111 {
2112         int rc = 0;
2113         SENTRY;
2114
2115         init_rwsem(&spl_kmem_cache_sem);
2116         INIT_LIST_HEAD(&spl_kmem_cache_list);
2117
2118 #ifdef HAVE_SET_SHRINKER
2119         spl_kmem_cache_shrinker = set_shrinker(KMC_DEFAULT_SEEKS,
2120                                                spl_kmem_cache_generic_shrinker);
2121         if (spl_kmem_cache_shrinker == NULL)
2122                 SRETURN(rc = -ENOMEM);
2123 #else
2124         register_shrinker(&spl_kmem_cache_shrinker);
2125 #endif
2126
2127 #ifdef DEBUG_KMEM
2128         kmem_alloc_used_set(0);
2129         vmem_alloc_used_set(0);
2130
2131         spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
2132         spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE);
2133 #endif
2134         SRETURN(rc);
2135 }
2136
2137 void
2138 spl_kmem_fini(void)
2139 {
2140 #ifdef DEBUG_KMEM
2141         /* Display all unreclaimed memory addresses, including the
2142          * allocation size and the first few bytes of what's located
2143          * at that address to aid in debugging.  Performance is not
2144          * a serious concern here since it is module unload time. */
2145         if (kmem_alloc_used_read() != 0)
2146                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
2147                     "kmem leaked %ld/%ld bytes\n",
2148                     kmem_alloc_used_read(), kmem_alloc_max);
2149
2150
2151         if (vmem_alloc_used_read() != 0)
2152                 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
2153                     "vmem leaked %ld/%ld bytes\n",
2154                     vmem_alloc_used_read(), vmem_alloc_max);
2155
2156         spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
2157         spl_kmem_fini_tracking(&vmem_list, &vmem_lock);
2158 #endif /* DEBUG_KMEM */
2159         SENTRY;
2160
2161 #ifdef HAVE_SET_SHRINKER
2162         remove_shrinker(spl_kmem_cache_shrinker);
2163 #else
2164         unregister_shrinker(&spl_kmem_cache_shrinker);
2165 #endif
2166
2167         SEXIT;
2168 }