module/spl/spl-kmem.c

   1 /*
   2  *  This file is part of the SPL: Solaris Porting Layer.
   3  *
   4  *  Copyright (c) 2008 Lawrence Livermore National Security, LLC.
   5  *  Produced at Lawrence Livermore National Laboratory
   6  *  Written by:
   7  *          Brian Behlendorf <behlendorf1@llnl.gov>,
   8  *          Herb Wartens <wartens2@llnl.gov>,
   9  *          Jim Garlick <garlick@llnl.gov>
  10  *  UCRL-CODE-235197
  11  *
  12  *  This is free software; you can redistribute it and/or modify it
  13  *  under the terms of the GNU General Public License as published by
  14  *  the Free Software Foundation; either version 2 of the License, or
  15  *  (at your option) any later version.
  16  *
  17  *  This is distributed in the hope that it will be useful, but WITHOUT
  18  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  19  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  20  *  for more details.
  21  *
  22  *  You should have received a copy of the GNU General Public License along
  23  *  with this program; if not, write to the Free Software Foundation, Inc.,
  24  *  51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  */
  26
  27 #include <sys/kmem.h>
  28
  29 #ifdef DEBUG_SUBSYSTEM
  30 # undef DEBUG_SUBSYSTEM
  31 #endif
  32
  33 #define DEBUG_SUBSYSTEM S_KMEM
  34
  35 /*
  36  * The minimum amount of memory measured in pages to be free at all
  37  * times on the system.  This is similar to Linux's zone->pages_min
  38  * multiplied by the number of zones and is sized based on that.
  39  */
  40 pgcnt_t minfree = 0;
  41 EXPORT_SYMBOL(minfree);
  42
  43 /*
  44  * The desired amount of memory measured in pages to be free at all
  45  * times on the system.  This is similar to Linux's zone->pages_low
  46  * multiplied by the number of zones and is sized based on that.
  47  * Assuming all zones are being used roughly equally, when we drop
  48  * below this threshold async page reclamation is triggered.
  49  */
  50 pgcnt_t desfree = 0;
  51 EXPORT_SYMBOL(desfree);
  52
  53 /*
  54  * When above this amount of memory measured in pages the system is
  55  * determined to have enough free memory.  This is similar to Linux's
  56  * zone->pages_high multiplied by the number of zones and is sized based
  57  * on that.  Assuming all zones are being used roughly equally, when
  58  * async page reclamation reaches this threshold it stops.
  59  */
  60 pgcnt_t lotsfree = 0;
  61 EXPORT_SYMBOL(lotsfree);
  62
  63 /* Unused always 0 in this implementation */
  64 pgcnt_t needfree = 0;
  65 EXPORT_SYMBOL(needfree);
  66
  67 pgcnt_t swapfs_minfree = 0;
  68 EXPORT_SYMBOL(swapfs_minfree);
  69
  70 pgcnt_t swapfs_reserve = 0;
  71 EXPORT_SYMBOL(swapfs_reserve);
  72
  73 vmem_t *heap_arena = NULL;
  74 EXPORT_SYMBOL(heap_arena);
  75
  76 vmem_t *zio_alloc_arena = NULL;
  77 EXPORT_SYMBOL(zio_alloc_arena);
  78
  79 vmem_t *zio_arena = NULL;
  80 EXPORT_SYMBOL(zio_arena);
  81
  82 #ifndef HAVE_FIRST_ONLINE_PGDAT
  83 struct pglist_data *
  84 first_online_pgdat(void)
  85 {
  86         return NODE_DATA(first_online_node);
  87 }
  88 EXPORT_SYMBOL(first_online_pgdat);
  89 #endif /* HAVE_FIRST_ONLINE_PGDAT */
  90
  91 #ifndef HAVE_NEXT_ONLINE_PGDAT
  92 struct pglist_data *
  93 next_online_pgdat(struct pglist_data *pgdat)
  94 {
  95         int nid = next_online_node(pgdat->node_id);
  96
  97         if (nid == MAX_NUMNODES)
  98                 return NULL;
  99
 100         return NODE_DATA(nid);
 101 }
 102 EXPORT_SYMBOL(next_online_pgdat);
 103 #endif /* HAVE_NEXT_ONLINE_PGDAT */
 104
 105 #ifndef HAVE_NEXT_ZONE
 106 struct zone *
 107 next_zone(struct zone *zone)
 108 {
 109         pg_data_t *pgdat = zone->zone_pgdat;
 110
 111         if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
 112         zone++;
 113         else {
 114                 pgdat = next_online_pgdat(pgdat);
 115                 if (pgdat)
 116                         zone = pgdat->node_zones;
 117                 else
 118                         zone = NULL;
 119         }
 120         return zone;
 121 }
 122 EXPORT_SYMBOL(next_zone);
 123 #endif /* HAVE_NEXT_ZONE */
 124
 125 #ifndef HAVE_GET_ZONE_COUNTS
 126 void
 127 __get_zone_counts(unsigned long *active, unsigned long *inactive,
 128                   unsigned long *free, struct pglist_data *pgdat)
 129 {
 130         struct zone *zones = pgdat->node_zones;
 131         int i;
 132
 133         *active = 0;
 134         *inactive = 0;
 135         *free = 0;
 136         for (i = 0; i < MAX_NR_ZONES; i++) {
 137                 *active += zones[i].nr_active;
 138                 *inactive += zones[i].nr_inactive;
 139                 *free += zones[i].free_pages;
 140         }
 141 }
 142
 143 void
 144 get_zone_counts(unsigned long *active, unsigned long *inactive,
 145                 unsigned long *free)
 146 {
 147         struct pglist_data *pgdat;
 148
 149         *active = 0;
 150         *inactive = 0;
 151         *free = 0;
 152         for_each_online_pgdat(pgdat) {
 153                 unsigned long l, m, n;
 154                 __get_zone_counts(&l, &m, &n, pgdat);
 155                 *active += l;
 156                 *inactive += m;
 157                 *free += n;
 158         }
 159 }
 160 EXPORT_SYMBOL(get_zone_counts);
 161 #endif /* HAVE_GET_ZONE_COUNTS */
 162
 163 pgcnt_t
 164 spl_kmem_availrmem(void)
 165 {
 166         unsigned long active;
 167         unsigned long inactive;
 168         unsigned long free;
 169
 170         get_zone_counts(&active, &inactive, &free);
 171
 172         /* The amount of easily available memory */
 173         return free + inactive;
 174 }
 175 EXPORT_SYMBOL(spl_kmem_availrmem);
 176
 177 size_t
 178 vmem_size(vmem_t *vmp, int typemask)
 179 {
 180         /* Arena's unsupported */
 181         ASSERT(vmp == NULL);
 182         ASSERT(typemask & (VMEM_ALLOC | VMEM_FREE));
 183
 184         return 0;
 185 }
 186 EXPORT_SYMBOL(vmem_size);
 187
 188
 189 /*
 190  * Memory allocation interfaces and debugging for basic kmem_*
 191  * and vmem_* style memory allocation.  When DEBUG_KMEM is enabled
 192  * all allocations will be tracked when they are allocated and
 193  * freed.  When the SPL module is unloaded a list of all leaked
 194  * addresses and where they were allocated will be dumped to the
 195  * console.  Enabling this feature has a significant impact on
 196  * performance but it makes finding memory leaks straightforward.
 197  */
 198 #ifdef DEBUG_KMEM
 199 /* Shim layer memory accounting */
 200 atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
 201 unsigned long long kmem_alloc_max = 0;
 202 atomic64_t vmem_alloc_used = ATOMIC64_INIT(0);
 203 unsigned long long vmem_alloc_max = 0;
 204 int kmem_warning_flag = 1;
 205
 206 EXPORT_SYMBOL(kmem_alloc_used);
 207 EXPORT_SYMBOL(kmem_alloc_max);
 208 EXPORT_SYMBOL(vmem_alloc_used);
 209 EXPORT_SYMBOL(vmem_alloc_max);
 210 EXPORT_SYMBOL(kmem_warning_flag);
 211
 212 # ifdef DEBUG_KMEM_TRACKING
 213
 214 /* XXX - Not too surprisingly, with debugging enabled the xmem_locks are
 215  * very highly contended, particularly on xfree().  If we want to run with
 216  * this detailed debugging enabled for anything other than debugging we need
 217  * to minimize the contention by moving to a lock per xmem_table entry model.
 218  */
 219
 220 #  define KMEM_HASH_BITS          10
 221 #  define KMEM_TABLE_SIZE         (1 << KMEM_HASH_BITS)
 222
 223 #  define VMEM_HASH_BITS          10
 224 #  define VMEM_TABLE_SIZE         (1 << VMEM_HASH_BITS)
 225
 226 typedef struct kmem_debug {
 227         struct hlist_node kd_hlist;     /* Hash node linkage */
 228         struct list_head kd_list;       /* List of all allocations */
 229         void *kd_addr;                  /* Allocation pointer */
 230         size_t kd_size;                 /* Allocation size */
 231         const char *kd_func;            /* Allocation function */
 232         int kd_line;                    /* Allocation line */
 233 } kmem_debug_t;
 234
 235 spinlock_t kmem_lock;
 236 struct hlist_head kmem_table[KMEM_TABLE_SIZE];
 237 struct list_head kmem_list;
 238
 239 spinlock_t vmem_lock;
 240 struct hlist_head vmem_table[VMEM_TABLE_SIZE];
 241 struct list_head vmem_list;
 242
 243 EXPORT_SYMBOL(kmem_lock);
 244 EXPORT_SYMBOL(kmem_table);
 245 EXPORT_SYMBOL(kmem_list);
 246
 247 EXPORT_SYMBOL(vmem_lock);
 248 EXPORT_SYMBOL(vmem_table);
 249 EXPORT_SYMBOL(vmem_list);
 250 # endif
 251
 252 int kmem_set_warning(int flag) { return (kmem_warning_flag = !!flag); }
 253 #else
 254 int kmem_set_warning(int flag) { return 0; }
 255 #endif
 256 EXPORT_SYMBOL(kmem_set_warning);
 257
 258 /*
 259  * Slab allocation interfaces
 260  *
 261  * While the Linux slab implementation was inspired by the Solaris
 262  * implementation I cannot use it to emulate the Solaris APIs.  I
 263  * require two features which are not provided by the Linux slab.
 264  *
 265  * 1) Constructors AND destructors.  Recent versions of the Linux
 266  *    kernel have removed support for destructors.  This is a deal
 267  *    breaker for the SPL which contains particularly expensive
 268  *    initializers for mutex's, condition variables, etc.  We also
 269  *    require a minimal level of cleanup for these data types unlike
 270  *    many Linux data types which do not need to be explicitly destroyed.
 271  *
 272  * 2) Virtual address space backed slab.  Callers of the Solaris slab
 273  *    expect it to work well for both small and very large allocations.
 274  *    Because of memory fragmentation the Linux slab which is backed
 275  *    by kmalloc'ed memory performs very badly when confronted with
 276  *    large numbers of large allocations.  Basing the slab on the
 277  *    virtual address space removes the need for contiguous pages
 278  *    and greatly improves performance for large allocations.
 279  *
 280  * For these reasons, the SPL has its own slab implementation with
 281  * the needed features.  It is not as highly optimized as either the
 282  * Solaris or Linux slabs, but it should get me most of what is
 283  * needed until it can be optimized or obsoleted by another approach.
 284  *
 285  * One serious concern I do have about this method is the relatively
 286  * small virtual address space on 32bit arches.  This will seriously
 287  * constrain the size of the slab caches and their performance.
 288  *
 289  * XXX: Improve the partial slab list by carefully maintaining a
 290  *      strict ordering of fullest to emptiest slabs based on
 291  *      the slab reference count.  This guarantees that when freeing
 292  *      slabs back to the system we need only linearly traverse the
 293  *      last N slabs in the list to discover all the freeable slabs.
 294  *
 295  * XXX: NUMA awareness for optionally allocating memory close to a
 296  *      particular core.  This can be advantageous if you know the slab
 297  *      object will be short lived and primarily accessed from one core.
 298  *
 299  * XXX: Slab coloring may also yield performance improvements and would
 300  *      be desirable to implement.
 301  */
 302
 303 struct list_head spl_kmem_cache_list;   /* List of caches */
 304 struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
 305
 306 static int spl_cache_flush(spl_kmem_cache_t *skc,
 307                            spl_kmem_magazine_t *skm, int flush);
 308
 309 #ifdef HAVE_SET_SHRINKER
 310 static struct shrinker *spl_kmem_cache_shrinker;
 311 #else
 312 static int spl_kmem_cache_generic_shrinker(int nr_to_scan,
 313                                            unsigned int gfp_mask);
 314 static struct shrinker spl_kmem_cache_shrinker = {
 315         .shrink = spl_kmem_cache_generic_shrinker,
 316         .seeks = KMC_DEFAULT_SEEKS,
 317 };
 318 #endif
 319
 320 #ifdef DEBUG_KMEM
 321 # ifdef DEBUG_KMEM_TRACKING
 322
 323 static kmem_debug_t *
 324 kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits,
 325                 void *addr)
 326 {
 327         struct hlist_head *head;
 328         struct hlist_node *node;
 329         struct kmem_debug *p;
 330         unsigned long flags;
 331         ENTRY;
 332
 333         spin_lock_irqsave(lock, flags);
 334
 335         head = &table[hash_ptr(addr, bits)];
 336         hlist_for_each_entry_rcu(p, node, head, kd_hlist) {
 337                 if (p->kd_addr == addr) {
 338                         hlist_del_init(&p->kd_hlist);
 339                         list_del_init(&p->kd_list);
 340                         spin_unlock_irqrestore(lock, flags);
 341                         return p;
 342                 }
 343         }
 344
 345         spin_unlock_irqrestore(lock, flags);
 346
 347         RETURN(NULL);
 348 }
 349
 350 void *
 351 kmem_alloc_track(size_t size, int flags, const char *func, int line,
 352     int node_alloc, int node)
 353 {
 354         void *ptr = NULL;
 355         kmem_debug_t *dptr;
 356         unsigned long irq_flags;
 357         ENTRY;
 358
 359         dptr = (kmem_debug_t *) kmalloc(sizeof(kmem_debug_t),
 360             flags & ~__GFP_ZERO);
 361
 362         if (dptr == NULL) {
 363                 CWARN("kmem_alloc(%ld, 0x%x) debug failed\n",
 364                     sizeof(kmem_debug_t), flags);
 365         } else {
 366                 /* Marked unlikely because we should never be doing this,
 367                  * we tolerate to up 2 pages but a single page is best.   */
 368                 if (unlikely((size) > (PAGE_SIZE * 2)) && kmem_warning_flag)
 369                         CWARN("Large kmem_alloc(%llu, 0x%x) (%lld/%llu)\n",
 370                             (unsigned long long) size, flags,
 371                             atomic64_read(&kmem_alloc_used), kmem_alloc_max);
 372
 373                 /* We use kstrdup() below because the string pointed to by
 374                  * __FUNCTION__ might not be available by the time we want
 375                  * to print it since the module might have been unloaded. */
 376                 dptr->kd_func = kstrdup(func, flags & ~__GFP_ZERO);
 377                 if (unlikely(dptr->kd_func == NULL)) {
 378                         kfree(dptr);
 379                         CWARN("kstrdup() failed in kmem_alloc(%llu, 0x%x) "
 380                             "(%lld/%llu)\n", (unsigned long long) size, flags,
 381                             atomic64_read(&kmem_alloc_used), kmem_alloc_max);
 382                         goto out;
 383                 }
 384
 385                 /* Use the correct allocator */
 386                 if (node_alloc) {
 387                         ASSERT(!(flags & __GFP_ZERO));
 388                         ptr = kmalloc_node(size, flags, node);
 389                 } else if (flags & __GFP_ZERO) {
 390                         ptr = kzalloc(size, flags & ~__GFP_ZERO);
 391                 } else {
 392                         ptr = kmalloc(size, flags);
 393                 }
 394
 395                 if (unlikely(ptr == NULL)) {
 396                         kfree(dptr->kd_func);
 397                         kfree(dptr);
 398                         CWARN("kmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
 399                             (unsigned long long) size, flags,
 400                             atomic64_read(&kmem_alloc_used), kmem_alloc_max);
 401                         goto out;
 402                 }
 403
 404                 atomic64_add(size, &kmem_alloc_used);
 405                 if (unlikely(atomic64_read(&kmem_alloc_used) >
 406                     kmem_alloc_max))
 407                         kmem_alloc_max =
 408                             atomic64_read(&kmem_alloc_used);
 409
 410                 INIT_HLIST_NODE(&dptr->kd_hlist);
 411                 INIT_LIST_HEAD(&dptr->kd_list);
 412
 413                 dptr->kd_addr = ptr;
 414                 dptr->kd_size = size;
 415                 dptr->kd_line = line;
 416
 417                 spin_lock_irqsave(&kmem_lock, irq_flags);
 418                 hlist_add_head_rcu(&dptr->kd_hlist,
 419                     &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
 420                 list_add_tail(&dptr->kd_list, &kmem_list);
 421                 spin_unlock_irqrestore(&kmem_lock, irq_flags);
 422
 423                 CDEBUG_LIMIT(D_INFO, "kmem_alloc(%llu, 0x%x) = %p "
 424                     "(%lld/%llu)\n", (unsigned long long) size, flags,
 425                     ptr, atomic64_read(&kmem_alloc_used),
 426                     kmem_alloc_max);
 427         }
 428 out:
 429         RETURN(ptr);
 430 }
 431 EXPORT_SYMBOL(kmem_alloc_track);
 432
 433 void
 434 kmem_free_track(void *ptr, size_t size)
 435 {
 436         kmem_debug_t *dptr;
 437         ENTRY;
 438
 439         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 440             (unsigned long long) size);
 441
 442         dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
 443
 444         ASSERT(dptr); /* Must exist in hash due to kmem_alloc() */
 445
 446         /* Size must match */
 447         ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
 448             "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
 449             (unsigned long long) size, dptr->kd_func, dptr->kd_line);
 450
 451         atomic64_sub(size, &kmem_alloc_used);
 452
 453         CDEBUG_LIMIT(D_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 454             (unsigned long long) size, atomic64_read(&kmem_alloc_used),
 455             kmem_alloc_max);
 456
 457         kfree(dptr->kd_func);
 458
 459         memset(dptr, 0x5a, sizeof(kmem_debug_t));
 460         kfree(dptr);
 461
 462         memset(ptr, 0x5a, size);
 463         kfree(ptr);
 464
 465         EXIT;
 466 }
 467 EXPORT_SYMBOL(kmem_free_track);
 468
 469 void *
 470 vmem_alloc_track(size_t size, int flags, const char *func, int line)
 471 {
 472         void *ptr = NULL;
 473         kmem_debug_t *dptr;
 474         unsigned long irq_flags;
 475         ENTRY;
 476
 477         ASSERT(flags & KM_SLEEP);
 478
 479         dptr = (kmem_debug_t *) kmalloc(sizeof(kmem_debug_t), flags);
 480         if (dptr == NULL) {
 481                 CWARN("vmem_alloc(%ld, 0x%x) debug failed\n",
 482                     sizeof(kmem_debug_t), flags);
 483         } else {
 484                 /* We use kstrdup() below because the string pointed to by
 485                  * __FUNCTION__ might not be available by the time we want
 486                  * to print it, since the module might have been unloaded. */
 487                 dptr->kd_func = kstrdup(func, flags & ~__GFP_ZERO);
 488                 if (unlikely(dptr->kd_func == NULL)) {
 489                         kfree(dptr);
 490                         CWARN("kstrdup() failed in vmem_alloc(%llu, 0x%x) "
 491                             "(%lld/%llu)\n", (unsigned long long) size, flags,
 492                             atomic64_read(&vmem_alloc_used), vmem_alloc_max);
 493                         goto out;
 494                 }
 495
 496                 ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO,
 497                     PAGE_KERNEL);
 498
 499                 if (unlikely(ptr == NULL)) {
 500                         kfree(dptr->kd_func);
 501                         kfree(dptr);
 502                         CWARN("vmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
 503                             (unsigned long long) size, flags,
 504                             atomic64_read(&vmem_alloc_used), vmem_alloc_max);
 505                         goto out;
 506                 }
 507
 508                 if (flags & __GFP_ZERO)
 509                         memset(ptr, 0, size);
 510
 511                 atomic64_add(size, &vmem_alloc_used);
 512                 if (unlikely(atomic64_read(&vmem_alloc_used) >
 513                     vmem_alloc_max))
 514                         vmem_alloc_max =
 515                             atomic64_read(&vmem_alloc_used);
 516
 517                 INIT_HLIST_NODE(&dptr->kd_hlist);
 518                 INIT_LIST_HEAD(&dptr->kd_list);
 519
 520                 dptr->kd_addr = ptr;
 521                 dptr->kd_size = size;
 522                 dptr->kd_line = line;
 523
 524                 spin_lock_irqsave(&vmem_lock, irq_flags);
 525                 hlist_add_head_rcu(&dptr->kd_hlist,
 526                     &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]);
 527                 list_add_tail(&dptr->kd_list, &vmem_list);
 528                 spin_unlock_irqrestore(&vmem_lock, irq_flags);
 529
 530                 CDEBUG_LIMIT(D_INFO, "vmem_alloc(%llu, 0x%x) = %p "
 531                     "(%lld/%llu)\n", (unsigned long long) size, flags,
 532                     ptr, atomic64_read(&vmem_alloc_used),
 533                     vmem_alloc_max);
 534         }
 535 out:
 536         RETURN(ptr);
 537 }
 538 EXPORT_SYMBOL(vmem_alloc_track);
 539
 540 void
 541 vmem_free_track(void *ptr, size_t size)
 542 {
 543         kmem_debug_t *dptr;
 544         ENTRY;
 545
 546         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 547             (unsigned long long) size);
 548
 549         dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr);
 550         ASSERT(dptr); /* Must exist in hash due to vmem_alloc() */
 551
 552         /* Size must match */
 553         ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
 554             "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
 555             (unsigned long long) size, dptr->kd_func, dptr->kd_line);
 556
 557         atomic64_sub(size, &vmem_alloc_used);
 558         CDEBUG_LIMIT(D_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 559             (unsigned long long) size, atomic64_read(&vmem_alloc_used),
 560             vmem_alloc_max);
 561
 562         kfree(dptr->kd_func);
 563
 564         memset(dptr, 0x5a, sizeof(kmem_debug_t));
 565         kfree(dptr);
 566
 567         memset(ptr, 0x5a, size);
 568         vfree(ptr);
 569
 570         EXIT;
 571 }
 572 EXPORT_SYMBOL(vmem_free_track);
 573
 574 # else /* DEBUG_KMEM_TRACKING */
 575
 576 void *
 577 kmem_alloc_debug(size_t size, int flags, const char *func, int line,
 578     int node_alloc, int node)
 579 {
 580         void *ptr;
 581         ENTRY;
 582
 583         /* Marked unlikely because we should never be doing this,
 584          * we tolerate to up 2 pages but a single page is best.   */
 585         if (unlikely(size > (PAGE_SIZE * 2)) && kmem_warning_flag)
 586                 CWARN("Large kmem_alloc(%llu, 0x%x) (%lld/%llu)\n",
 587                     (unsigned long long) size, flags,
 588                     atomic64_read(&kmem_alloc_used), kmem_alloc_max);
 589
 590         /* Use the correct allocator */
 591         if (node_alloc) {
 592                 ASSERT(!(flags & __GFP_ZERO));
 593                 ptr = kmalloc_node(size, flags, node);
 594         } else if (flags & __GFP_ZERO) {
 595                 ptr = kzalloc(size, flags & (~__GFP_ZERO));
 596         } else {
 597                 ptr = kmalloc(size, flags);
 598         }
 599
 600         if (ptr == NULL) {
 601                 CWARN("kmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
 602                     (unsigned long long) size, flags,
 603                     atomic64_read(&kmem_alloc_used), kmem_alloc_max);
 604         } else {
 605                 atomic64_add(size, &kmem_alloc_used);
 606                 if (unlikely(atomic64_read(&kmem_alloc_used) > kmem_alloc_max))
 607                         kmem_alloc_max = atomic64_read(&kmem_alloc_used);
 608
 609                 CDEBUG_LIMIT(D_INFO, "kmem_alloc(%llu, 0x%x) = %p "
 610                        "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
 611                        atomic64_read(&kmem_alloc_used), kmem_alloc_max);
 612         }
 613         RETURN(ptr);
 614 }
 615 EXPORT_SYMBOL(kmem_alloc_debug);
 616
 617 void
 618 kmem_free_debug(void *ptr, size_t size)
 619 {
 620         ENTRY;
 621
 622         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 623             (unsigned long long) size);
 624
 625         atomic64_sub(size, &kmem_alloc_used);
 626
 627         CDEBUG_LIMIT(D_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 628             (unsigned long long) size, atomic64_read(&kmem_alloc_used),
 629             kmem_alloc_max);
 630
 631         memset(ptr, 0x5a, size);
 632         kfree(ptr);
 633
 634         EXIT;
 635 }
 636 EXPORT_SYMBOL(kmem_free_debug);
 637
 638 void *
 639 vmem_alloc_debug(size_t size, int flags, const char *func, int line)
 640 {
 641         void *ptr;
 642         ENTRY;
 643
 644         ASSERT(flags & KM_SLEEP);
 645
 646         ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO,
 647             PAGE_KERNEL);
 648         if (ptr == NULL) {
 649                 CWARN("vmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
 650                     (unsigned long long) size, flags,
 651                     atomic64_read(&vmem_alloc_used), vmem_alloc_max);
 652         } else {
 653                 if (flags & __GFP_ZERO)
 654                         memset(ptr, 0, size);
 655
 656                 atomic64_add(size, &vmem_alloc_used);
 657
 658                 if (unlikely(atomic64_read(&vmem_alloc_used) > vmem_alloc_max))
 659                         vmem_alloc_max = atomic64_read(&vmem_alloc_used);
 660
 661                 CDEBUG_LIMIT(D_INFO, "vmem_alloc(%llu, 0x%x) = %p "
 662                     "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
 663                     atomic64_read(&vmem_alloc_used), vmem_alloc_max);
 664         }
 665
 666         RETURN(ptr);
 667 }
 668 EXPORT_SYMBOL(vmem_alloc_debug);
 669
 670 void
 671 vmem_free_debug(void *ptr, size_t size)
 672 {
 673         ENTRY;
 674
 675         ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
 676             (unsigned long long) size);
 677
 678         atomic64_sub(size, &vmem_alloc_used);
 679
 680         CDEBUG_LIMIT(D_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
 681             (unsigned long long) size, atomic64_read(&vmem_alloc_used),
 682             vmem_alloc_max);
 683
 684         memset(ptr, 0x5a, size);
 685         vfree(ptr);
 686
 687         EXIT;
 688 }
 689 EXPORT_SYMBOL(vmem_free_debug);
 690
 691 # endif /* DEBUG_KMEM_TRACKING */
 692 #endif /* DEBUG_KMEM */
 693
 694 static void *
 695 kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
 696 {
 697         void *ptr;
 698
 699         if (skc->skc_flags & KMC_KMEM) {
 700                 if (size > (2 * PAGE_SIZE)) {
 701                         ptr = (void *)__get_free_pages(flags, get_order(size));
 702                 } else
 703                         ptr = kmem_alloc(size, flags);
 704         } else {
 705                 ptr = vmem_alloc(size, flags);
 706         }
 707
 708         return ptr;
 709 }
 710
 711 static void
 712 kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
 713 {
 714         if (skc->skc_flags & KMC_KMEM) {
 715                 if (size > (2 * PAGE_SIZE))
 716                         free_pages((unsigned long)ptr, get_order(size));
 717                 else
 718                         kmem_free(ptr, size);
 719         } else {
 720                 vmem_free(ptr, size);
 721         }
 722 }
 723
 724 /*
 725  * It's important that we pack the spl_kmem_obj_t structure and the
 726  * actual objects in to one large address space to minimize the number
 727  * of calls to the allocator.  It is far better to do a few large
 728  * allocations and then subdivide it ourselves.  Now which allocator
 729  * we use requires balancing a few trade offs.
 730  *
 731  * For small objects we use kmem_alloc() because as long as you are
 732  * only requesting a small number of pages (ideally just one) it's cheap.
 733  * However, when you start requesting multiple pages with kmem_alloc()
 734  * it gets increasingly expensive since it requires contiguous pages.
 735  * For this reason we shift to vmem_alloc() for slabs of large objects
 736  * which removes the need for contiguous pages.  We do not use
 737  * vmem_alloc() in all cases because there is significant locking
 738  * overhead in __get_vm_area_node().  This function takes a single
 739  * global lock when acquiring an available virtual address range which
 740  * serializes all vmem_alloc()'s for all slab caches.  Using slightly
 741  * different allocation functions for small and large objects should
 742  * give us the best of both worlds.
 743  *
 744  * KMC_ONSLAB                       KMC_OFFSLAB
 745  *
 746  * +------------------------+       +-----------------+
 747  * | spl_kmem_slab_t --+-+  |       | spl_kmem_slab_t |---+-+
 748  * | skc_obj_size    <-+ |  |       +-----------------+   | |
 749  * | spl_kmem_obj_t      |  |                             | |
 750  * | skc_obj_size    <---+  |       +-----------------+   | |
 751  * | spl_kmem_obj_t      |  |       | skc_obj_size    | <-+ |
 752  * | ...                 v  |       | spl_kmem_obj_t  |     |
 753  * +------------------------+       +-----------------+     v
 754  */
 755 static spl_kmem_slab_t *
 756 spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
 757 {
 758         spl_kmem_slab_t *sks;
 759         spl_kmem_obj_t *sko, *n;
 760         void *base, *obj;
 761         int i, align, size, rc = 0;
 762
 763         base = kv_alloc(skc, skc->skc_slab_size, flags);
 764         if (base == NULL)
 765                 RETURN(NULL);
 766
 767         sks = (spl_kmem_slab_t *)base;
 768         sks->sks_magic = SKS_MAGIC;
 769         sks->sks_objs = skc->skc_slab_objs;
 770         sks->sks_age = jiffies;
 771         sks->sks_cache = skc;
 772         INIT_LIST_HEAD(&sks->sks_list);
 773         INIT_LIST_HEAD(&sks->sks_free_list);
 774         sks->sks_ref = 0;
 775
 776         align = skc->skc_obj_align;
 777         size = P2ROUNDUP(skc->skc_obj_size, align) +
 778                P2ROUNDUP(sizeof(spl_kmem_obj_t), align);
 779
 780         for (i = 0; i < sks->sks_objs; i++) {
 781                 if (skc->skc_flags & KMC_OFFSLAB) {
 782                         obj = kv_alloc(skc, size, flags);
 783                         if (!obj)
 784                                 GOTO(out, rc = -ENOMEM);
 785                 } else {
 786                         obj = base +
 787                               P2ROUNDUP(sizeof(spl_kmem_slab_t), align) +
 788                               (i * size);
 789                 }
 790
 791                 sko = obj + P2ROUNDUP(skc->skc_obj_size, align);
 792                 sko->sko_addr = obj;
 793                 sko->sko_magic = SKO_MAGIC;
 794                 sko->sko_slab = sks;
 795                 INIT_LIST_HEAD(&sko->sko_list);
 796                 list_add_tail(&sko->sko_list, &sks->sks_free_list);
 797         }
 798
 799         list_for_each_entry(sko, &sks->sks_free_list, sko_list)
 800                 if (skc->skc_ctor)
 801                         skc->skc_ctor(sko->sko_addr, skc->skc_private, flags);
 802 out:
 803         if (rc) {
 804                 if (skc->skc_flags & KMC_OFFSLAB)
 805                         list_for_each_entry_safe(sko, n, &sks->sks_free_list,
 806                                                  sko_list)
 807                                 kv_free(skc, sko->sko_addr, size);
 808
 809                 kv_free(skc, base, skc->skc_slab_size);
 810                 sks = NULL;
 811         }
 812
 813         RETURN(sks);
 814 }
 815
 816 /*
 817  * Remove a slab from complete or partial list, it must be called with
 818  * the 'skc->skc_lock' held but the actual free must be performed
 819  * outside the lock to prevent deadlocking on vmem addresses.
 820  */
 821 static void
 822 spl_slab_free(spl_kmem_slab_t *sks,
 823               struct list_head *sks_list, struct list_head *sko_list)
 824 {
 825         spl_kmem_cache_t *skc;
 826         spl_kmem_obj_t *sko, *n;
 827         ENTRY;
 828
 829         ASSERT(sks->sks_magic == SKS_MAGIC);
 830         ASSERT(sks->sks_ref == 0);
 831
 832         skc = sks->sks_cache;
 833         ASSERT(skc->skc_magic == SKC_MAGIC);
 834         ASSERT(spin_is_locked(&skc->skc_lock));
 835
 836         skc->skc_obj_total -= sks->sks_objs;
 837         skc->skc_slab_total--;
 838         list_del(&sks->sks_list);
 839
 840         /* Run destructors slab is being released */
 841         list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) {
 842                 ASSERT(sko->sko_magic == SKO_MAGIC);
 843                 list_del(&sko->sko_list);
 844
 845                 if (skc->skc_dtor)
 846                         skc->skc_dtor(sko->sko_addr, skc->skc_private);
 847
 848                 if (skc->skc_flags & KMC_OFFSLAB)
 849                         list_add(&sko->sko_list, sko_list);
 850         }
 851
 852         list_add(&sks->sks_list, sks_list);
 853         EXIT;
 854 }
 855
 856 /*
 857  * Traverses all the partial slabs attached to a cache and free those
 858  * which which are currently empty, and have not been touched for
 859  * skc_delay seconds.  This is to avoid thrashing.
 860  */
 861 static void
 862 spl_slab_reclaim(spl_kmem_cache_t *skc, int flag)
 863 {
 864         spl_kmem_slab_t *sks, *m;
 865         spl_kmem_obj_t *sko, *n;
 866         LIST_HEAD(sks_list);
 867         LIST_HEAD(sko_list);
 868         int size;
 869         ENTRY;
 870
 871         /*
 872          * Move empty slabs and objects which have not been touched in
 873          * skc_delay seconds on to private lists to be freed outside
 874          * the spin lock.  This delay time is important to avoid
 875          * thrashing however when flag is set the delay will not be
 876          * used.  Empty slabs will be at the end of the skc_partial_list.
 877          */
 878         spin_lock(&skc->skc_lock);
 879         list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list,
 880                                          sks_list) {
 881                 if (sks->sks_ref > 0)
 882                        break;
 883
 884                 if (flag || time_after(jiffies,sks->sks_age+skc->skc_delay*HZ))
 885                         spl_slab_free(sks, &sks_list, &sko_list);
 886         }
 887         spin_unlock(&skc->skc_lock);
 888
 889         /*
 890          * We only have list of spl_kmem_obj_t's if they are located off
 891          * the slab, otherwise they get feed with the spl_kmem_slab_t.
 892          */
 893         if (!list_empty(&sko_list)) {
 894                 ASSERT(skc->skc_flags & KMC_OFFSLAB);
 895
 896                 size = P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align) +
 897                        P2ROUNDUP(sizeof(spl_kmem_obj_t), skc->skc_obj_align);
 898
 899                 list_for_each_entry_safe(sko, n, &sko_list, sko_list)
 900                         kv_free(skc, sko->sko_addr, size);
 901         }
 902
 903         list_for_each_entry_safe(sks, m, &sks_list, sks_list)
 904                 kv_free(skc, sks, skc->skc_slab_size);
 905
 906         EXIT;
 907 }
 908
 909 /*
 910  * Called regularly on all caches to age objects out of the magazines
 911  * which have not been access in skc->skc_delay seconds.  This prevents
 912  * idle magazines from holding memory which might be better used by
 913  * other caches or parts of the system.  The delay is present to
 914  * prevent thrashing the magazine.
 915  */
 916 static void
 917 spl_magazine_age(void *data)
 918 {
 919         spl_kmem_cache_t *skc = data;
 920         spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
 921
 922         if (skm->skm_avail > 0 &&
 923             time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
 924                 (void)spl_cache_flush(skc, skm, skm->skm_refill);
 925 }
 926
 927 /*
 928  * Called regularly to keep a downward pressure on the size of idle
 929  * magazines and to release free slabs from the cache.  This function
 930  * never calls the registered reclaim function, that only occures
 931  * under memory pressure or with a direct call to spl_kmem_reap().
 932  */
 933 static void
 934 spl_cache_age(void *data)
 935 {
 936         spl_kmem_cache_t *skc =
 937                 spl_get_work_data(data, spl_kmem_cache_t, skc_work.work);
 938
 939         ASSERT(skc->skc_magic == SKC_MAGIC);
 940         spl_on_each_cpu(spl_magazine_age, skc, 1);
 941         spl_slab_reclaim(skc, 0);
 942
 943         if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags))
 944                 schedule_delayed_work(&skc->skc_work, 2 * skc->skc_delay * HZ);
 945 }
 946
 947 /*
 948  * Size a slab based on the size of each aliged object plus spl_kmem_obj_t.
 949  * When on-slab we want to target SPL_KMEM_CACHE_OBJ_PER_SLAB.  However,
 950  * for very small objects we may end up with more than this so as not
 951  * to waste space in the minimal allocation of a single page.  Also for
 952  * very large objects we may use as few as SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN,
 953  * lower than this and we will fail.
 954  */
 955 static int
 956 spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
 957 {
 958         int sks_size, obj_size, max_size, align;
 959
 960         if (skc->skc_flags & KMC_OFFSLAB) {
 961                 *objs = SPL_KMEM_CACHE_OBJ_PER_SLAB;
 962                 *size = sizeof(spl_kmem_slab_t);
 963         } else {
 964                 align = skc->skc_obj_align;
 965                 sks_size = P2ROUNDUP(sizeof(spl_kmem_slab_t), align);
 966                 obj_size = P2ROUNDUP(skc->skc_obj_size, align) +
 967                            P2ROUNDUP(sizeof(spl_kmem_obj_t), align);
 968
 969                 if (skc->skc_flags & KMC_KMEM)
 970                         max_size = ((uint64_t)1 << (MAX_ORDER-1)) * PAGE_SIZE;
 971                 else
 972                         max_size = (32 * 1024 * 1024);
 973
 974                 for (*size = PAGE_SIZE; *size <= max_size; *size += PAGE_SIZE) {
 975                         *objs = (*size - sks_size) / obj_size;
 976                         if (*objs >= SPL_KMEM_CACHE_OBJ_PER_SLAB)
 977                                 RETURN(0);
 978                 }
 979
 980                 /*
 981                  * Unable to satisfy target objets per slab, fallback to
 982                  * allocating a maximally sized slab and assuming it can
 983                  * contain the minimum objects count use it.  If not fail.
 984                  */
 985                 *size = max_size;
 986                 *objs = (*size - sks_size) / obj_size;
 987                 if (*objs >= SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN)
 988                         RETURN(0);
 989         }
 990
 991         RETURN(-ENOSPC);
 992 }
 993
 994 /*
 995  * Make a guess at reasonable per-cpu magazine size based on the size of
 996  * each object and the cost of caching N of them in each magazine.  Long
 997  * term this should really adapt based on an observed usage heuristic.
 998  */
 999 static int
1000 spl_magazine_size(spl_kmem_cache_t *skc)
1001 {
1002         int size, align = skc->skc_obj_align;
1003         ENTRY;
1004
1005         /* Per-magazine sizes below assume a 4Kib page size */
1006         if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE * 256))
1007                 size = 4;  /* Minimum 4Mib per-magazine */
1008         else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE * 32))
1009                 size = 16; /* Minimum 2Mib per-magazine */
1010         else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE))
1011                 size = 64; /* Minimum 256Kib per-magazine */
1012         else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE / 4))
1013                 size = 128; /* Minimum 128Kib per-magazine */
1014         else
1015                 size = 256;
1016
1017         RETURN(size);
1018 }
1019
1020 /*
1021  * Allocate a per-cpu magazine to assoicate with a specific core.
1022  */
1023 static spl_kmem_magazine_t *
1024 spl_magazine_alloc(spl_kmem_cache_t *skc, int node)
1025 {
1026         spl_kmem_magazine_t *skm;
1027         int size = sizeof(spl_kmem_magazine_t) +
1028                    sizeof(void *) * skc->skc_mag_size;
1029         ENTRY;
1030
1031         skm = kmem_alloc_node(size, GFP_KERNEL | __GFP_NOFAIL, node);
1032         if (skm) {
1033                 skm->skm_magic = SKM_MAGIC;
1034                 skm->skm_avail = 0;
1035                 skm->skm_size = skc->skc_mag_size;
1036                 skm->skm_refill = skc->skc_mag_refill;
1037                 skm->skm_age = jiffies;
1038         }
1039
1040         RETURN(skm);
1041 }
1042
1043 /*
1044  * Free a per-cpu magazine assoicated with a specific core.
1045  */
1046 static void
1047 spl_magazine_free(spl_kmem_magazine_t *skm)
1048 {
1049         int size = sizeof(spl_kmem_magazine_t) +
1050                    sizeof(void *) * skm->skm_size;
1051
1052         ENTRY;
1053         ASSERT(skm->skm_magic == SKM_MAGIC);
1054         ASSERT(skm->skm_avail == 0);
1055
1056         kmem_free(skm, size);
1057         EXIT;
1058 }
1059
1060 static void
1061 __spl_magazine_create(void *data)
1062 {
1063         spl_kmem_cache_t *skc = data;
1064         int id = smp_processor_id();
1065
1066         skc->skc_mag[id] = spl_magazine_alloc(skc, cpu_to_node(id));
1067         ASSERT(skc->skc_mag[id]);
1068 }
1069
1070 /*
1071  * Create all pre-cpu magazines of reasonable sizes.
1072  */
1073 static int
1074 spl_magazine_create(spl_kmem_cache_t *skc)
1075 {
1076         ENTRY;
1077
1078         skc->skc_mag_size = spl_magazine_size(skc);
1079         skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
1080         spl_on_each_cpu(__spl_magazine_create, skc, 1);
1081
1082         RETURN(0);
1083 }
1084
1085 static void
1086 __spl_magazine_destroy(void *data)
1087 {
1088         spl_kmem_cache_t *skc = data;
1089         spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
1090
1091         (void)spl_cache_flush(skc, skm, skm->skm_avail);
1092         spl_magazine_free(skm);
1093 }
1094
1095 /*
1096  * Destroy all pre-cpu magazines.
1097  */
1098 static void
1099 spl_magazine_destroy(spl_kmem_cache_t *skc)
1100 {
1101         ENTRY;
1102         spl_on_each_cpu(__spl_magazine_destroy, skc, 1);
1103         EXIT;
1104 }
1105
1106 /*
1107  * Create a object cache based on the following arguments:
1108  * name         cache name
1109  * size         cache object size
1110  * align        cache object alignment
1111  * ctor         cache object constructor
1112  * dtor         cache object destructor
1113  * reclaim      cache object reclaim
1114  * priv         cache private data for ctor/dtor/reclaim
1115  * vmp          unused must be NULL
1116  * flags
1117  *      KMC_NOTOUCH     Disable cache object aging (unsupported)
1118  *      KMC_NODEBUG     Disable debugging (unsupported)
1119  *      KMC_NOMAGAZINE  Disable magazine (unsupported)
1120  *      KMC_NOHASH      Disable hashing (unsupported)
1121  *      KMC_QCACHE      Disable qcache (unsupported)
1122  *      KMC_KMEM        Force kmem backed cache
1123  *      KMC_VMEM        Force vmem backed cache
1124  *      KMC_OFFSLAB     Locate objects off the slab
1125  */
1126 spl_kmem_cache_t *
1127 spl_kmem_cache_create(char *name, size_t size, size_t align,
1128                       spl_kmem_ctor_t ctor,
1129                       spl_kmem_dtor_t dtor,
1130                       spl_kmem_reclaim_t reclaim,
1131                       void *priv, void *vmp, int flags)
1132 {
1133         spl_kmem_cache_t *skc;
1134         int rc, kmem_flags = KM_SLEEP;
1135         ENTRY;
1136
1137         ASSERTF(!(flags & KMC_NOMAGAZINE), "Bad KMC_NOMAGAZINE (%x)\n", flags);
1138         ASSERTF(!(flags & KMC_NOHASH), "Bad KMC_NOHASH (%x)\n", flags);
1139         ASSERTF(!(flags & KMC_QCACHE), "Bad KMC_QCACHE (%x)\n", flags);
1140         ASSERT(vmp == NULL);
1141
1142         /* We may be called when there is a non-zero preempt_count or
1143          * interrupts are disabled is which case we must not sleep.
1144          */
1145         if (current_thread_info()->preempt_count || irqs_disabled())
1146                 kmem_flags = KM_NOSLEEP;
1147
1148         /* Allocate new cache memory and initialize. */
1149         skc = (spl_kmem_cache_t *)kmem_zalloc(sizeof(*skc), kmem_flags);
1150         if (skc == NULL)
1151                 RETURN(NULL);
1152
1153         skc->skc_magic = SKC_MAGIC;
1154         skc->skc_name_size = strlen(name) + 1;
1155         skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, kmem_flags);
1156         if (skc->skc_name == NULL) {
1157                 kmem_free(skc, sizeof(*skc));
1158                 RETURN(NULL);
1159         }
1160         strncpy(skc->skc_name, name, skc->skc_name_size);
1161
1162         skc->skc_ctor = ctor;
1163         skc->skc_dtor = dtor;
1164         skc->skc_reclaim = reclaim;
1165         skc->skc_private = priv;
1166         skc->skc_vmp = vmp;
1167         skc->skc_flags = flags;
1168         skc->skc_obj_size = size;
1169         skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
1170         skc->skc_delay = SPL_KMEM_CACHE_DELAY;
1171         atomic_set(&skc->skc_ref, 0);
1172
1173         INIT_LIST_HEAD(&skc->skc_list);
1174         INIT_LIST_HEAD(&skc->skc_complete_list);
1175         INIT_LIST_HEAD(&skc->skc_partial_list);
1176         spin_lock_init(&skc->skc_lock);
1177         skc->skc_slab_fail = 0;
1178         skc->skc_slab_create = 0;
1179         skc->skc_slab_destroy = 0;
1180         skc->skc_slab_total = 0;
1181         skc->skc_slab_alloc = 0;
1182         skc->skc_slab_max = 0;
1183         skc->skc_obj_total = 0;
1184         skc->skc_obj_alloc = 0;
1185         skc->skc_obj_max = 0;
1186
1187         if (align) {
1188                 ASSERT((align & (align - 1)) == 0);    /* Power of two */
1189                 ASSERT(align >= SPL_KMEM_CACHE_ALIGN); /* Minimum size */
1190                 skc->skc_obj_align = align;
1191         }
1192
1193         /* If none passed select a cache type based on object size */
1194         if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM))) {
1195                 if (P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align) <
1196                     (PAGE_SIZE / 8)) {
1197                         skc->skc_flags |= KMC_KMEM;
1198                 } else {
1199                         skc->skc_flags |= KMC_VMEM;
1200                 }
1201         }
1202
1203         rc = spl_slab_size(skc, &skc->skc_slab_objs, &skc->skc_slab_size);
1204         if (rc)
1205                 GOTO(out, rc);
1206
1207         rc = spl_magazine_create(skc);
1208         if (rc)
1209                 GOTO(out, rc);
1210
1211         spl_init_delayed_work(&skc->skc_work, spl_cache_age, skc);
1212         schedule_delayed_work(&skc->skc_work, 2 * skc->skc_delay * HZ);
1213
1214         down_write(&spl_kmem_cache_sem);
1215         list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
1216         up_write(&spl_kmem_cache_sem);
1217
1218         RETURN(skc);
1219 out:
1220         kmem_free(skc->skc_name, skc->skc_name_size);
1221         kmem_free(skc, sizeof(*skc));
1222         RETURN(NULL);
1223 }
1224 EXPORT_SYMBOL(spl_kmem_cache_create);
1225
1226 /*
1227  * Destroy a cache and all objects assoicated with the cache.
1228  */
1229 void
1230 spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
1231 {
1232         DECLARE_WAIT_QUEUE_HEAD(wq);
1233         ENTRY;
1234
1235         ASSERT(skc->skc_magic == SKC_MAGIC);
1236
1237         down_write(&spl_kmem_cache_sem);
1238         list_del_init(&skc->skc_list);
1239         up_write(&spl_kmem_cache_sem);
1240
1241         /* Cancel any and wait for any pending delayed work */
1242         ASSERT(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1243         cancel_delayed_work(&skc->skc_work);
1244         flush_scheduled_work();
1245
1246         /* Wait until all current callers complete, this is mainly
1247          * to catch the case where a low memory situation triggers a
1248          * cache reaping action which races with this destroy. */
1249         wait_event(wq, atomic_read(&skc->skc_ref) == 0);
1250
1251         spl_magazine_destroy(skc);
1252         spl_slab_reclaim(skc, 1);
1253         spin_lock(&skc->skc_lock);
1254
1255         /* Validate there are no objects in use and free all the
1256          * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
1257         ASSERT3U(skc->skc_slab_alloc, ==, 0);
1258         ASSERT3U(skc->skc_obj_alloc, ==, 0);
1259         ASSERT3U(skc->skc_slab_total, ==, 0);
1260         ASSERT3U(skc->skc_obj_total, ==, 0);
1261         ASSERT(list_empty(&skc->skc_complete_list));
1262
1263         kmem_free(skc->skc_name, skc->skc_name_size);
1264         spin_unlock(&skc->skc_lock);
1265
1266         kmem_free(skc, sizeof(*skc));
1267
1268         EXIT;
1269 }
1270 EXPORT_SYMBOL(spl_kmem_cache_destroy);
1271
1272 /*
1273  * Allocate an object from a slab attached to the cache.  This is used to
1274  * repopulate the per-cpu magazine caches in batches when they run low.
1275  */
1276 static void *
1277 spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
1278 {
1279         spl_kmem_obj_t *sko;
1280
1281         ASSERT(skc->skc_magic == SKC_MAGIC);
1282         ASSERT(sks->sks_magic == SKS_MAGIC);
1283         ASSERT(spin_is_locked(&skc->skc_lock));
1284
1285         sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
1286         ASSERT(sko->sko_magic == SKO_MAGIC);
1287         ASSERT(sko->sko_addr != NULL);
1288
1289         /* Remove from sks_free_list */
1290         list_del_init(&sko->sko_list);
1291
1292         sks->sks_age = jiffies;
1293         sks->sks_ref++;
1294         skc->skc_obj_alloc++;
1295
1296         /* Track max obj usage statistics */
1297         if (skc->skc_obj_alloc > skc->skc_obj_max)
1298                 skc->skc_obj_max = skc->skc_obj_alloc;
1299
1300         /* Track max slab usage statistics */
1301         if (sks->sks_ref == 1) {
1302                 skc->skc_slab_alloc++;
1303
1304                 if (skc->skc_slab_alloc > skc->skc_slab_max)
1305                         skc->skc_slab_max = skc->skc_slab_alloc;
1306         }
1307
1308         return sko->sko_addr;
1309 }
1310
1311 /*
1312  * No available objects on any slabsi, create a new slab.  Since this
1313  * is an expensive operation we do it without holding the spinlock and
1314  * only briefly aquire it when we link in the fully allocated and
1315  * constructed slab.
1316  */
1317 static spl_kmem_slab_t *
1318 spl_cache_grow(spl_kmem_cache_t *skc, int flags)
1319 {
1320         spl_kmem_slab_t *sks;
1321         ENTRY;
1322
1323         ASSERT(skc->skc_magic == SKC_MAGIC);
1324         local_irq_enable();
1325         might_sleep();
1326
1327         /*
1328          * Before allocating a new slab check if the slab is being reaped.
1329          * If it is there is a good chance we can wait until it finishes
1330          * and then use one of the newly freed but not aged-out slabs.
1331          */
1332         if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
1333                 schedule();
1334                 GOTO(out, sks= NULL);
1335         }
1336
1337         /* Allocate a new slab for the cache */
1338         sks = spl_slab_alloc(skc, flags | __GFP_NORETRY | __GFP_NOWARN);
1339         if (sks == NULL)
1340                 GOTO(out, sks = NULL);
1341
1342         /* Link the new empty slab in to the end of skc_partial_list. */
1343         spin_lock(&skc->skc_lock);
1344         skc->skc_slab_total++;
1345         skc->skc_obj_total += sks->sks_objs;
1346         list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1347         spin_unlock(&skc->skc_lock);
1348 out:
1349         local_irq_disable();
1350
1351         RETURN(sks);
1352 }
1353
1354 /*
1355  * Refill a per-cpu magazine with objects from the slabs for this
1356  * cache.  Ideally the magazine can be repopulated using existing
1357  * objects which have been released, however if we are unable to
1358  * locate enough free objects new slabs of objects will be created.
1359  */
1360 static int
1361 spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
1362 {
1363         spl_kmem_slab_t *sks;
1364         int rc = 0, refill;
1365         ENTRY;
1366
1367         ASSERT(skc->skc_magic == SKC_MAGIC);
1368         ASSERT(skm->skm_magic == SKM_MAGIC);
1369
1370         refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
1371         spin_lock(&skc->skc_lock);
1372
1373         while (refill > 0) {
1374                 /* No slabs available we may need to grow the cache */
1375                 if (list_empty(&skc->skc_partial_list)) {
1376                         spin_unlock(&skc->skc_lock);
1377
1378                         sks = spl_cache_grow(skc, flags);
1379                         if (!sks)
1380                                 GOTO(out, rc);
1381
1382                         /* Rescheduled to different CPU skm is not local */
1383                         if (skm != skc->skc_mag[smp_processor_id()])
1384                                 GOTO(out, rc);
1385
1386                         /* Potentially rescheduled to the same CPU but
1387                          * allocations may have occured from this CPU while
1388                          * we were sleeping so recalculate max refill. */
1389                         refill = MIN(refill, skm->skm_size - skm->skm_avail);
1390
1391                         spin_lock(&skc->skc_lock);
1392                         continue;
1393                 }
1394
1395                 /* Grab the next available slab */
1396                 sks = list_entry((&skc->skc_partial_list)->next,
1397                                  spl_kmem_slab_t, sks_list);
1398                 ASSERT(sks->sks_magic == SKS_MAGIC);
1399                 ASSERT(sks->sks_ref < sks->sks_objs);
1400                 ASSERT(!list_empty(&sks->sks_free_list));
1401
1402                 /* Consume as many objects as needed to refill the requested
1403                  * cache.  We must also be careful not to overfill it. */
1404                 while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++rc) {
1405                         ASSERT(skm->skm_avail < skm->skm_size);
1406                         ASSERT(rc < skm->skm_size);
1407                         skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks);
1408                 }
1409
1410                 /* Move slab to skc_complete_list when full */
1411                 if (sks->sks_ref == sks->sks_objs) {
1412                         list_del(&sks->sks_list);
1413                         list_add(&sks->sks_list, &skc->skc_complete_list);
1414                 }
1415         }
1416
1417         spin_unlock(&skc->skc_lock);
1418 out:
1419         /* Returns the number of entries added to cache */
1420         RETURN(rc);
1421 }
1422
1423 /*
1424  * Release an object back to the slab from which it came.
1425  */
1426 static void
1427 spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
1428 {
1429         spl_kmem_slab_t *sks = NULL;
1430         spl_kmem_obj_t *sko = NULL;
1431         ENTRY;
1432
1433         ASSERT(skc->skc_magic == SKC_MAGIC);
1434         ASSERT(spin_is_locked(&skc->skc_lock));
1435
1436         sko = obj + P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align);
1437         ASSERT(sko->sko_magic == SKO_MAGIC);
1438
1439         sks = sko->sko_slab;
1440         ASSERT(sks->sks_magic == SKS_MAGIC);
1441         ASSERT(sks->sks_cache == skc);
1442         list_add(&sko->sko_list, &sks->sks_free_list);
1443
1444         sks->sks_age = jiffies;
1445         sks->sks_ref--;
1446         skc->skc_obj_alloc--;
1447
1448         /* Move slab to skc_partial_list when no longer full.  Slabs
1449          * are added to the head to keep the partial list is quasi-full
1450          * sorted order.  Fuller at the head, emptier at the tail. */
1451         if (sks->sks_ref == (sks->sks_objs - 1)) {
1452                 list_del(&sks->sks_list);
1453                 list_add(&sks->sks_list, &skc->skc_partial_list);
1454         }
1455
1456         /* Move emply slabs to the end of the partial list so
1457          * they can be easily found and freed during reclamation. */
1458         if (sks->sks_ref == 0) {
1459                 list_del(&sks->sks_list);
1460                 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1461                 skc->skc_slab_alloc--;
1462         }
1463
1464         EXIT;
1465 }
1466
1467 /*
1468  * Release a batch of objects from a per-cpu magazine back to their
1469  * respective slabs.  This occurs when we exceed the magazine size,
1470  * are under memory pressure, when the cache is idle, or during
1471  * cache cleanup.  The flush argument contains the number of entries
1472  * to remove from the magazine.
1473  */
1474 static int
1475 spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
1476 {
1477         int i, count = MIN(flush, skm->skm_avail);
1478         ENTRY;
1479
1480         ASSERT(skc->skc_magic == SKC_MAGIC);
1481         ASSERT(skm->skm_magic == SKM_MAGIC);
1482
1483         /*
1484          * XXX: Currently we simply return objects from the magazine to
1485          * the slabs in fifo order.  The ideal thing to do from a memory
1486          * fragmentation standpoint is to cheaply determine the set of
1487          * objects in the magazine which will result in the largest
1488          * number of free slabs if released from the magazine.
1489          */
1490         spin_lock(&skc->skc_lock);
1491         for (i = 0; i < count; i++)
1492                 spl_cache_shrink(skc, skm->skm_objs[i]);
1493
1494         skm->skm_avail -= count;
1495         memmove(skm->skm_objs, &(skm->skm_objs[count]),
1496                 sizeof(void *) * skm->skm_avail);
1497
1498         spin_unlock(&skc->skc_lock);
1499
1500         RETURN(count);
1501 }
1502
1503 /*
1504  * Allocate an object from the per-cpu magazine, or if the magazine
1505  * is empty directly allocate from a slab and repopulate the magazine.
1506  */
1507 void *
1508 spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
1509 {
1510         spl_kmem_magazine_t *skm;
1511         unsigned long irq_flags;
1512         void *obj = NULL;
1513         ENTRY;
1514
1515         ASSERT(skc->skc_magic == SKC_MAGIC);
1516         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1517         ASSERT(flags & KM_SLEEP);
1518         atomic_inc(&skc->skc_ref);
1519         local_irq_save(irq_flags);
1520
1521 restart:
1522         /* Safe to update per-cpu structure without lock, but
1523          * in the restart case we must be careful to reaquire
1524          * the local magazine since this may have changed
1525          * when we need to grow the cache. */
1526         skm = skc->skc_mag[smp_processor_id()];
1527         ASSERTF(skm->skm_magic == SKM_MAGIC, "%x != %x: %s/%p/%p %x/%x/%x\n",
1528                 skm->skm_magic, SKM_MAGIC, skc->skc_name, skc, skm,
1529                 skm->skm_size, skm->skm_refill, skm->skm_avail);
1530
1531         if (likely(skm->skm_avail)) {
1532                 /* Object available in CPU cache, use it */
1533                 obj = skm->skm_objs[--skm->skm_avail];
1534                 skm->skm_age = jiffies;
1535         } else {
1536                 /* Per-CPU cache empty, directly allocate from
1537                  * the slab and refill the per-CPU cache. */
1538                 (void)spl_cache_refill(skc, skm, flags);
1539                 GOTO(restart, obj = NULL);
1540         }
1541
1542         local_irq_restore(irq_flags);
1543         ASSERT(obj);
1544         ASSERT(((unsigned long)(obj) % skc->skc_obj_align) == 0);
1545
1546         /* Pre-emptively migrate object to CPU L1 cache */
1547         prefetchw(obj);
1548         atomic_dec(&skc->skc_ref);
1549
1550         RETURN(obj);
1551 }
1552 EXPORT_SYMBOL(spl_kmem_cache_alloc);
1553
1554 /*
1555  * Free an object back to the local per-cpu magazine, there is no
1556  * guarantee that this is the same magazine the object was originally
1557  * allocated from.  We may need to flush entire from the magazine
1558  * back to the slabs to make space.
1559  */
1560 void
1561 spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
1562 {
1563         spl_kmem_magazine_t *skm;
1564         unsigned long flags;
1565         ENTRY;
1566
1567         ASSERT(skc->skc_magic == SKC_MAGIC);
1568         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1569         atomic_inc(&skc->skc_ref);
1570         local_irq_save(flags);
1571
1572         /* Safe to update per-cpu structure without lock, but
1573          * no remote memory allocation tracking is being performed
1574          * it is entirely possible to allocate an object from one
1575          * CPU cache and return it to another. */
1576         skm = skc->skc_mag[smp_processor_id()];
1577         ASSERT(skm->skm_magic == SKM_MAGIC);
1578
1579         /* Per-CPU cache full, flush it to make space */
1580         if (unlikely(skm->skm_avail >= skm->skm_size))
1581                 (void)spl_cache_flush(skc, skm, skm->skm_refill);
1582
1583         /* Available space in cache, use it */
1584         skm->skm_objs[skm->skm_avail++] = obj;
1585
1586         local_irq_restore(flags);
1587         atomic_dec(&skc->skc_ref);
1588
1589         EXIT;
1590 }
1591 EXPORT_SYMBOL(spl_kmem_cache_free);
1592
1593 /*
1594  * The generic shrinker function for all caches.  Under linux a shrinker
1595  * may not be tightly coupled with a slab cache.  In fact linux always
1596  * systematically trys calling all registered shrinker callbacks which
1597  * report that they contain unused objects.  Because of this we only
1598  * register one shrinker function in the shim layer for all slab caches.
1599  * We always attempt to shrink all caches when this generic shrinker
1600  * is called.  The shrinker should return the number of free objects
1601  * in the cache when called with nr_to_scan == 0 but not attempt to
1602  * free any objects.  When nr_to_scan > 0 it is a request that nr_to_scan
1603  * objects should be freed, because Solaris semantics are to free
1604  * all available objects we may free more objects than requested.
1605  */
1606 static int
1607 spl_kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask)
1608 {
1609         spl_kmem_cache_t *skc;
1610         int unused = 0;
1611
1612         down_read(&spl_kmem_cache_sem);
1613         list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
1614                 if (nr_to_scan)
1615                         spl_kmem_cache_reap_now(skc);
1616
1617                 /*
1618                  * Presume everything alloc'ed in reclaimable, this ensures
1619                  * we are called again with nr_to_scan > 0 so can try and
1620                  * reclaim.  The exact number is not important either so
1621                  * we forgo taking this already highly contented lock.
1622                  */
1623                 unused += skc->skc_obj_alloc;
1624         }
1625         up_read(&spl_kmem_cache_sem);
1626
1627         return (unused * sysctl_vfs_cache_pressure) / 100;
1628 }
1629
1630 /*
1631  * Call the registered reclaim function for a cache.  Depending on how
1632  * many and which objects are released it may simply repopulate the
1633  * local magazine which will then need to age-out.  Objects which cannot
1634  * fit in the magazine we will be released back to their slabs which will
1635  * also need to age out before being release.  This is all just best
1636  * effort and we do not want to thrash creating and destroying slabs.
1637  */
1638 void
1639 spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
1640 {
1641         ENTRY;
1642
1643         ASSERT(skc->skc_magic == SKC_MAGIC);
1644         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1645
1646         /* Prevent concurrent cache reaping when contended */
1647         if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
1648                 EXIT;
1649                 return;
1650         }
1651
1652         atomic_inc(&skc->skc_ref);
1653
1654         if (skc->skc_reclaim)
1655                 skc->skc_reclaim(skc->skc_private);
1656
1657         spl_slab_reclaim(skc, 0);
1658         clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
1659         atomic_dec(&skc->skc_ref);
1660
1661         EXIT;
1662 }
1663 EXPORT_SYMBOL(spl_kmem_cache_reap_now);
1664
1665 /*
1666  * Reap all free slabs from all registered caches.
1667  */
1668 void
1669 spl_kmem_reap(void)
1670 {
1671         spl_kmem_cache_generic_shrinker(KMC_REAP_CHUNK, GFP_KERNEL);
1672 }
1673 EXPORT_SYMBOL(spl_kmem_reap);
1674
1675 #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
1676 static char *
1677 spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
1678 {
1679         int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
1680         int i, flag = 1;
1681
1682         ASSERT(str != NULL && len >= 17);
1683         memset(str, 0, len);
1684
1685         /* Check for a fully printable string, and while we are at
1686          * it place the printable characters in the passed buffer. */
1687         for (i = 0; i < size; i++) {
1688                 str[i] = ((char *)(kd->kd_addr))[i];
1689                 if (isprint(str[i])) {
1690                         continue;
1691                 } else {
1692                         /* Minimum number of printable characters found
1693                          * to make it worthwhile to print this as ascii. */
1694                         if (i > min)
1695                                 break;
1696
1697                         flag = 0;
1698                         break;
1699                 }
1700         }
1701
1702         if (!flag) {
1703                 sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
1704                         *((uint8_t *)kd->kd_addr),
1705                         *((uint8_t *)kd->kd_addr + 2),
1706                         *((uint8_t *)kd->kd_addr + 4),
1707                         *((uint8_t *)kd->kd_addr + 6),
1708                         *((uint8_t *)kd->kd_addr + 8),
1709                         *((uint8_t *)kd->kd_addr + 10),
1710                         *((uint8_t *)kd->kd_addr + 12),
1711                         *((uint8_t *)kd->kd_addr + 14));
1712         }
1713
1714         return str;
1715 }
1716
1717 static int
1718 spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
1719 {
1720         int i;
1721         ENTRY;
1722
1723         spin_lock_init(lock);
1724         INIT_LIST_HEAD(list);
1725
1726         for (i = 0; i < size; i++)
1727                 INIT_HLIST_HEAD(&kmem_table[i]);
1728
1729         RETURN(0);
1730 }
1731
1732 static void
1733 spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
1734 {
1735         unsigned long flags;
1736         kmem_debug_t *kd;
1737         char str[17];
1738         ENTRY;
1739
1740         spin_lock_irqsave(lock, flags);
1741         if (!list_empty(list))
1742                 printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
1743                        "size", "data", "func", "line");
1744
1745         list_for_each_entry(kd, list, kd_list)
1746                 printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
1747                        (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
1748                        kd->kd_func, kd->kd_line);
1749
1750         spin_unlock_irqrestore(lock, flags);
1751         EXIT;
1752 }
1753 #else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
/*
 * Allocation tracking is compiled out: both hooks become no-ops.  They
 * expand to a void expression rather than to nothing so that a call is
 * still a well-formed statement or expression wherever it appears.
 */
#define spl_kmem_init_tracking(list, lock, size)        ((void)0)
#define spl_kmem_fini_tracking(list, lock)              ((void)0)
1756 #endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
1757
1758 static void
1759 spl_kmem_init_globals(void)
1760 {
1761         struct zone *zone;
1762
1763         /* For now all zones are includes, it may be wise to restrict
1764          * this to normal and highmem zones if we see problems. */
1765         for_each_zone(zone) {
1766
1767                 if (!populated_zone(zone))
1768                         continue;
1769
1770                 minfree += zone->pages_min;
1771                 desfree += zone->pages_low;
1772                 lotsfree += zone->pages_high;
1773         }
1774
1775         /* Solaris default values */
1776         swapfs_minfree = MAX(2*1024*1024 / PAGE_SIZE, physmem / 8);
1777         swapfs_reserve = MIN(4*1024*1024 / PAGE_SIZE, physmem / 16);
1778 }
1779
1780 int
1781 spl_kmem_init(void)
1782 {
1783         int rc = 0;
1784         ENTRY;
1785
1786         init_rwsem(&spl_kmem_cache_sem);
1787         INIT_LIST_HEAD(&spl_kmem_cache_list);
1788         spl_kmem_init_globals();
1789
1790 #ifdef HAVE_SET_SHRINKER
1791         spl_kmem_cache_shrinker = set_shrinker(KMC_DEFAULT_SEEKS,
1792                                                spl_kmem_cache_generic_shrinker);
1793         if (spl_kmem_cache_shrinker == NULL)
1794                 RETURN(rc = -ENOMEM);
1795 #else
1796         register_shrinker(&spl_kmem_cache_shrinker);
1797 #endif
1798
1799 #ifdef DEBUG_KMEM
1800         atomic64_set(&kmem_alloc_used, 0);
1801         atomic64_set(&vmem_alloc_used, 0);
1802
1803         spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
1804         spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE);
1805 #endif
1806         RETURN(rc);
1807 }
1808
1809 void
1810 spl_kmem_fini(void)
1811 {
1812 #ifdef DEBUG_KMEM
1813         /* Display all unreclaimed memory addresses, including the
1814          * allocation size and the first few bytes of what's located
1815          * at that address to aid in debugging.  Performance is not
1816          * a serious concern here since it is module unload time. */
1817         if (atomic64_read(&kmem_alloc_used) != 0)
1818                 CWARN("kmem leaked %ld/%ld bytes\n",
1819                       atomic64_read(&kmem_alloc_used), kmem_alloc_max);
1820
1821
1822         if (atomic64_read(&vmem_alloc_used) != 0)
1823                 CWARN("vmem leaked %ld/%ld bytes\n",
1824                       atomic64_read(&vmem_alloc_used), vmem_alloc_max);
1825
1826         spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
1827         spl_kmem_fini_tracking(&vmem_list, &vmem_lock);
1828 #endif /* DEBUG_KMEM */
1829         ENTRY;
1830
1831 #ifdef HAVE_SET_SHRINKER
1832         remove_shrinker(spl_kmem_cache_shrinker);
1833 #else
1834         unregister_shrinker(&spl_kmem_cache_shrinker);
1835 #endif
1836
1837         EXIT;
1838 }