1 /*
2 * This file is part of the SPL: Solaris Porting Layer.
3 *
4 * Copyright (c) 2008 Lawrence Livermore National Security, LLC.
5 * Produced at Lawrence Livermore National Laboratory
6 * Written by:
7 * Brian Behlendorf <behlendorf1@llnl.gov>,
8 * Herb Wartens <wartens2@llnl.gov>,
9 * Jim Garlick <garlick@llnl.gov>
10 * UCRL-CODE-235197
11 *
12 * This is free software; you can redistribute it and/or modify it
13 * under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This is distributed in the hope that it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
20 * for more details.
21 *
22 * You should have received a copy of the GNU General Public License along
23 * with this program; if not, write to the Free Software Foundation, Inc.,
24 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 */
26
27 #include <sys/kmem.h>
28
29 #ifdef DEBUG_SUBSYSTEM
30 # undef DEBUG_SUBSYSTEM
31 #endif
32
33 #define DEBUG_SUBSYSTEM S_KMEM
34
35 /*
36 * The minimum amount of memory measured in pages to be free at all
37 * times on the system. This is similar to Linux's zone->pages_min
38 * multiplied by the number of zones and is sized based on that.
39 */
40 pgcnt_t minfree = 0;
41 EXPORT_SYMBOL(minfree);
42
43 /*
44 * The desired amount of memory measured in pages to be free at all
45 * times on the system. This is similar to Linux's zone->pages_low
46 * multiplied by the number of zones and is sized based on that.
47 * Assuming all zones are being used roughly equally, when we drop
48 * below this threshold async page reclamation is triggered.
49 */
50 pgcnt_t desfree = 0;
51 EXPORT_SYMBOL(desfree);
52
53 /*
54 * When above this amount of memory measured in pages the system is
55 * determined to have enough free memory. This is similar to Linux's
56 * zone->pages_high multiplied by the number of zones and is sized based
57 * on that. Assuming all zones are being used roughly equally, when
58 * async page reclamation reaches this threshold it stops.
59 */
60 pgcnt_t lotsfree = 0;
61 EXPORT_SYMBOL(lotsfree);
62
63 /* Unused, always 0 in this implementation */
64 pgcnt_t needfree = 0;
65 EXPORT_SYMBOL(needfree);
66
67 pgcnt_t swapfs_minfree = 0;
68 EXPORT_SYMBOL(swapfs_minfree);
69
70 pgcnt_t swapfs_reserve = 0;
71 EXPORT_SYMBOL(swapfs_reserve);
72
73 vmem_t *heap_arena = NULL;
74 EXPORT_SYMBOL(heap_arena);
75
76 vmem_t *zio_alloc_arena = NULL;
77 EXPORT_SYMBOL(zio_alloc_arena);
78
79 vmem_t *zio_arena = NULL;
80 EXPORT_SYMBOL(zio_arena);
81
82 #ifndef HAVE_FIRST_ONLINE_PGDAT
83 struct pglist_data *
84 first_online_pgdat(void)
85 {
86 return NODE_DATA(first_online_node);
87 }
88 EXPORT_SYMBOL(first_online_pgdat);
89 #endif /* HAVE_FIRST_ONLINE_PGDAT */
90
91 #ifndef HAVE_NEXT_ONLINE_PGDAT
92 struct pglist_data *
93 next_online_pgdat(struct pglist_data *pgdat)
94 {
95 int nid = next_online_node(pgdat->node_id);
96
97 if (nid == MAX_NUMNODES)
98 return NULL;
99
100 return NODE_DATA(nid);
101 }
102 EXPORT_SYMBOL(next_online_pgdat);
103 #endif /* HAVE_NEXT_ONLINE_PGDAT */
104
105 #ifndef HAVE_NEXT_ZONE
106 struct zone *
107 next_zone(struct zone *zone)
108 {
109 pg_data_t *pgdat = zone->zone_pgdat;
110
111 if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
112 zone++;
113 else {
114 pgdat = next_online_pgdat(pgdat);
115 if (pgdat)
116 zone = pgdat->node_zones;
117 else
118 zone = NULL;
119 }
120 return zone;
121 }
122 EXPORT_SYMBOL(next_zone);
123 #endif /* HAVE_NEXT_ZONE */
124
125 #ifndef HAVE_GET_ZONE_COUNTS
126 void
127 __get_zone_counts(unsigned long *active, unsigned long *inactive,
128 unsigned long *free, struct pglist_data *pgdat)
129 {
130 struct zone *zones = pgdat->node_zones;
131 int i;
132
133 *active = 0;
134 *inactive = 0;
135 *free = 0;
136 for (i = 0; i < MAX_NR_ZONES; i++) {
137 *active += zones[i].nr_active;
138 *inactive += zones[i].nr_inactive;
139 *free += zones[i].free_pages;
140 }
141 }
142
143 void
144 get_zone_counts(unsigned long *active, unsigned long *inactive,
145 unsigned long *free)
146 {
147 struct pglist_data *pgdat;
148
149 *active = 0;
150 *inactive = 0;
151 *free = 0;
152 for_each_online_pgdat(pgdat) {
153 unsigned long l, m, n;
154 __get_zone_counts(&l, &m, &n, pgdat);
155 *active += l;
156 *inactive += m;
157 *free += n;
158 }
159 }
160 EXPORT_SYMBOL(get_zone_counts);
161 #endif /* HAVE_GET_ZONE_COUNTS */
162
163 pgcnt_t
164 spl_kmem_availrmem(void)
165 {
166 unsigned long active;
167 unsigned long inactive;
168 unsigned long free;
169
170 get_zone_counts(&active, &inactive, &free);
171
172 /* The amount of easily available memory */
173 return free + inactive;
174 }
175 EXPORT_SYMBOL(spl_kmem_availrmem);
176
177 size_t
178 vmem_size(vmem_t *vmp, int typemask)
179 {
180 /* Arenas are not supported */
181 ASSERT(vmp == NULL);
182 ASSERT(typemask & (VMEM_ALLOC | VMEM_FREE));
183
184 return 0;
185 }
186 EXPORT_SYMBOL(vmem_size);
187
188
189 /*
190 * Memory allocation interfaces and debugging for basic kmem_*
191 * and vmem_* style memory allocation. When DEBUG_KMEM is enabled
192 * all allocations will be tracked when they are allocated and
193 * freed. When the SPL module is unloaded a list of all leaked
194 * addresses and where they were allocated will be dumped to the
195 * console. Enabling this feature has a significant impact on
196 * performance but it makes finding memory leaks straightforward.
197 */
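
/*
 * A minimal usage sketch of this accounting (the size and flags below
 * are illustrative only, not taken from a particular caller):
 *
 *   void *buf = kmem_alloc(512, KM_SLEEP);    kmem_alloc_used grows by 512
 *   ...
 *   kmem_free(buf, 512);                      kmem_alloc_used shrinks by 512
 *
 * Any allocation not matched by a free leaves kmem_alloc_used non-zero
 * and the leaked call sites are reported by spl_kmem_fini() at unload.
 */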
198 #ifdef DEBUG_KMEM
199 /* Shim layer memory accounting */
200 atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
201 unsigned long long kmem_alloc_max = 0;
202 atomic64_t vmem_alloc_used = ATOMIC64_INIT(0);
203 unsigned long long vmem_alloc_max = 0;
204 int kmem_warning_flag = 1;
205
206 EXPORT_SYMBOL(kmem_alloc_used);
207 EXPORT_SYMBOL(kmem_alloc_max);
208 EXPORT_SYMBOL(vmem_alloc_used);
209 EXPORT_SYMBOL(vmem_alloc_max);
210 EXPORT_SYMBOL(kmem_warning_flag);
211
212 # ifdef DEBUG_KMEM_TRACKING
213
214 /* XXX - Not too surprisingly, with debugging enabled the xmem_locks are very
215 * highly contended particularly on xfree(). If we want to run with this
216 * detailed debugging enabled for anything other than debugging we need to
217 * minimize the contention by moving to a lock per xmem_table entry model.
218 */
219
220 # define KMEM_HASH_BITS 10
221 # define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS)
222
223 # define VMEM_HASH_BITS 10
224 # define VMEM_TABLE_SIZE (1 << VMEM_HASH_BITS)
225
226 typedef struct kmem_debug {
227 struct hlist_node kd_hlist; /* Hash node linkage */
228 struct list_head kd_list; /* List of all allocations */
229 void *kd_addr; /* Allocation pointer */
230 size_t kd_size; /* Allocation size */
231 const char *kd_func; /* Allocation function */
232 int kd_line; /* Allocation line */
233 } kmem_debug_t;
234
235 spinlock_t kmem_lock;
236 struct hlist_head kmem_table[KMEM_TABLE_SIZE];
237 struct list_head kmem_list;
238
239 spinlock_t vmem_lock;
240 struct hlist_head vmem_table[VMEM_TABLE_SIZE];
241 struct list_head vmem_list;
242
243 EXPORT_SYMBOL(kmem_lock);
244 EXPORT_SYMBOL(kmem_table);
245 EXPORT_SYMBOL(kmem_list);
246
247 EXPORT_SYMBOL(vmem_lock);
248 EXPORT_SYMBOL(vmem_table);
249 EXPORT_SYMBOL(vmem_list);
250 # endif
251
252 int kmem_set_warning(int flag) { return (kmem_warning_flag = !!flag); }
253 #else
254 int kmem_set_warning(int flag) { return 0; }
255 #endif
256 EXPORT_SYMBOL(kmem_set_warning);
257
258 /*
259 * Slab allocation interfaces
260 *
261 * While the Linux slab implementation was inspired by the Solaris
262 * implementation I cannot use it to emulate the Solaris APIs. I
263 * require two features which are not provided by the Linux slab.
264 *
265 * 1) Constructors AND destructors. Recent versions of the Linux
266 * kernel have removed support for destructors. This is a deal
267 * breaker for the SPL which contains particularly expensive
268 * initializers for mutexes, condition variables, etc. We also
269 * require a minimal level of cleanup for these data types, unlike
270 * many Linux data types which do not need to be explicitly destroyed.
271 *
272 * 2) Virtual address space backed slab. Callers of the Solaris slab
273 * expect it to work well for both small and very large allocations.
274 * Because of memory fragmentation the Linux slab which is backed
275 * by kmalloc'ed memory performs very badly when confronted with
276 * large numbers of large allocations. Basing the slab on the
277 * virtual address space removes the need for contiguous pages
278 * and greatly improves performance for large allocations.
279 *
280 * For these reasons, the SPL has its own slab implementation with
281 * the needed features. It is not as highly optimized as either the
282 * Solaris or Linux slabs, but it should get me most of what is
283 * needed until it can be optimized or obsoleted by another approach.
284 *
285 * One serious concern I do have about this method is the relatively
286 * small virtual address space on 32bit arches. This will seriously
287 * constrain the size of the slab caches and their performance.
288 *
289 * XXX: Improve the partial slab list by carefully maintaining a
290 * strict ordering of fullest to emptiest slabs based on
291 * the slab reference count. This guarantees that when freeing
292 * slabs back to the system we need only linearly traverse the
293 * last N slabs in the list to discover all the freeable slabs.
294 *
295 * XXX: NUMA awareness for optionally allocating memory close to a
296 * particular core. This can be advantageous if you know the slab
297 * object will be short lived and primarily accessed from one core.
298 *
299 * XXX: Slab coloring may also yield performance improvements and would
300 * be desirable to implement.
301 */
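
/*
 * A minimal life-cycle sketch of the interfaces implemented below (the
 * cache name, object type, and constructor/destructor names here are
 * hypothetical placeholders, not part of this file):
 *
 *   spl_kmem_cache_t *skc;
 *   my_obj_t *obj;
 *
 *   skc = spl_kmem_cache_create("my_cache", sizeof(my_obj_t), 0,
 *                               my_ctor, my_dtor, NULL, NULL, NULL, 0);
 *   obj = spl_kmem_cache_alloc(skc, KM_SLEEP);
 *   ...
 *   spl_kmem_cache_free(skc, obj);
 *   spl_kmem_cache_destroy(skc);
 */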
302
303 struct list_head spl_kmem_cache_list; /* List of caches */
304 struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
305
306 static int spl_cache_flush(spl_kmem_cache_t *skc,
307 spl_kmem_magazine_t *skm, int flush);
308
309 #ifdef HAVE_SET_SHRINKER
310 static struct shrinker *spl_kmem_cache_shrinker;
311 #else
312 static int spl_kmem_cache_generic_shrinker(int nr_to_scan,
313 unsigned int gfp_mask);
314 static struct shrinker spl_kmem_cache_shrinker = {
315 .shrink = spl_kmem_cache_generic_shrinker,
316 .seeks = KMC_DEFAULT_SEEKS,
317 };
318 #endif
319
320 #ifdef DEBUG_KMEM
321 # ifdef DEBUG_KMEM_TRACKING
322
323 static kmem_debug_t *
324 kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits,
325 void *addr)
326 {
327 struct hlist_head *head;
328 struct hlist_node *node;
329 struct kmem_debug *p;
330 unsigned long flags;
331 ENTRY;
332
333 spin_lock_irqsave(lock, flags);
334
335 head = &table[hash_ptr(addr, bits)];
336 hlist_for_each_entry_rcu(p, node, head, kd_hlist) {
337 if (p->kd_addr == addr) {
338 hlist_del_init(&p->kd_hlist);
339 list_del_init(&p->kd_list);
340 spin_unlock_irqrestore(lock, flags);
341 return p;
342 }
343 }
344
345 spin_unlock_irqrestore(lock, flags);
346
347 RETURN(NULL);
348 }
349
350 void *
351 kmem_alloc_track(size_t size, int flags, const char *func, int line,
352 int node_alloc, int node)
353 {
354 void *ptr = NULL;
355 kmem_debug_t *dptr;
356 unsigned long irq_flags;
357 ENTRY;
358
359 dptr = (kmem_debug_t *) kmalloc(sizeof(kmem_debug_t),
360 flags & ~__GFP_ZERO);
361
362 if (dptr == NULL) {
363 CWARN("kmem_alloc(%ld, 0x%x) debug failed\n",
364 sizeof(kmem_debug_t), flags);
365 } else {
366 /* Marked unlikely because we should never be doing this,
367 * we tolerate up to 2 pages but a single page is best. */
368 if (unlikely((size) > (PAGE_SIZE * 2)) && kmem_warning_flag)
369 CWARN("Large kmem_alloc(%llu, 0x%x) (%lld/%llu)\n",
370 (unsigned long long) size, flags,
371 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
372
373 /* We use kstrdup() below because the string pointed to by
374 * __FUNCTION__ might not be available by the time we want
375 * to print it since the module might have been unloaded. */
376 dptr->kd_func = kstrdup(func, flags & ~__GFP_ZERO);
377 if (unlikely(dptr->kd_func == NULL)) {
378 kfree(dptr);
379 CWARN("kstrdup() failed in kmem_alloc(%llu, 0x%x) "
380 "(%lld/%llu)\n", (unsigned long long) size, flags,
381 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
382 goto out;
383 }
384
385 /* Use the correct allocator */
386 if (node_alloc) {
387 ASSERT(!(flags & __GFP_ZERO));
388 ptr = kmalloc_node(size, flags, node);
389 } else if (flags & __GFP_ZERO) {
390 ptr = kzalloc(size, flags & ~__GFP_ZERO);
391 } else {
392 ptr = kmalloc(size, flags);
393 }
394
395 if (unlikely(ptr == NULL)) {
396 kfree(dptr->kd_func);
397 kfree(dptr);
398 CWARN("kmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
399 (unsigned long long) size, flags,
400 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
401 goto out;
402 }
403
404 atomic64_add(size, &kmem_alloc_used);
405 if (unlikely(atomic64_read(&kmem_alloc_used) >
406 kmem_alloc_max))
407 kmem_alloc_max =
408 atomic64_read(&kmem_alloc_used);
409
410 INIT_HLIST_NODE(&dptr->kd_hlist);
411 INIT_LIST_HEAD(&dptr->kd_list);
412
413 dptr->kd_addr = ptr;
414 dptr->kd_size = size;
415 dptr->kd_line = line;
416
417 spin_lock_irqsave(&kmem_lock, irq_flags);
418 hlist_add_head_rcu(&dptr->kd_hlist,
419 &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
420 list_add_tail(&dptr->kd_list, &kmem_list);
421 spin_unlock_irqrestore(&kmem_lock, irq_flags);
422
423 CDEBUG_LIMIT(D_INFO, "kmem_alloc(%llu, 0x%x) = %p "
424 "(%lld/%llu)\n", (unsigned long long) size, flags,
425 ptr, atomic64_read(&kmem_alloc_used),
426 kmem_alloc_max);
427 }
428 out:
429 RETURN(ptr);
430 }
431 EXPORT_SYMBOL(kmem_alloc_track);
432
433 void
434 kmem_free_track(void *ptr, size_t size)
435 {
436 kmem_debug_t *dptr;
437 ENTRY;
438
439 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
440 (unsigned long long) size);
441
442 dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
443
444 ASSERT(dptr); /* Must exist in hash due to kmem_alloc() */
445
446 /* Size must match */
447 ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
448 "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
449 (unsigned long long) size, dptr->kd_func, dptr->kd_line);
450
451 atomic64_sub(size, &kmem_alloc_used);
452
453 CDEBUG_LIMIT(D_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
454 (unsigned long long) size, atomic64_read(&kmem_alloc_used),
455 kmem_alloc_max);
456
457 kfree(dptr->kd_func);
458
459 memset(dptr, 0x5a, sizeof(kmem_debug_t));
460 kfree(dptr);
461
462 memset(ptr, 0x5a, size);
463 kfree(ptr);
464
465 EXIT;
466 }
467 EXPORT_SYMBOL(kmem_free_track);
468
469 void *
470 vmem_alloc_track(size_t size, int flags, const char *func, int line)
471 {
472 void *ptr = NULL;
473 kmem_debug_t *dptr;
474 unsigned long irq_flags;
475 ENTRY;
476
477 ASSERT(flags & KM_SLEEP);
478
479 dptr = (kmem_debug_t *) kmalloc(sizeof(kmem_debug_t), flags);
480 if (dptr == NULL) {
481 CWARN("vmem_alloc(%ld, 0x%x) debug failed\n",
482 sizeof(kmem_debug_t), flags);
483 } else {
484 /* We use kstrdup() below because the string pointed to by
485 * __FUNCTION__ might not be available by the time we want
486 * to print it, since the module might have been unloaded. */
487 dptr->kd_func = kstrdup(func, flags & ~__GFP_ZERO);
488 if (unlikely(dptr->kd_func == NULL)) {
489 kfree(dptr);
490 CWARN("kstrdup() failed in vmem_alloc(%llu, 0x%x) "
491 "(%lld/%llu)\n", (unsigned long long) size, flags,
492 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
493 goto out;
494 }
495
496 ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO,
497 PAGE_KERNEL);
498
499 if (unlikely(ptr == NULL)) {
500 kfree(dptr->kd_func);
501 kfree(dptr);
502 CWARN("vmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
503 (unsigned long long) size, flags,
504 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
505 goto out;
506 }
507
508 if (flags & __GFP_ZERO)
509 memset(ptr, 0, size);
510
511 atomic64_add(size, &vmem_alloc_used);
512 if (unlikely(atomic64_read(&vmem_alloc_used) >
513 vmem_alloc_max))
514 vmem_alloc_max =
515 atomic64_read(&vmem_alloc_used);
516
517 INIT_HLIST_NODE(&dptr->kd_hlist);
518 INIT_LIST_HEAD(&dptr->kd_list);
519
520 dptr->kd_addr = ptr;
521 dptr->kd_size = size;
522 dptr->kd_line = line;
523
524 spin_lock_irqsave(&vmem_lock, irq_flags);
525 hlist_add_head_rcu(&dptr->kd_hlist,
526 &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]);
527 list_add_tail(&dptr->kd_list, &vmem_list);
528 spin_unlock_irqrestore(&vmem_lock, irq_flags);
529
530 CDEBUG_LIMIT(D_INFO, "vmem_alloc(%llu, 0x%x) = %p "
531 "(%lld/%llu)\n", (unsigned long long) size, flags,
532 ptr, atomic64_read(&vmem_alloc_used),
533 vmem_alloc_max);
534 }
535 out:
536 RETURN(ptr);
537 }
538 EXPORT_SYMBOL(vmem_alloc_track);
539
540 void
541 vmem_free_track(void *ptr, size_t size)
542 {
543 kmem_debug_t *dptr;
544 ENTRY;
545
546 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
547 (unsigned long long) size);
548
549 dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr);
550 ASSERT(dptr); /* Must exist in hash due to vmem_alloc() */
551
552 /* Size must match */
553 ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
554 "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
555 (unsigned long long) size, dptr->kd_func, dptr->kd_line);
556
557 atomic64_sub(size, &vmem_alloc_used);
558 CDEBUG_LIMIT(D_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
559 (unsigned long long) size, atomic64_read(&vmem_alloc_used),
560 vmem_alloc_max);
561
562 kfree(dptr->kd_func);
563
564 memset(dptr, 0x5a, sizeof(kmem_debug_t));
565 kfree(dptr);
566
567 memset(ptr, 0x5a, size);
568 vfree(ptr);
569
570 EXIT;
571 }
572 EXPORT_SYMBOL(vmem_free_track);
573
574 # else /* DEBUG_KMEM_TRACKING */
575
576 void *
577 kmem_alloc_debug(size_t size, int flags, const char *func, int line,
578 int node_alloc, int node)
579 {
580 void *ptr;
581 ENTRY;
582
583 /* Marked unlikely because we should never be doing this,
584 * we tolerate up to 2 pages but a single page is best. */
585 if (unlikely(size > (PAGE_SIZE * 2)) && kmem_warning_flag)
586 CWARN("Large kmem_alloc(%llu, 0x%x) (%lld/%llu)\n",
587 (unsigned long long) size, flags,
588 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
589
590 /* Use the correct allocator */
591 if (node_alloc) {
592 ASSERT(!(flags & __GFP_ZERO));
593 ptr = kmalloc_node(size, flags, node);
594 } else if (flags & __GFP_ZERO) {
595 ptr = kzalloc(size, flags & (~__GFP_ZERO));
596 } else {
597 ptr = kmalloc(size, flags);
598 }
599
600 if (ptr == NULL) {
601 CWARN("kmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
602 (unsigned long long) size, flags,
603 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
604 } else {
605 atomic64_add(size, &kmem_alloc_used);
606 if (unlikely(atomic64_read(&kmem_alloc_used) > kmem_alloc_max))
607 kmem_alloc_max = atomic64_read(&kmem_alloc_used);
608
609 CDEBUG_LIMIT(D_INFO, "kmem_alloc(%llu, 0x%x) = %p "
610 "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
611 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
612 }
613 RETURN(ptr);
614 }
615 EXPORT_SYMBOL(kmem_alloc_debug);
616
617 void
618 kmem_free_debug(void *ptr, size_t size)
619 {
620 ENTRY;
621
622 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
623 (unsigned long long) size);
624
625 atomic64_sub(size, &kmem_alloc_used);
626
627 CDEBUG_LIMIT(D_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
628 (unsigned long long) size, atomic64_read(&kmem_alloc_used),
629 kmem_alloc_max);
630
631 memset(ptr, 0x5a, size);
632 kfree(ptr);
633
634 EXIT;
635 }
636 EXPORT_SYMBOL(kmem_free_debug);
637
638 void *
639 vmem_alloc_debug(size_t size, int flags, const char *func, int line)
640 {
641 void *ptr;
642 ENTRY;
643
644 ASSERT(flags & KM_SLEEP);
645
646 ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO,
647 PAGE_KERNEL);
648 if (ptr == NULL) {
649 CWARN("vmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
650 (unsigned long long) size, flags,
651 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
652 } else {
653 if (flags & __GFP_ZERO)
654 memset(ptr, 0, size);
655
656 atomic64_add(size, &vmem_alloc_used);
657
658 if (unlikely(atomic64_read(&vmem_alloc_used) > vmem_alloc_max))
659 vmem_alloc_max = atomic64_read(&vmem_alloc_used);
660
661 CDEBUG_LIMIT(D_INFO, "vmem_alloc(%llu, 0x%x) = %p "
662 "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
663 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
664 }
665
666 RETURN(ptr);
667 }
668 EXPORT_SYMBOL(vmem_alloc_debug);
669
670 void
671 vmem_free_debug(void *ptr, size_t size)
672 {
673 ENTRY;
674
675 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
676 (unsigned long long) size);
677
678 atomic64_sub(size, &vmem_alloc_used);
679
680 CDEBUG_LIMIT(D_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
681 (unsigned long long) size, atomic64_read(&vmem_alloc_used),
682 vmem_alloc_max);
683
684 memset(ptr, 0x5a, size);
685 vfree(ptr);
686
687 EXIT;
688 }
689 EXPORT_SYMBOL(vmem_free_debug);
690
691 # endif /* DEBUG_KMEM_TRACKING */
692 #endif /* DEBUG_KMEM */
693
694 static void *
695 kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
696 {
697 void *ptr;
698
699 if (skc->skc_flags & KMC_KMEM) {
700 if (size > (2 * PAGE_SIZE)) {
701 ptr = (void *)__get_free_pages(flags, get_order(size));
702 } else
703 ptr = kmem_alloc(size, flags);
704 } else {
705 ptr = vmem_alloc(size, flags);
706 }
707
708 return ptr;
709 }
710
711 static void
712 kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
713 {
714 if (skc->skc_flags & KMC_KMEM) {
715 if (size > (2 * PAGE_SIZE))
716 free_pages((unsigned long)ptr, get_order(size));
717 else
718 kmem_free(ptr, size);
719 } else {
720 vmem_free(ptr, size);
721 }
722 }
723
724 /*
725 * It's important that we pack the spl_kmem_obj_t structure and the
726 * actual objects into one large address space to minimize the number
727 * of calls to the allocator. It is far better to do a few large
728 * allocations and then subdivide them ourselves. Which allocator
729 * we use requires balancing a few trade-offs.
730 *
731 * For small objects we use kmem_alloc() because as long as you are
732 * only requesting a small number of pages (ideally just one) its cheap.
733 * However, when you start requesting multiple pages with kmem_alloc()
734 * it gets increasingly expensive since it requires contiguous pages.
735 * For this reason we shift to vmem_alloc() for slabs of large objects
736 * which removes the need for contiguous pages. We do not use
737 * vmem_alloc() in all cases because there is significant locking
738 * overhead in __get_vm_area_node(). This function takes a single
739 * global lock when acquiring an available virtual address range which
740 * serializes all vmem_alloc()'s for all slab caches. Using slightly
741 * different allocation functions for small and large objects should
742 * give us the best of both worlds.
743 *
744 * KMC_ONSLAB KMC_OFFSLAB
745 *
746 * +------------------------+ +-----------------+
747 * | spl_kmem_slab_t --+-+ | | spl_kmem_slab_t |---+-+
748 * | skc_obj_size <-+ | | +-----------------+ | |
749 * | spl_kmem_obj_t | | | |
750 * | skc_obj_size <---+ | +-----------------+ | |
751 * | spl_kmem_obj_t | | | skc_obj_size | <-+ |
752 * | ... v | | spl_kmem_obj_t | |
753 * +------------------------+ +-----------------+ v
754 */
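
/*
 * In the KMC_ONSLAB case the per-object address arithmetic used by
 * spl_slab_alloc() and spl_cache_shrink() works out as in this sketch,
 * which simply restates the code that follows:
 *
 *   align = skc->skc_obj_align;
 *   size  = P2ROUNDUP(skc->skc_obj_size, align) +
 *           P2ROUNDUP(sizeof(spl_kmem_obj_t), align);
 *   obj   = base + P2ROUNDUP(sizeof(spl_kmem_slab_t), align) + i * size;
 *   sko   = obj + P2ROUNDUP(skc->skc_obj_size, align);
 *
 * so each object is immediately followed by its spl_kmem_obj_t header.
 */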
755 static spl_kmem_slab_t *
756 spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
757 {
758 spl_kmem_slab_t *sks;
759 spl_kmem_obj_t *sko, *n;
760 void *base, *obj;
761 int i, align, size, rc = 0;
762
763 base = kv_alloc(skc, skc->skc_slab_size, flags);
764 if (base == NULL)
765 RETURN(NULL);
766
767 sks = (spl_kmem_slab_t *)base;
768 sks->sks_magic = SKS_MAGIC;
769 sks->sks_objs = skc->skc_slab_objs;
770 sks->sks_age = jiffies;
771 sks->sks_cache = skc;
772 INIT_LIST_HEAD(&sks->sks_list);
773 INIT_LIST_HEAD(&sks->sks_free_list);
774 sks->sks_ref = 0;
775
776 align = skc->skc_obj_align;
777 size = P2ROUNDUP(skc->skc_obj_size, align) +
778 P2ROUNDUP(sizeof(spl_kmem_obj_t), align);
779
780 for (i = 0; i < sks->sks_objs; i++) {
781 if (skc->skc_flags & KMC_OFFSLAB) {
782 obj = kv_alloc(skc, size, flags);
783 if (!obj)
784 GOTO(out, rc = -ENOMEM);
785 } else {
786 obj = base +
787 P2ROUNDUP(sizeof(spl_kmem_slab_t), align) +
788 (i * size);
789 }
790
791 sko = obj + P2ROUNDUP(skc->skc_obj_size, align);
792 sko->sko_addr = obj;
793 sko->sko_magic = SKO_MAGIC;
794 sko->sko_slab = sks;
795 INIT_LIST_HEAD(&sko->sko_list);
796 list_add_tail(&sko->sko_list, &sks->sks_free_list);
797 }
798
799 list_for_each_entry(sko, &sks->sks_free_list, sko_list)
800 if (skc->skc_ctor)
801 skc->skc_ctor(sko->sko_addr, skc->skc_private, flags);
802 out:
803 if (rc) {
804 if (skc->skc_flags & KMC_OFFSLAB)
805 list_for_each_entry_safe(sko, n, &sks->sks_free_list,
806 sko_list)
807 kv_free(skc, sko->sko_addr, size);
808
809 kv_free(skc, base, skc->skc_slab_size);
810 sks = NULL;
811 }
812
813 RETURN(sks);
814 }
815
816 /*
817 * Remove a slab from the complete or partial list. This must be called
818 * with the 'skc->skc_lock' held, but the actual free must be performed
819 * outside the lock to prevent deadlocking on vmem addresses.
820 */
821 static void
822 spl_slab_free(spl_kmem_slab_t *sks,
823 struct list_head *sks_list, struct list_head *sko_list)
824 {
825 spl_kmem_cache_t *skc;
826 spl_kmem_obj_t *sko, *n;
827 ENTRY;
828
829 ASSERT(sks->sks_magic == SKS_MAGIC);
830 ASSERT(sks->sks_ref == 0);
831
832 skc = sks->sks_cache;
833 ASSERT(skc->skc_magic == SKC_MAGIC);
834 ASSERT(spin_is_locked(&skc->skc_lock));
835
836 skc->skc_obj_total -= sks->sks_objs;
837 skc->skc_slab_total--;
838 list_del(&sks->sks_list);
839
840 /* Run destructors since the slab is being released */
841 list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) {
842 ASSERT(sko->sko_magic == SKO_MAGIC);
843 list_del(&sko->sko_list);
844
845 if (skc->skc_dtor)
846 skc->skc_dtor(sko->sko_addr, skc->skc_private);
847
848 if (skc->skc_flags & KMC_OFFSLAB)
849 list_add(&sko->sko_list, sko_list);
850 }
851
852 list_add(&sks->sks_list, sks_list);
853 EXIT;
854 }
855
856 /*
857 * Traverses all the partial slabs attached to a cache and frees those
858 * which are currently empty and have not been touched for
859 * skc_delay seconds to avoid thrashing. The count argument is
860 * passed to optionally cap the number of slabs reclaimed; a count
861 * of zero means try to reclaim everything. When flag is set we
862 * always free an available slab regardless of age.
863 */
864 static void
865 spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
866 {
867 spl_kmem_slab_t *sks, *m;
868 spl_kmem_obj_t *sko, *n;
869 LIST_HEAD(sks_list);
870 LIST_HEAD(sko_list);
871 int size, i = 0;
872 ENTRY;
873
874 /*
875 * Move empty slabs and objects which have not been touched in
876 * skc_delay seconds onto private lists to be freed outside
877 * the spin lock. This delay time is important to avoid
878 * thrashing; however, when flag is set the delay will not be
879 * used. Empty slabs will be at the end of the skc_partial_list.
880 */
881 spin_lock(&skc->skc_lock);
882 list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list,
883 sks_list) {
884 /* Release at most count slabs */
885 if (count && i > count)
886 break;
887
888 /* Skip active slabs */
889 if (sks->sks_ref > 0)
890 continue;
891
892 if (time_after(jiffies,sks->sks_age+skc->skc_delay*HZ)||flag) {
893 spl_slab_free(sks, &sks_list, &sko_list);
894 i++;
895 }
896 }
897 spin_unlock(&skc->skc_lock);
898
899 /*
900 * We only have a list of spl_kmem_obj_t's if they are located off
901 * the slab; otherwise they get freed with the spl_kmem_slab_t.
902 */
903 if (!list_empty(&sko_list)) {
904 ASSERT(skc->skc_flags & KMC_OFFSLAB);
905
906 size = P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align) +
907 P2ROUNDUP(sizeof(spl_kmem_obj_t), skc->skc_obj_align);
908
909 /* To avoid soft lockups conditionally reschedule */
910 list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
911 kv_free(skc, sko->sko_addr, size);
912 cond_resched();
913 }
914 }
915
916 /* To avoid soft lockups conditionally reschedule */
917 list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
918 kv_free(skc, sks, skc->skc_slab_size);
919 cond_resched();
920 }
921
922 EXIT;
923 }
924
925 /*
926 * Called regularly on all caches to age objects out of the magazines
927 * which have not been accessed in skc->skc_delay seconds. This prevents
928 * idle magazines from holding memory which might be better used by
929 * other caches or parts of the system. The delay is present to
930 * prevent thrashing the magazine.
931 */
932 static void
933 spl_magazine_age(void *data)
934 {
935 spl_kmem_cache_t *skc = data;
936 spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
937
938 if (skm->skm_avail > 0 &&
939 time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
940 (void)spl_cache_flush(skc, skm, skm->skm_refill);
941 }
942
943 /*
944 * Called regularly to keep a downward pressure on the size of idle
945 * magazines and to release free slabs from the cache. This function
946 * never calls the registered reclaim function; that only occurs
947 * under memory pressure or with a direct call to spl_kmem_reap().
948 */
949 static void
950 spl_cache_age(void *data)
951 {
952 spl_kmem_cache_t *skc =
953 spl_get_work_data(data, spl_kmem_cache_t, skc_work.work);
954
955 ASSERT(skc->skc_magic == SKC_MAGIC);
956 spl_slab_reclaim(skc, skc->skc_reap, 0);
957 spl_on_each_cpu(spl_magazine_age, skc, 0);
958
959 if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags))
960 schedule_delayed_work(&skc->skc_work, skc->skc_delay / 3 * HZ);
961 }
962
963 /*
964 * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
965 * When on-slab we want to target SPL_KMEM_CACHE_OBJ_PER_SLAB. However,
966 * for very small objects we may end up with more than this so as not
967 * to waste space in the minimal allocation of a single page. Also for
968 * very large objects we may use as few as SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN,
969 * below which we will fail.
970 */
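
/*
 * Worked example with illustrative numbers only (4 KiB pages, 8 byte
 * alignment, and assumed 64 byte spl_kmem_slab_t and 32 byte
 * spl_kmem_obj_t sizes, which are not the exact structure sizes): a
 * 512 byte on-slab object costs 512 + 32 = 544 bytes, so a single page
 * slab holds (4096 - 64) / 544 = 7 objects. The loop below keeps
 * adding pages until *objs reaches SPL_KMEM_CACHE_OBJ_PER_SLAB or the
 * slab reaches max_size.
 */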
971 static int
972 spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
973 {
974 int sks_size, obj_size, max_size, align;
975
976 if (skc->skc_flags & KMC_OFFSLAB) {
977 *objs = SPL_KMEM_CACHE_OBJ_PER_SLAB;
978 *size = sizeof(spl_kmem_slab_t);
979 } else {
980 align = skc->skc_obj_align;
981 sks_size = P2ROUNDUP(sizeof(spl_kmem_slab_t), align);
982 obj_size = P2ROUNDUP(skc->skc_obj_size, align) +
983 P2ROUNDUP(sizeof(spl_kmem_obj_t), align);
984
985 if (skc->skc_flags & KMC_KMEM)
986 max_size = ((uint64_t)1 << (MAX_ORDER-1)) * PAGE_SIZE;
987 else
988 max_size = (32 * 1024 * 1024);
989
990 for (*size = PAGE_SIZE; *size <= max_size; *size += PAGE_SIZE) {
991 *objs = (*size - sks_size) / obj_size;
992 if (*objs >= SPL_KMEM_CACHE_OBJ_PER_SLAB)
993 RETURN(0);
994 }
995
996 /*
997 * Unable to satisfy the target objects per slab, fall back to
998 * allocating a maximally sized slab and use it if it can hold
999 * at least the minimum object count. If not, fail.
1000 */
1001 *size = max_size;
1002 *objs = (*size - sks_size) / obj_size;
1003 if (*objs >= SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN)
1004 RETURN(0);
1005 }
1006
1007 RETURN(-ENOSPC);
1008 }
1009
1010 /*
1011 * Make a guess at reasonable per-cpu magazine size based on the size of
1012 * each object and the cost of caching N of them in each magazine. Long
1013 * term this should really adapt based on an observed usage heuristic.
1014 */
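
/*
 * For example (assuming 4 KiB pages): a 2 KiB object falls into the
 * "> PAGE_SIZE / 4" bucket below, so each per-cpu magazine holds up to
 * 128 objects, i.e. as much as 256 KiB of cached objects per CPU when
 * the magazine is full.
 */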
1015 static int
1016 spl_magazine_size(spl_kmem_cache_t *skc)
1017 {
1018 int size, align = skc->skc_obj_align;
1019 ENTRY;
1020
1021 /* Per-magazine sizes below assume a 4Kib page size */
1022 if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE * 256))
1023 size = 4; /* Minimum 4Mib per-magazine */
1024 else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE * 32))
1025 size = 16; /* Minimum 2Mib per-magazine */
1026 else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE))
1027 size = 64; /* Minimum 256Kib per-magazine */
1028 else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE / 4))
1029 size = 128; /* Minimum 128Kib per-magazine */
1030 else
1031 size = 256;
1032
1033 RETURN(size);
1034 }
1035
1036 /*
1037 * Allocate a per-cpu magazine to associate with a specific core.
1038 */
1039 static spl_kmem_magazine_t *
1040 spl_magazine_alloc(spl_kmem_cache_t *skc, int node)
1041 {
1042 spl_kmem_magazine_t *skm;
1043 int size = sizeof(spl_kmem_magazine_t) +
1044 sizeof(void *) * skc->skc_mag_size;
1045 ENTRY;
1046
1047 skm = kmem_alloc_node(size, GFP_KERNEL | __GFP_NOFAIL, node);
1048 if (skm) {
1049 skm->skm_magic = SKM_MAGIC;
1050 skm->skm_avail = 0;
1051 skm->skm_size = skc->skc_mag_size;
1052 skm->skm_refill = skc->skc_mag_refill;
1053 skm->skm_age = jiffies;
1054 }
1055
1056 RETURN(skm);
1057 }
1058
1059 /*
1060 * Free a per-cpu magazine associated with a specific core.
1061 */
1062 static void
1063 spl_magazine_free(spl_kmem_magazine_t *skm)
1064 {
1065 int size = sizeof(spl_kmem_magazine_t) +
1066 sizeof(void *) * skm->skm_size;
1067
1068 ENTRY;
1069 ASSERT(skm->skm_magic == SKM_MAGIC);
1070 ASSERT(skm->skm_avail == 0);
1071
1072 kmem_free(skm, size);
1073 EXIT;
1074 }
1075
1076 /*
1077 * Create all per-cpu magazines of reasonable sizes.
1078 */
1079 static int
1080 spl_magazine_create(spl_kmem_cache_t *skc)
1081 {
1082 int i;
1083 ENTRY;
1084
1085 skc->skc_mag_size = spl_magazine_size(skc);
1086 skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
1087
1088 for_each_online_cpu(i) {
1089 skc->skc_mag[i] = spl_magazine_alloc(skc, cpu_to_node(i));
1090 if (!skc->skc_mag[i]) {
1091 for (i--; i >= 0; i--)
1092 spl_magazine_free(skc->skc_mag[i]);
1093
1094 RETURN(-ENOMEM);
1095 }
1096 }
1097
1098 RETURN(0);
1099 }
1100
1101 /*
1102 * Destroy all per-cpu magazines.
1103 */
1104 static void
1105 spl_magazine_destroy(spl_kmem_cache_t *skc)
1106 {
1107 spl_kmem_magazine_t *skm;
1108 int i;
1109 ENTRY;
1110
1111 for_each_online_cpu(i) {
1112 skm = skc->skc_mag[i];
1113 (void)spl_cache_flush(skc, skm, skm->skm_avail);
1114 spl_magazine_free(skm);
1115 }
1116
1117 EXIT;
1118 }
1119
1120 /*
1121 * Create an object cache based on the following arguments:
1122 * name cache name
1123 * size cache object size
1124 * align cache object alignment
1125 * ctor cache object constructor
1126 * dtor cache object destructor
1127 * reclaim cache object reclaim
1128 * priv cache private data for ctor/dtor/reclaim
1129 * vmp unused, must be NULL
1130 * flags
1131 * KMC_NOTOUCH Disable cache object aging (unsupported)
1132 * KMC_NODEBUG Disable debugging (unsupported)
1133 * KMC_NOMAGAZINE Disable magazine (unsupported)
1134 * KMC_NOHASH Disable hashing (unsupported)
1135 * KMC_QCACHE Disable qcache (unsupported)
1136 * KMC_KMEM Force kmem backed cache
1137 * KMC_VMEM Force vmem backed cache
1138 * KMC_OFFSLAB Locate objects off the slab
1139 */
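
/*
 * For example, a cache of large buffers might be created vmem backed
 * and off-slab (a hedged sketch; the cache name and object size below
 * are arbitrary and not taken from this file):
 *
 *   skc = spl_kmem_cache_create("big_buf_cache", 128 * 1024, 0,
 *                               NULL, NULL, NULL, NULL, NULL,
 *                               KMC_VMEM | KMC_OFFSLAB);
 */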
1140 spl_kmem_cache_t *
1141 spl_kmem_cache_create(char *name, size_t size, size_t align,
1142 spl_kmem_ctor_t ctor,
1143 spl_kmem_dtor_t dtor,
1144 spl_kmem_reclaim_t reclaim,
1145 void *priv, void *vmp, int flags)
1146 {
1147 spl_kmem_cache_t *skc;
1148 int rc, kmem_flags = KM_SLEEP;
1149 ENTRY;
1150
1151 ASSERTF(!(flags & KMC_NOMAGAZINE), "Bad KMC_NOMAGAZINE (%x)\n", flags);
1152 ASSERTF(!(flags & KMC_NOHASH), "Bad KMC_NOHASH (%x)\n", flags);
1153 ASSERTF(!(flags & KMC_QCACHE), "Bad KMC_QCACHE (%x)\n", flags);
1154 ASSERT(vmp == NULL);
1155
1156 /* We may be called when there is a non-zero preempt_count or
1157 * interrupts are disabled in which case we must not sleep.
1158 */
1159 if (current_thread_info()->preempt_count || irqs_disabled())
1160 kmem_flags = KM_NOSLEEP;
1161
1162 /* Allocate new cache memory and initialize. */
1163 skc = (spl_kmem_cache_t *)kmem_zalloc(sizeof(*skc), kmem_flags);
1164 if (skc == NULL)
1165 RETURN(NULL);
1166
1167 skc->skc_magic = SKC_MAGIC;
1168 skc->skc_name_size = strlen(name) + 1;
1169 skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, kmem_flags);
1170 if (skc->skc_name == NULL) {
1171 kmem_free(skc, sizeof(*skc));
1172 RETURN(NULL);
1173 }
1174 strncpy(skc->skc_name, name, skc->skc_name_size);
1175
1176 skc->skc_ctor = ctor;
1177 skc->skc_dtor = dtor;
1178 skc->skc_reclaim = reclaim;
1179 skc->skc_private = priv;
1180 skc->skc_vmp = vmp;
1181 skc->skc_flags = flags;
1182 skc->skc_obj_size = size;
1183 skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
1184 skc->skc_delay = SPL_KMEM_CACHE_DELAY;
1185 skc->skc_reap = SPL_KMEM_CACHE_REAP;
1186 atomic_set(&skc->skc_ref, 0);
1187
1188 INIT_LIST_HEAD(&skc->skc_list);
1189 INIT_LIST_HEAD(&skc->skc_complete_list);
1190 INIT_LIST_HEAD(&skc->skc_partial_list);
1191 spin_lock_init(&skc->skc_lock);
1192 skc->skc_slab_fail = 0;
1193 skc->skc_slab_create = 0;
1194 skc->skc_slab_destroy = 0;
1195 skc->skc_slab_total = 0;
1196 skc->skc_slab_alloc = 0;
1197 skc->skc_slab_max = 0;
1198 skc->skc_obj_total = 0;
1199 skc->skc_obj_alloc = 0;
1200 skc->skc_obj_max = 0;
1201
1202 if (align) {
1203 ASSERT((align & (align - 1)) == 0); /* Power of two */
1204 ASSERT(align >= SPL_KMEM_CACHE_ALIGN); /* Minimum size */
1205 skc->skc_obj_align = align;
1206 }
1207
1208 /* If none passed select a cache type based on object size */
1209 if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM))) {
1210 if (P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align) <
1211 (PAGE_SIZE / 8)) {
1212 skc->skc_flags |= KMC_KMEM;
1213 } else {
1214 skc->skc_flags |= KMC_VMEM;
1215 }
1216 }
1217
1218 rc = spl_slab_size(skc, &skc->skc_slab_objs, &skc->skc_slab_size);
1219 if (rc)
1220 GOTO(out, rc);
1221
1222 rc = spl_magazine_create(skc);
1223 if (rc)
1224 GOTO(out, rc);
1225
1226 spl_init_delayed_work(&skc->skc_work, spl_cache_age, skc);
1227 schedule_delayed_work(&skc->skc_work, skc->skc_delay / 3 * HZ);
1228
1229 down_write(&spl_kmem_cache_sem);
1230 list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
1231 up_write(&spl_kmem_cache_sem);
1232
1233 RETURN(skc);
1234 out:
1235 kmem_free(skc->skc_name, skc->skc_name_size);
1236 kmem_free(skc, sizeof(*skc));
1237 RETURN(NULL);
1238 }
1239 EXPORT_SYMBOL(spl_kmem_cache_create);
1240
1241 /*
1242 * Destroy a cache and all objects associated with the cache.
1243 */
1244 void
1245 spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
1246 {
1247 DECLARE_WAIT_QUEUE_HEAD(wq);
1248 ENTRY;
1249
1250 ASSERT(skc->skc_magic == SKC_MAGIC);
1251
1252 down_write(&spl_kmem_cache_sem);
1253 list_del_init(&skc->skc_list);
1254 up_write(&spl_kmem_cache_sem);
1255
1256 /* Cancel and wait for any pending delayed work */
1257 ASSERT(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1258 cancel_delayed_work(&skc->skc_work);
1259 flush_scheduled_work();
1260
1261 /* Wait until all current callers complete, this is mainly
1262 * to catch the case where a low memory situation triggers a
1263 * cache reaping action which races with this destroy. */
1264 wait_event(wq, atomic_read(&skc->skc_ref) == 0);
1265
1266 spl_magazine_destroy(skc);
1267 spl_slab_reclaim(skc, 0, 1);
1268 spin_lock(&skc->skc_lock);
1269
1270 /* Validate there are no objects in use and free all the
1271 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
1272 ASSERT3U(skc->skc_slab_alloc, ==, 0);
1273 ASSERT3U(skc->skc_obj_alloc, ==, 0);
1274 ASSERT3U(skc->skc_slab_total, ==, 0);
1275 ASSERT3U(skc->skc_obj_total, ==, 0);
1276 ASSERT(list_empty(&skc->skc_complete_list));
1277
1278 kmem_free(skc->skc_name, skc->skc_name_size);
1279 spin_unlock(&skc->skc_lock);
1280
1281 kmem_free(skc, sizeof(*skc));
1282
1283 EXIT;
1284 }
1285 EXPORT_SYMBOL(spl_kmem_cache_destroy);
1286
1287 /*
1288 * Allocate an object from a slab attached to the cache. This is used to
1289 * repopulate the per-cpu magazine caches in batches when they run low.
1290 */
1291 static void *
1292 spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
1293 {
1294 spl_kmem_obj_t *sko;
1295
1296 ASSERT(skc->skc_magic == SKC_MAGIC);
1297 ASSERT(sks->sks_magic == SKS_MAGIC);
1298 ASSERT(spin_is_locked(&skc->skc_lock));
1299
1300 sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
1301 ASSERT(sko->sko_magic == SKO_MAGIC);
1302 ASSERT(sko->sko_addr != NULL);
1303
1304 /* Remove from sks_free_list */
1305 list_del_init(&sko->sko_list);
1306
1307 sks->sks_age = jiffies;
1308 sks->sks_ref++;
1309 skc->skc_obj_alloc++;
1310
1311 /* Track max obj usage statistics */
1312 if (skc->skc_obj_alloc > skc->skc_obj_max)
1313 skc->skc_obj_max = skc->skc_obj_alloc;
1314
1315 /* Track max slab usage statistics */
1316 if (sks->sks_ref == 1) {
1317 skc->skc_slab_alloc++;
1318
1319 if (skc->skc_slab_alloc > skc->skc_slab_max)
1320 skc->skc_slab_max = skc->skc_slab_alloc;
1321 }
1322
1323 return sko->sko_addr;
1324 }
1325
1326 /*
1327 * No available objects on any slabs, create a new slab. Since this
1328 * is an expensive operation we do it without holding the spinlock and
1329 * only briefly acquire it when we link in the fully allocated and
1330 * constructed slab.
1331 */
1332 static spl_kmem_slab_t *
1333 spl_cache_grow(spl_kmem_cache_t *skc, int flags)
1334 {
1335 spl_kmem_slab_t *sks;
1336 ENTRY;
1337
1338 ASSERT(skc->skc_magic == SKC_MAGIC);
1339 local_irq_enable();
1340 might_sleep();
1341
1342 /*
1343 * Before allocating a new slab check if the slab is being reaped.
1344 * If it is there is a good chance we can wait until it finishes
1345 * and then use one of the newly freed but not aged-out slabs.
1346 */
1347 if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
1348 schedule();
1349 GOTO(out, sks= NULL);
1350 }
1351
1352 /* Allocate a new slab for the cache */
1353 sks = spl_slab_alloc(skc, flags | __GFP_NORETRY | __GFP_NOWARN);
1354 if (sks == NULL)
1355 GOTO(out, sks = NULL);
1356
1357 /* Link the new empty slab in to the end of skc_partial_list. */
1358 spin_lock(&skc->skc_lock);
1359 skc->skc_slab_total++;
1360 skc->skc_obj_total += sks->sks_objs;
1361 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1362 spin_unlock(&skc->skc_lock);
1363 out:
1364 local_irq_disable();
1365
1366 RETURN(sks);
1367 }
1368
1369 /*
1370 * Refill a per-cpu magazine with objects from the slabs for this
1371 * cache. Ideally the magazine can be repopulated using existing
1372 * objects which have been released, however if we are unable to
1373 * locate enough free objects new slabs of objects will be created.
1374 */
1375 static int
1376 spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
1377 {
1378 spl_kmem_slab_t *sks;
1379 int rc = 0, refill;
1380 ENTRY;
1381
1382 ASSERT(skc->skc_magic == SKC_MAGIC);
1383 ASSERT(skm->skm_magic == SKM_MAGIC);
1384
1385 refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
1386 spin_lock(&skc->skc_lock);
1387
1388 while (refill > 0) {
1389 /* No slabs available we may need to grow the cache */
1390 if (list_empty(&skc->skc_partial_list)) {
1391 spin_unlock(&skc->skc_lock);
1392
1393 sks = spl_cache_grow(skc, flags);
1394 if (!sks)
1395 GOTO(out, rc);
1396
1397 /* Rescheduled to a different CPU, skm is no longer local */
1398 if (skm != skc->skc_mag[smp_processor_id()])
1399 GOTO(out, rc);
1400
1401 /* Potentially rescheduled to the same CPU but
1402 * allocations may have occurred from this CPU while
1403 * we were sleeping so recalculate max refill. */
1404 refill = MIN(refill, skm->skm_size - skm->skm_avail);
1405
1406 spin_lock(&skc->skc_lock);
1407 continue;
1408 }
1409
1410 /* Grab the next available slab */
1411 sks = list_entry((&skc->skc_partial_list)->next,
1412 spl_kmem_slab_t, sks_list);
1413 ASSERT(sks->sks_magic == SKS_MAGIC);
1414 ASSERT(sks->sks_ref < sks->sks_objs);
1415 ASSERT(!list_empty(&sks->sks_free_list));
1416
1417 /* Consume as many objects as needed to refill the requested
1418 * cache. We must also be careful not to overfill it. */
1419 while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++rc) {
1420 ASSERT(skm->skm_avail < skm->skm_size);
1421 ASSERT(rc < skm->skm_size);
1422 skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks);
1423 }
1424
1425 /* Move slab to skc_complete_list when full */
1426 if (sks->sks_ref == sks->sks_objs) {
1427 list_del(&sks->sks_list);
1428 list_add(&sks->sks_list, &skc->skc_complete_list);
1429 }
1430 }
1431
1432 spin_unlock(&skc->skc_lock);
1433 out:
1434 /* Returns the number of entries added to cache */
1435 RETURN(rc);
1436 }
1437
1438 /*
1439 * Release an object back to the slab from which it came.
1440 */
1441 static void
1442 spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
1443 {
1444 spl_kmem_slab_t *sks = NULL;
1445 spl_kmem_obj_t *sko = NULL;
1446 ENTRY;
1447
1448 ASSERT(skc->skc_magic == SKC_MAGIC);
1449 ASSERT(spin_is_locked(&skc->skc_lock));
1450
1451 sko = obj + P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align);
1452 ASSERT(sko->sko_magic == SKO_MAGIC);
1453
1454 sks = sko->sko_slab;
1455 ASSERT(sks->sks_magic == SKS_MAGIC);
1456 ASSERT(sks->sks_cache == skc);
1457 list_add(&sko->sko_list, &sks->sks_free_list);
1458
1459 sks->sks_age = jiffies;
1460 sks->sks_ref--;
1461 skc->skc_obj_alloc--;
1462
1463 /* Move slab to skc_partial_list when no longer full. Slabs
1464 * are added to the head to keep the partial list in quasi-full
1465 * sorted order. Fuller at the head, emptier at the tail. */
1466 if (sks->sks_ref == (sks->sks_objs - 1)) {
1467 list_del(&sks->sks_list);
1468 list_add(&sks->sks_list, &skc->skc_partial_list);
1469 }
1470
1471 /* Move empty slabs to the end of the partial list so
1472 * they can be easily found and freed during reclamation. */
1473 if (sks->sks_ref == 0) {
1474 list_del(&sks->sks_list);
1475 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1476 skc->skc_slab_alloc--;
1477 }
1478
1479 EXIT;
1480 }
1481
1482 /*
1483 * Release a batch of objects from a per-cpu magazine back to their
1484 * respective slabs. This occurs when we exceed the magazine size,
1485 * are under memory pressure, when the cache is idle, or during
1486 * cache cleanup. The flush argument contains the number of entries
1487 * to remove from the magazine.
1488 */
1489 static int
1490 spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
1491 {
1492 int i, count = MIN(flush, skm->skm_avail);
1493 ENTRY;
1494
1495 ASSERT(skc->skc_magic == SKC_MAGIC);
1496 ASSERT(skm->skm_magic == SKM_MAGIC);
1497
1498 /*
1499 * XXX: Currently we simply return objects from the magazine to
1500 * the slabs in fifo order. The ideal thing to do from a memory
1501 * fragmentation standpoint is to cheaply determine the set of
1502 * objects in the magazine which will result in the largest
1503 * number of free slabs if released from the magazine.
1504 */
1505 spin_lock(&skc->skc_lock);
1506 for (i = 0; i < count; i++)
1507 spl_cache_shrink(skc, skm->skm_objs[i]);
1508
1509 skm->skm_avail -= count;
1510 memmove(skm->skm_objs, &(skm->skm_objs[count]),
1511 sizeof(void *) * skm->skm_avail);
1512
1513 spin_unlock(&skc->skc_lock);
1514
1515 RETURN(count);
1516 }
1517
1518 /*
1519 * Allocate an object from the per-cpu magazine, or if the magazine
1520 * is empty directly allocate from a slab and repopulate the magazine.
1521 */
1522 void *
1523 spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
1524 {
1525 spl_kmem_magazine_t *skm;
1526 unsigned long irq_flags;
1527 void *obj = NULL;
1528 ENTRY;
1529
1530 ASSERT(skc->skc_magic == SKC_MAGIC);
1531 ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1532 ASSERT(flags & KM_SLEEP);
1533 atomic_inc(&skc->skc_ref);
1534 local_irq_save(irq_flags);
1535
1536 restart:
1537 /* Safe to update per-cpu structure without lock, but
1538 * in the restart case we must be careful to reacquire
1539 * the local magazine since this may have changed
1540 * when we need to grow the cache. */
1541 skm = skc->skc_mag[smp_processor_id()];
1542 ASSERTF(skm->skm_magic == SKM_MAGIC, "%x != %x: %s/%p/%p %x/%x/%x\n",
1543 skm->skm_magic, SKM_MAGIC, skc->skc_name, skc, skm,
1544 skm->skm_size, skm->skm_refill, skm->skm_avail);
1545
1546 if (likely(skm->skm_avail)) {
1547 /* Object available in CPU cache, use it */
1548 obj = skm->skm_objs[--skm->skm_avail];
1549 skm->skm_age = jiffies;
1550 } else {
1551 /* Per-CPU cache empty, directly allocate from
1552 * the slab and refill the per-CPU cache. */
1553 (void)spl_cache_refill(skc, skm, flags);
1554 GOTO(restart, obj = NULL);
1555 }
1556
1557 local_irq_restore(irq_flags);
1558 ASSERT(obj);
1559 ASSERT(((unsigned long)(obj) % skc->skc_obj_align) == 0);
1560
1561 /* Pre-emptively migrate object to CPU L1 cache */
1562 prefetchw(obj);
1563 atomic_dec(&skc->skc_ref);
1564
1565 RETURN(obj);
1566 }
1567 EXPORT_SYMBOL(spl_kmem_cache_alloc);
1568
1569 /*
1570 * Free an object back to the local per-cpu magazine, there is no
1571 * guarantee that this is the same magazine the object was originally
1572 * allocated from. We may need to flush entries from the magazine
1573 * back to the slabs to make space.
1574 */
1575 void
1576 spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
1577 {
1578 spl_kmem_magazine_t *skm;
1579 unsigned long flags;
1580 ENTRY;
1581
1582 ASSERT(skc->skc_magic == SKC_MAGIC);
1583 ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1584 atomic_inc(&skc->skc_ref);
1585 local_irq_save(flags);
1586
1587 /* Safe to update per-cpu structure without lock, but
1588 * since no remote memory allocation tracking is being performed,
1589 * it is entirely possible to allocate an object from one
1590 * CPU cache and return it to another. */
1591 skm = skc->skc_mag[smp_processor_id()];
1592 ASSERT(skm->skm_magic == SKM_MAGIC);
1593
1594 /* Per-CPU cache full, flush it to make space */
1595 if (unlikely(skm->skm_avail >= skm->skm_size))
1596 (void)spl_cache_flush(skc, skm, skm->skm_refill);
1597
1598 /* Available space in cache, use it */
1599 skm->skm_objs[skm->skm_avail++] = obj;
1600
1601 local_irq_restore(flags);
1602 atomic_dec(&skc->skc_ref);
1603
1604 EXIT;
1605 }
1606 EXPORT_SYMBOL(spl_kmem_cache_free);
1607
1608 /*
1609 * The generic shrinker function for all caches. Under linux a shrinker
1610 * may not be tightly coupled with a slab cache. In fact Linux always
1611 * systematically tries calling all registered shrinker callbacks which
1612 * report that they contain unused objects. Because of this we only
1613 * register one shrinker function in the shim layer for all slab caches.
1614 * We always attempt to shrink all caches when this generic shrinker
1615 * is called. The shrinker should return the number of free objects
1616 * in the cache when called with nr_to_scan == 0 but not attempt to
1617 * free any objects. When nr_to_scan > 0 it is a request that nr_to_scan
1618 * objects should be freed; because Solaris semantics are to free
1619 * all available objects, we may free more objects than requested.
1620 */
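
/*
 * A hedged sketch of the effective call sequence the VM makes (the
 * exact batching is up to the kernel's shrink_slab() logic):
 *
 *   n = spl_kmem_cache_generic_shrinker(0, GFP_KERNEL);
 *   if (n > 0)
 *           (void)spl_kmem_cache_generic_shrinker(n, GFP_KERNEL);
 *
 * spl_kmem_reap() below calls the same entry point directly with a
 * fixed KMC_REAP_CHUNK batch size.
 */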
1621 static int
1622 spl_kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask)
1623 {
1624 spl_kmem_cache_t *skc;
1625 int unused = 0;
1626
1627 down_read(&spl_kmem_cache_sem);
1628 list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
1629 if (nr_to_scan)
1630 spl_kmem_cache_reap_now(skc);
1631
1632 /*
1633 * Presume everything alloc'ed is reclaimable; this ensures
1634 * we are called again with nr_to_scan > 0 so we can try to
1635 * reclaim. The exact number is not important either, so
1636 * we forgo taking the already highly contended lock.
1637 */
1638 unused += skc->skc_obj_alloc;
1639 }
1640 up_read(&spl_kmem_cache_sem);
1641
1642 return (unused * sysctl_vfs_cache_pressure) / 100;
1643 }
1644
1645 /*
1646 * Call the registered reclaim function for a cache. Depending on how
1647 * many and which objects are released it may simply repopulate the
1648 * local magazine which will then need to age-out. Objects which cannot
1649 * fit in the magazine will be released back to their slabs, which will
1650 * also need to age out before being released. This is all just best
1651 * effort and we do not want to thrash creating and destroying slabs.
1652 */
1653 void
1654 spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
1655 {
1656 ENTRY;
1657
1658 ASSERT(skc->skc_magic == SKC_MAGIC);
1659 ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
1660
1661 /* Prevent concurrent cache reaping when contended */
1662 if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
1663 EXIT;
1664 return;
1665 }
1666
1667 atomic_inc(&skc->skc_ref);
1668
1669 if (skc->skc_reclaim)
1670 skc->skc_reclaim(skc->skc_private);
1671
1672 spl_slab_reclaim(skc, skc->skc_reap, 0);
1673 clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
1674 atomic_dec(&skc->skc_ref);
1675
1676 EXIT;
1677 }
1678 EXPORT_SYMBOL(spl_kmem_cache_reap_now);
1679
1680 /*
1681 * Reap all free slabs from all registered caches.
1682 */
1683 void
1684 spl_kmem_reap(void)
1685 {
1686 spl_kmem_cache_generic_shrinker(KMC_REAP_CHUNK, GFP_KERNEL);
1687 }
1688 EXPORT_SYMBOL(spl_kmem_reap);
1689
1690 #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
1691 static char *
1692 spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
1693 {
1694 int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
1695 int i, flag = 1;
1696
1697 ASSERT(str != NULL && len >= 17);
1698 memset(str, 0, len);
1699
1700 /* Check for a fully printable string, and while we are at
1701 * it place the printable characters in the passed buffer. */
1702 for (i = 0; i < size; i++) {
1703 str[i] = ((char *)(kd->kd_addr))[i];
1704 if (isprint(str[i])) {
1705 continue;
1706 } else {
1707 /* Minimum number of printable characters found
1708 * to make it worthwhile to print this as ascii. */
1709 if (i > min)
1710 break;
1711
1712 flag = 0;
1713 break;
1714 }
1715 }
1716
1717 if (!flag) {
1718 sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
1719 *((uint8_t *)kd->kd_addr),
1720 *((uint8_t *)kd->kd_addr + 2),
1721 *((uint8_t *)kd->kd_addr + 4),
1722 *((uint8_t *)kd->kd_addr + 6),
1723 *((uint8_t *)kd->kd_addr + 8),
1724 *((uint8_t *)kd->kd_addr + 10),
1725 *((uint8_t *)kd->kd_addr + 12),
1726 *((uint8_t *)kd->kd_addr + 14));
1727 }
1728
1729 return str;
1730 }
1731
1732 static int
1733 spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
1734 {
1735 int i;
1736 ENTRY;
1737
1738 spin_lock_init(lock);
1739 INIT_LIST_HEAD(list);
1740
1741 for (i = 0; i < size; i++)
1742 INIT_HLIST_HEAD(&kmem_table[i]);
1743
1744 RETURN(0);
1745 }
1746
1747 static void
1748 spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
1749 {
1750 unsigned long flags;
1751 kmem_debug_t *kd;
1752 char str[17];
1753 ENTRY;
1754
1755 spin_lock_irqsave(lock, flags);
1756 if (!list_empty(list))
1757 printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
1758 "size", "data", "func", "line");
1759
1760 list_for_each_entry(kd, list, kd_list)
1761 printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
1762 (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
1763 kd->kd_func, kd->kd_line);
1764
1765 spin_unlock_irqrestore(lock, flags);
1766 EXIT;
1767 }
1768 #else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
1769 #define spl_kmem_init_tracking(list, lock, size)
1770 #define spl_kmem_fini_tracking(list, lock)
1771 #endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
1772
1773 static void
1774 spl_kmem_init_globals(void)
1775 {
1776 struct zone *zone;
1777
1778 /* For now all zones are included; it may be wise to restrict
1779 * this to normal and highmem zones if we see problems. */
1780 for_each_zone(zone) {
1781
1782 if (!populated_zone(zone))
1783 continue;
1784
1785 minfree += zone->pages_min;
1786 desfree += zone->pages_low;
1787 lotsfree += zone->pages_high;
1788 }
1789
1790 /* Solaris default values */
1791 swapfs_minfree = MAX(2*1024*1024 / PAGE_SIZE, physmem / 8);
1792 swapfs_reserve = MIN(4*1024*1024 / PAGE_SIZE, physmem / 16);
1793 }
1794
1795 int
1796 spl_kmem_init(void)
1797 {
1798 int rc = 0;
1799 ENTRY;
1800
1801 init_rwsem(&spl_kmem_cache_sem);
1802 INIT_LIST_HEAD(&spl_kmem_cache_list);
1803 spl_kmem_init_globals();
1804
1805 #ifdef HAVE_SET_SHRINKER
1806 spl_kmem_cache_shrinker = set_shrinker(KMC_DEFAULT_SEEKS,
1807 spl_kmem_cache_generic_shrinker);
1808 if (spl_kmem_cache_shrinker == NULL)
1809 RETURN(rc = -ENOMEM);
1810 #else
1811 register_shrinker(&spl_kmem_cache_shrinker);
1812 #endif
1813
1814 #ifdef DEBUG_KMEM
1815 atomic64_set(&kmem_alloc_used, 0);
1816 atomic64_set(&vmem_alloc_used, 0);
1817
1818 spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
1819 spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE);
1820 #endif
1821 RETURN(rc);
1822 }
1823
1824 void
1825 spl_kmem_fini(void)
1826 {
1827 #ifdef DEBUG_KMEM
1828 /* Display all unreclaimed memory addresses, including the
1829 * allocation size and the first few bytes of what's located
1830 * at that address to aid in debugging. Performance is not
1831 * a serious concern here since it is module unload time. */
1832 if (atomic64_read(&kmem_alloc_used) != 0)
1833 CWARN("kmem leaked %ld/%ld bytes\n",
1834 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
1835
1836
1837 if (atomic64_read(&vmem_alloc_used) != 0)
1838 CWARN("vmem leaked %ld/%ld bytes\n",
1839 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
1840
1841 spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
1842 spl_kmem_fini_tracking(&vmem_list, &vmem_lock);
1843 #endif /* DEBUG_KMEM */
1844 ENTRY;
1845
1846 #ifdef HAVE_SET_SHRINKER
1847 remove_shrinker(spl_kmem_cache_shrinker);
1848 #else
1849 unregister_shrinker(&spl_kmem_cache_shrinker);
1850 #endif
1851
1852 EXIT;
1853 }