Set spl_kmem_cache_slab_limit=16384 to default
[mirror_spl.git] / module / spl / spl-kmem.c
716154c5
BB
1/*****************************************************************************\
2 * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
3 * Copyright (C) 2007 The Regents of the University of California.
4 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
5 * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
715f6251 6 * UCRL-CODE-235197
7 *
716154c5 8 * This file is part of the SPL, Solaris Porting Layer.
3d6af2dd 9 * For details, see <http://zfsonlinux.org/>.
715f6251 10 *
716154c5
BB
11 * The SPL is free software; you can redistribute it and/or modify it
12 * under the terms of the GNU General Public License as published by the
13 * Free Software Foundation; either version 2 of the License, or (at your
14 * option) any later version.
15 *
16 * The SPL is distributed in the hope that it will be useful, but WITHOUT
715f6251 17 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
18 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
19 * for more details.
20 *
21 * You should have received a copy of the GNU General Public License along
716154c5
BB
22 * with the SPL. If not, see <http://www.gnu.org/licenses/>.
23 *****************************************************************************
24 * Solaris Porting Layer (SPL) Kmem Implementation.
25\*****************************************************************************/
715f6251 26
f4b37741 27#include <sys/kmem.h>
55abb092 28#include <spl-debug.h>
f1ca4da6 29
b17edc10
BB
30#ifdef SS_DEBUG_SUBSYS
31#undef SS_DEBUG_SUBSYS
937879f1 32#endif
33
b17edc10 34#define SS_DEBUG_SUBSYS SS_KMEM
937879f1 35
a073aeb0
BB
36/*
37 * Within the scope of spl-kmem.c file the kmem_cache_* definitions
38 * are removed to allow access to the real Linux slab allocator.
39 */
40#undef kmem_cache_destroy
41#undef kmem_cache_create
42#undef kmem_cache_alloc
43#undef kmem_cache_free
44
45
0936c344
BB
46/*
47 * Cache expiration was implemented because it was part of the default Solaris
48 * kmem_cache behavior. The idea is that per-cpu objects which haven't been
49 * accessed in several seconds should be returned to the cache. On the other
50 * hand Linux slabs never move objects back to the slabs unless there is
89aa9705
RY
51 * memory pressure on the system. By default the Linux method is enabled
52 * because it has been shown to improve responsiveness on low memory systems.
53 * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM.
0936c344 54 */
89aa9705 55unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM;
0936c344
BB
56EXPORT_SYMBOL(spl_kmem_cache_expire);
57module_param(spl_kmem_cache_expire, uint, 0644);
58MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)");
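/*
 * Illustrative usage note (not part of the upstream sources): because the
 * parameter is registered with module_param(..., 0644) it can be given at
 * load time, e.g. "modprobe spl spl_kmem_cache_expire=0x01" for age based
 * expiration, or changed later by writing the new mask to
 * /sys/module/spl/parameters/spl_kmem_cache_expire.  A value of 0x03
 * enables both the age based and low memory policies at once.
 */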
59
376dc35e 60/*
c1aef269
BB
61 * The default behavior is to report the number of objects remaining in the
62 * cache. This allows the Linux VM to repeatedly reclaim objects from the
63 * cache when memory is low to satisfy other memory allocations. Alternatively,
64 * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache
65 * is reclaimed. This may increase the likelihood of out of memory events.
376dc35e 66 */
c1aef269 67unsigned int spl_kmem_cache_reclaim = 0;
376dc35e
BB
68module_param(spl_kmem_cache_reclaim, uint, 0644);
69MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)");
70
bdfbe594
AV
71unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
72module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
73MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");
74
75unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN;
76module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644);
77MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min,
78 "Minimal number of objects per slab");
79
80unsigned int spl_kmem_cache_max_size = 32;
81module_param(spl_kmem_cache_max_size, uint, 0644);
82MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
83
f2297b5a
BB
84/*
85 * For small objects the Linux slab allocator should be used to make the most
86 * efficient use of the memory. However, large objects are not supported by
87 * the Linux slab and therefore the SPL implementation is preferred. A cutoff
88 * of 16K was determined to be optimal for architectures using 4K pages.
89 */
90#if PAGE_SIZE == 4096
91unsigned int spl_kmem_cache_slab_limit = 16384;
92#else
a073aeb0 93unsigned int spl_kmem_cache_slab_limit = 0;
f2297b5a 94#endif
a073aeb0
BB
95module_param(spl_kmem_cache_slab_limit, uint, 0644);
96MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
97 "Objects less than N bytes use the Linux slab");
98
99unsigned int spl_kmem_cache_kmem_limit = (PAGE_SIZE / 4);
100module_param(spl_kmem_cache_kmem_limit, uint, 0644);
101MODULE_PARM_DESC(spl_kmem_cache_kmem_limit,
102 "Objects less than N bytes use the kmalloc");
103
36b313da
BB
104/*
105 * The minimum amount of memory measured in pages to be free at all
106 * times on the system. This is similar to Linux's zone->pages_min
ecc39810 107 * multiplied by the number of zones and is sized based on that.
36b313da
BB
108 */
109pgcnt_t minfree = 0;
110EXPORT_SYMBOL(minfree);
111
112/*
113 * The desired amount of memory measured in pages to be free at all
114 * times on the system. This is similar to Linux's zone->pages_low
ecc39810 115 * multiplied by the number of zones and is sized based on that.
36b313da 116 * Assuming all zones are being used roughly equally, when we drop
ecc39810 117 * below this threshold asynchronous page reclamation is triggered.
36b313da
BB
118 */
119pgcnt_t desfree = 0;
120EXPORT_SYMBOL(desfree);
121
122/*
123 * When above this amount of memory, measured in pages, the system is
124 * determined to have enough free memory. This is similar to Linux's
ecc39810 125 * zone->pages_high multiplied by the number of zones and is sized based
36b313da 126 * on that. Assuming all zones are being used roughly equally, when
ecc39810 127 * asynchronous page reclamation reaches this threshold it stops.
36b313da
BB
128 */
129pgcnt_t lotsfree = 0;
130EXPORT_SYMBOL(lotsfree);
131
132/* Unused always 0 in this implementation */
133pgcnt_t needfree = 0;
134EXPORT_SYMBOL(needfree);
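/*
 * Illustrative summary (not part of the upstream sources) of how the three
 * watermarks above are ordered, following the zone->pages_{min,low,high}
 * analogy described in the comments:
 *
 *   minfree <= desfree <= lotsfree        (all measured in pages)
 *
 *   free pages <= desfree   : asynchronous page reclamation is triggered
 *   free pages >= lotsfree  : asynchronous page reclamation stops
 *
 * needfree is unused in this implementation and always remains 0.
 */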
135
36b313da
BB
136pgcnt_t swapfs_minfree = 0;
137EXPORT_SYMBOL(swapfs_minfree);
138
139pgcnt_t swapfs_reserve = 0;
140EXPORT_SYMBOL(swapfs_reserve);
141
36b313da
BB
142vmem_t *heap_arena = NULL;
143EXPORT_SYMBOL(heap_arena);
144
145vmem_t *zio_alloc_arena = NULL;
146EXPORT_SYMBOL(zio_alloc_arena);
147
148vmem_t *zio_arena = NULL;
149EXPORT_SYMBOL(zio_arena);
150
d1ff2312 151#ifndef HAVE_GET_VMALLOC_INFO
96dded38 152get_vmalloc_info_t get_vmalloc_info_fn = SYMBOL_POISON;
d1ff2312
BB
153EXPORT_SYMBOL(get_vmalloc_info_fn);
154#endif /* HAVE_GET_VMALLOC_INFO */
155
5232d256
BB
156#ifdef HAVE_PGDAT_HELPERS
157# ifndef HAVE_FIRST_ONLINE_PGDAT
96dded38 158first_online_pgdat_t first_online_pgdat_fn = SYMBOL_POISON;
d1ff2312 159EXPORT_SYMBOL(first_online_pgdat_fn);
5232d256 160# endif /* HAVE_FIRST_ONLINE_PGDAT */
36b313da 161
5232d256 162# ifndef HAVE_NEXT_ONLINE_PGDAT
96dded38 163next_online_pgdat_t next_online_pgdat_fn = SYMBOL_POISON;
d1ff2312 164EXPORT_SYMBOL(next_online_pgdat_fn);
5232d256 165# endif /* HAVE_NEXT_ONLINE_PGDAT */
36b313da 166
5232d256 167# ifndef HAVE_NEXT_ZONE
96dded38 168next_zone_t next_zone_fn = SYMBOL_POISON;
d1ff2312 169EXPORT_SYMBOL(next_zone_fn);
5232d256
BB
170# endif /* HAVE_NEXT_ZONE */
171
172#else /* HAVE_PGDAT_HELPERS */
173
174# ifndef HAVE_PGDAT_LIST
175struct pglist_data *pgdat_list_addr = SYMBOL_POISON;
176EXPORT_SYMBOL(pgdat_list_addr);
177# endif /* HAVE_PGDAT_LIST */
178
179#endif /* HAVE_PGDAT_HELPERS */
36b313da 180
6ae7fef5 181#ifdef NEED_GET_ZONE_COUNTS
e11d6c5f 182# ifndef HAVE_GET_ZONE_COUNTS
96dded38 183get_zone_counts_t get_zone_counts_fn = SYMBOL_POISON;
d1ff2312 184EXPORT_SYMBOL(get_zone_counts_fn);
96dded38 185# endif /* HAVE_GET_ZONE_COUNTS */
4ab13d3b 186
e11d6c5f 187unsigned long
6ae7fef5 188spl_global_page_state(spl_zone_stat_item_t item)
4ab13d3b
BB
189{
190 unsigned long active;
191 unsigned long inactive;
192 unsigned long free;
193
6ae7fef5
BB
194 get_zone_counts(&active, &inactive, &free);
195 switch (item) {
196 case SPL_NR_FREE_PAGES: return free;
197 case SPL_NR_INACTIVE: return inactive;
198 case SPL_NR_ACTIVE: return active;
199 default: ASSERT(0); /* Unsupported */
e11d6c5f
BB
200 }
201
6ae7fef5
BB
202 return 0;
203}
204#else
205# ifdef HAVE_GLOBAL_PAGE_STATE
206unsigned long
207spl_global_page_state(spl_zone_stat_item_t item)
208{
209 unsigned long pages = 0;
210
211 switch (item) {
212 case SPL_NR_FREE_PAGES:
213# ifdef HAVE_ZONE_STAT_ITEM_NR_FREE_PAGES
214 pages += global_page_state(NR_FREE_PAGES);
215# endif
216 break;
217 case SPL_NR_INACTIVE:
218# ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE
219 pages += global_page_state(NR_INACTIVE);
220# endif
221# ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE_ANON
222 pages += global_page_state(NR_INACTIVE_ANON);
223# endif
224# ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE_FILE
225 pages += global_page_state(NR_INACTIVE_FILE);
226# endif
227 break;
228 case SPL_NR_ACTIVE:
229# ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE
230 pages += global_page_state(NR_ACTIVE);
231# endif
232# ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE_ANON
233 pages += global_page_state(NR_ACTIVE_ANON);
234# endif
235# ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE_FILE
236 pages += global_page_state(NR_ACTIVE_FILE);
237# endif
238 break;
239 default:
240 ASSERT(0); /* Unsupported */
e11d6c5f
BB
241 }
242
6ae7fef5
BB
243 return pages;
244}
96dded38 245# else
6ae7fef5 246# error "Both global_page_state() and get_zone_counts() unavailable"
96dded38 247# endif /* HAVE_GLOBAL_PAGE_STATE */
6ae7fef5 248#endif /* NEED_GET_ZONE_COUNTS */
e11d6c5f 249EXPORT_SYMBOL(spl_global_page_state);
4ab13d3b 250
e76f4bf1
BB
251#ifndef HAVE_SHRINK_DCACHE_MEMORY
252shrink_dcache_memory_t shrink_dcache_memory_fn = SYMBOL_POISON;
253EXPORT_SYMBOL(shrink_dcache_memory_fn);
254#endif /* HAVE_SHRINK_DCACHE_MEMORY */
255
256#ifndef HAVE_SHRINK_ICACHE_MEMORY
257shrink_icache_memory_t shrink_icache_memory_fn = SYMBOL_POISON;
258EXPORT_SYMBOL(shrink_icache_memory_fn);
259#endif /* HAVE_SHRINK_ICACHE_MEMORY */
260
e11d6c5f
BB
261pgcnt_t
262spl_kmem_availrmem(void)
263{
4ab13d3b 264 /* The amount of easily available memory */
6ae7fef5
BB
265 return (spl_global_page_state(SPL_NR_FREE_PAGES) +
266 spl_global_page_state(SPL_NR_INACTIVE));
4ab13d3b
BB
267}
268EXPORT_SYMBOL(spl_kmem_availrmem);
269
270size_t
271vmem_size(vmem_t *vmp, int typemask)
272{
d1ff2312
BB
273 struct vmalloc_info vmi;
274 size_t size = 0;
275
4ab13d3b
BB
276 ASSERT(vmp == NULL);
277 ASSERT(typemask & (VMEM_ALLOC | VMEM_FREE));
278
d1ff2312
BB
279 get_vmalloc_info(&vmi);
280 if (typemask & VMEM_ALLOC)
281 size += (size_t)vmi.used;
282
283 if (typemask & VMEM_FREE)
284 size += (size_t)(VMALLOC_TOTAL - vmi.used);
285
286 return size;
4ab13d3b
BB
287}
288EXPORT_SYMBOL(vmem_size);
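/*
 * Example usage (illustrative): vmem_size() in this shim only describes the
 * single global vmalloc "heap", so callers must pass a NULL vmem pointer:
 *
 *	size_t vm_used  = vmem_size(NULL, VMEM_ALLOC);
 *	size_t vm_free  = vmem_size(NULL, VMEM_FREE);
 *	size_t vm_total = vmem_size(NULL, VMEM_ALLOC | VMEM_FREE);
 *
 * where vm_total always equals VMALLOC_TOTAL, the size of the kernel's
 * vmalloc address range.
 */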
4ab13d3b 289
b868e22f
BB
290int
291kmem_debugging(void)
292{
293 return 0;
294}
295EXPORT_SYMBOL(kmem_debugging);
296
297#ifndef HAVE_KVASPRINTF
298/* Simplified asprintf. */
299char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
300{
301 unsigned int len;
302 char *p;
303 va_list aq;
304
305 va_copy(aq, ap);
306 len = vsnprintf(NULL, 0, fmt, aq);
307 va_end(aq);
308
309 p = kmalloc(len+1, gfp);
310 if (!p)
311 return NULL;
312
313 vsnprintf(p, len+1, fmt, ap);
314
315 return p;
316}
317EXPORT_SYMBOL(kvasprintf);
318#endif /* HAVE_KVASPRINTF */
319
e6de04b7
BB
320char *
321kmem_vasprintf(const char *fmt, va_list ap)
322{
323 va_list aq;
324 char *ptr;
325
e6de04b7 326 do {
2c762de8 327 va_copy(aq, ap);
e6de04b7 328 ptr = kvasprintf(GFP_KERNEL, fmt, aq);
2c762de8 329 va_end(aq);
e6de04b7 330 } while (ptr == NULL);
e6de04b7
BB
331
332 return ptr;
333}
334EXPORT_SYMBOL(kmem_vasprintf);
335
b868e22f
BB
336char *
337kmem_asprintf(const char *fmt, ...)
338{
e6de04b7 339 va_list ap;
b868e22f
BB
340 char *ptr;
341
b868e22f 342 do {
2c762de8 343 va_start(ap, fmt);
e6de04b7 344 ptr = kvasprintf(GFP_KERNEL, fmt, ap);
2c762de8 345 va_end(ap);
b868e22f 346 } while (ptr == NULL);
b868e22f
BB
347
348 return ptr;
349}
350EXPORT_SYMBOL(kmem_asprintf);
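/*
 * Example usage (illustrative): strings returned by kmem_asprintf() and
 * kmem_vasprintf() are allocated with kvasprintf(GFP_KERNEL) and should be
 * released with strfree() below, which is a thin wrapper around kfree():
 *
 *	char *name = kmem_asprintf("spl_cache_%d", id);
 *	...
 *	strfree(name);
 */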
351
10129680
BB
352static char *
353__strdup(const char *str, int flags)
354{
355 char *ptr;
356 int n;
357
358 n = strlen(str);
359 ptr = kmalloc_nofail(n + 1, flags);
360 if (ptr)
361 memcpy(ptr, str, n + 1);
362
363 return ptr;
364}
365
366char *
367strdup(const char *str)
368{
369 return __strdup(str, KM_SLEEP);
370}
371EXPORT_SYMBOL(strdup);
372
373void
374strfree(char *str)
375{
41f84a8d 376 kfree(str);
10129680
BB
377}
378EXPORT_SYMBOL(strfree);
379
f1ca4da6 380/*
2fb9b26a 381 * Memory allocation interfaces and debugging for basic kmem_*
055ffd98
BB
382 * and vmem_* style memory allocation. When DEBUG_KMEM is enabled
383 * the SPL will keep track of the total memory allocated, and
384 * report any memory leaked when the module is unloaded.
f1ca4da6 385 */
386#ifdef DEBUG_KMEM
d04c8a56 387
f1ca4da6 388/* Shim layer memory accounting */
d04c8a56 389# ifdef HAVE_ATOMIC64_T
550f1705 390atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
a0f6da3d 391unsigned long long kmem_alloc_max = 0;
550f1705 392atomic64_t vmem_alloc_used = ATOMIC64_INIT(0);
a0f6da3d 393unsigned long long vmem_alloc_max = 0;
10129680 394# else /* HAVE_ATOMIC64_T */
d04c8a56
BB
395atomic_t kmem_alloc_used = ATOMIC_INIT(0);
396unsigned long long kmem_alloc_max = 0;
397atomic_t vmem_alloc_used = ATOMIC_INIT(0);
398unsigned long long vmem_alloc_max = 0;
10129680 399# endif /* HAVE_ATOMIC64_T */
79b31f36 400
ff449ac4 401EXPORT_SYMBOL(kmem_alloc_used);
402EXPORT_SYMBOL(kmem_alloc_max);
403EXPORT_SYMBOL(vmem_alloc_used);
404EXPORT_SYMBOL(vmem_alloc_max);
ff449ac4 405
055ffd98
BB
406/* When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
407 * but also the location of every alloc and free. When the SPL module is
408 * unloaded a list of all leaked addresses and where they were allocated
409 * will be dumped to the console. Enabling this feature has a significant
410 * impact on performance but it makes finding memory leaks straightforward.
411 *
412 * Not surprisingly with debugging enabled the xmem_locks are very highly
413 * contended particularly on xfree(). If we want to run with this detailed
414 * debugging enabled for anything other than debugging we need to minimize
415 * the contention by moving to a lock per xmem_table entry model.
a0f6da3d 416 */
055ffd98 417# ifdef DEBUG_KMEM_TRACKING
a0f6da3d 418
419# define KMEM_HASH_BITS 10
420# define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS)
421
422# define VMEM_HASH_BITS 10
423# define VMEM_TABLE_SIZE (1 << VMEM_HASH_BITS)
424
425typedef struct kmem_debug {
426 struct hlist_node kd_hlist; /* Hash node linkage */
427 struct list_head kd_list; /* List of all allocations */
428 void *kd_addr; /* Allocation pointer */
429 size_t kd_size; /* Allocation size */
430 const char *kd_func; /* Allocation function */
431 int kd_line; /* Allocation line */
432} kmem_debug_t;
433
d6a26c6a 434spinlock_t kmem_lock;
435struct hlist_head kmem_table[KMEM_TABLE_SIZE];
436struct list_head kmem_list;
437
13cdca65 438spinlock_t vmem_lock;
439struct hlist_head vmem_table[VMEM_TABLE_SIZE];
440struct list_head vmem_list;
441
d6a26c6a 442EXPORT_SYMBOL(kmem_lock);
443EXPORT_SYMBOL(kmem_table);
444EXPORT_SYMBOL(kmem_list);
445
13cdca65 446EXPORT_SYMBOL(vmem_lock);
447EXPORT_SYMBOL(vmem_table);
448EXPORT_SYMBOL(vmem_list);
a0f6da3d 449
450static kmem_debug_t *
973e8269 451kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, const void *addr)
a0f6da3d 452{
453 struct hlist_head *head;
454 struct hlist_node *node;
455 struct kmem_debug *p;
456 unsigned long flags;
b17edc10 457 SENTRY;
a0f6da3d 458
459 spin_lock_irqsave(lock, flags);
460
b1424add
BB
461 head = &table[hash_ptr((void *)addr, bits)];
462 hlist_for_each(node, head) {
463 p = list_entry(node, struct kmem_debug, kd_hlist);
a0f6da3d 464 if (p->kd_addr == addr) {
465 hlist_del_init(&p->kd_hlist);
466 list_del_init(&p->kd_list);
467 spin_unlock_irqrestore(lock, flags);
468 return p;
469 }
470 }
471
472 spin_unlock_irqrestore(lock, flags);
473
b17edc10 474 SRETURN(NULL);
a0f6da3d 475}
476
477void *
478kmem_alloc_track(size_t size, int flags, const char *func, int line,
479 int node_alloc, int node)
480{
481 void *ptr = NULL;
482 kmem_debug_t *dptr;
483 unsigned long irq_flags;
b17edc10 484 SENTRY;
a0f6da3d 485
10129680 486 /* Function may be called with KM_NOSLEEP so failure is possible */
c89fdee4 487 dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
a0f6da3d 488 flags & ~__GFP_ZERO);
489
10129680 490 if (unlikely(dptr == NULL)) {
b17edc10 491 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "debug "
3cb77549
BB
492 "kmem_alloc(%ld, 0x%x) at %s:%d failed (%lld/%llu)\n",
493 sizeof(kmem_debug_t), flags, func, line,
494 kmem_alloc_used_read(), kmem_alloc_max);
a0f6da3d 495 } else {
10129680
BB
496 /*
497 * Marked unlikely because we should never be doing this,
498 * we tolerate up to 2 pages but a single page is best.
499 */
23d91792 500 if (unlikely((size > PAGE_SIZE*2) && !(flags & KM_NODEBUG))) {
b17edc10 501 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "large "
3cb77549
BB
502 "kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
503 (unsigned long long) size, flags, func, line,
d04c8a56 504 kmem_alloc_used_read(), kmem_alloc_max);
5198ea0e
BB
505 spl_debug_dumpstack(NULL);
506 }
a0f6da3d 507
10129680
BB
508 /*
509 * We use __strdup() below because the string pointed to by
c8e60837 510 * __FUNCTION__ might not be available by the time we want
10129680
BB
511 * to print it since the module might have been unloaded.
512 * This can only fail in the KM_NOSLEEP case.
513 */
514 dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
c8e60837 515 if (unlikely(dptr->kd_func == NULL)) {
516 kfree(dptr);
b17edc10 517 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
10129680 518 "debug __strdup() at %s:%d failed (%lld/%llu)\n",
3cb77549 519 func, line, kmem_alloc_used_read(), kmem_alloc_max);
c8e60837 520 goto out;
521 }
522
a0f6da3d 523 /* Use the correct allocator */
524 if (node_alloc) {
525 ASSERT(!(flags & __GFP_ZERO));
c89fdee4 526 ptr = kmalloc_node_nofail(size, flags, node);
a0f6da3d 527 } else if (flags & __GFP_ZERO) {
c89fdee4 528 ptr = kzalloc_nofail(size, flags & ~__GFP_ZERO);
a0f6da3d 529 } else {
c89fdee4 530 ptr = kmalloc_nofail(size, flags);
a0f6da3d 531 }
532
533 if (unlikely(ptr == NULL)) {
c8e60837 534 kfree(dptr->kd_func);
a0f6da3d 535 kfree(dptr);
b17edc10 536 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "kmem_alloc"
3cb77549
BB
537 "(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
538 (unsigned long long) size, flags, func, line,
d04c8a56 539 kmem_alloc_used_read(), kmem_alloc_max);
a0f6da3d 540 goto out;
541 }
542
d04c8a56
BB
543 kmem_alloc_used_add(size);
544 if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
545 kmem_alloc_max = kmem_alloc_used_read();
a0f6da3d 546
547 INIT_HLIST_NODE(&dptr->kd_hlist);
548 INIT_LIST_HEAD(&dptr->kd_list);
549
550 dptr->kd_addr = ptr;
551 dptr->kd_size = size;
a0f6da3d 552 dptr->kd_line = line;
553
554 spin_lock_irqsave(&kmem_lock, irq_flags);
b1424add 555 hlist_add_head(&dptr->kd_hlist,
a0f6da3d 556 &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
557 list_add_tail(&dptr->kd_list, &kmem_list);
558 spin_unlock_irqrestore(&kmem_lock, irq_flags);
559
b17edc10 560 SDEBUG_LIMIT(SD_INFO,
3cb77549
BB
561 "kmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n",
562 (unsigned long long) size, flags, func, line, ptr,
563 kmem_alloc_used_read(), kmem_alloc_max);
a0f6da3d 564 }
565out:
b17edc10 566 SRETURN(ptr);
a0f6da3d 567}
568EXPORT_SYMBOL(kmem_alloc_track);
569
570void
973e8269 571kmem_free_track(const void *ptr, size_t size)
a0f6da3d 572{
573 kmem_debug_t *dptr;
b17edc10 574 SENTRY;
a0f6da3d 575
576 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
577 (unsigned long long) size);
578
579 dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
580
10129680
BB
581 /* Must exist in hash due to kmem_alloc() */
582 ASSERT(dptr);
a0f6da3d 583
584 /* Size must match */
585 ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
586 "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
587 (unsigned long long) size, dptr->kd_func, dptr->kd_line);
588
d04c8a56 589 kmem_alloc_used_sub(size);
b17edc10 590 SDEBUG_LIMIT(SD_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
d04c8a56 591 (unsigned long long) size, kmem_alloc_used_read(),
a0f6da3d 592 kmem_alloc_max);
593
c8e60837 594 kfree(dptr->kd_func);
595
b1424add 596 memset((void *)dptr, 0x5a, sizeof(kmem_debug_t));
a0f6da3d 597 kfree(dptr);
598
b1424add 599 memset((void *)ptr, 0x5a, size);
a0f6da3d 600 kfree(ptr);
601
b17edc10 602 SEXIT;
a0f6da3d 603}
604EXPORT_SYMBOL(kmem_free_track);
605
606void *
607vmem_alloc_track(size_t size, int flags, const char *func, int line)
608{
609 void *ptr = NULL;
610 kmem_debug_t *dptr;
611 unsigned long irq_flags;
b17edc10 612 SENTRY;
a0f6da3d 613
614 ASSERT(flags & KM_SLEEP);
615
10129680 616 /* Function may be called with KM_NOSLEEP so failure is possible */
ef1c7a06
BB
617 dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
618 flags & ~__GFP_ZERO);
10129680 619 if (unlikely(dptr == NULL)) {
b17edc10 620 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "debug "
3cb77549
BB
621 "vmem_alloc(%ld, 0x%x) at %s:%d failed (%lld/%llu)\n",
622 sizeof(kmem_debug_t), flags, func, line,
623 vmem_alloc_used_read(), vmem_alloc_max);
a0f6da3d 624 } else {
10129680
BB
625 /*
626 * We use __strdup() below because the string pointed to by
c8e60837 627 * __FUNCTION__ might not be available by the time we want
10129680
BB
628 * to print it, since the module might have been unloaded.
629 * This can never fail because we have already asserted
630 * that flags is KM_SLEEP.
631 */
632 dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
c8e60837 633 if (unlikely(dptr->kd_func == NULL)) {
634 kfree(dptr);
b17edc10 635 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
10129680 636 "debug __strdup() at %s:%d failed (%lld/%llu)\n",
3cb77549 637 func, line, vmem_alloc_used_read(), vmem_alloc_max);
c8e60837 638 goto out;
639 }
640
10129680
BB
641 /* Use the correct allocator */
642 if (flags & __GFP_ZERO) {
643 ptr = vzalloc_nofail(size, flags & ~__GFP_ZERO);
644 } else {
645 ptr = vmalloc_nofail(size, flags);
646 }
a0f6da3d 647
648 if (unlikely(ptr == NULL)) {
c8e60837 649 kfree(dptr->kd_func);
a0f6da3d 650 kfree(dptr);
b17edc10 651 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "vmem_alloc"
3cb77549
BB
652 "(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
653 (unsigned long long) size, flags, func, line,
d04c8a56 654 vmem_alloc_used_read(), vmem_alloc_max);
a0f6da3d 655 goto out;
656 }
657
d04c8a56
BB
658 vmem_alloc_used_add(size);
659 if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
660 vmem_alloc_max = vmem_alloc_used_read();
a0f6da3d 661
662 INIT_HLIST_NODE(&dptr->kd_hlist);
663 INIT_LIST_HEAD(&dptr->kd_list);
664
665 dptr->kd_addr = ptr;
666 dptr->kd_size = size;
a0f6da3d 667 dptr->kd_line = line;
668
669 spin_lock_irqsave(&vmem_lock, irq_flags);
b1424add 670 hlist_add_head(&dptr->kd_hlist,
a0f6da3d 671 &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]);
672 list_add_tail(&dptr->kd_list, &vmem_list);
673 spin_unlock_irqrestore(&vmem_lock, irq_flags);
674
b17edc10 675 SDEBUG_LIMIT(SD_INFO,
3cb77549
BB
676 "vmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n",
677 (unsigned long long) size, flags, func, line,
678 ptr, vmem_alloc_used_read(), vmem_alloc_max);
a0f6da3d 679 }
680out:
b17edc10 681 SRETURN(ptr);
a0f6da3d 682}
683EXPORT_SYMBOL(vmem_alloc_track);
684
685void
973e8269 686vmem_free_track(const void *ptr, size_t size)
a0f6da3d 687{
688 kmem_debug_t *dptr;
b17edc10 689 SENTRY;
a0f6da3d 690
691 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
692 (unsigned long long) size);
693
694 dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr);
10129680
BB
695
696 /* Must exist in hash due to vmem_alloc() */
697 ASSERT(dptr);
a0f6da3d 698
699 /* Size must match */
700 ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
701 "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
702 (unsigned long long) size, dptr->kd_func, dptr->kd_line);
703
d04c8a56 704 vmem_alloc_used_sub(size);
b17edc10 705 SDEBUG_LIMIT(SD_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
d04c8a56 706 (unsigned long long) size, vmem_alloc_used_read(),
a0f6da3d 707 vmem_alloc_max);
708
c8e60837 709 kfree(dptr->kd_func);
710
b1424add 711 memset((void *)dptr, 0x5a, sizeof(kmem_debug_t));
a0f6da3d 712 kfree(dptr);
713
b1424add 714 memset((void *)ptr, 0x5a, size);
a0f6da3d 715 vfree(ptr);
716
b17edc10 717 SEXIT;
a0f6da3d 718}
719EXPORT_SYMBOL(vmem_free_track);
720
721# else /* DEBUG_KMEM_TRACKING */
722
723void *
724kmem_alloc_debug(size_t size, int flags, const char *func, int line,
725 int node_alloc, int node)
726{
727 void *ptr;
b17edc10 728 SENTRY;
a0f6da3d 729
10129680
BB
730 /*
731 * Marked unlikely because we should never be doing this,
732 * we tolerate up to 2 pages but a single page is best.
733 */
23d91792 734 if (unlikely((size > PAGE_SIZE * 2) && !(flags & KM_NODEBUG))) {
b17edc10 735 SDEBUG(SD_CONSOLE | SD_WARNING,
10129680 736 "large kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
3cb77549 737 (unsigned long long) size, flags, func, line,
d04c8a56 738 kmem_alloc_used_read(), kmem_alloc_max);
377e12f1 739 spl_debug_dumpstack(NULL);
5198ea0e 740 }
a0f6da3d 741
742 /* Use the correct allocator */
743 if (node_alloc) {
744 ASSERT(!(flags & __GFP_ZERO));
c89fdee4 745 ptr = kmalloc_node_nofail(size, flags, node);
a0f6da3d 746 } else if (flags & __GFP_ZERO) {
c89fdee4 747 ptr = kzalloc_nofail(size, flags & (~__GFP_ZERO));
a0f6da3d 748 } else {
c89fdee4 749 ptr = kmalloc_nofail(size, flags);
a0f6da3d 750 }
751
10129680 752 if (unlikely(ptr == NULL)) {
b17edc10 753 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
3cb77549
BB
754 "kmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
755 (unsigned long long) size, flags, func, line,
d04c8a56 756 kmem_alloc_used_read(), kmem_alloc_max);
a0f6da3d 757 } else {
d04c8a56
BB
758 kmem_alloc_used_add(size);
759 if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
760 kmem_alloc_max = kmem_alloc_used_read();
a0f6da3d 761
b17edc10 762 SDEBUG_LIMIT(SD_INFO,
3cb77549
BB
763 "kmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n",
764 (unsigned long long) size, flags, func, line, ptr,
10129680 765 kmem_alloc_used_read(), kmem_alloc_max);
a0f6da3d 766 }
10129680 767
b17edc10 768 SRETURN(ptr);
a0f6da3d 769}
770EXPORT_SYMBOL(kmem_alloc_debug);
771
772void
973e8269 773kmem_free_debug(const void *ptr, size_t size)
a0f6da3d 774{
b17edc10 775 SENTRY;
a0f6da3d 776
777 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
778 (unsigned long long) size);
779
d04c8a56 780 kmem_alloc_used_sub(size);
b17edc10 781 SDEBUG_LIMIT(SD_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
d04c8a56 782 (unsigned long long) size, kmem_alloc_used_read(),
a0f6da3d 783 kmem_alloc_max);
a0f6da3d 784 kfree(ptr);
785
b17edc10 786 SEXIT;
a0f6da3d 787}
788EXPORT_SYMBOL(kmem_free_debug);
789
790void *
791vmem_alloc_debug(size_t size, int flags, const char *func, int line)
792{
793 void *ptr;
b17edc10 794 SENTRY;
a0f6da3d 795
796 ASSERT(flags & KM_SLEEP);
797
10129680
BB
798 /* Use the correct allocator */
799 if (flags & __GFP_ZERO) {
800 ptr = vzalloc_nofail(size, flags & (~__GFP_ZERO));
801 } else {
802 ptr = vmalloc_nofail(size, flags);
803 }
804
805 if (unlikely(ptr == NULL)) {
b17edc10 806 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
3cb77549
BB
807 "vmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
808 (unsigned long long) size, flags, func, line,
d04c8a56 809 vmem_alloc_used_read(), vmem_alloc_max);
a0f6da3d 810 } else {
d04c8a56
BB
811 vmem_alloc_used_add(size);
812 if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
813 vmem_alloc_max = vmem_alloc_used_read();
a0f6da3d 814
b17edc10 815 SDEBUG_LIMIT(SD_INFO, "vmem_alloc(%llu, 0x%x) = %p "
a0f6da3d 816 "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
d04c8a56 817 vmem_alloc_used_read(), vmem_alloc_max);
a0f6da3d 818 }
819
b17edc10 820 SRETURN(ptr);
a0f6da3d 821}
822EXPORT_SYMBOL(vmem_alloc_debug);
823
824void
973e8269 825vmem_free_debug(const void *ptr, size_t size)
a0f6da3d 826{
b17edc10 827 SENTRY;
a0f6da3d 828
829 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
830 (unsigned long long) size);
831
d04c8a56 832 vmem_alloc_used_sub(size);
b17edc10 833 SDEBUG_LIMIT(SD_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
d04c8a56 834 (unsigned long long) size, vmem_alloc_used_read(),
a0f6da3d 835 vmem_alloc_max);
a0f6da3d 836 vfree(ptr);
837
b17edc10 838 SEXIT;
a0f6da3d 839}
840EXPORT_SYMBOL(vmem_free_debug);
841
842# endif /* DEBUG_KMEM_TRACKING */
843#endif /* DEBUG_KMEM */
844
10129680
BB
845/*
846 * Slab allocation interfaces
847 *
848 * While the Linux slab implementation was inspired by the Solaris
ecc39810 849 * implementation I cannot use it to emulate the Solaris APIs. I
10129680
BB
850 * require two features which are not provided by the Linux slab.
851 *
852 * 1) Constructors AND destructors. Recent versions of the Linux
853 * kernel have removed support for destructors. This is a deal
854 * breaker for the SPL which contains particularly expensive
855 * initializers for mutex's, condition variables, etc. We also
856 * require a minimal level of cleanup for these data types unlike
857 * many Linux data types which do not need to be explicitly destroyed.
858 *
859 * 2) Virtual address space backed slab. Callers of the Solaris slab
860 * expect it to work well for both small and very large allocations.
861 * Because of memory fragmentation the Linux slab which is backed
862 * by kmalloc'ed memory performs very badly when confronted with
863 * large numbers of large allocations. Basing the slab on the
ecc39810 864 * virtual address space removes the need for contiguous pages
10129680
BB
865 * and greatly improves performance for large allocations.
866 *
867 * For these reasons, the SPL has its own slab implementation with
868 * the needed features. It is not as highly optimized as either the
869 * Solaris or Linux slabs, but it should get me most of what is
870 * needed until it can be optimized or obsoleted by another approach.
871 *
872 * One serious concern I do have about this method is the relatively
873 * small virtual address space on 32bit arches. This will seriously
874 * constrain the size of the slab caches and their performance.
875 *
876 * XXX: Improve the partial slab list by carefully maintaining a
877 * strict ordering of fullest to emptiest slabs based on
ecc39810 878 * the slab reference count. This guarantees that when freeing
10129680
BB
879 * slabs back to the system we need only linearly traverse the
880 * last N slabs in the list to discover all the freeable slabs.
881 *
882 * XXX: NUMA awareness for optionally allocating memory close to a
ecc39810 883 * particular core. This can be advantageous if you know the slab
10129680
BB
884 * object will be short lived and primarily accessed from one core.
885 *
886 * XXX: Slab coloring may also yield performance improvements and would
887 * be desirable to implement.
888 */
889
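/*
 * Minimal usage sketch (illustrative only, the names are hypothetical): a
 * typical consumer creates a cache with a constructor/destructor pair and
 * then allocates objects from it:
 *
 *	skc = spl_kmem_cache_create("my_cache", sizeof (my_obj_t), 0,
 *	    my_ctor, my_dtor, NULL, NULL, NULL, 0);
 *	obj = spl_kmem_cache_alloc(skc, KM_SLEEP);
 *	...
 *	spl_kmem_cache_free(skc, obj);
 *	spl_kmem_cache_destroy(skc);
 *
 * With a flags value of 0 the backing type (kmem, vmem, or Linux slab) is
 * selected automatically in spl_kmem_cache_create() based on object size.
 */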
890struct list_head spl_kmem_cache_list; /* List of caches */
891struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
a10287e0 892taskq_t *spl_kmem_cache_taskq; /* Task queue for ageing / reclaim */
10129680 893
d4899f47 894static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);
10129680 895
a55bcaad 896SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker);
495bd532
BB
897SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker,
898 spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS);
10129680 899
a1502d76 900static void *
901kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
fece7c99 902{
a1502d76 903 void *ptr;
f1ca4da6 904
8b45dda2
BB
905 ASSERT(ISP2(size));
906
500e95c8 907 if (skc->skc_flags & KMC_KMEM)
ae16ed99
CC
908 ptr = (void *)__get_free_pages(flags | __GFP_COMP,
909 get_order(size));
500e95c8 910 else
617f79de
BB
911 ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
912
8b45dda2
BB
913 /* Resulting allocated memory will be page aligned */
914 ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
fece7c99 915
a1502d76 916 return ptr;
917}
fece7c99 918
a1502d76 919static void
920kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
921{
8b45dda2
BB
922 ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
923 ASSERT(ISP2(size));
924
06089b9e
BB
925 /*
926 * The Linux direct reclaim path uses this out of band value to
927 * determine if forward progress is being made. Normally this is
928 * incremented by kmem_freepages() which is part of the various
929 * Linux slab implementations. However, since we are using none
930 * of that infrastructure we are responsible for incrementing it.
931 */
932 if (current->reclaim_state)
933 current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
934
8b45dda2
BB
935 if (skc->skc_flags & KMC_KMEM)
936 free_pages((unsigned long)ptr, get_order(size));
937 else
938 vfree(ptr);
939}
940
941/*
942 * Required space for each aligned sks.
943 */
944static inline uint32_t
945spl_sks_size(spl_kmem_cache_t *skc)
946{
947 return P2ROUNDUP_TYPED(sizeof(spl_kmem_slab_t),
948 skc->skc_obj_align, uint32_t);
949}
950
951/*
952 * Required space for each aligned object.
953 */
954static inline uint32_t
955spl_obj_size(spl_kmem_cache_t *skc)
956{
957 uint32_t align = skc->skc_obj_align;
958
959 return P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
960 P2ROUNDUP_TYPED(sizeof(spl_kmem_obj_t), align, uint32_t);
961}
962
963/*
964 * Lookup the spl_kmem_object_t for an object given that object.
965 */
966static inline spl_kmem_obj_t *
967spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
968{
969 return obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
970 skc->skc_obj_align, uint32_t);
971}
972
973/*
974 * Required space for each offslab object taking into account alignment
975 * restrictions and the power-of-two requirement of kv_alloc().
976 */
977static inline uint32_t
978spl_offslab_size(spl_kmem_cache_t *skc)
979{
980 return 1UL << (highbit(spl_obj_size(skc)) + 1);
fece7c99 981}
982
ea3e6ca9
BB
983/*
984 * It's important that we pack the spl_kmem_obj_t structure and the
48e0606a
BB
985 * actual objects into one large address space to minimize the number
986 * of calls to the allocator. It is far better to do a few large
987 * allocations and then subdivide it ourselves. Now which allocator
988 * we use requires balancing a few trade offs.
989 *
990 * For small objects we use kmem_alloc() because as long as you are
991 * only requesting a small number of pages (ideally just one) its cheap.
992 * However, when you start requesting multiple pages with kmem_alloc()
ecc39810 993 * it gets increasingly expensive since it requires contiguous pages.
48e0606a 994 * For this reason we shift to vmem_alloc() for slabs of large objects
ecc39810 995 * which removes the need for contiguous pages. We do not use
48e0606a
BB
996 * vmem_alloc() in all cases because there is significant locking
997 * overhead in __get_vm_area_node(). This function takes a single
ecc39810 998 * global lock when acquiring an available virtual address range which
48e0606a
BB
999 * serializes all vmem_alloc()'s for all slab caches. Using slightly
1000 * different allocation functions for small and large objects should
1001 * give us the best of both worlds.
1002 *
1003 * KMC_ONSLAB KMC_OFFSLAB
1004 *
1005 * +------------------------+ +-----------------+
1006 * | spl_kmem_slab_t --+-+ | | spl_kmem_slab_t |---+-+
1007 * | skc_obj_size <-+ | | +-----------------+ | |
1008 * | spl_kmem_obj_t | | | |
1009 * | skc_obj_size <---+ | +-----------------+ | |
1010 * | spl_kmem_obj_t | | | skc_obj_size | <-+ |
1011 * | ... v | | spl_kmem_obj_t | |
1012 * +------------------------+ +-----------------+ v
1013 */
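/*
 * Worked example (illustrative, assuming the default 8 byte alignment): for
 * an on-slab cache with skc_obj_size = 100, spl_obj_size() rounds the object
 * up to 104 bytes and adds the aligned sizeof(spl_kmem_obj_t), which gives
 * the per-object cost within the slab.  The slab header occupies the first
 * spl_sks_size() bytes of the slab, and spl_sko_from_obj() locates an
 * object's spl_kmem_obj_t by stepping the same rounded-up 104 bytes past
 * the object pointer.
 */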
fece7c99 1014static spl_kmem_slab_t *
a1502d76 1015spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
fece7c99 1016{
1017 spl_kmem_slab_t *sks;
a1502d76 1018 spl_kmem_obj_t *sko, *n;
1019 void *base, *obj;
8b45dda2
BB
1020 uint32_t obj_size, offslab_size = 0;
1021 int i, rc = 0;
48e0606a 1022
a1502d76 1023 base = kv_alloc(skc, skc->skc_slab_size, flags);
1024 if (base == NULL)
b17edc10 1025 SRETURN(NULL);
fece7c99 1026
a1502d76 1027 sks = (spl_kmem_slab_t *)base;
1028 sks->sks_magic = SKS_MAGIC;
1029 sks->sks_objs = skc->skc_slab_objs;
1030 sks->sks_age = jiffies;
1031 sks->sks_cache = skc;
1032 INIT_LIST_HEAD(&sks->sks_list);
1033 INIT_LIST_HEAD(&sks->sks_free_list);
1034 sks->sks_ref = 0;
8b45dda2 1035 obj_size = spl_obj_size(skc);
48e0606a 1036
8d177c18 1037 if (skc->skc_flags & KMC_OFFSLAB)
8b45dda2 1038 offslab_size = spl_offslab_size(skc);
fece7c99 1039
1040 for (i = 0; i < sks->sks_objs; i++) {
a1502d76 1041 if (skc->skc_flags & KMC_OFFSLAB) {
8b45dda2 1042 obj = kv_alloc(skc, offslab_size, flags);
a1502d76 1043 if (!obj)
b17edc10 1044 SGOTO(out, rc = -ENOMEM);
a1502d76 1045 } else {
8b45dda2 1046 obj = base + spl_sks_size(skc) + (i * obj_size);
a1502d76 1047 }
1048
8b45dda2
BB
1049 ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
1050 sko = spl_sko_from_obj(skc, obj);
fece7c99 1051 sko->sko_addr = obj;
1052 sko->sko_magic = SKO_MAGIC;
1053 sko->sko_slab = sks;
1054 INIT_LIST_HEAD(&sko->sko_list);
fece7c99 1055 list_add_tail(&sko->sko_list, &sks->sks_free_list);
1056 }
1057
fece7c99 1058 list_for_each_entry(sko, &sks->sks_free_list, sko_list)
1059 if (skc->skc_ctor)
1060 skc->skc_ctor(sko->sko_addr, skc->skc_private, flags);
2fb9b26a 1061out:
a1502d76 1062 if (rc) {
1063 if (skc->skc_flags & KMC_OFFSLAB)
48e0606a
BB
1064 list_for_each_entry_safe(sko, n, &sks->sks_free_list,
1065 sko_list)
8b45dda2 1066 kv_free(skc, sko->sko_addr, offslab_size);
fece7c99 1067
a1502d76 1068 kv_free(skc, base, skc->skc_slab_size);
1069 sks = NULL;
fece7c99 1070 }
1071
b17edc10 1072 SRETURN(sks);
fece7c99 1073}
1074
ea3e6ca9
BB
1075/*
1076 * Remove a slab from complete or partial list, it must be called with
1077 * the 'skc->skc_lock' held but the actual free must be performed
1078 * outside the lock to prevent deadlocking on vmem addresses.
fece7c99 1079 */
f1ca4da6 1080static void
ea3e6ca9
BB
1081spl_slab_free(spl_kmem_slab_t *sks,
1082 struct list_head *sks_list, struct list_head *sko_list)
1083{
2fb9b26a 1084 spl_kmem_cache_t *skc;
b17edc10 1085 SENTRY;
57d86234 1086
2fb9b26a 1087 ASSERT(sks->sks_magic == SKS_MAGIC);
4afaaefa 1088 ASSERT(sks->sks_ref == 0);
d6a26c6a 1089
fece7c99 1090 skc = sks->sks_cache;
1091 ASSERT(skc->skc_magic == SKC_MAGIC);
d46630e0 1092 ASSERT(spin_is_locked(&skc->skc_lock));
f1ca4da6 1093
1a944a7d
BB
1094 /*
1095 * Update slab/objects counters in the cache, then remove the
1096 * slab from the skc->skc_partial_list. Finally add the slab
1097 * and all its objects in to the private work lists where the
1098 * destructors will be called and the memory freed to the system.
1099 */
fece7c99 1100 skc->skc_obj_total -= sks->sks_objs;
1101 skc->skc_slab_total--;
1102 list_del(&sks->sks_list);
ea3e6ca9 1103 list_add(&sks->sks_list, sks_list);
1a944a7d
BB
1104 list_splice_init(&sks->sks_free_list, sko_list);
1105
b17edc10 1106 SEXIT;
2fb9b26a 1107}
d6a26c6a 1108
ea3e6ca9
BB
1109/*
1110 * Traverses all the partial slabs attached to a cache and frees those
1111 * which are currently empty, and have not been touched for
37db7d8c
BB
1112 * skc_delay seconds to avoid thrashing. The count argument is
1113 * passed to optionally cap the number of slabs reclaimed, a count
1114 * of zero means try and reclaim everything. When flag is set we
1115 * always free an available slab regardless of age.
ea3e6ca9
BB
1116 */
1117static void
37db7d8c 1118spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
2fb9b26a 1119{
1120 spl_kmem_slab_t *sks, *m;
ea3e6ca9
BB
1121 spl_kmem_obj_t *sko, *n;
1122 LIST_HEAD(sks_list);
1123 LIST_HEAD(sko_list);
8b45dda2
BB
1124 uint32_t size = 0;
1125 int i = 0;
b17edc10 1126 SENTRY;
2fb9b26a 1127
2fb9b26a 1128 /*
ea3e6ca9
BB
1129 * Move empty slabs and objects which have not been touched in
1130 * skc_delay seconds on to private lists to be freed outside
1a944a7d
BB
1131 * the spin lock. This delay time is important to avoid thrashing
1132 * however when flag is set the delay will not be used.
2fb9b26a 1133 */
ea3e6ca9 1134 spin_lock(&skc->skc_lock);
1a944a7d
BB
1135 list_for_each_entry_safe_reverse(sks,m,&skc->skc_partial_list,sks_list){
1136 /*
1137 * All empty slabs are at the end of skc->skc_partial_list,
1138 * therefore once a non-empty slab is found we can stop
1139 * scanning. Additionally, stop when reaching the target
ecc39810 1140 * reclaim 'count' if a non-zero threshold is given.
1a944a7d 1141 */
cef7605c 1142 if ((sks->sks_ref > 0) || (count && i >= count))
37db7d8c
BB
1143 break;
1144
37db7d8c 1145 if (time_after(jiffies,sks->sks_age+skc->skc_delay*HZ)||flag) {
ea3e6ca9 1146 spl_slab_free(sks, &sks_list, &sko_list);
37db7d8c
BB
1147 i++;
1148 }
ea3e6ca9
BB
1149 }
1150 spin_unlock(&skc->skc_lock);
1151
1152 /*
1a944a7d
BB
1153 * The following two loops ensure all the object destructors are
1154 * run, any offslab objects are freed, and the slabs themselves
1155 * are freed. This is all done outside the skc->skc_lock since
1156 * this allows the destructor to sleep, and allows us to perform
1157 * a conditional reschedule when freeing a large number of
1158 * objects and slabs back to the system.
ea3e6ca9 1159 */
1a944a7d 1160 if (skc->skc_flags & KMC_OFFSLAB)
8b45dda2 1161 size = spl_offslab_size(skc);
ea3e6ca9 1162
1a944a7d
BB
1163 list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
1164 ASSERT(sko->sko_magic == SKO_MAGIC);
1165
1166 if (skc->skc_dtor)
1167 skc->skc_dtor(sko->sko_addr, skc->skc_private);
1168
1169 if (skc->skc_flags & KMC_OFFSLAB)
ea3e6ca9 1170 kv_free(skc, sko->sko_addr, size);
2fb9b26a 1171 }
1172
37db7d8c 1173 list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
1a944a7d 1174 ASSERT(sks->sks_magic == SKS_MAGIC);
ea3e6ca9 1175 kv_free(skc, sks, skc->skc_slab_size);
37db7d8c 1176 }
ea3e6ca9 1177
b17edc10 1178 SEXIT;
f1ca4da6 1179}
1180
ed316348
BB
1181static spl_kmem_emergency_t *
1182spl_emergency_search(struct rb_root *root, void *obj)
1183{
1184 struct rb_node *node = root->rb_node;
1185 spl_kmem_emergency_t *ske;
1186 unsigned long address = (unsigned long)obj;
1187
1188 while (node) {
1189 ske = container_of(node, spl_kmem_emergency_t, ske_node);
1190
1191 if (address < (unsigned long)ske->ske_obj)
1192 node = node->rb_left;
1193 else if (address > (unsigned long)ske->ske_obj)
1194 node = node->rb_right;
1195 else
1196 return ske;
1197 }
1198
1199 return NULL;
1200}
1201
1202static int
1203spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
1204{
1205 struct rb_node **new = &(root->rb_node), *parent = NULL;
1206 spl_kmem_emergency_t *ske_tmp;
1207 unsigned long address = (unsigned long)ske->ske_obj;
1208
1209 while (*new) {
1210 ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
1211
1212 parent = *new;
1213 if (address < (unsigned long)ske_tmp->ske_obj)
1214 new = &((*new)->rb_left);
1215 else if (address > (unsigned long)ske_tmp->ske_obj)
1216 new = &((*new)->rb_right);
1217 else
1218 return 0;
1219 }
1220
1221 rb_link_node(&ske->ske_node, parent, new);
1222 rb_insert_color(&ske->ske_node, root);
1223
1224 return 1;
1225}
1226
e2dcc6e2 1227/*
ed316348 1228 * Allocate a single emergency object and track it in a red black tree.
e2dcc6e2
BB
1229 */
1230static int
1231spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
1232{
1233 spl_kmem_emergency_t *ske;
1234 int empty;
1235 SENTRY;
1236
1237 /* Last chance use a partial slab if one now exists */
1238 spin_lock(&skc->skc_lock);
1239 empty = list_empty(&skc->skc_partial_list);
1240 spin_unlock(&skc->skc_lock);
1241 if (!empty)
1242 SRETURN(-EEXIST);
1243
1244 ske = kmalloc(sizeof(*ske), flags);
1245 if (ske == NULL)
1246 SRETURN(-ENOMEM);
1247
1248 ske->ske_obj = kmalloc(skc->skc_obj_size, flags);
1249 if (ske->ske_obj == NULL) {
1250 kfree(ske);
1251 SRETURN(-ENOMEM);
1252 }
1253
e2dcc6e2 1254 spin_lock(&skc->skc_lock);
ed316348
BB
1255 empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
1256 if (likely(empty)) {
1257 skc->skc_obj_total++;
1258 skc->skc_obj_emergency++;
1259 if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
1260 skc->skc_obj_emergency_max = skc->skc_obj_emergency;
1261 }
e2dcc6e2
BB
1262 spin_unlock(&skc->skc_lock);
1263
ed316348
BB
1264 if (unlikely(!empty)) {
1265 kfree(ske->ske_obj);
1266 kfree(ske);
1267 SRETURN(-EINVAL);
1268 }
1269
1270 if (skc->skc_ctor)
1271 skc->skc_ctor(ske->ske_obj, skc->skc_private, flags);
1272
e2dcc6e2
BB
1273 *obj = ske->ske_obj;
1274
1275 SRETURN(0);
1276}
1277
1278/*
ed316348 1279 * Locate the passed object in the red black tree and free it.
e2dcc6e2
BB
1280 */
1281static int
1282spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
1283{
ed316348 1284 spl_kmem_emergency_t *ske;
e2dcc6e2
BB
1285 SENTRY;
1286
1287 spin_lock(&skc->skc_lock);
ed316348
BB
1288 ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
1289 if (likely(ske)) {
1290 rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
1291 skc->skc_obj_emergency--;
1292 skc->skc_obj_total--;
e2dcc6e2
BB
1293 }
1294 spin_unlock(&skc->skc_lock);
1295
ed316348 1296 if (unlikely(ske == NULL))
e2dcc6e2
BB
1297 SRETURN(-ENOENT);
1298
1299 if (skc->skc_dtor)
1300 skc->skc_dtor(ske->ske_obj, skc->skc_private);
1301
1302 kfree(ske->ske_obj);
1303 kfree(ske);
1304
1305 SRETURN(0);
1306}
1307
d4899f47
BB
1308/*
1309 * Release objects from the per-cpu magazine back to their slab. The flush
1310 * argument contains the max number of entries to remove from the magazine.
1311 */
1312static void
1313__spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
1314{
1315 int i, count = MIN(flush, skm->skm_avail);
1316 SENTRY;
1317
1318 ASSERT(skc->skc_magic == SKC_MAGIC);
1319 ASSERT(skm->skm_magic == SKM_MAGIC);
1320 ASSERT(spin_is_locked(&skc->skc_lock));
1321
1322 for (i = 0; i < count; i++)
1323 spl_cache_shrink(skc, skm->skm_objs[i]);
1324
1325 skm->skm_avail -= count;
1326 memmove(skm->skm_objs, &(skm->skm_objs[count]),
1327 sizeof(void *) * skm->skm_avail);
1328
1329 SEXIT;
1330}
1331
1332static void
1333spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
1334{
1335 spin_lock(&skc->skc_lock);
1336 __spl_cache_flush(skc, skm, flush);
1337 spin_unlock(&skc->skc_lock);
1338}
1339
ea3e6ca9
BB
1340static void
1341spl_magazine_age(void *data)
f1ca4da6 1342{
a10287e0
BB
1343 spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
1344 spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
9b1b8e4c
BB
1345
1346 ASSERT(skm->skm_magic == SKM_MAGIC);
a10287e0 1347 ASSERT(skm->skm_cpu == smp_processor_id());
d4899f47
BB
1348 ASSERT(irqs_disabled());
1349
1350 /* There are no available objects or they are too young to age out */
1351 if ((skm->skm_avail == 0) ||
1352 time_before(jiffies, skm->skm_age + skc->skc_delay * HZ))
1353 return;
f1ca4da6 1354
d4899f47
BB
1355 /*
1356 * Because we're executing in interrupt context we may have
1357 * interrupted the holder of this lock. To avoid a potential
1358 * deadlock return if the lock is contended.
1359 */
1360 if (!spin_trylock(&skc->skc_lock))
1361 return;
1362
1363 __spl_cache_flush(skc, skm, skm->skm_refill);
1364 spin_unlock(&skc->skc_lock);
ea3e6ca9 1365}
4efd4118 1366
ea3e6ca9 1367/*
a10287e0
BB
1368 * Called regularly to keep a downward pressure on the cache.
1369 *
1370 * Objects older than skc->skc_delay seconds in the per-cpu magazines will
1371 * be returned to the caches. This is done to prevent idle magazines from
1372 * holding memory which could be better used elsewhere. The delay is
1373 * present to prevent thrashing the magazine.
1374 *
1375 * The newly released objects may result in empty partial slabs. Those
1376 * slabs should be released to the system. Otherwise moving the objects
1377 * out of the magazines is just wasted work.
ea3e6ca9
BB
1378 */
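/*
 * Illustrative note: the ageing task below is self-rescheduling.  Each run
 * reaps eligible slabs and then re-dispatches itself skc_delay / 3 seconds
 * in the future, so with a hypothetical 15 second skc_delay the magazines
 * and the partial slab list are inspected roughly every 5 seconds until the
 * cache is destroyed or KMC_EXPIRE_AGE is cleared at run time.
 */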
1379static void
1380spl_cache_age(void *data)
1381{
a10287e0
BB
1382 spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
1383 taskqid_t id = 0;
ea3e6ca9
BB
1384
1385 ASSERT(skc->skc_magic == SKC_MAGIC);
a10287e0 1386
0936c344
BB
1387 /* Dynamically disabled at run time */
1388 if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE))
1389 return;
1390
a10287e0 1391 atomic_inc(&skc->skc_ref);
a073aeb0
BB
1392
1393 if (!(skc->skc_flags & KMC_NOMAGAZINE))
1394 spl_on_each_cpu(spl_magazine_age, skc, 1);
1395
37db7d8c 1396 spl_slab_reclaim(skc, skc->skc_reap, 0);
ea3e6ca9 1397
a10287e0
BB
1398 while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) {
1399 id = taskq_dispatch_delay(
1400 spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP,
1401 ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
1402
1403 /* Destroy issued after dispatch immediately cancel it */
1404 if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id)
1405 taskq_cancel_id(spl_kmem_cache_taskq, id);
1406 }
1407
1408 spin_lock(&skc->skc_lock);
1409 skc->skc_taskqid = id;
1410 spin_unlock(&skc->skc_lock);
1411
1412 atomic_dec(&skc->skc_ref);
2fb9b26a 1413}
f1ca4da6 1414
ea3e6ca9 1415/*
8b45dda2 1416 * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
bdfbe594 1417 * When on-slab we want to target spl_kmem_cache_obj_per_slab. However,
ea3e6ca9
BB
1418 * for very small objects we may end up with more than this so as not
1419 * to waste space in the minimal allocation of a single page. Also for
bdfbe594 1420 * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min,
ea3e6ca9
BB
1421 * lower than this and we will fail.
1422 */
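/*
 * Worked example (illustrative, assuming 4K pages and a target of 16
 * objects per slab): a cache of 512 byte objects, each costing somewhat
 * more than 512 bytes once the spl_kmem_obj_t header and alignment are
 * added, cannot fit 16 objects in a 4K or 8K slab, so the sizing loop
 * settles on a 16K slab.  Only when even the maximum slab size cannot
 * hold spl_kmem_cache_obj_per_slab_min objects does sizing fail with
 * -ENOSPC.
 */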
48e0606a
BB
1423static int
1424spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
1425{
8b45dda2 1426 uint32_t sks_size, obj_size, max_size;
48e0606a
BB
1427
1428 if (skc->skc_flags & KMC_OFFSLAB) {
bdfbe594 1429 *objs = spl_kmem_cache_obj_per_slab;
ceb38728
BB
1430 *size = P2ROUNDUP(sizeof(spl_kmem_slab_t), PAGE_SIZE);
1431 SRETURN(0);
48e0606a 1432 } else {
8b45dda2
BB
1433 sks_size = spl_sks_size(skc);
1434 obj_size = spl_obj_size(skc);
ea3e6ca9
BB
1435
1436 if (skc->skc_flags & KMC_KMEM)
aa600d8a 1437 max_size = ((uint32_t)1 << (MAX_ORDER-3)) * PAGE_SIZE;
ea3e6ca9 1438 else
bdfbe594 1439 max_size = (spl_kmem_cache_max_size * 1024 * 1024);
48e0606a 1440
8b45dda2
BB
1441 /* Power of two sized slab */
1442 for (*size = PAGE_SIZE; *size <= max_size; *size *= 2) {
ea3e6ca9 1443 *objs = (*size - sks_size) / obj_size;
bdfbe594 1444 if (*objs >= spl_kmem_cache_obj_per_slab)
b17edc10 1445 SRETURN(0);
ea3e6ca9 1446 }
48e0606a 1447
ea3e6ca9 1448 /*
8b45dda2 1449 * Unable to satisfy target objects per slab, fall back to
ea3e6ca9
BB
1450 * allocating a maximally sized slab and, provided it can hold
1451 * at least the minimum object count, using it. If not, fail.
1452 */
1453 *size = max_size;
1454 *objs = (*size - sks_size) / obj_size;
bdfbe594 1455 if (*objs >= (spl_kmem_cache_obj_per_slab_min))
b17edc10 1456 SRETURN(0);
48e0606a
BB
1457 }
1458
b17edc10 1459 SRETURN(-ENOSPC);
48e0606a
BB
1460}
1461
ea3e6ca9
BB
1462/*
1463 * Make a guess at reasonable per-cpu magazine size based on the size of
1464 * each object and the cost of caching N of them in each magazine. Long
1465 * term this should really adapt based on an observed usage heuristic.
1466 */
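/*
 * Worked example (illustrative, assuming 4K pages): a 128 byte object falls
 * through to the final branch and gets a 256 entry magazine, a 2K object
 * gets 128 entries, a 64K object gets 64, and anything over 1MB is capped
 * at 4 entries so a full per-cpu magazine only ever caches a handful of
 * very large objects.
 */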
4afaaefa 1467static int
1468spl_magazine_size(spl_kmem_cache_t *skc)
1469{
8b45dda2
BB
1470 uint32_t obj_size = spl_obj_size(skc);
1471 int size;
b17edc10 1472 SENTRY;
4afaaefa 1473
ea3e6ca9 1474 /* Per-magazine sizes below assume a 4Kib page size */
8b45dda2 1475 if (obj_size > (PAGE_SIZE * 256))
ea3e6ca9 1476 size = 4; /* Minimum 4Mib per-magazine */
8b45dda2 1477 else if (obj_size > (PAGE_SIZE * 32))
ea3e6ca9 1478 size = 16; /* Minimum 2Mib per-magazine */
8b45dda2 1479 else if (obj_size > (PAGE_SIZE))
ea3e6ca9 1480 size = 64; /* Minimum 256Kib per-magazine */
8b45dda2 1481 else if (obj_size > (PAGE_SIZE / 4))
ea3e6ca9 1482 size = 128; /* Minimum 128Kib per-magazine */
4afaaefa 1483 else
ea3e6ca9 1484 size = 256;
4afaaefa 1485
b17edc10 1486 SRETURN(size);
4afaaefa 1487}
1488
ea3e6ca9 1489/*
ecc39810 1490 * Allocate a per-cpu magazine to associate with a specific core.
ea3e6ca9 1491 */
4afaaefa 1492static spl_kmem_magazine_t *
08850edd 1493spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
4afaaefa 1494{
1495 spl_kmem_magazine_t *skm;
1496 int size = sizeof(spl_kmem_magazine_t) +
1497 sizeof(void *) * skc->skc_mag_size;
b17edc10 1498 SENTRY;
4afaaefa 1499
08850edd 1500 skm = kmem_alloc_node(size, KM_SLEEP, cpu_to_node(cpu));
4afaaefa 1501 if (skm) {
1502 skm->skm_magic = SKM_MAGIC;
1503 skm->skm_avail = 0;
1504 skm->skm_size = skc->skc_mag_size;
1505 skm->skm_refill = skc->skc_mag_refill;
9b1b8e4c 1506 skm->skm_cache = skc;
ea3e6ca9 1507 skm->skm_age = jiffies;
08850edd 1508 skm->skm_cpu = cpu;
4afaaefa 1509 }
1510
b17edc10 1511 SRETURN(skm);
4afaaefa 1512}
1513
ea3e6ca9 1514/*
ecc39810 1515 * Free a per-cpu magazine associated with a specific core.
ea3e6ca9 1516 */
4afaaefa 1517static void
1518spl_magazine_free(spl_kmem_magazine_t *skm)
1519{
a0f6da3d 1520 int size = sizeof(spl_kmem_magazine_t) +
1521 sizeof(void *) * skm->skm_size;
1522
b17edc10 1523 SENTRY;
4afaaefa 1524 ASSERT(skm->skm_magic == SKM_MAGIC);
1525 ASSERT(skm->skm_avail == 0);
a0f6da3d 1526
1527 kmem_free(skm, size);
b17edc10 1528 SEXIT;
4afaaefa 1529}
1530
ea3e6ca9
BB
1531/*
1532 * Create all per-cpu magazines of reasonable sizes.
1533 */
4afaaefa 1534static int
1535spl_magazine_create(spl_kmem_cache_t *skc)
1536{
37db7d8c 1537 int i;
b17edc10 1538 SENTRY;
4afaaefa 1539
a073aeb0
BB
1540 if (skc->skc_flags & KMC_NOMAGAZINE)
1541 SRETURN(0);
1542
4afaaefa 1543 skc->skc_mag_size = spl_magazine_size(skc);
ea3e6ca9 1544 skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
4afaaefa 1545
37db7d8c 1546 for_each_online_cpu(i) {
08850edd 1547 skc->skc_mag[i] = spl_magazine_alloc(skc, i);
37db7d8c
BB
1548 if (!skc->skc_mag[i]) {
1549 for (i--; i >= 0; i--)
1550 spl_magazine_free(skc->skc_mag[i]);
4afaaefa 1551
b17edc10 1552 SRETURN(-ENOMEM);
37db7d8c
BB
1553 }
1554 }
4afaaefa 1555
b17edc10 1556 SRETURN(0);
4afaaefa 1557}
1558
ea3e6ca9
BB
1559/*
1560 * Destroy all per-cpu magazines.
1561 */
4afaaefa 1562static void
1563spl_magazine_destroy(spl_kmem_cache_t *skc)
1564{
37db7d8c
BB
1565 spl_kmem_magazine_t *skm;
1566 int i;
b17edc10 1567 SENTRY;
37db7d8c 1568
a073aeb0
BB
1569 if (skc->skc_flags & KMC_NOMAGAZINE) {
1570 SEXIT;
1571 return;
1572 }
1573
37db7d8c
BB
1574 for_each_online_cpu(i) {
1575 skm = skc->skc_mag[i];
d4899f47 1576 spl_cache_flush(skc, skm, skm->skm_avail);
37db7d8c
BB
1577 spl_magazine_free(skm);
1578 }
1579
b17edc10 1580 SEXIT;
4afaaefa 1581}
1582
ea3e6ca9
BB
1583/*
1584 * Create a object cache based on the following arguments:
1585 * name cache name
1586 * size cache object size
1587 * align cache object alignment
1588 * ctor cache object constructor
1589 * dtor cache object destructor
1590 * reclaim cache object reclaim
1591 * priv cache private data for ctor/dtor/reclaim
1592 * vmp unused must be NULL
1593 * flags
1594 * KMC_NOTOUCH Disable cache object aging (unsupported)
1595 * KMC_NODEBUG Disable debugging (unsupported)
ea3e6ca9
BB
1596 * KMC_NOHASH Disable hashing (unsupported)
1597 * KMC_QCACHE Disable qcache (unsupported)
a073aeb0 1598 * KMC_NOMAGAZINE Enabled for kmem/vmem, Disabled for Linux slab
ea3e6ca9
BB
1599 * KMC_KMEM Force kmem backed cache
1600 * KMC_VMEM Force vmem backed cache
a073aeb0 1601 * KMC_SLAB Force Linux slab backed cache
ea3e6ca9
BB
1602 * KMC_OFFSLAB Locate objects off the slab
1603 */
2fb9b26a 1604spl_kmem_cache_t *
1605spl_kmem_cache_create(char *name, size_t size, size_t align,
1606 spl_kmem_ctor_t ctor,
1607 spl_kmem_dtor_t dtor,
1608 spl_kmem_reclaim_t reclaim,
1609 void *priv, void *vmp, int flags)
1610{
1611 spl_kmem_cache_t *skc;
296a8e59 1612 int rc;
b17edc10 1613 SENTRY;
937879f1 1614
a1502d76 1615 ASSERTF(!(flags & KMC_NOMAGAZINE), "Bad KMC_NOMAGAZINE (%x)\n", flags);
1616 ASSERTF(!(flags & KMC_NOHASH), "Bad KMC_NOHASH (%x)\n", flags);
1617 ASSERTF(!(flags & KMC_QCACHE), "Bad KMC_QCACHE (%x)\n", flags);
48e0606a 1618 ASSERT(vmp == NULL);
a1502d76 1619
296a8e59 1620 might_sleep();
0a6fd143 1621
296a8e59
BB
1622 /*
1623 * Allocate memory for a new cache and initialize it. Unfortunately,
5198ea0e
BB
1624 * this usually ends up being a large allocation of ~32k because
1625 * we need to allocate enough memory for the worst case number of
1626 * cpus in the magazine, skc_mag[NR_CPUS]. Because of this we
296a8e59
BB
1627 * explicitly pass KM_NODEBUG to suppress the kmem warning
1628 */
1629 skc = kmem_zalloc(sizeof(*skc), KM_SLEEP| KM_NODEBUG);
e9d7a2be 1630 if (skc == NULL)
b17edc10 1631 SRETURN(NULL);
d61e12af 1632
2fb9b26a 1633 skc->skc_magic = SKC_MAGIC;
2fb9b26a 1634 skc->skc_name_size = strlen(name) + 1;
296a8e59 1635 skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, KM_SLEEP);
2fb9b26a 1636 if (skc->skc_name == NULL) {
1637 kmem_free(skc, sizeof(*skc));
b17edc10 1638 SRETURN(NULL);
2fb9b26a 1639 }
1640 strncpy(skc->skc_name, name, skc->skc_name_size);
1641
e9d7a2be 1642 skc->skc_ctor = ctor;
1643 skc->skc_dtor = dtor;
1644 skc->skc_reclaim = reclaim;
2fb9b26a 1645 skc->skc_private = priv;
1646 skc->skc_vmp = vmp;
a073aeb0 1647 skc->skc_linux_cache = NULL;
2fb9b26a 1648 skc->skc_flags = flags;
1649 skc->skc_obj_size = size;
48e0606a 1650 skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
2fb9b26a 1651 skc->skc_delay = SPL_KMEM_CACHE_DELAY;
37db7d8c 1652 skc->skc_reap = SPL_KMEM_CACHE_REAP;
ea3e6ca9 1653 atomic_set(&skc->skc_ref, 0);
2fb9b26a 1654
2fb9b26a 1655 INIT_LIST_HEAD(&skc->skc_list);
1656 INIT_LIST_HEAD(&skc->skc_complete_list);
1657 INIT_LIST_HEAD(&skc->skc_partial_list);
ed316348 1658 skc->skc_emergency_tree = RB_ROOT;
d46630e0 1659 spin_lock_init(&skc->skc_lock);
e2dcc6e2 1660 init_waitqueue_head(&skc->skc_waitq);
e9d7a2be 1661 skc->skc_slab_fail = 0;
1662 skc->skc_slab_create = 0;
1663 skc->skc_slab_destroy = 0;
2fb9b26a 1664 skc->skc_slab_total = 0;
1665 skc->skc_slab_alloc = 0;
1666 skc->skc_slab_max = 0;
1667 skc->skc_obj_total = 0;
1668 skc->skc_obj_alloc = 0;
1669 skc->skc_obj_max = 0;
165f13c3 1670 skc->skc_obj_deadlock = 0;
e2dcc6e2
BB
1671 skc->skc_obj_emergency = 0;
1672 skc->skc_obj_emergency_max = 0;
a1502d76 1673
a073aeb0
BB
1674 /*
1675 * Verify the requested alignment restriction is sane.
1676 */
48e0606a 1677 if (align) {
8b45dda2 1678 VERIFY(ISP2(align));
a073aeb0
BB
1679 VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN);
1680 VERIFY3U(align, <=, PAGE_SIZE);
48e0606a
BB
1681 skc->skc_obj_align = align;
1682 }
1683
a073aeb0
BB
1684 /*
1685 * When no specific type of slab is requested (kmem, vmem, or
1686 * linuxslab) then select a cache type based on the object size
1687 * and default tunables.
1688 */
1689 if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) {
1690
1691 /*
1692 * Objects smaller than spl_kmem_cache_slab_limit can
1693 * use the Linux slab for better space-efficiency. By
1694 * default this functionality is disabled until its
1695 * performance characteristics are fully understood.
1696 */
1697 if (spl_kmem_cache_slab_limit &&
1698 size <= (size_t)spl_kmem_cache_slab_limit)
1699 skc->skc_flags |= KMC_SLAB;
1700
1701 /*
1702 * Small objects, less than spl_kmem_cache_kmem_limit per
1703 * object, should use kmem because their slabs are small.
1704 */
1705 else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit)
a1502d76 1706 skc->skc_flags |= KMC_KMEM;
a073aeb0
BB
1707
1708 /*
1709 * All other objects are considered large and are placed
1710 * on vmem backed slabs.
1711 */
8b45dda2 1712 else
a1502d76 1713 skc->skc_flags |= KMC_VMEM;
a1502d76 1714 }
1715
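	/*
	 * Illustration (tunable values hypothetical): if
	 * spl_kmem_cache_slab_limit were set to 16384, a 512-byte object
	 * would take the KMC_SLAB branch above; with the limit left at 0
	 * the same object falls through to the spl_obj_size() comparison
	 * and ends up on kmem or vmem backed slabs instead.
	 */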
a073aeb0
BB
1716 /*
1717 * Given the type of slab allocate the required resources.
1718 */
1719 if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
1720 rc = spl_slab_size(skc,
1721 &skc->skc_slab_objs, &skc->skc_slab_size);
1722 if (rc)
1723 SGOTO(out, rc);
1724
1725 rc = spl_magazine_create(skc);
1726 if (rc)
1727 SGOTO(out, rc);
1728 } else {
1729 skc->skc_linux_cache = kmem_cache_create(
1730 skc->skc_name, size, align, 0, NULL);
1731 if (skc->skc_linux_cache == NULL)
1732 SGOTO(out, rc = ENOMEM);
4afaaefa 1733
a073aeb0
BB
1734 kmem_cache_set_allocflags(skc, __GFP_COMP);
1735 skc->skc_flags |= KMC_NOMAGAZINE;
1736 }
2fb9b26a 1737
0936c344
BB
1738 if (spl_kmem_cache_expire & KMC_EXPIRE_AGE)
1739 skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq,
1740 spl_cache_age, skc, TQ_SLEEP,
1741 ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
ea3e6ca9 1742
2fb9b26a 1743 down_write(&spl_kmem_cache_sem);
e9d7a2be 1744 list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
2fb9b26a 1745 up_write(&spl_kmem_cache_sem);
1746
b17edc10 1747 SRETURN(skc);
48e0606a
BB
1748out:
1749 kmem_free(skc->skc_name, skc->skc_name_size);
1750 kmem_free(skc, sizeof(*skc));
b17edc10 1751 SRETURN(NULL);
f1ca4da6 1752}
2fb9b26a 1753EXPORT_SYMBOL(spl_kmem_cache_create);
f1ca4da6 1754
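/*
 * Usage sketch (illustrative only, not part of this file): a caller
 * typically creates a cache for a fixed-size structure, lets the size
 * heuristic above choose the backing store, and pairs every allocation
 * with a free before destroying the cache.  The names my_obj_t, my_ctor,
 * and my_dtor are hypothetical; the constructor/destructor prototypes
 * assume the spl_kmem_ctor_t and spl_kmem_dtor_t typedefs from
 * sys/kmem.h.
 *
 *	typedef struct my_obj { int mo_state; } my_obj_t;
 *
 *	static int
 *	my_ctor(void *buf, void *priv, int kmflags)
 *	{
 *		((my_obj_t *)buf)->mo_state = 0;
 *		return (0);
 *	}
 *
 *	static void
 *	my_dtor(void *buf, void *priv)
 *	{
 *	}
 *
 *	static void
 *	my_cache_example(void)
 *	{
 *		spl_kmem_cache_t *skc;
 *		my_obj_t *obj;
 *
 *		skc = spl_kmem_cache_create("my_obj_cache",
 *		    sizeof (my_obj_t), 0, my_ctor, my_dtor,
 *		    NULL, NULL, NULL, 0);
 *		obj = spl_kmem_cache_alloc(skc, KM_SLEEP);
 *		spl_kmem_cache_free(skc, obj);
 *		spl_kmem_cache_destroy(skc);
 *	}
 */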
2b354302
BB
1755/*
1756 * Register a move callback for cache defragmentation.
1757 * XXX: Unimplemented but harmless to stub out for now.
1758 */
1759void
6576a1a7 1760spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
2b354302
BB
1761 kmem_cbrc_t (move)(void *, void *, size_t, void *))
1762{
1763 ASSERT(move != NULL);
1764}
1765EXPORT_SYMBOL(spl_kmem_cache_set_move);
1766
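/*
 * Illustrative registration (hypothetical names; since the callback is
 * never invoked by this stub only the signature matters, and the return
 * codes follow the Solaris kmem_cbrc_t convention):
 *
 *	static kmem_cbrc_t
 *	my_move(void *old, void *new, size_t size, void *priv)
 *	{
 *		return (KMEM_CBRC_NO);
 *	}
 *
 *	spl_kmem_cache_set_move(skc, my_move);
 */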
ea3e6ca9 1767/*
ecc39810 1768 * Destroy a cache and all objects associated with the cache.
ea3e6ca9 1769 */
2fb9b26a 1770void
1771spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
f1ca4da6 1772{
ea3e6ca9 1773 DECLARE_WAIT_QUEUE_HEAD(wq);
a10287e0 1774 taskqid_t id;
b17edc10 1775 SENTRY;
f1ca4da6 1776
e9d7a2be 1777 ASSERT(skc->skc_magic == SKC_MAGIC);
a073aeb0 1778 ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB));
e9d7a2be 1779
1780 down_write(&spl_kmem_cache_sem);
1781 list_del_init(&skc->skc_list);
1782 up_write(&spl_kmem_cache_sem);
2fb9b26a 1783
a10287e0 1784 /* Cancel and wait for any pending delayed tasks */
64c075c3 1785 VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
9b1b8e4c 1786
a10287e0
BB
1787 spin_lock(&skc->skc_lock);
1788 id = skc->skc_taskqid;
1789 spin_unlock(&skc->skc_lock);
1790
1791 taskq_cancel_id(spl_kmem_cache_taskq, id);
ea3e6ca9
BB
1792
1793 /* Wait until all current callers complete; this is mainly
1794 * to catch the case where a low memory situation triggers a
1795 * cache reaping action which races with this destroy. */
1796 wait_event(wq, atomic_read(&skc->skc_ref) == 0);
1797
a073aeb0
BB
1798 if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
1799 spl_magazine_destroy(skc);
1800 spl_slab_reclaim(skc, 0, 1);
1801 } else {
1802 ASSERT(skc->skc_flags & KMC_SLAB);
1803 kmem_cache_destroy(skc->skc_linux_cache);
1804 }
1805
d46630e0 1806 spin_lock(&skc->skc_lock);
d6a26c6a 1807
2fb9b26a 1808 /* Validate there are no objects in use and free all the
4afaaefa 1809 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
ea3e6ca9
BB
1810 ASSERT3U(skc->skc_slab_alloc, ==, 0);
1811 ASSERT3U(skc->skc_obj_alloc, ==, 0);
1812 ASSERT3U(skc->skc_slab_total, ==, 0);
1813 ASSERT3U(skc->skc_obj_total, ==, 0);
e2dcc6e2 1814 ASSERT3U(skc->skc_obj_emergency, ==, 0);
2fb9b26a 1815 ASSERT(list_empty(&skc->skc_complete_list));
a1502d76 1816
2fb9b26a 1817 kmem_free(skc->skc_name, skc->skc_name_size);
d46630e0 1818 spin_unlock(&skc->skc_lock);
ff449ac4 1819
4afaaefa 1820 kmem_free(skc, sizeof(*skc));
2fb9b26a 1821
b17edc10 1822 SEXIT;
f1ca4da6 1823}
2fb9b26a 1824EXPORT_SYMBOL(spl_kmem_cache_destroy);
f1ca4da6 1825
ea3e6ca9
BB
1826/*
1827 * Allocate an object from a slab attached to the cache. This is used to
1828 * repopulate the per-cpu magazine caches in batches when they run low.
1829 */
4afaaefa 1830static void *
1831spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
f1ca4da6 1832{
2fb9b26a 1833 spl_kmem_obj_t *sko;
f1ca4da6 1834
e9d7a2be 1835 ASSERT(skc->skc_magic == SKC_MAGIC);
1836 ASSERT(sks->sks_magic == SKS_MAGIC);
4afaaefa 1837 ASSERT(spin_is_locked(&skc->skc_lock));
2fb9b26a 1838
a1502d76 1839 sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
4afaaefa 1840 ASSERT(sko->sko_magic == SKO_MAGIC);
1841 ASSERT(sko->sko_addr != NULL);
2fb9b26a 1842
a1502d76 1843 /* Remove from sks_free_list */
4afaaefa 1844 list_del_init(&sko->sko_list);
2fb9b26a 1845
4afaaefa 1846 sks->sks_age = jiffies;
1847 sks->sks_ref++;
1848 skc->skc_obj_alloc++;
2fb9b26a 1849
4afaaefa 1850 /* Track max obj usage statistics */
1851 if (skc->skc_obj_alloc > skc->skc_obj_max)
1852 skc->skc_obj_max = skc->skc_obj_alloc;
2fb9b26a 1853
4afaaefa 1854 /* Track max slab usage statistics */
1855 if (sks->sks_ref == 1) {
1856 skc->skc_slab_alloc++;
f1ca4da6 1857
4afaaefa 1858 if (skc->skc_slab_alloc > skc->skc_slab_max)
1859 skc->skc_slab_max = skc->skc_slab_alloc;
2fb9b26a 1860 }
1861
4afaaefa 1862 return sko->sko_addr;
1863}
c30df9c8 1864
ea3e6ca9 1865/*
e2dcc6e2
BB
1866 * Generic slab allocation function to be run by the global work queues.
1867 * It is responsible for allocating a new slab, linking it in to the list
1868 * of partial slabs, and then waking any waiters.
4afaaefa 1869 */
e2dcc6e2
BB
1870static void
1871spl_cache_grow_work(void *data)
4afaaefa 1872{
33e94ef1 1873 spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
e2dcc6e2 1874 spl_kmem_cache_t *skc = ska->ska_cache;
e9d7a2be 1875 spl_kmem_slab_t *sks;
e2dcc6e2
BB
1876
1877 sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG);
1878 spin_lock(&skc->skc_lock);
1879 if (sks) {
1880 skc->skc_slab_total++;
1881 skc->skc_obj_total += sks->sks_objs;
1882 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1883 }
1884
1885 atomic_dec(&skc->skc_ref);
1886 clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
165f13c3 1887 clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
e2dcc6e2
BB
1888 wake_up_all(&skc->skc_waitq);
1889 spin_unlock(&skc->skc_lock);
1890
1891 kfree(ska);
1892}
1893
1894/*
1895 * Returns non-zero when a new slab should be available.
1896 */
1897static int
1898spl_cache_grow_wait(spl_kmem_cache_t *skc)
1899{
1900 return !test_bit(KMC_BIT_GROWING, &skc->skc_flags);
1901}
1902
dc1b3022
BB
1903static int
1904spl_cache_reclaim_wait(void *word)
1905{
1906 schedule();
1907 return 0;
1908}
1909
e2dcc6e2 1910/*
a073aeb0
BB
1911 * No available objects on any slabs, so create a new slab. Note that this
1912 * functionality is disabled for KMC_SLAB caches which are backed by the
1913 * Linux slab.
e2dcc6e2
BB
1914 */
1915static int
1916spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
1917{
165f13c3 1918 int remaining, rc;
b17edc10 1919 SENTRY;
f1ca4da6 1920
e9d7a2be 1921 ASSERT(skc->skc_magic == SKC_MAGIC);
a073aeb0 1922 ASSERT((skc->skc_flags & KMC_SLAB) == 0);
ea3e6ca9 1923 might_sleep();
e2dcc6e2 1924 *obj = NULL;
e9d7a2be 1925
ea3e6ca9 1926 /*
dc1b3022
BB
1927 * Before allocating a new slab wait for any reaping to complete and
1928 * then return so the local magazine can be rechecked for new objects.
ea3e6ca9 1929 */
dc1b3022
BB
1930 if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
1931 rc = wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
1932 spl_cache_reclaim_wait, TASK_UNINTERRUPTIBLE);
1933 SRETURN(rc ? rc : -EAGAIN);
1934 }
2fb9b26a 1935
e2dcc6e2
BB
1936 /*
1937 * This is handled by dispatching a work request to the global work
1938 * queue. This allows us to asynchronously allocate a new slab while
1939 * retaining the ability to safely fall back to smaller synchronous
1940 * allocations to ensure forward progress is always maintained.
1941 */
1942 if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
1943 spl_kmem_alloc_t *ska;
4afaaefa 1944
e2dcc6e2
BB
1945 ska = kmalloc(sizeof(*ska), flags);
1946 if (ska == NULL) {
1947 clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
1948 wake_up_all(&skc->skc_waitq);
1949 SRETURN(-ENOMEM);
1950 }
4afaaefa 1951
e2dcc6e2
BB
1952 atomic_inc(&skc->skc_ref);
1953 ska->ska_cache = skc;
043f9b57 1954 ska->ska_flags = flags & ~__GFP_FS;
33e94ef1
BB
1955 taskq_init_ent(&ska->ska_tqe);
1956 taskq_dispatch_ent(spl_kmem_cache_taskq,
1957 spl_cache_grow_work, ska, 0, &ska->ska_tqe);
e2dcc6e2
BB
1958 }
1959
1960 /*
165f13c3
BB
1961 * The goal here is to only detect the rare case where a virtual slab
1962 * allocation has deadlocked. We must be careful to minimize the use
1963 * of emergency objects which are more expensive to track. Therefore,
1964 * we set a very long timeout for the asynchronous allocation and if
1965 * the timeout is reached the cache is flagged as deadlocked. From
1966 * this point only new emergency objects will be allocated until the
1967 * asynchronous allocation completes and clears the deadlocked flag.
e2dcc6e2 1968 */
165f13c3
BB
1969 if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
1970 rc = spl_emergency_alloc(skc, flags, obj);
1971 } else {
1972 remaining = wait_event_timeout(skc->skc_waitq,
1973 spl_cache_grow_wait(skc), HZ);
1974
1975 if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) {
1976 spin_lock(&skc->skc_lock);
1977 if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
1978 set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
1979 skc->skc_obj_deadlock++;
1980 }
1981 spin_unlock(&skc->skc_lock);
1982 }
cb5c2ace 1983
165f13c3 1984 rc = -ENOMEM;
cb5c2ace 1985 }
e2dcc6e2
BB
1986
1987 SRETURN(rc);
f1ca4da6 1988}
1989
ea3e6ca9 1990/*
e2dcc6e2
BB
1991 * Refill a per-cpu magazine with objects from the slabs for this cache.
1992 * Ideally the magazine can be repopulated using existing objects which have
1993 * been released; however, if we are unable to locate enough free objects new
1994 * slabs of objects will be created. On success NULL is returned, otherwise
1995 * the address of a single emergency object is returned for use by the caller.
ea3e6ca9 1996 */
e2dcc6e2 1997static void *
4afaaefa 1998spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
f1ca4da6 1999{
e9d7a2be 2000 spl_kmem_slab_t *sks;
e2dcc6e2
BB
2001 int count = 0, rc, refill;
2002 void *obj = NULL;
b17edc10 2003 SENTRY;
f1ca4da6 2004
e9d7a2be 2005 ASSERT(skc->skc_magic == SKC_MAGIC);
2006 ASSERT(skm->skm_magic == SKM_MAGIC);
2007
e9d7a2be 2008 refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
d46630e0 2009 spin_lock(&skc->skc_lock);
ff449ac4 2010
4afaaefa 2011 while (refill > 0) {
ea3e6ca9 2012 /* No slabs available we may need to grow the cache */
4afaaefa 2013 if (list_empty(&skc->skc_partial_list)) {
2014 spin_unlock(&skc->skc_lock);
ff449ac4 2015
e2dcc6e2
BB
2016 local_irq_enable();
2017 rc = spl_cache_grow(skc, flags, &obj);
2018 local_irq_disable();
2019
2020 /* Emergency object for immediate use by caller */
2021 if (rc == 0 && obj != NULL)
2022 SRETURN(obj);
2023
2024 if (rc)
b17edc10 2025 SGOTO(out, rc);
4afaaefa 2026
2027 /* Rescheduled to different CPU skm is not local */
2028 if (skm != skc->skc_mag[smp_processor_id()])
b17edc10 2029 SGOTO(out, rc);
e9d7a2be 2030
2031 /* Potentially rescheduled to the same CPU but
ecc39810 2032 * allocations may have occurred from this CPU while
e9d7a2be 2033 * we were sleeping so recalculate max refill. */
2034 refill = MIN(refill, skm->skm_size - skm->skm_avail);
4afaaefa 2035
2036 spin_lock(&skc->skc_lock);
2037 continue;
2038 }
d46630e0 2039
4afaaefa 2040 /* Grab the next available slab */
2041 sks = list_entry((&skc->skc_partial_list)->next,
2042 spl_kmem_slab_t, sks_list);
2043 ASSERT(sks->sks_magic == SKS_MAGIC);
2044 ASSERT(sks->sks_ref < sks->sks_objs);
2045 ASSERT(!list_empty(&sks->sks_free_list));
d46630e0 2046
4afaaefa 2047 /* Consume as many objects as needed to refill the requested
e9d7a2be 2048 * cache. We must also be careful not to overfill it. */
e2dcc6e2 2049 while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) {
e9d7a2be 2050 ASSERT(skm->skm_avail < skm->skm_size);
e2dcc6e2 2051 ASSERT(count < skm->skm_size);
4afaaefa 2052 skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks);
e9d7a2be 2053 }
f1ca4da6 2054
4afaaefa 2055 /* Move slab to skc_complete_list when full */
2056 if (sks->sks_ref == sks->sks_objs) {
2057 list_del(&sks->sks_list);
2058 list_add(&sks->sks_list, &skc->skc_complete_list);
2fb9b26a 2059 }
2060 }
57d86234 2061
4afaaefa 2062 spin_unlock(&skc->skc_lock);
2063out:
e2dcc6e2 2064 SRETURN(NULL);
4afaaefa 2065}
2066
ea3e6ca9
BB
2067/*
2068 * Release an object back to the slab from which it came.
2069 */
4afaaefa 2070static void
2071spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
2072{
e9d7a2be 2073 spl_kmem_slab_t *sks = NULL;
4afaaefa 2074 spl_kmem_obj_t *sko = NULL;
b17edc10 2075 SENTRY;
4afaaefa 2076
e9d7a2be 2077 ASSERT(skc->skc_magic == SKC_MAGIC);
4afaaefa 2078 ASSERT(spin_is_locked(&skc->skc_lock));
2079
8b45dda2 2080 sko = spl_sko_from_obj(skc, obj);
a1502d76 2081 ASSERT(sko->sko_magic == SKO_MAGIC);
4afaaefa 2082 sks = sko->sko_slab;
a1502d76 2083 ASSERT(sks->sks_magic == SKS_MAGIC);
2fb9b26a 2084 ASSERT(sks->sks_cache == skc);
2fb9b26a 2085 list_add(&sko->sko_list, &sks->sks_free_list);
d6a26c6a 2086
2fb9b26a 2087 sks->sks_age = jiffies;
4afaaefa 2088 sks->sks_ref--;
2fb9b26a 2089 skc->skc_obj_alloc--;
f1ca4da6 2090
2fb9b26a 2091 /* Move slab to skc_partial_list when no longer full. Slabs
4afaaefa 2092 * are added to the head to keep the partial list in quasi-full
2093 * sorted order. Fuller at the head, emptier at the tail. */
2094 if (sks->sks_ref == (sks->sks_objs - 1)) {
2fb9b26a 2095 list_del(&sks->sks_list);
2096 list_add(&sks->sks_list, &skc->skc_partial_list);
2097 }
f1ca4da6 2098
ecc39810 2099 /* Move empty slabs to the end of the partial list so
4afaaefa 2100 * they can be easily found and freed during reclamation. */
2101 if (sks->sks_ref == 0) {
2fb9b26a 2102 list_del(&sks->sks_list);
2103 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
2104 skc->skc_slab_alloc--;
2105 }
2106
b17edc10 2107 SEXIT;
4afaaefa 2108}
2109
ea3e6ca9
BB
2110/*
2111 * Allocate an object from the per-cpu magazine, or if the magazine
2112 * is empty, directly allocate from a slab and repopulate the magazine.
2113 */
4afaaefa 2114void *
2115spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
2116{
2117 spl_kmem_magazine_t *skm;
4afaaefa 2118 void *obj = NULL;
b17edc10 2119 SENTRY;
4afaaefa 2120
e9d7a2be 2121 ASSERT(skc->skc_magic == SKC_MAGIC);
ea3e6ca9
BB
2122 ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
2123 ASSERT(flags & KM_SLEEP);
a073aeb0 2124
ea3e6ca9 2125 atomic_inc(&skc->skc_ref);
a073aeb0
BB
2126
2127 /*
2128 * Allocate directly from a Linux slab. All optimizations are left
2129 * to the underlying cache; we only need to guarantee that KM_SLEEP
2130 * callers will never fail.
2131 */
2132 if (skc->skc_flags & KMC_SLAB) {
2133 struct kmem_cache *slc = skc->skc_linux_cache;
2134
2135 do {
2136 obj = kmem_cache_alloc(slc, flags | __GFP_COMP);
2137 if (obj && skc->skc_ctor)
2138 skc->skc_ctor(obj, skc->skc_private, flags);
2139
2140 } while ((obj == NULL) && !(flags & KM_NOSLEEP));
2141
2142 atomic_dec(&skc->skc_ref);
2143 SRETURN(obj);
2144 }
2145
429fe89c 2146 local_irq_disable();
4afaaefa 2147
2148restart:
2149 /* Safe to update per-cpu structure without lock, but
ecc39810 2150 * in the restart case we must be careful to reacquire
4afaaefa 2151 * the local magazine since this may have changed
2152 * when we need to grow the cache. */
2153 skm = skc->skc_mag[smp_processor_id()];
e9d7a2be 2154 ASSERTF(skm->skm_magic == SKM_MAGIC, "%x != %x: %s/%p/%p %x/%x/%x\n",
2155 skm->skm_magic, SKM_MAGIC, skc->skc_name, skc, skm,
2156 skm->skm_size, skm->skm_refill, skm->skm_avail);
4afaaefa 2157
2158 if (likely(skm->skm_avail)) {
2159 /* Object available in CPU cache, use it */
2160 obj = skm->skm_objs[--skm->skm_avail];
ea3e6ca9 2161 skm->skm_age = jiffies;
4afaaefa 2162 } else {
e2dcc6e2
BB
2163 obj = spl_cache_refill(skc, skm, flags);
2164 if (obj == NULL)
2165 SGOTO(restart, obj = NULL);
4afaaefa 2166 }
2167
429fe89c 2168 local_irq_enable();
fece7c99 2169 ASSERT(obj);
8b45dda2 2170 ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
4afaaefa 2171
2172 /* Pre-emptively migrate object to CPU L1 cache */
2173 prefetchw(obj);
ea3e6ca9 2174 atomic_dec(&skc->skc_ref);
4afaaefa 2175
b17edc10 2176 SRETURN(obj);
4afaaefa 2177}
2178EXPORT_SYMBOL(spl_kmem_cache_alloc);
2179
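/*
 * Illustration: in the common case the call above amounts to popping a
 * pointer off the local per-cpu magazine, e.g.
 *
 *	void *obj = spl_kmem_cache_alloc(skc, KM_SLEEP);
 *
 * Only when the magazine is empty does the allocation fall back to
 * spl_cache_refill() and, if no partial slabs exist, to growing the
 * cache asynchronously.
 */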
ea3e6ca9
BB
2180/*
2181 * Free an object back to the local per-cpu magazine; there is no
2182 * guarantee that this is the same magazine the object was originally
2183 * allocated from. We may need to flush entire magazines
2184 * back to the slabs to make space.
2185 */
4afaaefa 2186void
2187spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
2188{
2189 spl_kmem_magazine_t *skm;
2190 unsigned long flags;
b17edc10 2191 SENTRY;
4afaaefa 2192
e9d7a2be 2193 ASSERT(skc->skc_magic == SKC_MAGIC);
ea3e6ca9
BB
2194 ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
2195 atomic_inc(&skc->skc_ref);
e2dcc6e2 2196
a073aeb0
BB
2197 /*
2198 * Free the object back to the underlying Linux slab.
2199 */
2200 if (skc->skc_flags & KMC_SLAB) {
2201 if (skc->skc_dtor)
2202 skc->skc_dtor(obj, skc->skc_private);
2203
2204 kmem_cache_free(skc->skc_linux_cache, obj);
2205 goto out;
2206 }
2207
e2dcc6e2 2208 /*
a1af8fb1
BB
2209 * Only virtual slabs may have emergency objects and these objects
2210 * are guaranteed to have physical addresses. They must be removed
2211 * from the tree of emergency objects and then freed.
e2dcc6e2 2212 */
a1af8fb1
BB
2213 if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj))
2214 SGOTO(out, spl_emergency_free(skc, obj));
e2dcc6e2 2215
4afaaefa 2216 local_irq_save(flags);
2217
2218 /* Safe to update per-cpu structure without lock, but
2219 * since no remote memory allocation tracking is being performed
2220 * it is entirely possible to allocate an object from one
2221 * CPU cache and return it to another. */
2222 skm = skc->skc_mag[smp_processor_id()];
e9d7a2be 2223 ASSERT(skm->skm_magic == SKM_MAGIC);
4afaaefa 2224
2225 /* Per-CPU cache full, flush it to make space */
2226 if (unlikely(skm->skm_avail >= skm->skm_size))
d4899f47 2227 spl_cache_flush(skc, skm, skm->skm_refill);
4afaaefa 2228
2229 /* Available space in cache, use it */
2230 skm->skm_objs[skm->skm_avail++] = obj;
2231
2232 local_irq_restore(flags);
e2dcc6e2 2233out:
ea3e6ca9 2234 atomic_dec(&skc->skc_ref);
4afaaefa 2235
b17edc10 2236 SEXIT;
f1ca4da6 2237}
2fb9b26a 2238EXPORT_SYMBOL(spl_kmem_cache_free);
5c2bb9b2 2239
ea3e6ca9 2240/*
ecc39810
BB
2241 * The generic shrinker function for all caches. Under Linux a shrinker
2242 * may not be tightly coupled with a slab cache. In fact Linux always
2243 * systematically tries calling all registered shrinker callbacks which
ea3e6ca9
BB
2244 * report that they contain unused objects. Because of this we only
2245 * register one shrinker function in the shim layer for all slab caches.
2246 * We always attempt to shrink all caches when this generic shrinker
2247 * is called. The shrinker should return the number of free objects
2248 * in the cache when called with nr_to_scan == 0 but not attempt to
2249 * free any objects. When nr_to_scan > 0 it is a request that nr_to_scan
cef7605c
PS
2250 * objects should be freed, which differs from Solaris semantics.
2251 * Solaris semantics are to free all available objects which may (and
2252 * probably will) be more objects than the requested nr_to_scan.
ea3e6ca9 2253 */
a55bcaad
BB
2254static int
2255__spl_kmem_cache_generic_shrinker(struct shrinker *shrink,
2256 struct shrink_control *sc)
2fb9b26a 2257{
e9d7a2be 2258 spl_kmem_cache_t *skc;
376dc35e 2259 int alloc = 0;
5c2bb9b2 2260
e9d7a2be 2261 down_read(&spl_kmem_cache_sem);
ea3e6ca9 2262 list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
a55bcaad 2263 if (sc->nr_to_scan)
cef7605c
PS
2264 spl_kmem_cache_reap_now(skc,
2265 MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1));
ea3e6ca9
BB
2266
2267 /*
376dc35e 2268 * Presume everything alloc'ed is reclaimable, this ensures
ea3e6ca9
BB
2269 * we are called again with nr_to_scan > 0 so we can try to
2270 * reclaim. The exact number is not important either so
2271 * we forgo taking this already highly contended lock.
2272 */
376dc35e 2273 alloc += skc->skc_obj_alloc;
ea3e6ca9 2274 }
e9d7a2be 2275 up_read(&spl_kmem_cache_sem);
2fb9b26a 2276
b9b37153 2277 /*
376dc35e
BB
2278 * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass.
2279 * This functionality only exists to work around a rare issue where
2280 * shrink_slabs() is repeatedly invoked by many cores causing the
2281 * system to thrash.
b9b37153 2282 */
376dc35e
BB
2283 if ((spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) && sc->nr_to_scan)
2284 return (-1);
b9b37153 2285
376dc35e 2286 return MAX((alloc * sysctl_vfs_cache_pressure) / 100, 0);
5c2bb9b2 2287}
5c2bb9b2 2288
a55bcaad
BB
2289SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker);
2290
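/*
 * Illustration (hypothetical numbers): a request of sc->nr_to_scan = 128
 * against a cache with 32 objects per slab passes a count of
 * MAX(128 >> fls64(32), 1) = MAX(128 >> 6, 1) = 2 to
 * spl_kmem_cache_reap_now(), i.e. roughly two slabs' worth of the
 * requested objects, while an nr_to_scan == 0 query instead reports
 * about (alloc * sysctl_vfs_cache_pressure) / 100 reclaimable objects.
 */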
ea3e6ca9
BB
2291/*
2292 * Call the registered reclaim function for a cache. Depending on how
2293 * many and which objects are released it may simply repopulate the
2294 * local magazine which will then need to age-out. Objects which cannot
2295 * fit in the magazine will be released back to their slabs, which will
2296 * also need to age out before being released. This is all just best
2297 * effort and we do not want to thrash creating and destroying slabs.
2298 */
57d86234 2299void
cef7605c 2300spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
57d86234 2301{
b17edc10 2302 SENTRY;
e9d7a2be 2303
2304 ASSERT(skc->skc_magic == SKC_MAGIC);
ea3e6ca9 2305 ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
2fb9b26a 2306
a073aeb0
BB
2307 atomic_inc(&skc->skc_ref);
2308
2309 /*
2310 * Execute the registered reclaim callback if it exists. The
2311 * per-cpu caches will be drained when KMC_EXPIRE_MEM is set.
2312 */
2313 if (skc->skc_flags & KMC_SLAB) {
2314 if (skc->skc_reclaim)
2315 skc->skc_reclaim(skc->skc_private);
2316
2317 if (spl_kmem_cache_expire & KMC_EXPIRE_MEM)
2318 kmem_cache_shrink(skc->skc_linux_cache);
2319
2320 SGOTO(out, 0);
ea3e6ca9 2321 }
2fb9b26a 2322
a073aeb0
BB
2323 /*
2324 * Prevent concurrent cache reaping when contended.
2325 */
2326 if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags))
2327 SGOTO(out, 0);
4afaaefa 2328
b78d4b9d
BB
2329 /*
2330 * When a reclaim function is available it may be invoked repeatedly
2331 * until at least a single slab can be freed. This ensures that we
2332 * do free memory back to the system. This helps minimize the chance
2333 * of an OOM event when the bulk of memory is used by the slab.
2334 *
2335 * When free slabs are already available the reclaim callback will be
2336 * skipped. Additionally, if no forward progress is detected despite
2337 * a reclaim function the cache will be skipped to avoid deadlock.
2338 *
2339 * Longer term this would be the correct place to add the code which
2340 * repacks the slabs in order to minimize fragmentation.
2341 */
2342 if (skc->skc_reclaim) {
2343 uint64_t objects = UINT64_MAX;
2344 int do_reclaim;
2345
2346 do {
2347 spin_lock(&skc->skc_lock);
2348 do_reclaim =
2349 (skc->skc_slab_total > 0) &&
2350 ((skc->skc_slab_total - skc->skc_slab_alloc) == 0) &&
2351 (skc->skc_obj_alloc < objects);
2352
2353 objects = skc->skc_obj_alloc;
2354 spin_unlock(&skc->skc_lock);
2355
2356 if (do_reclaim)
2357 skc->skc_reclaim(skc->skc_private);
2358
2359 } while (do_reclaim);
2360 }
4afaaefa 2361
0936c344
BB
2362 /* Reclaim from the magazine then the slabs ignoring age and delay. */
2363 if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) {
2364 spl_kmem_magazine_t *skm;
251e7a77 2365 unsigned long irq_flags;
0936c344 2366
251e7a77
RY
2367 local_irq_save(irq_flags);
2368 skm = skc->skc_mag[smp_processor_id()];
2369 spl_cache_flush(skc, skm, skm->skm_avail);
2370 local_irq_restore(irq_flags);
0936c344
BB
2371 }
2372
c0e0fc14 2373 spl_slab_reclaim(skc, count, 1);
ea3e6ca9 2374 clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
dc1b3022
BB
2375 smp_mb__after_clear_bit();
2376 wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
a073aeb0 2377out:
ea3e6ca9 2378 atomic_dec(&skc->skc_ref);
4afaaefa 2379
b17edc10 2380 SEXIT;
57d86234 2381}
2fb9b26a 2382EXPORT_SYMBOL(spl_kmem_cache_reap_now);
57d86234 2383
ea3e6ca9
BB
2384/*
2385 * Reap all free slabs from all registered caches.
2386 */
f1b59d26 2387void
2fb9b26a 2388spl_kmem_reap(void)
937879f1 2389{
a55bcaad
BB
2390 struct shrink_control sc;
2391
2392 sc.nr_to_scan = KMC_REAP_CHUNK;
2393 sc.gfp_mask = GFP_KERNEL;
2394
2395 __spl_kmem_cache_generic_shrinker(NULL, &sc);
f1ca4da6 2396}
2fb9b26a 2397EXPORT_SYMBOL(spl_kmem_reap);
5d86345d 2398
ff449ac4 2399#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
c6dc93d6 2400static char *
4afaaefa 2401spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
d6a26c6a 2402{
e9d7a2be 2403 int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
d6a26c6a 2404 int i, flag = 1;
2405
2406 ASSERT(str != NULL && len >= 17);
e9d7a2be 2407 memset(str, 0, len);
d6a26c6a 2408
2409 /* Check for a fully printable string, and while we are at
2410 * it place the printable characters in the passed buffer. */
2411 for (i = 0; i < size; i++) {
e9d7a2be 2412 str[i] = ((char *)(kd->kd_addr))[i];
2413 if (isprint(str[i])) {
2414 continue;
2415 } else {
2416 /* Minimum number of printable characters found
2417 * to make it worthwhile to print this as ascii. */
2418 if (i > min)
2419 break;
2420
2421 flag = 0;
2422 break;
2423 }
d6a26c6a 2424 }
2425
2426 if (!flag) {
2427 sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
2428 *((uint8_t *)kd->kd_addr),
2429 *((uint8_t *)kd->kd_addr + 2),
2430 *((uint8_t *)kd->kd_addr + 4),
2431 *((uint8_t *)kd->kd_addr + 6),
2432 *((uint8_t *)kd->kd_addr + 8),
2433 *((uint8_t *)kd->kd_addr + 10),
2434 *((uint8_t *)kd->kd_addr + 12),
2435 *((uint8_t *)kd->kd_addr + 14));
2436 }
2437
2438 return str;
2439}
2440
a1502d76 2441static int
2442spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
2443{
2444 int i;
b17edc10 2445 SENTRY;
a1502d76 2446
2447 spin_lock_init(lock);
2448 INIT_LIST_HEAD(list);
2449
2450 for (i = 0; i < size; i++)
2451 INIT_HLIST_HEAD(&kmem_table[i]);
2452
b17edc10 2453 SRETURN(0);
a1502d76 2454}
2455
ff449ac4 2456static void
2457spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
5d86345d 2458{
2fb9b26a 2459 unsigned long flags;
2460 kmem_debug_t *kd;
2461 char str[17];
b17edc10 2462 SENTRY;
2fb9b26a 2463
ff449ac4 2464 spin_lock_irqsave(lock, flags);
2465 if (!list_empty(list))
a0f6da3d 2466 printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
2467 "size", "data", "func", "line");
2fb9b26a 2468
ff449ac4 2469 list_for_each_entry(kd, list, kd_list)
a0f6da3d 2470 printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
b6b2acc6 2471 (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
2fb9b26a 2472 kd->kd_func, kd->kd_line);
2473
ff449ac4 2474 spin_unlock_irqrestore(lock, flags);
b17edc10 2475 SEXIT;
ff449ac4 2476}
2477#else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
a1502d76 2478#define spl_kmem_init_tracking(list, lock, size)
ff449ac4 2479#define spl_kmem_fini_tracking(list, lock)
2480#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
2481
36b313da
BB
2482static void
2483spl_kmem_init_globals(void)
2484{
2485 struct zone *zone;
2486
2487 /* For now all zones are included; it may be wise to restrict
2488 * this to normal and highmem zones if we see problems. */
2489 for_each_zone(zone) {
2490
2491 if (!populated_zone(zone))
2492 continue;
2493
baf2979e
BB
2494 minfree += min_wmark_pages(zone);
2495 desfree += low_wmark_pages(zone);
2496 lotsfree += high_wmark_pages(zone);
36b313da 2497 }
4ab13d3b
BB
2498
2499 /* Solaris default values */
96dded38
BB
2500 swapfs_minfree = MAX(2*1024*1024 >> PAGE_SHIFT, physmem >> 3);
2501 swapfs_reserve = MIN(4*1024*1024 >> PAGE_SHIFT, physmem >> 4);
36b313da
BB
2502}
2503
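/*
 * Illustration (hypothetical machine): with 4 KiB pages and 4 GiB of
 * RAM, physmem is roughly 1048576 pages, so the Solaris defaults above
 * work out to swapfs_minfree = MAX(512, 131072) = 131072 pages
 * (512 MiB) and swapfs_reserve = MIN(1024, 65536) = 1024 pages (4 MiB).
 */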
d1ff2312
BB
2504/*
2505 * Called at module init when it is safe to use spl_kallsyms_lookup_name()
2506 */
2507int
2508spl_kmem_init_kallsyms_lookup(void)
2509{
2510#ifndef HAVE_GET_VMALLOC_INFO
2511 get_vmalloc_info_fn = (get_vmalloc_info_t)
2512 spl_kallsyms_lookup_name("get_vmalloc_info");
e11d6c5f
BB
2513 if (!get_vmalloc_info_fn) {
2514 printk(KERN_ERR "Error: Unknown symbol get_vmalloc_info\n");
d1ff2312 2515 return -EFAULT;
e11d6c5f 2516 }
d1ff2312
BB
2517#endif /* HAVE_GET_VMALLOC_INFO */
2518
5232d256
BB
2519#ifdef HAVE_PGDAT_HELPERS
2520# ifndef HAVE_FIRST_ONLINE_PGDAT
d1ff2312
BB
2521 first_online_pgdat_fn = (first_online_pgdat_t)
2522 spl_kallsyms_lookup_name("first_online_pgdat");
e11d6c5f
BB
2523 if (!first_online_pgdat_fn) {
2524 printk(KERN_ERR "Error: Unknown symbol first_online_pgdat\n");
d1ff2312 2525 return -EFAULT;
e11d6c5f 2526 }
5232d256 2527# endif /* HAVE_FIRST_ONLINE_PGDAT */
d1ff2312 2528
5232d256 2529# ifndef HAVE_NEXT_ONLINE_PGDAT
d1ff2312
BB
2530 next_online_pgdat_fn = (next_online_pgdat_t)
2531 spl_kallsyms_lookup_name("next_online_pgdat");
e11d6c5f
BB
2532 if (!next_online_pgdat_fn) {
2533 printk(KERN_ERR "Error: Unknown symbol next_online_pgdat\n");
d1ff2312 2534 return -EFAULT;
e11d6c5f 2535 }
5232d256 2536# endif /* HAVE_NEXT_ONLINE_PGDAT */
d1ff2312 2537
5232d256 2538# ifndef HAVE_NEXT_ZONE
d1ff2312
BB
2539 next_zone_fn = (next_zone_t)
2540 spl_kallsyms_lookup_name("next_zone");
e11d6c5f
BB
2541 if (!next_zone_fn) {
2542 printk(KERN_ERR "Error: Unknown symbol next_zone\n");
d1ff2312 2543 return -EFAULT;
e11d6c5f 2544 }
5232d256
BB
2545# endif /* HAVE_NEXT_ZONE */
2546
2547#else /* HAVE_PGDAT_HELPERS */
2548
2549# ifndef HAVE_PGDAT_LIST
124ca8a5 2550 pgdat_list_addr = *(struct pglist_data **)
5232d256
BB
2551 spl_kallsyms_lookup_name("pgdat_list");
2552 if (!pgdat_list_addr) {
2553 printk(KERN_ERR "Error: Unknown symbol pgdat_list\n");
2554 return -EFAULT;
2555 }
2556# endif /* HAVE_PGDAT_LIST */
2557#endif /* HAVE_PGDAT_HELPERS */
d1ff2312 2558
6ae7fef5 2559#if defined(NEED_GET_ZONE_COUNTS) && !defined(HAVE_GET_ZONE_COUNTS)
d1ff2312
BB
2560 get_zone_counts_fn = (get_zone_counts_t)
2561 spl_kallsyms_lookup_name("get_zone_counts");
e11d6c5f
BB
2562 if (!get_zone_counts_fn) {
2563 printk(KERN_ERR "Error: Unknown symbol get_zone_counts\n");
d1ff2312 2564 return -EFAULT;
e11d6c5f 2565 }
6ae7fef5 2566#endif /* NEED_GET_ZONE_COUNTS && !HAVE_GET_ZONE_COUNTS */
d1ff2312
BB
2567
2568 /*
2569 * It is now safe to initialize the global tunings which rely on
2570 * the use of the for_each_zone() macro. This macro in turns
2571 * depends on the *_pgdat symbols which are now available.
2572 */
2573 spl_kmem_init_globals();
2574
e76f4bf1 2575#ifndef HAVE_SHRINK_DCACHE_MEMORY
fe71c0e5 2576 /* When shrink_dcache_memory_fn == NULL support is disabled */
e76f4bf1 2577 shrink_dcache_memory_fn = (shrink_dcache_memory_t)
fe71c0e5 2578 spl_kallsyms_lookup_name("shrink_dcache_memory");
e76f4bf1
BB
2579#endif /* HAVE_SHRINK_DCACHE_MEMORY */
2580
2581#ifndef HAVE_SHRINK_ICACHE_MEMORY
fe71c0e5 2582 /* When shrink_icache_memory_fn == NULL support is disabled */
e76f4bf1 2583 shrink_icache_memory_fn = (shrink_icache_memory_t)
fe71c0e5 2584 spl_kallsyms_lookup_name("shrink_icache_memory");
e76f4bf1
BB
2585#endif /* HAVE_SHRINK_ICACHE_MEMORY */
2586
d1ff2312
BB
2587 return 0;
2588}
2589
a1502d76 2590int
2591spl_kmem_init(void)
2592{
2593 int rc = 0;
b17edc10 2594 SENTRY;
a1502d76 2595
a1502d76 2596#ifdef DEBUG_KMEM
d04c8a56
BB
2597 kmem_alloc_used_set(0);
2598 vmem_alloc_used_set(0);
a1502d76 2599
2600 spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
2601 spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE);
2602#endif
5c7a0369
TC
2603
2604 init_rwsem(&spl_kmem_cache_sem);
2605 INIT_LIST_HEAD(&spl_kmem_cache_list);
2606 spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
2607 1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE);
2608
2609 spl_register_shrinker(&spl_kmem_cache_shrinker);
2610
b17edc10 2611 SRETURN(rc);
a1502d76 2612}
2613
ff449ac4 2614void
2615spl_kmem_fini(void)
2616{
ab4e74cc
BB
2617 SENTRY;
2618
2619 spl_unregister_shrinker(&spl_kmem_cache_shrinker);
2620 taskq_destroy(spl_kmem_cache_taskq);
2621
ff449ac4 2622#ifdef DEBUG_KMEM
2623 /* Display all unreclaimed memory addresses, including the
2624 * allocation size and the first few bytes of what's located
2625 * at that address to aid in debugging. Performance is not
2626 * a serious concern here since it is module unload time. */
d04c8a56 2627 if (kmem_alloc_used_read() != 0)
b17edc10 2628 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
3cb77549
BB
2629 "kmem leaked %ld/%ld bytes\n",
2630 kmem_alloc_used_read(), kmem_alloc_max);
ff449ac4 2631
2fb9b26a 2632
d04c8a56 2633 if (vmem_alloc_used_read() != 0)
b17edc10 2634 SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
3cb77549
BB
2635 "vmem leaked %ld/%ld bytes\n",
2636 vmem_alloc_used_read(), vmem_alloc_max);
2fb9b26a 2637
ff449ac4 2638 spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
2639 spl_kmem_fini_tracking(&vmem_list, &vmem_lock);
2640#endif /* DEBUG_KMEM */
2fb9b26a 2641
b17edc10 2642 SEXIT;
5d86345d 2643}