modules/spl/spl-kmem.c
715f6251 1/*
2 * This file is part of the SPL: Solaris Porting Layer.
3 *
4 * Copyright (c) 2008 Lawrence Livermore National Security, LLC.
5 * Produced at Lawrence Livermore National Laboratory
6 * Written by:
7 * Brian Behlendorf <behlendorf1@llnl.gov>,
8 * Herb Wartens <wartens2@llnl.gov>,
9 * Jim Garlick <garlick@llnl.gov>
10 * UCRL-CODE-235197
11 *
12 * This is free software; you can redistribute it and/or modify it
13 * under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This is distributed in the hope that it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
20 * for more details.
21 *
22 * You should have received a copy of the GNU General Public License along
23 * with this program; if not, write to the Free Software Foundation, Inc.,
24 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 */
26
f4b37741 27#include <sys/kmem.h>
f1ca4da6 28
937879f1 29#ifdef DEBUG_SUBSYSTEM
a0f6da3d 30# undef DEBUG_SUBSYSTEM
937879f1 31#endif
32
33#define DEBUG_SUBSYSTEM S_KMEM
34
f1ca4da6 35/*
2fb9b26a 36 * Memory allocation interfaces and debugging for basic kmem_*
 37 * and vmem_* style memory allocation. When DEBUG_KMEM is enabled
 38 * all allocations will be tracked when they are allocated and
 39 * freed. When the SPL module is unloaded a list of all leaked
 40 * addresses and where they were allocated will be dumped to the
 41 * console. Enabling this feature has a significant impact on
 42 * performance but it makes finding memory leaks straightforward.
f1ca4da6 43 */
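/*
 * Illustrative usage sketch (assumes only the kmem_alloc()/kmem_free()
 * wrappers from sys/kmem.h; kmem_example() is a hypothetical caller used
 * only for this example).  With DEBUG_KMEM enabled each pair of calls
 * updates the accounting counters below, and with DEBUG_KMEM_TRACKING the
 * calls are presumably routed through kmem_alloc_track()/kmem_free_track()
 * so any leaks can be reported at module unload.
 */
#if 0
static int
kmem_example(void)
{
	size_t size = 128;
	void *buf;

	/* Accounted allocation, KM_SLEEP may block until memory is free */
	buf = kmem_alloc(size, KM_SLEEP);
	if (buf == NULL)
		return -ENOMEM;

	/* ... use buf ... */

	/* The original size must be passed back so the kmem_alloc_used
	 * counter can be decremented by the same amount. */
	kmem_free(buf, size);

	return 0;
}
#endif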
44#ifdef DEBUG_KMEM
45/* Shim layer memory accounting */
550f1705 46atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
a0f6da3d 47unsigned long long kmem_alloc_max = 0;
550f1705 48atomic64_t vmem_alloc_used = ATOMIC64_INIT(0);
a0f6da3d 49unsigned long long vmem_alloc_max = 0;
c19c06f3 50int kmem_warning_flag = 1;
79b31f36 51
ff449ac4 52EXPORT_SYMBOL(kmem_alloc_used);
53EXPORT_SYMBOL(kmem_alloc_max);
54EXPORT_SYMBOL(vmem_alloc_used);
55EXPORT_SYMBOL(vmem_alloc_max);
56EXPORT_SYMBOL(kmem_warning_flag);
57
a0f6da3d 58# ifdef DEBUG_KMEM_TRACKING
59
 60/* XXX - Not too surprisingly, with debugging enabled the xmem_locks are very
61 * highly contended particularly on xfree(). If we want to run with this
62 * detailed debugging enabled for anything other than debugging we need to
63 * minimize the contention by moving to a lock per xmem_table entry model.
64 */
65
66# define KMEM_HASH_BITS 10
67# define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS)
68
69# define VMEM_HASH_BITS 10
70# define VMEM_TABLE_SIZE (1 << VMEM_HASH_BITS)
71
72typedef struct kmem_debug {
73 struct hlist_node kd_hlist; /* Hash node linkage */
74 struct list_head kd_list; /* List of all allocations */
75 void *kd_addr; /* Allocation pointer */
76 size_t kd_size; /* Allocation size */
77 const char *kd_func; /* Allocation function */
78 int kd_line; /* Allocation line */
79} kmem_debug_t;
80
d6a26c6a 81spinlock_t kmem_lock;
82struct hlist_head kmem_table[KMEM_TABLE_SIZE];
83struct list_head kmem_list;
84
13cdca65 85spinlock_t vmem_lock;
86struct hlist_head vmem_table[VMEM_TABLE_SIZE];
87struct list_head vmem_list;
88
d6a26c6a 89EXPORT_SYMBOL(kmem_lock);
90EXPORT_SYMBOL(kmem_table);
91EXPORT_SYMBOL(kmem_list);
92
13cdca65 93EXPORT_SYMBOL(vmem_lock);
94EXPORT_SYMBOL(vmem_table);
95EXPORT_SYMBOL(vmem_list);
a0f6da3d 96# endif
13cdca65 97
c19c06f3 98int kmem_set_warning(int flag) { return (kmem_warning_flag = !!flag); }
99#else
100int kmem_set_warning(int flag) { return 0; }
f1ca4da6 101#endif
c19c06f3 102EXPORT_SYMBOL(kmem_set_warning);
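/* kmem_set_warning() lets callers toggle kmem_warning_flag at runtime,
 * e.g. kmem_set_warning(0) silences the large kmem_alloc() warnings
 * issued below.  It returns the new setting, or always 0 when
 * DEBUG_KMEM is disabled. */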
f1ca4da6 103
104/*
105 * Slab allocation interfaces
106 *
2fb9b26a 107 * While the Linux slab implementation was inspired by the Solaris
 108 * implementation I cannot use it to emulate the Solaris APIs. I
109 * require two features which are not provided by the Linux slab.
110 *
111 * 1) Constructors AND destructors. Recent versions of the Linux
112 * kernel have removed support for destructors. This is a deal
113 * breaker for the SPL which contains particularly expensive
114 * initializers for mutex's, condition variables, etc. We also
a0f6da3d 115 * require a minimal level of cleanup for these data types unlike
 116 * many Linux data types which do not need to be explicitly destroyed.
2fb9b26a 117 *
a0f6da3d 118 * 2) Virtual address space backed slab. Callers of the Solaris slab
2fb9b26a 119 * expect it to work well for both small and very large allocations.
 120 * Because of memory fragmentation the Linux slab, which is backed
 121 * by kmalloc'ed memory, performs very badly when confronted with
 122 * large numbers of large allocations. Basing the slab on the
 123 * virtual address space removes the need for contiguous pages
 124 * and greatly improves performance for large allocations.
125 *
126 * For these reasons, the SPL has its own slab implementation with
127 * the needed features. It is not as highly optimized as either the
128 * Solaris or Linux slabs, but it should get me most of what is
129 * needed until it can be optimized or obsoleted by another approach.
130 *
131 * One serious concern I do have about this method is the relatively
132 * small virtual address space on 32bit arches. This will seriously
133 * constrain the size of the slab caches and their performance.
134 *
2fb9b26a 135 * XXX: Implement work requests to keep an eye on each cache and
4afaaefa 136 * shrink them via spl_slab_reclaim() when they are wasting lots
2fb9b26a 137 * of space. Currently this process is driven by the reapers.
138 *
2fb9b26a 139 * XXX: Improve the partial slab list by carefully maintaining a
140 * strict ordering of fullest to emptiest slabs based on
 141 * the slab reference count. This guarantees that when freeing
142 * slabs back to the system we need only linearly traverse the
143 * last N slabs in the list to discover all the freeable slabs.
144 *
145 * XXX: NUMA awareness for optionally allocating memory close to a
 146 * particular core. This can be advantageous if you know the slab
147 * object will be short lived and primarily accessed from one core.
148 *
149 * XXX: Slab coloring may also yield performance improvements and would
150 * be desirable to implement.
4afaaefa 151 *
152 * XXX: Proper hardware cache alignment would be good too.
f1ca4da6 153 */
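/*
 * Illustrative usage sketch of the cache interfaces defined below
 * (example_object_t and slab_example() are hypothetical names used only
 * for this example; NULL constructor/destructor/reclaim callbacks are
 * valid and the KMC_KMEM vs KMC_VMEM backing is then chosen automatically
 * from the object size).
 */
#if 0
typedef struct example_object {
	int eo_value;
} example_object_t;

static int
slab_example(void)
{
	static char cache_name[] = "example_cache";
	spl_kmem_cache_t *skc;
	example_object_t *obj;

	/* Create a cache of fixed-size objects with no callbacks */
	skc = spl_kmem_cache_create(cache_name, sizeof(example_object_t),
	                            0, NULL, NULL, NULL, NULL, NULL, 0);
	if (skc == NULL)
		return -ENOMEM;

	/* Objects come from the per-CPU magazine, refilled from slabs */
	obj = spl_kmem_cache_alloc(skc, KM_SLEEP);
	obj->eo_value = 42;

	/* Freed objects go back to the local magazine */
	spl_kmem_cache_free(skc, obj);

	/* Destroying the cache releases all remaining slabs and magazines */
	spl_kmem_cache_destroy(skc);

	return 0;
}
#endif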
2fb9b26a 154
a0f6da3d 155struct list_head spl_kmem_cache_list; /* List of caches */
156struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
c30df9c8 157
4afaaefa 158static int spl_cache_flush(spl_kmem_cache_t *skc,
a0f6da3d 159 spl_kmem_magazine_t *skm, int flush);
4afaaefa 160
57d86234 161#ifdef HAVE_SET_SHRINKER
2fb9b26a 162static struct shrinker *spl_kmem_cache_shrinker;
57d86234 163#else
4afaaefa 164static int spl_kmem_cache_generic_shrinker(int nr_to_scan,
a0f6da3d 165 unsigned int gfp_mask);
2fb9b26a 166static struct shrinker spl_kmem_cache_shrinker = {
4afaaefa 167 .shrink = spl_kmem_cache_generic_shrinker,
57d86234 168 .seeks = KMC_DEFAULT_SEEKS,
169};
170#endif
f1ca4da6 171
a0f6da3d 172#ifdef DEBUG_KMEM
173# ifdef DEBUG_KMEM_TRACKING
174
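/* Remove the tracking record for 'addr' from the given hash table and
 * allocation list, returning it to the caller or NULL when no record
 * exists.  Used by the kmem_free_track()/vmem_free_track() paths below. */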
175static kmem_debug_t *
176kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits,
177 void *addr)
178{
179 struct hlist_head *head;
180 struct hlist_node *node;
181 struct kmem_debug *p;
182 unsigned long flags;
183 ENTRY;
184
185 spin_lock_irqsave(lock, flags);
186
187 head = &table[hash_ptr(addr, bits)];
188 hlist_for_each_entry_rcu(p, node, head, kd_hlist) {
189 if (p->kd_addr == addr) {
190 hlist_del_init(&p->kd_hlist);
191 list_del_init(&p->kd_list);
192 spin_unlock_irqrestore(lock, flags);
193 return p;
194 }
195 }
196
197 spin_unlock_irqrestore(lock, flags);
198
199 RETURN(NULL);
200}
201
202void *
203kmem_alloc_track(size_t size, int flags, const char *func, int line,
204 int node_alloc, int node)
205{
206 void *ptr = NULL;
207 kmem_debug_t *dptr;
208 unsigned long irq_flags;
209 ENTRY;
210
211 dptr = (kmem_debug_t *) kmalloc(sizeof(kmem_debug_t),
212 flags & ~__GFP_ZERO);
213
214 if (dptr == NULL) {
215 CWARN("kmem_alloc(%ld, 0x%x) debug failed\n",
216 sizeof(kmem_debug_t), flags);
217 } else {
 218 /* Marked unlikely because we should never be doing this;
 219 * we tolerate up to 2 pages but a single page is best. */
220 if (unlikely((size) > (PAGE_SIZE * 2)) && kmem_warning_flag)
221 CWARN("Large kmem_alloc(%llu, 0x%x) (%lld/%llu)\n",
222 (unsigned long long) size, flags,
223 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
224
c8e60837 225 /* We use kstrdup() below because the string pointed to by
226 * __FUNCTION__ might not be available by the time we want
227 * to print it since the module might have been unloaded. */
228 dptr->kd_func = kstrdup(func, flags & ~__GFP_ZERO);
229 if (unlikely(dptr->kd_func == NULL)) {
230 kfree(dptr);
231 CWARN("kstrdup() failed in kmem_alloc(%llu, 0x%x) "
232 "(%lld/%llu)\n", (unsigned long long) size, flags,
233 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
234 goto out;
235 }
236
a0f6da3d 237 /* Use the correct allocator */
238 if (node_alloc) {
239 ASSERT(!(flags & __GFP_ZERO));
240 ptr = kmalloc_node(size, flags, node);
241 } else if (flags & __GFP_ZERO) {
242 ptr = kzalloc(size, flags & ~__GFP_ZERO);
243 } else {
244 ptr = kmalloc(size, flags);
245 }
246
247 if (unlikely(ptr == NULL)) {
c8e60837 248 kfree(dptr->kd_func);
a0f6da3d 249 kfree(dptr);
250 CWARN("kmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
251 (unsigned long long) size, flags,
252 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
253 goto out;
254 }
255
256 atomic64_add(size, &kmem_alloc_used);
257 if (unlikely(atomic64_read(&kmem_alloc_used) >
258 kmem_alloc_max))
259 kmem_alloc_max =
260 atomic64_read(&kmem_alloc_used);
261
262 INIT_HLIST_NODE(&dptr->kd_hlist);
263 INIT_LIST_HEAD(&dptr->kd_list);
264
265 dptr->kd_addr = ptr;
266 dptr->kd_size = size;
a0f6da3d 267 dptr->kd_line = line;
268
269 spin_lock_irqsave(&kmem_lock, irq_flags);
270 hlist_add_head_rcu(&dptr->kd_hlist,
271 &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
272 list_add_tail(&dptr->kd_list, &kmem_list);
273 spin_unlock_irqrestore(&kmem_lock, irq_flags);
274
275 CDEBUG_LIMIT(D_INFO, "kmem_alloc(%llu, 0x%x) = %p "
276 "(%lld/%llu)\n", (unsigned long long) size, flags,
277 ptr, atomic64_read(&kmem_alloc_used),
278 kmem_alloc_max);
279 }
280out:
281 RETURN(ptr);
282}
283EXPORT_SYMBOL(kmem_alloc_track);
284
285void
286kmem_free_track(void *ptr, size_t size)
287{
288 kmem_debug_t *dptr;
289 ENTRY;
290
291 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
292 (unsigned long long) size);
293
294 dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
295
296 ASSERT(dptr); /* Must exist in hash due to kmem_alloc() */
297
298 /* Size must match */
299 ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
300 "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
301 (unsigned long long) size, dptr->kd_func, dptr->kd_line);
302
303 atomic64_sub(size, &kmem_alloc_used);
304
305 CDEBUG_LIMIT(D_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
306 (unsigned long long) size, atomic64_read(&kmem_alloc_used),
307 kmem_alloc_max);
308
c8e60837 309 kfree(dptr->kd_func);
310
a0f6da3d 311 memset(dptr, 0x5a, sizeof(kmem_debug_t));
312 kfree(dptr);
313
314 memset(ptr, 0x5a, size);
315 kfree(ptr);
316
317 EXIT;
318}
319EXPORT_SYMBOL(kmem_free_track);
320
321void *
322vmem_alloc_track(size_t size, int flags, const char *func, int line)
323{
324 void *ptr = NULL;
325 kmem_debug_t *dptr;
326 unsigned long irq_flags;
327 ENTRY;
328
329 ASSERT(flags & KM_SLEEP);
330
331 dptr = (kmem_debug_t *) kmalloc(sizeof(kmem_debug_t), flags);
332 if (dptr == NULL) {
333 CWARN("vmem_alloc(%ld, 0x%x) debug failed\n",
334 sizeof(kmem_debug_t), flags);
335 } else {
c8e60837 336 /* We use kstrdup() below because the string pointed to by
337 * __FUNCTION__ might not be available by the time we want
338 * to print it, since the module might have been unloaded. */
339 dptr->kd_func = kstrdup(func, flags & ~__GFP_ZERO);
340 if (unlikely(dptr->kd_func == NULL)) {
341 kfree(dptr);
342 CWARN("kstrdup() failed in vmem_alloc(%llu, 0x%x) "
343 "(%lld/%llu)\n", (unsigned long long) size, flags,
344 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
345 goto out;
346 }
347
a0f6da3d 348 ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO,
349 PAGE_KERNEL);
350
351 if (unlikely(ptr == NULL)) {
c8e60837 352 kfree(dptr->kd_func);
a0f6da3d 353 kfree(dptr);
354 CWARN("vmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
355 (unsigned long long) size, flags,
356 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
357 goto out;
358 }
359
360 if (flags & __GFP_ZERO)
361 memset(ptr, 0, size);
362
363 atomic64_add(size, &vmem_alloc_used);
364 if (unlikely(atomic64_read(&vmem_alloc_used) >
365 vmem_alloc_max))
366 vmem_alloc_max =
367 atomic64_read(&vmem_alloc_used);
368
369 INIT_HLIST_NODE(&dptr->kd_hlist);
370 INIT_LIST_HEAD(&dptr->kd_list);
371
372 dptr->kd_addr = ptr;
373 dptr->kd_size = size;
a0f6da3d 374 dptr->kd_line = line;
375
376 spin_lock_irqsave(&vmem_lock, irq_flags);
377 hlist_add_head_rcu(&dptr->kd_hlist,
378 &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]);
379 list_add_tail(&dptr->kd_list, &vmem_list);
380 spin_unlock_irqrestore(&vmem_lock, irq_flags);
381
382 CDEBUG_LIMIT(D_INFO, "vmem_alloc(%llu, 0x%x) = %p "
383 "(%lld/%llu)\n", (unsigned long long) size, flags,
384 ptr, atomic64_read(&vmem_alloc_used),
385 vmem_alloc_max);
386 }
387out:
388 RETURN(ptr);
389}
390EXPORT_SYMBOL(vmem_alloc_track);
391
392void
393vmem_free_track(void *ptr, size_t size)
394{
395 kmem_debug_t *dptr;
396 ENTRY;
397
398 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
399 (unsigned long long) size);
400
401 dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr);
402 ASSERT(dptr); /* Must exist in hash due to vmem_alloc() */
403
404 /* Size must match */
405 ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
406 "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
407 (unsigned long long) size, dptr->kd_func, dptr->kd_line);
408
409 atomic64_sub(size, &vmem_alloc_used);
410 CDEBUG_LIMIT(D_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
411 (unsigned long long) size, atomic64_read(&vmem_alloc_used),
412 vmem_alloc_max);
413
c8e60837 414 kfree(dptr->kd_func);
415
a0f6da3d 416 memset(dptr, 0x5a, sizeof(kmem_debug_t));
417 kfree(dptr);
418
419 memset(ptr, 0x5a, size);
420 vfree(ptr);
421
422 EXIT;
423}
424EXPORT_SYMBOL(vmem_free_track);
425
426# else /* DEBUG_KMEM_TRACKING */
427
428void *
429kmem_alloc_debug(size_t size, int flags, const char *func, int line,
430 int node_alloc, int node)
431{
432 void *ptr;
433 ENTRY;
434
 435 /* Marked unlikely because we should never be doing this;
 436 * we tolerate up to 2 pages but a single page is best. */
437 if (unlikely(size > (PAGE_SIZE * 2)) && kmem_warning_flag)
438 CWARN("Large kmem_alloc(%llu, 0x%x) (%lld/%llu)\n",
439 (unsigned long long) size, flags,
440 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
441
442 /* Use the correct allocator */
443 if (node_alloc) {
444 ASSERT(!(flags & __GFP_ZERO));
445 ptr = kmalloc_node(size, flags, node);
446 } else if (flags & __GFP_ZERO) {
447 ptr = kzalloc(size, flags & (~__GFP_ZERO));
448 } else {
449 ptr = kmalloc(size, flags);
450 }
451
452 if (ptr == NULL) {
453 CWARN("kmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
454 (unsigned long long) size, flags,
455 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
456 } else {
457 atomic64_add(size, &kmem_alloc_used);
458 if (unlikely(atomic64_read(&kmem_alloc_used) > kmem_alloc_max))
459 kmem_alloc_max = atomic64_read(&kmem_alloc_used);
460
461 CDEBUG_LIMIT(D_INFO, "kmem_alloc(%llu, 0x%x) = %p "
462 "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
463 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
464 }
465 RETURN(ptr);
466}
467EXPORT_SYMBOL(kmem_alloc_debug);
468
469void
470kmem_free_debug(void *ptr, size_t size)
471{
472 ENTRY;
473
474 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
475 (unsigned long long) size);
476
477 atomic64_sub(size, &kmem_alloc_used);
478
479 CDEBUG_LIMIT(D_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
480 (unsigned long long) size, atomic64_read(&kmem_alloc_used),
481 kmem_alloc_max);
482
483 memset(ptr, 0x5a, size);
484 kfree(ptr);
485
486 EXIT;
487}
488EXPORT_SYMBOL(kmem_free_debug);
489
490void *
491vmem_alloc_debug(size_t size, int flags, const char *func, int line)
492{
493 void *ptr;
494 ENTRY;
495
496 ASSERT(flags & KM_SLEEP);
497
498 ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO,
499 PAGE_KERNEL);
500 if (ptr == NULL) {
501 CWARN("vmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
502 (unsigned long long) size, flags,
503 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
504 } else {
505 if (flags & __GFP_ZERO)
506 memset(ptr, 0, size);
507
508 atomic64_add(size, &vmem_alloc_used);
509
510 if (unlikely(atomic64_read(&vmem_alloc_used) > vmem_alloc_max))
511 vmem_alloc_max = atomic64_read(&vmem_alloc_used);
512
513 CDEBUG_LIMIT(D_INFO, "vmem_alloc(%llu, 0x%x) = %p "
514 "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
515 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
516 }
517
518 RETURN(ptr);
519}
520EXPORT_SYMBOL(vmem_alloc_debug);
521
522void
523vmem_free_debug(void *ptr, size_t size)
524{
525 ENTRY;
526
527 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
528 (unsigned long long) size);
529
530 atomic64_sub(size, &vmem_alloc_used);
531
532 CDEBUG_LIMIT(D_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
533 (unsigned long long) size, atomic64_read(&vmem_alloc_used),
534 vmem_alloc_max);
535
536 memset(ptr, 0x5a, size);
537 vfree(ptr);
538
539 EXIT;
540}
541EXPORT_SYMBOL(vmem_free_debug);
542
543# endif /* DEBUG_KMEM_TRACKING */
544#endif /* DEBUG_KMEM */
545
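/* kv_alloc()/kv_free() hide the difference between kmem and vmem backed
 * caches.  KMC_KMEM caches use kmem_alloc(), falling back to
 * __get_free_pages() for allocations larger than two pages, while all
 * other caches are satisfied by vmem_alloc(). */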
a1502d76 546static void *
547kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
fece7c99 548{
a1502d76 549 void *ptr;
f1ca4da6 550
a1502d76 551 if (skc->skc_flags & KMC_KMEM) {
552 if (size > (2 * PAGE_SIZE)) {
553 ptr = (void *)__get_free_pages(flags, get_order(size));
554 } else
555 ptr = kmem_alloc(size, flags);
556 } else {
557 ptr = vmem_alloc(size, flags);
d6a26c6a 558 }
fece7c99 559
a1502d76 560 return ptr;
561}
fece7c99 562
a1502d76 563static void
564kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
565{
566 if (skc->skc_flags & KMC_KMEM) {
567 if (size > (2 * PAGE_SIZE))
568 free_pages((unsigned long)ptr, get_order(size));
569 else
570 kmem_free(ptr, size);
571 } else {
572 vmem_free(ptr, size);
573 }
fece7c99 574}
575
576static spl_kmem_slab_t *
a1502d76 577spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
fece7c99 578{
579 spl_kmem_slab_t *sks;
a1502d76 580 spl_kmem_obj_t *sko, *n;
581 void *base, *obj;
582 int i, size, rc = 0;
583
584 /* It's important that we pack the spl_kmem_obj_t structure
 585 * and the actual objects into one large address space
586 * to minimize the number of calls to the allocator. It
587 * is far better to do a few large allocations and then
588 * subdivide it ourselves. Now which allocator we use
 589 * requires balancing a few trade-offs.
590 *
591 * For small objects we use kmem_alloc() because as long
592 * as you are only requesting a small number of pages
 593 * (ideally just one) it's cheap. However, when you start
 594 * requesting multiple pages kmem_alloc() gets increasingly
 595 * expensive since it requires contiguous pages. For this
596 * reason we shift to vmem_alloc() for slabs of large
 597 * objects which removes the need for contiguous pages.
598 * We do not use vmem_alloc() in all cases because there
599 * is significant locking overhead in __get_vm_area_node().
 600 * This function takes a single global lock when acquiring
 601 * an available virtual address range which serializes all
602 * vmem_alloc()'s for all slab caches. Using slightly
603 * different allocation functions for small and large
604 * objects should give us the best of both worlds.
fece7c99 605 *
a1502d76 606 * sks struct: sizeof(spl_kmem_slab_t)
607 * obj data: skc->skc_obj_size
608 * obj struct: sizeof(spl_kmem_obj_t)
609 * <N obj data + obj structs>
fece7c99 610 *
611 * XXX: It would probably be a good idea to more carefully
a1502d76 612 * align these data structures in memory.
fece7c99 613 */
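	/* Concretely (illustrative numbers): for an on-slab cache with a
	 * 512 byte skc_obj_size, size = 512 + sizeof(spl_kmem_obj_t) and
	 * object i begins at base + sizeof(spl_kmem_slab_t) + i * size,
	 * with its spl_kmem_obj_t header placed at obj + skc_obj_size. */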
a1502d76 614 base = kv_alloc(skc, skc->skc_slab_size, flags);
615 if (base == NULL)
fece7c99 616 RETURN(NULL);
617
a1502d76 618 sks = (spl_kmem_slab_t *)base;
619 sks->sks_magic = SKS_MAGIC;
620 sks->sks_objs = skc->skc_slab_objs;
621 sks->sks_age = jiffies;
622 sks->sks_cache = skc;
623 INIT_LIST_HEAD(&sks->sks_list);
624 INIT_LIST_HEAD(&sks->sks_free_list);
625 sks->sks_ref = 0;
626 size = sizeof(spl_kmem_obj_t) + skc->skc_obj_size;
fece7c99 627
628 for (i = 0; i < sks->sks_objs; i++) {
a1502d76 629 if (skc->skc_flags & KMC_OFFSLAB) {
630 obj = kv_alloc(skc, size, flags);
631 if (!obj)
632 GOTO(out, rc = -ENOMEM);
633 } else {
634 obj = base + sizeof(spl_kmem_slab_t) + i * size;
635 }
636
637 sko = obj + skc->skc_obj_size;
fece7c99 638 sko->sko_addr = obj;
639 sko->sko_magic = SKO_MAGIC;
640 sko->sko_slab = sks;
641 INIT_LIST_HEAD(&sko->sko_list);
fece7c99 642 list_add_tail(&sko->sko_list, &sks->sks_free_list);
643 }
644
fece7c99 645 list_for_each_entry(sko, &sks->sks_free_list, sko_list)
646 if (skc->skc_ctor)
647 skc->skc_ctor(sko->sko_addr, skc->skc_private, flags);
2fb9b26a 648out:
a1502d76 649 if (rc) {
650 if (skc->skc_flags & KMC_OFFSLAB)
651 list_for_each_entry_safe(sko,n,&sks->sks_free_list,sko_list)
652 kv_free(skc, sko->sko_addr, size);
fece7c99 653
a1502d76 654 kv_free(skc, base, skc->skc_slab_size);
655 sks = NULL;
fece7c99 656 }
657
a1502d76 658 RETURN(sks);
fece7c99 659}
660
2fb9b26a 661/* Removes the slab from the complete or partial list, so it must
d46630e0 662 * be called with the 'skc->skc_lock' held.
fece7c99 663 */
f1ca4da6 664static void
4afaaefa 665spl_slab_free(spl_kmem_slab_t *sks) {
2fb9b26a 666 spl_kmem_cache_t *skc;
667 spl_kmem_obj_t *sko, *n;
a1502d76 668 int size;
2fb9b26a 669 ENTRY;
57d86234 670
2fb9b26a 671 ASSERT(sks->sks_magic == SKS_MAGIC);
4afaaefa 672 ASSERT(sks->sks_ref == 0);
d6a26c6a 673
fece7c99 674 skc = sks->sks_cache;
675 ASSERT(skc->skc_magic == SKC_MAGIC);
d46630e0 676 ASSERT(spin_is_locked(&skc->skc_lock));
f1ca4da6 677
fece7c99 678 skc->skc_obj_total -= sks->sks_objs;
679 skc->skc_slab_total--;
680 list_del(&sks->sks_list);
a1502d76 681 size = sizeof(spl_kmem_obj_t) + skc->skc_obj_size;
937879f1 682
fece7c99 683 /* Run destructors since the slab is being released */
a1502d76 684 list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) {
685 ASSERT(sko->sko_magic == SKO_MAGIC);
686
2fb9b26a 687 if (skc->skc_dtor)
688 skc->skc_dtor(sko->sko_addr, skc->skc_private);
0a6fd143 689
a1502d76 690 if (skc->skc_flags & KMC_OFFSLAB)
691 kv_free(skc, sko->sko_addr, size);
692 }
d61e12af 693
a1502d76 694 kv_free(skc, sks, skc->skc_slab_size);
2fb9b26a 695 EXIT;
696}
d6a26c6a 697
2fb9b26a 698static int
4afaaefa 699__spl_slab_reclaim(spl_kmem_cache_t *skc)
2fb9b26a 700{
701 spl_kmem_slab_t *sks, *m;
702 int rc = 0;
703 ENTRY;
704
d46630e0 705 ASSERT(spin_is_locked(&skc->skc_lock));
2fb9b26a 706 /*
707 * Free empty slabs which have not been touched in skc_delay
708 * seconds. This delay time is important to avoid thrashing.
709 * Empty slabs will be at the end of the skc_partial_list.
710 */
711 list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list,
712 sks_list) {
4afaaefa 713 if (sks->sks_ref > 0)
2fb9b26a 714 break;
715
716 if (time_after(jiffies, sks->sks_age + skc->skc_delay * HZ)) {
4afaaefa 717 spl_slab_free(sks);
2fb9b26a 718 rc++;
719 }
720 }
721
722 /* Returns number of slabs reclaimed */
723 RETURN(rc);
f1ca4da6 724}
725
2fb9b26a 726static int
4afaaefa 727spl_slab_reclaim(spl_kmem_cache_t *skc)
f1ca4da6 728{
2fb9b26a 729 int rc;
730 ENTRY;
f1ca4da6 731
d46630e0 732 spin_lock(&skc->skc_lock);
4afaaefa 733 rc = __spl_slab_reclaim(skc);
d46630e0 734 spin_unlock(&skc->skc_lock);
4efd4118 735
2fb9b26a 736 RETURN(rc);
737}
f1ca4da6 738
4afaaefa 739static int
740spl_magazine_size(spl_kmem_cache_t *skc)
741{
742 int size;
743 ENTRY;
744
 745 /* Guesses for reasonable magazine sizes; they
746 * should really adapt based on observed usage. */
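	/* For example, assuming 4K pages: a 64 byte object gets a 512
	 * entry per-CPU magazine, a 16K object gets 64 entries, and
	 * anything over 1M is limited to 4. */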
747 if (skc->skc_obj_size > (PAGE_SIZE * 256))
4afaaefa 748 size = 4;
ff449ac4 749 else if (skc->skc_obj_size > (PAGE_SIZE * 32))
4afaaefa 750 size = 16;
ff449ac4 751 else if (skc->skc_obj_size > (PAGE_SIZE))
752 size = 64;
4afaaefa 753 else if (skc->skc_obj_size > (PAGE_SIZE / 4))
ff449ac4 754 size = 128;
4afaaefa 755 else
ff449ac4 756 size = 512;
4afaaefa 757
758 RETURN(size);
759}
760
761static spl_kmem_magazine_t *
762spl_magazine_alloc(spl_kmem_cache_t *skc, int node)
763{
764 spl_kmem_magazine_t *skm;
765 int size = sizeof(spl_kmem_magazine_t) +
766 sizeof(void *) * skc->skc_mag_size;
767 ENTRY;
768
3d061e9d 769 skm = kmem_alloc_node(size, GFP_KERNEL, node);
4afaaefa 770 if (skm) {
771 skm->skm_magic = SKM_MAGIC;
772 skm->skm_avail = 0;
773 skm->skm_size = skc->skc_mag_size;
774 skm->skm_refill = skc->skc_mag_refill;
a1502d76 775 if (!(skc->skc_flags & KMC_NOTOUCH))
776 skm->skm_age = jiffies;
4afaaefa 777 }
778
779 RETURN(skm);
780}
781
782static void
783spl_magazine_free(spl_kmem_magazine_t *skm)
784{
a0f6da3d 785 int size = sizeof(spl_kmem_magazine_t) +
786 sizeof(void *) * skm->skm_size;
787
4afaaefa 788 ENTRY;
789 ASSERT(skm->skm_magic == SKM_MAGIC);
790 ASSERT(skm->skm_avail == 0);
a0f6da3d 791
792 kmem_free(skm, size);
4afaaefa 793 EXIT;
794}
795
796static int
797spl_magazine_create(spl_kmem_cache_t *skc)
798{
799 int i;
800 ENTRY;
801
802 skc->skc_mag_size = spl_magazine_size(skc);
803 skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
804
805 for_each_online_cpu(i) {
806 skc->skc_mag[i] = spl_magazine_alloc(skc, cpu_to_node(i));
807 if (!skc->skc_mag[i]) {
808 for (i--; i >= 0; i--)
809 spl_magazine_free(skc->skc_mag[i]);
810
811 RETURN(-ENOMEM);
812 }
813 }
814
815 RETURN(0);
816}
817
818static void
819spl_magazine_destroy(spl_kmem_cache_t *skc)
820{
821 spl_kmem_magazine_t *skm;
822 int i;
823 ENTRY;
824
825 for_each_online_cpu(i) {
826 skm = skc->skc_mag[i];
827 (void)spl_cache_flush(skc, skm, skm->skm_avail);
828 spl_magazine_free(skm);
829 }
830
831 EXIT;
832}
833
2fb9b26a 834spl_kmem_cache_t *
835spl_kmem_cache_create(char *name, size_t size, size_t align,
836 spl_kmem_ctor_t ctor,
837 spl_kmem_dtor_t dtor,
838 spl_kmem_reclaim_t reclaim,
839 void *priv, void *vmp, int flags)
840{
841 spl_kmem_cache_t *skc;
a1502d76 842 uint32_t slab_max, slab_size, slab_objs;
843 int rc, kmem_flags = KM_SLEEP;
2fb9b26a 844 ENTRY;
937879f1 845
a1502d76 846 ASSERTF(!(flags & KMC_NOMAGAZINE), "Bad KMC_NOMAGAZINE (%x)\n", flags);
847 ASSERTF(!(flags & KMC_NOHASH), "Bad KMC_NOHASH (%x)\n", flags);
848 ASSERTF(!(flags & KMC_QCACHE), "Bad KMC_QCACHE (%x)\n", flags);
849
2fb9b26a 850 /* We may be called when there is a non-zero preempt_count or
 851 * interrupts are disabled in which case we must not sleep.
852 */
e9d7a2be 853 if (current_thread_info()->preempt_count || irqs_disabled())
2fb9b26a 854 kmem_flags = KM_NOSLEEP;
0a6fd143 855
2fb9b26a 856 /* Allocate new cache memory and initialize. */
ff449ac4 857 skc = (spl_kmem_cache_t *)kmem_zalloc(sizeof(*skc), kmem_flags);
e9d7a2be 858 if (skc == NULL)
2fb9b26a 859 RETURN(NULL);
d61e12af 860
2fb9b26a 861 skc->skc_magic = SKC_MAGIC;
2fb9b26a 862 skc->skc_name_size = strlen(name) + 1;
863 skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, kmem_flags);
864 if (skc->skc_name == NULL) {
865 kmem_free(skc, sizeof(*skc));
866 RETURN(NULL);
867 }
868 strncpy(skc->skc_name, name, skc->skc_name_size);
869
e9d7a2be 870 skc->skc_ctor = ctor;
871 skc->skc_dtor = dtor;
872 skc->skc_reclaim = reclaim;
2fb9b26a 873 skc->skc_private = priv;
874 skc->skc_vmp = vmp;
875 skc->skc_flags = flags;
876 skc->skc_obj_size = size;
2fb9b26a 877 skc->skc_delay = SPL_KMEM_CACHE_DELAY;
878
2fb9b26a 879 INIT_LIST_HEAD(&skc->skc_list);
880 INIT_LIST_HEAD(&skc->skc_complete_list);
881 INIT_LIST_HEAD(&skc->skc_partial_list);
d46630e0 882 spin_lock_init(&skc->skc_lock);
e9d7a2be 883 skc->skc_slab_fail = 0;
884 skc->skc_slab_create = 0;
885 skc->skc_slab_destroy = 0;
2fb9b26a 886 skc->skc_slab_total = 0;
887 skc->skc_slab_alloc = 0;
888 skc->skc_slab_max = 0;
889 skc->skc_obj_total = 0;
890 skc->skc_obj_alloc = 0;
891 skc->skc_obj_max = 0;
a1502d76 892
 893 /* If none passed, select a cache type based on object size */
894 if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM))) {
895 if (skc->skc_obj_size < (PAGE_SIZE / 8)) {
896 skc->skc_flags |= KMC_KMEM;
897 } else {
898 skc->skc_flags |= KMC_VMEM;
899 }
900 }
901
 902 /* Size slabs properly to ensure they are not too large */
903 slab_max = ((uint64_t)1 << (MAX_ORDER - 1)) * PAGE_SIZE;
904 if (skc->skc_flags & KMC_OFFSLAB) {
905 skc->skc_slab_objs = SPL_KMEM_CACHE_OBJ_PER_SLAB;
906 skc->skc_slab_size = sizeof(spl_kmem_slab_t);
907 ASSERT(skc->skc_obj_size < slab_max);
908 } else {
909 slab_objs = SPL_KMEM_CACHE_OBJ_PER_SLAB + 1;
910
911 do {
912 slab_objs--;
913 slab_size = sizeof(spl_kmem_slab_t) + slab_objs *
914 (skc->skc_obj_size+sizeof(spl_kmem_obj_t));
915 } while (slab_size > slab_max);
916
917 skc->skc_slab_objs = slab_objs;
918 skc->skc_slab_size = slab_size;
919 }
4afaaefa 920
921 rc = spl_magazine_create(skc);
922 if (rc) {
4afaaefa 923 kmem_free(skc->skc_name, skc->skc_name_size);
924 kmem_free(skc, sizeof(*skc));
925 RETURN(NULL);
926 }
2fb9b26a 927
928 down_write(&spl_kmem_cache_sem);
e9d7a2be 929 list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
2fb9b26a 930 up_write(&spl_kmem_cache_sem);
931
e9d7a2be 932 RETURN(skc);
f1ca4da6 933}
2fb9b26a 934EXPORT_SYMBOL(spl_kmem_cache_create);
f1ca4da6 935
2fb9b26a 936void
937spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
f1ca4da6 938{
2fb9b26a 939 spl_kmem_slab_t *sks, *m;
940 ENTRY;
f1ca4da6 941
e9d7a2be 942 ASSERT(skc->skc_magic == SKC_MAGIC);
943
944 down_write(&spl_kmem_cache_sem);
945 list_del_init(&skc->skc_list);
946 up_write(&spl_kmem_cache_sem);
2fb9b26a 947
4afaaefa 948 spl_magazine_destroy(skc);
d46630e0 949 spin_lock(&skc->skc_lock);
d6a26c6a 950
2fb9b26a 951 /* Validate there are no objects in use and free all the
4afaaefa 952 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
2fb9b26a 953 ASSERT(list_empty(&skc->skc_complete_list));
a1502d76 954 ASSERT(skc->skc_slab_alloc == 0);
955 ASSERT(skc->skc_obj_alloc == 0);
d6a26c6a 956
e9d7a2be 957 list_for_each_entry_safe(sks, m, &skc->skc_partial_list, sks_list)
4afaaefa 958 spl_slab_free(sks);
2fb9b26a 959
a1502d76 960 ASSERT(skc->skc_slab_total == 0);
961 ASSERT(skc->skc_obj_total == 0);
962
2fb9b26a 963 kmem_free(skc->skc_name, skc->skc_name_size);
d46630e0 964 spin_unlock(&skc->skc_lock);
ff449ac4 965
4afaaefa 966 kmem_free(skc, sizeof(*skc));
2fb9b26a 967
968 EXIT;
f1ca4da6 969}
2fb9b26a 970EXPORT_SYMBOL(spl_kmem_cache_destroy);
f1ca4da6 971
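/* Allocate a single object from the given slab; the caller must hold
 * skc->skc_lock.  The object is removed from the slab's free list and
 * the per-cache object and slab statistics are updated. */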
4afaaefa 972static void *
973spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
f1ca4da6 974{
2fb9b26a 975 spl_kmem_obj_t *sko;
f1ca4da6 976
e9d7a2be 977 ASSERT(skc->skc_magic == SKC_MAGIC);
978 ASSERT(sks->sks_magic == SKS_MAGIC);
4afaaefa 979 ASSERT(spin_is_locked(&skc->skc_lock));
2fb9b26a 980
a1502d76 981 sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
4afaaefa 982 ASSERT(sko->sko_magic == SKO_MAGIC);
983 ASSERT(sko->sko_addr != NULL);
2fb9b26a 984
a1502d76 985 /* Remove from sks_free_list */
4afaaefa 986 list_del_init(&sko->sko_list);
2fb9b26a 987
4afaaefa 988 sks->sks_age = jiffies;
989 sks->sks_ref++;
990 skc->skc_obj_alloc++;
2fb9b26a 991
4afaaefa 992 /* Track max obj usage statistics */
993 if (skc->skc_obj_alloc > skc->skc_obj_max)
994 skc->skc_obj_max = skc->skc_obj_alloc;
2fb9b26a 995
4afaaefa 996 /* Track max slab usage statistics */
997 if (sks->sks_ref == 1) {
998 skc->skc_slab_alloc++;
f1ca4da6 999
4afaaefa 1000 if (skc->skc_slab_alloc > skc->skc_slab_max)
1001 skc->skc_slab_max = skc->skc_slab_alloc;
2fb9b26a 1002 }
1003
4afaaefa 1004 return sko->sko_addr;
1005}
c30df9c8 1006
4afaaefa 1007/* No objects are available, so create a new slab. Since this is an
 1008 * expensive operation we do it without holding the spinlock
 1009 * and only briefly acquire it when we link in the fully
1010 * allocated and constructed slab.
1011 */
1012static spl_kmem_slab_t *
1013spl_cache_grow(spl_kmem_cache_t *skc, int flags)
1014{
e9d7a2be 1015 spl_kmem_slab_t *sks;
4afaaefa 1016 ENTRY;
f1ca4da6 1017
e9d7a2be 1018 ASSERT(skc->skc_magic == SKC_MAGIC);
1019
1020 if (flags & __GFP_WAIT) {
fece7c99 1021 flags |= __GFP_NOFAIL;
4afaaefa 1022 local_irq_enable();
f78a933f 1023 might_sleep();
4afaaefa 1024 }
f1ca4da6 1025
4afaaefa 1026 sks = spl_slab_alloc(skc, flags);
1027 if (sks == NULL) {
1028 if (flags & __GFP_WAIT)
1029 local_irq_disable();
1030
1031 RETURN(NULL);
1032 }
2fb9b26a 1033
e9d7a2be 1034 if (flags & __GFP_WAIT)
4afaaefa 1035 local_irq_disable();
1036
 1037 /* Link the new empty slab into the end of skc_partial_list */
d46630e0 1038 spin_lock(&skc->skc_lock);
2fb9b26a 1039 skc->skc_slab_total++;
1040 skc->skc_obj_total += sks->sks_objs;
1041 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
d46630e0 1042 spin_unlock(&skc->skc_lock);
4afaaefa 1043
1044 RETURN(sks);
f1ca4da6 1045}
1046
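/* Refill the per-CPU magazine 'skm' with objects taken from the partial
 * slabs, growing the cache when no slabs are available.  Returns the
 * number of objects added to the magazine. */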
4afaaefa 1047static int
1048spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
f1ca4da6 1049{
e9d7a2be 1050 spl_kmem_slab_t *sks;
1051 int rc = 0, refill;
937879f1 1052 ENTRY;
f1ca4da6 1053
e9d7a2be 1054 ASSERT(skc->skc_magic == SKC_MAGIC);
1055 ASSERT(skm->skm_magic == SKM_MAGIC);
1056
4afaaefa 1057 /* XXX: Check for refill bouncing by age perhaps */
e9d7a2be 1058 refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
4afaaefa 1059
d46630e0 1060 spin_lock(&skc->skc_lock);
ff449ac4 1061
4afaaefa 1062 while (refill > 0) {
 1063 /* No slabs available, so we must grow the cache */
1064 if (list_empty(&skc->skc_partial_list)) {
1065 spin_unlock(&skc->skc_lock);
ff449ac4 1066
4afaaefa 1067 sks = spl_cache_grow(skc, flags);
1068 if (!sks)
e9d7a2be 1069 GOTO(out, rc);
4afaaefa 1070
 1071 /* Rescheduled to a different CPU, so skm is no longer local */
1072 if (skm != skc->skc_mag[smp_processor_id()])
e9d7a2be 1073 GOTO(out, rc);
1074
1075 /* Potentially rescheduled to the same CPU but
 1076 * allocations may have occurred from this CPU while
1077 * we were sleeping so recalculate max refill. */
1078 refill = MIN(refill, skm->skm_size - skm->skm_avail);
4afaaefa 1079
1080 spin_lock(&skc->skc_lock);
1081 continue;
1082 }
d46630e0 1083
4afaaefa 1084 /* Grab the next available slab */
1085 sks = list_entry((&skc->skc_partial_list)->next,
1086 spl_kmem_slab_t, sks_list);
1087 ASSERT(sks->sks_magic == SKS_MAGIC);
1088 ASSERT(sks->sks_ref < sks->sks_objs);
1089 ASSERT(!list_empty(&sks->sks_free_list));
d46630e0 1090
4afaaefa 1091 /* Consume as many objects as needed to refill the requested
e9d7a2be 1092 * cache. We must also be careful not to overfill it. */
1093 while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++rc) {
1094 ASSERT(skm->skm_avail < skm->skm_size);
1095 ASSERT(rc < skm->skm_size);
4afaaefa 1096 skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks);
e9d7a2be 1097 }
f1ca4da6 1098
4afaaefa 1099 /* Move slab to skc_complete_list when full */
1100 if (sks->sks_ref == sks->sks_objs) {
1101 list_del(&sks->sks_list);
1102 list_add(&sks->sks_list, &skc->skc_complete_list);
2fb9b26a 1103 }
1104 }
57d86234 1105
4afaaefa 1106 spin_unlock(&skc->skc_lock);
1107out:
1108 /* Returns the number of entries added to cache */
e9d7a2be 1109 RETURN(rc);
4afaaefa 1110}
1111
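/* Return a single object to its slab; the caller must hold
 * skc->skc_lock.  The slab is moved back to the partial list when it is
 * no longer full and to the tail of that list once completely empty. */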
1112static void
1113spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
1114{
e9d7a2be 1115 spl_kmem_slab_t *sks = NULL;
4afaaefa 1116 spl_kmem_obj_t *sko = NULL;
1117 ENTRY;
1118
e9d7a2be 1119 ASSERT(skc->skc_magic == SKC_MAGIC);
4afaaefa 1120 ASSERT(spin_is_locked(&skc->skc_lock));
1121
a1502d76 1122 sko = obj + skc->skc_obj_size;
1123 ASSERT(sko->sko_magic == SKO_MAGIC);
4afaaefa 1124
1125 sks = sko->sko_slab;
a1502d76 1126 ASSERT(sks->sks_magic == SKS_MAGIC);
2fb9b26a 1127 ASSERT(sks->sks_cache == skc);
2fb9b26a 1128 list_add(&sko->sko_list, &sks->sks_free_list);
d6a26c6a 1129
2fb9b26a 1130 sks->sks_age = jiffies;
4afaaefa 1131 sks->sks_ref--;
2fb9b26a 1132 skc->skc_obj_alloc--;
f1ca4da6 1133
2fb9b26a 1134 /* Move slab to skc_partial_list when no longer full. Slabs
4afaaefa 1135 * are added to the head to keep the partial list in quasi-full
1136 * sorted order. Fuller at the head, emptier at the tail. */
1137 if (sks->sks_ref == (sks->sks_objs - 1)) {
2fb9b26a 1138 list_del(&sks->sks_list);
1139 list_add(&sks->sks_list, &skc->skc_partial_list);
1140 }
f1ca4da6 1141
2fb9b26a 1142 /* Move empty slabs to the end of the partial list so
4afaaefa 1143 * they can be easily found and freed during reclamation. */
1144 if (sks->sks_ref == 0) {
2fb9b26a 1145 list_del(&sks->sks_list);
1146 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1147 skc->skc_slab_alloc--;
1148 }
1149
4afaaefa 1150 EXIT;
1151}
1152
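/* Release up to 'flush' objects from the per-CPU magazine back to their
 * slabs under skc->skc_lock and return the number actually released. */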
1153static int
1154spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
1155{
1156 int i, count = MIN(flush, skm->skm_avail);
1157 ENTRY;
1158
e9d7a2be 1159 ASSERT(skc->skc_magic == SKC_MAGIC);
1160 ASSERT(skm->skm_magic == SKM_MAGIC);
4afaaefa 1161
1162 spin_lock(&skc->skc_lock);
ff449ac4 1163
4afaaefa 1164 for (i = 0; i < count; i++)
1165 spl_cache_shrink(skc, skm->skm_objs[i]);
1166
e9d7a2be 1167// __spl_slab_reclaim(skc);
1168 skm->skm_avail -= count;
1169 memmove(skm->skm_objs, &(skm->skm_objs[count]),
4afaaefa 1170 sizeof(void *) * skm->skm_avail);
1171
d46630e0 1172 spin_unlock(&skc->skc_lock);
4afaaefa 1173
1174 RETURN(count);
1175}
1176
1177void *
1178spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
1179{
1180 spl_kmem_magazine_t *skm;
1181 unsigned long irq_flags;
1182 void *obj = NULL;
e9d7a2be 1183 int id;
4afaaefa 1184 ENTRY;
1185
e9d7a2be 1186 ASSERT(skc->skc_magic == SKC_MAGIC);
1187 ASSERT(flags & KM_SLEEP); /* XXX: KM_NOSLEEP not yet supported */
4afaaefa 1188 local_irq_save(irq_flags);
1189
1190restart:
1191 /* Safe to update per-cpu structure without lock, but
 1192 * in the restart case we must be careful to reacquire
1193 * the local magazine since this may have changed
1194 * when we need to grow the cache. */
e9d7a2be 1195 id = smp_processor_id();
1196 ASSERTF(id < 4, "cache=%p smp_processor_id=%d\n", skc, id);
4afaaefa 1197 skm = skc->skc_mag[smp_processor_id()];
e9d7a2be 1198 ASSERTF(skm->skm_magic == SKM_MAGIC, "%x != %x: %s/%p/%p %x/%x/%x\n",
1199 skm->skm_magic, SKM_MAGIC, skc->skc_name, skc, skm,
1200 skm->skm_size, skm->skm_refill, skm->skm_avail);
4afaaefa 1201
1202 if (likely(skm->skm_avail)) {
1203 /* Object available in CPU cache, use it */
1204 obj = skm->skm_objs[--skm->skm_avail];
a1502d76 1205 if (!(skc->skc_flags & KMC_NOTOUCH))
1206 skm->skm_age = jiffies;
4afaaefa 1207 } else {
1208 /* Per-CPU cache empty, directly allocate from
1209 * the slab and refill the per-CPU cache. */
1210 (void)spl_cache_refill(skc, skm, flags);
1211 GOTO(restart, obj = NULL);
1212 }
1213
1214 local_irq_restore(irq_flags);
fece7c99 1215 ASSERT(obj);
4afaaefa 1216
1217 /* Pre-emptively migrate object to CPU L1 cache */
1218 prefetchw(obj);
1219
1220 RETURN(obj);
1221}
1222EXPORT_SYMBOL(spl_kmem_cache_alloc);
1223
1224void
1225spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
1226{
1227 spl_kmem_magazine_t *skm;
1228 unsigned long flags;
1229 ENTRY;
1230
e9d7a2be 1231 ASSERT(skc->skc_magic == SKC_MAGIC);
4afaaefa 1232 local_irq_save(flags);
1233
 1234 /* Safe to update per-cpu structure without lock, but since
 1235 * no remote memory allocation tracking is being performed
 1236 * it is entirely possible to allocate an object from one
1237 * CPU cache and return it to another. */
1238 skm = skc->skc_mag[smp_processor_id()];
e9d7a2be 1239 ASSERT(skm->skm_magic == SKM_MAGIC);
4afaaefa 1240
1241 /* Per-CPU cache full, flush it to make space */
1242 if (unlikely(skm->skm_avail >= skm->skm_size))
1243 (void)spl_cache_flush(skc, skm, skm->skm_refill);
1244
1245 /* Available space in cache, use it */
1246 skm->skm_objs[skm->skm_avail++] = obj;
1247
1248 local_irq_restore(flags);
1249
1250 EXIT;
f1ca4da6 1251}
2fb9b26a 1252EXPORT_SYMBOL(spl_kmem_cache_free);
5c2bb9b2 1253
2fb9b26a 1254static int
4afaaefa 1255spl_kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask)
2fb9b26a 1256{
e9d7a2be 1257 spl_kmem_cache_t *skc;
5c2bb9b2 1258
2fb9b26a 1259 /* Under Linux a shrinker is not tightly coupled with a slab
 1260 * cache. In fact Linux always systematically tries calling all
1261 * registered shrinker callbacks until its target reclamation level
1262 * is reached. Because of this we only register one shrinker
1263 * function in the shim layer for all slab caches. And we always
1264 * attempt to shrink all caches when this generic shrinker is called.
c30df9c8 1265 */
e9d7a2be 1266 down_read(&spl_kmem_cache_sem);
57d86234 1267
e9d7a2be 1268 list_for_each_entry(skc, &spl_kmem_cache_list, skc_list)
2fb9b26a 1269 spl_kmem_cache_reap_now(skc);
1270
e9d7a2be 1271 up_read(&spl_kmem_cache_sem);
2fb9b26a 1272
 1273 /* XXX: Under Linux a shrinker should return the remaining number of
 1274 * entries in the cache. We should do this as well.
1275 */
1276 return 1;
5c2bb9b2 1277}
5c2bb9b2 1278
57d86234 1279void
2fb9b26a 1280spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
57d86234 1281{
4afaaefa 1282 spl_kmem_magazine_t *skm;
1283 int i;
2fb9b26a 1284 ENTRY;
e9d7a2be 1285
1286 ASSERT(skc->skc_magic == SKC_MAGIC);
2fb9b26a 1287
1288 if (skc->skc_reclaim)
1289 skc->skc_reclaim(skc->skc_private);
1290
4afaaefa 1291 /* Ensure per-CPU caches which are idle gradually flush */
1292 for_each_online_cpu(i) {
1293 skm = skc->skc_mag[i];
1294
1295 if (time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
1296 (void)spl_cache_flush(skc, skm, skm->skm_refill);
1297 }
1298
1299 spl_slab_reclaim(skc);
1300
2fb9b26a 1301 EXIT;
57d86234 1302}
2fb9b26a 1303EXPORT_SYMBOL(spl_kmem_cache_reap_now);
57d86234 1304
f1b59d26 1305void
2fb9b26a 1306spl_kmem_reap(void)
937879f1 1307{
4afaaefa 1308 spl_kmem_cache_generic_shrinker(KMC_REAP_CHUNK, GFP_KERNEL);
f1ca4da6 1309}
2fb9b26a 1310EXPORT_SYMBOL(spl_kmem_reap);
5d86345d 1311
ff449ac4 1312#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
c6dc93d6 1313static char *
4afaaefa 1314spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
d6a26c6a 1315{
e9d7a2be 1316 int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
d6a26c6a 1317 int i, flag = 1;
1318
1319 ASSERT(str != NULL && len >= 17);
e9d7a2be 1320 memset(str, 0, len);
d6a26c6a 1321
1322 /* Check for a fully printable string, and while we are at
1323 * it place the printable characters in the passed buffer. */
1324 for (i = 0; i < size; i++) {
e9d7a2be 1325 str[i] = ((char *)(kd->kd_addr))[i];
1326 if (isprint(str[i])) {
1327 continue;
1328 } else {
1329 /* Minimum number of printable characters found
1330 * to make it worthwhile to print this as ascii. */
1331 if (i > min)
1332 break;
1333
1334 flag = 0;
1335 break;
1336 }
d6a26c6a 1337 }
1338
1339 if (!flag) {
1340 sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
1341 *((uint8_t *)kd->kd_addr),
1342 *((uint8_t *)kd->kd_addr + 2),
1343 *((uint8_t *)kd->kd_addr + 4),
1344 *((uint8_t *)kd->kd_addr + 6),
1345 *((uint8_t *)kd->kd_addr + 8),
1346 *((uint8_t *)kd->kd_addr + 10),
1347 *((uint8_t *)kd->kd_addr + 12),
1348 *((uint8_t *)kd->kd_addr + 14));
1349 }
1350
1351 return str;
1352}
1353
a1502d76 1354static int
1355spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
1356{
1357 int i;
1358 ENTRY;
1359
1360 spin_lock_init(lock);
1361 INIT_LIST_HEAD(list);
1362
1363 for (i = 0; i < size; i++)
1364 INIT_HLIST_HEAD(&kmem_table[i]);
1365
1366 RETURN(0);
1367}
1368
ff449ac4 1369static void
1370spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
5d86345d 1371{
2fb9b26a 1372 unsigned long flags;
1373 kmem_debug_t *kd;
1374 char str[17];
a1502d76 1375 ENTRY;
2fb9b26a 1376
ff449ac4 1377 spin_lock_irqsave(lock, flags);
1378 if (!list_empty(list))
a0f6da3d 1379 printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
1380 "size", "data", "func", "line");
2fb9b26a 1381
ff449ac4 1382 list_for_each_entry(kd, list, kd_list)
a0f6da3d 1383 printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
1384 kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
2fb9b26a 1385 kd->kd_func, kd->kd_line);
1386
ff449ac4 1387 spin_unlock_irqrestore(lock, flags);
a1502d76 1388 EXIT;
ff449ac4 1389}
1390#else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
a1502d76 1391#define spl_kmem_init_tracking(list, lock, size)
ff449ac4 1392#define spl_kmem_fini_tracking(list, lock)
1393#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
1394
a1502d76 1395int
1396spl_kmem_init(void)
1397{
1398 int rc = 0;
1399 ENTRY;
1400
1401 init_rwsem(&spl_kmem_cache_sem);
1402 INIT_LIST_HEAD(&spl_kmem_cache_list);
1403
1404#ifdef HAVE_SET_SHRINKER
1405 spl_kmem_cache_shrinker = set_shrinker(KMC_DEFAULT_SEEKS,
1406 spl_kmem_cache_generic_shrinker);
1407 if (spl_kmem_cache_shrinker == NULL)
f78a933f 1408 RETURN(rc = -ENOMEM);
a1502d76 1409#else
1410 register_shrinker(&spl_kmem_cache_shrinker);
1411#endif
1412
1413#ifdef DEBUG_KMEM
1414 atomic64_set(&kmem_alloc_used, 0);
1415 atomic64_set(&vmem_alloc_used, 0);
1416
1417 spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
1418 spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE);
1419#endif
a1502d76 1420 RETURN(rc);
1421}
1422
ff449ac4 1423void
1424spl_kmem_fini(void)
1425{
1426#ifdef DEBUG_KMEM
1427 /* Display all unreclaimed memory addresses, including the
1428 * allocation size and the first few bytes of what's located
1429 * at that address to aid in debugging. Performance is not
1430 * a serious concern here since it is module unload time. */
1431 if (atomic64_read(&kmem_alloc_used) != 0)
1432 CWARN("kmem leaked %ld/%ld bytes\n",
550f1705 1433 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
ff449ac4 1434
2fb9b26a 1435
1436 if (atomic64_read(&vmem_alloc_used) != 0)
1437 CWARN("vmem leaked %ld/%ld bytes\n",
550f1705 1438 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
2fb9b26a 1439
ff449ac4 1440 spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
1441 spl_kmem_fini_tracking(&vmem_list, &vmem_lock);
1442#endif /* DEBUG_KMEM */
2fb9b26a 1443 ENTRY;
1444
1445#ifdef HAVE_SET_SHRINKER
1446 remove_shrinker(spl_kmem_cache_shrinker);
1447#else
1448 unregister_shrinker(&spl_kmem_cache_shrinker);
5d86345d 1449#endif
2fb9b26a 1450
937879f1 1451 EXIT;
5d86345d 1452}