Remove debug check that was accidentally left in place and prevented the slab cache from...
[mirror_spl-debian.git] / module / spl / spl-kmem.c
715f6251 1/*
2 * This file is part of the SPL: Solaris Porting Layer.
3 *
4 * Copyright (c) 2008 Lawrence Livermore National Security, LLC.
5 * Produced at Lawrence Livermore National Laboratory
6 * Written by:
7 * Brian Behlendorf <behlendorf1@llnl.gov>,
8 * Herb Wartens <wartens2@llnl.gov>,
9 * Jim Garlick <garlick@llnl.gov>
10 * UCRL-CODE-235197
11 *
12 * This is free software; you can redistribute it and/or modify it
13 * under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This is distributed in the hope that it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
20 * for more details.
21 *
22 * You should have received a copy of the GNU General Public License along
23 * with this program; if not, write to the Free Software Foundation, Inc.,
24 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 */
26
f4b37741 27#include <sys/kmem.h>
f1ca4da6 28
937879f1 29#ifdef DEBUG_SUBSYSTEM
a0f6da3d 30# undef DEBUG_SUBSYSTEM
937879f1 31#endif
32
33#define DEBUG_SUBSYSTEM S_KMEM
34
f1ca4da6 35/*
2fb9b26a 36 * Memory allocation interfaces and debugging for basic kmem_*
 37 * and vmem_* style memory allocation. When DEBUG_KMEM is enabled
 38 * all allocations will be tracked when they are allocated and
 39 * freed. When the SPL module is unloaded a list of all leaked
 40 * addresses and where they were allocated will be dumped to the
 41 * console. Enabling this feature has a significant impact on
 42 * performance but it makes finding memory leaks straightforward.
f1ca4da6 43 */
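/*
 * Illustrative usage sketch (not part of the original file): with
 * DEBUG_KMEM defined the kmem_alloc()/kmem_free() macros from
 * <sys/kmem.h> are assumed to funnel into the *_track or *_debug
 * variants defined below.  The contract visible in those functions is
 * that the size passed to kmem_free() must match the size passed to
 * kmem_alloc(), since both the usage counters and the tracking hash
 * are keyed on it.  The function and buffer names are hypothetical.
 */
static void
example_tracked_alloc(void)
{
	size_t size = 128 * sizeof (void *);
	void *buf;

	buf = kmem_alloc(size, KM_SLEEP);	/* accounted in kmem_alloc_used */
	if (buf == NULL)
		return;

	/* ... use buf ... */

	kmem_free(buf, size);			/* must pass the original size */
}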
44#ifdef DEBUG_KMEM
45/* Shim layer memory accounting */
550f1705 46atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
a0f6da3d 47unsigned long long kmem_alloc_max = 0;
550f1705 48atomic64_t vmem_alloc_used = ATOMIC64_INIT(0);
a0f6da3d 49unsigned long long vmem_alloc_max = 0;
c19c06f3 50int kmem_warning_flag = 1;
79b31f36 51
ff449ac4 52EXPORT_SYMBOL(kmem_alloc_used);
53EXPORT_SYMBOL(kmem_alloc_max);
54EXPORT_SYMBOL(vmem_alloc_used);
55EXPORT_SYMBOL(vmem_alloc_max);
56EXPORT_SYMBOL(kmem_warning_flag);
57
a0f6da3d 58# ifdef DEBUG_KMEM_TRACKING
59
 60/* XXX - Not too surprisingly with debugging enabled the xmem_locks are very
61 * highly contended particularly on xfree(). If we want to run with this
62 * detailed debugging enabled for anything other than debugging we need to
63 * minimize the contention by moving to a lock per xmem_table entry model.
64 */
65
66# define KMEM_HASH_BITS 10
67# define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS)
68
69# define VMEM_HASH_BITS 10
70# define VMEM_TABLE_SIZE (1 << VMEM_HASH_BITS)
71
72typedef struct kmem_debug {
73 struct hlist_node kd_hlist; /* Hash node linkage */
74 struct list_head kd_list; /* List of all allocations */
75 void *kd_addr; /* Allocation pointer */
76 size_t kd_size; /* Allocation size */
77 const char *kd_func; /* Allocation function */
78 int kd_line; /* Allocation line */
79} kmem_debug_t;
80
d6a26c6a 81spinlock_t kmem_lock;
82struct hlist_head kmem_table[KMEM_TABLE_SIZE];
83struct list_head kmem_list;
84
13cdca65 85spinlock_t vmem_lock;
86struct hlist_head vmem_table[VMEM_TABLE_SIZE];
87struct list_head vmem_list;
88
d6a26c6a 89EXPORT_SYMBOL(kmem_lock);
90EXPORT_SYMBOL(kmem_table);
91EXPORT_SYMBOL(kmem_list);
92
13cdca65 93EXPORT_SYMBOL(vmem_lock);
94EXPORT_SYMBOL(vmem_table);
95EXPORT_SYMBOL(vmem_list);
a0f6da3d 96# endif
13cdca65 97
c19c06f3 98int kmem_set_warning(int flag) { return (kmem_warning_flag = !!flag); }
99#else
100int kmem_set_warning(int flag) { return 0; }
f1ca4da6 101#endif
c19c06f3 102EXPORT_SYMBOL(kmem_set_warning);
f1ca4da6 103
104/*
105 * Slab allocation interfaces
106 *
2fb9b26a 107 * While the Linux slab implementation was inspired by the Solaris
 108 * implementation I cannot use it to emulate the Solaris APIs. I
109 * require two features which are not provided by the Linux slab.
110 *
111 * 1) Constructors AND destructors. Recent versions of the Linux
112 * kernel have removed support for destructors. This is a deal
113 * breaker for the SPL which contains particularly expensive
114 * initializers for mutex's, condition variables, etc. We also
a0f6da3d 115 * require a minimal level of cleanup for these data types which,
 116 * unlike many Linux data types, do need to be explicitly destroyed.
2fb9b26a 117 *
a0f6da3d 118 * 2) Virtual address space backed slab. Callers of the Solaris slab
2fb9b26a 119 * expect it to work well for both small are very large allocations.
120 * Because of memory fragmentation the Linux slab which is backed
121 * by kmalloc'ed memory performs very badly when confronted with
122 * large numbers of large allocations. Basing the slab on the
 123 * virtual address space removes the need for contiguous pages
 124 * and greatly improves performance for large allocations.
125 *
126 * For these reasons, the SPL has its own slab implementation with
127 * the needed features. It is not as highly optimized as either the
128 * Solaris or Linux slabs, but it should get me most of what is
129 * needed until it can be optimized or obsoleted by another approach.
130 *
131 * One serious concern I do have about this method is the relatively
132 * small virtual address space on 32bit arches. This will seriously
133 * constrain the size of the slab caches and their performance.
134 *
2fb9b26a 135 * XXX: Implement work requests to keep an eye on each cache and
4afaaefa 136 * shrink them via spl_slab_reclaim() when they are wasting lots
2fb9b26a 137 * of space. Currently this process is driven by the reapers.
138 *
2fb9b26a 139 * XXX: Improve the partial slab list by carefully maintaining a
140 * strict ordering of fullest to emptiest slabs based on
 141 * the slab reference count. This guarantees that when freeing
142 * slabs back to the system we need only linearly traverse the
143 * last N slabs in the list to discover all the freeable slabs.
144 *
145 * XXX: NUMA awareness for optionally allocating memory close to a
 146 * particular core. This can be advantageous if you know the slab
147 * object will be short lived and primarily accessed from one core.
148 *
149 * XXX: Slab coloring may also yield performance improvements and would
150 * be desirable to implement.
f1ca4da6 151 */
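/*
 * Illustrative usage sketch (not part of the original file): this shows
 * how a consumer might drive the cache interfaces defined below.  The
 * constructor/destructor prototypes follow the call sites in
 * spl_slab_alloc()/spl_slab_free(); the example type, the kmutex_t
 * initializers from <sys/mutex.h>, and all "example" names are
 * assumptions made for this sketch only.
 */
typedef struct example_obj {
	kmutex_t	eo_lock;	/* expensive to initialize */
	int		eo_state;
} example_obj_t;

static int
example_ctor(void *obj, void *priv, int flags)
{
	example_obj_t *eo = obj;

	/* Runs once when the object's slab is created, not per allocation */
	mutex_init(&eo->eo_lock, NULL, MUTEX_DEFAULT, NULL);
	eo->eo_state = 0;
	return (0);
}

static void
example_dtor(void *obj, void *priv)
{
	example_obj_t *eo = obj;

	/* Runs when the slab is released back to the system */
	mutex_destroy(&eo->eo_lock);
}

static void
example_cache_usage(void)
{
	spl_kmem_cache_t *cache;
	example_obj_t *eo;

	/* vmp must be NULL; passing 0 flags lets the cache choose
	 * KMC_KMEM or KMC_VMEM based on the object size, see
	 * spl_kmem_cache_create() below. */
	cache = spl_kmem_cache_create("example_cache", sizeof(example_obj_t),
	    0, example_ctor, example_dtor, NULL, NULL, NULL, 0);

	eo = spl_kmem_cache_alloc(cache, KM_SLEEP);
	/* ... use the already constructed object ... */
	spl_kmem_cache_free(cache, eo);

	spl_kmem_cache_destroy(cache);
}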
2fb9b26a 152
a0f6da3d 153struct list_head spl_kmem_cache_list; /* List of caches */
154struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
c30df9c8 155
4afaaefa 156static int spl_cache_flush(spl_kmem_cache_t *skc,
a0f6da3d 157 spl_kmem_magazine_t *skm, int flush);
4afaaefa 158
57d86234 159#ifdef HAVE_SET_SHRINKER
2fb9b26a 160static struct shrinker *spl_kmem_cache_shrinker;
57d86234 161#else
4afaaefa 162static int spl_kmem_cache_generic_shrinker(int nr_to_scan,
a0f6da3d 163 unsigned int gfp_mask);
2fb9b26a 164static struct shrinker spl_kmem_cache_shrinker = {
4afaaefa 165 .shrink = spl_kmem_cache_generic_shrinker,
57d86234 166 .seeks = KMC_DEFAULT_SEEKS,
167};
168#endif
f1ca4da6 169
a0f6da3d 170#ifdef DEBUG_KMEM
171# ifdef DEBUG_KMEM_TRACKING
172
173static kmem_debug_t *
174kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits,
175 void *addr)
176{
177 struct hlist_head *head;
178 struct hlist_node *node;
179 struct kmem_debug *p;
180 unsigned long flags;
181 ENTRY;
182
183 spin_lock_irqsave(lock, flags);
184
185 head = &table[hash_ptr(addr, bits)];
186 hlist_for_each_entry_rcu(p, node, head, kd_hlist) {
187 if (p->kd_addr == addr) {
188 hlist_del_init(&p->kd_hlist);
189 list_del_init(&p->kd_list);
190 spin_unlock_irqrestore(lock, flags);
191 return p;
192 }
193 }
194
195 spin_unlock_irqrestore(lock, flags);
196
197 RETURN(NULL);
198}
199
200void *
201kmem_alloc_track(size_t size, int flags, const char *func, int line,
202 int node_alloc, int node)
203{
204 void *ptr = NULL;
205 kmem_debug_t *dptr;
206 unsigned long irq_flags;
207 ENTRY;
208
209 dptr = (kmem_debug_t *) kmalloc(sizeof(kmem_debug_t),
210 flags & ~__GFP_ZERO);
211
212 if (dptr == NULL) {
213 CWARN("kmem_alloc(%ld, 0x%x) debug failed\n",
214 sizeof(kmem_debug_t), flags);
215 } else {
216 /* Marked unlikely because we should never be doing this,
 217 * we tolerate up to 2 pages but a single page is best. */
218 if (unlikely((size) > (PAGE_SIZE * 2)) && kmem_warning_flag)
219 CWARN("Large kmem_alloc(%llu, 0x%x) (%lld/%llu)\n",
220 (unsigned long long) size, flags,
221 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
222
c8e60837 223 /* We use kstrdup() below because the string pointed to by
224 * __FUNCTION__ might not be available by the time we want
225 * to print it since the module might have been unloaded. */
226 dptr->kd_func = kstrdup(func, flags & ~__GFP_ZERO);
227 if (unlikely(dptr->kd_func == NULL)) {
228 kfree(dptr);
229 CWARN("kstrdup() failed in kmem_alloc(%llu, 0x%x) "
230 "(%lld/%llu)\n", (unsigned long long) size, flags,
231 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
232 goto out;
233 }
234
a0f6da3d 235 /* Use the correct allocator */
236 if (node_alloc) {
237 ASSERT(!(flags & __GFP_ZERO));
238 ptr = kmalloc_node(size, flags, node);
239 } else if (flags & __GFP_ZERO) {
240 ptr = kzalloc(size, flags & ~__GFP_ZERO);
241 } else {
242 ptr = kmalloc(size, flags);
243 }
244
245 if (unlikely(ptr == NULL)) {
c8e60837 246 kfree(dptr->kd_func);
a0f6da3d 247 kfree(dptr);
248 CWARN("kmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
249 (unsigned long long) size, flags,
250 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
251 goto out;
252 }
253
254 atomic64_add(size, &kmem_alloc_used);
255 if (unlikely(atomic64_read(&kmem_alloc_used) >
256 kmem_alloc_max))
257 kmem_alloc_max =
258 atomic64_read(&kmem_alloc_used);
259
260 INIT_HLIST_NODE(&dptr->kd_hlist);
261 INIT_LIST_HEAD(&dptr->kd_list);
262
263 dptr->kd_addr = ptr;
264 dptr->kd_size = size;
a0f6da3d 265 dptr->kd_line = line;
266
267 spin_lock_irqsave(&kmem_lock, irq_flags);
268 hlist_add_head_rcu(&dptr->kd_hlist,
269 &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
270 list_add_tail(&dptr->kd_list, &kmem_list);
271 spin_unlock_irqrestore(&kmem_lock, irq_flags);
272
273 CDEBUG_LIMIT(D_INFO, "kmem_alloc(%llu, 0x%x) = %p "
274 "(%lld/%llu)\n", (unsigned long long) size, flags,
275 ptr, atomic64_read(&kmem_alloc_used),
276 kmem_alloc_max);
277 }
278out:
279 RETURN(ptr);
280}
281EXPORT_SYMBOL(kmem_alloc_track);
282
283void
284kmem_free_track(void *ptr, size_t size)
285{
286 kmem_debug_t *dptr;
287 ENTRY;
288
289 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
290 (unsigned long long) size);
291
292 dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
293
294 ASSERT(dptr); /* Must exist in hash due to kmem_alloc() */
295
296 /* Size must match */
297 ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
298 "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
299 (unsigned long long) size, dptr->kd_func, dptr->kd_line);
300
301 atomic64_sub(size, &kmem_alloc_used);
302
303 CDEBUG_LIMIT(D_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
304 (unsigned long long) size, atomic64_read(&kmem_alloc_used),
305 kmem_alloc_max);
306
c8e60837 307 kfree(dptr->kd_func);
308
a0f6da3d 309 memset(dptr, 0x5a, sizeof(kmem_debug_t));
310 kfree(dptr);
311
312 memset(ptr, 0x5a, size);
313 kfree(ptr);
314
315 EXIT;
316}
317EXPORT_SYMBOL(kmem_free_track);
318
319void *
320vmem_alloc_track(size_t size, int flags, const char *func, int line)
321{
322 void *ptr = NULL;
323 kmem_debug_t *dptr;
324 unsigned long irq_flags;
325 ENTRY;
326
327 ASSERT(flags & KM_SLEEP);
328
329 dptr = (kmem_debug_t *) kmalloc(sizeof(kmem_debug_t), flags);
330 if (dptr == NULL) {
331 CWARN("vmem_alloc(%ld, 0x%x) debug failed\n",
332 sizeof(kmem_debug_t), flags);
333 } else {
c8e60837 334 /* We use kstrdup() below because the string pointed to by
335 * __FUNCTION__ might not be available by the time we want
336 * to print it, since the module might have been unloaded. */
337 dptr->kd_func = kstrdup(func, flags & ~__GFP_ZERO);
338 if (unlikely(dptr->kd_func == NULL)) {
339 kfree(dptr);
340 CWARN("kstrdup() failed in vmem_alloc(%llu, 0x%x) "
341 "(%lld/%llu)\n", (unsigned long long) size, flags,
342 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
343 goto out;
344 }
345
a0f6da3d 346 ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO,
347 PAGE_KERNEL);
348
349 if (unlikely(ptr == NULL)) {
c8e60837 350 kfree(dptr->kd_func);
a0f6da3d 351 kfree(dptr);
352 CWARN("vmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
353 (unsigned long long) size, flags,
354 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
355 goto out;
356 }
357
358 if (flags & __GFP_ZERO)
359 memset(ptr, 0, size);
360
361 atomic64_add(size, &vmem_alloc_used);
362 if (unlikely(atomic64_read(&vmem_alloc_used) >
363 vmem_alloc_max))
364 vmem_alloc_max =
365 atomic64_read(&vmem_alloc_used);
366
367 INIT_HLIST_NODE(&dptr->kd_hlist);
368 INIT_LIST_HEAD(&dptr->kd_list);
369
370 dptr->kd_addr = ptr;
371 dptr->kd_size = size;
a0f6da3d 372 dptr->kd_line = line;
373
374 spin_lock_irqsave(&vmem_lock, irq_flags);
375 hlist_add_head_rcu(&dptr->kd_hlist,
376 &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]);
377 list_add_tail(&dptr->kd_list, &vmem_list);
378 spin_unlock_irqrestore(&vmem_lock, irq_flags);
379
380 CDEBUG_LIMIT(D_INFO, "vmem_alloc(%llu, 0x%x) = %p "
381 "(%lld/%llu)\n", (unsigned long long) size, flags,
382 ptr, atomic64_read(&vmem_alloc_used),
383 vmem_alloc_max);
384 }
385out:
386 RETURN(ptr);
387}
388EXPORT_SYMBOL(vmem_alloc_track);
389
390void
391vmem_free_track(void *ptr, size_t size)
392{
393 kmem_debug_t *dptr;
394 ENTRY;
395
396 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
397 (unsigned long long) size);
398
399 dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr);
400 ASSERT(dptr); /* Must exist in hash due to vmem_alloc() */
401
402 /* Size must match */
403 ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
404 "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
405 (unsigned long long) size, dptr->kd_func, dptr->kd_line);
406
407 atomic64_sub(size, &vmem_alloc_used);
408 CDEBUG_LIMIT(D_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
409 (unsigned long long) size, atomic64_read(&vmem_alloc_used),
410 vmem_alloc_max);
411
c8e60837 412 kfree(dptr->kd_func);
413
a0f6da3d 414 memset(dptr, 0x5a, sizeof(kmem_debug_t));
415 kfree(dptr);
416
417 memset(ptr, 0x5a, size);
418 vfree(ptr);
419
420 EXIT;
421}
422EXPORT_SYMBOL(vmem_free_track);
423
424# else /* DEBUG_KMEM_TRACKING */
425
426void *
427kmem_alloc_debug(size_t size, int flags, const char *func, int line,
428 int node_alloc, int node)
429{
430 void *ptr;
431 ENTRY;
432
433 /* Marked unlikely because we should never be doing this,
 434 * we tolerate up to 2 pages but a single page is best. */
435 if (unlikely(size > (PAGE_SIZE * 2)) && kmem_warning_flag)
436 CWARN("Large kmem_alloc(%llu, 0x%x) (%lld/%llu)\n",
437 (unsigned long long) size, flags,
438 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
439
440 /* Use the correct allocator */
441 if (node_alloc) {
442 ASSERT(!(flags & __GFP_ZERO));
443 ptr = kmalloc_node(size, flags, node);
444 } else if (flags & __GFP_ZERO) {
445 ptr = kzalloc(size, flags & (~__GFP_ZERO));
446 } else {
447 ptr = kmalloc(size, flags);
448 }
449
450 if (ptr == NULL) {
451 CWARN("kmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
452 (unsigned long long) size, flags,
453 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
454 } else {
455 atomic64_add(size, &kmem_alloc_used);
456 if (unlikely(atomic64_read(&kmem_alloc_used) > kmem_alloc_max))
457 kmem_alloc_max = atomic64_read(&kmem_alloc_used);
458
459 CDEBUG_LIMIT(D_INFO, "kmem_alloc(%llu, 0x%x) = %p "
460 "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
461 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
462 }
463 RETURN(ptr);
464}
465EXPORT_SYMBOL(kmem_alloc_debug);
466
467void
468kmem_free_debug(void *ptr, size_t size)
469{
470 ENTRY;
471
472 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
473 (unsigned long long) size);
474
475 atomic64_sub(size, &kmem_alloc_used);
476
477 CDEBUG_LIMIT(D_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
478 (unsigned long long) size, atomic64_read(&kmem_alloc_used),
479 kmem_alloc_max);
480
481 memset(ptr, 0x5a, size);
482 kfree(ptr);
483
484 EXIT;
485}
486EXPORT_SYMBOL(kmem_free_debug);
487
488void *
489vmem_alloc_debug(size_t size, int flags, const char *func, int line)
490{
491 void *ptr;
492 ENTRY;
493
494 ASSERT(flags & KM_SLEEP);
495
496 ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO,
497 PAGE_KERNEL);
498 if (ptr == NULL) {
499 CWARN("vmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
500 (unsigned long long) size, flags,
501 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
502 } else {
503 if (flags & __GFP_ZERO)
504 memset(ptr, 0, size);
505
506 atomic64_add(size, &vmem_alloc_used);
507
508 if (unlikely(atomic64_read(&vmem_alloc_used) > vmem_alloc_max))
509 vmem_alloc_max = atomic64_read(&vmem_alloc_used);
510
511 CDEBUG_LIMIT(D_INFO, "vmem_alloc(%llu, 0x%x) = %p "
512 "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
513 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
514 }
515
516 RETURN(ptr);
517}
518EXPORT_SYMBOL(vmem_alloc_debug);
519
520void
521vmem_free_debug(void *ptr, size_t size)
522{
523 ENTRY;
524
525 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
526 (unsigned long long) size);
527
528 atomic64_sub(size, &vmem_alloc_used);
529
530 CDEBUG_LIMIT(D_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
531 (unsigned long long) size, atomic64_read(&vmem_alloc_used),
532 vmem_alloc_max);
533
534 memset(ptr, 0x5a, size);
535 vfree(ptr);
536
537 EXIT;
538}
539EXPORT_SYMBOL(vmem_free_debug);
540
541# endif /* DEBUG_KMEM_TRACKING */
542#endif /* DEBUG_KMEM */
543
a1502d76 544static void *
545kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
fece7c99 546{
a1502d76 547 void *ptr;
f1ca4da6 548
a1502d76 549 if (skc->skc_flags & KMC_KMEM) {
550 if (size > (2 * PAGE_SIZE)) {
551 ptr = (void *)__get_free_pages(flags, get_order(size));
552 } else
553 ptr = kmem_alloc(size, flags);
554 } else {
555 ptr = vmem_alloc(size, flags);
d6a26c6a 556 }
fece7c99 557
a1502d76 558 return ptr;
559}
fece7c99 560
a1502d76 561static void
562kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
563{
564 if (skc->skc_flags & KMC_KMEM) {
565 if (size > (2 * PAGE_SIZE))
566 free_pages((unsigned long)ptr, get_order(size));
567 else
568 kmem_free(ptr, size);
569 } else {
570 vmem_free(ptr, size);
571 }
fece7c99 572}
573
48e0606a 574/* It's important that we pack the spl_kmem_obj_t structure and the
 575 * actual objects into one large address space to minimize the number
576 * of calls to the allocator. It is far better to do a few large
577 * allocations and then subdivide it ourselves. Now which allocator
 578 * we use requires balancing a few trade-offs.
579 *
580 * For small objects we use kmem_alloc() because as long as you are
581 * only requesting a small number of pages (ideally just one) its cheap.
582 * However, when you start requesting multiple pages with kmem_alloc()
 583 * it gets increasingly expensive since it requires contiguous pages.
584 * For this reason we shift to vmem_alloc() for slabs of large objects
 585 * which removes the need for contiguous pages. We do not use
586 * vmem_alloc() in all cases because there is significant locking
587 * overhead in __get_vm_area_node(). This function takes a single
 588 * global lock when acquiring an available virtual address range which
589 * serializes all vmem_alloc()'s for all slab caches. Using slightly
590 * different allocation functions for small and large objects should
591 * give us the best of both worlds.
592 *
593 * KMC_ONSLAB KMC_OFFSLAB
594 *
595 * +------------------------+ +-----------------+
596 * | spl_kmem_slab_t --+-+ | | spl_kmem_slab_t |---+-+
597 * | skc_obj_size <-+ | | +-----------------+ | |
598 * | spl_kmem_obj_t | | | |
599 * | skc_obj_size <---+ | +-----------------+ | |
600 * | spl_kmem_obj_t | | | skc_obj_size | <-+ |
601 * | ... v | | spl_kmem_obj_t | |
602 * +------------------------+ +-----------------+ v
603 */
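/*
 * Address arithmetic sketch for the layouts above (the names mirror the
 * fields used by spl_slab_alloc() below).  In the KMC_ONSLAB case each
 * object/descriptor pair occupies
 *
 *   size  = P2ROUNDUP(skc_obj_size, align) +
 *           P2ROUNDUP(sizeof(spl_kmem_obj_t), align)
 *
 * bytes, the i-th object starts at
 *
 *   obj_i = base + P2ROUNDUP(sizeof(spl_kmem_slab_t), align) + i * size
 *
 * and its spl_kmem_obj_t descriptor sits immediately after the object at
 * obj_i + P2ROUNDUP(skc_obj_size, align), which is exactly how
 * spl_cache_shrink() recovers the descriptor from a bare object pointer.
 * In the KMC_OFFSLAB case only the spl_kmem_slab_t header lives in the
 * base allocation and each object is a separate kv_alloc().
 */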
fece7c99 604static spl_kmem_slab_t *
a1502d76 605spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
fece7c99 606{
607 spl_kmem_slab_t *sks;
a1502d76 608 spl_kmem_obj_t *sko, *n;
609 void *base, *obj;
48e0606a 610 int i, align, size, rc = 0;
611
a1502d76 612 base = kv_alloc(skc, skc->skc_slab_size, flags);
613 if (base == NULL)
fece7c99 614 RETURN(NULL);
615
a1502d76 616 sks = (spl_kmem_slab_t *)base;
617 sks->sks_magic = SKS_MAGIC;
618 sks->sks_objs = skc->skc_slab_objs;
619 sks->sks_age = jiffies;
620 sks->sks_cache = skc;
621 INIT_LIST_HEAD(&sks->sks_list);
622 INIT_LIST_HEAD(&sks->sks_free_list);
623 sks->sks_ref = 0;
48e0606a 624
625 align = skc->skc_obj_align;
626 size = P2ROUNDUP(skc->skc_obj_size, align) +
627 P2ROUNDUP(sizeof(spl_kmem_obj_t), align);
fece7c99 628
629 for (i = 0; i < sks->sks_objs; i++) {
a1502d76 630 if (skc->skc_flags & KMC_OFFSLAB) {
631 obj = kv_alloc(skc, size, flags);
632 if (!obj)
633 GOTO(out, rc = -ENOMEM);
634 } else {
48e0606a 635 obj = base +
636 P2ROUNDUP(sizeof(spl_kmem_slab_t), align) +
637 (i * size);
a1502d76 638 }
639
48e0606a 640 sko = obj + P2ROUNDUP(skc->skc_obj_size, align);
fece7c99 641 sko->sko_addr = obj;
642 sko->sko_magic = SKO_MAGIC;
643 sko->sko_slab = sks;
644 INIT_LIST_HEAD(&sko->sko_list);
fece7c99 645 list_add_tail(&sko->sko_list, &sks->sks_free_list);
646 }
647
fece7c99 648 list_for_each_entry(sko, &sks->sks_free_list, sko_list)
649 if (skc->skc_ctor)
650 skc->skc_ctor(sko->sko_addr, skc->skc_private, flags);
2fb9b26a 651out:
a1502d76 652 if (rc) {
653 if (skc->skc_flags & KMC_OFFSLAB)
48e0606a 654 list_for_each_entry_safe(sko, n, &sks->sks_free_list,
655 sko_list)
a1502d76 656 kv_free(skc, sko->sko_addr, size);
fece7c99 657
a1502d76 658 kv_free(skc, base, skc->skc_slab_size);
659 sks = NULL;
fece7c99 660 }
661
a1502d76 662 RETURN(sks);
fece7c99 663}
664
2fb9b26a 665/* Removes slab from complete or partial list, so it must
d46630e0 666 * be called with the 'skc->skc_lock' held.
fece7c99 667 */
f1ca4da6 668static void
4afaaefa 669spl_slab_free(spl_kmem_slab_t *sks) {
2fb9b26a 670 spl_kmem_cache_t *skc;
671 spl_kmem_obj_t *sko, *n;
a1502d76 672 int size;
2fb9b26a 673 ENTRY;
57d86234 674
2fb9b26a 675 ASSERT(sks->sks_magic == SKS_MAGIC);
4afaaefa 676 ASSERT(sks->sks_ref == 0);
d6a26c6a 677
fece7c99 678 skc = sks->sks_cache;
679 ASSERT(skc->skc_magic == SKC_MAGIC);
d46630e0 680 ASSERT(spin_is_locked(&skc->skc_lock));
f1ca4da6 681
fece7c99 682 skc->skc_obj_total -= sks->sks_objs;
683 skc->skc_slab_total--;
684 list_del(&sks->sks_list);
48e0606a 685 size = P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align) +
686 P2ROUNDUP(sizeof(spl_kmem_obj_t), skc->skc_obj_align);
937879f1 687
fece7c99 688 /* Run destructors slab is being released */
a1502d76 689 list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) {
690 ASSERT(sko->sko_magic == SKO_MAGIC);
691
2fb9b26a 692 if (skc->skc_dtor)
693 skc->skc_dtor(sko->sko_addr, skc->skc_private);
0a6fd143 694
a1502d76 695 if (skc->skc_flags & KMC_OFFSLAB)
696 kv_free(skc, sko->sko_addr, size);
697 }
d61e12af 698
a1502d76 699 kv_free(skc, sks, skc->skc_slab_size);
2fb9b26a 700 EXIT;
701}
d6a26c6a 702
2fb9b26a 703static int
4afaaefa 704__spl_slab_reclaim(spl_kmem_cache_t *skc)
2fb9b26a 705{
706 spl_kmem_slab_t *sks, *m;
707 int rc = 0;
708 ENTRY;
709
d46630e0 710 ASSERT(spin_is_locked(&skc->skc_lock));
2fb9b26a 711 /*
712 * Free empty slabs which have not been touched in skc_delay
713 * seconds. This delay time is important to avoid thrashing.
714 * Empty slabs will be at the end of the skc_partial_list.
715 */
716 list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list,
717 sks_list) {
4afaaefa 718 if (sks->sks_ref > 0)
2fb9b26a 719 break;
720
721 if (time_after(jiffies, sks->sks_age + skc->skc_delay * HZ)) {
4afaaefa 722 spl_slab_free(sks);
2fb9b26a 723 rc++;
724 }
725 }
726
727 /* Returns number of slabs reclaimed */
728 RETURN(rc);
f1ca4da6 729}
730
2fb9b26a 731static int
4afaaefa 732spl_slab_reclaim(spl_kmem_cache_t *skc)
f1ca4da6 733{
2fb9b26a 734 int rc;
735 ENTRY;
f1ca4da6 736
d46630e0 737 spin_lock(&skc->skc_lock);
4afaaefa 738 rc = __spl_slab_reclaim(skc);
d46630e0 739 spin_unlock(&skc->skc_lock);
4efd4118 740
2fb9b26a 741 RETURN(rc);
742}
f1ca4da6 743
48e0606a 744/* Size slabs properly to ensure they are not too large */
745static int
746spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
747{
748 int max = ((uint64_t)1 << (MAX_ORDER - 1)) * PAGE_SIZE;
749 int align = skc->skc_obj_align;
750
751 *objs = SPL_KMEM_CACHE_OBJ_PER_SLAB;
752
753 if (skc->skc_flags & KMC_OFFSLAB) {
754 *size = sizeof(spl_kmem_slab_t);
755 } else {
756resize:
757 *size = P2ROUNDUP(sizeof(spl_kmem_slab_t), align) +
758 *objs * (P2ROUNDUP(skc->skc_obj_size, align) +
759 P2ROUNDUP(sizeof(spl_kmem_obj_t), align));
760
761 if (*size > max)
762 GOTO(resize, *objs = *objs - 1);
763
764 ASSERT(*objs > 0);
765 }
766
 767 ASSERTF(*size <= max, "%d <= %d\n", *size, max);
768 RETURN(0);
769}
770
4afaaefa 771static int
772spl_magazine_size(spl_kmem_cache_t *skc)
773{
48e0606a 774 int size, align = skc->skc_obj_align;
4afaaefa 775 ENTRY;
776
 777 /* Guesses for reasonable magazine sizes; they
778 * should really adapt based on observed usage. */
48e0606a 779 if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE * 256))
4afaaefa 780 size = 4;
48e0606a 781 else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE * 32))
4afaaefa 782 size = 16;
48e0606a 783 else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE))
ff449ac4 784 size = 64;
48e0606a 785 else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE / 4))
ff449ac4 786 size = 128;
4afaaefa 787 else
ff449ac4 788 size = 512;
4afaaefa 789
790 RETURN(size);
791}
792
793static spl_kmem_magazine_t *
794spl_magazine_alloc(spl_kmem_cache_t *skc, int node)
795{
796 spl_kmem_magazine_t *skm;
797 int size = sizeof(spl_kmem_magazine_t) +
798 sizeof(void *) * skc->skc_mag_size;
799 ENTRY;
800
3d061e9d 801 skm = kmem_alloc_node(size, GFP_KERNEL, node);
4afaaefa 802 if (skm) {
803 skm->skm_magic = SKM_MAGIC;
804 skm->skm_avail = 0;
805 skm->skm_size = skc->skc_mag_size;
806 skm->skm_refill = skc->skc_mag_refill;
a1502d76 807 if (!(skc->skc_flags & KMC_NOTOUCH))
808 skm->skm_age = jiffies;
4afaaefa 809 }
810
811 RETURN(skm);
812}
813
814static void
815spl_magazine_free(spl_kmem_magazine_t *skm)
816{
a0f6da3d 817 int size = sizeof(spl_kmem_magazine_t) +
818 sizeof(void *) * skm->skm_size;
819
4afaaefa 820 ENTRY;
821 ASSERT(skm->skm_magic == SKM_MAGIC);
822 ASSERT(skm->skm_avail == 0);
a0f6da3d 823
824 kmem_free(skm, size);
4afaaefa 825 EXIT;
826}
827
828static int
829spl_magazine_create(spl_kmem_cache_t *skc)
830{
831 int i;
832 ENTRY;
833
834 skc->skc_mag_size = spl_magazine_size(skc);
835 skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
836
837 for_each_online_cpu(i) {
838 skc->skc_mag[i] = spl_magazine_alloc(skc, cpu_to_node(i));
839 if (!skc->skc_mag[i]) {
840 for (i--; i >= 0; i--)
841 spl_magazine_free(skc->skc_mag[i]);
842
843 RETURN(-ENOMEM);
844 }
845 }
846
847 RETURN(0);
848}
849
850static void
851spl_magazine_destroy(spl_kmem_cache_t *skc)
852{
853 spl_kmem_magazine_t *skm;
854 int i;
855 ENTRY;
856
857 for_each_online_cpu(i) {
858 skm = skc->skc_mag[i];
859 (void)spl_cache_flush(skc, skm, skm->skm_avail);
860 spl_magazine_free(skm);
861 }
862
863 EXIT;
864}
865
2fb9b26a 866spl_kmem_cache_t *
867spl_kmem_cache_create(char *name, size_t size, size_t align,
868 spl_kmem_ctor_t ctor,
869 spl_kmem_dtor_t dtor,
870 spl_kmem_reclaim_t reclaim,
871 void *priv, void *vmp, int flags)
872{
873 spl_kmem_cache_t *skc;
a1502d76 874 int rc, kmem_flags = KM_SLEEP;
2fb9b26a 875 ENTRY;
937879f1 876
a1502d76 877 ASSERTF(!(flags & KMC_NOMAGAZINE), "Bad KMC_NOMAGAZINE (%x)\n", flags);
878 ASSERTF(!(flags & KMC_NOHASH), "Bad KMC_NOHASH (%x)\n", flags);
879 ASSERTF(!(flags & KMC_QCACHE), "Bad KMC_QCACHE (%x)\n", flags);
48e0606a 880 ASSERT(vmp == NULL);
a1502d76 881
2fb9b26a 882 /* We may be called when there is a non-zero preempt_count or
 883 * interrupts are disabled in which case we must not sleep.
884 */
e9d7a2be 885 if (current_thread_info()->preempt_count || irqs_disabled())
2fb9b26a 886 kmem_flags = KM_NOSLEEP;
0a6fd143 887
2fb9b26a 888 /* Allocate new cache memory and initialize. */
ff449ac4 889 skc = (spl_kmem_cache_t *)kmem_zalloc(sizeof(*skc), kmem_flags);
e9d7a2be 890 if (skc == NULL)
2fb9b26a 891 RETURN(NULL);
d61e12af 892
2fb9b26a 893 skc->skc_magic = SKC_MAGIC;
2fb9b26a 894 skc->skc_name_size = strlen(name) + 1;
895 skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, kmem_flags);
896 if (skc->skc_name == NULL) {
897 kmem_free(skc, sizeof(*skc));
898 RETURN(NULL);
899 }
900 strncpy(skc->skc_name, name, skc->skc_name_size);
901
e9d7a2be 902 skc->skc_ctor = ctor;
903 skc->skc_dtor = dtor;
904 skc->skc_reclaim = reclaim;
2fb9b26a 905 skc->skc_private = priv;
906 skc->skc_vmp = vmp;
907 skc->skc_flags = flags;
908 skc->skc_obj_size = size;
48e0606a 909 skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
2fb9b26a 910 skc->skc_delay = SPL_KMEM_CACHE_DELAY;
911
2fb9b26a 912 INIT_LIST_HEAD(&skc->skc_list);
913 INIT_LIST_HEAD(&skc->skc_complete_list);
914 INIT_LIST_HEAD(&skc->skc_partial_list);
d46630e0 915 spin_lock_init(&skc->skc_lock);
e9d7a2be 916 skc->skc_slab_fail = 0;
917 skc->skc_slab_create = 0;
918 skc->skc_slab_destroy = 0;
2fb9b26a 919 skc->skc_slab_total = 0;
920 skc->skc_slab_alloc = 0;
921 skc->skc_slab_max = 0;
922 skc->skc_obj_total = 0;
923 skc->skc_obj_alloc = 0;
924 skc->skc_obj_max = 0;
a1502d76 925
48e0606a 926 if (align) {
927 ASSERT((align & (align - 1)) == 0); /* Power of two */
928 ASSERT(align >= SPL_KMEM_CACHE_ALIGN); /* Minimum size */
929 skc->skc_obj_align = align;
930 }
931
a1502d76 932 /* If none passed select a cache type based on object size */
933 if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM))) {
48e0606a 934 if (P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align) <
935 (PAGE_SIZE / 8)) {
a1502d76 936 skc->skc_flags |= KMC_KMEM;
937 } else {
938 skc->skc_flags |= KMC_VMEM;
939 }
940 }
941
48e0606a 942 rc = spl_slab_size(skc, &skc->skc_slab_objs, &skc->skc_slab_size);
943 if (rc)
944 GOTO(out, rc);
4afaaefa 945
946 rc = spl_magazine_create(skc);
48e0606a 947 if (rc)
948 GOTO(out, rc);
2fb9b26a 949
950 down_write(&spl_kmem_cache_sem);
e9d7a2be 951 list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
2fb9b26a 952 up_write(&spl_kmem_cache_sem);
953
e9d7a2be 954 RETURN(skc);
48e0606a 955out:
956 kmem_free(skc->skc_name, skc->skc_name_size);
957 kmem_free(skc, sizeof(*skc));
958 RETURN(NULL);
f1ca4da6 959}
2fb9b26a 960EXPORT_SYMBOL(spl_kmem_cache_create);
f1ca4da6 961
2fb9b26a 962void
963spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
f1ca4da6 964{
2fb9b26a 965 spl_kmem_slab_t *sks, *m;
966 ENTRY;
f1ca4da6 967
e9d7a2be 968 ASSERT(skc->skc_magic == SKC_MAGIC);
969
970 down_write(&spl_kmem_cache_sem);
971 list_del_init(&skc->skc_list);
972 up_write(&spl_kmem_cache_sem);
2fb9b26a 973
4afaaefa 974 spl_magazine_destroy(skc);
d46630e0 975 spin_lock(&skc->skc_lock);
d6a26c6a 976
2fb9b26a 977 /* Validate there are no objects in use and free all the
4afaaefa 978 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
2fb9b26a 979 ASSERT(list_empty(&skc->skc_complete_list));
a1502d76 980 ASSERT(skc->skc_slab_alloc == 0);
981 ASSERT(skc->skc_obj_alloc == 0);
d6a26c6a 982
e9d7a2be 983 list_for_each_entry_safe(sks, m, &skc->skc_partial_list, sks_list)
4afaaefa 984 spl_slab_free(sks);
2fb9b26a 985
a1502d76 986 ASSERT(skc->skc_slab_total == 0);
987 ASSERT(skc->skc_obj_total == 0);
988
2fb9b26a 989 kmem_free(skc->skc_name, skc->skc_name_size);
d46630e0 990 spin_unlock(&skc->skc_lock);
ff449ac4 991
4afaaefa 992 kmem_free(skc, sizeof(*skc));
2fb9b26a 993
994 EXIT;
f1ca4da6 995}
2fb9b26a 996EXPORT_SYMBOL(spl_kmem_cache_destroy);
f1ca4da6 997
4afaaefa 998static void *
999spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
f1ca4da6 1000{
2fb9b26a 1001 spl_kmem_obj_t *sko;
f1ca4da6 1002
e9d7a2be 1003 ASSERT(skc->skc_magic == SKC_MAGIC);
1004 ASSERT(sks->sks_magic == SKS_MAGIC);
4afaaefa 1005 ASSERT(spin_is_locked(&skc->skc_lock));
2fb9b26a 1006
a1502d76 1007 sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
4afaaefa 1008 ASSERT(sko->sko_magic == SKO_MAGIC);
1009 ASSERT(sko->sko_addr != NULL);
2fb9b26a 1010
a1502d76 1011 /* Remove from sks_free_list */
4afaaefa 1012 list_del_init(&sko->sko_list);
2fb9b26a 1013
4afaaefa 1014 sks->sks_age = jiffies;
1015 sks->sks_ref++;
1016 skc->skc_obj_alloc++;
2fb9b26a 1017
4afaaefa 1018 /* Track max obj usage statistics */
1019 if (skc->skc_obj_alloc > skc->skc_obj_max)
1020 skc->skc_obj_max = skc->skc_obj_alloc;
2fb9b26a 1021
4afaaefa 1022 /* Track max slab usage statistics */
1023 if (sks->sks_ref == 1) {
1024 skc->skc_slab_alloc++;
f1ca4da6 1025
4afaaefa 1026 if (skc->skc_slab_alloc > skc->skc_slab_max)
1027 skc->skc_slab_max = skc->skc_slab_alloc;
2fb9b26a 1028 }
1029
4afaaefa 1030 return sko->sko_addr;
1031}
c30df9c8 1032
4afaaefa 1033/* No available objects, create a new slab. Since this is an
1034 * expensive operation we do it without holding the spinlock
 1035 * and only briefly acquire it when we link in the fully
1036 * allocated and constructed slab.
1037 */
1038static spl_kmem_slab_t *
1039spl_cache_grow(spl_kmem_cache_t *skc, int flags)
1040{
e9d7a2be 1041 spl_kmem_slab_t *sks;
4afaaefa 1042 ENTRY;
f1ca4da6 1043
e9d7a2be 1044 ASSERT(skc->skc_magic == SKC_MAGIC);
1045
1046 if (flags & __GFP_WAIT) {
fece7c99 1047 flags |= __GFP_NOFAIL;
4afaaefa 1048 local_irq_enable();
f78a933f 1049 might_sleep();
4afaaefa 1050 }
f1ca4da6 1051
4afaaefa 1052 sks = spl_slab_alloc(skc, flags);
1053 if (sks == NULL) {
1054 if (flags & __GFP_WAIT)
1055 local_irq_disable();
1056
1057 RETURN(NULL);
1058 }
2fb9b26a 1059
e9d7a2be 1060 if (flags & __GFP_WAIT)
4afaaefa 1061 local_irq_disable();
1062
1063 /* Link the new empty slab in to the end of skc_partial_list */
d46630e0 1064 spin_lock(&skc->skc_lock);
2fb9b26a 1065 skc->skc_slab_total++;
1066 skc->skc_obj_total += sks->sks_objs;
1067 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
d46630e0 1068 spin_unlock(&skc->skc_lock);
4afaaefa 1069
1070 RETURN(sks);
f1ca4da6 1071}
1072
4afaaefa 1073static int
1074spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
f1ca4da6 1075{
e9d7a2be 1076 spl_kmem_slab_t *sks;
1077 int rc = 0, refill;
937879f1 1078 ENTRY;
f1ca4da6 1079
e9d7a2be 1080 ASSERT(skc->skc_magic == SKC_MAGIC);
1081 ASSERT(skm->skm_magic == SKM_MAGIC);
1082
4afaaefa 1083 /* XXX: Check for refill bouncing by age perhaps */
e9d7a2be 1084 refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
4afaaefa 1085
d46630e0 1086 spin_lock(&skc->skc_lock);
ff449ac4 1087
4afaaefa 1088 while (refill > 0) {
1089 /* No slabs available we must grow the cache */
1090 if (list_empty(&skc->skc_partial_list)) {
1091 spin_unlock(&skc->skc_lock);
ff449ac4 1092
4afaaefa 1093 sks = spl_cache_grow(skc, flags);
1094 if (!sks)
e9d7a2be 1095 GOTO(out, rc);
4afaaefa 1096
1097 /* Rescheduled to different CPU skm is not local */
1098 if (skm != skc->skc_mag[smp_processor_id()])
e9d7a2be 1099 GOTO(out, rc);
1100
1101 /* Potentially rescheduled to the same CPU but
 1102 * allocations may have occurred from this CPU while
1103 * we were sleeping so recalculate max refill. */
1104 refill = MIN(refill, skm->skm_size - skm->skm_avail);
4afaaefa 1105
1106 spin_lock(&skc->skc_lock);
1107 continue;
1108 }
d46630e0 1109
4afaaefa 1110 /* Grab the next available slab */
1111 sks = list_entry((&skc->skc_partial_list)->next,
1112 spl_kmem_slab_t, sks_list);
1113 ASSERT(sks->sks_magic == SKS_MAGIC);
1114 ASSERT(sks->sks_ref < sks->sks_objs);
1115 ASSERT(!list_empty(&sks->sks_free_list));
d46630e0 1116
4afaaefa 1117 /* Consume as many objects as needed to refill the requested
e9d7a2be 1118 * cache. We must also be careful not to overfill it. */
1119 while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++rc) {
1120 ASSERT(skm->skm_avail < skm->skm_size);
1121 ASSERT(rc < skm->skm_size);
4afaaefa 1122 skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks);
e9d7a2be 1123 }
f1ca4da6 1124
4afaaefa 1125 /* Move slab to skc_complete_list when full */
1126 if (sks->sks_ref == sks->sks_objs) {
1127 list_del(&sks->sks_list);
1128 list_add(&sks->sks_list, &skc->skc_complete_list);
2fb9b26a 1129 }
1130 }
57d86234 1131
4afaaefa 1132 spin_unlock(&skc->skc_lock);
1133out:
1134 /* Returns the number of entries added to cache */
e9d7a2be 1135 RETURN(rc);
4afaaefa 1136}
1137
1138static void
1139spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
1140{
e9d7a2be 1141 spl_kmem_slab_t *sks = NULL;
4afaaefa 1142 spl_kmem_obj_t *sko = NULL;
1143 ENTRY;
1144
e9d7a2be 1145 ASSERT(skc->skc_magic == SKC_MAGIC);
4afaaefa 1146 ASSERT(spin_is_locked(&skc->skc_lock));
1147
48e0606a 1148 sko = obj + P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align);
a1502d76 1149 ASSERT(sko->sko_magic == SKO_MAGIC);
4afaaefa 1150
1151 sks = sko->sko_slab;
a1502d76 1152 ASSERT(sks->sks_magic == SKS_MAGIC);
2fb9b26a 1153 ASSERT(sks->sks_cache == skc);
2fb9b26a 1154 list_add(&sko->sko_list, &sks->sks_free_list);
d6a26c6a 1155
2fb9b26a 1156 sks->sks_age = jiffies;
4afaaefa 1157 sks->sks_ref--;
2fb9b26a 1158 skc->skc_obj_alloc--;
f1ca4da6 1159
2fb9b26a 1160 /* Move slab to skc_partial_list when no longer full. Slabs
4afaaefa 1161 * are added to the head to keep the partial list in quasi-full
1162 * sorted order. Fuller at the head, emptier at the tail. */
1163 if (sks->sks_ref == (sks->sks_objs - 1)) {
2fb9b26a 1164 list_del(&sks->sks_list);
1165 list_add(&sks->sks_list, &skc->skc_partial_list);
1166 }
f1ca4da6 1167
2fb9b26a 1168 /* Move empty slabs to the end of the partial list so
4afaaefa 1169 * they can be easily found and freed during reclamation. */
1170 if (sks->sks_ref == 0) {
2fb9b26a 1171 list_del(&sks->sks_list);
1172 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1173 skc->skc_slab_alloc--;
1174 }
1175
4afaaefa 1176 EXIT;
1177}
1178
1179static int
1180spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
1181{
1182 int i, count = MIN(flush, skm->skm_avail);
1183 ENTRY;
1184
e9d7a2be 1185 ASSERT(skc->skc_magic == SKC_MAGIC);
1186 ASSERT(skm->skm_magic == SKM_MAGIC);
4afaaefa 1187
1188 spin_lock(&skc->skc_lock);
ff449ac4 1189
4afaaefa 1190 for (i = 0; i < count; i++)
1191 spl_cache_shrink(skc, skm->skm_objs[i]);
1192
e9d7a2be 1193// __spl_slab_reclaim(skc);
1194 skm->skm_avail -= count;
1195 memmove(skm->skm_objs, &(skm->skm_objs[count]),
4afaaefa 1196 sizeof(void *) * skm->skm_avail);
1197
d46630e0 1198 spin_unlock(&skc->skc_lock);
4afaaefa 1199
1200 RETURN(count);
1201}
1202
1203void *
1204spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
1205{
1206 spl_kmem_magazine_t *skm;
1207 unsigned long irq_flags;
1208 void *obj = NULL;
1209 ENTRY;
1210
e9d7a2be 1211 ASSERT(skc->skc_magic == SKC_MAGIC);
1212 ASSERT(flags & KM_SLEEP); /* XXX: KM_NOSLEEP not yet supported */
4afaaefa 1213 local_irq_save(irq_flags);
1214
1215restart:
1216 /* Safe to update per-cpu structure without lock, but
 1217 * in the restart case we must be careful to reacquire
1218 * the local magazine since this may have changed
1219 * when we need to grow the cache. */
1220 skm = skc->skc_mag[smp_processor_id()];
e9d7a2be 1221 ASSERTF(skm->skm_magic == SKM_MAGIC, "%x != %x: %s/%p/%p %x/%x/%x\n",
1222 skm->skm_magic, SKM_MAGIC, skc->skc_name, skc, skm,
1223 skm->skm_size, skm->skm_refill, skm->skm_avail);
4afaaefa 1224
1225 if (likely(skm->skm_avail)) {
1226 /* Object available in CPU cache, use it */
1227 obj = skm->skm_objs[--skm->skm_avail];
a1502d76 1228 if (!(skc->skc_flags & KMC_NOTOUCH))
1229 skm->skm_age = jiffies;
4afaaefa 1230 } else {
1231 /* Per-CPU cache empty, directly allocate from
1232 * the slab and refill the per-CPU cache. */
1233 (void)spl_cache_refill(skc, skm, flags);
1234 GOTO(restart, obj = NULL);
1235 }
1236
1237 local_irq_restore(irq_flags);
fece7c99 1238 ASSERT(obj);
48e0606a 1239 ASSERT(((unsigned long)(obj) % skc->skc_obj_align) == 0);
4afaaefa 1240
1241 /* Pre-emptively migrate object to CPU L1 cache */
1242 prefetchw(obj);
1243
1244 RETURN(obj);
1245}
1246EXPORT_SYMBOL(spl_kmem_cache_alloc);
1247
1248void
1249spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
1250{
1251 spl_kmem_magazine_t *skm;
1252 unsigned long flags;
1253 ENTRY;
1254
e9d7a2be 1255 ASSERT(skc->skc_magic == SKC_MAGIC);
4afaaefa 1256 local_irq_save(flags);
1257
1258 /* Safe to update per-cpu structure without lock, but
 1259 * since no remote memory allocation tracking is being performed
1260 * it is entirely possible to allocate an object from one
1261 * CPU cache and return it to another. */
1262 skm = skc->skc_mag[smp_processor_id()];
e9d7a2be 1263 ASSERT(skm->skm_magic == SKM_MAGIC);
4afaaefa 1264
1265 /* Per-CPU cache full, flush it to make space */
1266 if (unlikely(skm->skm_avail >= skm->skm_size))
1267 (void)spl_cache_flush(skc, skm, skm->skm_refill);
1268
1269 /* Available space in cache, use it */
1270 skm->skm_objs[skm->skm_avail++] = obj;
1271
1272 local_irq_restore(flags);
1273
1274 EXIT;
f1ca4da6 1275}
2fb9b26a 1276EXPORT_SYMBOL(spl_kmem_cache_free);
5c2bb9b2 1277
2fb9b26a 1278static int
4afaaefa 1279spl_kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask)
2fb9b26a 1280{
e9d7a2be 1281 spl_kmem_cache_t *skc;
5c2bb9b2 1282
2fb9b26a 1283 /* Under linux a shrinker is not tightly coupled with a slab
 1284 * cache. In fact linux always systematically tries calling all
1285 * registered shrinker callbacks until its target reclamation level
1286 * is reached. Because of this we only register one shrinker
1287 * function in the shim layer for all slab caches. And we always
1288 * attempt to shrink all caches when this generic shrinker is called.
c30df9c8 1289 */
e9d7a2be 1290 down_read(&spl_kmem_cache_sem);
57d86234 1291
e9d7a2be 1292 list_for_each_entry(skc, &spl_kmem_cache_list, skc_list)
2fb9b26a 1293 spl_kmem_cache_reap_now(skc);
1294
e9d7a2be 1295 up_read(&spl_kmem_cache_sem);
2fb9b26a 1296
 1297 /* XXX: Under linux a shrinker is expected to return the remaining
 1298 * number of entries in the cache; we should do this as well.
1299 */
1300 return 1;
5c2bb9b2 1301}
5c2bb9b2 1302
57d86234 1303void
2fb9b26a 1304spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
57d86234 1305{
4afaaefa 1306 spl_kmem_magazine_t *skm;
1307 int i;
2fb9b26a 1308 ENTRY;
e9d7a2be 1309
1310 ASSERT(skc->skc_magic == SKC_MAGIC);
2fb9b26a 1311
1312 if (skc->skc_reclaim)
1313 skc->skc_reclaim(skc->skc_private);
1314
4afaaefa 1315 /* Ensure per-CPU caches which are idle gradually flush */
1316 for_each_online_cpu(i) {
1317 skm = skc->skc_mag[i];
1318
1319 if (time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
1320 (void)spl_cache_flush(skc, skm, skm->skm_refill);
1321 }
1322
1323 spl_slab_reclaim(skc);
1324
2fb9b26a 1325 EXIT;
57d86234 1326}
2fb9b26a 1327EXPORT_SYMBOL(spl_kmem_cache_reap_now);
57d86234 1328
f1b59d26 1329void
2fb9b26a 1330spl_kmem_reap(void)
937879f1 1331{
4afaaefa 1332 spl_kmem_cache_generic_shrinker(KMC_REAP_CHUNK, GFP_KERNEL);
f1ca4da6 1333}
2fb9b26a 1334EXPORT_SYMBOL(spl_kmem_reap);
5d86345d 1335
ff449ac4 1336#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
c6dc93d6 1337static char *
4afaaefa 1338spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
d6a26c6a 1339{
e9d7a2be 1340 int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
d6a26c6a 1341 int i, flag = 1;
1342
1343 ASSERT(str != NULL && len >= 17);
e9d7a2be 1344 memset(str, 0, len);
d6a26c6a 1345
1346 /* Check for a fully printable string, and while we are at
1347 * it place the printable characters in the passed buffer. */
1348 for (i = 0; i < size; i++) {
e9d7a2be 1349 str[i] = ((char *)(kd->kd_addr))[i];
1350 if (isprint(str[i])) {
1351 continue;
1352 } else {
1353 /* Minimum number of printable characters found
1354 * to make it worthwhile to print this as ascii. */
1355 if (i > min)
1356 break;
1357
1358 flag = 0;
1359 break;
1360 }
d6a26c6a 1361 }
1362
1363 if (!flag) {
1364 sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
1365 *((uint8_t *)kd->kd_addr),
1366 *((uint8_t *)kd->kd_addr + 2),
1367 *((uint8_t *)kd->kd_addr + 4),
1368 *((uint8_t *)kd->kd_addr + 6),
1369 *((uint8_t *)kd->kd_addr + 8),
1370 *((uint8_t *)kd->kd_addr + 10),
1371 *((uint8_t *)kd->kd_addr + 12),
1372 *((uint8_t *)kd->kd_addr + 14));
1373 }
1374
1375 return str;
1376}
1377
a1502d76 1378static int
1379spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
1380{
1381 int i;
1382 ENTRY;
1383
1384 spin_lock_init(lock);
1385 INIT_LIST_HEAD(list);
1386
1387 for (i = 0; i < size; i++)
1388 INIT_HLIST_HEAD(&kmem_table[i]);
1389
1390 RETURN(0);
1391}
1392
ff449ac4 1393static void
1394spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
5d86345d 1395{
2fb9b26a 1396 unsigned long flags;
1397 kmem_debug_t *kd;
1398 char str[17];
a1502d76 1399 ENTRY;
2fb9b26a 1400
ff449ac4 1401 spin_lock_irqsave(lock, flags);
1402 if (!list_empty(list))
a0f6da3d 1403 printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
1404 "size", "data", "func", "line");
2fb9b26a 1405
ff449ac4 1406 list_for_each_entry(kd, list, kd_list)
a0f6da3d 1407 printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
b6b2acc6 1408 (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
2fb9b26a 1409 kd->kd_func, kd->kd_line);
1410
ff449ac4 1411 spin_unlock_irqrestore(lock, flags);
a1502d76 1412 EXIT;
ff449ac4 1413}
1414#else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
a1502d76 1415#define spl_kmem_init_tracking(list, lock, size)
ff449ac4 1416#define spl_kmem_fini_tracking(list, lock)
1417#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
1418
a1502d76 1419int
1420spl_kmem_init(void)
1421{
1422 int rc = 0;
1423 ENTRY;
1424
1425 init_rwsem(&spl_kmem_cache_sem);
1426 INIT_LIST_HEAD(&spl_kmem_cache_list);
1427
1428#ifdef HAVE_SET_SHRINKER
1429 spl_kmem_cache_shrinker = set_shrinker(KMC_DEFAULT_SEEKS,
1430 spl_kmem_cache_generic_shrinker);
1431 if (spl_kmem_cache_shrinker == NULL)
f78a933f 1432 RETURN(rc = -ENOMEM);
a1502d76 1433#else
1434 register_shrinker(&spl_kmem_cache_shrinker);
1435#endif
1436
1437#ifdef DEBUG_KMEM
1438 atomic64_set(&kmem_alloc_used, 0);
1439 atomic64_set(&vmem_alloc_used, 0);
1440
1441 spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
1442 spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE);
1443#endif
a1502d76 1444 RETURN(rc);
1445}
1446
ff449ac4 1447void
1448spl_kmem_fini(void)
1449{
1450#ifdef DEBUG_KMEM
1451 /* Display all unreclaimed memory addresses, including the
1452 * allocation size and the first few bytes of what's located
1453 * at that address to aid in debugging. Performance is not
1454 * a serious concern here since it is module unload time. */
1455 if (atomic64_read(&kmem_alloc_used) != 0)
1456 CWARN("kmem leaked %ld/%ld bytes\n",
550f1705 1457 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
ff449ac4 1458
2fb9b26a 1459
1460 if (atomic64_read(&vmem_alloc_used) != 0)
1461 CWARN("vmem leaked %ld/%ld bytes\n",
550f1705 1462 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
2fb9b26a 1463
ff449ac4 1464 spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
1465 spl_kmem_fini_tracking(&vmem_list, &vmem_lock);
1466#endif /* DEBUG_KMEM */
2fb9b26a 1467 ENTRY;
1468
1469#ifdef HAVE_SET_SHRINKER
1470 remove_shrinker(spl_kmem_cache_shrinker);
1471#else
1472 unregister_shrinker(&spl_kmem_cache_shrinker);
5d86345d 1473#endif
2fb9b26a 1474
937879f1 1475 EXIT;
5d86345d 1476}