[mirror_spl-debian.git] / modules / spl / spl-kmem.c
715f6251 1/*
2 * This file is part of the SPL: Solaris Porting Layer.
3 *
4 * Copyright (c) 2008 Lawrence Livermore National Security, LLC.
5 * Produced at Lawrence Livermore National Laboratory
6 * Written by:
7 * Brian Behlendorf <behlendorf1@llnl.gov>,
8 * Herb Wartens <wartens2@llnl.gov>,
9 * Jim Garlick <garlick@llnl.gov>
10 * UCRL-CODE-235197
11 *
12 * This is free software; you can redistribute it and/or modify it
13 * under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This is distributed in the hope that it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
20 * for more details.
21 *
22 * You should have received a copy of the GNU General Public License along
23 * with this program; if not, write to the Free Software Foundation, Inc.,
24 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 */
26
f4b37741 27#include <sys/kmem.h>
f1ca4da6 28
937879f1 29#ifdef DEBUG_SUBSYSTEM
a0f6da3d 30# undef DEBUG_SUBSYSTEM
937879f1 31#endif
32
33#define DEBUG_SUBSYSTEM S_KMEM
34
f1ca4da6 35/*
2fb9b26a 36 * Memory allocation interfaces and debugging for basic kmem_*
 37 * and vmem_* style memory allocation. When DEBUG_KMEM is enabled
 38 * all allocations will be tracked when they are allocated and
 39 * freed. When the SPL module is unloaded a list of all leaked
 40 * addresses and where they were allocated will be dumped to the
 41 * console. Enabling this feature has a significant impact on
 42 * performance but it makes finding memory leaks straightforward.
f1ca4da6 43 */
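/*
 * Illustrative sketch of the accounting described above (the calling code
 * is hypothetical): every shim allocation adds its size to kmem_alloc_used
 * and every matching free, which must be passed the original allocation
 * size, subtracts it again.
 *
 *   void *buf = kmem_alloc(128, KM_SLEEP);    (kmem_alloc_used += 128)
 *   ...
 *   kmem_free(buf, 128);                      (kmem_alloc_used -= 128)
 */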
44#ifdef DEBUG_KMEM
45/* Shim layer memory accounting */
550f1705 46atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
a0f6da3d 47unsigned long long kmem_alloc_max = 0;
550f1705 48atomic64_t vmem_alloc_used = ATOMIC64_INIT(0);
a0f6da3d 49unsigned long long vmem_alloc_max = 0;
c19c06f3 50int kmem_warning_flag = 1;
79b31f36 51
ff449ac4 52EXPORT_SYMBOL(kmem_alloc_used);
53EXPORT_SYMBOL(kmem_alloc_max);
54EXPORT_SYMBOL(vmem_alloc_used);
55EXPORT_SYMBOL(vmem_alloc_max);
56EXPORT_SYMBOL(kmem_warning_flag);
57
a0f6da3d 58# ifdef DEBUG_KMEM_TRACKING
59
 60/* XXX - Not too surprisingly, with debugging enabled the xmem_locks are very
 61 * highly contended, particularly on xfree(). If we want to run with this
62 * detailed debugging enabled for anything other than debugging we need to
63 * minimize the contention by moving to a lock per xmem_table entry model.
64 */
65
66# define KMEM_HASH_BITS 10
67# define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS)
68
69# define VMEM_HASH_BITS 10
70# define VMEM_TABLE_SIZE (1 << VMEM_HASH_BITS)
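/*
 * A hypothetical sketch of the lock-per-entry model suggested in the XXX
 * note above (illustrative only, nothing below implements it): each hash
 * bucket would carry its own spinlock so xfree() contention is limited to
 * allocations hashing to the same bucket.
 *
 *   typedef struct kmem_table_entry {
 *           spinlock_t        kte_lock;      per-bucket lock
 *           struct hlist_head kte_head;      bucket chain
 *   } kmem_table_entry_t;
 *
 *   kmem_table_entry_t kmem_table[KMEM_TABLE_SIZE];
 */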
71
72typedef struct kmem_debug {
73 struct hlist_node kd_hlist; /* Hash node linkage */
74 struct list_head kd_list; /* List of all allocations */
75 void *kd_addr; /* Allocation pointer */
76 size_t kd_size; /* Allocation size */
77 const char *kd_func; /* Allocation function */
78 int kd_line; /* Allocation line */
79} kmem_debug_t;
80
d6a26c6a 81spinlock_t kmem_lock;
82struct hlist_head kmem_table[KMEM_TABLE_SIZE];
83struct list_head kmem_list;
84
13cdca65 85spinlock_t vmem_lock;
86struct hlist_head vmem_table[VMEM_TABLE_SIZE];
87struct list_head vmem_list;
88
d6a26c6a 89EXPORT_SYMBOL(kmem_lock);
90EXPORT_SYMBOL(kmem_table);
91EXPORT_SYMBOL(kmem_list);
92
13cdca65 93EXPORT_SYMBOL(vmem_lock);
94EXPORT_SYMBOL(vmem_table);
95EXPORT_SYMBOL(vmem_list);
a0f6da3d 96# endif
13cdca65 97
c19c06f3 98int kmem_set_warning(int flag) { return (kmem_warning_flag = !!flag); }
99#else
100int kmem_set_warning(int flag) { return 0; }
f1ca4da6 101#endif
c19c06f3 102EXPORT_SYMBOL(kmem_set_warning);
f1ca4da6 103
104/*
105 * Slab allocation interfaces
106 *
2fb9b26a 107 * While the Linux slab implementation was inspired by the Solaris
 108 * implementation I cannot use it to emulate the Solaris APIs. I
109 * require two features which are not provided by the Linux slab.
110 *
111 * 1) Constructors AND destructors. Recent versions of the Linux
112 * kernel have removed support for destructors. This is a deal
113 * breaker for the SPL which contains particularly expensive
 114 * initializers for mutexes, condition variables, etc. We also
a0f6da3d 115 * require a minimal level of cleanup for these data types unlike
 116 * many Linux data types which do not need to be explicitly destroyed.
2fb9b26a 117 *
a0f6da3d 118 * 2) Virtual address space backed slab. Callers of the Solaris slab
2fb9b26a 119 * expect it to work well for both small and very large allocations.
120 * Because of memory fragmentation the Linux slab which is backed
121 * by kmalloc'ed memory performs very badly when confronted with
122 * large numbers of large allocations. Basing the slab on the
 123 * virtual address space removes the need for contiguous pages
 124 * and greatly improves performance for large allocations.
125 *
126 * For these reasons, the SPL has its own slab implementation with
127 * the needed features. It is not as highly optimized as either the
128 * Solaris or Linux slabs, but it should get me most of what is
129 * needed until it can be optimized or obsoleted by another approach.
130 *
131 * One serious concern I do have about this method is the relatively
132 * small virtual address space on 32bit arches. This will seriously
133 * constrain the size of the slab caches and their performance.
134 *
2fb9b26a 135 * XXX: Implement work requests to keep an eye on each cache and
4afaaefa 136 * shrink them via spl_slab_reclaim() when they are wasting lots
2fb9b26a 137 * of space. Currently this process is driven by the reapers.
138 *
2fb9b26a 139 * XXX: Improve the partial slab list by carefully maintaining a
140 * strict ordering of fullest to emptiest slabs based on
 141 * the slab reference count. This guarantees that when freeing
142 * slabs back to the system we need only linearly traverse the
143 * last N slabs in the list to discover all the freeable slabs.
144 *
145 * XXX: NUMA awareness for optionally allocating memory close to a
 146 * particular core. This can be advantageous if you know the slab
147 * object will be short lived and primarily accessed from one core.
148 *
149 * XXX: Slab coloring may also yield performance improvements and would
150 * be desirable to implement.
4afaaefa 151 *
152 * XXX: Proper hardware cache alignment would be good too.
f1ca4da6 153 */
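/*
 * Minimal usage sketch of the cache interfaces implemented below.  The
 * consumer code and the my_obj_t/my_ctor/my_dtor names are hypothetical,
 * and the constructor/destructor prototypes are assumptions based on how
 * skc_ctor and skc_dtor are invoked from spl_slab_alloc()/spl_slab_free().
 *
 *   typedef struct my_obj {
 *           int mo_state;
 *   } my_obj_t;
 *
 *   static int
 *   my_ctor(void *buf, void *priv, int flags)
 *   {
 *           ((my_obj_t *)buf)->mo_state = 1;
 *           return 0;
 *   }
 *
 *   static void
 *   my_dtor(void *buf, void *priv)
 *   {
 *           ((my_obj_t *)buf)->mo_state = 0;
 *   }
 *
 *   skc = spl_kmem_cache_create("my_cache", sizeof(my_obj_t), 0,
 *                               my_ctor, my_dtor, NULL, NULL, NULL, 0);
 *   obj = spl_kmem_cache_alloc(skc, KM_SLEEP);
 *   ...
 *   spl_kmem_cache_free(skc, obj);
 *   spl_kmem_cache_destroy(skc);
 */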
2fb9b26a 154
a0f6da3d 155struct list_head spl_kmem_cache_list; /* List of caches */
156struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
c30df9c8 157
4afaaefa 158static int spl_cache_flush(spl_kmem_cache_t *skc,
a0f6da3d 159 spl_kmem_magazine_t *skm, int flush);
4afaaefa 160
57d86234 161#ifdef HAVE_SET_SHRINKER
2fb9b26a 162static struct shrinker *spl_kmem_cache_shrinker;
57d86234 163#else
4afaaefa 164static int spl_kmem_cache_generic_shrinker(int nr_to_scan,
a0f6da3d 165 unsigned int gfp_mask);
2fb9b26a 166static struct shrinker spl_kmem_cache_shrinker = {
4afaaefa 167 .shrink = spl_kmem_cache_generic_shrinker,
57d86234 168 .seeks = KMC_DEFAULT_SEEKS,
169};
170#endif
f1ca4da6 171
a0f6da3d 172#ifdef DEBUG_KMEM
173# ifdef DEBUG_KMEM_TRACKING
174
175static kmem_debug_t *
176kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits,
177 void *addr)
178{
179 struct hlist_head *head;
180 struct hlist_node *node;
181 struct kmem_debug *p;
182 unsigned long flags;
183 ENTRY;
184
185 spin_lock_irqsave(lock, flags);
186
187 head = &table[hash_ptr(addr, bits)];
188 hlist_for_each_entry_rcu(p, node, head, kd_hlist) {
189 if (p->kd_addr == addr) {
190 hlist_del_init(&p->kd_hlist);
191 list_del_init(&p->kd_list);
192 spin_unlock_irqrestore(lock, flags);
193 return p;
194 }
195 }
196
197 spin_unlock_irqrestore(lock, flags);
198
199 RETURN(NULL);
200}
201
202void *
203kmem_alloc_track(size_t size, int flags, const char *func, int line,
204 int node_alloc, int node)
205{
206 void *ptr = NULL;
207 kmem_debug_t *dptr;
208 unsigned long irq_flags;
209 ENTRY;
210
211 dptr = (kmem_debug_t *) kmalloc(sizeof(kmem_debug_t),
212 flags & ~__GFP_ZERO);
213
214 if (dptr == NULL) {
215 CWARN("kmem_alloc(%ld, 0x%x) debug failed\n",
216 sizeof(kmem_debug_t), flags);
217 } else {
218 /* Marked unlikely because we should never be doing this,
 219 * we tolerate up to 2 pages but a single page is best. */
220 if (unlikely((size) > (PAGE_SIZE * 2)) && kmem_warning_flag)
221 CWARN("Large kmem_alloc(%llu, 0x%x) (%lld/%llu)\n",
222 (unsigned long long) size, flags,
223 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
224
225 /* Use the correct allocator */
226 if (node_alloc) {
227 ASSERT(!(flags & __GFP_ZERO));
228 ptr = kmalloc_node(size, flags, node);
229 } else if (flags & __GFP_ZERO) {
230 ptr = kzalloc(size, flags & ~__GFP_ZERO);
231 } else {
232 ptr = kmalloc(size, flags);
233 }
234
235 if (unlikely(ptr == NULL)) {
236 kfree(dptr);
237 CWARN("kmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
238 (unsigned long long) size, flags,
239 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
240 goto out;
241 }
242
243 atomic64_add(size, &kmem_alloc_used);
244 if (unlikely(atomic64_read(&kmem_alloc_used) >
245 kmem_alloc_max))
246 kmem_alloc_max =
247 atomic64_read(&kmem_alloc_used);
248
249 INIT_HLIST_NODE(&dptr->kd_hlist);
250 INIT_LIST_HEAD(&dptr->kd_list);
251
252 dptr->kd_addr = ptr;
253 dptr->kd_size = size;
254 dptr->kd_func = func;
255 dptr->kd_line = line;
256
257 spin_lock_irqsave(&kmem_lock, irq_flags);
258 hlist_add_head_rcu(&dptr->kd_hlist,
259 &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
260 list_add_tail(&dptr->kd_list, &kmem_list);
261 spin_unlock_irqrestore(&kmem_lock, irq_flags);
262
263 CDEBUG_LIMIT(D_INFO, "kmem_alloc(%llu, 0x%x) = %p "
264 "(%lld/%llu)\n", (unsigned long long) size, flags,
265 ptr, atomic64_read(&kmem_alloc_used),
266 kmem_alloc_max);
267 }
268out:
269 RETURN(ptr);
270}
271EXPORT_SYMBOL(kmem_alloc_track);
272
273void
274kmem_free_track(void *ptr, size_t size)
275{
276 kmem_debug_t *dptr;
277 ENTRY;
278
279 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
280 (unsigned long long) size);
281
282 dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
283
284 ASSERT(dptr); /* Must exist in hash due to kmem_alloc() */
285
286 /* Size must match */
287 ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
288 "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
289 (unsigned long long) size, dptr->kd_func, dptr->kd_line);
290
291 atomic64_sub(size, &kmem_alloc_used);
292
293 CDEBUG_LIMIT(D_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
294 (unsigned long long) size, atomic64_read(&kmem_alloc_used),
295 kmem_alloc_max);
296
297 memset(dptr, 0x5a, sizeof(kmem_debug_t));
298 kfree(dptr);
299
300 memset(ptr, 0x5a, size);
301 kfree(ptr);
302
303 EXIT;
304}
305EXPORT_SYMBOL(kmem_free_track);
306
307void *
308vmem_alloc_track(size_t size, int flags, const char *func, int line)
309{
310 void *ptr = NULL;
311 kmem_debug_t *dptr;
312 unsigned long irq_flags;
313 ENTRY;
314
315 ASSERT(flags & KM_SLEEP);
316
317 dptr = (kmem_debug_t *) kmalloc(sizeof(kmem_debug_t), flags);
318 if (dptr == NULL) {
319 CWARN("vmem_alloc(%ld, 0x%x) debug failed\n",
320 sizeof(kmem_debug_t), flags);
321 } else {
322 ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO,
323 PAGE_KERNEL);
324
325 if (unlikely(ptr == NULL)) {
326 kfree(dptr);
327 CWARN("vmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
328 (unsigned long long) size, flags,
329 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
330 goto out;
331 }
332
333 if (flags & __GFP_ZERO)
334 memset(ptr, 0, size);
335
336 atomic64_add(size, &vmem_alloc_used);
337 if (unlikely(atomic64_read(&vmem_alloc_used) >
338 vmem_alloc_max))
339 vmem_alloc_max =
340 atomic64_read(&vmem_alloc_used);
341
342 INIT_HLIST_NODE(&dptr->kd_hlist);
343 INIT_LIST_HEAD(&dptr->kd_list);
344
345 dptr->kd_addr = ptr;
346 dptr->kd_size = size;
347 dptr->kd_func = func;
348 dptr->kd_line = line;
349
350 spin_lock_irqsave(&vmem_lock, irq_flags);
351 hlist_add_head_rcu(&dptr->kd_hlist,
352 &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]);
353 list_add_tail(&dptr->kd_list, &vmem_list);
354 spin_unlock_irqrestore(&vmem_lock, irq_flags);
355
356 CDEBUG_LIMIT(D_INFO, "vmem_alloc(%llu, 0x%x) = %p "
357 "(%lld/%llu)\n", (unsigned long long) size, flags,
358 ptr, atomic64_read(&vmem_alloc_used),
359 vmem_alloc_max);
360 }
361out:
362 RETURN(ptr);
363}
364EXPORT_SYMBOL(vmem_alloc_track);
365
366void
367vmem_free_track(void *ptr, size_t size)
368{
369 kmem_debug_t *dptr;
370 ENTRY;
371
372 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
373 (unsigned long long) size);
374
375 dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr);
376 ASSERT(dptr); /* Must exist in hash due to vmem_alloc() */
377
378 /* Size must match */
379 ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
380 "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
381 (unsigned long long) size, dptr->kd_func, dptr->kd_line);
382
383 atomic64_sub(size, &vmem_alloc_used);
384 CDEBUG_LIMIT(D_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
385 (unsigned long long) size, atomic64_read(&vmem_alloc_used),
386 vmem_alloc_max);
387
388 memset(dptr, 0x5a, sizeof(kmem_debug_t));
389 kfree(dptr);
390
391 memset(ptr, 0x5a, size);
392 vfree(ptr);
393
394 EXIT;
395}
396EXPORT_SYMBOL(vmem_free_track);
397
398# else /* DEBUG_KMEM_TRACKING */
399
400void *
401kmem_alloc_debug(size_t size, int flags, const char *func, int line,
402 int node_alloc, int node)
403{
404 void *ptr;
405 ENTRY;
406
407 /* Marked unlikely because we should never be doing this,
 408 * we tolerate up to 2 pages but a single page is best. */
409 if (unlikely(size > (PAGE_SIZE * 2)) && kmem_warning_flag)
410 CWARN("Large kmem_alloc(%llu, 0x%x) (%lld/%llu)\n",
411 (unsigned long long) size, flags,
412 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
413
414 /* Use the correct allocator */
415 if (node_alloc) {
416 ASSERT(!(flags & __GFP_ZERO));
417 ptr = kmalloc_node(size, flags, node);
418 } else if (flags & __GFP_ZERO) {
419 ptr = kzalloc(size, flags & (~__GFP_ZERO));
420 } else {
421 ptr = kmalloc(size, flags);
422 }
423
424 if (ptr == NULL) {
425 CWARN("kmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
426 (unsigned long long) size, flags,
427 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
428 } else {
429 atomic64_add(size, &kmem_alloc_used);
430 if (unlikely(atomic64_read(&kmem_alloc_used) > kmem_alloc_max))
431 kmem_alloc_max = atomic64_read(&kmem_alloc_used);
432
433 CDEBUG_LIMIT(D_INFO, "kmem_alloc(%llu, 0x%x) = %p "
434 "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
435 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
436 }
437 RETURN(ptr);
438}
439EXPORT_SYMBOL(kmem_alloc_debug);
440
441void
442kmem_free_debug(void *ptr, size_t size)
443{
444 ENTRY;
445
446 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
447 (unsigned long long) size);
448
449 atomic64_sub(size, &kmem_alloc_used);
450
451 CDEBUG_LIMIT(D_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
452 (unsigned long long) size, atomic64_read(&kmem_alloc_used),
453 kmem_alloc_max);
454
455 memset(ptr, 0x5a, size);
456 kfree(ptr);
457
458 EXIT;
459}
460EXPORT_SYMBOL(kmem_free_debug);
461
462void *
463vmem_alloc_debug(size_t size, int flags, const char *func, int line)
464{
465 void *ptr;
466 ENTRY;
467
468 ASSERT(flags & KM_SLEEP);
469
470 ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO,
471 PAGE_KERNEL);
472 if (ptr == NULL) {
473 CWARN("vmem_alloc(%llu, 0x%x) failed (%lld/%llu)\n",
474 (unsigned long long) size, flags,
475 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
476 } else {
477 if (flags & __GFP_ZERO)
478 memset(ptr, 0, size);
479
480 atomic64_add(size, &vmem_alloc_used);
481
482 if (unlikely(atomic64_read(&vmem_alloc_used) > vmem_alloc_max))
483 vmem_alloc_max = atomic64_read(&vmem_alloc_used);
484
485 CDEBUG_LIMIT(D_INFO, "vmem_alloc(%llu, 0x%x) = %p "
486 "(%lld/%llu)\n", (unsigned long long) size, flags, ptr,
487 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
488 }
489
490 RETURN(ptr);
491}
492EXPORT_SYMBOL(vmem_alloc_debug);
493
494void
495vmem_free_debug(void *ptr, size_t size)
496{
497 ENTRY;
498
499 ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
500 (unsigned long long) size);
501
502 atomic64_sub(size, &vmem_alloc_used);
503
504 CDEBUG_LIMIT(D_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
505 (unsigned long long) size, atomic64_read(&vmem_alloc_used),
506 vmem_alloc_max);
507
508 memset(ptr, 0x5a, size);
509 vfree(ptr);
510
511 EXIT;
512}
513EXPORT_SYMBOL(vmem_free_debug);
514
515# endif /* DEBUG_KMEM_TRACKING */
516#endif /* DEBUG_KMEM */
517
a1502d76 518static void *
519kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
fece7c99 520{
a1502d76 521 void *ptr;
f1ca4da6 522
a1502d76 523 if (skc->skc_flags & KMC_KMEM) {
524 if (size > (2 * PAGE_SIZE)) {
525 ptr = (void *)__get_free_pages(flags, get_order(size));
526 } else
527 ptr = kmem_alloc(size, flags);
528 } else {
529 ptr = vmem_alloc(size, flags);
d6a26c6a 530 }
fece7c99 531
a1502d76 532 return ptr;
533}
fece7c99 534
a1502d76 535static void
536kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
537{
538 if (skc->skc_flags & KMC_KMEM) {
539 if (size > (2 * PAGE_SIZE))
540 free_pages((unsigned long)ptr, get_order(size));
541 else
542 kmem_free(ptr, size);
543 } else {
544 vmem_free(ptr, size);
545 }
fece7c99 546}
547
548static spl_kmem_slab_t *
a1502d76 549spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
fece7c99 550{
551 spl_kmem_slab_t *sks;
a1502d76 552 spl_kmem_obj_t *sko, *n;
553 void *base, *obj;
554 int i, size, rc = 0;
555
556 /* It's important that we pack the spl_kmem_obj_t structure
 557 * and the actual objects into one large address space
 558 * to minimize the number of calls to the allocator. It
 559 * is far better to do a few large allocations and then
 560 * subdivide it ourselves. Now which allocator we use
 561 * requires balancing a few trade-offs.
562 *
563 * For small objects we use kmem_alloc() because as long
564 * as you are only requesting a small number of pages
565 * (ideally just one) its cheap. However, when you start
 566 * requesting multiple pages kmem_alloc() gets increasingly
 567 * expensive since it requires contiguous pages. For this
568 * reason we shift to vmem_alloc() for slabs of large
 569 * objects which removes the need for contiguous pages.
570 * We do not use vmem_alloc() in all cases because there
571 * is significant locking overhead in __get_vm_area_node().
 572 * This function takes a single global lock when acquiring
 573 * an available virtual address range which serializes all
574 * vmem_alloc()'s for all slab caches. Using slightly
575 * different allocation functions for small and large
576 * objects should give us the best of both worlds.
fece7c99 577 *
a1502d76 578 * sks struct: sizeof(spl_kmem_slab_t)
579 * obj data: skc->skc_obj_size
580 * obj struct: sizeof(spl_kmem_obj_t)
581 * <N obj data + obj structs>
fece7c99 582 *
583 * XXX: It would probably be a good idea to more carefully
a1502d76 584 * align these data structures in memory.
fece7c99 585 */
a1502d76 586 base = kv_alloc(skc, skc->skc_slab_size, flags);
587 if (base == NULL)
fece7c99 588 RETURN(NULL);
589
a1502d76 590 sks = (spl_kmem_slab_t *)base;
591 sks->sks_magic = SKS_MAGIC;
592 sks->sks_objs = skc->skc_slab_objs;
593 sks->sks_age = jiffies;
594 sks->sks_cache = skc;
595 INIT_LIST_HEAD(&sks->sks_list);
596 INIT_LIST_HEAD(&sks->sks_free_list);
597 sks->sks_ref = 0;
598 size = sizeof(spl_kmem_obj_t) + skc->skc_obj_size;
fece7c99 599
600 for (i = 0; i < sks->sks_objs; i++) {
a1502d76 601 if (skc->skc_flags & KMC_OFFSLAB) {
602 obj = kv_alloc(skc, size, flags);
603 if (!obj)
604 GOTO(out, rc = -ENOMEM);
605 } else {
606 obj = base + sizeof(spl_kmem_slab_t) + i * size;
607 }
608
609 sko = obj + skc->skc_obj_size;
fece7c99 610 sko->sko_addr = obj;
611 sko->sko_magic = SKO_MAGIC;
612 sko->sko_slab = sks;
613 INIT_LIST_HEAD(&sko->sko_list);
fece7c99 614 list_add_tail(&sko->sko_list, &sks->sks_free_list);
615 }
616
fece7c99 617 list_for_each_entry(sko, &sks->sks_free_list, sko_list)
618 if (skc->skc_ctor)
619 skc->skc_ctor(sko->sko_addr, skc->skc_private, flags);
2fb9b26a 620out:
a1502d76 621 if (rc) {
622 if (skc->skc_flags & KMC_OFFSLAB)
623 list_for_each_entry_safe(sko,n,&sks->sks_free_list,sko_list)
624 kv_free(skc, sko->sko_addr, size);
fece7c99 625
a1502d76 626 kv_free(skc, base, skc->skc_slab_size);
627 sks = NULL;
fece7c99 628 }
629
a1502d76 630 RETURN(sks);
fece7c99 631}
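/*
 * Resulting layout of an on-slab (non-KMC_OFFSLAB) slab as built above,
 * with size = skc->skc_obj_size + sizeof(spl_kmem_obj_t):
 *
 *   base                                      spl_kmem_slab_t (sks)
 *   base + sizeof(spl_kmem_slab_t)            object 0 data
 *   base + sizeof(spl_kmem_slab_t)
 *        + skc->skc_obj_size                  object 0 spl_kmem_obj_t
 *   base + sizeof(spl_kmem_slab_t) + size     object 1 data
 *   ...
 *   repeated for all sks->sks_objs objects within skc->skc_slab_size bytes.
 */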
632
2fb9b26a 633/* Removes slab from complete or partial list, so it must
d46630e0 634 * be called with the 'skc->skc_lock' held.
fece7c99 635 */
f1ca4da6 636static void
4afaaefa 637spl_slab_free(spl_kmem_slab_t *sks) {
2fb9b26a 638 spl_kmem_cache_t *skc;
639 spl_kmem_obj_t *sko, *n;
a1502d76 640 int size;
2fb9b26a 641 ENTRY;
57d86234 642
2fb9b26a 643 ASSERT(sks->sks_magic == SKS_MAGIC);
4afaaefa 644 ASSERT(sks->sks_ref == 0);
d6a26c6a 645
fece7c99 646 skc = sks->sks_cache;
647 ASSERT(skc->skc_magic == SKC_MAGIC);
d46630e0 648 ASSERT(spin_is_locked(&skc->skc_lock));
f1ca4da6 649
fece7c99 650 skc->skc_obj_total -= sks->sks_objs;
651 skc->skc_slab_total--;
652 list_del(&sks->sks_list);
a1502d76 653 size = sizeof(spl_kmem_obj_t) + skc->skc_obj_size;
937879f1 654
fece7c99 655 /* Run destructors as the slab is being released */
a1502d76 656 list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) {
657 ASSERT(sko->sko_magic == SKO_MAGIC);
658
2fb9b26a 659 if (skc->skc_dtor)
660 skc->skc_dtor(sko->sko_addr, skc->skc_private);
0a6fd143 661
a1502d76 662 if (skc->skc_flags & KMC_OFFSLAB)
663 kv_free(skc, sko->sko_addr, size);
664 }
d61e12af 665
a1502d76 666 kv_free(skc, sks, skc->skc_slab_size);
2fb9b26a 667 EXIT;
668}
d6a26c6a 669
2fb9b26a 670static int
4afaaefa 671__spl_slab_reclaim(spl_kmem_cache_t *skc)
2fb9b26a 672{
673 spl_kmem_slab_t *sks, *m;
674 int rc = 0;
675 ENTRY;
676
d46630e0 677 ASSERT(spin_is_locked(&skc->skc_lock));
2fb9b26a 678 /*
679 * Free empty slabs which have not been touched in skc_delay
680 * seconds. This delay time is important to avoid thrashing.
681 * Empty slabs will be at the end of the skc_partial_list.
682 */
683 list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list,
684 sks_list) {
4afaaefa 685 if (sks->sks_ref > 0)
2fb9b26a 686 break;
687
688 if (time_after(jiffies, sks->sks_age + skc->skc_delay * HZ)) {
4afaaefa 689 spl_slab_free(sks);
2fb9b26a 690 rc++;
691 }
692 }
693
694 /* Returns number of slabs reclaimed */
695 RETURN(rc);
f1ca4da6 696}
697
2fb9b26a 698static int
4afaaefa 699spl_slab_reclaim(spl_kmem_cache_t *skc)
f1ca4da6 700{
2fb9b26a 701 int rc;
702 ENTRY;
f1ca4da6 703
d46630e0 704 spin_lock(&skc->skc_lock);
4afaaefa 705 rc = __spl_slab_reclaim(skc);
d46630e0 706 spin_unlock(&skc->skc_lock);
4efd4118 707
2fb9b26a 708 RETURN(rc);
709}
f1ca4da6 710
4afaaefa 711static int
712spl_magazine_size(spl_kmem_cache_t *skc)
713{
714 int size;
715 ENTRY;
716
 717 /* Guesses for reasonable magazine sizes; they
718 * should really adapt based on observed usage. */
719 if (skc->skc_obj_size > (PAGE_SIZE * 256))
4afaaefa 720 size = 4;
ff449ac4 721 else if (skc->skc_obj_size > (PAGE_SIZE * 32))
4afaaefa 722 size = 16;
ff449ac4 723 else if (skc->skc_obj_size > (PAGE_SIZE))
724 size = 64;
4afaaefa 725 else if (skc->skc_obj_size > (PAGE_SIZE / 4))
ff449ac4 726 size = 128;
4afaaefa 727 else
ff449ac4 728 size = 512;
4afaaefa 729
730 RETURN(size);
731}
732
733static spl_kmem_magazine_t *
734spl_magazine_alloc(spl_kmem_cache_t *skc, int node)
735{
736 spl_kmem_magazine_t *skm;
737 int size = sizeof(spl_kmem_magazine_t) +
738 sizeof(void *) * skc->skc_mag_size;
739 ENTRY;
740
3d061e9d 741 skm = kmem_alloc_node(size, GFP_KERNEL, node);
4afaaefa 742 if (skm) {
743 skm->skm_magic = SKM_MAGIC;
744 skm->skm_avail = 0;
745 skm->skm_size = skc->skc_mag_size;
746 skm->skm_refill = skc->skc_mag_refill;
a1502d76 747 if (!(skc->skc_flags & KMC_NOTOUCH))
748 skm->skm_age = jiffies;
4afaaefa 749 }
750
751 RETURN(skm);
752}
753
754static void
755spl_magazine_free(spl_kmem_magazine_t *skm)
756{
a0f6da3d 757 int size = sizeof(spl_kmem_magazine_t) +
758 sizeof(void *) * skm->skm_size;
759
4afaaefa 760 ENTRY;
761 ASSERT(skm->skm_magic == SKM_MAGIC);
762 ASSERT(skm->skm_avail == 0);
a0f6da3d 763
764 kmem_free(skm, size);
4afaaefa 765 EXIT;
766}
767
768static int
769spl_magazine_create(spl_kmem_cache_t *skc)
770{
771 int i;
772 ENTRY;
773
774 skc->skc_mag_size = spl_magazine_size(skc);
775 skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
776
777 for_each_online_cpu(i) {
778 skc->skc_mag[i] = spl_magazine_alloc(skc, cpu_to_node(i));
779 if (!skc->skc_mag[i]) {
780 for (i--; i >= 0; i--)
781 spl_magazine_free(skc->skc_mag[i]);
782
783 RETURN(-ENOMEM);
784 }
785 }
786
787 RETURN(0);
788}
789
790static void
791spl_magazine_destroy(spl_kmem_cache_t *skc)
792{
793 spl_kmem_magazine_t *skm;
794 int i;
795 ENTRY;
796
797 for_each_online_cpu(i) {
798 skm = skc->skc_mag[i];
799 (void)spl_cache_flush(skc, skm, skm->skm_avail);
800 spl_magazine_free(skm);
801 }
802
803 EXIT;
804}
805
2fb9b26a 806spl_kmem_cache_t *
807spl_kmem_cache_create(char *name, size_t size, size_t align,
808 spl_kmem_ctor_t ctor,
809 spl_kmem_dtor_t dtor,
810 spl_kmem_reclaim_t reclaim,
811 void *priv, void *vmp, int flags)
812{
813 spl_kmem_cache_t *skc;
a1502d76 814 uint32_t slab_max, slab_size, slab_objs;
815 int rc, kmem_flags = KM_SLEEP;
2fb9b26a 816 ENTRY;
937879f1 817
a1502d76 818 ASSERTF(!(flags & KMC_NOMAGAZINE), "Bad KMC_NOMAGAZINE (%x)\n", flags);
819 ASSERTF(!(flags & KMC_NOHASH), "Bad KMC_NOHASH (%x)\n", flags);
820 ASSERTF(!(flags & KMC_QCACHE), "Bad KMC_QCACHE (%x)\n", flags);
821
2fb9b26a 822 /* We may be called when there is a non-zero preempt_count or
 823 * interrupts are disabled in which case we must not sleep.
824 */
e9d7a2be 825 if (current_thread_info()->preempt_count || irqs_disabled())
2fb9b26a 826 kmem_flags = KM_NOSLEEP;
0a6fd143 827
2fb9b26a 828 /* Allocate new cache memory and initialize. */
ff449ac4 829 skc = (spl_kmem_cache_t *)kmem_zalloc(sizeof(*skc), kmem_flags);
e9d7a2be 830 if (skc == NULL)
2fb9b26a 831 RETURN(NULL);
d61e12af 832
2fb9b26a 833 skc->skc_magic = SKC_MAGIC;
2fb9b26a 834 skc->skc_name_size = strlen(name) + 1;
835 skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, kmem_flags);
836 if (skc->skc_name == NULL) {
837 kmem_free(skc, sizeof(*skc));
838 RETURN(NULL);
839 }
840 strncpy(skc->skc_name, name, skc->skc_name_size);
841
e9d7a2be 842 skc->skc_ctor = ctor;
843 skc->skc_dtor = dtor;
844 skc->skc_reclaim = reclaim;
2fb9b26a 845 skc->skc_private = priv;
846 skc->skc_vmp = vmp;
847 skc->skc_flags = flags;
848 skc->skc_obj_size = size;
2fb9b26a 849 skc->skc_delay = SPL_KMEM_CACHE_DELAY;
850
2fb9b26a 851 INIT_LIST_HEAD(&skc->skc_list);
852 INIT_LIST_HEAD(&skc->skc_complete_list);
853 INIT_LIST_HEAD(&skc->skc_partial_list);
d46630e0 854 spin_lock_init(&skc->skc_lock);
e9d7a2be 855 skc->skc_slab_fail = 0;
856 skc->skc_slab_create = 0;
857 skc->skc_slab_destroy = 0;
2fb9b26a 858 skc->skc_slab_total = 0;
859 skc->skc_slab_alloc = 0;
860 skc->skc_slab_max = 0;
861 skc->skc_obj_total = 0;
862 skc->skc_obj_alloc = 0;
863 skc->skc_obj_max = 0;
a1502d76 864
865 /* If none passed select a cache type based on object size */
866 if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM))) {
867 if (skc->skc_obj_size < (PAGE_SIZE / 8)) {
868 skc->skc_flags |= KMC_KMEM;
869 } else {
870 skc->skc_flags |= KMC_VMEM;
871 }
872 }
873
 874 /* Size slabs properly to ensure they are not too large */
875 slab_max = ((uint64_t)1 << (MAX_ORDER - 1)) * PAGE_SIZE;
876 if (skc->skc_flags & KMC_OFFSLAB) {
877 skc->skc_slab_objs = SPL_KMEM_CACHE_OBJ_PER_SLAB;
878 skc->skc_slab_size = sizeof(spl_kmem_slab_t);
879 ASSERT(skc->skc_obj_size < slab_max);
880 } else {
881 slab_objs = SPL_KMEM_CACHE_OBJ_PER_SLAB + 1;
882
883 do {
884 slab_objs--;
885 slab_size = sizeof(spl_kmem_slab_t) + slab_objs *
886 (skc->skc_obj_size+sizeof(spl_kmem_obj_t));
887 } while (slab_size > slab_max);
888
889 skc->skc_slab_objs = slab_objs;
890 skc->skc_slab_size = slab_size;
891 }
4afaaefa 892
893 rc = spl_magazine_create(skc);
894 if (rc) {
4afaaefa 895 kmem_free(skc->skc_name, skc->skc_name_size);
896 kmem_free(skc, sizeof(*skc));
897 RETURN(NULL);
898 }
2fb9b26a 899
900 down_write(&spl_kmem_cache_sem);
e9d7a2be 901 list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
2fb9b26a 902 up_write(&spl_kmem_cache_sem);
903
e9d7a2be 904 RETURN(skc);
f1ca4da6 905}
2fb9b26a 906EXPORT_SYMBOL(spl_kmem_cache_create);
f1ca4da6 907
2fb9b26a 908void
909spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
f1ca4da6 910{
2fb9b26a 911 spl_kmem_slab_t *sks, *m;
912 ENTRY;
f1ca4da6 913
e9d7a2be 914 ASSERT(skc->skc_magic == SKC_MAGIC);
915
916 down_write(&spl_kmem_cache_sem);
917 list_del_init(&skc->skc_list);
918 up_write(&spl_kmem_cache_sem);
2fb9b26a 919
4afaaefa 920 spl_magazine_destroy(skc);
d46630e0 921 spin_lock(&skc->skc_lock);
d6a26c6a 922
2fb9b26a 923 /* Validate there are no objects in use and free all the
4afaaefa 924 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */
2fb9b26a 925 ASSERT(list_empty(&skc->skc_complete_list));
a1502d76 926 ASSERT(skc->skc_slab_alloc == 0);
927 ASSERT(skc->skc_obj_alloc == 0);
d6a26c6a 928
e9d7a2be 929 list_for_each_entry_safe(sks, m, &skc->skc_partial_list, sks_list)
4afaaefa 930 spl_slab_free(sks);
2fb9b26a 931
a1502d76 932 ASSERT(skc->skc_slab_total == 0);
933 ASSERT(skc->skc_obj_total == 0);
934
2fb9b26a 935 kmem_free(skc->skc_name, skc->skc_name_size);
d46630e0 936 spin_unlock(&skc->skc_lock);
ff449ac4 937
4afaaefa 938 kmem_free(skc, sizeof(*skc));
2fb9b26a 939
940 EXIT;
f1ca4da6 941}
2fb9b26a 942EXPORT_SYMBOL(spl_kmem_cache_destroy);
f1ca4da6 943
4afaaefa 944static void *
945spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
f1ca4da6 946{
2fb9b26a 947 spl_kmem_obj_t *sko;
f1ca4da6 948
e9d7a2be 949 ASSERT(skc->skc_magic == SKC_MAGIC);
950 ASSERT(sks->sks_magic == SKS_MAGIC);
4afaaefa 951 ASSERT(spin_is_locked(&skc->skc_lock));
2fb9b26a 952
a1502d76 953 sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
4afaaefa 954 ASSERT(sko->sko_magic == SKO_MAGIC);
955 ASSERT(sko->sko_addr != NULL);
2fb9b26a 956
a1502d76 957 /* Remove from sks_free_list */
4afaaefa 958 list_del_init(&sko->sko_list);
2fb9b26a 959
4afaaefa 960 sks->sks_age = jiffies;
961 sks->sks_ref++;
962 skc->skc_obj_alloc++;
2fb9b26a 963
4afaaefa 964 /* Track max obj usage statistics */
965 if (skc->skc_obj_alloc > skc->skc_obj_max)
966 skc->skc_obj_max = skc->skc_obj_alloc;
2fb9b26a 967
4afaaefa 968 /* Track max slab usage statistics */
969 if (sks->sks_ref == 1) {
970 skc->skc_slab_alloc++;
f1ca4da6 971
4afaaefa 972 if (skc->skc_slab_alloc > skc->skc_slab_max)
973 skc->skc_slab_max = skc->skc_slab_alloc;
2fb9b26a 974 }
975
4afaaefa 976 return sko->sko_addr;
977}
c30df9c8 978
4afaaefa 979/* No available objects, create a new slab. Since this is an
 980 * expensive operation we do it without holding the spinlock
 981 * and only briefly acquire it when we link in the fully
982 * allocated and constructed slab.
983 */
984static spl_kmem_slab_t *
985spl_cache_grow(spl_kmem_cache_t *skc, int flags)
986{
e9d7a2be 987 spl_kmem_slab_t *sks;
4afaaefa 988 ENTRY;
f1ca4da6 989
e9d7a2be 990 ASSERT(skc->skc_magic == SKC_MAGIC);
991
992 if (flags & __GFP_WAIT) {
fece7c99 993 flags |= __GFP_NOFAIL;
4afaaefa 994 local_irq_enable();
f78a933f 995 might_sleep();
4afaaefa 996 }
f1ca4da6 997
4afaaefa 998 sks = spl_slab_alloc(skc, flags);
999 if (sks == NULL) {
1000 if (flags & __GFP_WAIT)
1001 local_irq_disable();
1002
1003 RETURN(NULL);
1004 }
2fb9b26a 1005
e9d7a2be 1006 if (flags & __GFP_WAIT)
4afaaefa 1007 local_irq_disable();
1008
 1009 /* Link the new empty slab into the end of skc_partial_list */
d46630e0 1010 spin_lock(&skc->skc_lock);
2fb9b26a 1011 skc->skc_slab_total++;
1012 skc->skc_obj_total += sks->sks_objs;
1013 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
d46630e0 1014 spin_unlock(&skc->skc_lock);
4afaaefa 1015
1016 RETURN(sks);
f1ca4da6 1017}
1018
4afaaefa 1019static int
1020spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
f1ca4da6 1021{
e9d7a2be 1022 spl_kmem_slab_t *sks;
1023 int rc = 0, refill;
937879f1 1024 ENTRY;
f1ca4da6 1025
e9d7a2be 1026 ASSERT(skc->skc_magic == SKC_MAGIC);
1027 ASSERT(skm->skm_magic == SKM_MAGIC);
1028
4afaaefa 1029 /* XXX: Check for refill bouncing by age perhaps */
e9d7a2be 1030 refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
4afaaefa 1031
d46630e0 1032 spin_lock(&skc->skc_lock);
ff449ac4 1033
4afaaefa 1034 while (refill > 0) {
 1035 /* No slabs available, we must grow the cache */
1036 if (list_empty(&skc->skc_partial_list)) {
1037 spin_unlock(&skc->skc_lock);
ff449ac4 1038
4afaaefa 1039 sks = spl_cache_grow(skc, flags);
1040 if (!sks)
e9d7a2be 1041 GOTO(out, rc);
4afaaefa 1042
 1043 /* Rescheduled to a different CPU, skm is not local */
1044 if (skm != skc->skc_mag[smp_processor_id()])
e9d7a2be 1045 GOTO(out, rc);
1046
1047 /* Potentially rescheduled to the same CPU but
 1048 * allocations may have occurred from this CPU while
1049 * we were sleeping so recalculate max refill. */
1050 refill = MIN(refill, skm->skm_size - skm->skm_avail);
4afaaefa 1051
1052 spin_lock(&skc->skc_lock);
1053 continue;
1054 }
d46630e0 1055
4afaaefa 1056 /* Grab the next available slab */
1057 sks = list_entry((&skc->skc_partial_list)->next,
1058 spl_kmem_slab_t, sks_list);
1059 ASSERT(sks->sks_magic == SKS_MAGIC);
1060 ASSERT(sks->sks_ref < sks->sks_objs);
1061 ASSERT(!list_empty(&sks->sks_free_list));
d46630e0 1062
4afaaefa 1063 /* Consume as many objects as needed to refill the requested
e9d7a2be 1064 * cache. We must also be careful not to overfill it. */
1065 while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++rc) {
1066 ASSERT(skm->skm_avail < skm->skm_size);
1067 ASSERT(rc < skm->skm_size);
4afaaefa 1068 skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks);
e9d7a2be 1069 }
f1ca4da6 1070
4afaaefa 1071 /* Move slab to skc_complete_list when full */
1072 if (sks->sks_ref == sks->sks_objs) {
1073 list_del(&sks->sks_list);
1074 list_add(&sks->sks_list, &skc->skc_complete_list);
2fb9b26a 1075 }
1076 }
57d86234 1077
4afaaefa 1078 spin_unlock(&skc->skc_lock);
1079out:
1080 /* Returns the number of entries added to cache */
e9d7a2be 1081 RETURN(rc);
4afaaefa 1082}
1083
1084static void
1085spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
1086{
e9d7a2be 1087 spl_kmem_slab_t *sks = NULL;
4afaaefa 1088 spl_kmem_obj_t *sko = NULL;
1089 ENTRY;
1090
e9d7a2be 1091 ASSERT(skc->skc_magic == SKC_MAGIC);
4afaaefa 1092 ASSERT(spin_is_locked(&skc->skc_lock));
1093
a1502d76 1094 sko = obj + skc->skc_obj_size;
1095 ASSERT(sko->sko_magic == SKO_MAGIC);
4afaaefa 1096
1097 sks = sko->sko_slab;
a1502d76 1098 ASSERT(sks->sks_magic == SKS_MAGIC);
2fb9b26a 1099 ASSERT(sks->sks_cache == skc);
2fb9b26a 1100 list_add(&sko->sko_list, &sks->sks_free_list);
d6a26c6a 1101
2fb9b26a 1102 sks->sks_age = jiffies;
4afaaefa 1103 sks->sks_ref--;
2fb9b26a 1104 skc->skc_obj_alloc--;
f1ca4da6 1105
2fb9b26a 1106 /* Move slab to skc_partial_list when no longer full. Slabs
4afaaefa 1107 * are added to the head to keep the partial list in quasi-full
1108 * sorted order. Fuller at the head, emptier at the tail. */
1109 if (sks->sks_ref == (sks->sks_objs - 1)) {
2fb9b26a 1110 list_del(&sks->sks_list);
1111 list_add(&sks->sks_list, &skc->skc_partial_list);
1112 }
f1ca4da6 1113
2fb9b26a 1114 /* Move empty slabs to the end of the partial list so
4afaaefa 1115 * they can be easily found and freed during reclamation. */
1116 if (sks->sks_ref == 0) {
2fb9b26a 1117 list_del(&sks->sks_list);
1118 list_add_tail(&sks->sks_list, &skc->skc_partial_list);
1119 skc->skc_slab_alloc--;
1120 }
1121
4afaaefa 1122 EXIT;
1123}
1124
1125static int
1126spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
1127{
1128 int i, count = MIN(flush, skm->skm_avail);
1129 ENTRY;
1130
e9d7a2be 1131 ASSERT(skc->skc_magic == SKC_MAGIC);
1132 ASSERT(skm->skm_magic == SKM_MAGIC);
4afaaefa 1133
1134 spin_lock(&skc->skc_lock);
ff449ac4 1135
4afaaefa 1136 for (i = 0; i < count; i++)
1137 spl_cache_shrink(skc, skm->skm_objs[i]);
1138
e9d7a2be 1139// __spl_slab_reclaim(skc);
1140 skm->skm_avail -= count;
1141 memmove(skm->skm_objs, &(skm->skm_objs[count]),
4afaaefa 1142 sizeof(void *) * skm->skm_avail);
1143
d46630e0 1144 spin_unlock(&skc->skc_lock);
4afaaefa 1145
1146 RETURN(count);
1147}
1148
1149void *
1150spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
1151{
1152 spl_kmem_magazine_t *skm;
1153 unsigned long irq_flags;
1154 void *obj = NULL;
e9d7a2be 1155 int id;
4afaaefa 1156 ENTRY;
1157
e9d7a2be 1158 ASSERT(skc->skc_magic == SKC_MAGIC);
1159 ASSERT(flags & KM_SLEEP); /* XXX: KM_NOSLEEP not yet supported */
4afaaefa 1160 local_irq_save(irq_flags);
1161
1162restart:
1163 /* Safe to update per-cpu structure without lock, but
 1164 * in the restart case we must be careful to reacquire
1165 * the local magazine since this may have changed
1166 * when we need to grow the cache. */
e9d7a2be 1167 id = smp_processor_id();
1168 ASSERTF(id < 4, "cache=%p smp_processor_id=%d\n", skc, id);
4afaaefa 1169 skm = skc->skc_mag[smp_processor_id()];
e9d7a2be 1170 ASSERTF(skm->skm_magic == SKM_MAGIC, "%x != %x: %s/%p/%p %x/%x/%x\n",
1171 skm->skm_magic, SKM_MAGIC, skc->skc_name, skc, skm,
1172 skm->skm_size, skm->skm_refill, skm->skm_avail);
4afaaefa 1173
1174 if (likely(skm->skm_avail)) {
1175 /* Object available in CPU cache, use it */
1176 obj = skm->skm_objs[--skm->skm_avail];
a1502d76 1177 if (!(skc->skc_flags & KMC_NOTOUCH))
1178 skm->skm_age = jiffies;
4afaaefa 1179 } else {
1180 /* Per-CPU cache empty, directly allocate from
1181 * the slab and refill the per-CPU cache. */
1182 (void)spl_cache_refill(skc, skm, flags);
1183 GOTO(restart, obj = NULL);
1184 }
1185
1186 local_irq_restore(irq_flags);
fece7c99 1187 ASSERT(obj);
4afaaefa 1188
1189 /* Pre-emptively migrate object to CPU L1 cache */
1190 prefetchw(obj);
1191
1192 RETURN(obj);
1193}
1194EXPORT_SYMBOL(spl_kmem_cache_alloc);
1195
1196void
1197spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
1198{
1199 spl_kmem_magazine_t *skm;
1200 unsigned long flags;
1201 ENTRY;
1202
e9d7a2be 1203 ASSERT(skc->skc_magic == SKC_MAGIC);
4afaaefa 1204 local_irq_save(flags);
1205
 1206 /* Safe to update per-cpu structure without lock, but since
 1207 * no remote memory allocation tracking is being performed
 1208 * it is entirely possible to allocate an object from one
 1209 * CPU cache and return it to another. */
1210 skm = skc->skc_mag[smp_processor_id()];
e9d7a2be 1211 ASSERT(skm->skm_magic == SKM_MAGIC);
4afaaefa 1212
1213 /* Per-CPU cache full, flush it to make space */
1214 if (unlikely(skm->skm_avail >= skm->skm_size))
1215 (void)spl_cache_flush(skc, skm, skm->skm_refill);
1216
1217 /* Available space in cache, use it */
1218 skm->skm_objs[skm->skm_avail++] = obj;
1219
1220 local_irq_restore(flags);
1221
1222 EXIT;
f1ca4da6 1223}
2fb9b26a 1224EXPORT_SYMBOL(spl_kmem_cache_free);
5c2bb9b2 1225
2fb9b26a 1226static int
4afaaefa 1227spl_kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask)
2fb9b26a 1228{
e9d7a2be 1229 spl_kmem_cache_t *skc;
5c2bb9b2 1230
2fb9b26a 1231 /* Under linux a shrinker is not tightly coupled with a slab
 1232 * cache. In fact linux always systematically tries calling all
1233 * registered shrinker callbacks until its target reclamation level
1234 * is reached. Because of this we only register one shrinker
1235 * function in the shim layer for all slab caches. And we always
1236 * attempt to shrink all caches when this generic shrinker is called.
c30df9c8 1237 */
e9d7a2be 1238 down_read(&spl_kmem_cache_sem);
57d86234 1239
e9d7a2be 1240 list_for_each_entry(skc, &spl_kmem_cache_list, skc_list)
2fb9b26a 1241 spl_kmem_cache_reap_now(skc);
1242
e9d7a2be 1243 up_read(&spl_kmem_cache_sem);
2fb9b26a 1244
 1245 /* XXX: Under linux the shrinker should return the remaining
 1246 * number of entries in the cache; we should do this as well.
1247 */
1248 return 1;
5c2bb9b2 1249}
5c2bb9b2 1250
57d86234 1251void
2fb9b26a 1252spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
57d86234 1253{
4afaaefa 1254 spl_kmem_magazine_t *skm;
1255 int i;
2fb9b26a 1256 ENTRY;
e9d7a2be 1257
1258 ASSERT(skc->skc_magic == SKC_MAGIC);
2fb9b26a 1259
1260 if (skc->skc_reclaim)
1261 skc->skc_reclaim(skc->skc_private);
1262
4afaaefa 1263 /* Ensure per-CPU caches which are idle gradually flush */
1264 for_each_online_cpu(i) {
1265 skm = skc->skc_mag[i];
1266
1267 if (time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
1268 (void)spl_cache_flush(skc, skm, skm->skm_refill);
1269 }
1270
1271 spl_slab_reclaim(skc);
1272
2fb9b26a 1273 EXIT;
57d86234 1274}
2fb9b26a 1275EXPORT_SYMBOL(spl_kmem_cache_reap_now);
57d86234 1276
f1b59d26 1277void
2fb9b26a 1278spl_kmem_reap(void)
937879f1 1279{
4afaaefa 1280 spl_kmem_cache_generic_shrinker(KMC_REAP_CHUNK, GFP_KERNEL);
f1ca4da6 1281}
2fb9b26a 1282EXPORT_SYMBOL(spl_kmem_reap);
5d86345d 1283
ff449ac4 1284#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
c6dc93d6 1285static char *
4afaaefa 1286spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
d6a26c6a 1287{
e9d7a2be 1288 int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
d6a26c6a 1289 int i, flag = 1;
1290
1291 ASSERT(str != NULL && len >= 17);
e9d7a2be 1292 memset(str, 0, len);
d6a26c6a 1293
1294 /* Check for a fully printable string, and while we are at
1295 * it place the printable characters in the passed buffer. */
1296 for (i = 0; i < size; i++) {
e9d7a2be 1297 str[i] = ((char *)(kd->kd_addr))[i];
1298 if (isprint(str[i])) {
1299 continue;
1300 } else {
1301 /* Minimum number of printable characters found
1302 * to make it worthwhile to print this as ascii. */
1303 if (i > min)
1304 break;
1305
1306 flag = 0;
1307 break;
1308 }
d6a26c6a 1309 }
1310
1311 if (!flag) {
1312 sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
1313 *((uint8_t *)kd->kd_addr),
1314 *((uint8_t *)kd->kd_addr + 2),
1315 *((uint8_t *)kd->kd_addr + 4),
1316 *((uint8_t *)kd->kd_addr + 6),
1317 *((uint8_t *)kd->kd_addr + 8),
1318 *((uint8_t *)kd->kd_addr + 10),
1319 *((uint8_t *)kd->kd_addr + 12),
1320 *((uint8_t *)kd->kd_addr + 14));
1321 }
1322
1323 return str;
1324}
1325
a1502d76 1326static int
1327spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
1328{
1329 int i;
1330 ENTRY;
1331
1332 spin_lock_init(lock);
1333 INIT_LIST_HEAD(list);
1334
1335 for (i = 0; i < size; i++)
1336 INIT_HLIST_HEAD(&kmem_table[i]);
1337
1338 RETURN(0);
1339}
1340
ff449ac4 1341static void
1342spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
5d86345d 1343{
2fb9b26a 1344 unsigned long flags;
1345 kmem_debug_t *kd;
1346 char str[17];
a1502d76 1347 ENTRY;
2fb9b26a 1348
ff449ac4 1349 spin_lock_irqsave(lock, flags);
1350 if (!list_empty(list))
a0f6da3d 1351 printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
1352 "size", "data", "func", "line");
2fb9b26a 1353
ff449ac4 1354 list_for_each_entry(kd, list, kd_list)
a0f6da3d 1355 printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
1356 kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
2fb9b26a 1357 kd->kd_func, kd->kd_line);
1358
ff449ac4 1359 spin_unlock_irqrestore(lock, flags);
a1502d76 1360 EXIT;
ff449ac4 1361}
1362#else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
a1502d76 1363#define spl_kmem_init_tracking(list, lock, size)
ff449ac4 1364#define spl_kmem_fini_tracking(list, lock)
1365#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
1366
a1502d76 1367int
1368spl_kmem_init(void)
1369{
1370 int rc = 0;
1371 ENTRY;
1372
1373 init_rwsem(&spl_kmem_cache_sem);
1374 INIT_LIST_HEAD(&spl_kmem_cache_list);
1375
1376#ifdef HAVE_SET_SHRINKER
1377 spl_kmem_cache_shrinker = set_shrinker(KMC_DEFAULT_SEEKS,
1378 spl_kmem_cache_generic_shrinker);
1379 if (spl_kmem_cache_shrinker == NULL)
f78a933f 1380 RETURN(rc = -ENOMEM);
a1502d76 1381#else
1382 register_shrinker(&spl_kmem_cache_shrinker);
1383#endif
1384
1385#ifdef DEBUG_KMEM
1386 atomic64_set(&kmem_alloc_used, 0);
1387 atomic64_set(&vmem_alloc_used, 0);
1388
1389 spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
1390 spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE);
1391#endif
a1502d76 1392 RETURN(rc);
1393}
1394
ff449ac4 1395void
1396spl_kmem_fini(void)
1397{
1398#ifdef DEBUG_KMEM
1399 /* Display all unreclaimed memory addresses, including the
1400 * allocation size and the first few bytes of what's located
1401 * at that address to aid in debugging. Performance is not
1402 * a serious concern here since it is module unload time. */
1403 if (atomic64_read(&kmem_alloc_used) != 0)
1404 CWARN("kmem leaked %ld/%ld bytes\n",
550f1705 1405 atomic64_read(&kmem_alloc_used), kmem_alloc_max);
ff449ac4 1406
2fb9b26a 1407
1408 if (atomic64_read(&vmem_alloc_used) != 0)
1409 CWARN("vmem leaked %ld/%ld bytes\n",
550f1705 1410 atomic64_read(&vmem_alloc_used), vmem_alloc_max);
2fb9b26a 1411
ff449ac4 1412 spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
1413 spl_kmem_fini_tracking(&vmem_list, &vmem_lock);
1414#endif /* DEBUG_KMEM */
2fb9b26a 1415 ENTRY;
1416
1417#ifdef HAVE_SET_SHRINKER
1418 remove_shrinker(spl_kmem_cache_shrinker);
1419#else
1420 unregister_shrinker(&spl_kmem_cache_shrinker);
5d86345d 1421#endif
2fb9b26a 1422
937879f1 1423 EXIT;
5d86345d 1424}